NVIDIA/Megatron-LM
31 workflows · maturity 67% · 10 patterns · GitHub ↗
Practices
✓ Matrix✓ Permissions○ Security scan✓ AI review○ Cache✓ Concurrency✓ Reusable workflows
Detected patterns
Security dimensions
Workflows (31)
_build_test_publish_wheel matrix .github/workflows/_build_test_publish_wheel.yml
View raw YAML
# Reusable workflow: builds, tests and optionally publishes Python wheels.
on:
  workflow_call:
    inputs:
      # Git ref that is checked out before building.
      ref:
        required: false
        description: Ref (SHA or branch) to release
        type: string
        default: ${{ github.sha }}
      # When true, the build step appends a random `.dev` suffix to
      # PRE_RELEASE and the artifact name is tagged `dry-run`.
      dry-run:
        required: false
        description: Upload to PyPI Test instance
        type: boolean
        default: true
      # The publish job only runs when this is false.
      no-publish:
        required: false
        description: Do not publish the wheel
        type: boolean
        default: true
    secrets:
      TWINE_PASSWORD:
        required: true
jobs:
# Builds and tests one set of wheels per matrix entry (package x platform).
build-and-test-wheels:
strategy:
# Let the remaining matrix entries finish even if one of them fails.
fail-fast: false
matrix:
include:
- PACKAGE: megatron-core
PLATFORM: arm64
IMAGE: quay.io/pypa/manylinux_2_28_aarch64
- PACKAGE: megatron-core
PLATFORM: amd64
IMAGE: quay.io/pypa/manylinux_2_28_x86_64
- PACKAGE: megatron-fsdp
IMAGE: quay.io/pypa/manylinux_2_28_x86_64
PLATFORM: amd64
# amd64 entries run on ubuntu-22.04; any other PLATFORM value selects the arm runner label.
runs-on: ${{ matrix.PLATFORM == 'amd64' && 'ubuntu-22.04' || 'ubuntu-22.04-arm' }}
# Expose the matrix values and the dry-run flag to the shell steps below.
env:
PACKAGE: ${{ matrix.PACKAGE }}
IMAGE: ${{ matrix.IMAGE }}
PLATFORM: ${{ matrix.PLATFORM }}
PUBLISH_DRYRUN: ${{ inputs.dry-run }}
steps:
# Check out the ref handed in by the caller.
- name: Checkout repository
uses: actions/checkout@v6
with:
ref: ${{ inputs.ref }}
# Builds the wheels for $PACKAGE inside the manylinux image, repairs platform
# wheels with auditwheel, and publishes the package version as the step output
# `expected-release-number`.
- name: Build wheel
  id: build-wheel
  run: |
    set -x
    # ROOTDIR holds package_info.py; BUILD_DIR is where the build is run.
    if [ "$PACKAGE" = "megatron-core" ]; then
      ROOTDIR="megatron/core"
      BUILD_DIR="."
    elif [ "$PACKAGE" = "megatron-fsdp" ]; then
      ROOTDIR="megatron/core/distributed/fsdp/src/megatron_fsdp"
      BUILD_DIR="megatron/core/distributed/fsdp/src"
    else
      echo Unknown package: $PACKAGE
      exit 1
    fi
    # Dry runs append a random .dev suffix to the existing pre-release tag.
    # Both quote styles are accepted when reading the tag: the release
    # workflow's version bump rewrites this line with double quotes, and a
    # single-quote-only pattern would then silently drop the current tag.
    if [ "$PUBLISH_DRYRUN" = "true" ]; then
      PRE_RELEASE=$(sed -n "s/.*PRE_RELEASE = ['\"]\(.*\)['\"]/\1/p" $ROOTDIR/package_info.py)
      sed -i "/^PRE_RELEASE/c\PRE_RELEASE = '${PRE_RELEASE}.dev$((RANDOM % 900000 + 100000))'" $ROOTDIR/package_info.py
    fi
    pushd $BUILD_DIR
    rm LICENSE || true
    # Install the build tooling for every targeted CPython, then build with each.
    docker run --rm -v $(pwd):/workspace -w /workspace $IMAGE bash -c '\
      for python_version in cp310 cp311 cp312 cp313; do \
        /opt/python/${python_version}-${python_version}/bin/pip install --upgrade "setuptools<80.0.0,>=77.0.0" build; \
      done && \
      for python_version in cp310 cp311 cp312 cp313; do \
        /opt/python/${python_version}-${python_version}/bin/python -m build; \
      done \
    '
    # Only platform-specific wheels (anything not *-none-any) are repaired;
    # the repaired copies from wheelhouse/ replace the originals in dist/.
    PLATFORM_WHEELS=$(find dist -name "*.whl" -not -name "*-none-any.whl")
    if [ -n "$PLATFORM_WHEELS" ]; then
      echo "Found platform wheels to repair: $PLATFORM_WHEELS"
      docker run --rm -v $(pwd):/workspace -w /workspace $IMAGE auditwheel repair $PLATFORM_WHEELS
      docker run --rm -v $(pwd):/workspace -w /workspace $IMAGE rm -rf dist/*.whl
      docker run --rm -v $(pwd):/workspace -w /workspace $IMAGE cp -a wheelhouse/* dist/
    fi
    popd
    # Read the version straight from package_info.py for the test step.
    pushd $ROOTDIR
    EXPECTED_RELEASE_NUMBER=$(python -c "import package_info; print(package_info.__version__)")
    popd
    echo "expected-release-number=$EXPECTED_RELEASE_NUMBER" | tee -a "${GITHUB_OUTPUT}"
    # megatron-fsdp builds in a sub-directory; copy its output to the
    # top-level dist/ that the later steps read.
    if [ "$PACKAGE" = "megatron-fsdp" ]; then
      mkdir -p dist/
      cp -a megatron/core/distributed/fsdp/src/dist/* dist/
    fi
    ls -al dist/
# Installs the freshly built wheel(s) and checks that the installed package
# reports the version computed by the build step.
- name: Test wheels
run: |
ls -al dist/
# ROOTPATH is the Python import path, WHEEL_PREFIX the wheel file-name prefix.
if [ "$PACKAGE" = "megatron-core" ]; then
ROOTPATH="megatron.core"
WHEEL_PREFIX="megatron_core"
elif [ "$PACKAGE" = "megatron-fsdp" ]; then
ROOTPATH="megatron_fsdp"
WHEEL_PREFIX="megatron_fsdp"
else
echo Unknown package: $PACKAGE
exit 1
fi
# megatron-core: install only the cp310 wheel for the runner's architecture.
# Any other package: hand every matching wheel in dist/ to a single pip call.
if [ "$PACKAGE" = "megatron-core" ]; then
if [[ "$PLATFORM" == "arm64" ]]; then
for file in dist/$WHEEL_PREFIX*cp310*aarch64.whl; do
pip install --no-cache-dir "$file"
done
else
for file in dist/$WHEEL_PREFIX*cp310*x86_64.whl; do
pip install --no-cache-dir "$file"
done
fi
else
pip install --no-cache-dir dist/$WHEEL_PREFIX*.whl
fi
# Presumably removes the checked-out sources so the import below resolves to
# the installed wheel rather than the working tree — TODO confirm.
sudo rm -rf megatron/
RELEASE_NUMBER=$(python -c "import $ROOTPATH; print($ROOTPATH.__version__)")
# Fail the step when the installed version differs from the build step's output.
test "${{ steps.build-wheel.outputs.expected-release-number }}" == "$RELEASE_NUMBER"
# Stores dist/ as an artifact; the name encodes package, platform and dry-run/release.
- name: Upload wheels
uses: actions/upload-artifact@v6
with:
name: wheels-${{ matrix.PACKAGE }}-${{ matrix.PLATFORM }}-${{ inputs.dry-run && 'dry-run' || 'release' }}
path: dist/
# Publishes the artifacts produced by build-and-test-wheels.
publish-wheels:
needs: [build-and-test-wheels]
runs-on: ubuntu-latest
# Skipped unless the caller explicitly sets no-publish to false.
if: inputs.no-publish == false
strategy:
fail-fast: false
# Same package/platform pairs as the build matrix, so each artifact name resolves.
matrix:
include:
- PACKAGE: megatron-core
PLATFORM: arm64
- PACKAGE: megatron-core
PLATFORM: amd64
- PACKAGE: megatron-fsdp
PLATFORM: amd64
env:
PACKAGE: ${{ matrix.PACKAGE }}
steps:
# Fetch the artifact whose name matches the one used by the upload step.
- name: Download wheels
uses: actions/download-artifact@v7
with:
name: wheels-${{ matrix.PACKAGE }}-${{ matrix.PLATFORM }}-${{ inputs.dry-run && 'dry-run' || 'release' }}
path: dist/
merge-multiple: true
# Uploads everything in dist/ with twine. The target index is `pypi` when the
# workflow runs on main or a branch starting with `r`, otherwise `testpypi`.
- name: Publish wheels
  env:
    # twine reads TWINE_USERNAME and TWINE_PASSWORD from the environment, so
    # the credentials do not need to appear on the command line.
    TWINE_USERNAME: __token__
    TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
    TWINE_REPOSITORY: ${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && 'pypi' || 'testpypi' }}
    PLATFORM: ${{ matrix.PLATFORM }}
  run: |
    # Delete sdist for arm64 since we already upload it with amd64.
    # -f keeps the step going when the artifact contains no sdist.
    if [ "$PLATFORM" == "arm64" ]; then
      rm -f dist/*.tar.gz
    fi
    ls -al dist/
    pip install twine
    twine upload \
      --verbose \
      -r "$TWINE_REPOSITORY" \
      dist/*
_release_library perms .github/workflows/_release_library.yml
View raw YAML
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Reusable release workflow: rehearses the wheel build, bumps the version,
# publishes wheels, creates the GitHub release, publishes docs and notifies Slack.
name: "Release"
defaults:
  run:
    # Trace commands and abort on errors, unset variables and failed pipes.
    shell: bash -x -e -u -o pipefail {0}
on:
  workflow_call:
    inputs:
      release-ref:
        required: true
        description: Ref (SHA or branch) to release
        type: string
      dry-run:
        type: boolean
        required: true
        description: Do not publish a wheel and GitHub release.
      version-bump-branch:
        type: string
        required: true
        description: Branch to target for version bump
      create-gh-release:
        required: false
        description: Create a GitHub release
        type: boolean
        default: true
      gh-release-use-changelog-builder:
        required: false
        description: Use release-changelog-builder-action to dynamically build changelog
        type: boolean
        default: true
      gh-release-changelog-config:
        required: false
        description: Path to changelog builder configuration file
        type: string
        default: ".github/workflows/config/changelog-config.json"
      gh-release-from-tag:
        required: false
        description: Starting tag for changelog builder (leave empty for auto-detect)
        type: string
        default: ""
      # Referenced by the "Build Changelog" step as `mode`; it was previously
      # used without being declared, so callers had no way to set it.
      gh-release-changelog-mode:
        required: false
        description: Mode passed to release-changelog-builder-action (PR, COMMIT or HYBRID)
        type: string
        default: "PR"
      publish-docs:
        required: false
        description: Publish documentation to S3 after release
        type: boolean
        default: true
    secrets:
      TWINE_PASSWORD:
        required: true
      SLACK_WEBHOOK:
        required: true
      PAT:
        required: true
      AWS_ASSUME_ROLE_ARN:
        required: true
      AWS_ACCESS_KEY_ID:
        required: true
      AWS_SECRET_ACCESS_KEY:
        required: true
      AKAMAI_HOST:
        required: true
      AKAMAI_CLIENT_TOKEN:
        required: true
      AKAMAI_CLIENT_SECRET:
        required: true
      AKAMAI_ACCESS_TOKEN:
        required: true
      S3_BUCKET_NAME:
        required: true
# Scopes granted to the workflow's GITHUB_TOKEN.
permissions:
contents: write # Write (not read-only) access to repository content. NOTE(review): the pushes and release call below use secrets.PAT — confirm this scope is still required
pull-requests: write # To create PR(s)
jobs:
# Rehearsal: calls the wheel workflow with dry-run and no-publish both true,
# so wheels are built and tested but the publish job is skipped.
build-test-publish-wheels-dry-run:
uses: ./.github/workflows/_build_test_publish_wheel.yml
with:
dry-run: true
ref: ${{ inputs.release-ref }}
no-publish: true
secrets:
TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
# Computes the version being released, bumps package_info.py to the next
# version and merges that bump into the version-bump branch.
bump-next-version:
runs-on: ubuntu-latest
needs: build-test-publish-wheels-dry-run
# Runs unless a needed job failed or the run was cancelled.
# NOTE(review): `success() || !failure()` is equivalent to `!failure()` alone.
if: |
(
success() || !failure()
)
&& !cancelled()
outputs:
# The version found in package_info.py before the bump.
release-version: ${{ steps.bump-version-mcore.outputs.release-version }}
env:
IS_DRY_RUN: ${{ inputs.dry-run }}
steps:
# Full-history checkout into a directory named after the run id, using the
# PAT so later pushes are made with that token.
- name: Checkout repository
uses: actions/checkout@v6
with:
path: ${{ github.run_id }}
token: ${{ secrets.PAT }}
fetch-depth: 0
fetch-tags: true
ref: ${{ inputs.release-ref }}
# Reads the current megatron.core version, emits it as `release-version`, then
# rewrites PATCH / PRE_RELEASE in package_info.py to the next version.
- name: Bump version MCore
id: bump-version-mcore
env:
SRC_DIR: ""
PYPROJECT_NAME: "megatron.core"
run: |
# The workflow-wide shell enables -u; it is turned off for this script.
set +u
cd ${{ github.run_id }}
# Every "." in PYPROJECT_NAME becomes "/": megatron.core -> megatron/core.
PACKAGE_INFO_FILE="$SRC_DIR${PYPROJECT_NAME//.//}/package_info.py"
MAJOR=$(cat $PACKAGE_INFO_FILE | awk '/^MAJOR = /' | awk -F"= " '{print $2}')
MINOR=$(cat $PACKAGE_INFO_FILE | awk '/^MINOR = /' | awk -F"= " '{print $2}')
PATCH=$(cat $PACKAGE_INFO_FILE | awk '/^PATCH = /' | awk -F"= " '{print $2}')
# Quotes of either style are stripped from the pre-release tag.
PRERELEASE=$(cat $PACKAGE_INFO_FILE | awk '/^PRE_RELEASE = /' | awk -F"= " '{print $2}' | tr -d '"' | tr -d "'")
echo "release-version=$MAJOR.$MINOR.$PATCH$PRERELEASE" | tee -a "$GITHUB_OUTPUT"
# With a pre-release tag: keep PATCH and increment the number after "rc" or "a".
# Without one: increment PATCH and leave the tag empty.
if [[ "$PRERELEASE" != "" ]]; then
if [[ "$PRERELEASE" == *rc* ]]; then
NEXT_PATCH=$PATCH
NEXT_PRERELEASE=rc$((${PRERELEASE#rc} + 1))
elif [[ "$PRERELEASE" == *a* ]]; then
NEXT_PATCH=$PATCH
NEXT_PRERELEASE=a$((${PRERELEASE#a} + 1))
else
echo "Unknown pre-release: $PRERELEASE"
exit 1
fi
else
NEXT_PATCH=$((${PATCH} + 1))
NEXT_PRERELEASE=$PRERELEASE
fi
# Replace the whole PATCH / PRE_RELEASE lines; PRE_RELEASE is written with double quotes.
sed -i "/^PATCH/c\PATCH = $NEXT_PATCH" $PACKAGE_INFO_FILE
sed -i "/^PRE_RELEASE/c\PRE_RELEASE = \"$NEXT_PRERELEASE\"" $PACKAGE_INFO_FILE
echo "version=$MAJOR.$MINOR.$NEXT_PATCH$NEXT_PRERELEASE" | tee -a "$GITHUB_OUTPUT"
# Same bump as the MCore step, applied to megatron_fsdp's package_info.py.
# Unlike the MCore step it only emits the next version (`version`), not `release-version`.
- name: Bump version MFSDP
id: bump-version-mfsdp
env:
SRC_DIR: "megatron/core/distributed/fsdp/src/"
PYPROJECT_NAME: "megatron_fsdp"
run: |
# The workflow-wide shell enables -u; it is turned off for this script.
set +u
cd ${{ github.run_id }}
# PYPROJECT_NAME contains no "." here, so the path is SRC_DIR + megatron_fsdp/package_info.py.
PACKAGE_INFO_FILE="$SRC_DIR${PYPROJECT_NAME//.//}/package_info.py"
MAJOR=$(cat $PACKAGE_INFO_FILE | awk '/^MAJOR = /' | awk -F"= " '{print $2}')
MINOR=$(cat $PACKAGE_INFO_FILE | awk '/^MINOR = /' | awk -F"= " '{print $2}')
PATCH=$(cat $PACKAGE_INFO_FILE | awk '/^PATCH = /' | awk -F"= " '{print $2}')
PRERELEASE=$(cat $PACKAGE_INFO_FILE | awk '/^PRE_RELEASE = /' | awk -F"= " '{print $2}' | tr -d '"' | tr -d "'")
# With a pre-release tag: keep PATCH and increment the number after "rc" or "a".
# Without one: increment PATCH and leave the tag empty.
if [[ "$PRERELEASE" != "" ]]; then
if [[ "$PRERELEASE" == *rc* ]]; then
NEXT_PATCH=$PATCH
NEXT_PRERELEASE=rc$((${PRERELEASE#rc} + 1))
elif [[ "$PRERELEASE" == *a* ]]; then
NEXT_PATCH=$PATCH
NEXT_PRERELEASE=a$((${PRERELEASE#a} + 1))
else
echo "Unknown pre-release: $PRERELEASE"
exit 1
fi
else
NEXT_PATCH=$((${PATCH} + 1))
NEXT_PRERELEASE=$PRERELEASE
fi
sed -i "/^PATCH/c\PATCH = $NEXT_PATCH" $PACKAGE_INFO_FILE
sed -i "/^PRE_RELEASE/c\PRE_RELEASE = \"$NEXT_PRERELEASE\"" $PACKAGE_INFO_FILE
echo "version=$MAJOR.$MINOR.$NEXT_PATCH$NEXT_PRERELEASE" | tee -a "$GITHUB_OUTPUT"
# Commits the version bump on a uniquely named deploy-release/* branch, pushes
# it, exports the branch name as TMP_BRANCH and opens a PR against the
# version-bump branch.
- name: Create and push deployment branch
env:
GH_TOKEN: ${{ secrets.PAT }}
run: |
cd ${{ github.run_id }}
TMP_BRANCH="deploy-release/$(uuidgen)"
git config --global user.name "github-actions[bot]"
git config --global user.email "github-actions[bot]@users.noreply.github.com"
git checkout -b "$TMP_BRANCH"
git add -A .
# A commit with nothing staged is tolerated rather than failing the step.
git commit -m "beep boop 🤖: Bumping versions" || echo "No changes to commit"
git push -u origin "$TMP_BRANCH"
# Make the branch name available to the following steps via env.TMP_BRANCH.
echo "TMP_BRANCH=$TMP_BRANCH" | tee -a $GITHUB_ENV
# Create PR to collect app based status checks that run on PRs only
# (like DCO check)
PR_URL=$(gh pr create \
--base ${{ inputs.version-bump-branch }} \
--head $TMP_BRANCH \
--title "beep boop 🤖: Bumping versions" \
--body "This is an automated PR to bump versions.")
# Extract PR number from URL
# NOTE(review): PR_NUMBER is not read again within this step.
PR_NUMBER=$(echo $PR_URL | grep -o '[0-9]*$')
# Polls the head commit of the temporary branch (up to 30 attempts, 10 s apart)
# until all commit statuses are `success` and all check runs have completed
# with a passing conclusion. Fails the step on a failing check run or timeout.
- name: Wait for status checks on tmp branch
  uses: actions/github-script@v8
  id: wait-status
  with:
    github-token: ${{ secrets.PAT }}
    script: |
      const branch = process.env.TMP_BRANCH;
      const owner = context.repo.owner;
      const repo = context.repo.repo;
      // Get latest commit SHA of branch
      const { data: refData } = await github.rest.git.getRef({
        owner,
        repo,
        ref: `heads/${branch}`, // note: no 'refs/' prefix here
      });
      const sha = refData.object.sha;
      console.log(`Polling status for commit SHA: ${sha}`);
      // A completed check run only counts as passed with one of these
      // conclusions; `completed` alone also covers failed and cancelled runs.
      const passingConclusions = ['success', 'neutral', 'skipped'];
      let checksPassed = false;
      const maxAttempts = 30;
      let attempt = 0;
      const delay = ms => new Promise(res => setTimeout(res, ms));
      while (!checksPassed && attempt < maxAttempts) {
        attempt++;
        // Use commit SHA instead of branch ref
        const { data: status } = await github.rest.repos.getCombinedStatusForRef({
          owner,
          repo,
          ref: sha,
        });
        // Request the largest page so that runs beyond the default page
        // size are not silently left out of the decision.
        const { data: checks } = await github.rest.checks.listForRef({
          owner,
          repo,
          ref: sha,
          per_page: 100,
        });
        const allStatuses = status.statuses;
        const allChecks = checks.check_runs;
        if (allStatuses.length === 0 && allChecks.length === 0) {
          console.log(`Attempt ${attempt}: No checks or statuses yet. Waiting...`);
          await delay(10000);
          continue;
        }
        const statusesOk = allStatuses.every(s => s.state === 'success');
        const checksDone = allChecks.every(c => c.status === 'completed');
        if (statusesOk && checksDone) {
          // Everything has finished: stop immediately if any run did not pass
          // instead of reporting success.
          const failedChecks = allChecks.filter(c => !passingConclusions.includes(c.conclusion));
          if (failedChecks.length > 0) {
            const summary = failedChecks.map(c => `${c.name} (${c.conclusion})`).join(', ');
            core.setFailed(`❌ Check run(s) did not pass: ${summary}`);
            return;
          }
          console.log('✅ All checks passed.');
          checksPassed = true;
          break
        }
        console.log(`Attempt ${attempt}: Checks not complete yet. Waiting...`);
        await delay(10000);
      }
      if (!checksPassed) {
        core.setFailed('❌ Status checks did not pass in time');
      }
# Merges the temporary branch into the version-bump branch and pushes it,
# retrying up to three times. In dry-run mode the push command is only printed.
- name: Merge into ${{ inputs.version-bump-branch }}
run: |
cd ${{ github.run_id }}
git config user.name "github-actions[bot]"
git config user.email "github-actions[bot]@users.noreply.github.com"
# The push command is kept as a string so it can be printed (dry-run) or eval'ed.
CMD=$(echo -E 'git push origin ${{ inputs.version-bump-branch }}')
if [[ "$IS_DRY_RUN" == "true" ]]; then
echo "dry-run enabled, would have run: $CMD"
else
# Here we account for potential race conditions from multiple concurrent releases.
# Those can be legit (operating on different packages within the monorepo, for example)
# but the pushes would still be rejected purely because git cannot push
# non-fast-forward updates to the branch. In that case we need to retry.
git fetch origin ${{ inputs.version-bump-branch }}
git checkout ${{ inputs.version-bump-branch }}
git merge ${{ env.TMP_BRANCH }}
for attempt in {1..3}; do
if eval "$CMD"; then
echo "Git push succeeded on attempt $attempt"
break
else
echo "Git push failed on attempt $attempt"
if [[ $attempt -lt 3 ]]; then
# Back off for 1-3 seconds before trying again.
sleep $((RANDOM % 3 + 1))
# We refetch, reset and re-merge. Note resetting because the local
# branch is "contaminated" with previous merge attempt.
git fetch origin ${{ inputs.version-bump-branch }}
git reset --hard origin/${{ inputs.version-bump-branch }}
git merge ${{ env.TMP_BRANCH }}
else
echo "Git push failed after 3 attempts"
exit 1
fi
fi
done
fi
# Removes the temporary branch from the remote. Runs even after an earlier
# failure, but only once TMP_BRANCH has been exported: with an empty name
# `git push -d origin` has nothing to delete and would only add a second,
# misleading failure.
- name: Delete ${{ env.TMP_BRANCH }} branch
  if: always() && env.TMP_BRANCH != ''
  run: |
    cd ${{ github.run_id }}
    git push -d origin ${{ env.TMP_BRANCH }}
# Real build: calls the wheel workflow with dry-run and no-publish both false,
# so the publish job of that workflow runs.
build-test-publish-wheels:
needs: [bump-next-version]
uses: ./.github/workflows/_build_test_publish_wheel.yml
with:
dry-run: false
ref: ${{ inputs.release-ref }}
no-publish: false
secrets:
TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
# Creates the GitHub release for the version reported by bump-next-version.
create-gh-release:
needs: [build-test-publish-wheels, bump-next-version]
runs-on: ubuntu-latest
# Runs when no needed job failed, the run was not cancelled and the caller
# asked for a GitHub release.
# NOTE(review): `success() || !failure()` is equivalent to `!failure()` alone.
if: |
(
success() || !failure()
)
&& inputs.create-gh-release == true
&& !cancelled()
outputs:
is-release-candidate: ${{ steps.version-number.outputs.is-release-candidate }}
env:
REPOSITORY: ${{ github.repository }}
PROJECT_NAME: Megatron Core
VERSION: ${{ needs.bump-next-version.outputs.release-version }}
# The release tag is TAG_PREFIX + "v" + VERSION.
TAG_PREFIX: core_
steps:
# Check out the released ref; falls back to GITHUB_TOKEN when PAT is empty.
- name: Checkout repository
uses: actions/checkout@v6
with:
path: ${{ github.run_id }}
ref: ${{ inputs.release-ref }}
token: ${{ secrets.PAT || secrets.GITHUB_TOKEN }}
# Chooses the starting tag for the changelog: the caller-supplied tag if given,
# otherwise the most recent tag reachable from the checkout (may be empty).
- name: Determine fromTag for changelog
id: determine-from-tag
if: inputs.gh-release-use-changelog-builder == true
run: |
cd ${{ github.run_id }}
# If gh-release-from-tag is provided, use it
if [[ -n "${{ inputs.gh-release-from-tag }}" ]]; then
FROM_TAG="${{ inputs.gh-release-from-tag }}"
echo "Using provided fromTag: $FROM_TAG"
else
# Get the most recent tag
# Errors from git describe are swallowed and yield an empty FROM_TAG.
FROM_TAG=$(git describe --tags --abbrev=0 2>/dev/null || echo "")
if [[ -z "$FROM_TAG" ]]; then
echo "No previous tags found, leaving fromTag empty"
else
echo "Auto-detected most recent tag: $FROM_TAG"
fi
fi
echo "from-tag=$FROM_TAG" >> $GITHUB_OUTPUT
# Generates the changelog between the detected fromTag and the release ref.
# failOnError is "false", so a builder error does not fail the job; the next
# step falls back to other changelog sources when the output is empty.
- name: Build Changelog
id: build-changelog
if: inputs.gh-release-use-changelog-builder == true
uses: mikepenz/release-changelog-builder-action@v6.1.0
env:
GITHUB_TOKEN: ${{ secrets.PAT || secrets.GITHUB_TOKEN }}
with:
# The config path is relative to the checkout directory named after the run id.
configuration: ${{ github.run_id }}/${{ inputs.gh-release-changelog-config }}
owner: ${{ github.repository_owner }}
repo: ${{ github.event.repository.name }}
ignorePreReleases: "false"
failOnError: "false"
fromTag: ${{ steps.determine-from-tag.outputs.from-tag }}
toTag: ${{ inputs.release-ref }}
mode: ${{ inputs.gh-release-changelog-mode }}
# Builds the release payload (tag, name, body, prerelease flag) and POSTs it to
# the GitHub releases API. In dry-run mode the curl command is printed instead.
- name: Create release
id: version-number
env:
SHA: ${{ inputs.release-ref }}
GH_TOKEN: ${{ secrets.PAT }}
IS_DRY_RUN: ${{ inputs.dry-run }}
BUILT_CHANGELOG: ${{ steps.build-changelog.outputs.changelog }}
run: |
cd ${{ github.run_id }}
# A version containing "rc" or "a" is marked as a prerelease.
IS_RELEASE_CANDIDATE=$([[ "$VERSION" == *rc* ]] && echo "true" || echo "false")
IS_ALPHA=$([[ "$VERSION" == *a* ]] && echo "true" || echo "false")
IS_PRERELEASE=$([[ "$IS_RELEASE_CANDIDATE" == "true" || "$IS_ALPHA" == "true" ]] && echo "true" || echo "false")
NAME="NVIDIA $PROJECT_NAME ${VERSION}"
# Use built changelog if available, otherwise fall back to CHANGELOG.md
# (release candidates without a built changelog get a one-line placeholder).
if [[ -n "$BUILT_CHANGELOG" ]]; then
CHANGELOG="$BUILT_CHANGELOG"
elif [[ "$IS_RELEASE_CANDIDATE" == "true" ]]; then
DATE=$(date +"%Y-%m-%d")
CHANGELOG="Prerelease: $NAME ($DATE)"
else
# Take the CHANGELOG.md section under the "## <NAME>" heading, up to the
# next "## " heading, then trim leading blank lines and the final newline.
CHANGELOG=$(awk '/^## '"$NAME"'/{flag=1; next} /^## /{flag=0} flag' CHANGELOG.md)
CHANGELOG=$(echo "$CHANGELOG" | sed '/./,$!d' | sed ':a;N;$!ba;s/\n$//')
fi
echo "is-release-candidate=$IS_RELEASE_CANDIDATE" | tee -a "$GITHUB_OUTPUT"
# jq builds the JSON so the changelog text is escaped correctly.
PAYLOAD=$(jq -nc \
--arg TAG_NAME "${TAG_PREFIX}v${VERSION}" \
--arg CI_COMMIT_BRANCH "$SHA" \
--arg NAME "$NAME" \
--arg BODY "$CHANGELOG" \
--argjson PRERELEASE "$IS_PRERELEASE" \
'{
"tag_name": $TAG_NAME,
"target_commitish": $CI_COMMIT_BRANCH,
"name": $NAME,
"body": $BODY,
"draft": false,
"prerelease": $PRERELEASE,
"generate_release_notes": false
}'
)
echo -E "$PAYLOAD" > payload.txt
# The request is assembled as a string so it can be printed or eval'ed;
# the token and repository name are spliced into it at this point.
CMD=$(echo -E 'curl -L \
-X POST \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer '"$GH_TOKEN"'" \
-H "X-GitHub-Api-Version: 2022-11-28" \
https://api.github.com/repos/'"$REPOSITORY"'/releases \
-d @payload.txt
')
if [[ "$IS_DRY_RUN" == "true" ]]; then
echo -E "$CMD"
else
eval "$CMD"
fi
# Calls the docs release workflow for the released version.
publish-docs:
needs: [bump-next-version, create-gh-release]
uses: ./.github/workflows/release-docs.yml
# Runs when no needed job failed, the run was not cancelled and docs
# publishing was requested.
# NOTE(review): `success() || !failure()` is equivalent to `!failure()` alone.
if: |
(
success() || !failure()
)
&& inputs.publish-docs == true
&& !cancelled()
with:
dry-run: ${{ inputs.dry-run }}
publish-as-latest: true
docs-version-override: ${{ needs.bump-next-version.outputs.release-version }}
build-docs-ref: ${{ inputs.release-ref }}
# All secrets of this workflow are passed through to the called workflow.
secrets: inherit
# Posts a Slack message announcing the release, with links to the GitHub
# release and the (test.)pypi.org project page.
# No `if:` is set, so the default applies: all needed jobs must have succeeded.
notify:
needs: [build-test-publish-wheels, create-gh-release, bump-next-version]
runs-on: ubuntu-latest
steps:
# The Slack action is loaded from a pinned tag of the templates repository.
- name: Checkout
uses: actions/checkout@v6
with:
repository: NVIDIA-NeMo/FW-CI-templates
ref: v0.17.0
path: send-slack-alert
- name: Send Slack alert
uses: ./send-slack-alert/.github/actions/send-slack-alert
env:
MESSAGE: |
${{ inputs.dry-run == true && 'This is a dry-run, nothing actually happened: ' || '' }}We have released `${{ needs.bump-next-version.outputs.release-version }}` of `NVIDIA Megatron Core` 🚀✨🎉
• <https://github.com/${{ github.repository }}/releases/tag/core_v${{ needs.bump-next-version.outputs.release-version }}|GitHub release>
• <https://${{ inputs.dry-run == true && 'test.' || '' }}pypi.org/project/megatron-core/${{ needs.bump-next-version.outputs.release-version }}/|PyPi release>
with:
message: ${{ env.MESSAGE }}
webhook: ${{ secrets.SLACK_WEBHOOK }}
_update_dependencies .github/workflows/_update_dependencies.yml
View raw YAML
# Reusable workflow: refreshes uv.lock for a target branch and lands it via a PR.
name: ~Update dependencies template
on:
workflow_call:
inputs:
target-branch:
required: true
type: string
description: "The target branch to bump"
secrets:
PAT:
required: true
# NOTE(review): SSH_KEY and SSH_PWD are declared but not referenced by the jobs in this file — confirm they are still needed.
SSH_KEY:
required: true
SSH_PWD:
required: true
jobs:
# Derives the dated name of the bump branch shared by the later jobs.
pre-flight:
runs-on: ubuntu-latest
outputs:
# bump-ci-container-<YYYY-MM-DD>-<target branch>
bump-branch: bump-ci-container-${{ steps.ref.outputs.date }}-${{ inputs.target-branch }}
date: ${{ steps.ref.outputs.date }}
steps:
- name: Get date
id: ref
# `date +%F` prints the current date as YYYY-MM-DD.
run: echo "date=$(date +%F)" | tee -a "$GITHUB_OUTPUT"
# Builds the CI container from the target branch, ensures the bump branch
# exists and checks it out, ready for the lock-file upgrade.
update-lockfile:
runs-on: linux-amd64-cpu16
needs: [pre-flight]
env:
SOURCE_BRANCH: ${{ needs.pre-flight.outputs.bump-branch }}
TARGET_BRANCH: ${{ inputs.target-branch }}
steps:
- name: Checkout repo
uses: actions/checkout@v6
with:
ref: ${{ env.TARGET_BRANCH }}
# Creates an empty assets/ directory. NOTE(review): presumably expected by the Docker build — confirm.
- name: Mock test data
run: mkdir -p assets/
# Reads the base image reference from docker/.ngc_version.dev.
- name: Fetch NGC Version
id: ngc-version
run: |
NGC_VERSION=$(cat docker/.ngc_version.dev)
echo "NGC_VERSION=${NGC_VERSION}" | tee -a "$GITHUB_OUTPUT"
# Builds the `main` target of the dev CI Dockerfile and tags it megatron-core.
- name: Build container
env:
GH_TOKEN: ${{ secrets.PAT }}
run: |
docker build -f docker/Dockerfile.ci.dev --build-arg FROM_IMAGE_NAME="${{ steps.ngc-version.outputs.NGC_VERSION }}" --target=main -t megatron-core .
# `git ls-remote --exit-code` fails when the branch is missing on the remote;
# only then is the branch created from the target branch and pushed.
- name: Create bump branch if not exists
run: |
if ! git ls-remote --exit-code origin $SOURCE_BRANCH; then
git checkout -b $SOURCE_BRANCH $TARGET_BRANCH
git push origin $SOURCE_BRANCH
fi
# Second checkout switches the workspace to the bump branch.
- name: Checkout repo
uses: actions/checkout@v6
with:
ref: ${{ env.SOURCE_BRANCH }}
# Runs `uv lock --upgrade` inside the freshly built megatron-core image with
# the workspace mounted, so the updated uv.lock lands in the checkout.
- name: Upgrade lock file
  env:
    GH_TOKEN: ${{ secrets.PAT }}
  run: |
    # `-e GH_TOKEN` without a value forwards the variable from this step's
    # environment, so the secret is never expanded into the script text.
    docker run \
      --rm \
      -v $(pwd):/workspace \
      -w /workspace \
      -e GH_TOKEN \
      megatron-core \
      bash -c 'uv lock --upgrade'
# Hands the regenerated uv.lock to the create-pr job as an artifact.
- name: Upload lock file
uses: actions/upload-artifact@v6
with:
name: lock-file-${{ env.SOURCE_BRANCH }}
path: uv.lock
# Opens (or updates) the bump PR, waits for the required checks and merges it.
create-pr:
needs: [update-lockfile, pre-flight]
runs-on: ubuntu-latest
env:
SOURCE_BRANCH: ${{ needs.pre-flight.outputs.bump-branch }}
TARGET_BRANCH: ${{ inputs.target-branch }}
steps:
# Checked out with the PAT so later git operations use that token.
- name: Checkout code
uses: actions/checkout@v6
with:
token: ${{ secrets.PAT }}
ref: ${{ env.TARGET_BRANCH }}
# If the bump branch exists remotely, rebase the checkout onto it.
# NOTE(review): -S requests signed commits; no signing setup is visible in this job — confirm.
- name: Rebase against ${{ env.SOURCE_BRANCH }}
run: |
if git ls-remote --exit-code origin ${{ env.SOURCE_BRANCH }}; then
git fetch origin ${{ env.SOURCE_BRANCH }}
git rebase -S origin/${{ env.SOURCE_BRANCH }}
fi
# Restores the uv.lock produced by the update-lockfile job into the workspace.
- name: Download lock file
uses: actions/download-artifact@v7
with:
name: lock-file-${{ env.SOURCE_BRANCH }}
# Commits the workspace changes to the bump branch and opens a PR against the
# target branch. The same title is used for the PR and the commit message.
- name: Create Bump PR
uses: peter-evans/create-pull-request@v8
id: create-pull-request
env:
title: "chore(beep boop 🤖): Bump `uv.lock` (${{ inputs.target-branch}}) (${{ needs.pre-flight.outputs.date }})"
with:
branch: ${{ env.SOURCE_BRANCH }}
base: ${{ env.TARGET_BRANCH }}
title: ${{ env.title }}
token: ${{ secrets.PAT }}
body: |
🚀 PR to bump `uv.lock` in `${{ inputs.target-branch }}`.
📝 Please remember the following to-do's before merge:
- [ ] Verify the presubmit CI
🙏 Please merge this PR only if the CI workflow completed successfully.
commit-message: ${{ env.title }}
signoff: true
committer: "github-actions[bot] <github-actions[bot]@users.noreply.github.com>"
# Comments "/ok to test <head sha>" on the PR; a no-op when no PR was created.
- name: Post /ok to test comment
env:
GH_TOKEN: ${{ secrets.PAT }}
run: |
PR_NUMBER="${{ steps.create-pull-request.outputs.pull-request-number }}"
# The action leaves the PR number empty when there was nothing to open.
if [ -z "$PR_NUMBER" ]; then
echo "No PR was created, skipping comment"
exit 0
fi
SHA="${{ steps.create-pull-request.outputs.pull-request-head-sha }}"
gh pr comment "$PR_NUMBER" --body "/ok to test $SHA"
# Polls the PR every 30 s until every status check required by the target
# branch's protection rules has finished. Exits 1 if a required check ended in
# a non-passing state; exits 0 early when there is no PR or no required checks.
# NOTE(review): the loop has no attempt limit of its own.
- name: Wait for CI checks
env:
GH_TOKEN: ${{ secrets.PAT }}
run: |
PR_NUMBER="${{ steps.create-pull-request.outputs.pull-request-number }}"
if [ -z "$PR_NUMBER" ]; then
echo "No PR was created, skipping wait"
exit 0
fi
# Fetch required status checks from branch protection rules
# Tries `.checks[].context` first, then `.contexts[]`; any API error yields an empty list.
REQUIRED_CHECKS=$(gh api \
"repos/${{ github.repository }}/branches/${{ env.TARGET_BRANCH }}/protection/required_status_checks" \
--jq '.checks[].context' 2>/dev/null \
|| gh api \
"repos/${{ github.repository }}/branches/${{ env.TARGET_BRANCH }}/protection/required_status_checks" \
--jq '.contexts[]' 2>/dev/null \
|| true)
if [ -z "$REQUIRED_CHECKS" ]; then
echo "No branch protection rules found for ${{ env.TARGET_BRANCH }}, skipping wait"
exit 0
fi
echo "Required checks from branch protection:"
echo "$REQUIRED_CHECKS"
echo "Waiting for required checks to complete on PR #$PR_NUMBER..."
i=0
# INITIALIZED flips to true once a required check has been seen pending or
# missing; before that, non-passing states are not counted as failures.
INITIALIZED=false
while true; do
i=$((i + 1))
# A failing `gh pr checks` call is treated as "no checks reported yet".
CHECKS_JSON=$(gh pr checks "$PR_NUMBER" --json name,state 2>/dev/null || echo "[]")
ALL_DONE=true
FAILED_CHECKS=""
while IFS= read -r check; do
# State of the check with this exact name, lower-cased; empty when not reported.
CHECK_STATE=$(echo "$CHECKS_JSON" | jq -r --arg name "$check" '.[] | select(.name == $name) | .state // ""' | tr '[:upper:]' '[:lower:]')
case "$CHECK_STATE" in
# Passing states: nothing to record.
*success*|*pass*|*skip*|*neutral*) ;;
# Still running or not reported yet: stop scanning and poll again.
*pending*|*queued*|*progress*|*waiting*|*request*|"")
ALL_DONE=false
INITIALIZED=true
break
;;
# Anything else is a failure, but only after INITIALIZED is set.
*)
if [ "$INITIALIZED" = "true" ]; then
FAILED_CHECKS="${FAILED_CHECKS} - ${check} (${CHECK_STATE})"$'\n'
else
ALL_DONE=false
fi
;;
esac
done <<< "$REQUIRED_CHECKS"
if [ "$ALL_DONE" = "true" ]; then
if [ -n "$FAILED_CHECKS" ]; then
echo "Required check(s) did not pass:"
echo "$FAILED_CHECKS"
exit 1
fi
echo "All required checks passed!"
break
fi
echo "Checks not yet complete (attempt $i), retrying in 30s..."
sleep 30
done
# Squash-merges the bump branch into the target branch locally, pushes the
# result and deletes the bump branch. A no-op when no PR was created.
- name: Merge PR
  env:
    title: "chore(beep boop 🤖): Bump `uv.lock` (${{ env.TARGET_BRANCH}}) (${{ needs.pre-flight.outputs.date }})"
  run: |
    PR_NUMBER="${{ steps.create-pull-request.outputs.pull-request-number }}"
    if [ -z "$PR_NUMBER" ]; then
      echo "No PR was created, skipping merge"
      exit 0
    fi
    git config user.name "github-actions[bot]"
    git config user.email "github-actions[bot]@users.noreply.github.com"
    git fetch origin ${{ env.SOURCE_BRANCH }}
    git fetch origin ${{ env.TARGET_BRANCH }}
    git checkout ${{ env.TARGET_BRANCH }}
    git merge --squash origin/${{ env.SOURCE_BRANCH }}
    # Read the title from the environment. Inlining it through an expression
    # put its backticks inside a double-quoted shell string, where `uv.lock`
    # was executed as a command substitution and dropped from the message.
    git commit -m "$title"
    # Replay the squash commit on top of anything that landed in the meantime.
    git pull --rebase origin ${{ env.TARGET_BRANCH }}
    git push origin ${{ env.TARGET_BRANCH }}
    git push origin --delete ${{ env.SOURCE_BRANCH }}
auto-assign-milestone perms .github/workflows/auto-assign-milestone.yml
View raw YAML
# Assigns an open milestone to PRs that do not have one yet.
name: Auto-assign Milestone to PR
on:
# Triggered by pushes to the mirrored `pull-request/<number>` branches.
push:
branches:
- "pull-request/[0-9]+"
permissions:
contents: read
pull-requests: write
issues: write
jobs:
assign-milestone:
runs-on: ubuntu-latest
# Only runs in the upstream repository, not in forks.
if: github.repository == 'NVIDIA/Megatron-LM'
steps:
# Emits the PR's details as the `pr-info` JSON output used by later steps.
- name: Get PR info
id: get-pr-info
if: startsWith(github.ref, 'refs/heads/pull-request/')
uses: nv-gha-runners/get-pr-info@main
# Sets `has_milestone` to true/false depending on whether the PR already
# carries a milestone.
- name: Check if PR has milestone
id: check_milestone
env:
GH_TOKEN: ${{ secrets.PAT }}
run: |
MILESTONE=$(gh pr view ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }} \
--repo ${{ github.repository }} \
--json milestone \
--jq '.milestone.title')
# An unset milestone may surface as the string "null" or as empty output.
if [ "$MILESTONE" = "null" ] || [ -z "$MILESTONE" ]; then
echo "has_milestone=false" >> $GITHUB_OUTPUT
else
echo "has_milestone=true" >> $GITHUB_OUTPUT
echo "PR already has milestone: $MILESTONE"
fi
# Looks up the open milestone with the latest due date and exposes it as the
# outputs `milestone_found`, `milestone_number` and `milestone_title`.
- name: Get most recent open milestone
  if: steps.check_milestone.outputs.has_milestone == 'false'
  id: get_milestone
  env:
    GH_TOKEN: ${{ secrets.PAT }}
  run: |
    # Fetch the open milestones once (sorted by due date, latest first) and
    # take number and title from the same response, so both always describe
    # the same milestone. `// empty` yields no output for an empty list.
    MILESTONE=$(gh api \
      "repos/${{ github.repository }}/milestones?state=open&sort=due_on&direction=desc" \
      --jq '.[0] // empty')
    MILESTONE_NUMBER=$(jq -r '.number // empty' <<< "$MILESTONE")
    MILESTONE_TITLE=$(jq -r '.title // empty' <<< "$MILESTONE")
    if [ -z "$MILESTONE_NUMBER" ] || [ "$MILESTONE_NUMBER" = "null" ]; then
      echo "No open milestones found"
      echo "milestone_found=false" >> $GITHUB_OUTPUT
    else
      echo "milestone_found=true" >> $GITHUB_OUTPUT
      echo "milestone_number=$MILESTONE_NUMBER" >> $GITHUB_OUTPUT
      echo "milestone_title=$MILESTONE_TITLE" >> $GITHUB_OUTPUT
      echo "Found milestone: $MILESTONE_TITLE (number: $MILESTONE_NUMBER)"
    fi
# Sets the milestone found by the previous step on the PR.
- name: Assign milestone to PR
  if: steps.check_milestone.outputs.has_milestone == 'false' && steps.get_milestone.outputs.milestone_found == 'true'
  env:
    GH_TOKEN: ${{ secrets.PAT }}
    # Passed through the environment rather than expanded into the script:
    # a milestone title containing quotes, `$` or backticks would otherwise
    # break the quoting or be interpreted by the shell.
    PR_NUMBER: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
    MILESTONE_TITLE: ${{ steps.get_milestone.outputs.milestone_title }}
  run: |
    gh pr edit "$PR_NUMBER" \
      --repo ${{ github.repository }} \
      --milestone "$MILESTONE_TITLE"
    echo "✅ Assigned milestone '$MILESTONE_TITLE' to PR #$PR_NUMBER"
auto-reminder-bot .github/workflows/auto-reminder-bot.yml
View raw YAML
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Runs the reminder script daily at 12:00 (cron schedule) or on manual dispatch.
name: Auto Reminder Bot
on:
workflow_dispatch:
schedule:
- cron: "0 12 * * *"
jobs:
run-script:
name: Run Auto Reminder Bot
runs-on: ubuntu-latest
# Only runs in the upstream repository, not in forks.
if: github.repository == 'NVIDIA/Megatron-LM'
steps:
- name: Check out repository code
uses: actions/checkout@v6
- name: Set up Python
uses: actions/setup-python@v6
with:
# Quoted so the version stays the string "3.10" instead of the number 3.1.
python-version: "3.10"
# Libraries installed for the reminder script.
- name: Install dependencies
run: |
pip install --no-cache-dir PyGithub slack-sdk
# Runs the reminder script with its credentials supplied via the environment.
- name: Run Auto Reminder Bot
  env:
    # Secrets are passed as environment variables instead of being expanded
    # into the script text, so their content is never parsed by the shell.
    SLACK_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
    SLACK_WEBHOOK_URL: ${{ secrets.SLACK_REVIEW_REMINDER_CHANNEL_WEBHOOK }}
    GH_TOKEN: ${{ secrets.PAT }}
  run: |
    python tests/test_utils/python_scripts/auto_reminder_github.py
auto-swap-labels perms .github/workflows/auto-swap-labels.yml
View raw YAML
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Swaps PR labels when a PR targeting main becomes ready or is updated, and
# after a successful "Review Trigger" workflow run.
name: Auto Swap Labels
on:
pull_request_target:
types: [ready_for_review, synchronize]
branches:
- main
workflow_run:
workflows: ["Review Trigger"]
types: [completed]
permissions:
pull-requests: write
contents: read
actions: read
jobs:
check-approval:
runs-on: ubuntu-latest
# Upstream repository only, and either a non-draft PR against main or a
# successful triggering workflow run.
if: >-
github.repository == 'NVIDIA/Megatron-LM' && (
(github.event_name == 'pull_request_target' &&
github.event.pull_request.base.ref == 'main' &&
!github.event.pull_request.draft) ||
(github.event_name == 'workflow_run' &&
github.event.workflow_run.conclusion == 'success')
)
steps:
# Downloads the `pr-number` artifact of the triggering run. A missing
# artifact is tolerated here and handled by the next step.
- name: Get PR number from workflow_run
id: get-pr
if: github.event_name == 'workflow_run'
continue-on-error: true
uses: actions/download-artifact@v4
with:
name: pr-number
path: pr-number
github-token: ${{ github.token }}
run-id: ${{ github.event.workflow_run.id }}
# Resolves the PR number into the `number` output: from the downloaded
# artifact for workflow_run events, from the event payload otherwise.
# The output stays unset when no artifact was found, which skips later steps.
- name: Set PR number
  id: pr
  env:
    EVENT_NAME: ${{ github.event_name }}
    ARTIFACT_OUTCOME: ${{ steps.get-pr.outcome }}
    EVENT_PR_NUMBER: ${{ github.event.pull_request.number }}
  run: |
    if [ "$EVENT_NAME" = "workflow_run" ]; then
      if [ "$ARTIFACT_OUTCOME" != "success" ]; then
        echo "No approval artifact found — review was not an approval. Skipping."
        exit 0
      fi
      # The artifact comes from another workflow run, so its content is
      # treated as untrusted: only a plain number may reach GITHUB_OUTPUT,
      # where extra lines could otherwise define additional outputs.
      NUMBER=$(cat pr-number/number)
      if ! [[ "$NUMBER" =~ ^[0-9]+$ ]]; then
        echo "Artifact does not contain a valid PR number. Aborting."
        exit 1
      fi
      echo "number=$NUMBER" >> "$GITHUB_OUTPUT"
    else
      echo "number=$EVENT_PR_NUMBER" >> "$GITHUB_OUTPUT"
    fi
# The remaining steps only run when a PR number was resolved.
- name: Check out repository code
if: steps.pr.outputs.number
uses: actions/checkout@v4
- name: Set up Python
if: steps.pr.outputs.number
uses: actions/setup-python@v6
with:
# Quoted so the version stays the string "3.10" instead of the number 3.1.
python-version: "3.10"
# Libraries installed for the label-swap script.
- name: Install dependencies
if: steps.pr.outputs.number
run: |
pip install --no-cache-dir PyGithub slack-sdk
# Runs the label-swap script for the resolved PR.
- name: Run Auto Swap Labels
  if: steps.pr.outputs.number
  env:
    # Passed as environment variables instead of being expanded into the
    # script text: the PR number can originate from another run's artifact,
    # and this step holds a personal access token.
    GH_TOKEN: ${{ secrets.PAT }}
    PR_NUMBER: ${{ steps.pr.outputs.number }}
  run: |
    python tests/test_utils/python_scripts/swap_pr_labels.py
auto-update-copy-pr-bot .github/workflows/auto-update-copy-pr-bot.yml
View raw YAML
# Regenerates the trustee list in .github/copy-pr-bot.yaml daily at 00:00
# (cron schedule) or on manual dispatch.
name: Auto Update Copy PR Bot
on:
workflow_dispatch:
schedule:
- cron: "0 0 * * *"
jobs:
auto-update-copy-pr-bot:
runs-on: ubuntu-latest
# Only runs in the upstream repository, not in forks.
if: github.repository == 'NVIDIA/Megatron-LM'
steps:
# Always operates on main, checked out with the PAT.
- name: Checkout code
uses: actions/checkout@v6
with:
token: ${{ secrets.PAT }}
ref: main
# Collects the members of the mcore-engineers and mcore-reviewers teams
# (including child teams) and writes them, de-duplicated, to the
# `trustees_override` key of .github/copy-pr-bot.yaml.
- name: Fetch list of members in mcore-reviewers team
shell: bash -euxo pipefail {0}
env:
GH_TOKEN: ${{ secrets.PAT }}
run: |
#!/bin/bash
# get_members ORG TEAM FILE: appends the team's member logins to FILE,
# recurses into each child team, then prints the accumulated FILE.
# Recursive calls print the file too, so output repeats; callers pipe it
# through `sort -u`.
get_members() {
local org=$1 team=$2 seen_file=$3
gh api "/orgs/$org/teams/$team/members" --paginate --jq '.[].login' >> "$seen_file"
gh api "/orgs/$org/teams/$team/teams" --paginate --jq '.[].slug' | while read -r child; do
get_members "$org" "$child" "$seen_file"
done
cat "$seen_file"
}
tmp=$(mktemp)
# Start final.txt with a single blank line; blank entries are filtered out below.
echo "" > final.txt
get_members "NVIDIA" "mcore-engineers" "$tmp" | sort -u >> final.txt && rm "$tmp"
tmp=$(mktemp)
get_members "NVIDIA" "mcore-reviewers" "$tmp" | sort -u >> final.txt && rm "$tmp"
# Print the resulting list, then export it as compact JSON for yq.
cat final.txt | jq -sR 'split("\n") | map(select(. != "")) | flatten | unique'
export TRUSTEES=$(cat final.txt | jq -csR 'split("\n") | map(select(. != "")) | flatten | unique')
# Write to a temporary file first, then replace the original.
yq '.trustees_override = env(TRUSTEES)' .github/copy-pr-bot.yaml | yq -o yaml > .github/copy-pr-bot.yaml.new
mv .github/copy-pr-bot.yaml.new .github/copy-pr-bot.yaml
- name: Commit changes
env:
GH_TOKEN: ${{ secrets.PAT }}
run: |
git remote set-url origin https://x-access-token:${GH_TOKEN}@github.com/NVIDIA/Megatron-LM.git
git config --global user.name "GitHub Actions"
git config --global user.email "github-actions[bot]@users.noreply.github.com"
git add .github/copy-pr-bot.yaml
if git diff --cached --exit-code --quiet; then
echo "No changes to commit. Exiting gracefully."
exit 0
fi
git commit -m "Update copy-pr-bot.yaml [skip ci]"
git push -u origin main
build-docs .github/workflows/build-docs.yml
View raw YAML
# Copyright (c) 2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Builds the documentation on pushes to main, `pull-request/<n>` and
# `deploy-release/*` branches, then reports one aggregate result.
name: Build docs
on:
  push:
    branches:
      - main
      - "pull-request/[0-9]+"
      - "deploy-release/*"
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name || 'main' }}-${{ github.event_name }}
  cancel-in-progress: true
jobs:
  pre-flight:
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.73.2
  # Skipped when pre-flight reports a deployment workflow.
  build-docs:
    needs: [pre-flight]
    if: needs.pre-flight.outputs.is_deployment_workflow != 'true'
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_docs.yml@v0.80.2
  # Runs even when earlier jobs failed or were skipped (but not when the run was
  # cancelled) and fails if any completed job did not conclude with "success",
  # unless skipping is explicitly allowed.
  build-docs-summary:
    needs: [pre-flight, build-docs]
    if: |
      (
        needs.pre-flight.outputs.is_deployment_workflow == 'true'
        || always()
      )
      && !cancelled()
    runs-on: ubuntu-latest
    steps:
      - name: Get workflow result
        id: result
        shell: bash -x -e -u -o pipefail {0}
        env:
          GH_TOKEN: ${{ github.token }}
          RUN_ID: ${{ github.run_id }}
          SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' }}
        run: |
          # Count completed jobs of this run whose conclusion is not "success".
          # If the query itself fails, fall back to 0 by assignment (the previous
          # `|| echo 0` only printed a zero and left the variable empty).
          FAILED_JOBS=$(gh run view "$RUN_ID" --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || FAILED_JOBS=0
          if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
            echo "✅ All previous jobs completed successfully"
            exit 0
          else
            echo "❌ Found $FAILED_JOBS failed job(s)"
            # Show which jobs failed
            gh run view "$RUN_ID" --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
            exit 1
          fi
build-test-publish-wheel perms .github/workflows/build-test-publish-wheel.yml
View raw YAML
# Copyright (c) 2019-2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Builds and tests the wheels through the local reusable workflow with
# `no-publish: true`, then summarises the result of the build-and-test-wheels jobs.
name: Build, test, and publish a PyPi wheel (to testpypi).
on:
  push:
    branches:
      - main
      - "pull-request/[0-9]+"
      - "deploy-release/*"
  merge_group:
    types: [checks_requested]
defaults:
  run:
    shell: bash -x -e -u -o pipefail {0}
permissions:
  id-token: write
  contents: read
jobs:
  pre-flight:
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.73.2
    if: github.repository == 'NVIDIA/Megatron-LM'
  build-test-publish-wheels:
    needs: [pre-flight]
    uses: ./.github/workflows/_build_test_publish_wheel.yml
    with:
      no-publish: true
    secrets:
      # Real PyPI token on main and on branches starting with "r", test token otherwise.
      TWINE_PASSWORD: ${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && secrets.SVC_PYPI_TOKEN || secrets.SVC_PYPI_TEST_TOKEN }}
  build-test-publish-wheel-summary:
    needs: [pre-flight, build-test-publish-wheels]
    if: |
      (
        needs.pre-flight.outputs.docs_only == 'true'
        || needs.pre-flight.outputs.is_merge_group == 'true'
        || needs.pre-flight.outputs.is_deployment_workflow == 'true'
        || always()
      )
      && github.repository == 'NVIDIA/Megatron-LM'
      && !cancelled()
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v6
      - name: Result
        env:
          GH_TOKEN: ${{ github.token }}
          GITHUB_RUN_ID: ${{ github.run_id }}
          SKIPPING_IS_ALLOWED: false
        run: |
          # Count completed build-and-test-wheels jobs that did not succeed.
          # If the query itself fails, fall back to 0 by assignment (the previous
          # `|| echo 0` only printed a zero and left the variable empty).
          FAILED_JOBS=$(gh run view "$GITHUB_RUN_ID" --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success" and (.name | test("build-and-test-wheels")))] | length') || FAILED_JOBS=0
          if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
            echo "✅ All build-and-test-wheels jobs completed successfully"
            exit 0
          else
            echo "❌ Found $FAILED_JOBS failed build-and-test-wheels job(s)"
            # Show which jobs failed
            gh run view "$GITHUB_RUN_ID" --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success" and (.name | test("build-and-test-wheels"))) | .name'
            exit 1
          fi
cherry-pick-release-commit .github/workflows/cherry-pick-release-commit.yml
View raw YAML
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# On every push to `main`, delegates to the shared `_cherry_pick.yml` reusable
# workflow (pinned at v0.65.9). The job only runs in NVIDIA/Megatron-LM.
name: Create PR to main with cherry-pick from release
on:
push:
branches:
- main
jobs:
cherry-pick:
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cherry_pick.yml@v0.65.9
if: github.repository == 'NVIDIA/Megatron-LM'
with:
# NOTE(review): `(*dev_)?` puts a `*` quantifier directly after `(`; confirm the
# regex dialect used by the template accepts this and matches the intended branches.
target-branches-pattern: 'core_(*dev_)?r[0-9]+\.[0-9]+\.[0-9]+'
secrets:
PAT: ${{ secrets.PAT }}
# The template's SLACK_WEBHOOK_ADMIN secret is fed from SLACK_TEAM_GROUP_ID.
SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_TEAM_GROUP_ID }}
SLACK_WEBHOOK: ${{ secrets.SLACK_CI_CHANNEL_WEBHOOK }}
cicd-approve-test-queue matrix .github/workflows/cicd-approve-test-queue.yml
View raw YAML
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Queue manager for CI runs: every 5 minutes (or on manual dispatch) one job per
# (branch, contributor_type) matrix cell runs in NVIDIA/Megatron-LM.
name: Approve Test Queue
on:
schedule:
- cron: "*/5 * * * *" # Runs every 5 minutes
workflow_dispatch: # Allows manual triggering
jobs:
approve-queue:
runs-on: ubuntu-latest
environment: main
if: github.repository == 'NVIDIA/Megatron-LM'
# 3 branches x 2 contributor types = 6 matrix jobs.
strategy:
matrix:
branch: [main, dev, others]
contributor_type: [internal, external]
steps:
- name: Checkout repository
uses: actions/checkout@v6
- name: Set up Python
uses: actions/setup-python@v6
with:
python-version: "3.12"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install requests
# Downloads users_sso.json from release v0.1.0 of NVIDIA-GitHub-Management/github-audits;
# if the download fails, an empty JSON object is written so the file always exists.
- name: Download SSO users list
run: |
gh release download v0.1.0 \
--repo NVIDIA-GitHub-Management/github-audits \
--pattern users_sso.json \
--output users_sso.json || echo '{}' > users_sso.json
env:
GH_TOKEN: ${{ secrets.NVIDIA_MANAGEMENT_ORG_PAT }}
# Approves waiting "CICD Megatron-LM" runs for this matrix cell (target branch x
# contributor type), oldest first, until the concurrency limit for the cell is reached.
- name: Approve waiting deployments
  env:
    GITHUB_TOKEN: ${{ secrets.PAT }}
    MAX_CONCURRENCY: ${{ vars.MAX_CONCURRENCY || 1 }}
    MAX_CONCURRENCY_EXTERNAL: ${{ vars.MAX_CONCURRENCY_EXTERNAL || 1 }}
    CONTRIBUTOR_TYPE: ${{ matrix.contributor_type }}
    SSO_USERS_FILE: users_sso.json
    PYTHONUNBUFFERED: 1
  shell: python
  run: |
    import os
    import json
    import re
    import sys
    import requests

    # GitHub API configuration
    GITHUB_TOKEN = os.environ["GITHUB_TOKEN"]
    # The job only runs when github.repository == 'NVIDIA/Megatron-LM', so this
    # resolves to the same base URL that used to be hard-coded.
    REPO = os.environ["GITHUB_REPOSITORY"]
    CONTRIBUTOR_TYPE = os.environ["CONTRIBUTOR_TYPE"]
    TARGET_BRANCH = "${{ matrix.branch }}"
    WORKFLOW_NAME = "CICD Megatron-LM"
    PR_BRANCH_PATTERN = re.compile(r"pull-request/(\d+)")

    if CONTRIBUTOR_TYPE == "external":
        # Global limit across all branches — no division needed since we count globally.
        MAX_CONCURRENCY = int(os.environ["MAX_CONCURRENCY_EXTERNAL"])
    else:
        # Internal queues get half of MAX_CONCURRENCY (integer division).
        MAX_CONCURRENCY = int(os.environ["MAX_CONCURRENCY"]) // 2

    API_BASE = f"https://api.github.com/repos/{REPO}"

    # Load SSO users for internal/external classification
    with open(os.environ["SSO_USERS_FILE"]) as f:
        sso_users = json.load(f)

    # Headers for GitHub API
    headers = {
        "Authorization": f"token {GITHUB_TOKEN}",
        "Accept": "application/vnd.github.v3+json",
        "X-GitHub-Api-Version": "2022-11-28",
    }

    # PR payloads already fetched in this run, keyed by PR number, so the same PR
    # is not requested again for every list (queued / in progress / waiting).
    pr_cache = {}


    def make_request(endpoint, method="GET", data=None):
        """Call the GitHub API for this repository.

        Returns the parsed JSON body, or None when the request fails (the error
        is printed, not raised).
        """
        url = f"{API_BASE}/{endpoint}"
        try:
            if method == "GET":
                response = requests.get(url, headers=headers)
            else:
                response = requests.post(url, headers=headers, json=data)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            print(f"Error making request to {endpoint}: {str(e)}")
            if hasattr(e.response, 'text'):
                print(f"Response: {e.response.text}")
            return None


    def fetch_runs(status):
        """Return the workflow runs in the given status; abort the job if the listing fails."""
        response = make_request(f"actions/runs?status={status}")
        if response is None:
            # Without this list the concurrency count would be wrong, so stop here
            # with a clear message instead of crashing on None.
            print(f"Could not list workflow runs with status '{status}'")
            sys.exit(1)
        return response.get("workflow_runs", [])


    def is_internal_contributor(pr_info):
        """Return True if the PR author is a member of NVIDIA or NVIDIA-NeMo org (is_org_member)."""
        login = pr_info.get("user", {}).get("login", "")
        org_roles = sso_users.get(login, {}).get("org_roles", [])
        return any(role in ("NVIDIA:Member", "NVIDIA-NeMo:Member") for role in org_roles)


    def get_pr_base_branch(workflow_run):
        """
        Return (base_branch, pr_info) for the PR behind a workflow run.

        The PR number is taken from a head branch like 'pull-request/1913' and the
        PR is fetched from the API. Returns (None, None) when the run is not on
        such a branch or the PR cannot be fetched.
        """
        # head_branch can be present but null in the API payload; normalise to "".
        head_branch = workflow_run.get("head_branch") or ""
        print(head_branch)
        match = PR_BRANCH_PATTERN.match(head_branch)
        if not match:
            return None, None  # Not a PR branch pattern
        pr_number = int(match.group(1))
        pr_info = pr_cache.get(pr_number)
        if pr_info is None:
            pr_info = make_request(f"pulls/{pr_number}")
            if not pr_info:
                print(f"Failed to fetch PR #{pr_number}")
                return None, None
            pr_cache[pr_number] = pr_info
        base_branch = pr_info.get("base", {}).get("ref")
        return base_branch, pr_info


    def matches_contributor(workflow_run, contributor_type):
        """Return True if the workflow run matches the contributor type (ignores branch)."""
        _, pr_info = get_pr_base_branch(workflow_run)
        if pr_info is None:
            return False
        internal = is_internal_contributor(pr_info)
        return (contributor_type == "internal") == internal


    def matches_queue(workflow_run, target_branch, contributor_type):
        """
        Return True if the workflow run belongs to this queue cell:
        matching target branch AND matching contributor type (internal/external).
        The "others" cell takes every base branch that is neither main nor dev.
        """
        base_branch, pr_info = get_pr_base_branch(workflow_run)
        if base_branch is None:
            return False
        branch_match = (
            (base_branch == target_branch) or
            (base_branch != "main" and base_branch != "dev" and target_branch == "others")
        )
        if not branch_match:
            return False
        internal = is_internal_contributor(pr_info)
        contributor_match = (contributor_type == "internal") == internal
        if contributor_match:
            pr_number = PR_BRANCH_PATTERN.match(workflow_run.get("head_branch") or "").group(1)
            print(f"PR #{pr_number} targets {target_branch}, contributor_type={contributor_type} (internal={internal})")
        return contributor_match


    def counts_toward_limit(run):
        """Decide whether a queued / running run occupies a slot of this cell."""
        if run["name"] != WORKFLOW_NAME:
            return False
        if CONTRIBUTOR_TYPE == "external":
            # External contributors share one global limit across ALL branches.
            return matches_contributor(run, CONTRIBUTOR_TYPE)
        # Internal contributors are limited per target branch.
        return matches_queue(run, TARGET_BRANCH, CONTRIBUTOR_TYPE)


    # Get current running and queued workflows
    print("Fetching workflow runs...")
    queued_workflow_runs = [run for run in fetch_runs("queued") if counts_toward_limit(run)]
    in_progress_workflow_runs = [run for run in fetch_runs("in_progress") if counts_toward_limit(run)]

    # Count running and queued workflows
    queued_workflows = len(queued_workflow_runs)
    in_progress_workflows = len(in_progress_workflow_runs)
    total_workflows = queued_workflows + in_progress_workflows
    print(f"Current queued workflows (PRs targeting {TARGET_BRANCH}, {CONTRIBUTOR_TYPE}): {queued_workflows}")
    print(f"Current running workflows (PRs targeting {TARGET_BRANCH}, {CONTRIBUTOR_TYPE}): {in_progress_workflows}")
    print(f"Total workflows: {total_workflows}")
    print(f"Max concurrency: {MAX_CONCURRENCY}")
    if total_workflows >= MAX_CONCURRENCY:
        print("Maximum concurrency reached, no new approvals will be made")
        sys.exit(0)

    # Get waiting CI workflows for test environment
    print("Fetching deployments...")
    pending_workflows = fetch_runs("waiting")
    print("Pending workflows:", len(pending_workflows))
    pending_workflows = [run for run in pending_workflows
                         if run["name"] == WORKFLOW_NAME and matches_queue(run, TARGET_BRANCH, CONTRIBUTOR_TYPE)]

    # Sort deployments by creation date (oldest first)
    print("Sorting workflows...")
    pending_workflows = sorted(pending_workflows, key=lambda x: x["created_at"])

    # Process each deployment
    print(f"Processing {len(pending_workflows)} pending workflows...")
    for workflow in pending_workflows:
        if total_workflows >= MAX_CONCURRENCY:
            print("Maximum concurrency reached, stopping approvals")
            break
        workflow_id = workflow["id"]
        workflow_name = workflow["display_title"]
        print(f"Approving workflow {workflow_name} with Run Id: {workflow_id}")
        deployment_url = f"actions/runs/{workflow_id}/pending_deployments"
        deployments = make_request(deployment_url)
        if deployments is None:
            print(f"Failed to list pending deployments for run {workflow_id}")
            sys.exit(1)
        if not deployments:
            # Nothing left to approve for this run (e.g. it was approved or
            # cancelled since the listing); move on instead of indexing an empty list.
            print(f"Run {workflow_id} has no pending deployments, skipping")
            continue
        environment_id = deployments[0]["environment"]["id"]
        # Approve the deployment
        status_data = {
            "environment_ids": [environment_id],
            "state": "approved",
            "comment": "Automatically approved by queue manager"
        }
        result = make_request(deployment_url, method="POST", data=status_data)
        if result:
            total_workflows += 1
        else:
            # Pending-deployment entries carry no top-level "id", so report the
            # run and environment ids instead.
            print(f"Failed to approve deployment for run {workflow_id} (environment {environment_id})")
            sys.exit(1)
# Runs only when `approve-queue` failed: posts a Slack message that links to this
# workflow run and mentions the admin subteam.
notify:
if: failure()
runs-on: ubuntu-latest
needs: [approve-queue]
steps:
- name: Notify
env:
SLACK_WEBHOOK: ${{ secrets.SLACK_CI_CHANNEL_WEBHOOK }}
# Slack subteam-mention markup wrapped around the group id secret.
SLACK_WEBHOOK_ADMIN: <!subteam^${{ secrets.SLACK_TEAM_GROUP_ID }}>
GITHUB_RUN_ID: ${{ github.run_id }}
GITHUB_REPOSITORY: ${{ github.repository }}
run: |
curl -X POST \
-H 'Content-type: application/json' \
--data "{\"text\":\":robot_joy: <https://github.com/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}|Test-queue-approval-bot workflow> failed. Please review manually.\n\ncc ${SLACK_WEBHOOK_ADMIN}\"}" \
$SLACK_WEBHOOK
cicd-main matrix perms .github/workflows/cicd-main.yml
View raw YAML
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Main CI pipeline. Triggered by a daily cron, pushes to `pull-request/<n>` and
# `deploy-release/*` branches, merge-group checks, and manual dispatch.
name: CICD Megatron-LM
on:
schedule:
- cron: 0 0 * * *
push:
branches:
- "pull-request/[0-9]+"
- "deploy-release/*"
merge_group:
types: [checks_requested]
workflow_dispatch:
# One concurrency group per workflow and head ref / merge-group ref / ref,
# with cancel-in-progress enabled.
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.event.merge_group.head_ref || github.ref }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
# Container registries; the job that computes the build matrix labels the first
# "aws" and the second "gcp".
env:
container-registry: 766267172432.dkr.ecr.us-east-1.amazonaws.com
container-registry-gb200: us-east4-docker.pkg.dev/nv-projdgxchipp-20260113193621/megatron-lm
jobs:
# Classifies the PR author and selects runners from the result: when `is_maintainer`
# is 'true' the outputs are 'nvidia-ci-aws-gpu-x8' and 'nvidia-ci-gcp-gpu-x4',
# otherwise 'nvidia-ci-aws-gpu-x8-ephemeral' and 'ubuntu-latest'.
is-not-external-contributor:
runs-on: ubuntu-latest
if: github.repository == 'NVIDIA/Megatron-LM'
outputs:
is_external_contributor: ${{ github.event.pull_request.user.type == 'User' }}
is_maintainer: ${{ steps.check-membership.outputs.is_maintainer }}
selected_runner: ${{ steps.check-membership.outputs.is_maintainer == 'true' && 'nvidia-ci-aws-gpu-x8' || 'nvidia-ci-aws-gpu-x8-ephemeral' }}
selected_runner_gb200: ${{ steps.check-membership.outputs.is_maintainer == 'true' && 'nvidia-ci-gcp-gpu-x4' || 'ubuntu-latest' }}
permissions:
issues: write
pull-requests: write
env:
GITHUB_TOKEN: ${{ secrets.PAT }}
REPO: ${{ github.repository }}
DISABLE_EXTERNAL_CONTRIBUTOR: ${{ vars.DISABLE_EXTERNAL_CONTRIBUTOR }}
steps:
- name: Checkout repository
uses: actions/checkout@v6
with:
token: ${{ env.GITHUB_TOKEN }}
# Only runs for pushes to `pull-request/*` branches; the later steps fall back to
# an empty object (`|| '{}'`) when this step was skipped.
- name: Get PR info
id: get-pr-info
if: startsWith(github.ref, 'refs/heads/pull-request/') && github.event_name == 'push'
uses: nv-gha-runners/get-pr-info@main
- name: Check NVIDIA SSO membership
id: check-sso
uses: ./.github/actions/check-nvidia-sso-membership
with:
username: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').user.login }}
github_token: ${{ secrets.NVIDIA_MANAGEMENT_ORG_PAT }}
sso_users_filename: ${{ vars.SSO_USERS_FILENAME }}
# Sets `is_maintainer`:
# - scheduled runs, the main branch and merge groups are always 'true';
# - otherwise the SSO check result is used;
# - when DISABLE_EXTERNAL_CONTRIBUTOR is 'true' and the SSO check did not report
#   membership, an HTTP 204 from the repo-collaborator, NVIDIA-NeMo-member or
#   NVIDIA-member endpoint counts as member; if none returns 204 the step exits 1.
- name: Set maintainer status
id: check-membership
env:
IS_MAIN_BRANCH: ${{ github.ref == 'refs/heads/main' }}
IS_MERGE_GROUP: ${{ github.event_name == 'merge_group' }}
SCHEDULED_JOB: ${{ github.event_name == 'schedule' }}
run: |
# Skip SSO check for scheduled jobs, main branch, or merge groups
if [ "${{ env.SCHEDULED_JOB }}" == "true" ] || [ "${IS_MAIN_BRANCH}" == "true" ] || [ "${IS_MERGE_GROUP}" == "true" ]; then
echo "is_maintainer=true" | tee -a $GITHUB_OUTPUT
exit 0
fi
# Use SSO membership check result
IS_MEMBER="${{ steps.check-sso.outputs.is_member }}"
# If external contributor is disabled, check if user is a repo collaborator or an org collaborator to NVIDIA or NVIDIA-NeMo
if [ "${{ env.DISABLE_EXTERNAL_CONTRIBUTOR }}" == "true" ] && [ "${{ steps.check-sso.outputs.is_member }}" != "true" ]; then
PR_AUTHOR=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').user.login }}
echo "Checking if $PR_AUTHOR is a repo collaborator..."
API_URL="https://api.github.com/repos/$REPO/collaborators/$PR_AUTHOR"
REPO_MEMBERSHIP_RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -L \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer $GITHUB_TOKEN" \
-H "X-GitHub-Api-Version: 2022-11-28" \
$API_URL)
echo "Checking if $PR_AUTHOR is an org collaborator to NVIDIA-NeMo..."
API_URL="https://api.github.com/orgs/NVIDIA-NeMo/members/$PR_AUTHOR"
ORG_NVIDIA_NEMO_MEMBERSHIP_RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -L \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer $GITHUB_TOKEN" \
-H "X-GitHub-Api-Version: 2022-11-28" \
$API_URL)
echo "Checking if $PR_AUTHOR is an org collaborator to NVIDIA..."
API_URL="https://api.github.com/orgs/NVIDIA/members/$PR_AUTHOR"
ORG_NVIDIA_MEMBERSHIP_RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -L \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer $GITHUB_TOKEN" \
-H "X-GitHub-Api-Version: 2022-11-28" \
$API_URL)
if [ "$REPO_MEMBERSHIP_RESPONSE" -eq 204 ] || [ "$ORG_NVIDIA_NEMO_MEMBERSHIP_RESPONSE" -eq 204 ] || [ "$ORG_NVIDIA_MEMBERSHIP_RESPONSE" -eq 204 ]; then
IS_MEMBER="true"
else
exit 1
fi
fi
# Use SSO membership check result
if [ "$IS_MEMBER" == "true" ]; then
echo "is_maintainer=true" | tee -a $GITHUB_OUTPUT
else
echo "is_maintainer=false" | tee -a $GITHUB_OUTPUT
fi
# Calls the shared pre-flight reusable workflow (pinned at v0.73.2) after the
# contributor check; other jobs in this workflow read its outputs
# (e.g. is_ci_workload, is_merge_group, docs_only, is_deployment_workflow).
pre-flight:
needs: [is-not-external-contributor]
if: github.repository == 'NVIDIA/Megatron-LM'
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.73.2
# Derives the test configuration (scope, repetitions, lightweight mode, container
# flavour, MBridge suite) from the PR labels and the pre-flight outputs, publishes
# it as job outputs, and writes a human-readable summary.
configure:
  runs-on: ubuntu-latest
  needs: [pre-flight]
  if: github.repository == 'NVIDIA/Megatron-LM'
  outputs:
    scope: ${{ steps.configure.outputs.scope }}
    n_repeat: ${{ steps.configure.outputs.n_repeat }}
    lightweight: ${{ steps.configure.outputs.lightweight }}
    lts: ${{ steps.configure.outputs.lts }}
    mbridge_suite: ${{ steps.configure.outputs.mbridge_suite }}
    dev: ${{ steps.configure.outputs.dev }}
  steps:
    - name: Get PR info
      id: get-pr-info
      if: startsWith(github.ref, 'refs/heads/pull-request/') && github.event_name == 'push'
      uses: nv-gha-runners/get-pr-info@main
    - name: Configure
      id: configure
      shell: bash -x -e -u -o pipefail {0}
      env:
        GH_TOKEN: ${{ secrets.PAT }}
        IS_CI_WORKLOAD: ${{ needs.pre-flight.outputs.is_ci_workload }}
        IS_MERGE_GROUP: ${{ needs.pre-flight.outputs.is_merge_group }}
      run: |
        PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
        # Fetch all labels in a single API call; fall back to empty list if no PR
        LABELS=$(gh pr view $PR_NUMBER --repo ${{ github.repository }} --json labels --jq '[.labels[].name]') || LABELS='[]'
        HAS_RUN_TESTS=$(echo "$LABELS" | jq 'any(. == "Run tests")')
        HAS_RUN_FUNCTIONAL=$(echo "$LABELS" | jq 'any(. == "Run functional tests")')
        HAS_LTS=$(echo "$LABELS" | jq 'any(. == "container::lts")')
        HAS_MBRIDGE=$(echo "$LABELS" | jq 'any(. == "Run MBridge tests")')
        # Scheduled/CI workloads have no PR — treat as "Run functional tests"
        [ "$IS_CI_WORKLOAD" == "true" ] && HAS_RUN_FUNCTIONAL=true
        # First matching rule wins: merge group > "Run tests" > "Run functional tests" > default.
        if [ "$IS_MERGE_GROUP" == "true" ]; then
          SCOPE=mr-github; N_REPEAT=1; LIGHTWEIGHT=false
        elif [ "$HAS_RUN_TESTS" == "true" ]; then
          SCOPE=mr-github; N_REPEAT=1; LIGHTWEIGHT=true
        elif [ "$HAS_RUN_FUNCTIONAL" == "true" ]; then
          SCOPE=mr-github; N_REPEAT=5; LIGHTWEIGHT=false
        else
          SCOPE=mr-github-slim; N_REPEAT=5; LIGHTWEIGHT=false
        fi
        # `||` cannot appear inside a single `[ ... ]`; each comparison needs its
        # own test, otherwise the label check errors out and is never honoured.
        if [ "$HAS_MBRIDGE" == "true" ] || [ "$IS_MERGE_GROUP" == "true" ]; then
          MBRIDGE_SUITE="L1"
        else
          MBRIDGE_SUITE="unit-only"
        fi
        DEV=true
        echo "scope=$SCOPE" | tee -a $GITHUB_OUTPUT
        echo "n_repeat=$N_REPEAT" | tee -a $GITHUB_OUTPUT
        echo "lightweight=$LIGHTWEIGHT" | tee -a $GITHUB_OUTPUT
        echo "lts=$HAS_LTS" | tee -a $GITHUB_OUTPUT
        echo "mbridge_suite=$MBRIDGE_SUITE" | tee -a $GITHUB_OUTPUT
        echo "dev=$DEV" | tee -a $GITHUB_OUTPUT
        # Pre-compute active row markers for the decision tree
        _MG=$( [ "$IS_MERGE_GROUP" == "true" ] && echo "**→**" || echo "" )
        _RT=$( [ "$IS_MERGE_GROUP" != "true" ] && [ "$HAS_RUN_TESTS" == "true" ] && echo "**→**" || echo "" )
        _RF=$( [ "$IS_MERGE_GROUP" != "true" ] && [ "$HAS_RUN_TESTS" != "true" ] && [ "$HAS_RUN_FUNCTIONAL" == "true" ] && echo "**→**" || echo "" )
        _DF=$( [ "$SCOPE" == "mr-github-slim" ] && echo "**→**" || echo "" )
        _LTS=$( [ "$HAS_LTS" == "true" ] && echo "**→**" || echo "" )
        _DEV=$( [ "$HAS_LTS" != "true" ] && echo "**→**" || echo "" )
        cat <<SUMMARY >> $GITHUB_STEP_SUMMARY
        Beep boop 🤖 I have consulted the labels and decided to run **$SCOPE** $( [ "$LIGHTWEIGHT" == "true" ] && echo "in lightweight mode " || echo "" )against the **$( [ "$HAS_LTS" == "true" ] && echo "lts" || echo "dev" )** container with **$N_REPEAT** repetition(s). You are welcome.
        | Setting | Value |
        |---|---|
        | \`scope\` | \`$SCOPE\` |
        | \`n_repeat\` | \`$N_REPEAT\` |
        | \`lightweight\` | \`$LIGHTWEIGHT\` |
        | \`lts\` | \`$HAS_LTS\` |
        | \`dev\` | \`$DEV\` |
        | \`mbridge_suite\` | \`$MBRIDGE_SUITE\` |
        ### Decision tree
        **Test scope**
        | | Trigger | \`scope\` | \`n_repeat\` | \`lightweight\` |
        |---|---|---|---|---|
        | $_MG | Merge group | \`mr-github\` | \`1\` | \`false\` |
        | $_RT | Label: _Run tests_ | \`mr-github\` | \`1\` | \`true\` |
        | $_RF | Label: _Run functional tests_ / CI workload | \`mr-github\` | \`5\` | \`false\` |
        | $_DF | _(default)_ | \`mr-github-slim\` | \`5\` | \`false\` |
        **Container image**
        | | Trigger | \`image\` |
        |---|---|---|
        | $_LTS | Label: _container::lts_ | \`lts\` |
        | $_DEV | _(default)_ | \`dev\` |
        ### Glossary
        - **\`lightweight\`**: trains for 4 steps instead of 100 and skips comparison against golden values — faster feedback, no correctness guarantees
        - **\`lts\`**: uses the Long Term Support container base image instead of the latest dev image
        - **\`dev\`**: uses the latest development container base image (default)
        SUMMARY
# Lint job. Per the `if:` below it runs for non-deployment workflows that are
# either CI workloads, or neither CI workloads nor docs-only changes.
linting:
runs-on: ubuntu-latest
needs: [pre-flight]
if: |
(
needs.pre-flight.outputs.is_deployment_workflow == 'false'
&& needs.pre-flight.outputs.is_ci_workload == 'true'
) || (
needs.pre-flight.outputs.is_deployment_workflow == 'false'
&& needs.pre-flight.outputs.is_ci_workload == 'false'
&& needs.pre-flight.outputs.docs_only == 'false'
)
steps:
# Full history (fetch-depth: 0) is checked out.
- name: Checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Install uv
uses: astral-sh/setup-uv@v1
with:
version: 0.7.2
- name: Install linting tools
run: |
uv sync --locked --only-group linting
- name: Get PR info
id: get-pr-info
if: startsWith(github.ref, 'refs/heads/pull-request/') && github.event_name == 'push'
uses: nv-gha-runners/get-pr-info@main
# Only for pushes to `pull-request/*` branches: runs tools/autoformat.sh with
# CHECK_ONLY=true against the PR's base ref, using the tools from `.venv/bin`.
- name: Run linting
if: startsWith(github.ref, 'refs/heads/pull-request/') && github.event_name == 'push'
run: |
export PATH=".venv/bin:$PATH"
export GITLAB_ENDPOINT=github.com
export CI_PROJECT_NAMESPACE=NVIDIA
export BASE_REF="${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.ref }}"
export CHECK_ONLY=true
export SKIP_DOCS=false
bash tools/autoformat.sh
# Placeholder job bound to the `test` environment. It is skipped for CI workloads,
# deployment workflows, merge groups and docs-only changes.
# NOTE(review): presumably the `test` environment requires approval, which is what
# turns this job into a queue gate — confirm in the repository's environment settings.
cicd-wait-in-queue:
runs-on: ubuntu-latest
needs: [pre-flight, linting]
environment: "test"
if: |
!(needs.pre-flight.outputs.is_ci_workload == 'true'
|| needs.pre-flight.outputs.is_deployment_workflow == 'true'
|| needs.pre-flight.outputs.is_merge_group == 'true'
|| needs.pre-flight.outputs.docs_only == 'true')
steps:
- name: Running CI tests
run: |
echo "Running CI tests"
echo "is_merge_group: ${{ needs.pre-flight.outputs.is_merge_group }}"
# Re-exports `configure`'s `mbridge_suite` output as `mbridge-test-suite` and runs
# .github/scripts/readme.sh. It runs when none of its dependencies was cancelled and
# either all of them succeeded or pre-flight flags a CI workload, force_run_all,
# or a merge group.
cicd-parse-downstream-testing:
runs-on: ubuntu-latest
needs:
- pre-flight
- configure
- cicd-wait-in-queue
if: |
needs.pre-flight.result != 'cancelled'
&& needs.configure.result != 'cancelled'
&& needs.cicd-wait-in-queue.result != 'cancelled'
&& (
success()
|| needs.pre-flight.outputs.is_ci_workload == 'true'
|| needs.pre-flight.outputs.force_run_all == 'true'
|| needs.pre-flight.outputs.is_merge_group == 'true'
)
&& !cancelled()
outputs:
mbridge-test-suite: ${{ needs.configure.outputs.mbridge_suite }}
steps:
- name: Checkout
uses: actions/checkout@v6
- name: How-To
run: bash .github/scripts/readme.sh
# Runs the Megatron-Bridge CI against this change: creates a temporary branch in
# NVIDIA-NeMo/Megatron-Bridge, triggers that repository's cicd-main.yml on it and
# waits for the result, then deletes the branch.
cicd-mbridge-testing:
runs-on: ubuntu-latest
needs:
- pre-flight
- cicd-wait-in-queue
- cicd-parse-downstream-testing
if: |
needs.pre-flight.result != 'cancelled'
&& needs.cicd-wait-in-queue.result != 'cancelled'
&& needs.cicd-parse-downstream-testing.result != 'cancelled'
&& (
success()
|| needs.pre-flight.outputs.is_ci_workload == 'true'
|| needs.pre-flight.outputs.force_run_all == 'true'
|| needs.pre-flight.outputs.is_merge_group == 'true'
)
&& !cancelled()
steps:
- name: Get PR info
id: get-pr-info
if: startsWith(github.ref, 'refs/heads/pull-request/') && github.event_name == 'push'
uses: nv-gha-runners/get-pr-info@main
# Checks out Megatron-Bridge `main` into ./megatron-bridge using the PAT.
- name: Checkout MBridge and create testing branch
uses: actions/checkout@v6
with:
ref: main
repository: NVIDIA-NeMo/Megatron-Bridge
path: megatron-bridge
token: ${{ secrets.PAT }}
# Branch name is `mcore-testing-<PR number>`, or `mcore-testing-<run id>` when
# there is no PR info; it is force-pushed from origin/main.
- name: Create testing branch
env:
MBRIDGE_BRANCH_NAME: mcore-testing-${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || github.run_id }}
run: |
cd megatron-bridge
git fetch origin main
git checkout -b ${{ env.MBRIDGE_BRANCH_NAME }} origin/main
git push origin ${{ env.MBRIDGE_BRANCH_NAME }} --force
# Picks the commit to test: the PR's merge commit on `pull-request/*` refs, the
# merge group's head SHA for merge groups, otherwise GITHUB_SHA. Exposed as `main`.
- name: Get merge commit sha
shell: bash -x -e -u -o pipefail {0}
id: sha
env:
IS_PR: ${{ startsWith(github.ref, 'refs/heads/pull-request/') }}
IS_MERGE_GROUP: ${{ github.event_name == 'merge_group' }}
run: |
if [[ "$IS_PR" == "true" ]]; then
SHA=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').merge_commit_sha }}
elif [[ "$IS_MERGE_GROUP" == "true" ]]; then
SHA=${{ github.event.merge_group.head_sha }}
else
SHA=${GITHUB_SHA}
fi
echo "main=${SHA}" | tee -a "$GITHUB_OUTPUT"
# Dispatches the downstream workflow on the testing branch, polling every 60
# seconds, with `propagate_failure: true`.
- name: Trigger MBridge tests
uses: convictional/trigger-workflow-and-wait@v1.6.5
env:
MBRIDGE_BRANCH_NAME: mcore-testing-${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || github.run_id }}
with:
owner: NVIDIA-NeMo
repo: Megatron-Bridge
workflow_file_name: cicd-main.yml
github_token: ${{ secrets.PAT }}
ref: ${{ env.MBRIDGE_BRANCH_NAME }}
wait_interval: 60
propagate_failure: true
client_payload: |
{
"mcore_ref": "${{ steps.sha.outputs.main }}",
"test_suite": "${{ needs.cicd-parse-downstream-testing.outputs.mbridge-test-suite }}",
"triggered_by": "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
}
# `if: always()` — the branch is removed even when an earlier step failed.
- name: Delete testing branch
if: always()
env:
MBRIDGE_BRANCH_NAME: mcore-testing-${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || github.run_id }}
run: |
cd megatron-bridge
git push origin --delete ${{ env.MBRIDGE_BRANCH_NAME }}
# Builds the JSON matrix exposed as the `matrix` output: an "aws" entry is always
# present, and a "gcp" entry is added only when `is_maintainer` is 'true'.
# Each entry has the keys `cloud`, `registry` and `runner`.
cicd-compute-build-matrix:
runs-on: ubuntu-latest
needs: [is-not-external-contributor]
outputs:
matrix: ${{ steps.compute.outputs.matrix }}
steps:
# jq assembles the JSON (`--arg` / `--argjson`) instead of string concatenation.
- name: Compute build matrix
id: compute
env:
IS_MAINTAINER: ${{ needs.is-not-external-contributor.outputs.is_maintainer }}
SELECTED_RUNNER: ${{ needs.is-not-external-contributor.outputs.selected_runner }}
SELECTED_RUNNER_GB200: ${{ needs.is-not-external-contributor.outputs.selected_runner_gb200 }}
REGISTRY_AWS: ${{ env.container-registry }}
REGISTRY_GCP: ${{ env.container-registry-gb200 }}
run: |
AWS_ENTRY=$(jq -nc --arg registry "$REGISTRY_AWS" --arg runner "$SELECTED_RUNNER" \
'{"cloud": "aws", "registry": $registry, "runner": $runner}')
if [ "$IS_MAINTAINER" == "true" ]; then
GCP_ENTRY=$(jq -nc --arg registry "$REGISTRY_GCP" --arg runner "$SELECTED_RUNNER_GB200" \
'{"cloud": "gcp", "registry": $registry, "runner": $runner}')
MATRIX=$(jq -nc --argjson aws "$AWS_ENTRY" --argjson gcp "$GCP_ENTRY" \
'{"include": [$aws, $gcp]}')
else
MATRIX=$(jq -nc --argjson aws "$AWS_ENTRY" '{"include": [$aws]}')
fi
echo "matrix=$MATRIX" | tee -a "$GITHUB_OUTPUT"
# Builds the CI container image once per matrix entry produced by
# cicd-compute-build-matrix (each entry supplies `runner` and `registry`) and
# pushes it, together with its build cache, to that entry's registry.
cicd-container-build:
  needs: [is-not-external-contributor, pre-flight, configure, cicd-wait-in-queue, cicd-compute-build-matrix]
  strategy:
    fail-fast: false
    matrix: ${{ fromJson(needs.cicd-compute-build-matrix.outputs.matrix) }}
  runs-on: ${{ matrix.runner }}
  # Runs when no upstream job was cancelled and either every dependency
  # succeeded or pre-flight flagged a CI workload, merge group or forced run.
  if: |
    needs.is-not-external-contributor.result != 'cancelled'
    && needs.pre-flight.result != 'cancelled'
    && needs.cicd-wait-in-queue.result != 'cancelled'
    && needs.cicd-compute-build-matrix.result != 'cancelled'
    && (
      success()
      || needs.pre-flight.outputs.is_ci_workload == 'true'
      || needs.pre-flight.outputs.is_merge_group == 'true'
      || needs.pre-flight.outputs.force_run_all == 'true'
    )
    && !cancelled()
  steps:
    - name: Get PR info
      id: get-pr-info
      if: startsWith(github.ref, 'refs/heads/pull-request/') && github.event_name == 'push'
      uses: nv-gha-runners/get-pr-info@main

    # Selects the commit to build: the PR merge commit, the merge-group head,
    # or the triggering SHA.
    - name: Get merge commit sha
      shell: bash -x -e -u -o pipefail {0}
      id: sha
      env:
        IS_PR: ${{ startsWith(github.ref, 'refs/heads/pull-request/') }}
        IS_MERGE_GROUP: ${{ github.event_name == 'merge_group' }}
        # Context values are handed over via the environment instead of being
        # templated into the script text, so they are never parsed as shell.
        PR_MERGE_COMMIT_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').merge_commit_sha }}
        MERGE_GROUP_HEAD_SHA: ${{ github.event.merge_group.head_sha }}
      run: |
        if [[ "$IS_PR" == "true" ]]; then
          SHA="$PR_MERGE_COMMIT_SHA"
        elif [[ "$IS_MERGE_GROUP" == "true" ]]; then
          SHA="$MERGE_GROUP_HEAD_SHA"
        else
          SHA="${GITHUB_SHA}"
        fi
        echo "main=${SHA}" | tee -a "$GITHUB_OUTPUT"

    - name: Checkout
      uses: actions/checkout@v6
      with:
        ref: ${{ steps.sha.outputs.main }}

    - name: Setup python
      uses: actions/setup-python@v6
      with:
        # Quoted so the version is never read as a float.
        python-version: "3.12"

    # Installed once; `gh` is needed by the "Get last merged PR" step below.
    - name: Install GH CLI
      shell: bash -x -e -u -o pipefail {0}
      run: |
        apt-get update
        apt-get install -y gh

    - name: Download test data
      shell: bash
      run: |
        echo "::group::Download test data"
        pip install --no-cache-dir click requests
        python tests/test_utils/python_scripts/download_unit_tests_dataset.py --assets-dir ./assets
        echo "::endgroup::"

    # Emits one registry cache reference per recently merged PR (up to 100,
    # most recently updated first) as the multi-line output LAST_PRS.
    - name: Get last merged PR
      id: cache_from
      env:
        GH_TOKEN: ${{ github.token }}
      run: |
        LAST_PRS=$(gh api graphql -f query='
          query {
            repository(owner: "NVIDIA", name: "Megatron-LM") {
              pullRequests(states: MERGED, first: 100, orderBy: {field: UPDATED_AT, direction: DESC}) {
                nodes {
                  number
                }
              }
            }
          }' | jq -r '.data.repository.pullRequests.nodes[].number' | while read -r number; do
            echo "type=registry,ref=${{ matrix.registry }}/megatron-lm:$number-buildcache,mode=max"
          done)
        echo "LAST_PRS<<EOF" | tee -a $GITHUB_OUTPUT
        echo "$LAST_PRS" | tee -a $GITHUB_OUTPUT
        echo "EOF" | tee -a $GITHUB_OUTPUT

    # Picks the base image version file (lts or dev) from the configure output.
    - name: Parse baseimage
      shell: bash
      id: base-image
      env:
        HAS_LTS_LABEL: ${{ needs.configure.outputs.lts }}
      run: |
        if [ "$HAS_LTS_LABEL" == "true" ]; then
          NGC_VERSION=$(cat docker/.ngc_version.lts)
          echo "version=$NGC_VERSION" | tee -a $GITHUB_OUTPUT
          echo "image_type=lts" | tee -a $GITHUB_OUTPUT
        else
          NGC_VERSION=$(cat docker/.ngc_version.dev)
          echo "version=$NGC_VERSION" | tee -a $GITHUB_OUTPUT
          echo "image_type=dev" | tee -a $GITHUB_OUTPUT
        fi

    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v3

    # Tags the image with the PR number (0 when no PR info is available) and
    # with the triggering commit SHA.
    - name: Build and push
      uses: docker/build-push-action@v6
      with:
        file: ./docker/Dockerfile.ci.dev
        push: true
        context: .
        target: main
        build-args: |
          FROM_IMAGE_NAME=${{ steps.base-image.outputs.version }}
          IMAGE_TYPE=${{ steps.base-image.outputs.image_type }}
        cache-from: |
          type=registry,ref=${{ matrix.registry }}/megatron-lm:${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || 0 }}-buildcache,mode=max
          type=registry,ref=${{ matrix.registry }}/megatron-lm:main-buildcache,mode=max
          ${{ steps.cache_from.outputs.LAST_PRS }}
        cache-to: |
          type=registry,ref=${{ matrix.registry }}/megatron-lm:${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || 0 }}-buildcache,mode=max
        no-cache: false
        tags: |
          ${{ matrix.registry }}/megatron-lm:${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || 0 }}
          ${{ matrix.registry }}/megatron-lm:${{ github.sha }}
        secrets: |
          GH_TOKEN=${{ secrets.PAT }}
# Turns the unit-test recipe into a JSON list of {"bucket": <test_case>}
# objects and exposes it as the `unit-tests` job output for the matrix below.
cicd-parse-unit-tests:
  needs: [pre-flight, cicd-wait-in-queue, cicd-container-build]
  runs-on: ubuntu-latest
  if: |
    needs.pre-flight.result != 'cancelled'
    && needs.cicd-wait-in-queue.result != 'cancelled'
    && needs.cicd-container-build.result != 'cancelled'
    && (
      success()
      || needs.pre-flight.outputs.is_ci_workload == 'true'
      || needs.pre-flight.outputs.force_run_all == 'true'
      || needs.pre-flight.outputs.is_merge_group == 'true'
    )
    && !cancelled()
  outputs:
    unit-tests: ${{ steps.parse-unit-tests.outputs.unit-tests }}
  steps:
    - name: Checkout
      uses: actions/checkout@v6

    # yq converts the recipe to JSON; jq -c compacts it to a single line so it
    # fits in one `key=value` entry of $GITHUB_OUTPUT.
    - id: parse-unit-tests
      name: Parse unit tests
      run: |
        cat tests/test_utils/recipes/h100/unit-tests.yaml | yq -o json '[.products[].test_case[] | { "bucket": .}] | sort_by(.model, .test_case)' | jq -c > unit-tests.json
        echo "unit-tests=$(cat unit-tests.json)" | tee -a $GITHUB_OUTPUT
# Runs one job per unit-test bucket emitted by cicd-parse-unit-tests, using the
# repository's local composite action against the image tagged with this SHA.
cicd-unit-tests-latest:
  name: "${{ matrix.bucket }} - latest"
  needs:
    - is-not-external-contributor
    - pre-flight
    - cicd-wait-in-queue
    - cicd-container-build
    - cicd-parse-unit-tests
  if: |
    needs.is-not-external-contributor.result != 'cancelled'
    && needs.pre-flight.result != 'cancelled'
    && needs.cicd-wait-in-queue.result != 'cancelled'
    && needs.cicd-container-build.result != 'cancelled'
    && needs.cicd-parse-unit-tests.result != 'cancelled'
    && (
      success()
      || needs.pre-flight.outputs.is_ci_workload == 'true'
      || needs.pre-flight.outputs.force_run_all == 'true'
      || needs.pre-flight.outputs.is_merge_group == 'true'
    )
    && !cancelled()
  runs-on: ${{ needs.is-not-external-contributor.outputs.selected_runner }}
  timeout-minutes: 60
  strategy:
    fail-fast: false
    matrix:
      include: ${{ fromJson(needs.cicd-parse-unit-tests.outputs.unit-tests) }}
  env:
    PIP_DISABLE_PIP_VERSION_CHECK: 1
    PIP_NO_PYTHON_VERSION_WARNING: 1
    PIP_ROOT_USER_ACTION: ignore
  steps:
    - name: Checkout
      uses: actions/checkout@v6

    # `timeout` falls back to 30 when the matrix entry does not define one.
    - name: main
      uses: ./.github/actions
      with:
        test_case: ${{ matrix.bucket }}
        tag: latest
        timeout: ${{ matrix.timeout || 30 }}
        is_unit_test: "true"
        PAT: ${{ secrets.PAT }}
        container-image: ${{ env.container-registry }}/megatron-lm:${{ github.sha }}
# Generates the H100 functional-test job list with generate_jet_trigger_job.py
# and publishes it as a compact JSON array of {"model", "test_case"} objects.
cicd-parse-integration-tests-h100:
  needs: [pre-flight, configure, cicd-wait-in-queue, cicd-container-build, cicd-unit-tests-latest]
  runs-on: ubuntu-latest
  if: |
    needs.pre-flight.result != 'cancelled'
    && needs.configure.result != 'cancelled'
    && needs.cicd-wait-in-queue.result != 'cancelled'
    && needs.cicd-container-build.result != 'cancelled'
    && needs.cicd-unit-tests-latest.result != 'cancelled'
    && (
      success()
      || needs.pre-flight.outputs.is_ci_workload == 'true'
      || needs.pre-flight.outputs.force_run_all == 'true'
      || needs.pre-flight.outputs.is_merge_group == 'true'
    )
    && !cancelled()
  outputs:
    integration-tests-h100: ${{ steps.main.outputs.integration-tests-h100 }}
  steps:
    - name: Checkout
      uses: actions/checkout@v6

    - id: main
      name: Parse functional tests
      env:
        SCOPE: ${{ needs.configure.outputs.scope }}
        LIGHTWEIGHT: ${{ needs.configure.outputs.lightweight }}
      run: |
        export PYTHONPATH=$(pwd)
        # Optional flags: the scope is always passed, lightweight mode only on request.
        EXTRA_ARGS=(--scope $SCOPE)
        if [ "$LIGHTWEIGHT" == "true" ]; then
          EXTRA_ARGS+=(--enable-lightweight-mode)
        fi
        python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
          --n-repeat 5 \
          --time-limit 2700 \
          --test-cases all \
          --container-image mcore_ci_dev \
          --container-tag latest \
          --dependent-job functional:configure \
          --record-checkpoints false \
          --slurm-account gh \
          --no-enable-warmup \
          --environment dev \
          --platform dgx_h100 \
          --cluster ghci \
          ${EXTRA_ARGS[@]} \
          --output-path integration-tests-h100.yaml
        # Drop the pipeline-level keys, keep one entry per generated job.
        cat integration-tests-h100.yaml | \
          yq -o json 'del(.default, .stages, .workflow) | to_entries | map({"model": .value.stage, "test_case": .key}) | sort_by(.model, .test_case)' | jq -c > integration-tests-h100.json
        echo "integration-tests-h100=$(cat integration-tests-h100.json)" | tee -a "$GITHUB_OUTPUT"
# Runs one job per H100 integration test emitted by
# cicd-parse-integration-tests-h100, through the repository's local action.
cicd-integration-tests-latest-h100:
  name: "${{ matrix.model }}/${{ matrix.test_case }} - latest"
  needs:
    - is-not-external-contributor
    - pre-flight
    - configure
    - cicd-wait-in-queue
    - cicd-parse-integration-tests-h100
    - cicd-unit-tests-latest
  if: |
    needs.is-not-external-contributor.result != 'cancelled'
    && needs.pre-flight.result != 'cancelled'
    && needs.configure.result != 'cancelled'
    && needs.cicd-wait-in-queue.result != 'cancelled'
    && needs.cicd-parse-integration-tests-h100.result != 'cancelled'
    && needs.cicd-unit-tests-latest.result != 'cancelled'
    && (
      success()
      || needs.pre-flight.outputs.is_ci_workload == 'true'
      || needs.pre-flight.outputs.force_run_all == 'true'
      || needs.pre-flight.outputs.is_merge_group == 'true'
    )
    && !cancelled()
  runs-on: ${{ needs.is-not-external-contributor.outputs.selected_runner }}
  timeout-minutes: 60
  strategy:
    fail-fast: false
    matrix:
      include: ${{ fromJson(needs.cicd-parse-integration-tests-h100.outputs.integration-tests-h100) }}
  env:
    PIP_DISABLE_PIP_VERSION_CHECK: 1
    PIP_NO_PYTHON_VERSION_WARNING: 1
    PIP_ROOT_USER_ACTION: ignore
  steps:
    - name: Checkout
      uses: actions/checkout@v6

    # scope, n_repeat and lightweight are forwarded from the configure job.
    - name: main
      uses: ./.github/actions
      with:
        test_case: ${{ matrix.test_case }}
        model: ${{ matrix.model }}
        tag: latest
        timeout: ${{ matrix.timeout || 30 }}
        is_unit_test: "false"
        PAT: ${{ secrets.PAT }}
        container-image: ${{ env.container-registry }}/megatron-lm:${{ github.sha }}
        scope: ${{ needs.configure.outputs.scope }}
        n_repeat: ${{ needs.configure.outputs.n_repeat }}
        lightweight: ${{ needs.configure.outputs.lightweight }}
# Generates the GB200 functional-test job list (maintainer runs only) and
# publishes it as a compact JSON array of {"model", "test_case"} objects.
cicd-parse-integration-tests-gb200:
  needs:
    - is-not-external-contributor
    - pre-flight
    - configure
    - cicd-wait-in-queue
    - cicd-container-build
    - cicd-unit-tests-latest
  runs-on: ubuntu-latest
  if: |
    needs.is-not-external-contributor.outputs.is_maintainer == 'true'
    && needs.pre-flight.result != 'cancelled'
    && needs.configure.result != 'cancelled'
    && needs.cicd-wait-in-queue.result != 'cancelled'
    && needs.cicd-container-build.result != 'cancelled'
    && needs.cicd-unit-tests-latest.result != 'cancelled'
    && (
      success()
      || needs.pre-flight.outputs.is_ci_workload == 'true'
      || needs.pre-flight.outputs.force_run_all == 'true'
      || needs.pre-flight.outputs.is_merge_group == 'true'
    )
    && !cancelled()
  outputs:
    integration-tests-gb200: ${{ steps.main.outputs.integration-tests-gb200 }}
  steps:
    - name: Checkout
      uses: actions/checkout@v6

    - id: main
      name: Parse functional tests
      env:
        SCOPE: ${{ needs.configure.outputs.scope }}
        LIGHTWEIGHT: ${{ needs.configure.outputs.lightweight }}
      run: |
        export PYTHONPATH=$(pwd)
        # Optional flags: the scope is always passed, lightweight mode only on request.
        EXTRA_ARGS=(--scope $SCOPE)
        if [ "$LIGHTWEIGHT" == "true" ]; then
          EXTRA_ARGS+=(--enable-lightweight-mode)
        fi
        python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
          --n-repeat 5 \
          --time-limit 2700 \
          --test-cases all \
          --container-image mcore_ci_dev \
          --container-tag latest \
          --dependent-job functional:configure \
          --record-checkpoints false \
          --slurm-account gh \
          --no-enable-warmup \
          --environment dev \
          --platform dgx_gb200 \
          --cluster dgxgb200_oci-hsg \
          ${EXTRA_ARGS[@]} \
          --output-path integration-tests-gb200.yaml
        # Drop the pipeline-level keys, keep one entry per generated job.
        cat integration-tests-gb200.yaml | \
          yq -o json 'del(.default, .stages, .workflow) | to_entries | map({"model": .value.stage, "test_case": .key}) | sort_by(.model, .test_case)' | jq -c > integration-tests-gb200.json
        echo "integration-tests-gb200=$(cat integration-tests-gb200.json)" | tee -a "$GITHUB_OUTPUT"
# Runs one job per GB200 integration test emitted by
# cicd-parse-integration-tests-gb200; gated on the maintainer flag.
cicd-integration-tests-latest-gb200:
  name: "${{ matrix.model }}/${{ matrix.test_case }} - latest"
  needs:
    - is-not-external-contributor
    - pre-flight
    - configure
    - cicd-wait-in-queue
    - cicd-parse-integration-tests-gb200
    - cicd-unit-tests-latest
  if: |
    needs.is-not-external-contributor.outputs.is_maintainer == 'true'
    && needs.is-not-external-contributor.result != 'cancelled'
    && needs.pre-flight.result != 'cancelled'
    && needs.configure.result != 'cancelled'
    && needs.cicd-wait-in-queue.result != 'cancelled'
    && needs.cicd-parse-integration-tests-gb200.result != 'cancelled'
    && needs.cicd-unit-tests-latest.result != 'cancelled'
    && (
      success()
      || needs.pre-flight.outputs.is_ci_workload == 'true'
      || needs.pre-flight.outputs.force_run_all == 'true'
      || needs.pre-flight.outputs.is_merge_group == 'true'
    )
    && !cancelled()
  runs-on: ${{ needs.is-not-external-contributor.outputs.selected_runner_gb200 }}
  timeout-minutes: 60
  strategy:
    fail-fast: false
    matrix:
      include: ${{ fromJson(needs.cicd-parse-integration-tests-gb200.outputs.integration-tests-gb200) }}
  env:
    PIP_DISABLE_PIP_VERSION_CHECK: 1
    PIP_NO_PYTHON_VERSION_WARNING: 1
    PIP_ROOT_USER_ACTION: ignore
  steps:
    - name: Checkout
      uses: actions/checkout@v6

    # Uses the GB200 registry image and passes the platform explicitly.
    - name: main
      uses: ./.github/actions
      with:
        test_case: ${{ matrix.test_case }}
        model: ${{ matrix.model }}
        tag: latest
        timeout: ${{ matrix.timeout || 30 }}
        is_unit_test: "false"
        PAT: ${{ secrets.PAT }}
        container-image: ${{ env.container-registry-gb200 }}/megatron-lm:${{ github.sha }}
        scope: ${{ needs.configure.outputs.scope }}
        n_repeat: ${{ needs.configure.outputs.n_repeat }}
        lightweight: ${{ needs.configure.outputs.lightweight }}
        platform: dgx_gb200
# Aggregating gate job: inspects the results of the unit and integration test
# jobs plus every completed job of this run, and fails if anything that was
# expected to pass did not.
Nemo_CICD_Test:
  needs:
    - pre-flight
    - is-not-external-contributor
    - cicd-unit-tests-latest
    - cicd-integration-tests-latest-h100
    - cicd-integration-tests-latest-gb200
  # The parenthesised group always evaluates to true because of `always()`;
  # the effective gate is "not cancelled and running in the upstream repo".
  if: |
    (
      needs.pre-flight.outputs.docs_only == 'true'
      || needs.pre-flight.outputs.is_deployment_workflow == 'true'
      || needs.pre-flight.outputs.is_ci_workload == 'true'
      || needs.pre-flight.outputs.is_merge_group == 'true'
      || always()
    )
    && !cancelled()
    && github.repository == 'NVIDIA/Megatron-LM'
  runs-on: ubuntu-latest
  # Least privilege: this job only checks out the repository and reads the
  # run's job list, so no write scope is required.
  permissions:
    contents: read
    actions: read
    checks: read
  steps:
    - name: Checkout
      uses: actions/checkout@v6

    - name: Get workflow result
      id: result
      shell: bash -x -e -u -o pipefail {0}
      env:
        GH_TOKEN: ${{ github.token }}
        GITHUB_RUN_ID: ${{ github.run_id }}
        DOCS_ONLY: ${{ needs.pre-flight.outputs.docs_only }}
        IS_DEPLOYMENT: ${{ needs.pre-flight.outputs.is_deployment_workflow }}
        IS_MAINTAINER: ${{ needs.is-not-external-contributor.outputs.is_maintainer }}
        UNIT_RESULT: ${{ needs.cicd-unit-tests-latest.result }}
        H100_RESULT: ${{ needs.cicd-integration-tests-latest-h100.result }}
        GB200_RESULT: ${{ needs.cicd-integration-tests-latest-gb200.result }}
      run: |
        # Docs-only and deployment workflows intentionally skip all tests
        if [ "$DOCS_ONLY" == "true" ] || [ "$IS_DEPLOYMENT" == "true" ]; then
          echo "✅ Docs-only or deployment workflow — test checks skipped"
          exit 0
        fi
        FAILED=false
        # Unit tests must always succeed (never skipped or cancelled)
        if [ "$UNIT_RESULT" != "success" ]; then
          echo "❌ cicd-unit-tests-latest: $UNIT_RESULT"
          FAILED=true
        fi
        # H100 integration tests must always succeed
        if [ "$H100_RESULT" != "success" ]; then
          echo "❌ cicd-integration-tests-latest-h100: $H100_RESULT"
          FAILED=true
        fi
        # GB200 integration tests may be skipped only for non-maintainer PRs
        # (no GB200 runners available); maintainer runs must always succeed
        if [ "$GB200_RESULT" == "skipped" ] && [ "$IS_MAINTAINER" == "true" ]; then
          echo "❌ cicd-integration-tests-latest-gb200: skipped unexpectedly for a maintainer run"
          FAILED=true
        elif [ "$GB200_RESULT" != "success" ] && [ "$GB200_RESULT" != "skipped" ]; then
          echo "❌ cicd-integration-tests-latest-gb200: $GB200_RESULT"
          FAILED=true
        fi
        # Broad scan: catch any individual job failures or cancellations
        # (e.g. a single matrix instance cancelled mid-run). The job list is
        # fetched once; the scan stays best-effort but a lookup failure is now
        # surfaced as a warning instead of being silently treated as "0 bad jobs".
        BAD_JOB_LINES=$(gh run view "$GITHUB_RUN_ID" --json jobs --jq '
          .jobs[] | select(
            .status == "completed"
            and (.conclusion == "failure" or .conclusion == "cancelled")
            and .name != "merge-queue-notification"
            and .name != "cicd-mbridge-testing"
          ) | .name + " → " + .conclusion
        ') || {
          echo "::warning::Could not list jobs for run $GITHUB_RUN_ID; broad failure scan skipped"
          BAD_JOB_LINES=""
        }
        if [ -n "$BAD_JOB_LINES" ]; then
          BAD_JOBS=$(grep -c '' <<<"$BAD_JOB_LINES")
          echo "❌ Found ${BAD_JOBS} failed or cancelled job(s):"
          echo "$BAD_JOB_LINES"
          FAILED=true
        fi
        if [ "$FAILED" != "true" ]; then
          echo "✅ All previous jobs completed successfully"
        else
          exit 1
        fi
# Posts a successful `codecov/patch` commit status for runs that do not
# exercise code (docs-only, deployment, merge-group), so the coverage check
# does not block them.
Coverage_Fake:
  runs-on: ubuntu-latest
  needs: [Nemo_CICD_Test, pre-flight]
  # `github.event` is the event payload object, so comparing it to a string is
  # never true; the event type lives in `github.event_name`.
  if: |
    (
      needs.pre-flight.outputs.docs_only == 'true'
      || needs.pre-flight.outputs.is_deployment_workflow == 'true'
      || github.event_name == 'merge_group'
    )
    && needs.pre-flight.outputs.is_ci_workload == 'false'
    && !cancelled()
    && github.repository == 'NVIDIA/Megatron-LM'
  steps:
    - name: Generate fake coverage report
      uses: actions/github-script@v8
      with:
        github-token: ${{ secrets.PAT }}
        script: |
          await github.rest.repos.createCommitStatus({
            owner: context.repo.owner,
            repo: context.repo.repo,
            sha: context.sha,
            state: 'success',
            description: 'No code changes - coverage check skipped',
            context: 'codecov/patch'
          });
# Downloads the per-job coverage artifacts, combines them, uploads the result
# to Codecov and stores the aggregated .coverage file as an artifact.
Coverage:
  runs-on: ubuntu-latest
  # pre-flight must be listed here: the `needs` context only contains direct
  # dependencies, so without it `needs.pre-flight.outputs.*` is always empty.
  needs: [Nemo_CICD_Test, pre-flight]
  if: |
    (
      (needs.pre-flight.outputs.is_ci_workload == 'true' && !failure())
      || success()
    )
    && !cancelled()
    && github.repository == 'NVIDIA/Megatron-LM'
  strategy:
    matrix:
      flag: [unit-test]
  steps:
    - name: Checkout
      uses: actions/checkout@v6

    - name: Download coverage reports of current branch
      uses: actions/download-artifact@v7
      with:
        pattern: coverage-${{ matrix.flag }}-*

    # The name tests are grouped so that `-type f` applies to both patterns.
    - name: List coverage files
      run: find . -type f \( -name "*.xml" -o -name "*.lcov" \)

    - name: Get total coverage of current branch
      shell: bash -x -e -u -o pipefail {0}
      if: always()
      run: |
        pip install coverage
        ls -al .
        ls -al coverage-*/
        coverage combine --keep $(ls coverage-*/.coverage)
        coverage report -i
        rm -rf coverage-*
        ls -al

    - name: Upload coverage reports to Codecov
      uses: codecov/codecov-action@v5
      with:
        token: ${{ secrets.CODECOV_TOKEN }}
        verbose: true
        flags: ${{ matrix.flag }}

    - name: Upload artifacts
      uses: actions/upload-artifact@v6
      with:
        name: coverage-${{ matrix.flag }}-aggregated
        path: |
          .coverage
        include-hidden-files: true
# On merge-group events, comments on the originating PR with a link to this
# workflow run.
merge-queue-notification:
  runs-on: ubuntu-latest
  if: github.event_name == 'merge_group'
  permissions:
    pull-requests: write
  steps:
    - name: Extract PR number from merge group
      id: get-pr-number
      env:
        # Passed via the environment so the ref is never parsed as shell code.
        HEAD_REF: ${{ github.event.merge_group.head_ref }}
      run: |
        # Extract PR number from merge group head_ref (format: refs/heads/gh-readonly-queue/main/pr-<number>-<sha>)
        PR_NUMBER=$(echo "$HEAD_REF" | sed -n 's/.*\/pr-\([0-9]*\)-.*/\1/p')
        # Fail here with a clear message instead of letting the next step
        # break on an empty value.
        if [ -z "$PR_NUMBER" ]; then
          echo "::error::Could not extract a PR number from merge group ref '$HEAD_REF'"
          exit 1
        fi
        echo "pr_number=$PR_NUMBER" >> "$GITHUB_OUTPUT"

    - name: Comment on PR with action run URL
      uses: actions/github-script@v8
      env:
        PR_NUMBER: ${{ steps.get-pr-number.outputs.pr_number }}
      with:
        github-token: ${{ secrets.PAT }}
        script: |
          // Read from the environment rather than templating the value into the source.
          const prNumber = Number(process.env.PR_NUMBER);
          const runUrl = `https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}`;
          await github.rest.issues.createComment({
            owner: context.repo.owner,
            repo: context.repo.repo,
            issue_number: prNumber,
            body: `🔄 Merge queue validation started!\n\nYou can track the progress here: ${runUrl}`
          });
# Runs taint-node.sh on the selected runner after the test and coverage jobs
# have finished, but only for ephemeral runners and non-deployment workflows.
cleanup-taint-node:
  runs-on: ${{ needs.is-not-external-contributor.outputs.selected_runner }}
  needs:
    # pre-flight is required so its outputs are visible in the `if` below.
    - pre-flight
    - is-not-external-contributor
    - cicd-container-build
    - cicd-unit-tests-latest
    - cicd-integration-tests-latest-h100
    - cicd-integration-tests-latest-gb200
    - Coverage
    - Coverage_Fake
  # `!x == 'true'` parses as `(!x) == 'true'`, which compares a boolean with a
  # string and is never true; `!=` expresses the intended negation.
  if: |
    always()
    && !cancelled()
    && contains(needs.is-not-external-contributor.outputs.selected_runner, 'ephemeral')
    && needs.pre-flight.outputs.is_deployment_workflow != 'true'
  steps:
    - name: Taint node for cleanup
      shell: bash
      run: taint-node.sh
claude-complexity-label AI .github/workflows/claude-complexity-label.yml
View raw YAML
# Workflow: asks Claude to classify a PR's diff and apply exactly one of the
# labels "docs-only", "complexity: low", "complexity: medium" or
# "complexity: high".
name: Claude Complexity Label
# Fires when a pull request is marked ready for review.
on:
pull_request_target:
types: [ready_for_review]
jobs:
label-complexity:
name: Label PR Complexity
runs-on: ubuntu-latest
permissions:
contents: read
pull-requests: write
issues: write
id-token: write
# GH_TOKEN, REPO and PR_NUMBER are the variables the prompt's gh commands refer to.
env:
GH_TOKEN: ${{ secrets.PAT }}
REPO: ${{ github.repository }}
PR_NUMBER: ${{ github.event.pull_request.number }}
steps:
- name: Checkout repository
uses: actions/checkout@v6
with:
fetch-depth: 0
# The allowed tool list at the end restricts Claude to `gh pr diff`,
# `gh pr edit` and `gh pr view`.
- name: Run Claude Complexity Analysis
uses: anthropics/claude-code-action@v1
with:
anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }}
github_token: ${{ secrets.PAT }}
prompt: |
REPO: ${{ env.REPO }}
PR NUMBER: ${{ env.PR_NUMBER }}
You are a PR complexity analyzer. Your job is to analyze the diff of this PR and apply exactly one complexity label.
STEPS:
1. Get the PR diff by running: gh pr diff $PR_NUMBER --repo $REPO
2. Analyze every changed line (added or removed) in the diff and classify each as one of:
- "docs-only": changes to docstrings, comments (lines starting with # or //), documentation files (.md, .rst, .txt), or similar non-functional text
- "test": changes in test files (files with "test" in the name/path, or inside a tests/ directory)
- "real code": all other changes (functional source code)
3. Compute "real code line changes" using this formula:
real_code_line_changes = (number of real code lines changed) + (number of test lines changed / 10)
Count both added and removed lines. Do not count unchanged context lines. Do not count comments or docstrings.
4. Remove any previously applied complexity or docs-only labels:
gh pr edit $PR_NUMBER --repo $REPO --remove-label "complexity: low,complexity: medium,complexity: high,docs-only"
5. Apply exactly ONE label using the gh CLI:
- If there are ZERO real code lines and ZERO test lines (only docs-only changes), apply label "docs-only":
gh pr edit $PR_NUMBER --repo $REPO --add-label "docs-only"
- If real_code_line_changes < 100, apply label "complexity: low":
gh pr edit $PR_NUMBER --repo $REPO --add-label "complexity: low"
- If real_code_line_changes >= 100 and < 500, apply label "complexity: medium":
gh pr edit $PR_NUMBER --repo $REPO --add-label "complexity: medium"
- If real_code_line_changes >= 500, apply label "complexity: high":
gh pr edit $PR_NUMBER --repo $REPO --add-label "complexity: high"
Do NOT post any comments on the PR. Only apply the label.
claude_args: |
--allowedTools "Bash(gh pr diff:*),Bash(gh pr edit:*),Bash(gh pr view:*)"
claude-copy-to-main AI .github/workflows/claude-copy-to-main.yml
View raw YAML
# Workflow: on a "/claude copy" PR comment, asks Claude to re-apply a merged
# non-main PR onto a fresh branch from `main` and open a new PR for it.
name: Claude Copy PR to Main
# Fires on every newly created issue/PR comment; the job-level `if` narrows
# this to PR comments containing the trigger phrase.
on:
issue_comment:
types: [created]
jobs:
copy-to-main:
name: Copy PR to Main
if: |
github.event_name == 'issue_comment' &&
github.event.issue.pull_request &&
contains(github.event.comment.body, '/claude copy')
runs-on: ubuntu-latest
permissions:
contents: write
pull-requests: write
issues: write
id-token: write
# GH_TOKEN, REPO and PR_NUMBER are used by the gh commands in the steps and
# referenced by the prompt.
env:
GH_TOKEN: ${{ secrets.PAT }}
REPO: ${{ github.repository }}
PR_NUMBER: ${{ github.event.issue.number }}
steps:
# Guard 1: only commenters whose repository permission is admin or write
# may proceed; others get an explanatory comment and the job fails.
- name: Check commenter has write access
env:
COMMENTER: ${{ github.event.comment.user.login }}
run: |
PERMISSION=$(gh api repos/$REPO/collaborators/$COMMENTER/permission --jq .permission)
if [[ "$PERMISSION" != "admin" && "$PERMISSION" != "write" ]]; then
gh pr comment $PR_NUMBER --repo $REPO --body "❌ You do not have write access to use \`/claude copy\`."
exit 1
fi
- name: Check PR is merged and targets non-main
# Guard 2: rejects PRs whose base is `main` and PRs without a mergedAt value.
run: |
PR_JSON=$(gh pr view $PR_NUMBER --repo $REPO --json baseRefName,mergedAt)
PR_BASE=$(echo "$PR_JSON" | jq -r .baseRefName)
PR_MERGED=$(echo "$PR_JSON" | jq -r .mergedAt)
if [ "$PR_BASE" = "main" ]; then
gh pr comment $PR_NUMBER --repo $REPO --body "❌ This PR already targets \`main\`. \`/claude copy\` only works on PRs targeting non-main branches."
exit 1
fi
if [ "$PR_MERGED" = "null" ] || [ -z "$PR_MERGED" ]; then
gh pr comment $PR_NUMBER --repo $REPO --body "❌ This PR has not been merged yet. \`/claude copy\` only works on merged PRs."
exit 1
fi
- name: Checkout repository
uses: actions/checkout@v6
with:
fetch-depth: 0
token: ${{ secrets.PAT }}
- name: Fetch PR head ref from fork
# Makes the PR's head available locally as branch pr-<number>-head.
run: |
git fetch origin pull/$PR_NUMBER/head:pr-$PR_NUMBER-head
- name: Run Claude Copy to Main
uses: anthropics/claude-code-action@v1
with:
anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }}
trigger_phrase: "/claude copy"
github_token: ${{ secrets.PAT }}
prompt: |
REPO: ${{ env.REPO }}
PR NUMBER: ${{ env.PR_NUMBER }}
You are a PR copy assistant. Your job is to apply the final changes from a merged PR onto a new branch based on `main` and create a new PR targeting `main`.
The PR's commits originated from a fork and have been fetched locally as the branch: pr-${PR_NUMBER}-head
STEPS:
1. Get the PR details (title, body, and base branch):
gh pr view $PR_NUMBER --repo $REPO --json title,body,baseRefName
2. Configure git for committing (use the svcnvidia-nemo-ci service account since secrets.PAT belongs to it):
git config user.name "svcnvidia-nemo-ci"
git config user.email "svcnvidia-nemo-ci@nvidia.com"
3. Create a new branch from `main`:
git checkout main
git pull origin main
git checkout -b copy-pr-${PR_NUMBER}-to-main
4. Generate a patch of the PR's final changes and apply it:
MERGE_BASE=$(git merge-base origin/<baseRefName> pr-${PR_NUMBER}-head)
git diff $MERGE_BASE pr-${PR_NUMBER}-head | git apply --3way
(Replace <baseRefName> with the actual base branch name from step 1.)
If the apply fails due to merge conflicts:
a. Identify conflicted files: git diff --name-only --diff-filter=U
b. For each conflicted file, read its contents to see the conflict markers
c. Resolve the conflicts by favoring the `main` branch side when there is a genuine
conflict between the two sides. The goal is to bring the PR's changes into main
without overriding what is already on main.
d. Stage the resolved files: git add <file>
5. Commit the changes:
git add -A
git commit -m "Copy PR #${PR_NUMBER} to main"
6. Push the new branch:
git push origin copy-pr-${PR_NUMBER}-to-main
7. Create a new PR targeting `main`:
gh pr create --repo $REPO \
--base main \
--head copy-pr-${PR_NUMBER}-to-main \
--title "[Copy to main] <original PR title>" \
--body "🤖 **This PR was auto-generated by Claude** via the \`/claude copy\` command.\n\nCherry-picked from #${PR_NUMBER}.\n\n---\n\n<original PR body>"
8. Comment on the original PR with a link to the newly created PR.
IMPORTANT:
- When resolving merge conflicts, favor `main` over the non-main branch. Do not override changes already on main.
- Do NOT force push.
claude_args: |
--allowedTools "Bash(git:*),Bash(gh:*),Read,Edit"
--model "claude-opus-4-6"
claude_review AI .github/workflows/claude_review.yml
View raw YAML
# Workflow: on a "/claude review" PR comment, checks out the PR head commit
# and runs Claude with a light-review prompt.
name: Claude Code Review
# Fires on every newly created issue/PR comment; the job-level `if` narrows
# this to PR comments containing the trigger phrase.
on:
issue_comment:
types: [created]
jobs:
review-on-comment:
name: Claude Review (comment trigger)
if: |
github.event_name == 'issue_comment' &&
github.event.issue.pull_request &&
contains(github.event.comment.body, '/claude review')
runs-on: ubuntu-latest
permissions:
contents: read
pull-requests: write
issues: write
id-token: write
# GH_TOKEN, REPO and PR_NUMBER are used by the gh call below and referenced
# by the prompt.
env:
GH_TOKEN: ${{ github.token }}
REPO: ${{ github.repository }}
PR_NUMBER: ${{ github.event.issue.number }}
steps:
- name: Get PR head commit
id: get-pr-head-commit
# Resolves the PR's current head SHA and exposes it as the `sha` output.
run: |
echo "sha=$(gh pr view $PR_NUMBER --repo $REPO --json headRefOid -q .headRefOid)" | tee -a $GITHUB_OUTPUT
- name: Checkout repository
uses: actions/checkout@v6
with:
fetch-depth: 1
ref: ${{ steps.get-pr-head-commit.outputs.sha }}
- name: Run Claude Code Review
uses: anthropics/claude-code-action@v1
with:
anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }}
trigger_phrase: "/claude review"
show_full_output: true
claude_args: |
--allowedTools "mcp__github_inline_comment__create_inline_comment,Bash(gh pr comment:*),Bash(gh pr diff:*),Bash(gh pr view:*),Bash(gh pr review:*)"
--model "claude-opus-4-6"
prompt: |
REPO: ${{ env.REPO }}
PR NUMBER: ${{ env.PR_NUMBER }}
You are doing a light code review. Keep it concise and actionable.
Focus ONLY on:
- Critical bugs or logic errors
- Typos in code, comments, or strings
- Missing or insufficient test coverage for changed code
- Outdated or inaccurate documentation affected by the changes
Do NOT comment on:
- Style preferences or formatting
- Minor naming suggestions
- Architectural opinions or refactoring ideas
- Performance unless there is a clear, measurable issue
Only use inline ```suggestion blocks for simple, self-contained line replacements (typos,
renames, single-line fixes). For structural changes that add, remove, or reorganize blocks
of code (e.g. adding a new function, inserting a YAML step, reordering logic), use a
top-level PR comment with a code block showing the proposed change instead — inline
suggestions cannot express insertions or multi-block restructuring and will break the code
if applied.
It's perfectly acceptable to not have anything to comment on.
If you do not have anything to comment on, approve the PR with: gh pr review $PR_NUMBER --repo $REPO --approve --body "LGTM"
close-inactive-issue-pr .github/workflows/close-inactive-issue-pr.yml
View raw YAML
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Scheduled wrapper around the shared reusable workflow
# _close_inactive_issue_pr.yml from NVIDIA-NeMo/FW-CI-templates.
name: Stale-Close-Inactive-Issues-PRs

on:
  schedule:
    # Daily at 01:30 (cron schedules are evaluated in UTC).
    - cron: "30 1 * * *"

jobs:
  close-issues:
    # Restricted to the upstream repository so forks do not run it.
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_close_inactive_issue_pr.yml@v0.44.0
    if: github.repository == 'NVIDIA/Megatron-LM'
community-bot .github/workflows/community-bot.yml
View raw YAML
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Forwards issue and issue-comment events to the shared reusable workflow
# _community_bot.yml from NVIDIA-NeMo/FW-CI-templates.
name: Community Bot

on:
  issues:
    types: [opened, edited, reopened, closed, deleted]
  issue_comment:
    types: [created, edited, deleted]

jobs:
  community-bot:
    # Restricted to the upstream repository so forks do not run it.
    if: github.repository == 'NVIDIA/Megatron-LM'
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_community_bot.yml@v0.65.10
    with:
      community_project_id: ${{ vars.COMMUNITY_PROJECT_ID }}
    secrets:
      GH_TOKEN: ${{ secrets.PAT }}
copyright-check .github/workflows/copyright-check.yml
View raw YAML
# Copyright (c) 2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Runs the shared copyright-header check on PR mirror branches and deployment
# branches, then reports a single summary result.
name: Copyright check

on:
  push:
    branches:
      - "pull-request/[0-9]+"
      - "deploy-release/*"
  merge_group:
    types: [checks_requested]

jobs:
  pre-flight:
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.73.2
    if: github.repository == 'NVIDIA/Megatron-LM'

  # Skipped for docs-only changes, merge groups and deployment workflows.
  copyright-check:
    needs: [pre-flight]
    if: |
      !(needs.pre-flight.outputs.docs_only == 'true'
      || needs.pre-flight.outputs.is_merge_group == 'true'
      || needs.pre-flight.outputs.is_deployment_workflow == 'true')
      && github.repository == 'NVIDIA/Megatron-LM'
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_copyright_check.yml@v0.66.7

  # Summary gate: passes when no completed job of this run has a non-success
  # conclusion, or when pre-flight says skipping is acceptable.
  copyright-check-summary:
    needs: [pre-flight, copyright-check]
    # The parenthesised group always evaluates to true because of `always()`.
    if: |
      (
        needs.pre-flight.outputs.docs_only == 'true'
        || needs.pre-flight.outputs.is_deployment_workflow == 'true'
        || always()
      )
      && !cancelled()
      && github.repository == 'NVIDIA/Megatron-LM'
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v6

      - name: Result
        env:
          GH_TOKEN: ${{ github.token }}
          GITHUB_RUN_ID: ${{ github.run_id }}
          SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' || needs.pre-flight.outputs.is_merge_group == 'true' || needs.pre-flight.outputs.is_ci_workload == 'true' }}
        run: |
          # Assign the fallback explicitly: `|| echo 0` only printed a stray "0"
          # to the log and left FAILED_JOBS empty when the lookup failed.
          FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || FAILED_JOBS=0
          if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
            echo "✅ All previous jobs completed successfully"
            exit 0
          else
            echo "❌ Found $FAILED_JOBS failed job(s)"
            # Show which jobs failed
            gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
            exit 1
          fi
dependabot matrix perms .github/workflows/dependabot.yml
View raw YAML
# Weekly dependency bump: resolves the newest core_r* release branch, runs the
# reusable _update_dependencies workflow for it and for main, and posts to
# Slack if that fails.
name: Dependabot

on:
  schedule:
    - cron: "0 8 * * 1"
  workflow_dispatch: # Allow manual triggering

permissions:
  id-token: write
  contents: write

jobs:
  # Publishes the highest-versioned core_rX.Y.Z branch name as output `mcore`.
  get-release-branch-names:
    if: github.repository == 'NVIDIA/Megatron-LM'
    runs-on: ubuntu-latest
    outputs:
      mcore: ${{ steps.get-branch.outputs.mcore_release_branch }}
    steps:
      - id: get-branch
        name: Get release branch names
        env:
          PAT: ${{ secrets.PAT }}
        run: |
          # List matching remote heads, keep the version-shaped names, and take
          # the last one in version order.
          latest_branch=$(git ls-remote --heads https://token:${PAT}@github.com/NVIDIA/Megatron-LM.git 'refs/heads/core_r*' |
            grep -o 'core_r[0-9]\+\.[0-9]\+\.[0-9]\+' |
            sort -V |
            tail -n1)
          echo "mcore_release_branch=$latest_branch" | tee -a $GITHUB_OUTPUT

  # One reusable-workflow call per target branch.
  bump-tags:
    if: github.repository == 'NVIDIA/Megatron-LM'
    needs: [get-release-branch-names]
    strategy:
      fail-fast: false
      matrix:
        include:
          - target-branch: ${{ needs.get-release-branch-names.outputs.mcore }}
          - target-branch: main
    uses: ./.github/workflows/_update_dependencies.yml
    with:
      target-branch: ${{ matrix.target-branch }}
    secrets:
      PAT: ${{ secrets.PAT }}
      SSH_KEY: ${{ secrets.SSH_KEY }}
      SSH_PWD: ${{ secrets.SSH_PWD }}

  # Runs only when an upstream job failed.
  notify:
    needs: [bump-tags]
    if: failure() && github.repository == 'NVIDIA/Megatron-LM'
    runs-on: ubuntu-latest
    steps:
      - name: Notify
        env:
          SLACK_WEBHOOK: ${{ secrets.SLACK_CI_CHANNEL_WEBHOOK }}
          SLACK_WEBHOOK_ADMIN: <!subteam^${{ secrets.SLACK_TEAM_GROUP_ID }}>
          GITHUB_RUN_ID: ${{ github.run_id }}
          GITHUB_REPOSITORY: ${{ github.repository }}
        run: |
          curl -X POST \
            -H 'Content-type: application/json' \
            --data "{\"text\":\":robot_joy: <https://github.com/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}|Dependabot workflow> failed. Please fix manually.\n\ncc ${SLACK_WEBHOOK_ADMIN}\"}" \
            $SLACK_WEBHOOK
force-draft-pr perms .github/workflows/force-draft-pr.yml
View raw YAML
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Converts every newly opened PR against main back to draft and leaves a comment
# explaining the draft-first policy.
name: Force Draft PR

on:
  pull_request_target:
    types:
      - opened
    branches: [main]

permissions:
  pull-requests: write

jobs:
  force-draft:
    # Only act on PRs that were opened as non-draft, and only in the upstream repo.
    if: ${{ !github.event.pull_request.draft && github.repository == 'NVIDIA/Megatron-LM' }}
    runs-on: ubuntu-latest
    steps:
      - name: Convert PR to draft
        env:
          GH_TOKEN: ${{ secrets.PAT }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          REPO: ${{ github.repository }}
        run: |
          gh pr ready --undo "$PR_NUMBER" --repo "$REPO"

      - name: Add comment explaining draft policy
        env:
          GH_TOKEN: ${{ github.token }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          REPO: ${{ github.repository }}
        run: |
          gh pr comment "$PR_NUMBER" --repo "$REPO" --body \
          "This PR has been automatically converted to **draft** because all PRs must start as drafts.
          When you are ready for review, click **Ready for Review** to begin the review process. This will:
          1. Add the oncall reviewer (optional reviewer)
          2. Add required review teams based on your changes
          See the [contribution guide](https://github.com/NVIDIA/Megatron-LM/blob/main/docs/developer/submit.md) for more details."
install-test matrix .github/workflows/install-test.yml
View raw YAML
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This workflow verifies that the basic install works across all supported platforms.
# For basic install, all imports need to either be successful or appropriately guarded.
# Verifies that a basic install of megatron-core works inside the NGC PyTorch
# container, once through pip and once through uv, and then aggregates the result
# into a single summary job.
name: Installation Test

on:
  push:
    branches:
      - dev
      - main
      - "pull-request/[0-9]+"
      - "deploy-release/*"
  merge_group:
    types: [checks_requested]

jobs:
  pre-flight:
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.73.2
    if: github.repository == 'NVIDIA/Megatron-LM'

  pip-test-pytorch:
    needs: [pre-flight]
    # Skipped for docs-only changes, merge-group runs and deployment workflows.
    if: |
      !(needs.pre-flight.outputs.docs_only == 'true'
      || needs.pre-flight.outputs.is_merge_group == 'true'
      || needs.pre-flight.outputs.is_deployment_workflow == 'true')
      && github.repository == 'NVIDIA/Megatron-LM'
    runs-on: linux-amd64-cpu16
    name: Pip - Python${{ matrix.python-version }} - AMD64/Linux - NGC PyTorch
    container:
      image: nvcr.io/nvidia/pytorch:25.05-py3
    strategy:
      fail-fast: false
      matrix:
        python-version: ["3.12"]
    steps:
      - name: Checkout repository
        uses: actions/checkout@v6

      - name: Set PATH
        # Values appended to $GITHUB_ENV only become visible in *later* steps, so
        # the CUDA bin directory is spelled out literally here instead of being
        # derived from $CUDA_HOME, which this step has only just written.
        run: |
          echo "UV_PROJECT_ENVIRONMENT=/opt/venv" | tee -a "$GITHUB_ENV"
          echo "UV_LINK_MODE=copy" | tee -a "$GITHUB_ENV"
          echo "CUDA_HOME=/usr/local/cuda" | tee -a "$GITHUB_ENV"
          echo "LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH" | tee -a "$GITHUB_ENV"
          echo "PATH=$HOME/.local/bin:$PATH:/usr/local/cuda/bin" | tee -a "$GITHUB_ENV"
          echo "TORCH_CUDA_ARCH_LIST=6.0;6.1;7.0;7.5;8.0;8.6;9.0" | tee -a "$GITHUB_ENV"

      - name: Install megatron-core
        shell: bash -x -e -u -o pipefail {0}
        run: bash docker/common/install.sh --environment dev --base-image pytorch --python-version ${{ matrix.python-version }}

      - name: Checkout check-imports
        uses: actions/checkout@v6
        with:
          repository: NVIDIA-NeMo/FW-CI-templates
          ref: v0.63.2
          path: FW-CI-templates

      - name: Check imports for megatron-core
        uses: ./FW-CI-templates/.github/actions/check-imports
        with:
          package-name: megatron.core
          python-binary: ${{ env.UV_PROJECT_ENVIRONMENT }}/bin/python

  uv-test-pytorch:
    needs: [pre-flight]
    # Same skip conditions as the pip job.
    if: |
      !(needs.pre-flight.outputs.docs_only == 'true'
      || needs.pre-flight.outputs.is_merge_group == 'true'
      || needs.pre-flight.outputs.is_deployment_workflow == 'true')
      && github.repository == 'NVIDIA/Megatron-LM'
    runs-on: linux-amd64-cpu16
    name: UV - Python${{ matrix.python-version }} - AMD64/Linux - NGC PyTorch
    container:
      image: nvcr.io/nvidia/pytorch:25.05-py3
    strategy:
      fail-fast: false
      matrix:
        python-version: ["3.12"]
    steps:
      - name: Checkout repository
        uses: actions/checkout@v6

      - name: Set PATH
        # See the pip job: $CUDA_HOME written to $GITHUB_ENV is not yet readable
        # in this step, so the literal CUDA path is used for PATH.
        run: |
          echo "UV_PROJECT_ENVIRONMENT=/opt/venv" | tee -a "$GITHUB_ENV"
          echo "VIRTUAL_ENV=/opt/venv" | tee -a "$GITHUB_ENV"
          echo "UV_LINK_MODE=copy" | tee -a "$GITHUB_ENV"
          echo "CUDA_HOME=/usr/local/cuda" | tee -a "$GITHUB_ENV"
          echo "LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH" | tee -a "$GITHUB_ENV"
          echo "PATH=$HOME/.local/bin:$PATH:/usr/local/cuda/bin" | tee -a "$GITHUB_ENV"
          echo "CUDACXX=/usr/local/cuda/bin/nvcc" | tee -a "$GITHUB_ENV"
          echo "TORCH_CUDA_ARCH_LIST=6.0;6.1;7.0;7.5;8.0;8.6;9.0" | tee -a "$GITHUB_ENV"

      - name: Install project
        shell: bash
        run: bash docker/common/install.sh --environment dev --base-image pytorch --use-uv

      # NGC PyTorch 25.05 has a version of triton that is broken on CPU only machines.
      # - name: Checkout check-imports
      #   uses: actions/checkout@v6
      #   with:
      #     repository: NVIDIA-NeMo/FW-CI-templates
      #     ref: v0.63.2
      #     path: FW-CI-templates
      # - name: Check imports for megatron-core
      #   uses: ./FW-CI-templates/.github/actions/check-imports
      #   with:
      #     package-name: megatron.core
      #     python-binary: ${{ env.UV_PROJECT_ENVIRONMENT }}/bin/python

  install-test-summary:
    needs: [pre-flight, pip-test-pytorch, uv-test-pytorch]
    runs-on: ubuntu-latest
    name: Install test summary
    # always() keeps this job running even when the test jobs were skipped or
    # failed; it is still suppressed when the run is cancelled.
    if: |
      (
        needs.pre-flight.outputs.docs_only == 'true'
        || needs.pre-flight.outputs.is_deployment_workflow == 'true'
        || always()
      )
      && !cancelled()
      && github.repository == 'NVIDIA/Megatron-LM'
    steps:
      - name: Checkout
        uses: actions/checkout@v6

      - name: Get workflow result
        id: result
        shell: bash -x -e -u -o pipefail {0}
        env:
          GH_TOKEN: ${{ github.token }}
          RUN_ID: ${{ github.run_id }}
          SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' || needs.pre-flight.outputs.is_merge_group == 'true' }}
        # Counts completed jobs of this run whose conclusion is not "success"
        # (this includes skipped jobs, hence SKIPPING_IS_ALLOWED). If the API call
        # itself fails, FAILED_JOBS stays empty and is treated as 0 below.
        run: |
          FAILED_JOBS=$(gh run view "$RUN_ID" --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0
          if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
            echo "✅ All previous jobs completed successfully"
            exit 0
          else
            echo "❌ Found $FAILED_JOBS failed job(s)"
            # Show which jobs failed
            gh run view "$RUN_ID" --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
            exit 1
          fi
multi-approval-bot .github/workflows/multi-approval-bot.yml
View raw YAML
# Checks that a PR has the required codeowner approvals, then reports a single
# pass/fail summary job for the run.
name: Codeowners Approval Workflow

on:
  push:
    branches:
      - 'pull-request/[0-9]+'
  merge_group:
    types:
      - checks_requested

jobs:
  pre-flight:
    if: github.repository == 'NVIDIA/Megatron-LM'
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.73.2

  codeowners-approval:
    needs:
      - pre-flight
    # Not run for docs-only changes, merge-group runs or deployment workflows.
    if: >-
      !(needs.pre-flight.outputs.docs_only == 'true'
      || needs.pre-flight.outputs.is_merge_group == 'true'
      || needs.pre-flight.outputs.is_deployment_workflow == 'true')
    runs-on: ubuntu-latest
    steps:
      - name: Get PR info
        id: get-pr-info
        if: startsWith(github.ref, 'refs/heads/pull-request/')
        uses: nv-gha-runners/get-pr-info@main

      # The approval action is fetched from its own repository and then used from
      # the local checkout path.
      - name: Checkout action
        uses: actions/checkout@v6
        with:
          repository: noamelf/codeowner-multi-approval-action
          ref: v0.1
          path: codeowner-multi-approval-action

      - name: Check Codeowners Approval
        uses: ./codeowner-multi-approval-action
        with:
          # Falls back to an empty JSON object when the PR-info step did not run.
          pr-number: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
          repo-name: ${{ github.repository }}
          github-token: ${{ secrets.PAT }}

  multi-approval-bot-summary:
    needs:
      - pre-flight
      - codeowners-approval
    # always() keeps the summary running after skipped or failed dependencies;
    # a cancelled run still suppresses it.
    if: >-
      (
      needs.pre-flight.outputs.docs_only == 'true'
      || needs.pre-flight.outputs.is_merge_group == 'true'
      || needs.pre-flight.outputs.is_deployment_workflow == 'true'
      || always()
      )
      && github.repository == 'NVIDIA/Megatron-LM'
      && !cancelled()
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v6

      - name: Result
        env:
          GH_TOKEN: ${{ github.token }}
          GITHUB_RUN_ID: ${{ github.run_id }}
          SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' || needs.pre-flight.outputs.is_merge_group == 'true' || needs.pre-flight.outputs.is_ci_workload == 'true' }}
        # Any completed job whose conclusion is not "success" counts as failed,
        # unless skipping is explicitly allowed for this run.
        run: |
          FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0
          if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
            echo "✅ All previous jobs completed successfully"
            exit 0
          else
            echo "❌ Found $FAILED_JOBS failed job(s)"
            # Show which jobs failed
            gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
            exit 1
          fi
oncall-assign perms .github/workflows/oncall-assign.yml
View raw YAML
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Runs the oncall manager script to assign a reviewer when a PR against main is
# marked ready for review.
name: Oncall Assign

on:
  pull_request_target:
    types:
      - ready_for_review
    branches: [main]

permissions:
  contents: read
  pull-requests: write

jobs:
  assign-reviewer:
    if: ${{ !github.event.pull_request.draft }}
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v6

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: "3.10"

      - name: Install dependencies
        run: pip install requests slack-sdk

      - name: Assign Reviewer
        env:
          GH_TOKEN: ${{ secrets.PAT }}
        run: python .github/scripts/oncall_manager.py assign --pr ${{ github.event.pull_request.number }}
oncall-rotation perms .github/workflows/oncall-rotation.yml
View raw YAML
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Weekly rotation of the oncall schedule: runs the oncall manager script and
# pushes the updated .github/oncall_schedule.json to main.
name: Oncall Rotation

on:
  workflow_dispatch:
  schedule:
    # Runs at 09:00 UTC every Wednesday
    - cron: '0 9 * * 3'

permissions:
  contents: write

jobs:
  rotate-schedule:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v6
        with:
          token: ${{ secrets.PAT }}

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: '3.10'

      - name: Rotate Schedule
        env:
          # Token to read org team members. Needs read:org scope.
          # The first non-empty secret in this chain is used.
          GH_TOKEN: ${{ secrets.NVIDIA_MCORE_ONCALL_TOKEN || secrets.PAT || secrets.GITHUB_TOKEN }}
          # Slack token for updating the Slack usergroup
          SLACK_TOKEN: ${{ secrets.ONCALL_SLACK_TOKEN }}
        run: |
          pip install --no-cache-dir "uv<0.9.29"
          uv venv .venv
          uv cache clean
          uv sync --no-cache
          uv run --with slack-sdk python .github/scripts/oncall_manager.py rotate

      - name: Commit and Push changes
        # A no-op commit is tolerated; the rebase picks up anything that landed
        # on the remote before the push.
        run: |
          git config --global user.name "github-actions[bot]"
          git config --global user.email "github-actions[bot]@users.noreply.github.com"
          git add .github/oncall_schedule.json
          git commit -m "chore: rotate oncall schedule" || echo "No changes to commit"
          git pull --rebase
          git push origin HEAD:main
release perms .github/workflows/release.yaml
View raw YAML
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Manually dispatched release of Megatron-Core. All work is delegated to the
# reusable _release_library workflow; this file only maps inputs and secrets.
name: "Release Megatron-Core"

on:
  workflow_dispatch:
    inputs:
      release-ref:
        description: Ref (SHA or branch name) to release
        required: true
        type: string
      dry-run:
        description: Do not publish a wheel and GitHub release.
        required: true
        default: true
        type: boolean
      create-gh-release:
        description: Create a GitHub release
        required: true
        default: true
        type: boolean
      generate-changelog:
        description: Generate changelog
        required: false
        default: true
        type: boolean
      publish-docs:
        description: Publish docs
        required: false
        default: true
        type: boolean
      version-bump-branch:
        description: Branch for version bump
        required: true
        type: string
      gh-release-from-tag:
        description: Tag of previous release for changelog builder
        required: false
        type: string
        default: ""

permissions:
  # NOTE(review): write access to repository contents — presumably needed by the
  # called workflow to push tags/releases; confirm against _release_library.yml.
  contents: write
  pull-requests: write # To create PRs

jobs:
  release:
    uses: ./.github/workflows/_release_library.yml
    with:
      release-ref: ${{ inputs.release-ref || github.sha }}
      dry-run: ${{ inputs.dry-run }}
      version-bump-branch: ${{ inputs.version-bump-branch || github.ref_name }}
      # Pass the boolean through unchanged. The previous `inputs.create-gh-release || true`
      # evaluated to true even when the operator explicitly chose false.
      create-gh-release: ${{ inputs.create-gh-release }}
      gh-release-use-changelog-builder: ${{ inputs.generate-changelog }}
      publish-docs: ${{ inputs.publish-docs }}
      gh-release-from-tag: ${{ inputs.gh-release-from-tag }}
    secrets:
      # Production PyPI token / main Slack channel only when dispatched from main
      # or a branch whose name starts with "r"; test token / CI channel otherwise.
      TWINE_PASSWORD: ${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && secrets.SVC_PYPI_TOKEN || secrets.SVC_PYPI_TEST_TOKEN }}
      SLACK_WEBHOOK: ${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && secrets.SLACK_MAIN_CHANNEL_WEBHOOK || secrets.SLACK_CI_CHANNEL_WEBHOOK }}
      PAT: ${{ secrets.PAT }}
      AWS_ASSUME_ROLE_ARN: ${{ secrets.AWS_ASSUME_ROLE_ARN }}
      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
      AKAMAI_HOST: ${{ secrets.AKAMAI_HOST }}
      AKAMAI_CLIENT_TOKEN: ${{ secrets.AKAMAI_CLIENT_TOKEN }}
      AKAMAI_CLIENT_SECRET: ${{ secrets.AKAMAI_CLIENT_SECRET }}
      AKAMAI_ACCESS_TOKEN: ${{ secrets.AKAMAI_ACCESS_TOKEN }}
      S3_BUCKET_NAME: ${{ secrets.S3_BUCKET_NAME }}
release-docs .github/workflows/release-docs.yml
View raw YAML
# Copyright (c) 2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Builds the documentation with the shared FW-CI-templates workflow and publishes
# the resulting HTML through the shared publish-docs action. Can be dispatched
# manually or called from another workflow.
name: Release docs

on:
  workflow_dispatch:
    inputs:
      dry-run:
        description: Whether to run the workflow in dry-run mode
        type: boolean
        required: true
        default: true
      publish-as-latest:
        description: Publish as Latest stable version.
        type: boolean
        required: false
        default: true
      docs-version-override:
        description: Docs version if commit is not tagged
        type: string
        required: false
        default: ''
      update-version-picker:
        description: Update version picker.
        type: boolean
        required: false
        default: true
      notify-emails:
        description: 'Email addresses to send the notification to. Format as "me@me.com,you@you.com".'
        type: string
        required: false
  workflow_call:
    inputs:
      dry-run:
        description: Whether to run the workflow in dry-run mode
        type: boolean
        required: true
        default: true
      publish-as-latest:
        description: Publish as Latest stable version.
        type: boolean
        required: false
        default: true
      docs-version-override:
        description: Docs version if commit is not tagged
        type: string
        required: false
        default: ''
      update-version-picker:
        description: Update version picker.
        type: boolean
        required: false
        default: true
      notify-emails:
        description: 'Email addresses to send the notification to. Format as "me@me.com,you@you.com".'
        type: string
        required: false
      # Only available to callers; the manual-dispatch form does not declare it.
      build-docs-ref:
        description: Reference to build the docs from
        type: string
        required: false
        default: ${{ github.sha }}

jobs:
  build-docs:
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_docs.yml@v0.67.0
    with:
      ref: ${{ inputs.build-docs-ref }}

  publish-docs:
    needs:
      - build-docs
    runs-on: ubuntu-latest
    steps:
      # The publish action lives in FW-CI-templates; check it out and use it from
      # the local path.
      - uses: actions/checkout@v6
        with:
          repository: NVIDIA-NeMo/FW-CI-templates
          ref: v0.74.0
          path: FW-CI-templates

      # This workflow runs either on main, or on a version tag. Any other git ref will lead
      # to an error.
      # If it's on main, it will publish to the "latest" directory in Akamai.
      # If it's on a versioned tag, it will extract the version number from the tag (strip `v` prefix)
      # and publish to the versioned directory in Akamai.
      - uses: ./FW-CI-templates/.github/actions/publish-docs
        with:
          dry-run: ${{ inputs.dry-run }}
          artifacts-name: docs-html
          artifacts-path: _build/html
          # Extra recipients from the input are prepended with the configured default list.
          emails-csv: ${{ inputs.notify-emails && format('{0},{1}', vars.docs_release_emails, inputs.notify-emails) || vars.docs_release_emails }}
          overwrite-latest-on-tag: ${{ inputs.publish-as-latest }}
          docs-version-override: ${{ inputs.docs-version-override }}
          update-version-picker: ${{ inputs.update-version-picker }}
          run-on-version-tag-only: ${{ github.ref_name != 'main' }}
          request-name: megatron-core-publish-docs-${{ github.run_id }}
          aws-region: ${{ vars.DOCS_AWS_REGION }}
          aws-role-to-assume: ${{ secrets.AWS_ASSUME_ROLE_ARN }}
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          akamai-host: ${{ secrets.AKAMAI_HOST }}
          akamai-client-token: ${{ secrets.AKAMAI_CLIENT_TOKEN }}
          akamai-client-secret: ${{ secrets.AKAMAI_CLIENT_SECRET }}
          akamai-access-token: ${{ secrets.AKAMAI_ACCESS_TOKEN }}
          s3-target-root: ${{ secrets.S3_BUCKET_NAME }}
          s3-target-path: megatron-core/developer-guide
release-freeze .github/workflows/release-freeze.yml
View raw YAML
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Manually dispatched code freeze; delegates to the shared _code_freeze workflow
# from FW-CI-templates.
name: "Code freeze"

on:
  workflow_dispatch:
    inputs:
      release-type:
        type: choice
        description: Type of release
        options:
          - major
          - minor
      freeze-commit:
        type: string
        description: Commit SHA to use for cut-off
        required: false
        default: main
      dry-run:
        type: boolean
        description: Dry-run of code-freeze
        required: false
        default: true

jobs:
  code-freeze:
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_code_freeze.yml@v0.22.5
    with:
      # NOTE(review): these were `Megatron-Bridge` / `megatron.bridge`, which look
      # copied from another repository; the rest of this repo's workflows target
      # Megatron-Core / `megatron.core`. Confirm the exact values the
      # _code_freeze template expects before merging.
      library-name: Megatron-Core
      python-package: megatron.core
      release-type: ${{ inputs.release-type }}
      freeze-commit: ${{ inputs.freeze-commit }}
      dry-run: ${{ inputs.dry-run }}
    secrets:
      SLACK_WEBHOOK: ${{ secrets.SLACK_MAIN_CHANNEL_WEBHOOK }}
      SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_TEAM_GROUP_ID }}
release-nightly-docs .github/workflows/release-nightly-docs.yml
View raw YAML
# Copyright (c) 2026, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Daily, non-dry-run invocation of the release-docs workflow that publishes the
# docs under the "nightly" version without touching "latest" or the version picker.
name: Release Nightly Docs

on:
  schedule:
    - cron: '0 10 * * *'

jobs:
  call-release-docs:
    uses: ./.github/workflows/release-docs.yml
    secrets: inherit
    with:
      dry-run: false
      docs-version-override: nightly
      publish-as-latest: false
      update-version-picker: false
review-trigger .github/workflows/review-trigger.yml
View raw YAML
# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Lightweight workflow that triggers on review approval. It exists because a run
# triggered by the review event itself has no access to the required secret.
# No secrets are needed here — it just signals auto-swap-labels.yml via workflow_run.
# Records the PR number as an artifact whenever an approving review is submitted
# on a PR targeting main.
name: Review Trigger

on:
  pull_request_review:
    types:
      - submitted

jobs:
  signal:
    if: |
      github.event.review.state == 'approved' &&
      github.event.pull_request.base.ref == 'main' &&
      github.repository == 'NVIDIA/Megatron-LM'
    runs-on: ubuntu-latest
    steps:
      - name: Save PR number
        env:
          PR_NUMBER: ${{ github.event.pull_request.number }}
        run: |
          mkdir -p pr
          echo "$PR_NUMBER" > pr/number

      # The uploaded directory contains the single file pr/number.
      - name: Upload PR number
        uses: actions/upload-artifact@v4
        with:
          name: pr-number
          path: pr/
sync-team-usergroups .github/workflows/sync-team-usergroups.yml
View raw YAML
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Daily (and on-demand) run of the script that syncs GitHub teams to Slack user
# groups.
name: Sync GitHub Teams to Slack User Groups

on:
  schedule:
    - cron: '0 0 * * *'
  workflow_dispatch:

jobs:
  sync-usergroups:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v6

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: '3.10'

      - name: Sync Teams to User Groups
        env:
          # The first non-empty secret in this chain is used as the GitHub token.
          GH_TOKEN: ${{ secrets.NVIDIA_MCORE_ONCALL_TOKEN || secrets.PAT || secrets.GITHUB_TOKEN }}
          SLACK_TOKEN: ${{ secrets.ONCALL_SLACK_TOKEN }}
        # Installs a pinned-below uv, creates a fresh environment without using
        # any cache, then runs the sync script with slack-sdk added on the fly.
        run: |
          pip install --no-cache-dir "uv<0.9.29"
          uv venv .venv
          uv cache clean
          uv sync --no-cache
          uv run --with slack-sdk python .github/scripts/sync_team_usergroups.py
trigger-mbridge-tests .github/workflows/trigger-mbridge-tests.yml
View raw YAML
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Manually dispatched workflow that starts the cicd-main.yml workflow in the
# NVIDIA-NeMo/Megatron-Bridge repository, passing this repository's commit SHA
# and the selected test suite in the client payload.
name: Trigger MBridge Tests
on:
workflow_dispatch:
inputs:
# Ref of the Megatron-Bridge repository on which the remote workflow is started.
mbridge_ref:
description: "MBridge branch/ref to trigger"
required: false
type: string
default: "main"
# Forwarded verbatim to the remote workflow as "test_suite".
test_suite:
description: "Test suite to run"
required: false
type: choice
options:
- "all"
- "unit-only"
- "functional-only"
default: "all"
jobs:
trigger-mbridge-tests:
runs-on: ubuntu-latest
steps:
# NOTE(review): input semantics below are inferred from the action's name and
# input names (wait_interval presumably in seconds; propagate_failure presumably
# fails this job when the remote run fails) — confirm against the action's docs.
- name: Trigger MBridge tests
uses: convictional/trigger-workflow-and-wait@v1.6.5
with:
owner: NVIDIA-NeMo
repo: Megatron-Bridge
workflow_file_name: cicd-main.yml
github_token: ${{ secrets.PAT }}
ref: ${{ inputs.mbridge_ref }}
wait_interval: 60
propagate_failure: true
client_payload: |
{
"mcore_ref": "${{ github.sha }}",
"test_suite": "${{ inputs.test_suite }}",
"triggered_by": "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
}