Skip to content

Commit 94ffaa7

Browse files
committed
Merge branch 'main' into titaiwang/drop_draft_export
2 parents 5e76e95 + ca9fe01 commit 94ffaa7

File tree

98 files changed

+2841
-2985
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

98 files changed

+2841
-2985
lines changed

.ci/docker/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -120,8 +120,8 @@ If your new Docker image needs a library installed from a specific pinned commit
120120
If you're introducing a new argument to the Docker build, make sure to add it in the Docker build step in `.ci/docker/build.sh`:
121121
```bash
122122
docker build \
123-
....
124-
--build-arg "NEW_ARG_1=${NEW_ARG_1}"
123+
....
124+
--build-arg "NEW_ARG_1=${NEW_ARG_1}"
125125
```
126126

127127
3. **Update Dockerfile logic**:

.ci/docker/common/install_cuda.sh

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ else
1010
arch_path='sbsa'
1111
fi
1212

13-
NVSHMEM_VERSION=3.3.20
13+
NVSHMEM_VERSION=3.3.24
1414

1515
function install_cuda {
1616
version=$1
@@ -65,7 +65,7 @@ function install_nvshmem {
6565
# This pattern is a lie as it is not consistent across versions; for 3.3.9 it was cuda_ver-arch-nvshmem-ver
6666
filename="libnvshmem-linux-${arch_path}-${nvshmem_version}_cuda${cuda_major_version}-archive"
6767
suffix=".tar.xz"
68-
url="https://developer.download.nvidia.com/compute/redist/nvshmem/${nvshmem_version}/builds/cuda${cuda_major_version}/txz/agnostic/${dl_arch}/${filename}${suffix}"
68+
url="https://developer.download.nvidia.com/compute/nvshmem/redist/libnvshmem/linux-${arch_path}/${filename}${suffix}"
6969

7070
# download, unpack, install
7171
wget -q "${url}"
@@ -148,7 +148,6 @@ function install_128 {
148148

149149
function install_130 {
150150
CUDNN_VERSION=9.12.0.46
151-
NVSHMEM_VERSION=3.3.20
152151
echo "Installing CUDA 13.0 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
153152
# install CUDA 13.0 in the same container
154153
install_cuda 13.0.0 cuda_13.0.0_580.65.06_linux

.ci/docker/common/install_triton.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ if [ ! -f setup.py ]; then
5757
cd python
5858
fi
5959

60-
pip_install pybind11==2.13.6
60+
pip_install pybind11==3.0.1
6161

6262
# TODO: remove patch setup.py once we have a proper fix for https://github.com/triton-lang/triton/issues/4527
6363
as_jenkins sed -i -e 's/https:\/\/tritonlang.blob.core.windows.net\/llvm-builds/https:\/\/oaitriton.blob.core.windows.net\/public\/llvm-builds/g' setup.py

.ci/pytorch/check_binary.sh

Lines changed: 0 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -300,24 +300,3 @@ except RuntimeError as e:
300300
exit 1
301301
fi
302302
fi
303-
304-
###############################################################################
305-
# Check for C++ ABI compatibility to GCC-11 - GCC 13
306-
###############################################################################
307-
if [[ "$(uname)" == 'Linux' && "$PACKAGE_TYPE" == 'manywheel' ]]; then
308-
pushd /tmp
309-
# Per https://gcc.gnu.org/onlinedocs/gcc/C_002b_002b-Dialect-Options.html
310-
# gcc-11 is ABI16, gcc-13 is ABI18, gcc-14 is ABI19
311-
# gcc 11 - CUDA 11.8, xpu, rocm
312-
# gcc 13 - CUDA 12.6, 12.8 and cpu
313-
# Please see issue for reference: https://github.com/pytorch/pytorch/issues/152426
314-
if [[ "$(uname -m)" == "s390x" ]]; then
315-
cxx_abi="19"
316-
elif [[ "$DESIRED_CUDA" != 'xpu' && "$DESIRED_CUDA" != 'rocm'* ]]; then
317-
cxx_abi="18"
318-
else
319-
cxx_abi="16"
320-
fi
321-
python -c "import torch; exit(0 if torch._C._PYBIND11_BUILD_ABI == '_cxxabi10${cxx_abi}' else 1)"
322-
popd
323-
fi

.ci/pytorch/common_utils.sh

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -149,13 +149,22 @@ function get_pinned_commit() {
149149
cat .github/ci_commit_pins/"${1}".txt
150150
}
151151

152+
function detect_cuda_arch() {
153+
if [[ "${BUILD_ENVIRONMENT}" == *cuda* ]]; then
154+
if command -v nvidia-smi; then
155+
TORCH_CUDA_ARCH_LIST=$(nvidia-smi --query-gpu=compute_cap --format=csv | tail -n 1)
156+
elif [[ "${TEST_CONFIG}" == *nogpu* ]]; then
157+
# There won't be nvidia-smi in nogpu tests, so just set TORCH_CUDA_ARCH_LIST to the default
158+
# minimum supported value here
159+
TORCH_CUDA_ARCH_LIST=8.0
160+
fi
161+
export TORCH_CUDA_ARCH_LIST
162+
fi
163+
}
164+
152165
function install_torchaudio() {
153166
local commit
154167
commit=$(get_pinned_commit audio)
155-
if [[ "${BUILD_ENVIRONMENT}" == *cuda* ]] && command -v nvidia-smi; then
156-
TORCH_CUDA_ARCH_LIST=$(nvidia-smi --query-gpu=compute_cap --format=csv | tail -n 1)
157-
export TORCH_CUDA_ARCH_LIST
158-
fi
159168
pip_build_and_install "git+https://github.com/pytorch/audio.git@${commit}" dist/audio
160169
}
161170

.ci/pytorch/multigpu-test.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ if [[ "${SHARD_NUMBER:-2}" == "2" ]]; then
4545
# DTensor tests
4646
time python test/run_test.py --verbose -i distributed/tensor/test_random_ops
4747
time python test/run_test.py --verbose -i distributed/tensor/test_dtensor_compile
48+
time python test/run_test.py --verbose -i distributed/tensor/test_utils.py
4849

4950
# DeviceMesh test
5051
time python test/run_test.py --verbose -i distributed/test_device_mesh

.ci/pytorch/test.sh

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ if [[ "$BUILD_ENVIRONMENT" == *clang9* || "$BUILD_ENVIRONMENT" == *xpu* ]]; then
9191
export VALGRIND=OFF
9292
fi
9393

94+
detect_cuda_arch
9495

9596
if [[ "$BUILD_ENVIRONMENT" == *s390x* ]]; then
9697
# There are additional warnings on s390x, maybe due to newer gcc.
@@ -1630,11 +1631,7 @@ elif [[ "${TEST_CONFIG}" == *xla* ]]; then
16301631
build_xla
16311632
test_xla
16321633
elif [[ "$TEST_CONFIG" == *vllm* ]]; then
1633-
if [[ "${BUILD_ENVIRONMENT}" == *cuda* ]]; then
1634-
TORCH_CUDA_ARCH_LIST=$(nvidia-smi --query-gpu=compute_cap --format=csv | tail -n 1)
1635-
export TORCH_CUDA_ARCH_LIST
1636-
fi
1637-
echo "VLLM CI TORCH_CUDA_ARCH_LIST: $TORCH_CUDA_ARCH_LIST"
1634+
echo "vLLM CI uses TORCH_CUDA_ARCH_LIST: $TORCH_CUDA_ARCH_LIST"
16381635
(cd .ci/lumen_cli && python -m pip install -e .)
16391636
python -m cli.run test external vllm --test-plan "$TEST_CONFIG" --shard-id "$SHARD_NUMBER" --num-shards "$NUM_TEST_SHARDS"
16401637
elif [[ "${TEST_CONFIG}" == *executorch* ]]; then

.github/scripts/generate_binary_build_matrix.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@
107107
"nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | "
108108
"nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | "
109109
"nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | "
110-
"nvidia-nvshmem-cu13==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
110+
"nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | "
111111
"nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | "
112112
"nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | "
113113
"nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'"
@@ -210,7 +210,7 @@ def arch_type(arch_version: str) -> str:
210210
"cpu": "libtorch-cxx11-builder:cpu",
211211
}
212212

213-
FULL_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t"]
213+
FULL_PYTHON_VERSIONS = ["3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t"]
214214

215215

216216
def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str:

.github/workflows/_link_check.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ jobs:
1313
if: ${{ github.event_name != 'pull_request' || !contains(github.event.pull_request.labels.*.name, 'skip-url-lint') }}
1414
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
1515
with:
16+
job-name: lint-urls
1617
timeout: 120
1718
runner: ${{ inputs.runner }}linux.2xlarge
1819
docker-image: ci-image:pytorch-linux-jammy-linter
@@ -38,6 +39,7 @@ jobs:
3839
if: ${{ github.event_name != 'pull_request' || !contains(github.event.pull_request.labels.*.name, 'skip-xref-lint') }}
3940
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
4041
with:
42+
job-name: lint-xrefs
4143
timeout: 60
4244
runner: ${{ inputs.runner }}linux.2xlarge
4345
docker-image: ci-image:pytorch-linux-jammy-linter

.github/workflows/_linux-test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -409,7 +409,7 @@ jobs:
409409
job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}
410410

411411
- name: Authenticate with AWS
412-
if: ${{ contains(matrix.runner, 'b200') }}
412+
if: ${{ always() && contains(matrix.runner, 'b200') }}
413413
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
414414
with:
415415
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results

0 commit comments

Comments
 (0)