Skip to content

Commit ecdf475

Browse files
committed
Update on "[PGNCCL] Fix behavior of destroy_process_group"
Today `destroy_process_group()` is implemented via `ncclCommAbort`. When a user calls it from the CPU, the risk is that a healthy NCCL kernel gets preempted, which causes data corruption. Instead of aborting kernels, we should flush collectives in `destroy_process_group`, i.e. let them complete normally, before we tear down resources. This PR implements such "flushing" behavior using `ncclCommFinalize`, then reclaims resources via `ncclCommDestroy`. Expected behaviors: For a bad program, a hang is expected at `destroy_process_group()`. If the PG uses non-blocking communicators, such a hang is recoverable, because we attach a timeout to the flush behavior. cc H-Huang awgu wanchaol fegin fduwjj wz337 wconstab d4l3k c-p-i-o [ghstack-poisoned]
2 parents 22b4c0b + 4dc354d commit ecdf475

File tree

664 files changed

+22784
-13323
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

664 files changed

+22784
-13323
lines changed

.ci/docker/build.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -510,7 +510,7 @@ docker build \
510510
--build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \
511511
--build-arg "KATEX=${KATEX:-}" \
512512
--build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \
513-
--build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx906;gfx90a}" \
513+
--build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx90a}" \
514514
--build-arg "IMAGE_NAME=${IMAGE_NAME}" \
515515
--build-arg "UCX_COMMIT=${UCX_COMMIT}" \
516516
--build-arg "UCC_COMMIT=${UCC_COMMIT}" \
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
91b14bf5593cf58a8541f3e6b9125600a867d4ef
1+
e98b6fcb8df5b44eb0d0addb6767c573d37ba024

.ci/docker/common/install_conda.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
6565

6666
# Install PyTorch conda deps, as per https://github.com/pytorch/pytorch README
6767
if [[ $(uname -m) == "aarch64" ]]; then
68-
conda_install "openblas==0.3.25=*openmp*"
68+
conda_install "openblas==0.3.28=*openmp*"
6969
else
7070
conda_install "mkl=2021.4.0 mkl-include=2021.4.0"
7171
fi

.ci/docker/common/install_cuda.sh

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,17 @@ function install_cusparselt_062 {
3838
rm -rf tmp_cusparselt
3939
}
4040

41+
function install_cusparselt_063 {
    # Fetch the cuSparseLt 0.6.3.2 x86_64 redistributable archive and copy its
    # headers and libraries into the active CUDA toolkit under /usr/local/cuda.
    # Work happens in a scratch directory (tmp_cusparselt) that is removed at the end.
    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
    local archive="libcusparse_lt-linux-x86_64-0.6.3.2-archive"
    local redist_url="https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64"

    mkdir tmp_cusparselt && pushd tmp_cusparselt
    wget -q "${redist_url}/${archive}.tar.xz"
    tar xf "${archive}.tar.xz"
    # -a keeps symlinks and attributes of the shipped files intact
    cp -a "${archive}"/include/* /usr/local/cuda/include/
    cp -a "${archive}"/lib/* /usr/local/cuda/lib64/
    popd
    rm -rf tmp_cusparselt
}
51+
4152
function install_118 {
4253
CUDNN_VERSION=9.1.0.70
4354
echo "Installing CUDA 11.8 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.4.0"
@@ -140,13 +151,13 @@ function install_124 {
140151
}
141152

142153
function install_126 {
143-
echo "Installing CUDA 12.6.2 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.2"
154+
echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
144155
rm -rf /usr/local/cuda-12.6 /usr/local/cuda
145-
# install CUDA 12.6.2 in the same container
146-
wget -q https://developer.download.nvidia.com/compute/cuda/12.6.2/local_installers/cuda_12.6.2_560.35.03_linux.run
147-
chmod +x cuda_12.6.2_560.35.03_linux.run
148-
./cuda_12.6.2_560.35.03_linux.run --toolkit --silent
149-
rm -f cuda_12.6.2_560.35.03_linux.run
156+
# install CUDA 12.6.3 in the same container
157+
wget -q https://developer.download.nvidia.com/compute/cuda/12.6.3/local_installers/cuda_12.6.3_560.35.05_linux.run
158+
chmod +x cuda_12.6.3_560.35.05_linux.run
159+
./cuda_12.6.3_560.35.05_linux.run --toolkit --silent
160+
rm -f cuda_12.6.3_560.35.05_linux.run
150161
rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.6 /usr/local/cuda
151162

152163
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
@@ -167,7 +178,7 @@ function install_126 {
167178
cd ..
168179
rm -rf nccl
169180

170-
install_cusparselt_062
181+
install_cusparselt_063
171182

172183
ldconfig
173184
}

.ci/docker/common/install_cuda_aarch64.sh

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,17 @@ function install_cusparselt_062 {
1717
rm -rf tmp_cusparselt
1818
}
1919

20+
function install_cusparselt_063 {
    # Fetch the cuSparseLt 0.6.3.2 redistributable archive and copy its headers
    # and libraries into the active CUDA toolkit under /usr/local/cuda.
    # Work happens in a scratch directory (tmp_cusparselt) that is removed at the end.
    #
    # This script installs the aarch64 ("sbsa") CUDA toolkit, so the matching
    # linux-sbsa cuSparseLt build must be used; the linux-x86_64 archive contains
    # libraries that cannot be linked or loaded on an aarch64 image.
    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
    local archive="libcusparse_lt-linux-sbsa-0.6.3.2-archive"
    local redist_url="https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-sbsa"

    mkdir tmp_cusparselt && pushd tmp_cusparselt
    wget -q "${redist_url}/${archive}.tar.xz"
    tar xf "${archive}.tar.xz"
    # -a keeps symlinks and attributes of the shipped files intact
    cp -a "${archive}"/include/* /usr/local/cuda/include/
    cp -a "${archive}"/lib/* /usr/local/cuda/lib64/
    popd
    rm -rf tmp_cusparselt
}
30+
2031
function install_124 {
2132
CUDNN_VERSION=9.1.0.70
2233
echo "Installing CUDA 12.4.1 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.2"
@@ -83,13 +94,13 @@ function prune_124 {
8394
}
8495

8596
function install_126 {
86-
echo "Installing CUDA 12.6.2 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.2"
97+
echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
8798
rm -rf /usr/local/cuda-12.6 /usr/local/cuda
88-
# install CUDA 12.6.2 in the same container
89-
wget -q https://developer.download.nvidia.com/compute/cuda/12.6.2/local_installers/cuda_12.6.2_560.35.03_linux_sbsa.run
90-
chmod +x cuda_12.6.2_560.35.03_linux_sbsa.run
91-
./cuda_12.6.2_560.35.03_linux_sbsa.run --toolkit --silent
92-
rm -f cuda_12.6.2_560.35.03_linux_sbsa.run
99+
# install CUDA 12.6.3 in the same container
100+
wget -q https://developer.download.nvidia.com/compute/cuda/12.6.3/local_installers/cuda_12.6.3_560.35.05_linux_sbsa.run
101+
chmod +x cuda_12.6.3_560.35.05_linux_sbsa.run
102+
./cuda_12.6.3_560.35.05_linux_sbsa.run --toolkit --silent
103+
rm -f cuda_12.6.3_560.35.05_linux_sbsa.run
93104
rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.6 /usr/local/cuda
94105

95106
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
@@ -110,7 +121,7 @@ function install_126 {
110121
cd ..
111122
rm -rf nccl
112123

113-
install_cusparselt_062
124+
install_cusparselt_063
114125

115126
ldconfig
116127
}

.ci/docker/common/install_miopen.sh

Lines changed: 12 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ case "$ID" in
1616
ubuntu)
1717
IS_UBUNTU=1
1818
;;
19-
centos)
19+
centos|almalinux)
2020
IS_UBUNTU=0
2121
;;
2222
*)
@@ -43,12 +43,6 @@ else
4343
fi
4444
ROCM_INT=$(($ROCM_VERSION_MAJOR * 10000 + $ROCM_VERSION_MINOR * 100 + $ROCM_VERSION_PATCH))
4545

46-
# Install custom MIOpen + COMgr for ROCm >= 4.0.1
47-
if [[ $ROCM_INT -lt 40001 ]]; then
48-
echo "ROCm version < 4.0.1; will not install custom MIOpen"
49-
exit 0
50-
fi
51-
5246
# Function to retry functions that sometimes timeout or have flaky failures
5347
retry () {
5448
$* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
@@ -66,75 +60,35 @@ else
6660
ROCM_INSTALL_PATH="/opt/rocm-${ROCM_VERSION}"
6761
fi
6862

69-
# MIOPEN_USE_HIP_KERNELS is a Workaround for COMgr issues
7063
MIOPEN_CMAKE_COMMON_FLAGS="
7164
-DMIOPEN_USE_COMGR=ON
7265
-DMIOPEN_BUILD_DRIVER=OFF
7366
"
74-
# Pull MIOpen repo and set DMIOPEN_EMBED_DB based on ROCm version
75-
if [[ $ROCM_INT -ge 60300 ]]; then
76-
echo "ROCm 6.3+ MIOpen does not need any patches, do not build from source"
77-
exit 0
78-
elif [[ $ROCM_INT -ge 60204 ]] && [[ $ROCM_INT -lt 60300 ]]; then
79-
echo "ROCm 6.2.4+ MIOpen does not need any patches, do not build from source"
80-
exit 0
81-
elif [[ $ROCM_INT -ge 60200 ]] && [[ $ROCM_INT -lt 60204 ]]; then
67+
if [[ $ROCM_INT -ge 60200 ]] && [[ $ROCM_INT -lt 60204 ]]; then
8268
MIOPEN_BRANCH="release/rocm-rel-6.2-staging"
83-
elif [[ $ROCM_INT -ge 60100 ]] && [[ $ROCM_INT -lt 60200 ]]; then
84-
echo "ROCm 6.1 MIOpen does not need any patches, do not build from source"
85-
exit 0
86-
elif [[ $ROCM_INT -ge 60000 ]] && [[ $ROCM_INT -lt 60100 ]]; then
87-
echo "ROCm 6.0 MIOpen does not need any patches, do not build from source"
88-
exit 0
89-
elif [[ $ROCM_INT -ge 50700 ]] && [[ $ROCM_INT -lt 60000 ]]; then
90-
echo "ROCm 5.7 MIOpen does not need any patches, do not build from source"
91-
exit 0
92-
elif [[ $ROCM_INT -ge 50600 ]] && [[ $ROCM_INT -lt 50700 ]]; then
93-
MIOPEN_BRANCH="release/rocm-rel-5.6-staging"
94-
elif [[ $ROCM_INT -ge 50500 ]] && [[ $ROCM_INT -lt 50600 ]]; then
95-
MIOPEN_BRANCH="release/rocm-rel-5.5-gfx11"
96-
elif [[ $ROCM_INT -ge 50400 ]] && [[ $ROCM_INT -lt 50500 ]]; then
97-
MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36 -DMIOPEN_USE_MLIR=Off"
98-
MIOPEN_BRANCH="release/rocm-rel-5.4-staging"
99-
elif [[ $ROCM_INT -ge 50300 ]] && [[ $ROCM_INT -lt 50400 ]]; then
100-
MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36 -DMIOPEN_USE_MLIR=Off"
101-
MIOPEN_BRANCH="release/rocm-rel-5.3-staging"
102-
elif [[ $ROCM_INT -ge 50200 ]] && [[ $ROCM_INT -lt 50300 ]]; then
103-
MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36 -DMIOPEN_USE_MLIR=Off"
104-
MIOPEN_BRANCH="release/rocm-rel-5.2-staging"
105-
elif [[ $ROCM_INT -ge 50100 ]] && [[ $ROCM_INT -lt 50200 ]]; then
106-
MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36"
107-
MIOPEN_BRANCH="release/rocm-rel-5.1-staging"
108-
elif [[ $ROCM_INT -ge 50000 ]] && [[ $ROCM_INT -lt 50100 ]]; then
109-
MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36"
110-
MIOPEN_BRANCH="release/rocm-rel-5.0-staging"
11169
else
112-
echo "Unhandled ROCM_VERSION ${ROCM_VERSION}"
113-
exit 1
70+
echo "ROCm ${ROCM_VERSION} does not need any patches, do not build from source"
71+
exit 0
11472
fi
11573

11674

11775
if [[ ${IS_UBUNTU} == 1 ]]; then
11876
apt-get remove -y miopen-hip
11977
else
120-
yum remove -y miopen-hip
78+
# Workaround since almalinux manylinux image already has this and cget doesn't like that
79+
rm -rf /usr/local/lib/pkgconfig/sqlite3.pc
80+
81+
# Versioned package name needs regex match
82+
# Use --noautoremove to prevent other rocm packages from being uninstalled
83+
yum remove -y miopen-hip* --noautoremove
12184
fi
12285

12386
git clone https://github.com/ROCm/MIOpen -b ${MIOPEN_BRANCH}
12487
pushd MIOpen
12588
# remove .git to save disk space since CI runner was running out
12689
rm -rf .git
12790
# Don't build CK to save docker build time
128-
if [[ $ROCM_INT -ge 60200 ]]; then
129-
sed -i '/composable_kernel/d' requirements.txt
130-
fi
131-
# Don't build MLIR to save docker build time
132-
# since we are disabling MLIR backend for MIOpen anyway
133-
if [[ $ROCM_INT -ge 50400 ]] && [[ $ROCM_INT -lt 50500 ]]; then
134-
sed -i '/rocMLIR/d' requirements.txt
135-
elif [[ $ROCM_INT -ge 50200 ]] && [[ $ROCM_INT -lt 50400 ]]; then
136-
sed -i '/llvm-project-mlir/d' requirements.txt
137-
fi
91+
sed -i '/composable_kernel/d' requirements.txt
13892
## MIOpen minimum requirements
13993
cmake -P install_deps.cmake --minimum
14094

@@ -156,7 +110,7 @@ cd build
156110
PKG_CONFIG_PATH=/usr/local/lib/pkgconfig CXX=${ROCM_INSTALL_PATH}/llvm/bin/clang++ cmake .. \
157111
${MIOPEN_CMAKE_COMMON_FLAGS} \
158112
${MIOPEN_CMAKE_DB_FLAGS} \
159-
-DCMAKE_PREFIX_PATH="${ROCM_INSTALL_PATH}/hip;${ROCM_INSTALL_PATH}"
113+
-DCMAKE_PREFIX_PATH="${ROCM_INSTALL_PATH}"
160114
make MIOpen -j $(nproc)
161115

162116
# Build MIOpen package

.ci/docker/common/install_openblas.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
set -ex
55

66
cd /
7-
git clone https://github.com/OpenMathLib/OpenBLAS.git -b v0.3.25 --depth 1 --shallow-submodules
7+
git clone https://github.com/OpenMathLib/OpenBLAS.git -b v0.3.28 --depth 1 --shallow-submodules
88

99

1010
OPENBLAS_BUILD_FLAGS="

.ci/docker/common/install_rocm_drm.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ case "$ID" in
1212
apt-get install -y libpciaccess-dev pkg-config
1313
apt-get clean
1414
;;
15-
centos)
15+
centos|almalinux)
1616
yum install -y libpciaccess-devel pkgconfig
1717
;;
1818
*)

.ci/docker/common/install_rocm_magma.sh

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,18 @@
33

44
set -ex
55

6+
# Magma build scripts need `python`
7+
ln -sf /usr/bin/python3 /usr/bin/python
8+
9+
ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
10+
case "$ID" in
11+
almalinux)
12+
yum install -y gcc-gfortran
13+
;;
14+
*)
15+
echo "No preinstalls to build magma..."
16+
;;
17+
esac
618

719
MKLROOT=${MKLROOT:-/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION}
820

.ci/docker/manywheel/Dockerfile_2_28

Lines changed: 20 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
# syntax = docker/dockerfile:experimental
2-
ARG ROCM_VERSION=3.7
32
ARG BASE_CUDA_VERSION=11.8
43
ARG GPU_IMAGE=amd64/almalinux:8
54
FROM quay.io/pypa/manylinux_2_28_x86_64 as base
@@ -130,10 +129,10 @@ RUN for cpython_version in "cp312-cp312" "cp313-cp313" "cp313-cp313t"; do \
130129
done;
131130

132131

133-
# cmake-3.18.4 from pip
132+
# cmake-3.18.4 from pip; force in case cmake3 already exists
134133
RUN yum install -y python3-pip && \
135134
python3 -mpip install cmake==3.18.4 && \
136-
ln -s /usr/local/bin/cmake /usr/bin/cmake3
135+
ln -sf /usr/local/bin/cmake /usr/bin/cmake3
137136

138137
FROM cpu_final as cuda_final
139138
RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION}
@@ -142,17 +141,24 @@ COPY --from=magma /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BAS
142141
RUN ln -sf /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda
143142
ENV PATH=/usr/local/cuda/bin:$PATH
144143

145-
146-
FROM common as rocm_final
147-
ARG ROCM_VERSION=3.7
148-
# Install ROCm
149-
ADD ./common/install_rocm.sh install_rocm.sh
150-
RUN bash ./install_rocm.sh ${ROCM_VERSION} && rm install_rocm.sh
151-
# cmake is already installed inside the rocm base image, but both 2 and 3 exist
152-
# cmake3 is needed for the later MIOpen custom build, so that step is last.
153-
RUN yum install -y cmake3 && \
154-
rm -f /usr/bin/cmake && \
155-
ln -s /usr/bin/cmake3 /usr/bin/cmake
144+
FROM cpu_final as rocm_final
145+
ARG ROCM_VERSION=6.0
146+
ARG PYTORCH_ROCM_ARCH
147+
ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}
148+
ARG DEVTOOLSET_VERSION=11
149+
ENV LDFLAGS="-Wl,-rpath=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64 -Wl,-rpath=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib"
150+
# Somewhere in ROCm stack, we still use non-existing /opt/rocm/hip path,
151+
# below workaround helps avoid error
152+
ENV ROCM_PATH /opt/rocm
153+
# cmake-3.28.4 from pip to get enable_language(HIP)
154+
# and avoid 3.21.0 cmake+ninja issues with ninja inserting "-Wl,--no-as-needed" in LINK_FLAGS for static linker
155+
RUN python3 -m pip install --upgrade pip && \
156+
python3 -mpip install cmake==3.28.4
157+
ADD ./common/install_rocm_drm.sh install_rocm_drm.sh
158+
RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh
159+
ENV MKLROOT /opt/intel
160+
ADD ./common/install_rocm_magma.sh install_rocm_magma.sh
161+
RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh
156162
ADD ./common/install_miopen.sh install_miopen.sh
157163
RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh
158164

0 commit comments

Comments
 (0)