Skip to content

Commit 38f90ce

Browse files
author
Jiewen Tan
committed
Update on "[c10d] Make reduce as a custom op"
Summary: This patch makes reduce a custom op so that it can pass through the dispatcher. It is one part of the effort to route comm ops to the dispatcher so that tracing mechanisms that rely on the dispatcher, e.g., LazyTensor and AOTAutograd, can trace them. Test Plan: python test/distributed/test_c10d_nccl.py -k test_reduce_ops python test/distributed/test_c10d_gloo.py -k test_reduce_basics ...and other existing distributed tests. [ghstack-poisoned]
2 parents a61886c + 7d4269b commit 38f90ce

File tree

530 files changed

+18677
-8169
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

530 files changed

+18677
-8169
lines changed

.buckconfig.oss

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,11 @@
33

44
[buildfile]
55
name = BUCK.oss
6+
includes = //tools/build_defs/select.bzl
67

78
[repositories]
89
bazel_skylib = third_party/bazel-skylib/
10+
ovr_config = .
911

1012
[download]
1113
in_build = true

.circleci/docker/build.sh

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ elif [[ "$image" == *-bionic* ]]; then
5454
UBUNTU_VERSION=18.04
5555
elif [[ "$image" == *-focal* ]]; then
5656
UBUNTU_VERSION=20.04
57+
elif [[ "$image" == *-jammy* ]]; then
58+
UBUNTU_VERSION=22.04
5759
elif [[ "$image" == *ubuntu* ]]; then
5860
extract_version_from_image_name ubuntu UBUNTU_VERSION
5961
elif [[ "$image" == *centos* ]]; then
@@ -70,7 +72,8 @@ else
7072
fi
7173

7274
DOCKERFILE="${OS}/Dockerfile"
73-
if [[ "$image" == *cuda* ]]; then
75+
# When using Ubuntu 22.04, start from the Ubuntu docker image instead of the nvidia/cuda docker image.
76+
if [[ "$image" == *cuda* && "$UBUNTU_VERSION" != "22.04" ]]; then
7477
DOCKERFILE="${OS}-cuda/Dockerfile"
7578
elif [[ "$image" == *rocm* ]]; then
7679
DOCKERFILE="${OS}-rocm/Dockerfile"
@@ -173,6 +176,13 @@ case "$image" in
173176
DB=yes
174177
VISION=yes
175178
;;
179+
pytorch-linux-focal-py3-clang10-onnx)
180+
ANACONDA_PYTHON_VERSION=3.7
181+
CLANG_VERSION=10
182+
PROTOBUF=yes
183+
DB=yes
184+
VISION=yes
185+
;;
176186
pytorch-linux-xenial-py3-clang5-android-ndk-r19c)
177187
ANACONDA_PYTHON_VERSION=3.7
178188
CLANG_VERSION=5.0
@@ -249,6 +259,15 @@ case "$image" in
249259
VISION=yes
250260
KATEX=yes
251261
;;
262+
pytorch-linux-jammy-cuda11.6-cudnn8-py3.8-clang12)
263+
ANACONDA_PYTHON_VERSION=3.8
264+
CUDA_VERSION=11.6
265+
CUDNN_VERSION=8
266+
CLANG_VERSION=12
267+
PROTOBUF=yes
268+
DB=yes
269+
VISION=yes
270+
;;
252271
*)
253272
# Catch-all for builds that are not hardcoded.
254273
PROTOBUF=yes

.circleci/docker/common/install_base.sh

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,22 @@ install_ubuntu() {
1515
elif [[ "$UBUNTU_VERSION" == "20.04"* ]]; then
1616
cmake3="cmake=3.16*"
1717
maybe_libiomp_dev=""
18+
elif [[ "$UBUNTU_VERSION" == "22.04"* ]]; then
19+
cmake3="cmake=3.22*"
20+
maybe_libiomp_dev=""
1821
else
1922
cmake3="cmake=3.5*"
2023
maybe_libiomp_dev="libiomp-dev"
2124
fi
2225

26+
if [[ "$CLANG_VERSION" == 12 ]]; then
27+
maybe_libomp_dev="libomp-12-dev"
28+
elif [[ "$CLANG_VERSION" == 10 ]]; then
29+
maybe_libomp_dev="libomp-10-dev"
30+
else
31+
maybe_libomp_dev=""
32+
fi
33+
2334
# TODO: Remove this once nvidia package repos are back online
2435
# Comment out nvidia repositories to prevent them from getting apt-get updated, see https://github.com/pytorch/pytorch/issues/74968
2536
# shellcheck disable=SC2046
@@ -51,6 +62,7 @@ install_ubuntu() {
5162
libjpeg-dev \
5263
libasound2-dev \
5364
libsndfile-dev \
65+
${maybe_libomp_dev} \
5466
software-properties-common \
5567
wget \
5668
sudo \
@@ -60,6 +72,20 @@ install_ubuntu() {
6072
# see: https://github.com/pytorch/pytorch/issues/65931
6173
apt-get install -y libgnutls30
6274

75+
# cuda-toolkit does not work with gcc-11.2.0, which is the default in Ubuntu 22.04
76+
# see: https://github.com/NVlabs/instant-ngp/issues/119
77+
if [[ "$UBUNTU_VERSION" == "22.04"* ]]; then
78+
apt-get install -y g++-10
79+
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 30
80+
update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 30
81+
update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-10 30
82+
83+
# https://www.spinics.net/lists/libreoffice/msg07549.html
84+
sudo rm -rf /usr/lib/gcc/x86_64-linux-gnu/11
85+
wget https://github.com/gcc-mirror/gcc/commit/2b2d97fc545635a0f6aa9c9ee3b017394bc494bf.patch -O noexecpt.patch
86+
sudo patch /usr/include/c++/10/bits/range_access.h noexecpt.patch
87+
fi
88+
6389
# Cleanup package manager
6490
apt-get autoclean && apt-get clean
6591
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

.circleci/docker/common/install_cache.sh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@ set -ex
55
install_ubuntu() {
66
echo "Preparing to build sccache from source"
77
apt-get update
8-
apt-get install -y cargo pkg-config libssl-dev
8+
# libssl-dev will not work as it is upgraded to libssl3 in Ubuntu-22.04.
9+
# Instead use lib and headers from OpenSSL1.1 installed in `install_openssl.sh`
10+
apt-get install -y cargo
911
echo "Checking out sccache repo"
1012
git clone https://github.com/pytorch/sccache
1113
cd sccache

.circleci/docker/common/install_openssl.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,7 @@ cd "${OPENSSL}"
1010
./config --prefix=/opt/openssl -d '-Wl,--enable-new-dtags,-rpath,$(LIBRPATH)'
1111
# NOTE: openssl install errors out when built with the -j option
1212
make -j6; make install_sw
13+
# Link the ssl libraries to the /usr/lib folder.
14+
sudo ln -s /opt/openssl/lib/lib* /usr/lib
1315
cd ..
1416
rm -rf "${OPENSSL}"

.circleci/docker/ubuntu-cuda/Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ ENV INSTALLED_VISION ${VISION}
6565
ADD ./common/install_openssl.sh install_openssl.sh
6666
ENV OPENSSL_ROOT_DIR /opt/openssl
6767
RUN bash ./install_openssl.sh
68+
ENV OPENSSL_DIR /opt/openssl
6869

6970
# (optional) Install non-default CMake version
7071
ARG CMAKE_VERSION

.circleci/docker/ubuntu/Dockerfile

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,15 @@ ARG UBUNTU_VERSION
66

77
ENV DEBIAN_FRONTEND noninteractive
88

9+
ARG CLANG_VERSION
10+
911
# Install common dependencies (so that this step can be cached separately)
1012
ARG EC2
1113
ADD ./common/install_base.sh install_base.sh
1214
RUN bash ./install_base.sh && rm install_base.sh
1315

1416
# Install clang
1517
ARG LLVMDEV
16-
ARG CLANG_VERSION
1718
ADD ./common/install_clang.sh install_clang.sh
1819
RUN bash ./install_clang.sh && rm install_clang.sh
1920

@@ -50,6 +51,13 @@ RUN bash ./install_gcc.sh && rm install_gcc.sh
5051
ADD ./common/install_lcov.sh install_lcov.sh
5152
RUN bash ./install_lcov.sh && rm install_lcov.sh
5253

54+
# Install cuda and cudnn
55+
ARG CUDA_VERSION
56+
RUN wget -q https://raw.githubusercontent.com/pytorch/builder/main/common/install_cuda.sh -O install_cuda.sh
57+
RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh
58+
ENV DESIRED_CUDA ${CUDA_VERSION}
59+
ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH
60+
5361
# (optional) Install protobuf for ONNX
5462
ARG PROTOBUF
5563
ADD ./common/install_protobuf.sh install_protobuf.sh
@@ -111,6 +119,8 @@ RUN rm install_ninja.sh
111119
ADD ./common/install_openssl.sh install_openssl.sh
112120
RUN bash ./install_openssl.sh
113121
ENV OPENSSL_ROOT_DIR /opt/openssl
122+
ENV OPENSSL_DIR /opt/openssl
123+
RUN rm install_openssl.sh
114124

115125
# Install ccache/sccache (do this last, so we get priority in PATH)
116126
ADD ./common/install_cache.sh install_cache.sh
@@ -122,12 +132,22 @@ ADD ./common/install_jni.sh install_jni.sh
122132
ADD ./java/jni.h jni.h
123133
RUN bash ./install_jni.sh && rm install_jni.sh
124134

135+
# Install Open MPI for CUDA
136+
ADD ./common/install_openmpi.sh install_openmpi.sh
137+
RUN if [ -n "${CUDA_VERSION}" ]; then bash install_openmpi.sh; fi
138+
RUN rm install_openmpi.sh
139+
125140
# Include BUILD_ENVIRONMENT environment variable in image
126141
ARG BUILD_ENVIRONMENT
127142
ENV BUILD_ENVIRONMENT ${BUILD_ENVIRONMENT}
128143

129144
# Install LLVM dev version (Defined in the pytorch/builder github repository)
130145
COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm
131146

147+
# AWS specific CUDA build guidance
148+
ENV TORCH_CUDA_ARCH_LIST Maxwell
149+
ENV TORCH_NVCC_FLAGS "-Xfatbin -compress-all"
150+
ENV CUDA_PATH /usr/local/cuda
151+
132152
USER jenkins
133153
CMD ["bash"]

.circleci/scripts/binary_linux_test.sh

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -53,9 +53,7 @@ if [[ "\$python_nodot" = *39* ]]; then
5353
NUMPY_PIN=">=1.20"
5454
fi
5555
56-
if [[ "$DESIRED_CUDA" == "cu116" ]]; then
57-
EXTRA_CONDA_FLAGS="-c=conda-forge"
58-
fi
56+
5957
6058
# Move debug wheels out of the package dir so they don't get installed
6159
mkdir -p /tmp/debug_final_pkgs
@@ -88,13 +86,14 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then
8886
if [[ "$DESIRED_CUDA" == 'cpu' ]]; then
8987
retry conda install -c pytorch -y cpuonly
9088
else
91-
# DESIRED_CUDA is in format cu90 or cu102
92-
if [[ "${#DESIRED_CUDA}" == 4 ]]; then
93-
cu_ver="${DESIRED_CUDA:2:1}.${DESIRED_CUDA:3}"
94-
else
95-
cu_ver="${DESIRED_CUDA:2:2}.${DESIRED_CUDA:4}"
89+
90+
cu_ver="${DESIRED_CUDA:2:2}.${DESIRED_CUDA:4}"
91+
CUDA_PACKAGE="cudatoolkit"
92+
if [[ "$DESIRED_CUDA" == "cu116" ]]; then
93+
CUDA_PACKAGE="cuda"
9694
fi
97-
retry conda install \${EXTRA_CONDA_FLAGS} -yq -c nvidia -c pytorch "cudatoolkit=\${cu_ver}"
95+
96+
retry conda install \${EXTRA_CONDA_FLAGS} -yq -c nvidia -c pytorch "\${CUDA_PACKAGE}=\${cu_ver}"
9897
fi
9998
conda install \${EXTRA_CONDA_FLAGS} -y "\$pkg" --offline
10099
)

.git-blame-ignore-revs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,13 @@ cc11aaaa60aadf28e3ec278bce26a42c1cd68a4f
1818
e3900d2ba5c9f91a24a9ce34520794c8366d5c54
1919
# 2021-04-21 Removed all unqualified `type: ignore`
2020
75024e228ca441290b6a1c2e564300ad507d7af6
21+
# 2021-04-30 [PyTorch] Autoformat c10
22+
44cc873fba5e5ffc4d4d4eef3bd370b653ce1ce1
2123
# 2021-05-14 Removed all versionless Python shebangs
2224
2e26976ad3b06ce95dd6afccfdbe124802edf28f
2325
# 2021-06-07 Strictly typed everything in `.github` and `tools`
2426
737d920b21db9b4292d056ee1329945990656304
27+
# 2022-06-09 Apply clang-format to ATen headers
28+
95b15c266baaf989ef7b6bbd7c23a2d90bacf687
29+
# 2022-06-11 [lint] autoformat test/cpp and torch/csrc
30+
30fb2c4abaaaa966999eab11674f25b18460e609

.github/actions/teardown-rocm/action.yml

Lines changed: 0 additions & 25 deletions
This file was deleted.

0 commit comments

Comments
 (0)