Skip to content

Commit 0f6c757

Browse files
authored
Merge branch 'pytorch:main' into faster-random-batch
2 parents 0fcc301 + 56039b5 commit 0f6c757

File tree

2,698 files changed

+148264
-77226
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

2,698 files changed

+148264
-77226
lines changed

.ci/aarch64_linux/aarch64_ci_build.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ cd /
2020
# on the mounted pytorch repo
2121
git config --global --add safe.directory /pytorch
2222
pip install -r /pytorch/requirements.txt
23-
pip install auditwheel
23+
pip install auditwheel==6.2.0
2424
if [ "$DESIRED_CUDA" = "cpu" ]; then
2525
echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
2626
# USE_PRIORITIZED_TEXT_FOR_LD enables linker script optimization, see https://github.com/pytorch/pytorch/pull/121975/files

.ci/aarch64_linux/aarch64_wheel_ci_build.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,10 +99,14 @@ def update_wheel(wheel_path, desired_cuda) -> None:
9999
if "126" in desired_cuda:
100100
libs_to_copy += [
101101
"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.6",
102+
"/usr/local/cuda/lib64/libcufile.so.0",
103+
"/usr/local/cuda/lib64/libcufile_rdma.so.1",
102104
]
103105
elif "128" in desired_cuda:
104106
libs_to_copy += [
105107
"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.8",
108+
"/usr/local/cuda/lib64/libcufile.so.0",
109+
"/usr/local/cuda/lib64/libcufile_rdma.so.1",
106110
]
107111
else:
108112
libs_to_copy += [
@@ -132,6 +136,9 @@ def complete_wheel(folder: str) -> str:
132136
"""
133137
wheel_name = list_dir(f"/{folder}/dist")[0]
134138

139+
# Note: for CUDA we don't run auditwheel, since we use a custom script to package
140+
# the CUDA dependencies into the wheel file via the update_wheel() method.
141+
# However, we still need to make sure the filename reflects the correct manylinux platform.
135142
if "pytorch" in folder and not enable_cuda:
136143
print("Repairing Wheel with AuditWheel")
137144
check_call(["auditwheel", "repair", f"dist/{wheel_name}"], cwd=folder)
@@ -143,7 +150,14 @@ def complete_wheel(folder: str) -> str:
143150
f"/{folder}/dist/{repaired_wheel_name}",
144151
)
145152
else:
146-
repaired_wheel_name = wheel_name
153+
repaired_wheel_name = wheel_name.replace(
154+
"linux_aarch64", "manylinux_2_28_aarch64"
155+
)
156+
print(f"Renaming {wheel_name} wheel to {repaired_wheel_name}")
157+
os.rename(
158+
f"/{folder}/dist/{wheel_name}",
159+
f"/{folder}/dist/{repaired_wheel_name}",
160+
)
147161

148162
print(f"Copying {repaired_wheel_name} to artifacts")
149163
shutil.copy2(
@@ -204,7 +218,7 @@ def parse_arguments():
204218
else:
205219
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1 "
206220
elif branch.startswith(("v1.", "v2.")):
207-
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1:branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "
221+
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "
208222

209223
if enable_mkldnn:
210224
build_ArmComputeLibrary()

.ci/aarch64_linux/build_aarch64_wheel.py

Lines changed: 2 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,11 @@
1919

2020
# AMI images for us-east-1, change the following based on your ~/.aws/config
2121
os_amis = {
22-
"ubuntu18_04": "ami-078eece1d8119409f", # login_name: ubuntu
2322
"ubuntu20_04": "ami-052eac90edaa9d08f", # login_name: ubuntu
2423
"ubuntu22_04": "ami-0c6c29c5125214c77", # login_name: ubuntu
2524
"redhat8": "ami-0698b90665a2ddcf1", # login_name: ec2-user
2625
}
2726

28-
ubuntu18_04_ami = os_amis["ubuntu18_04"]
2927
ubuntu20_04_ami = os_amis["ubuntu20_04"]
3028

3129

@@ -659,18 +657,6 @@ def configure_system(
659657
"sudo apt-get install -y python3-dev python3-yaml python3-setuptools python3-wheel python3-pip"
660658
)
661659
host.run_cmd("pip3 install dataclasses typing-extensions")
662-
# Install and switch to gcc-8 on Ubuntu-18.04
663-
if not host.using_docker() and host.ami == ubuntu18_04_ami and compiler == "gcc-8":
664-
host.run_cmd("sudo apt-get install -y g++-8 gfortran-8")
665-
host.run_cmd(
666-
"sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-8 100"
667-
)
668-
host.run_cmd(
669-
"sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-8 100"
670-
)
671-
host.run_cmd(
672-
"sudo update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-8 100"
673-
)
674660
if not use_conda:
675661
print("Installing Cython + numpy from PyPy")
676662
host.run_cmd("sudo pip3 install Cython")
@@ -761,7 +747,7 @@ def start_build(
761747
version = host.check_output("cat pytorch/version.txt").strip()[:-2]
762748
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1"
763749
if branch.startswith(("v1.", "v2.")):
764-
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1:branch.find('-')]} PYTORCH_BUILD_NUMBER=1"
750+
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1"
765751
if host.using_docker():
766752
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
767753
if enable_mkldnn:
@@ -1026,7 +1012,7 @@ def parse_arguments():
10261012
install_condaforge_python(host, args.python_version)
10271013
sys.exit(0)
10281014

1029-
python_version = args.python_version if args.python_version is not None else "3.8"
1015+
python_version = args.python_version if args.python_version is not None else "3.9"
10301016

10311017
if args.use_torch_from_pypi:
10321018
configure_system(host, compiler=args.compiler, python_version=python_version)

.ci/caffe2/test.sh

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,6 @@ if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then
1313
echo 'Skipping tests'
1414
exit 0
1515
fi
16-
if [[ "${BUILD_ENVIRONMENT}" == *-rocm* ]]; then
17-
# temporary to locate some kernel issues on the CI nodes
18-
export HSAKMT_DEBUG_LEVEL=4
19-
fi
2016
# These additional packages are needed for circleci ROCm builds.
2117
if [[ $BUILD_ENVIRONMENT == *rocm* ]]; then
2218
# Need networkx 2.0 because bellman_ford was moved in 2.1. Scikit-image by

.ci/docker/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,5 +34,5 @@ See `build.sh` for valid build environments (it's the giant switch).
3434
./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest
3535

3636
# Set flags (see build.sh) and build image
37-
sudo bash -c 'PROTOBUF=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest
37+
sudo bash -c 'TRITON=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest'
3838
```

.ci/docker/almalinux/Dockerfile

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
ARG CUDA_VERSION=12.4
22
ARG BASE_TARGET=cuda${CUDA_VERSION}
3+
ARG ROCM_IMAGE=rocm/dev-almalinux-8:6.3-complete
34
FROM amd64/almalinux:8 as base
45

56
ENV LC_ALL en_US.UTF-8
@@ -8,10 +9,6 @@ ENV LANGUAGE en_US.UTF-8
89

910
ARG DEVTOOLSET_VERSION=11
1011

11-
ENV LC_ALL en_US.UTF-8
12-
ENV LANG en_US.UTF-8
13-
ENV LANGUAGE en_US.UTF-8
14-
1512
RUN yum -y update
1613
RUN yum -y install epel-release
1714
RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel openssl-devel yum-utils autoconf automake make gcc-toolset-${DEVTOOLSET_VERSION}-toolchain
@@ -41,9 +38,12 @@ RUN bash ./install_conda.sh && rm install_conda.sh
4138

4239
# Install CUDA
4340
FROM base as cuda
44-
ARG CUDA_VERSION=12.4
41+
ARG CUDA_VERSION=12.6
4542
RUN rm -rf /usr/local/cuda-*
4643
ADD ./common/install_cuda.sh install_cuda.sh
44+
COPY ./common/install_nccl.sh install_nccl.sh
45+
COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
46+
COPY ./common/install_cusparselt.sh install_cusparselt.sh
4747
ENV CUDA_HOME=/usr/local/cuda-${CUDA_VERSION}
4848
# Preserve CUDA_VERSION for the builds
4949
ENV CUDA_VERSION=${CUDA_VERSION}
@@ -54,28 +54,29 @@ FROM cuda as cuda11.8
5454
RUN bash ./install_cuda.sh 11.8
5555
ENV DESIRED_CUDA=11.8
5656

57-
FROM cuda as cuda12.1
58-
RUN bash ./install_cuda.sh 12.1
59-
ENV DESIRED_CUDA=12.1
60-
61-
FROM cuda as cuda12.4
62-
RUN bash ./install_cuda.sh 12.4
63-
ENV DESIRED_CUDA=12.4
64-
6557
FROM cuda as cuda12.6
6658
RUN bash ./install_cuda.sh 12.6
6759
ENV DESIRED_CUDA=12.6
6860

61+
FROM cuda as cuda12.8
62+
RUN bash ./install_cuda.sh 12.8
63+
ENV DESIRED_CUDA=12.8
64+
65+
FROM ${ROCM_IMAGE} as rocm
66+
ENV PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
67+
ADD ./common/install_mkl.sh install_mkl.sh
68+
RUN bash ./install_mkl.sh && rm install_mkl.sh
69+
ENV MKLROOT /opt/intel
70+
6971
# Install MNIST test data
7072
FROM base as mnist
7173
ADD ./common/install_mnist.sh install_mnist.sh
7274
RUN bash ./install_mnist.sh
7375

7476
FROM base as all_cuda
7577
COPY --from=cuda11.8 /usr/local/cuda-11.8 /usr/local/cuda-11.8
76-
COPY --from=cuda12.1 /usr/local/cuda-12.1 /usr/local/cuda-12.1
77-
COPY --from=cuda12.4 /usr/local/cuda-12.4 /usr/local/cuda-12.4
7878
COPY --from=cuda12.6 /usr/local/cuda-12.6 /usr/local/cuda-12.6
79+
COPY --from=cuda12.8 /usr/local/cuda-12.8 /usr/local/cuda-12.8
7980

8081
# Final step
8182
FROM ${BASE_TARGET} as final

.ci/docker/almalinux/build.sh

Lines changed: 44 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,82 +1,70 @@
11
#!/usr/bin/env bash
22
# Script used only in CD pipeline
33

4-
set -eou pipefail
4+
set -exou pipefail
55

66
image="$1"
77
shift
88

99
if [ -z "${image}" ]; then
10-
echo "Usage: $0 IMAGE"
10+
echo "Usage: $0 IMAGENAME:ARCHTAG"
1111
exit 1
1212
fi
1313

14-
DOCKER_IMAGE_NAME="pytorch/${image}"
14+
# Go from imagename:tag to tag
15+
DOCKER_TAG_PREFIX=$(echo "${image}" | awk -F':' '{print $2}')
1516

17+
CUDA_VERSION=""
18+
ROCM_VERSION=""
19+
EXTRA_BUILD_ARGS=""
20+
if [[ "${DOCKER_TAG_PREFIX}" == cuda* ]]; then
21+
# extract cuda version from image name and tag. e.g. manylinux2_28-builder:cuda12.8 returns 12.8
22+
CUDA_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'cuda' '{print $2}')
23+
EXTRA_BUILD_ARGS="--build-arg CUDA_VERSION=${CUDA_VERSION}"
24+
elif [[ "${DOCKER_TAG_PREFIX}" == rocm* ]]; then
25+
# extract rocm version from image name and tag. e.g. manylinux2_28-builder:rocm6.2.4 returns 6.2.4
26+
ROCM_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'rocm' '{print $2}')
27+
EXTRA_BUILD_ARGS="--build-arg ROCM_IMAGE=rocm/dev-almalinux-8:${ROCM_VERSION}-complete"
28+
fi
1629

17-
export DOCKER_BUILDKIT=1
18-
TOPDIR=$(git rev-parse --show-toplevel)
19-
20-
CUDA_VERSION=${CUDA_VERSION:-12.1}
21-
22-
case ${CUDA_VERSION} in
30+
case ${DOCKER_TAG_PREFIX} in
2331
cpu)
2432
BASE_TARGET=base
25-
DOCKER_TAG=cpu
2633
;;
27-
all)
28-
BASE_TARGET=all_cuda
29-
DOCKER_TAG=latest
34+
cuda*)
35+
BASE_TARGET=cuda${CUDA_VERSION}
36+
;;
37+
rocm*)
38+
BASE_TARGET=rocm
3039
;;
3140
*)
32-
BASE_TARGET=cuda${CUDA_VERSION}
33-
DOCKER_TAG=cuda${CUDA_VERSION}
41+
echo "ERROR: Unknown docker tag ${DOCKER_TAG_PREFIX}"
42+
exit 1
3443
;;
3544
esac
3645

46+
# TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
47+
# is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
48+
sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
49+
sudo systemctl daemon-reload
50+
sudo systemctl restart docker
3751

38-
(
39-
set -x
40-
# TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
41-
# is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
42-
sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
43-
sudo systemctl daemon-reload
44-
sudo systemctl restart docker
52+
export DOCKER_BUILDKIT=1
53+
TOPDIR=$(git rev-parse --show-toplevel)
54+
tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]')
4555

46-
docker build \
47-
--target final \
48-
--progress plain \
49-
--build-arg "BASE_TARGET=${BASE_TARGET}" \
50-
--build-arg "CUDA_VERSION=${CUDA_VERSION}" \
51-
--build-arg "DEVTOOLSET_VERSION=11" \
52-
-t ${DOCKER_IMAGE_NAME} \
53-
$@ \
54-
-f "${TOPDIR}/.ci/docker/almalinux/Dockerfile" \
55-
${TOPDIR}/.ci/docker/
56-
)
56+
docker build \
57+
--target final \
58+
--progress plain \
59+
--build-arg "BASE_TARGET=${BASE_TARGET}" \
60+
--build-arg "DEVTOOLSET_VERSION=11" \
61+
${EXTRA_BUILD_ARGS} \
62+
-t ${tmp_tag} \
63+
$@ \
64+
-f "${TOPDIR}/.ci/docker/almalinux/Dockerfile" \
65+
${TOPDIR}/.ci/docker/
5766

58-
if [[ "${DOCKER_TAG}" =~ ^cuda* ]]; then
67+
if [ -n "${CUDA_VERSION}" ]; then
5968
# Test that we're using the right CUDA compiler
60-
(
61-
set -x
62-
docker run --rm "${DOCKER_IMAGE_NAME}" nvcc --version | grep "cuda_${CUDA_VERSION}"
63-
)
64-
fi
65-
66-
GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)}
67-
GIT_BRANCH_NAME=${GITHUB_REF##*/}
68-
GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)}
69-
DOCKER_IMAGE_BRANCH_TAG=${DOCKER_IMAGE_NAME}-${GIT_BRANCH_NAME}
70-
DOCKER_IMAGE_SHA_TAG=${DOCKER_IMAGE_NAME}-${GIT_COMMIT_SHA}
71-
if [[ "${WITH_PUSH:-}" == true ]]; then
72-
(
73-
set -x
74-
docker push "${DOCKER_IMAGE_NAME}"
75-
if [[ -n ${GITHUB_REF} ]]; then
76-
docker tag ${DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_BRANCH_TAG}
77-
docker tag ${DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_SHA_TAG}
78-
docker push "${DOCKER_IMAGE_BRANCH_TAG}"
79-
docker push "${DOCKER_IMAGE_SHA_TAG}"
80-
fi
81-
)
69+
docker run --rm "${tmp_tag}" nvcc --version | grep "cuda_${CUDA_VERSION}"
8270
fi

0 commit comments

Comments
 (0)