Skip to content

Commit 5b06e18

Browse files
committed
Update on "[NCCL] Add more details for checkForNCCLErrors"
#45950 enhanced our NCCL error logging so that we add some basic debug information about what went wrong when erroring out with an NCCL error. However, that PR only used the added function for `C10D_NCCL_CHECK`, which is used to check the return values of NCCL calls. In ProcessGroupNCCL we also have `checkForNCCLErrors`, which checks for errors on NCCL communicators, and in case of errors it would be good to have this logging there too. Also renames the function: s/errorMessage/getNcclErrorDetailStr. Differential Revision: [D27100497](https://our.internmc.facebook.com/intern/diff/D27100497/) [ghstack-poisoned]
2 parents 0a69b2c + d67a41a commit 5b06e18

File tree

205 files changed

+4655
-1647
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

205 files changed

+4655
-1647
lines changed

.circleci/cimodel/data/simple/docker_definitions.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,6 @@
1010
"pytorch-linux-bionic-py3.6-clang9",
1111
"pytorch-linux-bionic-cuda10.2-cudnn7-py3.6-clang9",
1212
"pytorch-linux-bionic-py3.8-gcc9",
13-
"pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4",
14-
"pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7",
1513
"pytorch-linux-xenial-cuda10-cudnn7-py3-gcc7",
1614
"pytorch-linux-xenial-cuda10.1-cudnn7-py3-gcc7",
1715
"pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7",

.circleci/config.yml

Lines changed: 7 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -472,7 +472,7 @@ jobs:
472472
473473
docker cp /home/circleci/project/. $id:/var/lib/jenkins/workspace
474474
475-
export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && .jenkins/pytorch/build.sh && find ${BUILD_ROOT} -type f -name "*.a" -or -name "*.o" -delete") | docker exec -u jenkins -i "$id" bash) 2>&1'
475+
export COMMAND='((echo "sudo chown -R jenkins workspace && export CIRCLE_JOB="$CIRCLE_JOB" && cd workspace && .jenkins/pytorch/build.sh && find ${BUILD_ROOT} -type f -name "*.a" -or -name "*.o" -delete") | docker exec -u jenkins -i "$id" bash) 2>&1'
476476
477477
echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
478478
@@ -603,6 +603,7 @@ jobs:
603603
# =================== The following code will be executed inside Docker container ===================
604604
set -ex
605605
export SCRIBE_GRAPHQL_ACCESS_TOKEN="${SCRIBE_GRAPHQL_ACCESS_TOKEN}"
606+
export CIRCLE_JOB="$CIRCLE_JOB"
606607
${PARALLEL_FLAGS}
607608
cd workspace
608609
EOL
@@ -696,6 +697,11 @@ jobs:
696697
executor: <<parameters.executor>>
697698
steps:
698699
- checkout
700+
- run:
701+
name: _HACK_ Install CUDA compatible cmath
702+
no_output_timeout: 1m
703+
command: |
704+
powershell .circleci/scripts/vs_install_cmath.ps1
699705
- run:
700706
name: Install Cuda
701707
no_output_timeout: 30m
@@ -6729,12 +6735,6 @@ workflows:
67296735
- docker_build_job:
67306736
name: "docker-pytorch-linux-bionic-py3.8-gcc9"
67316737
image_name: "pytorch-linux-bionic-py3.8-gcc9"
6732-
- docker_build_job:
6733-
name: "docker-pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4"
6734-
image_name: "pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4"
6735-
- docker_build_job:
6736-
name: "docker-pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7"
6737-
image_name: "pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7"
67386738
- docker_build_job:
67396739
name: "docker-pytorch-linux-xenial-cuda10-cudnn7-py3-gcc7"
67406740
image_name: "pytorch-linux-xenial-cuda10-cudnn7-py3-gcc7"
@@ -9143,32 +9143,6 @@ workflows:
91439143
vc_product: Community
91449144
vc_version: ""
91459145
vc_year: "2019"
9146-
- docker_build_job:
9147-
name: "docker-pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4"
9148-
image_name: "pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4"
9149-
- docker_build_job:
9150-
name: "docker-pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7"
9151-
image_name: "pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7"
9152-
- pytorch_linux_build:
9153-
name: pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_build
9154-
requires:
9155-
- "docker-pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7"
9156-
build_environment: "pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7-build"
9157-
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7"
9158-
- pytorch_linux_test:
9159-
name: pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_test
9160-
requires:
9161-
- pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_build
9162-
build_environment: "pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7-test"
9163-
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7"
9164-
use_cuda_docker_runtime: "1"
9165-
resource_class: gpu.medium
9166-
- pytorch_linux_build:
9167-
name: pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc5_4_build
9168-
requires:
9169-
- "docker-pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4"
9170-
build_environment: "pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4-build"
9171-
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4"
91729146

91739147
# The following allows these jobs to run on ci-all and release branches
91749148
debuggable-scheduled-ci:
@@ -9266,57 +9240,6 @@ workflows:
92669240
only:
92679241
- /ci-all\/.*/
92689242
- /release\/.*/
9269-
- docker_build_job:
9270-
name: "docker-pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4"
9271-
image_name: "pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4"
9272-
filters:
9273-
branches:
9274-
only:
9275-
- /ci-all\/.*/
9276-
- /release\/.*/
9277-
- docker_build_job:
9278-
name: "docker-pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7"
9279-
image_name: "pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7"
9280-
filters:
9281-
branches:
9282-
only:
9283-
- /ci-all\/.*/
9284-
- /release\/.*/
9285-
- pytorch_linux_build:
9286-
name: pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_build
9287-
requires:
9288-
- "docker-pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7"
9289-
build_environment: "pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7-build"
9290-
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7"
9291-
filters:
9292-
branches:
9293-
only:
9294-
- /ci-all\/.*/
9295-
- /release\/.*/
9296-
- pytorch_linux_test:
9297-
name: pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_test
9298-
requires:
9299-
- pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_build
9300-
build_environment: "pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7-test"
9301-
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7"
9302-
use_cuda_docker_runtime: "1"
9303-
resource_class: gpu.medium
9304-
filters:
9305-
branches:
9306-
only:
9307-
- /ci-all\/.*/
9308-
- /release\/.*/
9309-
- pytorch_linux_build:
9310-
name: pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc5_4_build
9311-
requires:
9312-
- "docker-pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4"
9313-
build_environment: "pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4-build"
9314-
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4"
9315-
filters:
9316-
branches:
9317-
only:
9318-
- /ci-all\/.*/
9319-
- /release\/.*/
93209243
ecr_gc:
93219244
triggers:
93229245
- schedule:

.circleci/docker/build.sh

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -101,24 +101,6 @@ case "$image" in
101101
DB=yes
102102
VISION=yes
103103
;;
104-
pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4)
105-
CUDA_VERSION=9.2
106-
CUDNN_VERSION=7
107-
ANACONDA_PYTHON_VERSION=3.6
108-
GCC_VERSION=5
109-
PROTOBUF=yes
110-
DB=yes
111-
VISION=yes
112-
;;
113-
pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7)
114-
CUDA_VERSION=9.2
115-
CUDNN_VERSION=7
116-
ANACONDA_PYTHON_VERSION=3.6
117-
GCC_VERSION=7
118-
PROTOBUF=yes
119-
DB=yes
120-
VISION=yes
121-
;;
122104
pytorch-linux-xenial-cuda10-cudnn7-py3-gcc7)
123105
CUDA_VERSION=10.0
124106
CUDNN_VERSION=7

.circleci/docker/common/install_conda.sh

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -80,9 +80,8 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
8080
else
8181
conda_install numpy=1.18.5 pyyaml mkl mkl-include setuptools cffi future six dataclasses typing_extensions
8282
fi
83-
if [[ "$CUDA_VERSION" == 9.2* ]]; then
84-
conda_install magma-cuda92 -c pytorch
85-
elif [[ "$CUDA_VERSION" == 10.0* ]]; then
83+
84+
if [[ "$CUDA_VERSION" == 10.0* ]]; then
8685
conda_install magma-cuda100 -c pytorch
8786
elif [[ "$CUDA_VERSION" == 10.1* ]]; then
8887
conda_install magma-cuda101 -c pytorch

.circleci/regenerate.sh

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,17 @@
1-
#!/bin/bash -xe
1+
#!/bin/bash -e
22

33
# Allows this script to be invoked from any directory:
44
cd "$(dirname "$0")"
55

6-
OLD_FILE=$(mktemp)
7-
cp config.yml "$OLD_FILE"
6+
UNCOMMIT_CHANGE=$(git status -s | grep -c " config.yml")
7+
if [[ $UNCOMMIT_CHANGE != 0 ]]; then
8+
OLD_FILE=$(mktemp)
9+
cp config.yml "$OLD_FILE"
10+
echo "Uncommitted change detected in .circleci/config.yml"
11+
echo "It has been backed up to $OLD_FILE"
12+
fi
13+
814
NEW_FILE=$(mktemp)
915
./generate_config_yml.py > "$NEW_FILE"
1016
cp "$NEW_FILE" config.yml
17+
echo "New config generated in .circleci/config.yml"
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
$CMATH_DOWNLOAD_LINK = "https://raw.githubusercontent.com/microsoft/STL/12c684bba78f9b032050526abdebf14f58ca26a3/stl/inc/cmath"
2+
$VC14_28_INSTALL_PATH="C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.28.29910\include"
3+
4+
curl.exe --retry 3 -kL $CMATH_DOWNLOAD_LINK --output "$home\cmath"
5+
Move-Item -Path "$home\cmath" -Destination "$VC14_28_INSTALL_PATH" -Force

.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ jobs:
3434
3535
docker cp /home/circleci/project/. $id:/var/lib/jenkins/workspace
3636
37-
export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && .jenkins/pytorch/build.sh && find ${BUILD_ROOT} -type f -name "*.a" -or -name "*.o" -delete") | docker exec -u jenkins -i "$id" bash) 2>&1'
37+
export COMMAND='((echo "sudo chown -R jenkins workspace && export CIRCLE_JOB="$CIRCLE_JOB" && cd workspace && .jenkins/pytorch/build.sh && find ${BUILD_ROOT} -type f -name "*.a" -or -name "*.o" -delete") | docker exec -u jenkins -i "$id" bash) 2>&1'
3838
3939
echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
4040
@@ -165,6 +165,7 @@ jobs:
165165
# =================== The following code will be executed inside Docker container ===================
166166
set -ex
167167
export SCRIBE_GRAPHQL_ACCESS_TOKEN="${SCRIBE_GRAPHQL_ACCESS_TOKEN}"
168+
export CIRCLE_JOB="$CIRCLE_JOB"
168169
${PARALLEL_FLAGS}
169170
cd workspace
170171
EOL
@@ -258,6 +259,11 @@ jobs:
258259
executor: <<parameters.executor>>
259260
steps:
260261
- checkout
262+
- run:
263+
name: _HACK_ Install CUDA compatible cmath
264+
no_output_timeout: 1m
265+
command: |
266+
powershell .circleci/scripts/vs_install_cmath.ps1
261267
- run:
262268
name: Install Cuda
263269
no_output_timeout: 30m

.circleci/verbatim-sources/workflows/workflows-scheduled-ci.yml

Lines changed: 0 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -66,32 +66,6 @@
6666
vc_product: Community
6767
vc_version: ""
6868
vc_year: "2019"
69-
- docker_build_job:
70-
name: "docker-pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4"
71-
image_name: "pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4"
72-
- docker_build_job:
73-
name: "docker-pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7"
74-
image_name: "pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7"
75-
- pytorch_linux_build:
76-
name: pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_build
77-
requires:
78-
- "docker-pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7"
79-
build_environment: "pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7-build"
80-
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7"
81-
- pytorch_linux_test:
82-
name: pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_test
83-
requires:
84-
- pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_build
85-
build_environment: "pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7-test"
86-
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7"
87-
use_cuda_docker_runtime: "1"
88-
resource_class: gpu.medium
89-
- pytorch_linux_build:
90-
name: pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc5_4_build
91-
requires:
92-
- "docker-pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4"
93-
build_environment: "pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4-build"
94-
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4"
9569

9670
# The following allows these jobs to run on ci-all and release branches
9771
debuggable-scheduled-ci:
@@ -189,54 +163,3 @@
189163
only:
190164
- /ci-all\/.*/
191165
- /release\/.*/
192-
- docker_build_job:
193-
name: "docker-pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4"
194-
image_name: "pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4"
195-
filters:
196-
branches:
197-
only:
198-
- /ci-all\/.*/
199-
- /release\/.*/
200-
- docker_build_job:
201-
name: "docker-pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7"
202-
image_name: "pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7"
203-
filters:
204-
branches:
205-
only:
206-
- /ci-all\/.*/
207-
- /release\/.*/
208-
- pytorch_linux_build:
209-
name: pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_build
210-
requires:
211-
- "docker-pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7"
212-
build_environment: "pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7-build"
213-
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7"
214-
filters:
215-
branches:
216-
only:
217-
- /ci-all\/.*/
218-
- /release\/.*/
219-
- pytorch_linux_test:
220-
name: pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_test
221-
requires:
222-
- pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_build
223-
build_environment: "pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7-test"
224-
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7"
225-
use_cuda_docker_runtime: "1"
226-
resource_class: gpu.medium
227-
filters:
228-
branches:
229-
only:
230-
- /ci-all\/.*/
231-
- /release\/.*/
232-
- pytorch_linux_build:
233-
name: pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc5_4_build
234-
requires:
235-
- "docker-pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4"
236-
build_environment: "pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4-build"
237-
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4"
238-
filters:
239-
branches:
240-
only:
241-
- /ci-all\/.*/
242-
- /release\/.*/

.gitmodules

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@
5454
ignore = dirty
5555
path = third_party/zstd
5656
url = https://github.com/facebook/zstd.git
57-
[submodule "third-party/cpuinfo"]
57+
[submodule "third_party/cpuinfo"]
5858
ignore = dirty
5959
path = third_party/cpuinfo
6060
url = https://github.com/pytorch/cpuinfo.git
@@ -131,5 +131,5 @@
131131
path = third_party/tensorpipe
132132
url = https://github.com/pytorch/tensorpipe.git
133133
[submodule "third_party/kineto"]
134-
path = third_party/kineto
135-
url = https://github.com/pytorch/kineto
134+
path = third_party/kineto
135+
url = https://github.com/pytorch/kineto

.jenkins/pytorch/build-asan.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,6 @@ CC="clang" CXX="clang++" LDSHARED="clang --shared" \
3737
USE_ASAN=1 USE_CUDA=0 USE_MKLDNN=0 \
3838
python setup.py install
3939

40-
4140
# Test building via the sdist source tarball
4241
python setup.py sdist
4342
mkdir -p /tmp/tmp

0 commit comments

Comments
 (0)