Skip to content

Commit 4be684d

Browse files
author
Andrew Gu
committed
Update on "[FSDP2] Del'd unsharded param at end of backward"
cc XilunWu H-Huang kwen2501 wanchaol fegin fduwjj wz337 wconstab d4l3k c-p-i-o [ghstack-poisoned]
2 parents eb6f9c2 + 37700c2 commit 4be684d

File tree

355 files changed

+10085
-12270
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

355 files changed

+10085
-12270
lines changed

.ci/docker/build.sh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -355,6 +355,12 @@ case "$image" in
355355
CONDA_CMAKE=yes
356356
VISION=yes
357357
;;
358+
pytorch-linux-jammy-py3-clang18-asan)
359+
ANACONDA_PYTHON_VERSION=3.10
360+
CLANG_VERSION=18
361+
CONDA_CMAKE=yes
362+
VISION=yes
363+
;;
358364
pytorch-linux-jammy-py3.9-gcc11)
359365
ANACONDA_PYTHON_VERSION=3.9
360366
GCC_VERSION=11

.ci/docker/common/install_clang.sh

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,17 @@ if [ -n "$CLANG_VERSION" ]; then
1313
elif [[ $UBUNTU_VERSION == 22.04 ]]; then
1414
# work around ubuntu apt-get conflicts
1515
sudo apt-get -y -f install
16+
wget --no-check-certificate -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
17+
if [[ $CLANG_VERSION == 18 ]]; then
18+
apt-add-repository "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-18 main"
19+
fi
1620
fi
1721

1822
sudo apt-get update
19-
apt-get install -y --no-install-recommends clang-"$CLANG_VERSION"
20-
apt-get install -y --no-install-recommends llvm-"$CLANG_VERSION"
23+
apt-get install -y --no-install-recommends clang-"$CLANG_VERSION" llvm-"$CLANG_VERSION"
24+
if [[ $CLANG_VERSION == 18 ]]; then
25+
apt-get install -y --no-install-recommends libomp-18-dev
26+
fi
2127

2228
# Install dev version of LLVM.
2329
if [ -n "$LLVMDEV" ]; then

.ci/pytorch/common_utils.sh

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,9 +191,22 @@ function install_torchrec_and_fbgemm() {
191191
pip_uninstall torchrec-nightly
192192
pip_uninstall fbgemm-gpu-nightly
193193
pip_install setuptools-git-versioning scikit-build pyre-extensions
194+
195+
# TODO (huydhn): It is still unclear why sccache doesn't work with only fbgemm_gpu here, but it
196+
# seems to be an sccache-related issue
197+
if [[ "$IS_A100_RUNNER" == "1" ]]; then
198+
unset CMAKE_CUDA_COMPILER_LAUNCHER
199+
sudo mv /opt/cache/bin /opt/cache/bin-backup
200+
fi
201+
194202
# See https://github.com/pytorch/pytorch/issues/106971
195203
CUDA_PATH=/usr/local/cuda-12.1 pip_install --no-use-pep517 --user "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#egg=fbgemm-gpu&subdirectory=fbgemm_gpu"
196204
pip_install --no-use-pep517 --user "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}"
205+
206+
if [[ "$IS_A100_RUNNER" == "1" ]]; then
207+
export CMAKE_CUDA_COMPILER_LAUNCHER=/opt/cache/bin/sccache
208+
sudo mv /opt/cache/bin-backup /opt/cache/bin
209+
fi
197210
}
198211

199212
function clone_pytorch_xla() {

.ci/pytorch/test.sh

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -665,15 +665,6 @@ test_inductor_torchbench_smoketest_perf() {
665665
# The threshold value needs to be actively maintained to make this check useful
666666
python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_training_smoketest.csv" -t 1.4
667667

668-
TORCHINDUCTOR_ABI_COMPATIBLE=1 python benchmarks/dynamo/torchbench.py --device cuda --performance --bfloat16 --inference \
669-
--export-aot-inductor --only nanogpt --output "$TEST_REPORTS_DIR/inductor_inference_smoketest.csv"
670-
# The threshold value needs to be actively maintained to make this check useful
671-
# The perf number of nanogpt seems not very stable, e.g.
672-
# https://github.com/pytorch/pytorch/actions/runs/7158691360/job/19491437314,
673-
# and thus we lower its threshold to reduce flakiness. If this continues to be a problem,
674-
# we switch to use some other model.
675-
python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_inference_smoketest.csv" -t 4.9
676-
677668
# Check memory compression ratio for a few models
678669
for test in hf_Albert timm_vision_transformer; do
679670
python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --amp --training \
@@ -1469,7 +1460,7 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
14691460
# https://github.com/opencv/opencv-python/issues/885
14701461
pip_install opencv-python==4.8.0.74
14711462
if [[ "${TEST_CONFIG}" == *inductor_torchbench_smoketest_perf* ]]; then
1472-
checkout_install_torchbench hf_Bert hf_Albert nanogpt timm_vision_transformer
1463+
checkout_install_torchbench hf_Bert hf_Albert timm_vision_transformer
14731464
PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_smoketest_perf
14741465
elif [[ "${TEST_CONFIG}" == *inductor_torchbench_cpu_smoketest_perf* ]]; then
14751466
checkout_install_torchbench timm_vision_transformer phlippe_densenet basic_gnn_edgecnn \

.circleci/scripts/binary_linux_test.sh

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,11 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then
2727
source activate testenv >/dev/null
2828
elif [[ "$PACKAGE_TYPE" != libtorch ]]; then
2929
python_path="/opt/python/cp\$python_nodot-cp\${python_nodot}"
30-
# Prior to Python 3.8 paths were suffixed with an 'm'
31-
if [[ -d "\${python_path}/bin" ]]; then
32-
export PATH="\${python_path}/bin:\$PATH"
33-
elif [[ -d "\${python_path}m/bin" ]]; then
34-
export PATH="\${python_path}m/bin:\$PATH"
30+
if [[ "\$python_nodot" = *t ]]; then
31+
python_digits="\$(echo $DESIRED_PYTHON | tr -cd [:digit:])"
32+
python_path="/opt/python/cp\$python_digits-cp\${python_digits}t"
3533
fi
34+
export PATH="\${python_path}/bin:\$PATH"
3635
fi
3736
3837
EXTRA_CONDA_FLAGS=""

.clang-format

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,9 @@ ContinuationIndentWidth: 4
4444
Cpp11BracedListStyle: true
4545
DerivePointerAlignment: false
4646
DisableFormat: false
47-
ForEachMacros: [ FOR_EACH_RANGE, FOR_EACH, ]
47+
ForEachMacros:
48+
- FOR_EACH_RANGE
49+
- FOR_EACH
4850
IncludeCategories:
4951
- Regex: '^<.*\.h(pp)?>'
5052
Priority: 1
@@ -58,6 +60,24 @@ IndentWrappedFunctionNames: false
5860
KeepEmptyLinesAtTheStartOfBlocks: false
5961
MacroBlockBegin: ''
6062
MacroBlockEnd: ''
63+
Macros:
64+
- >-
65+
PyObject_HEAD_INIT(type)={
66+
/* this does not exactly match PyObject_HEAD_INIT in the Python source code
67+
* but it is enough for clang-format */
68+
{ 0xFFFFFFFF },
69+
(type)
70+
},
71+
- >-
72+
PyVarObject_HEAD_INIT(type, size)={
73+
{
74+
/* manually expand PyObject_HEAD_INIT(type) above
75+
* because clang-format does not support recursive expansion */
76+
{ 0xFFFFFFFF },
77+
(type)
78+
},
79+
(size)
80+
},
6181
MaxEmptyLinesToKeep: 1
6282
NamespaceIndentation: None
6383
PenaltyBreakBeforeFirstCallParameter: 1
@@ -79,7 +99,11 @@ SpacesInContainerLiterals: true
7999
SpacesInCStyleCastParentheses: false
80100
SpacesInParentheses: false
81101
SpacesInSquareBrackets: false
82-
Standard: Cpp11
102+
Standard: c++17
103+
StatementMacros:
104+
- PyObject_HEAD
105+
- PyObject_VAR_HEAD
106+
- PyException_HEAD
83107
TabWidth: 8
84108
UseTab: Never
85109
---

.github/actions/checkout-pytorch/action.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,14 @@ inputs:
1818
runs:
1919
using: composite
2020
steps:
21+
- name: Check if in a container runner
22+
shell: bash
23+
id: check_container_runner
24+
run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT"
25+
2126
- name: Clean workspace
2227
shell: bash
28+
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
2329
env:
2430
NO_SUDO: ${{ inputs.no-sudo }}
2531
run: |

.github/actions/linux-test/action.yml

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -85,23 +85,33 @@ runs:
8585
with:
8686
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
8787

88-
- name: Check if in a ARC runner
88+
- name: Check if in a container runner
8989
shell: bash
90-
id: check_arc_runner
91-
run: echo "IN_ARC_RUNNER=$([ -f /.inarc ] && echo true || echo false)" >> "$GITHUB_OUTPUT"
90+
id: check_container_runner
91+
run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT"
9292

9393
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
9494
id: install-nvidia-driver
9595
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
96-
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }}
96+
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
97+
98+
- name: Setup GPU_FLAG for docker run
99+
id: setup-gpu-flag
100+
run: echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
101+
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }}
102+
103+
- name: Setup SCCACHE_SERVER_PORT environment for docker run when on container
104+
id: setup-sscache-port-flag
105+
run: echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}"
106+
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }}
97107

98108
- name: Lock NVIDIA A100 40GB Frequency
99109
shell: bash
100110
run: |
101111
sudo nvidia-smi -pm 1
102112
sudo nvidia-smi -ac 1215,1410
103113
nvidia-smi
104-
if: contains(matrix.runner, 'a100')
114+
if: ${{ contains(matrix.runner, 'a100') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
105115

106116
- name: Start monitoring script
107117
id: monitor-script
@@ -172,6 +182,7 @@ runs:
172182
NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
173183
TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }}
174184
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
185+
SCCACHE_REGION: us-east-1
175186
SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
176187
SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
177188
DOCKER_IMAGE: ${{ inputs.docker-image }}
@@ -181,6 +192,9 @@ runs:
181192
PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
182193
DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
183194
HUGGING_FACE_HUB_TOKEN: ${{ inputs.HUGGING_FACE_HUB_TOKEN }}
195+
SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
196+
IS_A100_RUNNER: ${{ contains(matrix.runner, 'a100') && '1' || '0' }}
197+
184198
shell: bash
185199
run: |
186200
set -x
@@ -199,6 +213,7 @@ runs:
199213
# shellcheck disable=SC2086,SC2090
200214
container_name=$(docker run \
201215
${GPU_FLAG:-} \
216+
${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \
202217
-e BUILD_ENVIRONMENT \
203218
-e PR_NUMBER \
204219
-e GITHUB_ACTIONS \
@@ -227,14 +242,17 @@ runs:
227242
-e PR_LABELS \
228243
-e MAX_JOBS="$(nproc --ignore=2)" \
229244
-e SCCACHE_BUCKET \
245+
-e SCCACHE_REGION \
230246
-e SCCACHE_S3_KEY_PREFIX \
231247
-e XLA_CUDA \
232248
-e XLA_CLANG_CACHE_S3_BUCKET_NAME \
233249
-e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
234250
-e PYTORCH_TEST_RERUN_DISABLED_TESTS \
235251
-e SKIP_SCCACHE_INITIALIZATION=1 \
236252
-e HUGGING_FACE_HUB_TOKEN \
253+
-e SCRIBE_GRAPHQL_ACCESS_TOKEN \
237254
-e DASHBOARD_TAG \
255+
-e IS_A100_RUNNER \
238256
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
239257
--security-opt seccomp=unconfined \
240258
--cap-add=SYS_PTRACE \
@@ -305,7 +323,7 @@ runs:
305323

306324
- name: Teardown Linux
307325
uses: pytorch/test-infra/.github/actions/teardown-linux@main
308-
if: always()
326+
if: always() && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false'
309327

310328
# NB: We are currently having an intermittent GPU-related issue on G5 runners with
311329
# A10G GPU. Once this happens, trying to reset the GPU as done in setup-nvidia does

.github/actions/setup-linux/action.yml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,14 @@ runs:
2828
echo "instance-type: $(get_ec2_metadata instance-type)"
2929
echo "system info $(uname -a)"
3030
31-
- name: Check if in a ARC runner
31+
- name: Check if in a container runner
3232
shell: bash
33-
id: check_arc_runner
34-
run: echo "IN_ARC_RUNNER=$([ -f /.inarc ] && echo true || echo false)" >> $GITHUB_OUTPUT
33+
id: check_container_runner
34+
run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT"
3535

3636
- name: Start docker if docker deamon is not running
3737
shell: bash
38-
if: ${{ steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }}
38+
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
3939
run: |
4040
if systemctl is-active --quiet docker; then
4141
echo "Docker daemon is running...";
@@ -73,7 +73,7 @@ runs:
7373
env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
7474
7575
- name: Kill any existing containers, clean up images
76-
if: ${{ steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }}
76+
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
7777
shell: bash
7878
run: |
7979
# ignore expansion of "docker ps -q" since it could be empty
@@ -116,7 +116,7 @@ runs:
116116
- name: Check that the docker daemon is running
117117
shell: bash
118118
continue-on-error: true
119-
if: ${{ steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'true' }}
119+
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }}
120120
run: |
121121
set +x
122122

.github/ci_commit_pins/audio.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
ba696ea3dfec4cbe693bf06a84c75dc196077f5b
1+
3f0569939c4369bec943fc27d1c9d8dfbc828c26

0 commit comments

Comments
 (0)