Skip to content

Commit 2dfe6d8

Browse files
author
Yifu Wang
committed
Update on "[SymmetricMemoryOps] implement one_shot_all_reduce"
[ghstack-poisoned]
2 parents 96d0257 + 6d4a429 commit 2dfe6d8

30 files changed

+305
-249
lines changed

.ci/docker/requirements-ci.txt

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -139,9 +139,9 @@ opt-einsum==3.3
139139
#Pinned versions: 3.3
140140
#test that import: test_linalg.py
141141

142-
optree==0.12.1
142+
optree==0.13.0
143143
#Description: A library for tree manipulation
144-
#Pinned versions: 0.12.1
144+
#Pinned versions: 0.13.0
145145
#test that import: test_vmap.py, test_aotdispatch.py, test_dynamic_shapes.py,
146146
#test_pytree.py, test_ops.py, test_control_flow.py, test_modules.py,
147147
#common_utils.py, test_eager_transforms.py, test_python_dispatch.py,
Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
11
# iOS simulator requirements
22
coremltools==5.0b5
33
protobuf==3.20.2
4-
optree==0.12.1
4+
optree==0.13.0

.github/requirements/pip-requirements-macOS.txt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -27,7 +27,7 @@ pytest-cpp==2.3.0
2727
rockset==1.0.3
2828
z3-solver==4.12.2.0
2929
tensorboard==2.13.0
30-
optree==0.12.1
30+
optree==0.13.0
3131
# NB: test_hparams_* from test_tensorboard is failing with protobuf 5.26.0 in
3232
# which the stringify metadata is wrong when escaping double quote
3333
protobuf==3.20.2

.github/templates/linux_binary_build_workflow.yml.j2

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -68,6 +68,7 @@ jobs:
6868
needs: get-label-type
6969
with:!{{ upload.binary_env_as_input(config) }}
7070
{%- if "aarch64" in build_environment %}
71+
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
7172
runs_on: linux.arm64.m7g.4xlarge.ephemeral
7273
ALPINE_IMAGE: "arm64v8/alpine"
7374
{%- elif "s390x" in build_environment %}
@@ -102,6 +103,7 @@ jobs:
102103
build_name: !{{ config["build_name"] }}
103104
build_environment: !{{ build_environment }}
104105
{%- if "aarch64" in build_environment %}
106+
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
105107
runs_on: linux.arm64.2xlarge
106108
ALPINE_IMAGE: "arm64v8/alpine"
107109
{%- elif "s390x" in build_environment %}

.github/workflows/_win-test.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -189,7 +189,7 @@ jobs:
189189
run: |
190190
pushd "${PYTORCH_FINAL_PACKAGE_DIR}"
191191
# shellcheck disable=SC2046,SC2102
192-
python3 -mpip install $(echo *.whl)[opt-einsum,optree] optree==0.12.1
192+
python3 -mpip install $(echo *.whl)[opt-einsum,optree] optree==0.13.0
193193
popd
194194
195195
.ci/pytorch/win-test.sh

.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml

Lines changed: 12 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.lintrunner.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -153,7 +153,7 @@ init_command = [
153153
'junitparser==2.1.1',
154154
'rich==10.9.0',
155155
'pyyaml==6.0.1',
156-
'optree==0.12.1',
156+
'optree==0.13.0',
157157
]
158158

159159
[[linter]]

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -21,4 +21,4 @@ fsspec
2121
lintrunner
2222
ninja
2323
packaging
24-
optree>=0.12.0 ; python_version <= "3.12"
24+
optree>=0.13.0

setup.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1203,7 +1203,7 @@ def main():
12031203
install_requires += extra_install_requires
12041204

12051205
extras_require = {
1206-
"optree": ["optree>=0.12.0"],
1206+
"optree": ["optree>=0.13.0"],
12071207
"opt-einsum": ["opt-einsum>=3.3"],
12081208
}
12091209

test/distributed/test_c10d_nccl.py

Lines changed: 34 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -2554,6 +2554,24 @@ def _test_nccl_errors_blocking(self, func):
25542554
del process_group
25552555
func()
25562556

2557+
def _test_barrier_error(self):
2558+
store = c10d.FileStore(self.file_name, self.world_size)
2559+
process_group = c10d.ProcessGroupNCCL(
2560+
store,
2561+
self.rank,
2562+
self.world_size,
2563+
timeout=timedelta(seconds=10),
2564+
)
2565+
process_group.barrier().wait()
2566+
if self.rank == 0:
2567+
with self.assertRaisesRegex(dist.DistBackendError, ""):
2568+
# It seems the error message would be different depending on
2569+
# whether the test is run on CI machine and devGPU. Skipping
2570+
# the error message check to make both sides happy.
2571+
process_group.barrier().wait(
2572+
timeout=timedelta(seconds=self.op_timeout_sec)
2573+
)
2574+
25572575
@with_nccl_blocking_wait
25582576
@requires_nccl()
25592577
@requires_nccl_version((2, 4, 0), "Need NCCL 2.4+ for error checking")
@@ -2602,22 +2620,23 @@ def test_nccl_errors_blocking_sigterm(self):
26022620
@requires_nccl_version((2, 4, 0), "Need NCCL 2.4+ for error checking")
26032621
@skip_if_lt_x_gpu(3)
26042622
def test_nccl_blocking_wait_with_barrier(self):
2605-
store = c10d.FileStore(self.file_name, self.world_size)
2606-
process_group = c10d.ProcessGroupNCCL(
2607-
store,
2608-
self.rank,
2609-
self.world_size,
2610-
timeout=timedelta(seconds=10),
2623+
self._test_barrier_error()
2624+
2625+
@requires_nccl()
2626+
@requires_nccl_version((2, 4, 0), "Need NCCL 2.4+ for error checking")
2627+
@skip_if_lt_x_gpu(3)
2628+
def test_nccl_non_blocking_wait_with_barrier(self):
2629+
# test the barrier behavior in the non blocking wait setting
2630+
prev_nccl_async_error_handling = os.environ.get(
2631+
"TORCH_NCCL_ASYNC_ERROR_HANDLING", None
26112632
)
2612-
process_group.barrier().wait()
2613-
if self.rank == 0:
2614-
with self.assertRaisesRegex(dist.DistBackendError, ""):
2615-
# It seems the error message would be different depending on
2616-
# whether the test is run on CI machine and devGPU. Skipping
2617-
# the error message check to make both sides happy.
2618-
process_group.barrier().wait(
2619-
timeout=timedelta(seconds=self.op_timeout_sec)
2620-
)
2633+
# avoid watchdog thread interference
2634+
os.environ["TORCH_NCCL_ASYNC_ERROR_HANDLING"] = "0"
2635+
self._test_barrier_error()
2636+
if prev_nccl_async_error_handling is not None:
2637+
os.environ[
2638+
"TORCH_NCCL_ASYNC_ERROR_HANDLING"
2639+
] = prev_nccl_async_error_handling
26212640

26222641
def _run_invalid_nccl_blocking_wait_env(self, val):
26232642
os.environ["TORCH_NCCL_BLOCKING_WAIT"] = val

0 commit comments

Comments
 (0)