Skip to content

Commit f21f40e

Browse files
author
Jiewen Tan
committed
Update on "[c10d] Make barrier as a custom op"
Summary: This patch makes barrier a custom op so that it can be passed through the dispatcher. It's one part of the effort to route comm ops to the dispatcher so that tracing mechanisms that rely on the dispatcher, e.g., LazyTensor and AOTAutograd, can trace them. Test Plan: python test/distributed/test_c10d_nccl.py -k test_nccl_barrier ...and other existing distributed tests. [ghstack-poisoned]
2 parents c7d7aa1 + 5b80fc4 commit f21f40e

File tree

243 files changed

+10753
-5348
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

243 files changed

+10753
-5348
lines changed

.bazelrc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,10 @@ build --copt=-isystem --copt=bazel-out/k8-fastbuild-cpu-only/bin
3131
# rules_cuda configuration
3232
build:cpu-only --@rules_cuda//cuda:enable_cuda=False
3333

34+
# Definition of --config=shell
35+
# interactive shell immediately before execution
36+
build:shell --run_under="//tools/bazel_tools:shellwrap"
37+
3438
# Disable all warnings for external repositories. We don't care about
3539
# their warnings.
3640
build --per_file_copt=^external/@-w

.circleci/scripts/binary_windows_build.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR"
66

77
export CUDA_VERSION="${DESIRED_CUDA/cu/}"
88
export USE_SCCACHE=1
9-
export SCCACHE_BUCKET=ossci-compiler-cache-windows
9+
export SCCACHE_BUCKET=ossci-compiler-cache
1010
export SCCACHE_IGNORE_SERVER_IO_ERROR=1
1111
export VC_YEAR=2019
1212

.github/actions/build-android/action.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,6 @@ runs:
3737
shell: bash
3838
env:
3939
BRANCH: ${{ inputs.branch }}
40-
JOB_BASE_NAME: ${{ inputs.build-environment }}-build-and-test
4140
BUILD_ENVIRONMENT: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-${{ inputs.arch-for-build-env }}-build"
4241
AWS_DEFAULT_REGION: us-east-1
4342
PR_NUMBER: ${{ github.event.pull_request.number }}
@@ -51,7 +50,6 @@ runs:
5150
export container_name
5251
container_name=$(docker run \
5352
-e BUILD_ENVIRONMENT \
54-
-e JOB_BASE_NAME \
5553
-e MAX_JOBS="$(nproc --ignore=2)" \
5654
-e AWS_DEFAULT_REGION \
5755
-e PR_NUMBER \
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2d43de8576e47a1d7f287011f14a55e93b568428
1+
1f16b92a728d527a6394a9687809ccc7888b4f48

.github/ci_commit_pins/xla.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
02346af955653a179b896eef5475e569ab8d4229
1+
1b0d4feca391303bcfe2846bc198b5e89f8f72d4

.github/merge_rules.json

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,12 @@
33
"name": "ONNX exporter",
44
"patterns": [
55
".jenkins/caffe2/*",
6-
"scripts/onnx/**",
6+
"aten/src/ATen/core/interned_strings.h",
77
"docs/source/onnx.rst",
8-
"test/onnx/**",
8+
"docs/source/scripts/onnx/**",
9+
"scripts/onnx/**",
910
"test/jit/test_export_modes.py",
10-
"aten/src/ATen/core/interned_strings.h",
11+
"test/onnx/**",
1112
"tools/onnx/**",
1213
"torch/_C/__init__.pyi.in",
1314
"torch/csrc/jit/passes/onnx.*",
@@ -51,14 +52,28 @@
5152
},
5253
{
5354
"name": "CI Pinned Hashes",
54-
"patterns": [".github/ci_commit_pins/**"],
55+
"patterns": [
56+
".github/ci_commit_pins/vision.txt",
57+
".github/ci_commit_pins/torchdynamo.txt"
58+
],
5559
"approved_by": ["pytorchbot", "ezyang", "pytorch/pytorch-dev-infra"],
5660
"mandatory_checks_name": [
5761
"Facebook CLA Check",
5862
"Lint",
5963
"pull"
6064
]
6165
},
66+
{
67+
"name": "XLA hash pin update",
68+
"patterns": [".github/ci_commit_pins/xla.txt"],
69+
"approved_by": ["pytorchbot", "ezyang", "pytorch/pytorch-dev-infra"],
70+
"mandatory_checks_name": [
71+
"Facebook CLA Check",
72+
"Lint",
73+
"pull / linux-bionic-py3_7-clang8-xla / build",
74+
"pull / linux-bionic-py3_7-clang8-xla / test (xla, 1, 1, linux.2xlarge)"
75+
]
76+
},
6277
{
6378
"name": "Documentation",
6479
"patterns": ["docs/**", "torch/*docs.py"],

.github/scripts/trymerge.py

Lines changed: 23 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -302,7 +302,7 @@
302302
"""
303303

304304
RE_GHSTACK_HEAD_REF = re.compile(r"^(gh/[^/]+/[0-9]+/)head$")
305-
RE_GHSTACK_SOURCE_ID = re.compile(r'^ghstack-source-id: (.+)\n?', re.MULTILINE)
305+
RE_GHSTACK_DESC = re.compile(r'Stack.*:\r?\n(\* [^\r\n]+\r?\n)+', re.MULTILINE)
306306
RE_PULL_REQUEST_RESOLVED = re.compile(
307307
r'Pull Request resolved: '
308308
r'https://github.com/(?P<owner>[^/]+)/(?P<repo>[^/]+)/pull/(?P<number>[0-9]+)',
@@ -564,8 +564,11 @@ def add_conclusions(edges: List[Dict[str, Dict[str, Any]]]) -> None:
564564
checkruns = node["checkRuns"]
565565
if workflow_run is not None:
566566
conclusions[workflow_run["workflow"]["name"]] = (node["conclusion"], node["url"])
567+
has_failing_check = False
567568
while checkruns is not None:
568569
for checkrun_node in checkruns["nodes"]:
570+
if checkrun_node["conclusion"] == 'FAILURE':
571+
has_failing_check = True
569572
conclusions[checkrun_node["name"]] = (checkrun_node["conclusion"], checkrun_node["detailsUrl"])
570573
if bool(checkruns["pageInfo"]["hasNextPage"]):
571574
rc = gh_graphql(GH_GET_PR_NEXT_CHECK_RUNS,
@@ -578,6 +581,9 @@ def add_conclusions(edges: List[Dict[str, Dict[str, Any]]]) -> None:
578581
checkruns = last_commit["checkSuites"]["nodes"][-1]["checkRuns"]
579582
else:
580583
checkruns = None
584+
# Github doesn't set conclusion to failure if a job is still pending
585+
if workflow_run is not None and has_failing_check:
586+
conclusions[workflow_run["workflow"]["name"]] = ("FAILURE", node["url"])
581587

582588
add_conclusions(checksuites["edges"])
583589
while bool(checksuites["pageInfo"]["hasNextPage"]):
@@ -702,33 +708,38 @@ def merge_ghstack_into(self, repo: GitRepo, force: bool, comment_id: Optional[in
702708
if self.org != m.group('owner') or self.project != m.group('repo'):
703709
raise RuntimeError(f"PR {m.group('number')} resolved to wrong owner/repo pair")
704710
pr_num = int(m.group('number'))
711+
commit_msg = self.gen_commit_message(filter_ghstack=True)
705712
if pr_num != self.pr_num:
706713
pr = GitHubPR(self.org, self.project, pr_num)
707714
if pr.is_closed():
708715
print(f"Skipping {idx+1} of {len(rev_list)} PR (#{pr_num}) as its already been merged")
709716
continue
710-
approved_by = pr.get_approved_by()
717+
commit_msg = pr.gen_commit_message(filter_ghstack=True)
711718
# Raises exception if matching rule is not found
712719
find_matching_merge_rule(pr, repo, force=force, skip_internal_checks=can_skip_internal_checks(self, comment_id))
713720

714-
# Adding the url here makes it clickable within the Github UI
715-
approved_by_urls = ', '.join(prefix_with_github_url(login) for login in approved_by)
716721
repo.cherry_pick(rev)
717-
msg = re.sub(RE_GHSTACK_SOURCE_ID, "", msg)
718-
msg += f"\nApproved by: {approved_by_urls}\n"
719-
repo.amend_commit_message(msg)
722+
repo.amend_commit_message(commit_msg)
723+
724+
def gen_commit_message(self, filter_ghstack: bool = False) -> str:
725+
""" Fetches title and body from PR description
726+
adds reviewed by, pull request resolved and optionally
727+
filters out ghstack info """
728+
# Adding the url here makes it clickable within the Github UI
729+
approved_by_urls = ', '.join(prefix_with_github_url(login) for login in self.get_approved_by())
730+
msg = self.get_title() + f" (#{self.pr_num})\n\n"
731+
msg += self.get_body() if not filter_ghstack else re.sub(RE_GHSTACK_DESC, "", self.get_body())
732+
msg += f"\nPull Request resolved: {self.get_pr_url()}\n"
733+
msg += f"Approved by: {approved_by_urls}\n"
734+
return msg
720735

721736
def merge_into(self, repo: GitRepo, *, force: bool = False, dry_run: bool = False, comment_id: Optional[int] = None) -> None:
722737
# Raises exception if matching rule is not found
723738
find_matching_merge_rule(self, repo, force=force, skip_internal_checks=can_skip_internal_checks(self, comment_id))
724739
if repo.current_branch() != self.default_branch():
725740
repo.checkout(self.default_branch())
726741
if not self.is_ghstack_pr():
727-
# Adding the url here makes it clickable within the Github UI
728-
approved_by_urls = ', '.join(prefix_with_github_url(login) for login in self.get_approved_by())
729-
msg = self.get_title() + f" (#{self.pr_num})\n\n" + self.get_body()
730-
msg += f"\nPull Request resolved: {self.get_pr_url()}\n"
731-
msg += f"Approved by: {approved_by_urls}\n"
742+
msg = self.gen_commit_message()
732743
pr_branch_name = f"__pull-request-{self.pr_num}__init__"
733744
repo.fetch(f"pull/{self.pr_num}/head", pr_branch_name)
734745
repo._run_git("merge", "--squash", pr_branch_name)

.github/templates/common.yml.j2

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,6 @@ on:
9191
AWS_DEFAULT_REGION: us-east-1
9292
GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
9393
BRANCH: ${{ steps.parse-ref.outputs.branch }}
94-
JOB_BASE_NAME: !{{ build_environment }}-test
9594
PR_NUMBER: ${{ github.event.pull_request.number }}
9695
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
9796
TAG: ${{ steps.parse-ref.outputs.tag }}

.github/workflows/_android-build-test.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,6 @@ jobs:
5757
- name: Build
5858
env:
5959
BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
60-
JOB_BASE_NAME: ${{ inputs.build-environment }}-build-and-test
6160
TORCH_CUDA_ARCH_LIST: 5.2
6261
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
6362
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
@@ -79,7 +78,6 @@ jobs:
7978
git submodule sync && git submodule update -q --init --recursive --depth 1 --jobs 0
8079
export id
8180
id=$(docker run -e BUILD_ENVIRONMENT \
82-
-e JOB_BASE_NAME \
8381
-e MAX_JOBS="$(nproc --ignore=2)" \
8482
-e SCCACHE_BUCKET \
8583
-e SKIP_SCCACHE_INITIALIZATION=1 \

.github/workflows/_bazel-build-test.yml

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,6 @@ jobs:
6868
env:
6969
BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
7070
BRANCH: ${{ steps.parse-ref.outputs.branch }}
71-
JOB_BASE_NAME: ${{ inputs.build-environment }}-build-and-test
7271
# TODO duplicated
7372
AWS_DEFAULT_REGION: us-east-1
7473
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
@@ -80,7 +79,6 @@ jobs:
8079
# detached container should get cleaned up by teardown_ec2_linux
8180
container_name=$(docker run \
8281
-e BUILD_ENVIRONMENT \
83-
-e JOB_BASE_NAME \
8482
-e MAX_JOBS="$(nproc --ignore=2)" \
8583
-e SCCACHE_BUCKET \
8684
-e SKIP_SCCACHE_INITIALIZATION=1 \
@@ -104,7 +102,6 @@ jobs:
104102
# Time out the test phase after 3.5 hours
105103
timeout-minutes: 210
106104
env:
107-
JOB_BASE_NAME: ${{ inputs.build-environment }}-build-and-test
108105
BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
109106
PR_NUMBER: ${{ github.event.pull_request.number }}
110107
BRANCH: ${{ steps.parse-ref.outputs.branch }}
@@ -128,7 +125,6 @@ jobs:
128125
-e GIT_DEFAULT_BRANCH="$GIT_DEFAULT_BRANCH" \
129126
-e SHARD_NUMBER \
130127
-e NUM_TEST_SHARDS \
131-
-e JOB_BASE_NAME \
132128
-e MAX_JOBS="$(nproc --ignore=2)" \
133129
-e SCCACHE_BUCKET \
134130
-e PYTORCH_RETRY_TEST_CASES \
@@ -163,7 +159,6 @@ jobs:
163159
GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
164160
BRANCH: ${{ steps.parse-ref.outputs.branch }}
165161
BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
166-
JOB_BASE_NAME: ${{ inputs.build-environment }}-test
167162
PR_NUMBER: ${{ github.event.pull_request.number }}
168163
PYTORCH_RETRY_TEST_CASES: 1
169164
PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1

0 commit comments

Comments
 (0)