Skip to content

Commit f39860d

Browse files
author
Jiewen Tan
committed
Update on "[c10d] Make send/recv as custom ops"
Summary: This patch turns send/recv into custom ops so that they are dispatcher-passable. It is one part of the effort to route comm ops through the dispatcher so that tracing mechanisms that rely on the dispatcher can trace them, e.g., LazyTensor and AOTAutograd. Test Plan: python test/distributed/test_c10d_nccl.py -k test_send_recv ...and other existing distributed tests. [ghstack-poisoned]
2 parents a3c6faa + 6d388e8 commit f39860d

File tree

68 files changed

+5029
-3155
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

68 files changed

+5029
-3155
lines changed

.github/workflows/periodic.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,15 @@ concurrency:
1414

1515
jobs:
1616
linux-xenial-cuda10_2-py3-gcc7-slow-gradcheck-build:
17+
if: false # https://github.com/pytorch/pytorch/issues/80314
1718
name: linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck
1819
uses: ./.github/workflows/_linux-build.yml
1920
with:
2021
build-environment: linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck
2122
docker-image-name: pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7
2223

2324
linux-xenial-cuda10_2-py3-gcc7-slow-gradcheck-test:
25+
if: false # https://github.com/pytorch/pytorch/issues/80314
2426
name: linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck
2527
uses: ./.github/workflows/_linux-test.yml
2628
needs: linux-xenial-cuda10_2-py3-gcc7-slow-gradcheck-build

.github/workflows/trunk.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ jobs:
6262
{ config: "default", shard: 1, num_shards: 2, runner: "linux.4xlarge.nvidia.gpu" },
6363
{ config: "default", shard: 2, num_shards: 2, runner: "linux.4xlarge.nvidia.gpu" },
6464
{ config: "slow", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
65-
{ config: "nogpu_NO_AVX", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
65+
{ config: "nogpu_AVX512", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
6666
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
6767
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
6868
{ config: "distributed", shard: 1, num_shards: 2, runner: "linux.8xlarge.nvidia.gpu" },

.jenkins/pytorch/test.sh

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -149,13 +149,9 @@ if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then
149149
(cd test && ! get_exit_code python -c "import torch; torch._C._crash_if_aten_asan(3)")
150150
fi
151151

152-
if [[ $TEST_CONFIG == 'nogpu_NO_AVX' ]]; then
152+
if [[ $TEST_CONFIG == 'nogpu_NO_AVX2' ]]; then
153153
export ATEN_CPU_CAPABILITY=default
154-
elif [[ $TEST_CONFIG == 'nogpu_NO_AVX2' ]]; then
155-
export ATEN_CPU_CAPABILITY=default
156-
157-
# TODO: this condition is never (we have no NO_AVX512 config), need to fix this.
158-
elif [[ $TEST_CONFIG == 'nogpu_NO_AVX512' ]]; then
154+
elif [[ $TEST_CONFIG == 'nogpu_AVX512' ]]; then
159155
export ATEN_CPU_CAPABILITY=avx2
160156
fi
161157

aten/src/ATen/NestedTensorImpl.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,9 @@ struct TORCH_API NestedTensorImpl : public c10::TensorImpl {
6565
// with real implementations
6666
int64_t numel_custom() const override;
6767
bool is_contiguous_custom(MemoryFormat) const override;
68+
int64_t size_custom(int64_t d) const override {
69+
return this->size(d);
70+
}
6871
IntArrayRef sizes_custom() const override;
6972
c10::SymIntArrayRef sym_sizes_custom() const override;
7073
c10::SymIntArrayRef sym_sizes() const override;

aten/src/ATen/SparseCsrTensorUtils.h

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,38 @@ inline std::string plainIndicesName(Layout layout) {
181181
[&] { return "row_indices"; });
182182
}
183183

184+
// Returns a human-readable name for the compressed dimension of a
// sparse compressed layout (e.g. "row" for CSR, "column block" for BSC).
// Raises via TORCH_CHECK for any layout that is not sparse compressed.
inline std::string compressedDimName(Layout layout) {
  if (layout == kSparseCsr) {
    return "row";
  }
  if (layout == kSparseCsc) {
    return "column";
  }
  if (layout == kSparseBsr) {
    return "row block";
  }
  if (layout == kSparseBsc) {
    return "column block";
  }
  TORCH_CHECK(false, "Not a sparse compressed layout:", layout);
  return ""; // unreachable; keeps compilers happy about a missing return
}
199+
200+
// Returns a human-readable name for the plain (non-compressed) dimension
// of a sparse compressed layout — the counterpart of compressedDimName
// (e.g. "column" for CSR, "row block" for BSC).
// Raises via TORCH_CHECK for any layout that is not sparse compressed.
inline std::string plainDimName(Layout layout) {
  if (layout == kSparseCsr) {
    return "column";
  }
  if (layout == kSparseCsc) {
    return "row";
  }
  if (layout == kSparseBsr) {
    return "column block";
  }
  if (layout == kSparseBsc) {
    return "row block";
  }
  TORCH_CHECK(false, "Not a sparse compressed layout:", layout);
  return ""; // unreachable; keeps compilers happy about a missing return
}
215+
184216
inline int rowDimension(Layout layout, IntArrayRef size) {
185217
return size.size() - (isCompressedRow(layout) ? 2 : 1);
186218
}

aten/src/ATen/core/TensorBase.h

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -165,10 +165,7 @@ class TORCH_API TensorBase {
165165
}
166166

167167
int64_t size(int64_t dim) const {
168-
const auto sizes = this->sizes();
169-
const auto ndim = static_cast<int64_t>(sizes.size());
170-
// false is passed to maybe_wrap_dim so behavior is identical to array access (but with wrapping)
171-
return sizes[c10::maybe_wrap_dim(dim, ndim, /*wrap_scalar=*/false)];
168+
return impl_->size(dim);
172169
}
173170

174171
int64_t stride(int64_t dim) const {

aten/src/ATen/core/jit_type_base.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -247,7 +247,7 @@ struct TORCH_API Type {
247247
// nvcc; see comment in destroy() below.
248248
struct SharedPtrWrapper {
249249
SharedPtrWrapper(std::shared_ptr<T> &&x)
250-
: repr_(x) {}
250+
: repr_(std::move(x)) {}
251251
std::shared_ptr<T> repr_;
252252
};
253253
union Repr {

aten/src/ATen/native/DispatchStub.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,9 @@ static CPUCapability compute_cpu_capability() {
3939

4040
#if !defined(__powerpc__) && !defined(__s390x__)
4141
if (cpuinfo_initialize()) {
42-
#ifdef HAVE_AVX512_CPU_DEFINITION
42+
// AVX512 can be slower than AVX2, so let's keep it as opt-in
43+
// see https://github.com/pytorch/pytorch/issues/80252
44+
#if defined(HAVE_AVX512_CPU_DEFINITION) && false
4345
// GCC supports some AVX512 intrinsics such as _mm512_set_epi16 only in
4446
// versions 9 & beyond. So, we want to ensure that only releases built with
4547
// supported compilers on supported hardware return CPU Capability AVX512,

aten/src/ATen/native/EmbeddingBag.cpp

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#include <ATen/NativeFunctions.h>
44
#include <ATen/Parallel.h>
55
#include <ATen/TensorUtils.h>
6+
#include <ATen/TensorSubclassLikeUtils.h>
67

78
#include <ATen/native/CPUBlas.h>
89

@@ -1271,11 +1272,18 @@ Tensor _embedding_bag_backward(const Tensor &grad, const Tensor &indices_,
12711272

12721273
Tensor offset2bag_;
12731274
if (indices.numel() != 0 && offset2bag.numel() == 0) {
1274-
offset2bag_ = at::zeros(
1275-
{indices.size(0) + 1}, offsets.options()); // offset2bag = [0 0 0 0 0]
1275+
offset2bag_ = offsets.new_zeros(
1276+
{indices.size(0) + 1}, offsets.options()); // offset2bag = [0 0 0 0 0]
12761277

12771278
make_offset2bag(offsets, offset2bag_);
1278-
offset2bag_.resize_({indices.size(0)});
1279+
// For Composite Compliance, if `offset2bag_` is CCT
1280+
// then we can't call `resize_`. Instead we call `narrow`
1281+
// to slice the tensor.
1282+
if (isTensorSubclassLike(offset2bag_)) {
1283+
offset2bag_ = offset2bag_.narrow(0, 0, indices.size(0));
1284+
} else {
1285+
offset2bag_.resize_({indices.size(0)});
1286+
}
12791287
} else {
12801288
auto offset2bag_arg = TensorArg(offset2bag, "offset2bag", 1);
12811289
checkScalarTypes("embedding_bag", offset2bag_arg, {kLong, kInt});

aten/src/ATen/native/nested/NestedTensorMath.cpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -560,6 +560,22 @@ Tensor NestedTensor_elementwise_Tensor(
560560
const Tensor& other,
561561
const std::string& op_name,
562562
Func f) {
563+
// self is a scalar
564+
if (!self.is_nested() && self.dim() == 0 && self.numel() == 1) {
565+
auto other_impl = get_nested_tensor_impl(other);
566+
return wrap_buffer(
567+
f(self, other_impl->get_buffer()),
568+
other_impl->get_nested_size_tensor().clone()
569+
);
570+
}
571+
// other is a scalar
572+
if (!other.is_nested() && other.dim() == 0 && other.numel() == 1) {
573+
auto self_impl = get_nested_tensor_impl(self);
574+
return wrap_buffer(
575+
f(self_impl->get_buffer(), other),
576+
self_impl->get_nested_size_tensor().clone()
577+
);
578+
}
563579
NestedTensorImpl* self_impl = nullptr;
564580
NestedTensorImpl* other_impl = nullptr;
565581
std::tie(self_impl, other_impl) =
@@ -598,6 +614,18 @@ Tensor& NestedTensor_elementwise__Tensor(
598614
const Tensor& other,
599615
const std::string& op_name,
600616
Func f) {
617+
// self is a scalar
618+
if (!self.is_nested() && self.dim() == 0 && self.numel() == 1) {
619+
auto other_impl = get_nested_tensor_impl(other);
620+
f(self, other_impl->get_buffer());
621+
return self;
622+
}
623+
// other is a scalar
624+
if (!other.is_nested() && other.dim() == 0 && other.numel() == 1) {
625+
auto self_impl = get_nested_tensor_impl(self);
626+
f(self_impl->get_buffer(), other);
627+
return self;
628+
}
601629
NestedTensorImpl* self_impl = nullptr;
602630
NestedTensorImpl* other_impl = nullptr;
603631
std::tie(self_impl, other_impl) =

0 commit comments

Comments
 (0)