Skip to content

Commit f39860d

Browse files
author
Jiewen Tan
committed
Update on "[c10d] Make send/recv as custom ops"
Summary: This patch turns send/recv into custom ops so that they are dispatcher-passable. It is one part of the effort to route comm ops through the dispatcher so that tracing mechanisms that rely on the dispatcher can trace them, e.g., LazyTensor and AOTAutograd. Test Plan: python test/distributed/test_c10d_nccl.py -k test_send_recv ...and other existing distributed tests. [ghstack-poisoned]
2 parents a3c6faa + 6d388e8 commit f39860d

File tree

68 files changed

+5029
-3155
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

68 files changed

+5029
-3155
lines changed

.github/workflows/periodic.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,15 @@ concurrency:
1414

1515
jobs:
1616
linux-xenial-cuda10_2-py3-gcc7-slow-gradcheck-build:
17+
if: false # https://github.com/pytorch/pytorch/issues/80314
1718
name: linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck
1819
uses: ./.github/workflows/_linux-build.yml
1920
with:
2021
build-environment: linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck
2122
docker-image-name: pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7
2223

2324
linux-xenial-cuda10_2-py3-gcc7-slow-gradcheck-test:
25+
if: false # https://github.com/pytorch/pytorch/issues/80314
2426
name: linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck
2527
uses: ./.github/workflows/_linux-test.yml
2628
needs: linux-xenial-cuda10_2-py3-gcc7-slow-gradcheck-build

.github/workflows/trunk.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ jobs:
6262
{ config: "default", shard: 1, num_shards: 2, runner: "linux.4xlarge.nvidia.gpu" },
6363
{ config: "default", shard: 2, num_shards: 2, runner: "linux.4xlarge.nvidia.gpu" },
6464
{ config: "slow", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
65-
{ config: "nogpu_NO_AVX", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
65+
{ config: "nogpu_AVX512", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
6666
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
6767
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
6868
{ config: "distributed", shard: 1, num_shards: 2, runner: "linux.8xlarge.nvidia.gpu" },

.jenkins/pytorch/test.sh

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -149,13 +149,9 @@ if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then
149149
(cd test && ! get_exit_code python -c "import torch; torch._C._crash_if_aten_asan(3)")
150150
fi
151151

152-
if [[ $TEST_CONFIG == 'nogpu_NO_AVX' ]]; then
152+
if [[ $TEST_CONFIG == 'nogpu_NO_AVX2' ]]; then
153153
export ATEN_CPU_CAPABILITY=default
154-
elif [[ $TEST_CONFIG == 'nogpu_NO_AVX2' ]]; then
155-
export ATEN_CPU_CAPABILITY=default
156-
157-
# TODO: this condition is never (we have no NO_AVX512 config), need to fix this.
158-
elif [[ $TEST_CONFIG == 'nogpu_NO_AVX512' ]]; then
154+
elif [[ $TEST_CONFIG == 'nogpu_AVX512' ]]; then
159155
export ATEN_CPU_CAPABILITY=avx2
160156
fi
161157

aten/src/ATen/NestedTensorImpl.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,9 @@ struct TORCH_API NestedTensorImpl : public c10::TensorImpl {
6565
// with real implementations
6666
int64_t numel_custom() const override;
6767
bool is_contiguous_custom(MemoryFormat) const override;
68+
int64_t size_custom(int64_t d) const override {
69+
return this->size(d);
70+
}
6871
IntArrayRef sizes_custom() const override;
6972
c10::SymIntArrayRef sym_sizes_custom() const override;
7073
c10::SymIntArrayRef sym_sizes() const override;

aten/src/ATen/SparseCsrTensorUtils.h

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,38 @@ inline std::string plainIndicesName(Layout layout) {
181181
[&] { return "row_indices"; });
182182
}
183183

184+
// Returns a human-readable name for the compressed dimension of a
// sparse compressed layout (e.g. "row" for CSR, "column block" for BSC).
// Raises via TORCH_CHECK for any layout that is not sparse compressed.
inline std::string compressedDimName(Layout layout) {
  if (layout == kSparseCsr) {
    return "row";
  }
  if (layout == kSparseCsc) {
    return "column";
  }
  if (layout == kSparseBsr) {
    return "row block";
  }
  if (layout == kSparseBsc) {
    return "column block";
  }
  TORCH_CHECK(false, "Not a sparse compressed layout:", layout);
  return ""; // unreachable; keeps compilers happy about a missing return
}
199+
200+
// Returns a human-readable name for the plain (non-compressed) dimension
// of a sparse compressed layout — the counterpart of compressedDimName
// (e.g. "column" for CSR, "row block" for BSC).
// Raises via TORCH_CHECK for any layout that is not sparse compressed.
inline std::string plainDimName(Layout layout) {
  if (layout == kSparseCsr) {
    return "column";
  }
  if (layout == kSparseCsc) {
    return "row";
  }
  if (layout == kSparseBsr) {
    return "column block";
  }
  if (layout == kSparseBsc) {
    return "row block";
  }
  TORCH_CHECK(false, "Not a sparse compressed layout:", layout);
  return ""; // unreachable; keeps compilers happy about a missing return
}
215+
184216
inline int rowDimension(Layout layout, IntArrayRef size) {
185217
return size.size() - (isCompressedRow(layout) ? 2 : 1);
186218
}

aten/src/ATen/core/TensorBase.h

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -165,10 +165,7 @@ class TORCH_API TensorBase {
165165
}
166166

167167
int64_t size(int64_t dim) const {
168-
const auto sizes = this->sizes();
169-
const auto ndim = static_cast<int64_t>(sizes.size());
170-
// false is passed to maybe_wrap_dim so behavior is identical to array access (but with wrapping)
171-
return sizes[c10::maybe_wrap_dim(dim, ndim, /*wrap_scalar=*/false)];
168+
return impl_->size(dim);
172169
}
173170

174171
int64_t stride(int64_t dim) const {

aten/src/ATen/core/jit_type_base.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -247,7 +247,7 @@ struct TORCH_API Type {
247247
// nvcc; see comment in destroy() below.
248248
struct SharedPtrWrapper {
249249
SharedPtrWrapper(std::shared_ptr<T> &&x)
250-
: repr_(x) {}
250+
: repr_(std::move(x)) {}
251251
std::shared_ptr<T> repr_;
252252
};
253253
union Repr {

aten/src/ATen/native/DispatchStub.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,9 @@ static CPUCapability compute_cpu_capability() {
3939

4040
#if !defined(__powerpc__) && !defined(__s390x__)
4141
if (cpuinfo_initialize()) {
42-
#ifdef HAVE_AVX512_CPU_DEFINITION
42+
// AVX512 can be slower than AVX2, so let's keep it as opt-in
43+
// see https://github.com/pytorch/pytorch/issues/80252
44+
#if defined(HAVE_AVX512_CPU_DEFINITION) && false
4345
// GCC supports some AVX512 intrinsics such as _mm512_set_epi16 only in
4446
// versions 9 & beyond. So, we want to ensure that only releases built with
4547
// supported compilers on supported hardware return CPU Capability AVX512,

aten/src/ATen/native/EmbeddingBag.cpp

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#include <ATen/NativeFunctions.h>
44
#include <ATen/Parallel.h>
55
#include <ATen/TensorUtils.h>
6+
#include <ATen/TensorSubclassLikeUtils.h>
67

78
#include <ATen/native/CPUBlas.h>
89

@@ -1271,11 +1272,18 @@ Tensor _embedding_bag_backward(const Tensor &grad, const Tensor &indices_,
12711272

12721273
Tensor offset2bag_;
12731274
if (indices.numel() != 0 && offset2bag.numel() == 0) {
1274-
offset2bag_ = at::zeros(
1275-
{indices.size(0) + 1}, offsets.options()); // offset2bag = [0 0 0 0 0]
1275+
offset2bag_ = offsets.new_zeros(
1276+
{indices.size(0) + 1}, offsets.options()); // offset2bag = [0 0 0 0 0]
12761277

12771278
make_offset2bag(offsets, offset2bag_);
1278-
offset2bag_.resize_({indices.size(0)});
1279+
// For Composite Compliance, if `offset2bag_` is CCT
1280+
// then we can't call `resize_`. Instead we call `narrow`
1281+
// to slice the tensor.
1282+
if (isTensorSubclassLike(offset2bag_)) {
1283+
offset2bag_ = offset2bag_.narrow(0, 0, indices.size(0));
1284+
} else {
1285+
offset2bag_.resize_({indices.size(0)});
1286+
}
12791287
} else {
12801288
auto offset2bag_arg = TensorArg(offset2bag, "offset2bag", 1);
12811289
checkScalarTypes("embedding_bag", offset2bag_arg, {kLong, kInt});

aten/src/ATen/native/nested/NestedTensorMath.cpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -560,6 +560,22 @@ Tensor NestedTensor_elementwise_Tensor(
560560
const Tensor& other,
561561
const std::string& op_name,
562562
Func f) {
563+
// self is a scalar
564+
if (!self.is_nested() && self.dim() == 0 && self.numel() == 1) {
565+
auto other_impl = get_nested_tensor_impl(other);
566+
return wrap_buffer(
567+
f(self, other_impl->get_buffer()),
568+
other_impl->get_nested_size_tensor().clone()
569+
);
570+
}
571+
// other is a scalar
572+
if (!other.is_nested() && other.dim() == 0 && other.numel() == 1) {
573+
auto self_impl = get_nested_tensor_impl(self);
574+
return wrap_buffer(
575+
f(self_impl->get_buffer(), other),
576+
self_impl->get_nested_size_tensor().clone()
577+
);
578+
}
563579
NestedTensorImpl* self_impl = nullptr;
564580
NestedTensorImpl* other_impl = nullptr;
565581
std::tie(self_impl, other_impl) =
@@ -598,6 +614,18 @@ Tensor& NestedTensor_elementwise__Tensor(
598614
const Tensor& other,
599615
const std::string& op_name,
600616
Func f) {
617+
// self is a scalar
618+
if (!self.is_nested() && self.dim() == 0 && self.numel() == 1) {
619+
auto other_impl = get_nested_tensor_impl(other);
620+
f(self, other_impl->get_buffer());
621+
return self;
622+
}
623+
// other is a scalar
624+
if (!other.is_nested() && other.dim() == 0 && other.numel() == 1) {
625+
auto self_impl = get_nested_tensor_impl(self);
626+
f(self_impl->get_buffer(), other);
627+
return self;
628+
}
601629
NestedTensorImpl* self_impl = nullptr;
602630
NestedTensorImpl* other_impl = nullptr;
603631
std::tie(self_impl, other_impl) =

0 commit comments

Comments
 (0)