pytorch
diff --git a/‎.circleci/config.yml‎
Lines changed: 4 additions & 1 deletion b/‎.circleci/config.yml‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎.circleci/verbatim-sources/job-specs/job-specs-custom.yml‎
Lines changed: 4 additions & 1 deletion b/‎.circleci/verbatim-sources/job-specs/job-specs-custom.yml‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎aten/src/ATen/TensorIterator.cpp‎
Lines changed: 40 additions & 11 deletions b/‎aten/src/ATen/TensorIterator.cpp‎
Lines changed: 40 additions & 11 deletions
diff --git a/‎aten/src/ATen/TensorIterator.h‎
Lines changed: 7 additions & 3 deletions b/‎aten/src/ATen/TensorIterator.h‎
Lines changed: 7 additions & 3 deletions
diff --git a/‎aten/src/ATen/native/BinaryOps.cpp‎
Lines changed: 0 additions & 10 deletions b/‎aten/src/ATen/native/BinaryOps.cpp‎
Lines changed: 0 additions & 10 deletions
diff --git a/‎aten/src/ATen/native/Bucketization.cpp‎
Lines changed: 18 additions & 6 deletions b/‎aten/src/ATen/native/Bucketization.cpp‎
Lines changed: 18 additions & 6 deletions
diff --git a/‎aten/src/ATen/native/Convolution.cpp‎
Lines changed: 50 additions & 0 deletions b/‎aten/src/ATen/native/Convolution.cpp‎
Lines changed: 50 additions & 0 deletions
diff --git a/‎aten/src/ATen/native/cuda/Bucketization.cu‎
Lines changed: 2 additions & 2 deletions b/‎aten/src/ATen/native/cuda/Bucketization.cu‎
Lines changed: 2 additions & 2 deletions
@@ -43,7 +43,8 @@
           set -ex
           export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}:build-${DOCKER_TAG}-${CIRCLE_SHA1}
           echo "DOCKER_IMAGE: "${COMMIT_DOCKER_IMAGE}
-          tag=${CIRCLE_TAG:1:5}
+          # turn v1.12.0rc3 into 1.12.0
+          tag=$(echo $CIRCLE_TAG | sed -e 's/v*\([0-9.]*\).*/\1/')
           target=${tag:-master}
           echo "building for ${target}"
           time docker pull ${COMMIT_DOCKER_IMAGE} >/dev/null
@@ -88,6 +89,8 @@
           set -ex
           export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}:build-${DOCKER_TAG}-${CIRCLE_SHA1}
           echo "DOCKER_IMAGE: "${COMMIT_DOCKER_IMAGE}
-          tag=${CIRCLE_TAG:1:5}
+          # turn v1.12.0rc3 into 1.12.0
+          # (an empty result, e.g. when CIRCLE_TAG is empty, falls back to master below)
+          tag=$(echo $CIRCLE_TAG | sed -e 's/v*\([0-9.]*\).*/\1/')
           target=${tag:-master}
           echo "building for ${target}"
 
@@ -128,7 +128,19 @@ TensorIteratorConfig& TensorIteratorConfig::add_borrowed_input(const TensorBase&
 
 TensorIteratorConfig& TensorIteratorConfig::declare_static_dtype_and_device(ScalarType dtype, Device device) {
   TORCH_CHECK(!check_all_same_dtype_, "check_all_same_dtype(false) must be called before declare_static_dtype(...)");
-  static_dtype_and_device_ = c10::make_optional(std::make_pair(dtype, device));
+  static_dtype_ = dtype;  // dtype and device are recorded in two separate members instead of one pair
+  static_device_ = device;
+  return *this;  // returns the config itself so calls can be chained
+}
+
+TensorIteratorConfig& TensorIteratorConfig::declare_static_dtype(ScalarType dtype) {
+  TORCH_CHECK(!check_all_same_dtype_, "check_all_same_dtype(false) must be called before declare_static_dtype(...)");  // same precondition as declare_static_dtype_and_device
+  static_dtype_ = dtype;  // only the dtype is fixed; static_device_ is left untouched
+  return *this;
+}
+
+TensorIteratorConfig& TensorIteratorConfig::declare_static_device(Device device) {
+  static_device_ = device;  // only the device is fixed; no check_all_same_dtype_ precondition applies
   return *this;
 }
 
@@ -327,12 +339,20 @@ void TensorIteratorBase::compute_types(const TensorIteratorConfig& config) {
     //   the device it should be allocated on.
     if (!op.is_type_defined()) {
       TORCH_INTERNAL_ASSERT(op.is_output, "Found type undefined input tensor!");
-      if (config.static_dtype_and_device_.has_value()) {
-        op.target_dtype = config.static_dtype_and_device_->first;
-        op.device = config.static_dtype_and_device_->second;
+
+      if (config.static_dtype_.has_value()) {
+        op.target_dtype = config.static_dtype_.value();
       } else {
-        TORCH_INTERNAL_ASSERT(config.check_all_same_device_);
         has_undefined_outputs = true;
+      }
+
+      if (config.static_device_.has_value()) {
+        op.device = config.static_device_.value();
+      } else {
+        TORCH_INTERNAL_ASSERT(config.check_all_same_device_);
+      }
+
+      if (has_undefined_outputs || !op.device.has_value()) {
         continue;
       }
     }
@@ -418,12 +438,21 @@ void TensorIteratorBase::compute_types(const TensorIteratorConfig& config) {
   //   - checks that all tensors are on the same device, if requested
   //   - checks that the common dtype can safely cast to each output, if requested
   //   - creates temporaries for CPU operations, if needed and requested
+  common_device_ = common_device;
   int max_cpu_scalars_on_non_cpu = config.allow_cpu_scalars_ ? 1 : 0;
   int current_cpu_scalars_on_non_cpu = 0;
   for (auto& op : operands_) {
-    if (!op.is_type_defined()) {
+    bool is_type_defined = op.is_type_defined();
+    bool is_device_defined = op.is_device_defined();
+
+    if (!is_type_defined) {
       op.target_dtype = common_dtype_;
+    }
+    if (!is_device_defined) {
       op.device = common_device;
+    }
+
+    if (!is_type_defined && !is_device_defined) {
       continue;
     }
 
@@ -441,10 +470,10 @@ void TensorIteratorBase::compute_types(const TensorIteratorConfig& config) {
         TORCH_CHECK(current_cpu_scalars_on_non_cpu < max_cpu_scalars_on_non_cpu,
                     "Trying to pass too many CPU scalars to non-CPU kernel!");
         ++current_cpu_scalars_on_non_cpu;
-      } else if (op.device != common_device) {
+      } else if (op.device.value() != common_device) {
         TORCH_CHECK(false,
                     "Expected all tensors to be on the same device, but "
-                    "found at least two devices, ", common_device, " and ", op.device, "!");
+                    "found at least two devices, ", common_device, " and ", op.device.value(), "!");
       }
     }
 
@@ -490,7 +519,6 @@ void TensorIteratorBase::compute_types(const TensorIteratorConfig& config) {
         op.target_dtype = common_dtype_;
       }
     }
-    common_device_ = common_device;
   }
 }
 
@@ -864,7 +892,7 @@ void TensorIteratorBase::build_comparison_op(
   // want the output to be bool. Otherwise (e.g. 'torch.eq(a, b, out=c)') we
   // don't coerce the output.
   if (!out.defined()) {
-    config.declare_static_dtype_and_device(kBool, a.device());
+    config.declare_static_dtype(kBool);
   }
 
   // Note [special-case bool outputs]
@@ -943,7 +971,8 @@ void TensorIteratorBase::build_unary_force_boolean_op(const TensorBase& out, con
   build(TensorIteratorConfig()
       .set_check_mem_overlap(true)
       .check_all_same_dtype(false)
-      .declare_static_dtype_and_device(at::kBool, a.device())
+      .declare_static_dtype(at::kBool)
+      .declare_static_device(a.device())
       .add_owned_output(out)
       .add_owned_input(a));
 }
 
@@ -122,13 +122,14 @@ struct TORCH_API OperandInfo {
   /// but during type promotion target_dtype value can become different from tensor's dtype
   /// also, during type promotion target_dtype and device can be set for an undefined tensor so that tensor can be properly
   /// constructed later.
-  Device device = kCPU;
+  c10::optional<Device> device = c10::nullopt;
   ScalarType target_dtype = ScalarType::Undefined;
   // Caches dtype of the tensor, because scalar_type is an expensive operation
   // If dtype of the tensor is changed (e.g. as a result of type promotion or in allocate_outputs), this
   //value should be changed too.
   ScalarType current_dtype = ScalarType::Undefined;
 
+  bool is_device_defined() const { return device.has_value(); }
   bool is_type_defined() const { return target_dtype != ScalarType::Undefined; }
   TensorOptions options() const {
     return TensorOptions(target_dtype).device(device);
@@ -256,7 +257,7 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase {
     return common_dtype_;
   }
   ScalarType input_dtype(int arg=0) const { return operands_[num_outputs_ + arg].current_dtype; }
-  Device device(int arg=0) const { return operands_[arg].device; }
+  Device device(int arg=0) const { return operands_[arg].device.value(); }
   DeviceType device_type(int arg=0) const { return device(arg).type(); }
   int64_t element_size(int arg) const { return elementSize(dtype(arg)); }
   bool is_scalar(int arg) const;
@@ -725,6 +726,8 @@ class TORCH_API TensorIteratorConfig final {
 
   // Bypass output dtype/device computation and fix the dtype/device as specified here.
   TensorIteratorConfig& declare_static_dtype_and_device(ScalarType dtype, Device device);
+  TensorIteratorConfig& declare_static_dtype(ScalarType dtype);
+  TensorIteratorConfig& declare_static_device(Device device);
   TensorIteratorConfig& declare_static_shape(IntArrayRef shape);
   TensorIteratorConfig& declare_static_shape(IntArrayRef shape, IntArrayRef squash_dims);
 
@@ -742,7 +745,8 @@ class TORCH_API TensorIteratorConfig final {
   int num_inputs_ = 0;
 
   c10::optional<DimVector> static_shape_ = c10::nullopt;
-  c10::optional<std::pair<ScalarType, Device>> static_dtype_and_device_ = c10::nullopt;
+  c10::optional<ScalarType> static_dtype_ = c10::nullopt;
+  c10::optional<Device> static_device_ = c10::nullopt;
   bool check_mem_overlap_ = true;
   bool allow_cpu_scalars_ = false;
   bool is_reduction_ = false;
 
@@ -204,13 +204,6 @@ void comparison_op_check(const Tensor& self, const Tensor& other, const Tensor&
       native::check_convert(self.item(), other.scalar_type());
     }
   }
-  // In-place operation To avoid overflow during type promotion we will check that
-  // both dtypes of self and other are same
-  if (result.is_same(self)) {
-    TORCH_CHECK(self.dtype() == other.dtype(),
-                "Expected object of scalar type ", self.dtype(), " but got scalar type ",
-                other.dtype(), " for argument 'other'");
-  }
 }
 
 #define CREATE_COMPARISON_SCALAR_TENSOR_META_FUNC(func)                     \
@@ -915,9 +908,6 @@ Tensor comparison_op(const Tensor& self, const Tensor& other, OutImpl& out_impl)
 // To avoid overflow during type promotion we will check that both dtypes of self and other are same
 template <typename OutImpl>
 Tensor& comparison_op_(Tensor& self, const Tensor& other, OutImpl& out_impl) {
-  TORCH_CHECK(self.dtype() == other.dtype(),
-              "Expected object of scalar type ", self.dtype(), " but got scalar type ",
-              other.dtype(), " for argument 'other'");
   return out_impl(self, self, other);
 }
 
 
@@ -76,14 +76,26 @@ void searchsorted_cpu_contiguous(Tensor& result, const Tensor& input, const Tens
 
 void dispatch(Tensor& result, const Tensor& input, const Tensor& boundaries, bool out_int32, bool right) {
   if (!out_int32) {
-    AT_DISPATCH_ALL_TYPES_AND(ScalarType::BFloat16, input.scalar_type(), "searchsorted_out_cpu", [&] {
-      searchsorted_cpu_contiguous<scalar_t, int64_t>(result, input, boundaries, right);
-    });
+    AT_DISPATCH_ALL_TYPES_AND2(
+        ScalarType::Half,  // newly added to the dispatched dtypes
+        ScalarType::BFloat16,  // already dispatched before this change
+        input.scalar_type(),  // dispatch is keyed on the dtype of input
+        "searchsorted_out_cpu",
+        [&] {
+          searchsorted_cpu_contiguous<scalar_t, int64_t>(  // int64_t variant, taken when out_int32 is false
+              result, input, boundaries, right);
+        });
   }
   else {
-    AT_DISPATCH_ALL_TYPES_AND(ScalarType::BFloat16, input.scalar_type(), "searchsorted_out_cpu", [&] {
-      searchsorted_cpu_contiguous<scalar_t, int>(result, input, boundaries, right);
-    });
+    AT_DISPATCH_ALL_TYPES_AND2(
+        ScalarType::Half,  // same dtype list as the int64_t branch
+        ScalarType::BFloat16,
+        input.scalar_type(),
+        "searchsorted_out_cpu",
+        [&] {
+          searchsorted_cpu_contiguous<scalar_t, int>(  // int variant, taken when out_int32 is true
+              result, input, boundaries, right);
+        });
   }
 }
 
 
@@ -432,6 +432,42 @@ bool check_cudnn_depthwise_workload(const at::Tensor& input, int stride) {
   }
   return false;
 }
+
+// Workload heuristic for cudnn 8.2 and newer (simplified variant).
+bool check_cudnn_depthwise_workload_with_filter(const at::Tensor& input, int stride, const at::Tensor& weight) {
+  // input.size(2) == 1 together with unit stride is treated as the 1D conv case.
+  const bool is_1d_unit_stride = input.size(2) == 1 && stride == 1;
+  if (is_1d_unit_stride) return true;
+
+  // 2D conv: the filter must be square with side 1, 3 or 5.
+  const bool square_filter = weight.size(2) == weight.size(3);
+  if (!square_filter) return false;
+  const int filter = weight.size(3);
+  const bool filter_is_3_or_5 = (filter == 3 || filter == 5);
+  if (!(filter == 1 || filter_is_3_or_5)) return false;
+
+  // Only the width (input.size(3)) is inspected, inputs need not be square; min width is 7.
+  if (input.size(3) < 7) return false;
+  const int w = input.size(3);
+
+  // Stride 1 is always accepted; strides other than 1 and 2 never are.
+  switch (stride) {
+    case 1: return true;
+    case 2: break;
+    default: return false;
+  }
+
+  // Stride 2: decide from batch size, channel count and width.
+  const int ch = input.size(1);
+  const int bs = input.size(0);
+  if (bs == 1) {
+    // bs 1 shows good perf in lots of cases: 3/5 filters pass, 1x1 only up to width 28.
+    return filter_is_3_or_5 || w <= 28;
+  }
+  if (filter_is_3_or_5) return ch >= 512 || (ch >= 256 && w >= 28);
+  return bs <= 16 && ch >= 128 && w <= 7;  // 1x1 filter, batch size != 1
+}
+
 // Use cudnn for FP16 depthwise convolutions
 auto ConvParams::use_cudnn_depthwise(
         const at::Tensor& input, const at::Tensor& weight) const -> bool {
@@ -440,6 +476,20 @@ auto ConvParams::use_cudnn_depthwise(
   }
   if (detail::getCUDAHooks().supportsDepthwiseConvolutionWithCuDNN()) {
     long cudnn_version = detail::getCUDAHooks().versionCuDNN();
+    if (cudnn_version >= 8200) {
+      bool kernel_cond =  (use_cudnn(input, weight) &&
+                           input.scalar_type() == kHalf && // only for FP16
+                           weight.scalar_type() == kHalf &&
+                           is_depthwise(input, weight) &&
+                           input.ndimension() == 4 &&   // TODO: 5-D contiguous depthwise is not supported yet, need benchmarks
+                           !is_dilated() && // no dilation supported
+                           (stride[0] == stride[1] || input.size(2) == 1) && // square or 1d
+                           input.size(1) >= 32); // min 32 channels supported)
+      if (kernel_cond) {
+        return check_cudnn_depthwise_workload_with_filter(input, stride[1], weight);
+      }
+    }
+    // keep (7600 <= cudnn < 8200) code unchanged
     bool kernel_cond =  (cudnn_version >= 7600 &&
                          use_cudnn(input, weight) &&
                          input.scalar_type() == kHalf && // only for FP16
 
@@ -92,12 +92,12 @@ void searchsorted_cuda_contiguous(Tensor& result, const Tensor& input, const Ten
 
 void dispatch(Tensor& result, const Tensor& input, const Tensor& boundaries, bool out_int32, bool right) {
   if (!out_int32) {
-    AT_DISPATCH_ALL_TYPES(input.scalar_type(), "searchsorted_out_cuda", [&] {
+    AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, input.scalar_type(), "searchsorted_out_cuda", [&] {  // Half added to the dispatched dtypes; NOTE(review): the CPU dispatch also lists BFloat16 - confirm its omission here is intentional
       searchsorted_cuda_contiguous<scalar_t, int64_t>(result, input, boundaries, right);
     });
   }
   else {
-    AT_DISPATCH_ALL_TYPES(input.scalar_type(), "searchsorted_out_cuda", [&] {
+    AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, input.scalar_type(), "searchsorted_out_cuda", [&] {  // same dtype list as above; this branch (out_int32 true) instantiates the int variant
       searchsorted_cuda_contiguous<scalar_t, int>(result, input, boundaries, right);
     });
   }
Original file line number	Diff line number	Diff line change
`@@ -204,13 +204,6 @@ void comparison_op_check(const Tensor& self, const Tensor& other, const Tensor&`
`204`	`204`	`native::check_convert(self.item(), other.scalar_type());`
`205`	`205`	`}`
`206`	`206`	`}`
`207`		`- // In-place operation To avoid overflow during type promotion we will check that`
`208`		`- // both dtypes of self and other are same`
`209`		`- if (result.is_same(self)) {`
`210`		`- TORCH_CHECK(self.dtype() == other.dtype(),`
`211`		`- "Expected object of scalar type ", self.dtype(), " but got scalar type ",`
`212`		`- other.dtype(), " for argument 'other'");`
`213`		`- }`
`214`	`207`	`}`
`215`	`208`
`216`	`209`	`#define CREATE_COMPARISON_SCALAR_TENSOR_META_FUNC(func) \`
`@@ -915,9 +908,6 @@ Tensor comparison_op(const Tensor& self, const Tensor& other, OutImpl& out_impl)`
`915`	`908`	`// To avoid overflow during type promotion we will check that both dtypes of self and other are same`
`916`	`909`	`template <typename OutImpl>`
`917`	`910`	`Tensor& comparison_op_(Tensor& self, const Tensor& other, OutImpl& out_impl) {`
`918`		`- TORCH_CHECK(self.dtype() == other.dtype(),`
`919`		`- "Expected object of scalar type ", self.dtype(), " but got scalar type ",`
`920`		`- other.dtype(), " for argument 'other'");`
`921`	`911`	`return out_impl(self, self, other);`
`922`	`912`	`}`
`923`	`913`
Original file line number	Diff line number	Diff line change
`@@ -92,12 +92,12 @@ void searchsorted_cuda_contiguous(Tensor& result, const Tensor& input, const Ten`
`92`	`92`
`93`	`93`	`void dispatch(Tensor& result, const Tensor& input, const Tensor& boundaries, bool out_int32, bool right) {`
`94`	`94`	`if (!out_int32) {`
`95`		`- AT_DISPATCH_ALL_TYPES(input.scalar_type(), "searchsorted_out_cuda", [&] {`
	`95`	`+ AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, input.scalar_type(), "searchsorted_out_cuda", [&] {`
`96`	`96`	`searchsorted_cuda_contiguous<scalar_t, int64_t>(result, input, boundaries, right);`
`97`	`97`	`});`
`98`	`98`	`}`
`99`	`99`	`else {`
`100`		`- AT_DISPATCH_ALL_TYPES(input.scalar_type(), "searchsorted_out_cuda", [&] {`
	`100`	`+ AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, input.scalar_type(), "searchsorted_out_cuda", [&] {`
`101`	`101`	`searchsorted_cuda_contiguous<scalar_t, int>(result, input, boundaries, right);`
`102`	`102`	`});`
`103`	`103`	`}`