
Commit a541569

Update on "[PyTorch] Add c10::hash<c10::ArrayRef>"
Just moved the vector implementation to ArrayRef and re-implemented the former using the latter.

Differential Revision: [D30647666](https://our.internmc.facebook.com/intern/diff/D30647666/)

[ghstack-poisoned]
2 parents e02cfb1 + a54ded9 commit a541569
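As a rough illustration of the pattern described in the commit message (hash the ArrayRef view by combining element hashes, then route the former std::vector hash through it), here is a standalone sketch. The type, the hash_combine formula, and the function names are illustrative stand-ins, not the actual c10::ArrayRef / c10::hash implementation:

// Standalone sketch only: all names here are hypothetical stand-ins for
// c10::ArrayRef / c10::hash; the combine step mirrors the common
// boost-style hash_combine and may differ from what c10 actually uses.
#include <cstddef>
#include <functional>
#include <vector>

template <typename T>
struct ArrayRefLike {        // stand-in for c10::ArrayRef<T>: pointer + length view
  const T* data;
  std::size_t size;
};

inline std::size_t hash_combine(std::size_t seed, std::size_t value) {
  return seed ^ (value + 0x9e3779b9 + (seed << 6) + (seed >> 2));
}

// Hash every element of the view and fold the results together.
template <typename T>
std::size_t hash_array_ref(ArrayRefLike<T> a) {
  std::size_t seed = 0;
  for (std::size_t i = 0; i < a.size; ++i) {
    seed = hash_combine(seed, std::hash<T>{}(a.data[i]));
  }
  return seed;
}

// The former vector hash, now just a view over the vector's storage.
template <typename T>
std::size_t hash_vector(const std::vector<T>& v) {
  return hash_array_ref(ArrayRefLike<T>{v.data(), v.size()});
}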

13 files changed: 279 additions & 194 deletions

aten/src/ATen/Dispatch.h

Lines changed: 61 additions & 9 deletions
@@ -80,9 +80,13 @@ inline constexpr bool should_include_kernel_dtype(
 #define C10_UNUSED_DISPATCH_CUDA_WORKAROUND C10_UNUSED
 #endif // defined(__CUDACC__) && CUDA_VERSION <= 10100
 
+#if defined __cpp_if_constexpr
 #define AT_QINT_PRIVATE_CASE_TYPE( \
-    enum_type, type, underlying_enum, underlying_type, ...) \
+    NAME, enum_type, type, underlying_enum, underlying_type, ...) \
   case enum_type: { \
+    if constexpr (!at::should_include_kernel_dtype(NAME, enum_type)) { \
+      AT_ERROR("dtype '", toString(enum_type), "' not selected for kernel tag ", #NAME); \
+    } \
     using scalar_t = type; \
     using underlying_t C10_UNUSED_DISPATCH_CUDA_WORKAROUND = \
         scalar_t::underlying; \
@@ -93,10 +97,57 @@ inline constexpr bool should_include_kernel_dtype(
     /* TODO: Use [[maybe-unused]] when C++17 becomes the standard */ \
     return __VA_ARGS__(); \
   }
+#else
+#define AT_QINT_PRIVATE_CASE_TYPE( \
+    NAME, enum_type, type, underlying_enum, underlying_type, ...) \
+  case enum_type: { \
+    at::guts::if_constexpr<(!at::should_include_kernel_dtype(NAME, enum_type))>( \
+        [] { \
+          AT_ERROR("dtype '" #enum_type "' not selected for kernel tag " #NAME); \
+        } \
+    ); \
+    using scalar_t = type; \
+    using underlying_t C10_UNUSED_DISPATCH_CUDA_WORKAROUND = \
+        scalar_t::underlying; \
+    const auto& SCALAR_TYPE C10_UNUSED_DISPATCH_CUDA_WORKAROUND = enum_type; \
+    const auto& UNDERLYING_TYPE C10_UNUSED_DISPATCH_CUDA_WORKAROUND = \
+        toUnderlying(enum_type); \
+    (void)SCALAR_TYPE; /* Suppress unused-var compiler warning */ \
+    /* TODO: Use [[maybe-unused]] when C++17 becomes the standard */ \
+    return __VA_ARGS__(); \
+  }
+#endif
 
+#if defined __cpp_if_constexpr
+#define AT_QINT_SUB_BYTE_PRIVATE_CASE_TYPE( \
+    NAME, enum_type, type, underlying_type, bitwidth, qmin, qmax, ...) \
+  case enum_type: { \
+    if constexpr (!at::should_include_kernel_dtype(NAME, enum_type)) { \
+      AT_ERROR("dtype '", toString(enum_type), "' not selected for kernel tag ", #NAME); \
+    } \
+    using scalar_t = type; \
+    using underlying_t C10_UNUSED_DISPATCH_CUDA_WORKAROUND = \
+        scalar_t::underlying; \
+    const auto& SCALAR_TYPE C10_UNUSED_DISPATCH_CUDA_WORKAROUND = enum_type; \
+    const auto& UNDERLYING_TYPE C10_UNUSED_DISPATCH_CUDA_WORKAROUND = \
+        toUnderlying(enum_type); \
+    int bit_width = bitwidth; \
+    int64_t quant_min = qmin; \
+    int64_t quant_max = qmax; \
+    (void)bit_width; /* Suppress unused variable warning */ \
+    (void)quant_min; /* Suppress unused variable warning */ \
+    (void)quant_max; /* Suppress unused variable warning */ \
+    return __VA_ARGS__(); \
+  }
+#else
 #define AT_QINT_SUB_BYTE_PRIVATE_CASE_TYPE( \
-    enum_type, type, underlying_type, bitwidth, qmin, qmax, ...) \
+    NAME, enum_type, type, underlying_type, bitwidth, qmin, qmax, ...) \
   case enum_type: { \
+    at::guts::if_constexpr<(!at::should_include_kernel_dtype(NAME, enum_type))>( \
+        [] { \
+          AT_ERROR("dtype '" #enum_type "' not selected for kernel tag " #NAME); \
+        } \
+    ); \
     using scalar_t = type; \
     using underlying_t C10_UNUSED_DISPATCH_CUDA_WORKAROUND = \
         scalar_t::underlying; \
@@ -111,6 +162,7 @@ inline constexpr bool should_include_kernel_dtype(
     (void)quant_max; /* Suppress unused variable warning */ \
     return __VA_ARGS__(); \
   }
+#endif
 
 namespace detail {
 
@@ -449,11 +501,11 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {}
     RECORD_KERNEL_FUNCTION_DTYPE(NAME, _st); \
     switch (_st) { \
       AT_QINT_PRIVATE_CASE_TYPE( \
-          at::kQInt8, at::qint8, at::kChar, int8_t, __VA_ARGS__) \
+          NAME, at::kQInt8, at::qint8, at::kChar, int8_t, __VA_ARGS__) \
       AT_QINT_PRIVATE_CASE_TYPE( \
-          at::kQUInt8, at::quint8, at::kByte, uint8_t, __VA_ARGS__) \
+          NAME, at::kQUInt8, at::quint8, at::kByte, uint8_t, __VA_ARGS__) \
       AT_QINT_PRIVATE_CASE_TYPE( \
-          at::kQInt32, at::qint32, at::kInt, int, __VA_ARGS__) \
+          NAME, at::kQInt32, at::qint32, at::kInt, int, __VA_ARGS__) \
       default: \
         AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
     } \
@@ -467,13 +519,13 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {}
     RECORD_KERNEL_FUNCTION_DTYPE(NAME, _st); \
     switch (_st) { \
       AT_QINT_SUB_BYTE_PRIVATE_CASE_TYPE( \
-          at::kQInt8, at::qint8, int8_t, CHAR_BIT, SCHAR_MIN, SCHAR_MAX, __VA_ARGS__) \
+          NAME, at::kQInt8, at::qint8, int8_t, CHAR_BIT, SCHAR_MIN, SCHAR_MAX, __VA_ARGS__) \
       AT_QINT_SUB_BYTE_PRIVATE_CASE_TYPE( \
-          at::kQUInt8, at::quint8, uint8_t, CHAR_BIT, 0, UCHAR_MAX, __VA_ARGS__) \
+          NAME, at::kQUInt8, at::quint8, uint8_t, CHAR_BIT, 0, UCHAR_MAX, __VA_ARGS__) \
       AT_QINT_SUB_BYTE_PRIVATE_CASE_TYPE( \
-          at::kQInt32, at::qint32, int, CHAR_BIT * sizeof(int), INT_MIN, INT_MAX, __VA_ARGS__) \
+          NAME, at::kQInt32, at::qint32, int, CHAR_BIT * sizeof(int), INT_MIN, INT_MAX, __VA_ARGS__) \
      AT_QINT_SUB_BYTE_PRIVATE_CASE_TYPE( \
-          at::kQUInt4x2, at::quint4x2, uint8_t, 4, 0, 15, __VA_ARGS__) \
+          NAME, at::kQUInt4x2, at::quint4x2, uint8_t, 4, 0, 15, __VA_ARGS__) \
       default: \
         AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
     } \
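This diff threads the kernel NAME into the quantized-dtype case macros so that at::should_include_kernel_dtype can reject dtypes that were not selected for that kernel; the #else branches do the same thing through at::guts::if_constexpr for toolchains without C++17 if constexpr. A minimal standalone sketch of the C++17 shape of that check (the stub allowlist and all names below are assumptions for illustration, not the real ATen code):

// Sketch of the compile-time selective-build check; not the actual ATen code.
#include <stdexcept>

// Stub: in a selective build this would consult a generated allowlist
// of (kernel tag, dtype) pairs produced by the build system.
constexpr bool should_include_kernel_dtype_stub(const char* /*kernel_tag*/, int /*dtype*/) {
  return true;
}

template <int dtype>
void dispatch_case_stub() {
  // With C++17, the error branch is discarded at compile time whenever the
  // dtype is selected, so selected kernels pay nothing for the check.
  if constexpr (!should_include_kernel_dtype_stub("my_kernel", dtype)) {
    throw std::runtime_error("dtype not selected for kernel tag my_kernel");
  }
  // ... kernel body specialized for `dtype` ...
}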

aten/src/ATen/core/TensorBase.h

Lines changed: 0 additions & 6 deletions
@@ -755,12 +755,6 @@ class TORCH_API TensorBase {
   TensorBase __dispatch_contiguous(c10::MemoryFormat) const;
 };
 
-// For "multiple ... operators specified" warnings, closing brace of class
-// declaration must be included between pragma push & pop
-#ifdef _MSC_VER
-#pragma warning( pop )
-#endif
-
 inline int64_t get_device(const TensorBase& self) {
   return self.get_device();
 }

aten/src/ATen/native/DistributionTemplates.h

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@ namespace templates {
 //
 // If random's uint64_t arithmetics produces 65503 as a random value after casting to torch::half it becomes 65504
 // and violates the requirement that random value must be less than `to`. To resolve this issue `update_from` and `update_to`
-// moves `from` to the left and `to` to the right to the next closest value that won't go outside [from, to) after casting to
+// moves `from` to the right and `to` to the left to the next closest value that won't go outside [from, to) after casting to
 // the target dtype. For `to` = 65504 it moves left for (1 << (log2(to) - 11 + 1)) = 32 and becomes 65472, which is previous
 // available number for torch::half dtype.
 template<typename scalar_t>
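The arithmetic of the example in the corrected comment can be checked directly; a standalone sketch of just that calculation (`update_to` itself is not reproduced here):

// Verify the worked example from the comment: for to = 65504 the step is
// 1 << (floor(log2(to)) - 11 + 1) = 1 << 5 = 32, so `to` moves left to
// 65472, the previous value representable in half precision.
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  const int64_t to = 65504;
  const int64_t step =
      int64_t{1} << (static_cast<int>(std::log2(static_cast<double>(to))) - 11 + 1);
  std::printf("step = %lld, updated to = %lld\n",
              static_cast<long long>(step),
              static_cast<long long>(to - step));  // prints 32 and 65472
  return 0;
}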

aten/src/ATen/native/Loss.cpp

Lines changed: 24 additions & 5 deletions
@@ -30,13 +30,32 @@ DEFINE_DISPATCH(mse_stub);
 DEFINE_DISPATCH(mse_backward_stub);
 
 Tensor cosine_embedding_loss(const Tensor& input1, const Tensor& input2, const Tensor& target, double margin, int64_t reduction) {
+  auto targ_dim = target.dim();
   TORCH_CHECK(
-      target.dim() == 1,
-      "1D target tensor expected, multi-target not supported");
+      targ_dim == 1 || targ_dim == 0,
+      "0D or 1D target tensor expected, multi-target not supported");
+
+  if (targ_dim == 1) {
+    TORCH_CHECK(
+        input1.dim() == 2,
+        "1D target tensor expects 2D input tensors, but found inputs with sizes ",
+        input1.sizes(),
+        " and ",
+        input2.sizes(),
+        ".");
+  } else {
+    TORCH_CHECK(
+        input1.dim() == 1,
+        "0D target tensor expects 1D input tensors, but found inputs with sizes ",
+        input1.sizes(),
+        " and ",
+        input2.sizes(),
+        ".");
+  }
 
-  auto prod_sum = (input1 * input2).sum(1);
-  auto mag_square1 = (input1 * input1).sum(1) + EPSILON;
-  auto mag_square2 = (input2 * input2).sum(1) + EPSILON;
+  auto prod_sum = (input1 * input2).sum(targ_dim);
+  auto mag_square1 = (input1 * input1).sum(targ_dim) + EPSILON;
+  auto mag_square2 = (input2 * input2).sum(targ_dim) + EPSILON;
   auto denom = (mag_square1 * mag_square2).sqrt_();
   auto cos = prod_sum / denom;
 
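With the relaxed check above, both the batched and the unbatched call shapes are accepted. A usage sketch (assuming the usual ATen headers and the generated at::cosine_embedding_loss wrapper):

// Illustrative only: 2D inputs with a 1D target were already supported;
// 1D inputs with a 0D target are what this change enables.
#include <ATen/ATen.h>

void cosine_embedding_loss_examples() {
  // Batched form: 2D inputs, 1D target of +1 / -1 labels.
  auto x1 = at::randn({4, 8});
  auto x2 = at::randn({4, 8});
  auto y = at::ones({4});
  auto batched = at::cosine_embedding_loss(x1, x2, y, /*margin=*/0.0, at::Reduction::Mean);

  // Unbatched form: 1D inputs, 0D target.
  auto a = at::randn({8});
  auto b = at::randn({8});
  auto t = at::scalar_tensor(1.0);
  auto single = at::cosine_embedding_loss(a, b, t, /*margin=*/0.0, at::Reduction::Mean);
  (void)batched;
  (void)single;
}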

aten/src/ATen/native/quantized/affine_quantizer.cpp

Lines changed: 6 additions & 6 deletions
@@ -107,7 +107,7 @@ Tensor& quantize_tensor_per_tensor_affine(
     Tensor& qtensor,
     double scale,
     int64_t zero_point) {
-  static const std::string fn_name = "quantize_tensor_per_tensor_affine";
+  static constexpr auto fn_name = "quantize_tensor_per_tensor_affine";
 
   checkRoundingMode(fn_name);
   checkFloatTensor(fn_name, rtensor);
@@ -138,7 +138,7 @@ Tensor& quantize_tensor_per_channel_affine(
     Tensor scales,
     Tensor zero_points,
     int64_t axis) {
-  static const std::string fn_name = "quantize_tensor_per_channel_affine";
+  static constexpr auto fn_name = "quantize_tensor_per_channel_affine";
 
   checkRoundingMode(fn_name);
   checkFloatTensor(fn_name, rtensor);
@@ -178,7 +178,7 @@ Tensor& quantize_tensor_per_channel_float_qparams(
     Tensor scales,
     Tensor zero_points,
     int64_t axis) {
-  static const std::string fn_name =
+  static constexpr auto fn_name =
       "quantize_tensor_per_channel_float_qparams";
 
   checkRoundingMode(fn_name);
@@ -216,7 +216,7 @@ Tensor& dequantize_tensor_per_tensor_affine(
     Tensor& rtensor,
     double scale,
     int64_t zero_point) {
-  static const std::string fn_name = "dequantize_tensor_per_tensor_affine";
+  static constexpr auto fn_name = "dequantize_tensor_per_tensor_affine";
   checkFloatTensor(fn_name, rtensor);
   checkSameDevice(fn_name, rtensor, qtensor);
   checkSameSize(fn_name, qtensor, rtensor);
@@ -243,7 +243,7 @@ Tensor& dequantize_tensor_per_channel_affine(
     Tensor scales,
     Tensor zero_points,
     int64_t axis) {
-  static const std::string fn_name = "dequantize_tensor_per_channel_affine";
+  static constexpr auto fn_name = "dequantize_tensor_per_channel_affine";
 
   checkFloatTensor(fn_name, rtensor);
   checkSameDevice(fn_name, rtensor, qtensor);
@@ -282,7 +282,7 @@ Tensor& dequantize_tensor_per_channel_float_qparams(
     Tensor scales,
     Tensor zero_points,
     int64_t axis) {
-  static const std::string fn_name = "dequantize_tensor_per_channel_affine";
+  static constexpr auto fn_name = "dequantize_tensor_per_channel_affine";
 
   checkFloatTensor(fn_name, rtensor);
   checkSameDevice(fn_name, rtensor, qtensor);
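The only change in this file is the type of the fn_name constants. A minimal side-by-side sketch of the two forms (the rationale is an assumption not stated in the diff: a constexpr character pointer needs no runtime construction or thread-safe initialization guard, unlike a function-local static std::string):

// Both forms as they appear in this file, isolated for comparison.
#include <string>

void fn_name_forms() {
  // Old: constructed on first call, behind a thread-safe initialization guard.
  static const std::string fn_name_old = "quantize_tensor_per_tensor_affine";
  // New: a constexpr const char*, fully resolved at compile time.
  static constexpr auto fn_name_new = "quantize_tensor_per_tensor_affine";
  (void)fn_name_old;
  (void)fn_name_new;
}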
