pytorch
diff --git a/.lintrunner.toml b/.lintrunner.toml
Lines changed: 14 additions & 0 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
Lines changed: 3 additions & 0 deletions
diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp
Lines changed: 4 additions & 0 deletions
diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h
Lines changed: 1 addition & 0 deletions
diff --git a/aten/src/ATen/autocast_mode.h b/aten/src/ATen/autocast_mode.h
Lines changed: 14 additions & 4 deletions
diff --git a/aten/src/ATen/mps/MPSDevice.h b/aten/src/ATen/mps/MPSDevice.h
Lines changed: 1 addition & 0 deletions
diff --git a/aten/src/ATen/mps/MPSDevice.mm b/aten/src/ATen/mps/MPSDevice.mm
Lines changed: 3 additions & 0 deletions
diff --git a/aten/src/ATen/native/FractionalMaxPool2d.cpp b/aten/src/ATen/native/FractionalMaxPool2d.cpp
Lines changed: 7 additions & 4 deletions
diff --git a/aten/src/ATen/native/cuda/EmbeddingBag.cu b/aten/src/ATen/native/cuda/EmbeddingBag.cu
Lines changed: 4 additions & 3 deletions
diff --git a/aten/src/ATen/native/cuda/RowwiseScaledMM.cu b/aten/src/ATen/native/cuda/RowwiseScaledMM.cu
Lines changed: 18 additions & 4 deletions
@@ -1733,3 +1733,17 @@ include_patterns = [
    'torch/**/not-exist.py'
 ]
 is_formatter = false
+
+# IMPORT_LINTER: runs tools/linter/adapters/import_linter.py to report imports of disallowed third-party libraries.
+[[linter]]
+code = 'IMPORT_LINTER'
+command = [
+    'python3',
+    'tools/linter/adapters/import_linter.py',
+    '--',
+    '@{{PATHSFILE}}'
+]
+include_patterns = [  # only paths under torch/_dynamo/ are passed to this linter
+   'torch/_dynamo/**',
+]
+is_formatter = false
@@ -1095,6 +1095,9 @@ if(NOT MSVC)
     append_cxx_flag_if_supported("-Wno-error=redundant-move" CMAKE_CXX_FLAGS)
   endif()
 else()
+  # Define export functions for AOTI.
+  add_compile_definitions(EXPORT_AOTI_FUNCTIONS)
+
   # skip unwanted includes from windows.h
   add_compile_definitions(WIN32_LEAN_AND_MEAN)
   # Windows SDK broke compatibility since version 25131, but introduced this
 
@@ -543,6 +543,10 @@ void Context::setDisplayVmapFallbackWarnings(bool enabled) {
   display_vmap_fallback_warnings_ = enabled;
 }
 
+bool Context::isDefaultMobileCPUAllocatorSet() { // Query only: reads prev_allocator_ptr_, changes no state.
+  return prev_allocator_ptr_ != nullptr; // true exactly when prev_allocator_ptr_ is non-null
+}
+
 void Context::setDefaultMobileCPUAllocator() {
   TORCH_CHECK(prev_allocator_ptr_ == nullptr,
       "Already within the scope of another non-default cpu allocator."
 
@@ -347,6 +347,7 @@ class TORCH_API Context {
   void setDisplayVmapFallbackWarnings(bool enabled);
   bool areVmapFallbackWarningsEnabled() const;
 
+  bool isDefaultMobileCPUAllocatorSet();
   void setDefaultMobileCPUAllocator();
   void unsetDefaultMobileCPUAllocator();
   bool allowFP16ReductionCPU() const;
 
@@ -124,6 +124,16 @@ TORCH_API inline void set_autocast_gpu_dtype(at::ScalarType dtype) {
 // deprecated other backend specific autocast APIs
 AT_FORALL_DEPRECATED_AUTOCAST_BAKCNEDS(DECLARE_DEPRECATED_AUTOCAST_APIS)
 
+const std::array<at::DeviceType, 8> _AUTOCAST_SUPPORTED_DTYPES{ // NOTE: holds device types (at::DeviceType), not dtypes, despite the name
+    at::kCPU, // is_autocast_available() returns true only for device types listed here
+    at::kCUDA,
+    at::kXPU,
+    at::kIPU,
+    at::kHPU,
+    at::kXLA,
+    at::kPrivateUse1,
+    at::kMPS};
+
 namespace {
 inline bool is_autocast_eligible(
     const Tensor& tensor,
@@ -179,10 +189,10 @@ inline DispatchKey get_autocast_dispatch_key_from_device_type(
 }
 
 inline bool is_autocast_available(c10::DeviceType device_type) {
-  if (device_type == at::kCPU || device_type == at::kCUDA ||
-      device_type == at::kXPU || device_type == at::kIPU ||
-      device_type == at::kHPU || device_type == at::kXLA ||
-      device_type == at::kPrivateUse1 || device_type == at::kMPS) {
+  if (std::find(
+          _AUTOCAST_SUPPORTED_DTYPES.begin(),
+          _AUTOCAST_SUPPORTED_DTYPES.end(),
+          device_type) != _AUTOCAST_SUPPORTED_DTYPES.end()) {
     return true;
   } else {
     return false;
 
@@ -24,6 +24,7 @@ enum class MacOSVersion : uint32_t {
   MACOS_VER_14_4_PLUS,
   MACOS_VER_15_0_PLUS,
   MACOS_VER_15_1_PLUS,
+  MACOS_VER_15_2_PLUS,
 };
 
 //-----------------------------------------------------------------
 
@@ -73,6 +73,7 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& de
   static bool _macos_14_4_plus = is_os_version_at_least(14, 4);
   static bool _macos_15_0_plus = is_os_version_at_least(15, 0);
   static bool _macos_15_1_plus = is_os_version_at_least(15, 1);
+  static bool _macos_15_2_plus = is_os_version_at_least(15, 2);
 
   switch (version) {
     case MacOSVersion::MACOS_VER_13_1_PLUS:
@@ -89,6 +90,8 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& de
       return _macos_15_0_plus;
     case MacOSVersion::MACOS_VER_15_1_PLUS:
       return _macos_15_1_plus;
+    case MacOSVersion::MACOS_VER_15_2_PLUS:
+      return _macos_15_2_plus;
     default:
       return false;
   }
 
@@ -109,10 +109,13 @@ TORCH_META_FUNC(fractional_max_pool2d_backward)(
   /* get contiguous gradOutput */
   auto gradOutput = gradOutput_.contiguous();
 
-  TORCH_CHECK(outputW == gradOutput.size(widthDim),
-    "fractional_max_pool2d_backward(): gradOutput width unexpected");
-  TORCH_CHECK(outputH == gradOutput.size(heightDim),
-    "fractional_max_pool2d_backward(): gradOutput height unexpected");
+  auto expectedOutputShape = IntArrayRef(input.sizes().data(), ndims - 2).vec();
+  expectedOutputShape.push_back(outputH);
+  expectedOutputShape.push_back(outputW);
+  TORCH_CHECK(gradOutput.sizes().equals(expectedOutputShape),
+    "fractional_max_pool2d_backward(): gradOutput sizes unexpected");
+  TORCH_CHECK(indices.sizes().equals(expectedOutputShape),
+    "fractional_max_pool2d_backward(): indices sizes unexpected");
 
   /* resize */
   if (ndims == 3) {
 
@@ -136,9 +136,10 @@ __global__ void EmbeddingBag_updateOutputKernel_sum_mean(
       accscalar_t weightFeatSum = 0;
       int64_t bag_size_ = 0;
       for (int64_t emb = begin; emb < end; emb++) {
-        bool pad = (input[emb] == padding_idx);
-        CUDA_KERNEL_ASSERT(input[emb] < numRows);
-        const int64_t weightRow = input[emb] * weight_stride0;
+        index_t input_idx = input[emb];
+        bool pad = (input_idx == padding_idx);
+        CUDA_KERNEL_ASSERT(0 <= input_idx && input_idx < numRows);
+        const int64_t weightRow = input_idx * weight_stride0;
         scalar_t weightValue = weightFeat[weightRow];
         weightValue = pad ? static_cast<scalar_t>(0) : weightValue;
         if (per_sample_weights) {
 
@@ -43,6 +43,7 @@ static CUresult CUDAAPI nvrtc_cuTensorMapEncodeTiled(
 }
 
 
+#include <cutlass/version.h>
 #include <cutlass/core_io.h>
 #include <cutlass/cutlass.h>
 #include <cutlass/gemm/device/gemm.h>
@@ -174,7 +175,11 @@ void f8f8bf16_rowwise_impl(
 
   // Implement rowwise scaling epilogue.
   constexpr int ColBroadcastStages = 0;
+  #if CUTLASS_VERSION == 351
+  constexpr int RowBroadcastStages = 0;
+  #else
   constexpr int RowBroadcastStages = PingPong::value ? 2 : 1;
+  #endif
 
   using XScale = cutlass::epilogue::fusion::
       Sm90ColBroadcast<ColBroadcastStages, TileShape, DtypeScale>;
@@ -191,15 +196,24 @@ void f8f8bf16_rowwise_impl(
 
   using Accum = cutlass::epilogue::fusion::Sm90AccFetch;
 
+  #if CUTLASS_VERSION == 351
+  using AccumScale = cutlass::epilogue::fusion::Sm90EVT<
+              Multiply,
+              WScale,
+              cutlass::epilogue::fusion::Sm90EVT<Multiply, XScale, Accum>>;
+  #else
+  using AccumScale = cutlass::epilogue::fusion::Sm90EVT<
+              Multiply,
+              XScale,
+              cutlass::epilogue::fusion::Sm90EVT<Multiply, WScale, Accum>>;
+  #endif
+
   using EpilogueEVT = cutlass::epilogue::fusion::Sm90EVT<
       Cast,
       cutlass::epilogue::fusion::Sm90EVT<
           Add,
           Bias,
-          cutlass::epilogue::fusion::Sm90EVT<
-              Multiply,
-              XScale,
-              cutlass::epilogue::fusion::Sm90EVT<Multiply, WScale, Accum>>>>;
+          AccumScale>>;
 
   using CollectiveEpilogue =
       typename cutlass::epilogue::collective::CollectiveBuilder<