Skip to content

Commit a75af59

Browse files
committed
not for land, just testing on "quantized tensor: add support for advanced indexing"
Summary: Implements support for the indexing of quantized tensors with lists of dims, such as ``` xq_slice = xq[:, [0], :, :] ``` If helpful for reviewers, the things originally broken were, in order: 1. `computeDeviceType` did not handle `DispatchKey::QuantizedCPU` (fix: added) 2. quantization params were not present in `TensorIterator::set_output`, so they could not be used to properly create the quantized tensor (fix: created `TensorQuantizationOptions` and threaded it through the relevant places) 3. `index` kernel was not enabled for quantized dtypes (fix: enable it) Note: this PR only handles per-Tensor qparams. We don't expect to need this for per-channel qparams any time soon; ideally we can pay the eng cost for enabling that when it is needed. Test Plan: ``` python test/test_quantization.py TestQuantizedOps.test_advanced_indexing ``` Reviewers: Subscribers: Tasks: Tags: Differential Revision: [D25451651](https://our.internmc.facebook.com/intern/diff/D25451651) [ghstack-poisoned]
2 parents bc20313 + 8397a62 commit a75af59

File tree

34 files changed

+881
-685
lines changed

34 files changed

+881
-685
lines changed

aten/src/ATen/TensorIterator.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -485,10 +485,12 @@ void TensorIteratorBase::allocate_or_resize_outputs() {
485485
// At the moment, quantized kernels mostly handle output Tensor
486486
// construction manually, this path is an edge case. So, only support
487487
// the single input case for now.
488+
/*
488489
TORCH_INTERNAL_ASSERT(
489490
operands_.size() == num_outputs_ + 1,
490491
"Advanced indexing of quantized Tensors with multiple inputs is not "
491492
"supported yet.");
493+
*/
492494
// get the first input and copy its quantization parameters
493495
const auto& first_input = operands_[num_outputs_];
494496
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(first_input.tensor.is_quantized());

aten/src/ATen/cpu/vec256/vec256_bfloat16.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ static inline void cvtbf16_fp32(const __m256i& a, __m256& o1, __m256& o2) {
2525
static inline __m256i cvtfp32_bf16(const __m256& a, const __m256& b) {
2626
__m256i lo = _mm256_castps_si256(a);
2727
__m256i hi = _mm256_castps_si256(b);
28-
__m256i nan = _mm256_set1_epi32(0x7fc0);
28+
__m256i nan = _mm256_set1_epi32(0xffff);
2929
__m256i mask_lo = _mm256_castps_si256(_mm256_cmp_ps(a, a, _CMP_ORD_Q));
3030
__m256i mask_hi = _mm256_castps_si256(_mm256_cmp_ps(b, b, _CMP_ORD_Q));
3131
__m256i ones = _mm256_set1_epi32(0x1);

aten/src/ATen/native/SpectralOps.cpp

Lines changed: 0 additions & 169 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,6 @@
1919

2020
namespace at { namespace native {
2121

22-
// Common code for all FFT functions
23-
static inline Tensor _fft(
24-
const Tensor &self, int64_t signal_ndim, bool complex_input,
25-
const bool complex_output, bool inverse, IntArrayRef signal_sizes,
26-
fft_norm_mode normalization, bool onesided);
27-
2822
namespace {
2923

3024
// Promote inputs to FFT functions
@@ -416,139 +410,6 @@ Tensor fft_ifftshift(const Tensor& x, c10::optional<IntArrayRef> dim_opt) {
416410
}
417411

418412

419-
// This is a pass-through wrapper function that does the size check and
420-
// inferences. The actual forward implementation function is called
421-
// at::_fft_with_size which dispatches to _fft_cufft (CUDA) or _fft_mkl (CPU).
422-
static inline Tensor _fft(const Tensor &self, const int64_t signal_ndim,
423-
const bool complex_input, const bool complex_output,
424-
const bool inverse, IntArrayRef signal_sizes,
425-
const fft_norm_mode normalization, const bool onesided) {
426-
427-
TORCH_CHECK(signal_ndim >= 1 && signal_ndim <= 3,
428-
"Expected signal_ndim to be 1, 2, or 3, but got signal_ndim=",
429-
signal_ndim);
430-
TORCH_CHECK(at::isFloatingType(self.scalar_type()),
431-
"Expected an input tensor of floating types, but got input=",
432-
self.toString(), self.sizes());
433-
434-
auto signal_tensor_ndim = signal_ndim + static_cast<int64_t>(complex_input); // add complex dim
435-
if (self.dim() < signal_tensor_ndim) {
436-
std::ostringstream ss;
437-
ss << "Given signal_ndim=" << signal_ndim << ", expected an input tensor "
438-
<< "of at least " << signal_tensor_ndim << "D";
439-
if (complex_input) {
440-
ss << " (complex input adds an extra dimension)";
441-
}
442-
ss << ", but got input=" << self.toString() << self.sizes();
443-
AT_ERROR(ss.str());
444-
}
445-
446-
auto self_shape = self.sizes();
447-
auto batch_ndim = self.dim() - signal_tensor_ndim;
448-
449-
Tensor input = self;
450-
// flatten the batch dims
451-
if (batch_ndim == 0) {
452-
// slightly faster path for non-batch mode
453-
input = input.unsqueeze(0);
454-
} else if (batch_ndim > 1) {
455-
std::vector<int64_t> flatten_input_shape(signal_tensor_ndim + 1);
456-
std::copy(self_shape.begin() + batch_ndim, self_shape.end(), flatten_input_shape.begin() + 1);
457-
flatten_input_shape[0] = -1;
458-
input = input.reshape(flatten_input_shape);
459-
460-
}
461-
462-
// now we assume that input is batched as [ B x signal_dims... ]
463-
464-
if (complex_input) {
465-
TORCH_CHECK(input.size(signal_ndim + 1) == 2,
466-
"Expected an input tensor with a last dimension of size 2 "
467-
"representing real + imaginary components, but got input ",
468-
self.toString(), self.sizes());
469-
}
470-
471-
// build signal_sizes and output_size
472-
TORCH_CHECK(signal_sizes.size() == 0 || static_cast<int64_t>(signal_sizes.size()) == signal_ndim,
473-
"Expected signal_sizes to be empty (default) or of signal_ndim=",
474-
signal_ndim, "D, but got signal_sizes=", signal_sizes);
475-
std::vector<int64_t> output_sizes(signal_ndim + 1 + static_cast<int64_t>(complex_output));
476-
output_sizes[0] = input.size(0); // batch size
477-
std::vector<int64_t> checked_signal_sizes(signal_ndim);
478-
for (int64_t i = 0; i < signal_ndim; i++) {
479-
int64_t input_size = input.size(i + 1);
480-
if (i == signal_ndim - 1 && onesided && complex_input && !complex_output) {
481-
// If last dim and complex-to-real onesided, input is only half of
482-
// signal, and we need to infer basing on signal_sizes, if given
483-
// See native/SpectralOpsUtils.h for detailed description.
484-
int64_t inferred_size;
485-
if (signal_sizes.size() > 0) {
486-
inferred_size = infer_ft_complex_to_real_onesided_size(input_size, signal_sizes[i]);
487-
} else {
488-
inferred_size = infer_ft_complex_to_real_onesided_size(input_size);
489-
}
490-
checked_signal_sizes[i] = inferred_size;
491-
output_sizes[i + 1] = inferred_size;
492-
} else {
493-
if (i == signal_ndim - 1 && onesided && !complex_input && complex_output) {
494-
// if last dim and real-to-complex onesided, output should be only
495-
// half of the signal, and we need to infer using input_size
496-
output_sizes[i + 1] = infer_ft_real_to_complex_onesided_size(input_size);
497-
} else {
498-
output_sizes[i + 1] = input_size;
499-
}
500-
checked_signal_sizes[i] = input_size;
501-
TORCH_CHECK(signal_sizes.size() == 0 || signal_sizes[i] == checked_signal_sizes[i],
502-
"Expected given signal_sizes=", signal_sizes," to have same "
503-
"shape with input at signal dimension ", i, ", but got "
504-
"signal_sizes=", signal_sizes, " and input=", self.toString(),
505-
self.sizes());
506-
}
507-
}
508-
if (complex_output) {
509-
output_sizes[signal_ndim + 1] = 2;
510-
}
511-
512-
Tensor output = at::_fft_with_size(input, signal_ndim, complex_input,
513-
complex_output, inverse,
514-
checked_signal_sizes,
515-
static_cast<int64_t>(normalization),
516-
onesided,
517-
output_sizes);
518-
519-
// unflatten the batch dims
520-
if (batch_ndim == 0) {
521-
// slightly faster path for non-batch mode
522-
output = output.squeeze(0);
523-
} else if (batch_ndim > 1) {
524-
auto output_ndim = self.dim() + static_cast<int64_t>(complex_output) - static_cast<int64_t>(complex_input);
525-
std::vector<int64_t> unflatten_output_shape(output_ndim);
526-
std::copy(self_shape.begin(), self_shape.begin() + batch_ndim, unflatten_output_shape.begin());
527-
std::copy(output_sizes.begin() + 1, output_sizes.end(), unflatten_output_shape.begin() + batch_ndim);
528-
output = output.reshape(unflatten_output_shape);
529-
}
530-
return output;
531-
}
532-
533-
// Wrapper to preserve the historic signature of _fft_with_size
534-
// NOTE: This is only used for torchscript backwards compatibility and the new
535-
// signature with normalization modes should be used in all other cases
536-
Tensor _fft_with_size(const Tensor& input, int64_t signal_ndim,
537-
bool complex_input, bool complex_output,
538-
bool inverse, IntArrayRef checked_signal_sizes,
539-
bool normalized, bool onesided,
540-
IntArrayRef output_sizes) {
541-
fft_norm_mode norm;
542-
if (normalized) {
543-
norm = fft_norm_mode::by_root_n;
544-
} else {
545-
norm = inverse ? fft_norm_mode::by_n : fft_norm_mode::none;
546-
}
547-
return at::_fft_with_size(
548-
input, signal_ndim, complex_input, complex_output, inverse,
549-
checked_signal_sizes, static_cast<int64_t>(norm), onesided, output_sizes);
550-
}
551-
552413
// We call the following methods via CUDA hooks because they are really only
553414
// valid when CUDA is available. See native/cuda/CuFFTPlanCache.h for more details.
554415
int64_t _cufft_get_plan_cache_max_size(int64_t device_index) {
@@ -567,36 +428,6 @@ void _cufft_clear_plan_cache(int64_t device_index) {
567428
detail::getCUDAHooks().cuFFTClearPlanCache(device_index);
568429
}
569430

570-
static Tensor fft(const Tensor& self, const int64_t signal_ndim, const bool normalized) {
571-
return _fft(self, signal_ndim, /* complex_input */ true,
572-
/* complex_output */ true, /* inverse */ false, {},
573-
normalized ? fft_norm_mode::by_root_n : fft_norm_mode::none,
574-
/* onesided */ false);
575-
}
576-
577-
static Tensor ifft(const Tensor& self, const int64_t signal_ndim, const bool normalized) {
578-
return _fft(self, signal_ndim, /* complex_input */ true,
579-
/* complex_output */ true, /* inverse */ true, {},
580-
normalized ? fft_norm_mode::by_root_n : fft_norm_mode::by_n,
581-
/* onesided */ false);
582-
}
583-
584-
static Tensor rfft(const Tensor& self, const int64_t signal_ndim, const bool normalized,
585-
const bool onesided) {
586-
return _fft(self, signal_ndim, /* complex_input */ false,
587-
/* complex_output */ true, /* inverse */ false, {},
588-
normalized ? fft_norm_mode::by_root_n : fft_norm_mode::none,
589-
onesided);
590-
}
591-
592-
static Tensor irfft(const Tensor& self, const int64_t signal_ndim, const bool normalized,
593-
const bool onesided, IntArrayRef signal_sizes) {
594-
return _fft(self, signal_ndim, /* complex_input */ true,
595-
/* complex_output */ false, /* inverse */ true, signal_sizes,
596-
normalized ? fft_norm_mode::by_root_n : fft_norm_mode::by_n,
597-
onesided);
598-
}
599-
600431
template <typename Stream, typename T>
601432
static Stream& write_opt(Stream& SS, const optional<T>& value) {
602433
if (value) {

aten/src/ATen/native/TensorProperties.cpp

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
#include <ATen/ATen.h>
22
#include <ATen/NativeFunctions.h>
3-
#include <ATen/WrapDimUtils.h>
43
#include <ATen/detail/CUDAHooksInterface.h>
54
#include <ATen/NamedTensorUtils.h>
65
#include <torch/library.h>
@@ -14,15 +13,11 @@ bool is_same_size(const Tensor& self, const Tensor& other) {
1413
}
1514

1615
int64_t size(const Tensor& self, int64_t dim) {
17-
// false is passed to maybe_wrap_dim so behavior is identical to array access (but with wrapping)
18-
dim = maybe_wrap_dim(dim, self.dim(), false);
19-
return self.sizes()[dim];
16+
return self.size(dim);
2017
}
2118

2219
int64_t stride(const Tensor& self, int64_t dim) {
23-
// false is passed to maybe_wrap_dim so behavior is identical to array access (but with wrapping)
24-
dim = maybe_wrap_dim(dim, self.dim(), false);
25-
return self.strides()[dim];
20+
return self.stride(dim);
2621
}
2722

2823
int64_t size(const Tensor& self, Dimname dim) {

aten/src/ATen/native/cpu/UnaryOpsKernel.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -277,7 +277,7 @@ static void sign_kernel(TensorIterator& iter){
277277
[=](scalar_t a) -> scalar_t { return (0 < a) - (a < 0); },
278278
[=](Vec256<scalar_t> self_vec){
279279

280-
// Comparision operators returns bitmask.
280+
// Comparison operators returns bitmask.
281281
auto left = Vec256<scalar_t>::blendv(zero_vec, one_vec, zero_vec < self_vec);
282282
auto right = Vec256<scalar_t>::blendv(zero_vec, one_vec, self_vec < zero_vec);
283283

aten/src/ATen/native/cuda/SpectralOps.cu

Lines changed: 0 additions & 107 deletions
Original file line numberDiff line numberDiff line change
@@ -589,112 +589,5 @@ Tensor _fft_c2c_cufft(const Tensor& self, IntArrayRef dim, int64_t normalization
589589
return output;
590590
}
591591
592-
// cuFFT
593-
// Currently not utilizing multi GPUs so this can be potentially sped up.
594-
Tensor _fft_cufft(const Tensor& self, int64_t signal_ndim,
595-
bool complex_input, bool complex_output, bool inverse,
596-
IntArrayRef checked_signal_sizes, int64_t normalization, bool onesided,
597-
IntArrayRef output_sizes) {
598-
599-
CuFFTParamsLRUCache& plan_cache = cufft_get_plan_cache(self.device().index());
600-
601-
Tensor input = self;
602-
const auto fft_type = GetCuFFTTransformType(complex_input, complex_output);
603-
604-
if (complex_input) {
605-
TORCH_CHECK(input.size(-1) == 2, "Expected a complex (size 2) last dimension");
606-
}
607-
608-
609-
// Slice when twosided complex-to-real. This is not always needed because we
610-
// calculate the inembed. But it will benefit us in certain cases where we
611-
// clone the input tensor.
612-
//
613-
// See NOTE [ cuFFT Embedded Strides ].
614-
// See NOTE [ Fourier Transform Conjugate Symmetry ] in native/SpectralOpsUtils.h.
615-
if (fft_type == CuFFTTransformType::C2R && !onesided) {
616-
auto onesided_size = infer_ft_real_to_complex_onesided_size(checked_signal_sizes[signal_ndim - 1]);
617-
input = input.narrow(signal_ndim, 0, onesided_size);
618-
}
619-
620-
// cuFFT requires input and output data pointers to complex type aligned.
621-
// Our newly allocated output tensor is always 512 bytes aligned so it is fine
622-
// (see kRoundSmall and kRoundLarge in THCCachingAllocator.cpp), but we do
623-
// need to check input tensor to make sure that it is not unaligned, e.g.,
624-
// from a slicing.
625-
bool must_clone = false;
626-
auto complex_size_bytes = 2 * input.element_size();
627-
if (reinterpret_cast<std::uintptr_t>(input.data_ptr()) % complex_size_bytes != 0) {
628-
must_clone = true;
629-
}
630-
631-
if (complex_input) {
632-
auto strides = input.strides();
633-
// Real/imag dimension must be like complex type.
634-
must_clone |= strides.back() != 1;
635-
// Strides of other dimensions needs to be aligned when viewed as complex
636-
// type, i.e., multiples of 2.
637-
must_clone |= std::any_of(strides.begin(), strides.end() - 1,
638-
[&](int64_t stride) { return stride % 2 != 0; });
639-
640-
// Complex to real FFTs may overwrite the input buffer (gh-34551)
641-
must_clone |= !complex_output;
642-
}
643-
644-
if (must_clone) {
645-
input = input.clone(MemoryFormat::Contiguous);
646-
}
647-
648-
// Now that we have done error check and data_ptr checks, we delegate all
649-
// further cuFFT parameter computation and plan creation to the helper class
650-
// CuFFTConfig in CuFFTPlanCache.h.
651-
652-
// If plan caching is enabled, we check the cache. Note that this accesses
653-
// plan_cache.max_size() and thus makes this function less functional.
654-
// However, integrating additional arguments into the "public" level c++ APIs,
655-
// e.g., irfft, is difficult as we have a long call sequence looking like
656-
// irfft --> _fft --> _fft_with_size --dispatching-to-> _fft_cufft
657-
658-
DimVector in_strides(signal_ndim + 1);
659-
auto input_strides = input.strides();
660-
for (int64_t i = signal_ndim; i >= 0; --i) {
661-
in_strides[i] = complex_input ? input_strides[i] / 2 : input_strides[i];
662-
}
663-
664-
DimVector out_strides(signal_ndim + 1);
665-
out_strides[signal_ndim] = 1;
666-
if (fft_type == CuFFTTransformType::R2C && onesided) {
667-
out_strides[signal_ndim - 1] = checked_signal_sizes[signal_ndim - 1] / 2 + 1;
668-
} else {
669-
out_strides[signal_ndim - 1] = checked_signal_sizes[signal_ndim - 1];
670-
}
671-
for (int64_t i = signal_ndim - 2; i >= 0; --i) {
672-
out_strides[i] = out_strides[i + 1] * checked_signal_sizes[i];
673-
}
674-
675-
DimVector full_sizes(signal_ndim + 1);
676-
full_sizes[0] = self.size(0);
677-
std::copy(checked_signal_sizes.begin(), checked_signal_sizes.end(), full_sizes.begin() + 1);
678-
CuFFTParams Params(in_strides, out_strides, full_sizes, fft_type,
679-
c10::toValueType(input.scalar_type()));
680-
681-
// This read is not locked for perf reason. Shouldn't matter too much because
682-
// we check again after acquiring the lock.
683-
if (plan_cache.max_size() > 0) {
684-
std::lock_guard<std::mutex> guard(plan_cache.mutex);
685-
if (plan_cache.max_size() > 0) { // check again after acquiring the lock
686-
const CuFFTConfig &config = plan_cache.lookup(Params);
687-
return _run_cufft(config, input, signal_ndim, complex_input,
688-
complex_output, inverse, checked_signal_sizes,
689-
static_cast<fft_norm_mode>(normalization),
690-
onesided, output_sizes, must_clone);
691-
}
692-
}
693-
CuFFTConfig config(Params);
694-
return _run_cufft(config, input, signal_ndim, complex_input,
695-
complex_output, inverse, checked_signal_sizes,
696-
static_cast<fft_norm_mode>(normalization),
697-
onesided, output_sizes, must_clone);
698-
}
699592
700593
}} // at::native

0 commit comments

Comments (0)