Skip to content

Commit 4ea2a11

Browse files
committed
Update on "move rebuild buckets from end of first iteration to beginning of second iteration"
Part of relanding PR #41954, this refactoring is to move rebuild_buckets call from end of first iteration to beginning of second iteration Differential Revision: [D23583017](https://our.internmc.facebook.com/intern/diff/D23583017/) **NOTE FOR REVIEWERS**: This PR has internal Facebook specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D23583017/)! [ghstack-poisoned]
2 parents 6796a1e + c68a99b commit 4ea2a11

File tree

225 files changed

+5,712 additions, −2,236 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

225 files changed

+5,712 additions, −2,236 deletions

.circleci/cimodel/data/simple/ios_definitions.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,8 @@ def gen_tree(self):
6262

6363
WORKFLOW_DATA = [
6464
IOSJob(IOS_VERSION, ArchVariant("x86_64"), is_org_member_context=False),
65-
IOSJob(IOS_VERSION, ArchVariant("arm64")),
66-
IOSJob(IOS_VERSION, ArchVariant("arm64", True), extra_props={"op_list": "mobilenetv2.yaml"}),
65+
# IOSJob(IOS_VERSION, ArchVariant("arm64")),
66+
# IOSJob(IOS_VERSION, ArchVariant("arm64", True), extra_props={"op_list": "mobilenetv2.yaml"}),
6767
]
6868

6969

.circleci/config.yml

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7003,19 +7003,6 @@ workflows:
70037003
ios_arch: x86_64
70047004
ios_platform: SIMULATOR
70057005
name: pytorch_ios_11_2_1_x86_64_build
7006-
- pytorch_ios_build:
7007-
build_environment: pytorch-ios-11.2.1-arm64_build
7008-
context: org-member
7009-
ios_arch: arm64
7010-
ios_platform: OS
7011-
name: pytorch_ios_11_2_1_arm64_build
7012-
- pytorch_ios_build:
7013-
build_environment: pytorch-ios-11.2.1-arm64_custom_build
7014-
context: org-member
7015-
ios_arch: arm64
7016-
ios_platform: OS
7017-
name: pytorch_ios_11_2_1_arm64_custom_build
7018-
op_list: mobilenetv2.yaml
70197006
- pytorch_linux_build:
70207007
build_environment: pytorch-linux-xenial-py3-clang5-mobile-build
70217008
build_only: "1"

.circleci/scripts/binary_linux_build.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,4 +21,4 @@ else
2121
fi
2222

2323
# Build the package
24-
SKIP_ALL_TESTS=1 stdbuf -i0 -o0 -e0 "/builder/$build_script"
24+
SKIP_ALL_TESTS=1 "/builder/$build_script"

CMakeLists.txt

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ option(BUILD_MOBILE_AUTOGRAD "Build autograd function in mobile build (in develo
144144
cmake_dependent_option(
145145
INSTALL_TEST "Install test binaries if BUILD_TEST is on" ON
146146
"BUILD_TEST" OFF)
147-
option(CODE_COVERAGE "Compile C/C++ with code coverage flags" OFF)
147+
option(USE_CPP_CODE_COVERAGE "Compile C/C++ with code coverage flags" OFF)
148148
option(COLORIZE_OUTPUT "Colorize output during compilation" ON)
149149
option(USE_ASAN "Use Address Sanitizer" OFF)
150150
option(USE_TSAN "Use Thread Sanitizer" OFF)
@@ -610,6 +610,22 @@ if(USE_ASAN)
610610
string(APPEND CMAKE_LINKER_FLAGS_DEBUG " -fsanitize=address")
611611
endif()
612612

613+
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
614+
include(CheckCSourceCompiles)
615+
check_c_source_compiles("#include <arm_neon.h>
616+
int main() {
617+
float32x4x2_t v;
618+
v.val[0] = vcombine_f32 (vcreate_f32 (__AARCH64_UINT64_C (0)), vcreate_f32 (__AARCH64_UINT64_C (0)));
619+
v.val[1] = vcombine_f32 (vcreate_f32 (__AARCH64_UINT64_C (0)), vcreate_f32 (__AARCH64_UINT64_C (0)));
620+
vst1q_f32_x2(a, v);
621+
return 0;
622+
}" HAS_VST1)
623+
624+
if(NOT HAS_VST1)
625+
string(APPEND CMAKE_CXX_FLAGS " -DMISSING_ARM_VST1")
626+
endif()
627+
endif()
628+
613629
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
614630
include(CheckCSourceCompiles)
615631
check_c_source_compiles("#include <arm_neon.h>
@@ -626,7 +642,7 @@ endif()
626642

627643

628644
# Add code coverage flags to supported compilers
629-
if(CODE_COVERAGE)
645+
if(USE_CPP_CODE_COVERAGE)
630646
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
631647
string(APPEND CMAKE_C_FLAGS " --coverage -fprofile-abs-path")
632648
string(APPEND CMAKE_CXX_FLAGS " --coverage -fprofile-abs-path")

android/pytorch_android/src/main/cpp/pytorch_jni_lite.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ class PytorchJni : public facebook::jni::HybridClass<PytorchJni> {
9292
if (auto method = module_.find_method(methodName)) {
9393
auto output = [&]() {
9494
LiteJITCallGuard guard;
95-
return module_.run_method(methodName, inputs);
95+
return module_.get_method(methodName)(inputs);
9696
}();
9797
return JIValue::newJIValueFromAtIValue(output);
9898
}

aten/src/ATen/BatchingRegistrations.cpp

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ namespace at {
1919
// NOTE: [When should I add a batching rule?]
2020
// When you are adding a new operator, you'll need to add a batching rule so
2121
// that vmap can work efficiently with said operator. If you do not, we'll attempt
22-
// to generate a slow fallback for the batching rule (this is not yet implemented).
22+
// to generate a slow fallback for the batching rule.
2323

2424
// NOTE: [How to write batching rules?]
2525
// The signature of a batching rule should look like exactly like the C++ signature
@@ -223,13 +223,33 @@ Tensor select_batching_rule(const Tensor& self, int64_t dim, int64_t index) {
223223
return self_physical.newLogicalFromPhysical(result);
224224
}
225225

226+
static int64_t getGradInputPhysicalDim(int64_t dim, IntArrayRef input_sizes, int64_t num_batch_dims) {
227+
return maybe_wrap_dim(dim, input_sizes.size()) + num_batch_dims;
228+
}
229+
230+
Tensor select_backward_batching_rule(const Tensor& grad, IntArrayRef input_sizes, int64_t dim, int64_t index) {
231+
auto grad_physical = MultiBatchVmapTransform::logicalToPhysical(grad);
232+
auto grad_input = at::zeros(grad_physical.getPhysicalShape(input_sizes), grad.options());
233+
auto physical_dim = getGradInputPhysicalDim(dim, input_sizes, grad_physical.numBatchDims());
234+
grad_input.select(physical_dim, index).copy_(grad_physical.tensor());
235+
return grad_physical.newLogicalFromPhysical(grad_input);
236+
}
237+
226238
Tensor slice_batching_rule(const Tensor& self, int64_t dim, int64_t start, int64_t end, int64_t step) {
227239
auto self_physical = MultiBatchVmapTransform::logicalToPhysical(self);
228240
auto dim_physical = self_physical.getPhysicalDim(dim);
229241
auto result = self_physical.tensor().slice(dim_physical, start, end, step);
230242
return self_physical.newLogicalFromPhysical(result);
231243
}
232244

245+
Tensor slice_backward_batching_rule(const Tensor& grad, IntArrayRef input_sizes, int64_t dim, int64_t start, int64_t end, int64_t step) {
246+
auto grad_physical = MultiBatchVmapTransform::logicalToPhysical(grad);
247+
auto grad_input = at::zeros(grad_physical.getPhysicalShape(input_sizes), grad.options());
248+
auto physical_dim = getGradInputPhysicalDim(dim, input_sizes, grad_physical.numBatchDims());
249+
grad_input.slice(physical_dim, start, end, step).copy_(grad_physical.tensor());
250+
return grad_physical.newLogicalFromPhysical(grad_input);
251+
}
252+
233253
Tensor diagonal_batching_rule(const Tensor& self, int64_t offset, int64_t dim1, int64_t dim2) {
234254
auto self_physical = MultiBatchVmapTransform::logicalToPhysical(self);
235255
auto dim1_physical = self_physical.getPhysicalDim(dim1);
@@ -238,6 +258,15 @@ Tensor diagonal_batching_rule(const Tensor& self, int64_t offset, int64_t dim1,
238258
return self_physical.newLogicalFromPhysical(result);
239259
}
240260

261+
Tensor diagonal_backward_batching_rule(const Tensor& grad, IntArrayRef input_sizes, int64_t offset, int64_t dim1, int64_t dim2) {
262+
auto grad_physical = MultiBatchVmapTransform::logicalToPhysical(grad);
263+
auto grad_input = at::zeros(grad_physical.getPhysicalShape(input_sizes), grad.options());
264+
auto dim1_physical = getGradInputPhysicalDim(dim1, input_sizes, grad_physical.numBatchDims());
265+
auto dim2_physical = getGradInputPhysicalDim(dim2, input_sizes, grad_physical.numBatchDims());
266+
grad_input.diagonal(offset, dim1_physical, dim2_physical).copy_(grad_physical.tensor());
267+
return grad_physical.newLogicalFromPhysical(grad_input);
268+
}
269+
241270
Tensor movedim_batching_rule(const Tensor& self, IntArrayRef source, IntArrayRef destination) {
242271
auto self_physical = MultiBatchVmapTransform::logicalToPhysical(self);
243272
auto source_physical = self_physical.getPhysicalDims(source);
@@ -614,6 +643,11 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) {
614643
// cat/stack
615644
m.impl("cat", cat_batching_rule);
616645
m.impl("stack", stack_batching_rule);
646+
647+
// backward operators
648+
m.impl("select_backward", select_backward_batching_rule);
649+
m.impl("slice_backward", slice_backward_batching_rule);
650+
m.impl("diagonal_backward", diagonal_backward_batching_rule);
617651
}
618652

619653
} // namespace at

aten/src/ATen/VmapTransforms.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ struct TORCH_API MultiBatchVmapTransform {
7272
//
7373
// Given inputs of size (B, 2) and (2,), BroadcastingVmapTransform returns
7474
// VmapPhysicalViews wrapping tensors of size (B, 2) and (1, 2). We don't
75-
// actually *need* to return a tensor of size (B, 2) for the second tensor
75+
// actually *need* to return a tensor of size (1, 2) for the second tensor
7676
// because the broadcasting operation takes care of that for us, but we do
7777
// it anyways to keep things simple.
7878
struct TORCH_API BroadcastingVmapTransform {

aten/src/ATen/core/aten_interned_strings.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ _(aten, _embedding_bag_sparse_backward) \
7272
_(aten, _erf) \
7373
_(aten, _erfc) \
7474
_(aten, _exp) \
75+
_(aten, _exp2) \
7576
_(aten, _expm1) \
7677
_(aten, _fft_with_size) \
7778
_(aten, _fill) \

aten/src/ATen/core/dispatch/OperatorEntry.cpp

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,8 @@ std::pair<const AnnotatedKernel&, const char*> OperatorEntry::computeDispatchTab
161161
// (1) Use kernel if it's directly registered to this key
162162
// (2) Handle runtime keys that have kernels available from alias keys
163163
// (2.1) Use kernel from DispatchKey::Autograd if available
164-
// (2.2) Use catchAllKernel_(as if it was populated to DispatchKey::Autograd) if available
164+
// (2.2) For autograd backend keys, we use kernel from alias Math key (catchAll will be moved to Math)
165+
// if there's no direct registration to the backend key.
165166
// Tensor factory functions used to have no registration to Autograd key but only to catchAll.
166167
// In the past we directly call into backends(filled with catchAll) after BackendSelect.
167168
// Now that we first call Autograd backend keys after BackendSelect, we should fill those
@@ -170,7 +171,7 @@ std::pair<const AnnotatedKernel&, const char*> OperatorEntry::computeDispatchTab
170171
// (4) Use catchAll kernel if available
171172
// TODO: currently Autograd is the only alias key, we'll update alias key precedence after we add new
172173
// alias keys AutogradDispatchCPUOrCUDA and Math.
173-
// TODO: we can remove (2.2) and (4) after TypeDefault registrations are moved from catchAll to Math
174+
// TODO: we can fix (2.2) and remove (4) after TypeDefault registrations are moved from catchAll to Math
174175
// so that Math can populate to Autograd backend keys before fallback kernels.
175176

176177
// 1. Operator registration
@@ -188,11 +189,12 @@ std::pair<const AnnotatedKernel&, const char*> OperatorEntry::computeDispatchTab
188189
TORCH_INTERNAL_ASSERT(kern_autograd->second.front().kernel.isValid());
189190
return {kern_autograd->second.front(), "autograd kernel"};
190191

191-
// 2.2. For autograd backend keys, we do this before step 4 to make it higher precedence than
192-
// the fallthrough kernel we registered to Autograd backend keys as fallback.
193-
} else if (!catchAllKernel_.empty()) {
192+
// 2.2. For autograd backend keys, we use kernel from alias Math key (catchAll will be moved to Math)
193+
// if there's no direct registration to the backend key.
194+
} else if (kernels_.find(getBackendKeyFromAutograd(dispatch_key)) == kernels_.end()
195+
&& !catchAllKernel_.empty()) {
194196
TORCH_INTERNAL_ASSERT(catchAllKernel_.front().kernel.isValid());
195-
return {catchAllKernel_.front(), "autograd catch all"};
197+
return {catchAllKernel_.front(), "catch all"};
196198
}
197199
}
198200

@@ -211,11 +213,20 @@ std::pair<const AnnotatedKernel&, const char*> OperatorEntry::computeDispatchTab
211213
}
212214
}
213215

216+
void OperatorEntry::updateDispatchTableEntry_(const c10::Dispatcher& dispatcher, DispatchKey dispatch_key) {
217+
auto dispatch_ix = static_cast<uint8_t>(dispatch_key);
218+
dispatchTable_[dispatch_ix] = computeDispatchTableEntry(dispatcher, dispatch_key);
219+
dispatchKeyExtractor_.setOperatorHasFallthroughForKey(dispatch_key, dispatchTable_[dispatch_ix].isFallthrough());
220+
}
221+
214222
void OperatorEntry::updateDispatchTable_(const c10::Dispatcher& dispatcher, DispatchKey dispatch_key) {
215223
for (auto k : c10::getRuntimeDispatchKeys(dispatch_key)) {
216-
auto dispatch_ix = static_cast<uint8_t>(k);
217-
dispatchTable_[dispatch_ix] = computeDispatchTableEntry(dispatcher, k);
218-
dispatchKeyExtractor_.setOperatorHasFallthroughForKey(k, dispatchTable_[dispatch_ix].isFallthrough());
224+
updateDispatchTableEntry_(dispatcher, k);
225+
}
226+
// Registering to backend key might affect computed entry at its Autograd backend key due to 2.2.
227+
DispatchKey autograd_key = getAutogradKeyFromBackend(dispatch_key);
228+
if (autograd_key != DispatchKey::AutogradOther) {
229+
updateDispatchTableEntry_(dispatcher, autograd_key);
219230
}
220231
}
221232

aten/src/ATen/core/dispatch/OperatorEntry.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -244,7 +244,9 @@ class CAFFE2_API OperatorEntry final {
244244
const c10::Dispatcher& dispatcher, DispatchKey dispatch_key
245245
) const;
246246
// This function re-establishes the invariant that dispatchTable
247-
// contains the front element from the kernels list for a given dispatch key.
247+
// contains the front element from the kernels list for a given runtime dispatch key.
248+
void updateDispatchTableEntry_(const c10::Dispatcher& dispatcher, DispatchKey dispatch_key);
249+
// Like above, but also handles alias dispatch keys.
248250
void updateDispatchTable_(const c10::Dispatcher& dispatcher, DispatchKey dispatch_key);
249251
// Like above, but for ALL entries in the dispatch table.
250252
void updateDispatchTableFull_(const c10::Dispatcher& dispatcher);

0 commit comments

Comments (0)