Skip to content

Commit 4ea2a11

Browse files
committed
Update on "move rebuild buckets from end of first iteration to beginning of second iteration"
Part of relanding PR #41954, this refactoring is to move rebuild_buckets call from end of first iteration to beginning of second iteration Differential Revision: [D23583017](https://our.internmc.facebook.com/intern/diff/D23583017/) **NOTE FOR REVIEWERS**: This PR has internal Facebook specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D23583017/)! [ghstack-poisoned]
2 parents 6796a1e + c68a99b commit 4ea2a11

File tree

225 files changed

+5,712 additions, −2,236 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

225 files changed

+5,712 additions, −2,236 deletions

.circleci/cimodel/data/simple/ios_definitions.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,8 @@ def gen_tree(self):
6262

6363
WORKFLOW_DATA = [
6464
IOSJob(IOS_VERSION, ArchVariant("x86_64"), is_org_member_context=False),
65-
IOSJob(IOS_VERSION, ArchVariant("arm64")),
66-
IOSJob(IOS_VERSION, ArchVariant("arm64", True), extra_props={"op_list": "mobilenetv2.yaml"}),
65+
# IOSJob(IOS_VERSION, ArchVariant("arm64")),
66+
# IOSJob(IOS_VERSION, ArchVariant("arm64", True), extra_props={"op_list": "mobilenetv2.yaml"}),
6767
]
6868

6969

.circleci/config.yml

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7003,19 +7003,6 @@ workflows:
70037003
ios_arch: x86_64
70047004
ios_platform: SIMULATOR
70057005
name: pytorch_ios_11_2_1_x86_64_build
7006-
- pytorch_ios_build:
7007-
build_environment: pytorch-ios-11.2.1-arm64_build
7008-
context: org-member
7009-
ios_arch: arm64
7010-
ios_platform: OS
7011-
name: pytorch_ios_11_2_1_arm64_build
7012-
- pytorch_ios_build:
7013-
build_environment: pytorch-ios-11.2.1-arm64_custom_build
7014-
context: org-member
7015-
ios_arch: arm64
7016-
ios_platform: OS
7017-
name: pytorch_ios_11_2_1_arm64_custom_build
7018-
op_list: mobilenetv2.yaml
70197006
- pytorch_linux_build:
70207007
build_environment: pytorch-linux-xenial-py3-clang5-mobile-build
70217008
build_only: "1"

.circleci/scripts/binary_linux_build.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,4 +21,4 @@ else
2121
fi
2222

2323
# Build the package
24-
SKIP_ALL_TESTS=1 stdbuf -i0 -o0 -e0 "/builder/$build_script"
24+
SKIP_ALL_TESTS=1 "/builder/$build_script"

CMakeLists.txt

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ option(BUILD_MOBILE_AUTOGRAD "Build autograd function in mobile build (in develo
144144
cmake_dependent_option(
145145
INSTALL_TEST "Install test binaries if BUILD_TEST is on" ON
146146
"BUILD_TEST" OFF)
147-
option(CODE_COVERAGE "Compile C/C++ with code coverage flags" OFF)
147+
option(USE_CPP_CODE_COVERAGE "Compile C/C++ with code coverage flags" OFF)
148148
option(COLORIZE_OUTPUT "Colorize output during compilation" ON)
149149
option(USE_ASAN "Use Address Sanitizer" OFF)
150150
option(USE_TSAN "Use Thread Sanitizer" OFF)
@@ -610,6 +610,22 @@ if(USE_ASAN)
610610
string(APPEND CMAKE_LINKER_FLAGS_DEBUG " -fsanitize=address")
611611
endif()
612612

613+
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
614+
include(CheckCSourceCompiles)
615+
check_c_source_compiles("#include <arm_neon.h>
616+
int main() {
617+
float32x4x2_t v;
618+
v.val[0] = vcombine_f32 (vcreate_f32 (__AARCH64_UINT64_C (0)), vcreate_f32 (__AARCH64_UINT64_C (0)));
619+
v.val[1] = vcombine_f32 (vcreate_f32 (__AARCH64_UINT64_C (0)), vcreate_f32 (__AARCH64_UINT64_C (0)));
620+
vst1q_f32_x2(a, v);
621+
return 0;
622+
}" HAS_VST1)
623+
624+
if(NOT HAS_VST1)
625+
string(APPEND CMAKE_CXX_FLAGS " -DMISSING_ARM_VST1")
626+
endif()
627+
endif()
628+
613629
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
614630
include(CheckCSourceCompiles)
615631
check_c_source_compiles("#include <arm_neon.h>
@@ -626,7 +642,7 @@ endif()
626642

627643

628644
# Add code coverage flags to supported compilers
629-
if(CODE_COVERAGE)
645+
if(USE_CPP_CODE_COVERAGE)
630646
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
631647
string(APPEND CMAKE_C_FLAGS " --coverage -fprofile-abs-path")
632648
string(APPEND CMAKE_CXX_FLAGS " --coverage -fprofile-abs-path")

android/pytorch_android/src/main/cpp/pytorch_jni_lite.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ class PytorchJni : public facebook::jni::HybridClass<PytorchJni> {
9292
if (auto method = module_.find_method(methodName)) {
9393
auto output = [&]() {
9494
LiteJITCallGuard guard;
95-
return module_.run_method(methodName, inputs);
95+
return module_.get_method(methodName)(inputs);
9696
}();
9797
return JIValue::newJIValueFromAtIValue(output);
9898
}

aten/src/ATen/BatchingRegistrations.cpp

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ namespace at {
1919
// NOTE: [When should I add a batching rule?]
2020
// When you are adding a new operator, you'll need to add a batching rule so
2121
// that vmap can work efficiently with said operator. If you do not, we'll attempt
22-
// to generate a slow fallback for the batching rule (this is not yet implemented).
22+
// to generate a slow fallback for the batching rule.
2323

2424
// NOTE: [How to write batching rules?]
2525
// The signature of a batching rule should look like exactly like the C++ signature
@@ -223,13 +223,33 @@ Tensor select_batching_rule(const Tensor& self, int64_t dim, int64_t index) {
223223
return self_physical.newLogicalFromPhysical(result);
224224
}
225225

226+
static int64_t getGradInputPhysicalDim(int64_t dim, IntArrayRef input_sizes, int64_t num_batch_dims) {
227+
return maybe_wrap_dim(dim, input_sizes.size()) + num_batch_dims;
228+
}
229+
230+
Tensor select_backward_batching_rule(const Tensor& grad, IntArrayRef input_sizes, int64_t dim, int64_t index) {
231+
auto grad_physical = MultiBatchVmapTransform::logicalToPhysical(grad);
232+
auto grad_input = at::zeros(grad_physical.getPhysicalShape(input_sizes), grad.options());
233+
auto physical_dim = getGradInputPhysicalDim(dim, input_sizes, grad_physical.numBatchDims());
234+
grad_input.select(physical_dim, index).copy_(grad_physical.tensor());
235+
return grad_physical.newLogicalFromPhysical(grad_input);
236+
}
237+
226238
Tensor slice_batching_rule(const Tensor& self, int64_t dim, int64_t start, int64_t end, int64_t step) {
227239
auto self_physical = MultiBatchVmapTransform::logicalToPhysical(self);
228240
auto dim_physical = self_physical.getPhysicalDim(dim);
229241
auto result = self_physical.tensor().slice(dim_physical, start, end, step);
230242
return self_physical.newLogicalFromPhysical(result);
231243
}
232244

245+
Tensor slice_backward_batching_rule(const Tensor& grad, IntArrayRef input_sizes, int64_t dim, int64_t start, int64_t end, int64_t step) {
246+
auto grad_physical = MultiBatchVmapTransform::logicalToPhysical(grad);
247+
auto grad_input = at::zeros(grad_physical.getPhysicalShape(input_sizes), grad.options());
248+
auto physical_dim = getGradInputPhysicalDim(dim, input_sizes, grad_physical.numBatchDims());
249+
grad_input.slice(physical_dim, start, end, step).copy_(grad_physical.tensor());
250+
return grad_physical.newLogicalFromPhysical(grad_input);
251+
}
252+
233253
Tensor diagonal_batching_rule(const Tensor& self, int64_t offset, int64_t dim1, int64_t dim2) {
234254
auto self_physical = MultiBatchVmapTransform::logicalToPhysical(self);
235255
auto dim1_physical = self_physical.getPhysicalDim(dim1);
@@ -238,6 +258,15 @@ Tensor diagonal_batching_rule(const Tensor& self, int64_t offset, int64_t dim1,
238258
return self_physical.newLogicalFromPhysical(result);
239259
}
240260

261+
Tensor diagonal_backward_batching_rule(const Tensor& grad, IntArrayRef input_sizes, int64_t offset, int64_t dim1, int64_t dim2) {
262+
auto grad_physical = MultiBatchVmapTransform::logicalToPhysical(grad);
263+
auto grad_input = at::zeros(grad_physical.getPhysicalShape(input_sizes), grad.options());
264+
auto dim1_physical = getGradInputPhysicalDim(dim1, input_sizes, grad_physical.numBatchDims());
265+
auto dim2_physical = getGradInputPhysicalDim(dim2, input_sizes, grad_physical.numBatchDims());
266+
grad_input.diagonal(offset, dim1_physical, dim2_physical).copy_(grad_physical.tensor());
267+
return grad_physical.newLogicalFromPhysical(grad_input);
268+
}
269+
241270
Tensor movedim_batching_rule(const Tensor& self, IntArrayRef source, IntArrayRef destination) {
242271
auto self_physical = MultiBatchVmapTransform::logicalToPhysical(self);
243272
auto source_physical = self_physical.getPhysicalDims(source);
@@ -614,6 +643,11 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) {
614643
// cat/stack
615644
m.impl("cat", cat_batching_rule);
616645
m.impl("stack", stack_batching_rule);
646+
647+
// backward operators
648+
m.impl("select_backward", select_backward_batching_rule);
649+
m.impl("slice_backward", slice_backward_batching_rule);
650+
m.impl("diagonal_backward", diagonal_backward_batching_rule);
617651
}
618652

619653
} // namespace at

aten/src/ATen/VmapTransforms.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ struct TORCH_API MultiBatchVmapTransform {
7272
//
7373
// Given inputs of size (B, 2) and (2,), BroadcastingVmapTransform returns
7474
// VmapPhysicalViews wrapping tensors of size (B, 2) and (1, 2). We don't
75-
// actually *need* to return a tensor of size (B, 2) for the second tensor
75+
// actually *need* to return a tensor of size (1, 2) for the second tensor
7676
// because the broadcasting operation takes care of that for us, but we do
7777
// it anyways to keep things simple.
7878
struct TORCH_API BroadcastingVmapTransform {

aten/src/ATen/core/aten_interned_strings.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ _(aten, _embedding_bag_sparse_backward) \
7272
_(aten, _erf) \
7373
_(aten, _erfc) \
7474
_(aten, _exp) \
75+
_(aten, _exp2) \
7576
_(aten, _expm1) \
7677
_(aten, _fft_with_size) \
7778
_(aten, _fill) \

aten/src/ATen/core/dispatch/OperatorEntry.cpp

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,8 @@ std::pair<const AnnotatedKernel&, const char*> OperatorEntry::computeDispatchTab
161161
// (1) Use kernel if it's directly registered to this key
162162
// (2) Handle runtime keys that have kernels available from alias keys
163163
// (2.1) Use kernel from DispatchKey::Autograd if available
164-
// (2.2) Use catchAllKernel_(as if it was populated to DispatchKey::Autograd) if available
164+
// (2.2) For autograd backend keys, we use kernel from alias Math key (catchAll will be moved to Math)
165+
// if there's no direct registration to the backend key.
165166
// Tensor factory functions used to have no registration to Autograd key but only to catchAll.
166167
// In the past we directly call into backends(filled with catchAll) after BackendSelect.
167168
// Now that we first call Autograd backend keys after BackendSelect, we should fill those
@@ -170,7 +171,7 @@ std::pair<const AnnotatedKernel&, const char*> OperatorEntry::computeDispatchTab
170171
// (4) Use catchAll kernel if available
171172
// TODO: currently Autograd is the only alias key, we'll update alias key precedence after we add new
172173
// alias keys AutogradDispatchCPUOrCUDA and Math.
173-
// TODO: we can remove (2.2) and (4) after TypeDefault registrations are moved from catchAll to Math
174+
// TODO: we can fix (2.2) and remove (4) after TypeDefault registrations are moved from catchAll to Math
174175
// so that Math can populate to Autograd backend keys before fallback kernels.
175176

176177
// 1. Operator registration
@@ -188,11 +189,12 @@ std::pair<const AnnotatedKernel&, const char*> OperatorEntry::computeDispatchTab
188189
TORCH_INTERNAL_ASSERT(kern_autograd->second.front().kernel.isValid());
189190
return {kern_autograd->second.front(), "autograd kernel"};
190191

191-
// 2.2. For autograd backend keys, we do this before step 4 to make it higher precedence than
192-
// the fallthrough kernel we registered to Autograd backend keys as fallback.
193-
} else if (!catchAllKernel_.empty()) {
192+
// 2.2. For autograd backend keys, we use kernel from alias Math key (catchAll will be moved to Math)
193+
// if there's no direct registration to the backend key.
194+
} else if (kernels_.find(getBackendKeyFromAutograd(dispatch_key)) == kernels_.end()
195+
&& !catchAllKernel_.empty()) {
194196
TORCH_INTERNAL_ASSERT(catchAllKernel_.front().kernel.isValid());
195-
return {catchAllKernel_.front(), "autograd catch all"};
197+
return {catchAllKernel_.front(), "catch all"};
196198
}
197199
}
198200

@@ -211,11 +213,20 @@ std::pair<const AnnotatedKernel&, const char*> OperatorEntry::computeDispatchTab
211213
}
212214
}
213215

216+
void OperatorEntry::updateDispatchTableEntry_(const c10::Dispatcher& dispatcher, DispatchKey dispatch_key) {
217+
auto dispatch_ix = static_cast<uint8_t>(dispatch_key);
218+
dispatchTable_[dispatch_ix] = computeDispatchTableEntry(dispatcher, dispatch_key);
219+
dispatchKeyExtractor_.setOperatorHasFallthroughForKey(dispatch_key, dispatchTable_[dispatch_ix].isFallthrough());
220+
}
221+
214222
void OperatorEntry::updateDispatchTable_(const c10::Dispatcher& dispatcher, DispatchKey dispatch_key) {
215223
for (auto k : c10::getRuntimeDispatchKeys(dispatch_key)) {
216-
auto dispatch_ix = static_cast<uint8_t>(k);
217-
dispatchTable_[dispatch_ix] = computeDispatchTableEntry(dispatcher, k);
218-
dispatchKeyExtractor_.setOperatorHasFallthroughForKey(k, dispatchTable_[dispatch_ix].isFallthrough());
224+
updateDispatchTableEntry_(dispatcher, k);
225+
}
226+
// Registering to backend key might affect computed entry at its Autograd backend key due to 2.2.
227+
DispatchKey autograd_key = getAutogradKeyFromBackend(dispatch_key);
228+
if (autograd_key != DispatchKey::AutogradOther) {
229+
updateDispatchTableEntry_(dispatcher, autograd_key);
219230
}
220231
}
221232

aten/src/ATen/core/dispatch/OperatorEntry.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -244,7 +244,9 @@ class CAFFE2_API OperatorEntry final {
244244
const c10::Dispatcher& dispatcher, DispatchKey dispatch_key
245245
) const;
246246
// This function re-establishes the invariant that dispatchTable
247-
// contains the front element from the kernels list for a given dispatch key.
247+
// contains the front element from the kernels list for a given runtime dispatch key.
248+
void updateDispatchTableEntry_(const c10::Dispatcher& dispatcher, DispatchKey dispatch_key);
249+
// Like above, but also handles alias dispatch keys.
248250
void updateDispatchTable_(const c10::Dispatcher& dispatcher, DispatchKey dispatch_key);
249251
// Like above, but for ALL entries in the dispatch table.
250252
void updateDispatchTableFull_(const c10::Dispatcher& dispatcher);

0 commit comments

Comments (0)