
Commit d67d51a

Update on "Tensorify compute on Python scalars"
Signed-off-by: Bob Ren <bobren@fb.com>

Commandeered from #130228 as I'm helping ezyang w/ shipping dynamic float arguments in PT2. This starts with supporting torch.ops.aten.mul; I'll stack support for other operators on top in subsequent PRs to keep this change scoped to the mechanics of the fx pass.

cc jgong5 mingfeima XiaobingSuper sanchitintel ashokei jingxu10 voznesenskym penguinwu EikanWang Guobing-Chen zhuhaozhe blzheng wenzhe-nrv jiayisunx ipiszy yf225 chenyang78 kadeng muchulee8 ColinPeppler amjames desertfire chauhang rec

[ghstack-poisoned]
2 parents e886d4f + d1ca59c commit d67d51a
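
As a rough orientation for what "tensorify compute on Python scalars" means, here is a minimal, illustrative-only Python sketch of the idea at the FX-graph level: a mul node whose second argument is a plain Python float is rewritten to consume a 0-dim tensor (via torch.scalar_tensor), so the scalar flows through the graph as data instead of being specialized into a constant. The helper name tensorify_float_muls and the exact rewrite are assumptions made for this sketch; this is not the pass added by the commit.

import torch
from torch import fx


def tensorify_float_muls(gm: fx.GraphModule) -> fx.GraphModule:
    # Hypothetical helper (not the commit's pass): rewrite mul(tensor, <python float>)
    # into mul(tensor, scalar_tensor(<python float>)).
    graph = gm.graph
    for node in list(graph.nodes):
        is_mul = node.op == "call_function" and node.target in (torch.mul, torch.ops.aten.mul.Tensor)
        if is_mul and len(node.args) == 2 and isinstance(node.args[1], float):
            tensor_arg, float_arg = node.args
            with graph.inserting_before(node):
                # Materialize the Python float as a 0-dim tensor node in the graph.
                scalar = graph.call_function(torch.scalar_tensor, (float_arg,))
            node.args = (tensor_arg, scalar)
    graph.lint()
    gm.recompile()
    return gm


class M(torch.nn.Module):
    def forward(self, x):
        return torch.mul(x, 0.5)


gm = fx.symbolic_trace(M())
tensorify_float_muls(gm)
print(gm.code)  # mul now consumes scalar_tensor(0.5) instead of the raw Python float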

File tree

183 files changed (+10635, -4527 lines)


.ci/docker/common/install_onnx.sh

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@ pip_install coloredlogs packaging
 
 pip_install onnxruntime==1.18.1
 pip_install onnx==1.16.2
-pip_install onnxscript==0.1.0.dev20240831 --no-deps
+pip_install onnxscript==0.1.0.dev20241008 --no-deps
 # required by onnxscript
 pip_install ml_dtypes
 

.github/workflows/pull.yml

Lines changed: 15 additions & 15 deletions
@@ -185,10 +185,10 @@ jobs:
 docker-image-name: pytorch-linux-focal-py3.9-clang10
 test-matrix: |
 { include: [
-{ config: "default", shard: 1, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
-{ config: "default", shard: 2, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
-{ config: "default", shard: 3, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
-{ config: "default", shard: 4, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
+{ config: "default", shard: 1, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+{ config: "default", shard: 2, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+{ config: "default", shard: 3, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+{ config: "default", shard: 4, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
 { config: "crossref", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
 { config: "crossref", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
 { config: "dynamo", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
@@ -217,10 +217,10 @@ jobs:
 docker-image-name: pytorch-linux-focal-py3.11-clang10
 test-matrix: |
 { include: [
-{ config: "default", shard: 1, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
-{ config: "default", shard: 2, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
-{ config: "default", shard: 3, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
-{ config: "default", shard: 4, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
+{ config: "default", shard: 1, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+{ config: "default", shard: 2, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+{ config: "default", shard: 3, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+{ config: "default", shard: 4, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
 { config: "crossref", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
 { config: "crossref", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
 { config: "dynamo", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
@@ -251,10 +251,10 @@ jobs:
 docker-image-name: pytorch-linux-focal-py3.12-clang10
 test-matrix: |
 { include: [
-{ config: "default", shard: 1, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
-{ config: "default", shard: 2, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
-{ config: "default", shard: 3, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
-{ config: "default", shard: 4, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
+{ config: "default", shard: 1, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+{ config: "default", shard: 2, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+{ config: "default", shard: 3, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+{ config: "default", shard: 4, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
 { config: "dynamo", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
 { config: "dynamo", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
 { config: "dynamo", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
@@ -588,9 +588,9 @@ jobs:
 docker-image-name: pytorch-linux-focal-py3.12-clang10
 test-matrix: |
 { include: [
-{ config: "default", shard: 1, num_shards: 3, runner: "linux.4xlarge" },
-{ config: "default", shard: 2, num_shards: 3, runner: "linux.4xlarge" },
-{ config: "default", shard: 3, num_shards: 3, runner: "linux.4xlarge" },
+{ config: "default", shard: 1, num_shards: 3, runner: "linux.2xlarge" },
+{ config: "default", shard: 2, num_shards: 3, runner: "linux.2xlarge" },
+{ config: "default", shard: 3, num_shards: 3, runner: "linux.2xlarge" },
 { config: "dynamo", shard: 1, num_shards: 3, runner: "linux.2xlarge" },
 { config: "dynamo", shard: 2, num_shards: 3, runner: "linux.2xlarge" },
 { config: "dynamo", shard: 3, num_shards: 3, runner: "linux.2xlarge" },

aten/src/ATen/SparseCsrTensorUtils.h

Lines changed: 2 additions & 2 deletions
@@ -144,8 +144,8 @@ class CheckSparseTensorInvariants {
 bool old_state;
 
 public:
-CheckSparseTensorInvariants(bool state) {
-old_state = at::globalContext().checkSparseTensorInvariants();
+CheckSparseTensorInvariants(bool state)
+    : old_state(at::globalContext().checkSparseTensorInvariants()) {
 at::globalContext().setCheckSparseTensorInvariants(state);
 }
 

aten/src/ATen/ThreadLocalState.h

Lines changed: 1 addition & 1 deletion
@@ -82,7 +82,7 @@ class TORCH_API ThreadLocalState {
 !defined(BUILD_LITE_INTERPRETER)
 // TLS for autocast dtypes
 std::array<at::ScalarType, at::COMPILE_TIME_MAX_DEVICE_TYPES>
-autocast_dtypes_;
+autocast_dtypes_{};
 #endif
 
 friend class ThreadLocalStateGuard;

aten/src/ATen/cuda/CUDAGraph.cpp

Lines changed: 1 addition & 1 deletion
@@ -125,7 +125,7 @@ void CUDAGraph::capture_begin(MempoolId_t pool/*=0*/, cudaStreamCaptureMode capt
 // due to the capture status being updated _after_ a capture had already started.
 c10::cuda::CUDACachingAllocator::beginAllocateToPool(capture_dev_, mempool_id_, [this](cudaStream_t stream) {
 cudaStreamCaptureStatus status;
-CaptureId_t stream_capture_id;
+CaptureId_t stream_capture_id = 0;
 AT_CUDA_CHECK(cudaStreamGetCaptureInfo(stream, &status, &stream_capture_id));
 return status == cudaStreamCaptureStatus::cudaStreamCaptureStatusActive && stream_capture_id == capture_id_;
 });

aten/src/ATen/cuda/cub.cuh

Lines changed: 2 additions & 2 deletions
@@ -234,7 +234,7 @@ inline void inclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT
 scan_op,
 num_items,
 at::cuda::getCurrentCUDAStream());
-C10_CUDA_KERNEL_LAUNCH_CHECK();
+C10_HIP_KERNEL_LAUNCH_CHECK();
 #else
 // non synchronizing cub call
 // even though cub is supposed to support tensors with int_max elements, in reality it doesn't,
@@ -302,7 +302,7 @@ inline void exclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT
 init_value,
 num_items,
 at::cuda::getCurrentCUDAStream());
-C10_CUDA_KERNEL_LAUNCH_CHECK();
+C10_HIP_KERNEL_LAUNCH_CHECK();
 #else
 // non synchronizing cub call
 // even though cub is supposed to support tensors with int_max elements, in reality it doesn't,

aten/src/ATen/functorch/BatchRulesConvolution.cpp

Lines changed: 1 addition & 0 deletions
@@ -362,6 +362,7 @@ static std::tuple<Tensor,Tensor,Tensor> convolution_backward_plumbing(
 const Tensor& grad_output_, const Tensor& input_, const Tensor& weight_,
 const c10::OptionalArrayRef<SymInt> bias_sizes_opt,
 c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed,
+// NOLINTNEXTLINE(performance-unnecessary-value-param)
 c10::SymIntArrayRef output_padding, c10::SymInt groups, std::array<bool, 3> output_mask) {
 const auto maybe_layer = maybeCurrentDynamicLayer();
 vmap_check_escaped(maybe_layer, "convolution_backward_plumbing");

aten/src/ATen/functorch/BatchRulesIndexing.cpp

Lines changed: 2 additions & 2 deletions
@@ -8,7 +8,7 @@
 #include <ATen/core/dispatch/Dispatcher.h>
 #include <ATen/functorch/BatchRulesHelper.h>
 
-namespace at { namespace functorch {
+namespace at::functorch {
 
 #define OP_DECOMPOSE(op) m.impl(#op, static_cast<decltype(&ATEN_FN(op))>(native::op));
 #define OP_DECOMPOSE2(op, overload) m.impl(#op"."#overload, static_cast<decltype(&ATEN_FN2(op, overload))>(native::op));
@@ -20,4 +20,4 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
 OP_DECOMPOSE(_unsafe_masked_index_put_accumulate);
 }
 
-}}
+}

aten/src/ATen/functorch/BatchRulesModules.cpp

Lines changed: 2 additions & 2 deletions
@@ -226,7 +226,7 @@ static Tensor one_hot_decomposition_hack(const Tensor &self, int64_t num_classes
 if (num_classes <= 0) {
 AT_ERROR("Can not infer total number of classes from empty tensor.");
 } else {
-shape.push_back(num_classes);
+shape.emplace_back(num_classes);
 return at::empty_symint(shape, self.options());
 }
 }
@@ -246,7 +246,7 @@ static Tensor one_hot_decomposition_hack(const Tensor &self, int64_t num_classes
 // TORCH_CHECK(num_classes > self.max().item().toLong(), "Class values must be smaller than num_classes.");
 // }
 
-shape.push_back(num_classes);
+shape.emplace_back(num_classes);
 Tensor ret = at::zeros_symint(shape, self.options());
 return ret.scatter(-1, self.unsqueeze(-1), 1);
 }
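
For orientation, here is a rough Python equivalent of the zeros-plus-scatter decomposition used by one_hot_decomposition_hack above. The helper name one_hot_via_scatter is made up for this sketch; it is not code from this commit.

import torch

def one_hot_via_scatter(index: torch.Tensor, num_classes: int) -> torch.Tensor:
    # Allocate zeros with an extra trailing dim of size num_classes, then
    # scatter 1s at the positions given by the index values.
    shape = list(index.shape) + [num_classes]
    ret = torch.zeros(shape, dtype=torch.long, device=index.device)
    return ret.scatter(-1, index.unsqueeze(-1), 1)

# e.g. one_hot_via_scatter(torch.tensor([0, 2, 1]), 3) matches
# torch.nn.functional.one_hot(torch.tensor([0, 2, 1]), 3)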

aten/src/ATen/functorch/BatchRulesRandomness.cpp

Lines changed: 3 additions & 3 deletions
@@ -213,7 +213,7 @@ static std::tuple<Tensor,Tensor> native_dropout_batching_rule(const Tensor& tens
 return std::make_tuple(output, mask);
 }
 
-static Tensor multinomial_batching_rule(const Tensor& self, const int64_t num_samples, const bool replacement, const std::optional<Generator> generator) {
+static Tensor multinomial_batching_rule(const Tensor& self, const int64_t num_samples, const bool replacement, std::optional<Generator> generator) {
 c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchVmapMode);
 auto maybe_layer = maybeCurrentDynamicLayer();
 const auto cur_level = maybe_layer->layerId();
@@ -237,7 +237,7 @@ static Tensor multinomial_batching_rule(const Tensor& self, const int64_t num_sa
 if (is_2D_case) {
 self_value = reshape_dim_into(0, 0, self_value);
 }
-auto out = multinomial(self_value, num_samples, replacement, generator);
+auto out = multinomial(self_value, num_samples, replacement, std::move(generator));
 if (is_2D_case) {
 out = reshape_dim_outof_symint(0, maybe_layer->batchSize(), out);
 }
@@ -249,7 +249,7 @@ static Tensor multinomial_batching_rule(const Tensor& self, const int64_t num_sa
 // Must be same randomness with unbatched input
 // 1D case: S -> multinomial(S) -> S
 // 2D case: MS -> multinomial(MS) -> MS
-return multinomial(self_value, num_samples, replacement, generator);
+return multinomial(self_value, num_samples, replacement, std::move(generator));
 }
 
 template <typename A, A a, typename C>
