Commit 16b8e6a

ezyang authored and facebook-github-bot committed
Class-based structured kernels, with migration of add to framework (#48718)
Summary:
Pull Request resolved: #48718

This PR rewrites structured kernels to use a class-based mechanism (instead of defining a meta and impl function, they are methods on a class), and adds enough customizability to the class to support TensorIterator. To show it works, add is made a structured kernel. Don't forget to check pytorch/rfcs#9 for a mostly up-to-date high-level description of what's going on here.

High level structure of this PR (the order in which you should review files):

* TensorMeta.h - TensorMeta is deleted entirely; instead, meta functions call `set_output` to allocate/resize their outputs. MetaBase gets a new `maybe_get_output` virtual method for retrieving the (possibly non-existent) output tensor in a meta function; this makes it easier to do special promotion behavior, e.g., as in TensorIterator.
* TensorIterator.cpp - Two major changes. First, we add TensorIteratorBase::set_output, which is a "light" version of TensorIterator::set_output; it sets up the internal data structures in TensorIterator, but it doesn't do allocation (that is assumed to have been handled by the structured kernels framework). The control flow is that someone calls the subclassed set_output, which allocates the output, and then calls the parent class (TensorIteratorBase) to populate the fields in TensorIterator so that other TensorIterator phases can keep track of it. Second, we add some tests for meta tensors, and skip the parts of TensorIterator which are not necessary when data is not available.
* tools/codegen/model.py - One new field in native_functions.yaml, structured_inherits. This lets you override the parent class of a structured meta class; normally it's MetaBase, but you can make it point at TensorIteratorBase instead for TensorIterator-based kernels.
* tools/codegen/gen.py - Now generates all of the classes we promised. It's kind of hairy because this is the first draft. Check the RFC for what the output looks like, and then follow the logic here. There are some complications: I need to continue to generate old-style wrapper functions even if an operator is structured, because SparseCPU/SparseCUDA/etc. won't actually use structured kernels to start. The most complicated code generation is the instantiation of `set_output`, which by and large replicates the logic in `TensorIterator::set_output`. This will continue to live in codegen for the foreseeable future, as we would like to specialize this logic per device.
* aten/src/ATen/native/UpSampleNearest1d.cpp - The previous structured kernel is ported to the new format. The changes are very modest.
* aten/src/ATen/native/BinaryOps.cpp - add is ported to structured.

TODO:

* Work out an appropriate entry point for static runtime, since native:: function stubs are no longer generated
* Refactor TensorIteratorConfig construction into helper functions, like before
* Make Tensor-Scalar addition structured to fix the perf regression
* Fix `verify_api_visibility.cpp`
* Refactor tools/codegen/gen.py for clarity
* Figure out why header changes resulted in an undefined reference to `at::Tensor::operator[](long) const`

Signed-off-by: Edward Z. Yang <[email protected]>

Test Plan: Imported from OSS

Reviewed By: bhosmer

Differential Revision: D25278031

Pulled By: ezyang

fbshipit-source-id: 57c43a6e5df21929b68964d485995fbbae4d1f7b
1 parent a6fa3b2 commit 16b8e6a

21 files changed: +506 additions, -210 deletions


aten/src/ATen/TensorIterator.cpp

Lines changed: 128 additions & 23 deletions
@@ -402,14 +402,14 @@ void TensorIteratorBase::compute_types(const TensorIteratorConfig& config) {
     // TODO: reuse temporaries when possible (e.g. for inplace operations)
     if (common_device == kCPU) {
       // Casts to outputs by creating temporaries of the correct dtype (if needed)
-      if (config.cast_common_dtype_to_outputs_ && op.is_output && op.current_dtype != common_dtype_) {
+      // NB: we skip this on is_meta_, because the temporary allocation here is
+      // unnecessary if we aren't going to actually do the compute
+      if (config.cast_common_dtype_to_outputs_ && op.is_output && op.current_dtype != common_dtype_ && !is_meta_) {
         TORCH_INTERNAL_ASSERT(op.tensor.defined());
+        // Marker [Output original_tensor is set]
         op.original_tensor = op.tensor;
         // NB: do NOT use set_output here, as the temporary is NOT a true output;
         // op.tensor is the true output and it was pre-provided for us.
-        // TODO: When we extend this to work with meta tensors, we'll need to
-        // skip this temporary allocation in that case (because it's
-        // unnecessary)
         // TODO: The logic for cast_outputs will need to be handled by the
         // structured kernels implementation. What probably should happen
         // is that we pass in the inferred dtype into the out kernel, and
@@ -488,10 +488,10 @@ void TensorIteratorBase::allocate_or_resize_outputs() {
         set_output(i, tensor_shape, tensor_stride, op.options(), names_);
       }
       op.current_dtype = op.target_dtype;
-    } else if (op.tensor.defined() && !names_.empty()) {
-      // Even if we don't resize, we may still propagate names, esp
-      // if we were doing an inplace operation
-      namedinference::propagate_names(op.tensor, names_);
+    } else if (op.tensor.defined()) {
+      // Even if we don't resize, we still need to tell set_output about
+      // the output, so that we properly set guard and propagate names
+      set_output(i, op.tensor.sizes(), {}, op.tensor.options(), names_);
     }
   }
 }
@@ -765,6 +765,8 @@ void TensorIteratorBase::cast_outputs() {
   for (auto& op : operands_) {
     if (op.is_output && op.original_tensor.defined() &&
         op.original_tensor.scalar_type() != op.current_dtype) {
+      // TODO: Now that set_output resizes both the original_tensor
+      // and tensor, this condition should no longer ever be true
      if (op.original_tensor.sizes() != op.tensor.sizes()){
        op.original_tensor.resize_as_(op.tensor).as_strided_(op.tensor.sizes(), op.tensor.strides());
      }
@@ -808,18 +810,22 @@ void TensorIteratorBase::select_all_keeping_dim(int start_dim, IntArrayRef indices) {
   }
 }

-TensorIterator TensorIterator::binary_op(Tensor& out, const Tensor& a,
-    const Tensor& b) {
-  return TensorIteratorConfig()
-     .set_check_mem_overlap(true)
-     .add_output(out)
-     .add_input(a)
-     .add_input(b)
-     .allow_cpu_scalars(true)
-     .promote_inputs_to_common_dtype(true)
-     .cast_common_dtype_to_outputs(true)
-     .enforce_safe_casting_to_output(true)
-     .build();
+void TensorIteratorBase::build_binary_op(const Tensor& out, const Tensor& a, const Tensor& b) {
+  build(TensorIteratorConfig()
+     .set_check_mem_overlap(true)
+     .add_output(out)
+     .add_input(a)
+     .add_input(b)
+     .allow_cpu_scalars(true)
+     .promote_inputs_to_common_dtype(true)
+     .cast_common_dtype_to_outputs(true)
+     .enforce_safe_casting_to_output(true));
+}
+
+TensorIterator TensorIterator::binary_op(Tensor& out, const Tensor& a, const Tensor& b) {
+  TensorIterator iter;
+  iter.build_binary_op(out, a, b);
+  return iter;
 }

 // Helper to construct a binary op that promotes integer inputs to float.
@@ -940,6 +946,13 @@ TensorIterator TensorIterator::reduce_op(Tensor& out1, Tensor& out2, const Tensor& a) {

 void TensorIteratorBase::populate_operands(TensorIteratorConfig& config) {
   for (auto& tensor: config.tensors_) {
+    // If *any* of the arguments is a meta tensor, the overall
+    // computation is a meta computation (don't do any work,
+    // just compute output information). This aligns with
+    // our multiple dispatch semantics.
+    if (tensor.is_meta()) {
+      is_meta_ = true;
+    }
     operands_.emplace_back(std::move(tensor));
   }
   num_outputs_ = config.num_outputs_;
@@ -988,6 +1001,10 @@ void TensorIteratorBase::compute_mem_overlaps(const TensorIteratorConfig& config) {
   if (!config.check_mem_overlap_) {
     return;
   }
+  if (is_meta_) {
+    // We don't have pointer addresses, cannot check for overlap!
+    return;
+  }
   for (int i = 0; i < num_outputs_; i++) {
     const auto& output = operands_[i].tensor;
     if (!output.defined()) continue;
@@ -1265,9 +1282,11 @@ void TensorIteratorBase::build(TensorIteratorConfig& config) {
     // allocate the output tensor if it's not provided
     allocate_or_resize_outputs();
     // coalesce adjacent dimensions when possible
-    coalesce_dimensions();
+    if (!is_meta_) coalesce_dimensions();
   }

+  if (is_meta_) return;
+
   for (auto& op : operands_) {
     TORCH_INTERNAL_ASSERT(op.tensor.defined());
     op.data = op.tensor.data_ptr();
@@ -1281,14 +1300,92 @@ void TensorIteratorBase::build(TensorIteratorConfig& config) {
   view_offsets_ = DimVector(ndim_offsets, 0);
 }

+// This is the structured kernels implementation of set_output. It is
+// NEVER actually called directly; instead, a subclass of TensorIteratorBase
+// will override set_output to actually do the operation, and then call
+// set_output on the TensorIteratorBase to setup TI's metadata.
+// The precondition for this function is that maybe_get_output() now
+// unconditionally returns a real Tensor (prior to output setting,
+// this function may return an undefined tensor.)
+void TensorIteratorBase::set_output(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) {
+  auto& op = operands_[output_idx];
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(output_idx < num_outputs_);
+  const auto& t = maybe_get_output(output_idx);
+  TORCH_INTERNAL_ASSERT(t.defined());
+  if (!op.tensor.defined()) {
+    op.tensor = t;
+    op.current_dtype = op.target_dtype;
+  } else if (op.will_resize) {
+    if (op.original_tensor.defined()) {
+      // OK, so this is pretty weird. To understand how we can end up in
+      // this situation, first look at Marker [Output original_tensor is set].
+      // That is the sole site where original_tensor may be set on an
+      // output operand. Essentially, when we are given an explicit output
+      // tensor whose dtype doesn't match the computed common dtype from
+      // the input operands, we do a switcheroo: we replace the (incorrectly
+      // typed) output tensor with a correctly typed, *temporary* tensor,
+      // and remember the original tensor in original_tensor (which will
+      // then get written back to when we cast_outputs).
+      //
+      // Now, what if the given output tensor also happened to be zero
+      // size (meaning that we will_resize it)? Well, at the call site
+      // above, we don't necessarily(*) know what the correct shape should
+      // be, so we give the temporary tensor the same shape as the original.
+      // At the time of set_output is when we DO know what the correct size
+      // is, and the subclass's implementation of set_output in structured class
+      // is responsible for resizing original_tensor. But we still have this
+      // incorrectly sized temporary output which the structured subclass
+      // knows nothing about, so we are obligated to also resize it here.
+      //
+      // This is a slight memory pessimization, because previously
+      // original_tensor only got resized at the end of the computation, rather
+      // than at the beginning (as happens here). However, the peak memory
+      // usage is the same, since you need to materialize both original tensor
+      // and temporary tensor to do the copy.
+      //
+      // (*) Actually, technically, we probably do know what the shape
+      // should be, since we do shape computation before dtype computation.
+      // So hypothetically we could figure out what the correct shape is
+      // at that point in time and directly allocate the temporary at
+      // the right size.
+      //
+      // But a better solution is to delay allocation of temporaries until
+      // after TensorIterator builder, waiting until we actually want
+      // to do the computation. That would also remove the necessity
+      // for the is_meta_ test.
+      TORCH_INTERNAL_ASSERT(op.original_tensor.is_same(t));
+      TORCH_INTERNAL_ASSERT(!op.tensor.is_same(t));
+      at::native::resize_output(op.tensor, sizes);
+      if (!strides.empty()) {
+        TORCH_INTERNAL_ASSERT(!options.memory_format_opt().has_value());
+        op.tensor.as_strided_(sizes, strides);
+      } else if (options.memory_format_opt().has_value()) {
+        op.tensor.unsafeGetTensorImpl()->empty_tensor_restride(*options.memory_format_opt());
+      }
+    }
+  }
+}
+
+// This is the "traditional" implementation of set_output. On TensorIterator
+// instances, it is invoked directly from various call sites in this file. No
+// funny business.
 void TensorIterator::set_output(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) {
+  // NB: intentionally no superclass call
   auto& op = operands_[output_idx];
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(output_idx < num_outputs_);
   if (!op.tensor.defined()) {
     if (strides.empty()) {
-      op.tensor = at::empty(sizes, options);
+      if (is_meta_) {
+        op.tensor = at::empty_meta(sizes, options);
+      } else {
+        op.tensor = at::empty(sizes, options);
+      }
     } else {
-      op.tensor = at::empty_strided(sizes, strides, options);
+      if (is_meta_) {
+        TORCH_INTERNAL_ASSERT(0, "meta strided not yet implemented");
+      } else {
+        op.tensor = at::empty_strided(sizes, strides, options);
+      }
     }
     op.current_dtype = op.target_dtype;
   } else if (op.will_resize) {
@@ -1306,6 +1403,14 @@ void TensorIterator::set_output(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) {
   }
 }

+// Not actually used by anything (TensorIterator subclass calls
+// its own implementation of set_output which knows exactly where
+// all the outputs are), but we have to provide all pure virtual methods
+// for MetaBase
+const Tensor& TensorIterator::maybe_get_output(int64_t output_idx) {
+  return operands_[output_idx].tensor;
+}
+
 SplitUntil32Bit TensorIteratorBase::with_32bit_indexing() const {
   return SplitUntil32Bit(*this);
 }

aten/src/ATen/TensorIterator.h

Lines changed: 8 additions & 0 deletions
@@ -297,6 +297,10 @@ struct CAFFE2_API TensorIteratorBase : public impl::MetaBase {
     return true;
   }

+  void set_output(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) override;
+
+  void build_binary_op(const Tensor& out, const Tensor& a, const Tensor& b);
+
 protected:
   // Mutable reference as it moves tensors out of TensorIteratorConfig
   void populate_operands(TensorIteratorConfig&);
@@ -399,6 +403,9 @@ struct CAFFE2_API TensorIteratorBase : public impl::MetaBase {

   // From TensorIteratorConfig
   bool is_reduction_ = false;
+
+  /// Set by populate_operands(), says if we're handling meta tensors
+  bool is_meta_ = false;
 };

 struct CAFFE2_API TensorIterator final : public TensorIteratorBase {
@@ -415,6 +422,7 @@ struct CAFFE2_API TensorIterator final : public TensorIteratorBase {
   static TensorIterator reduce_op(Tensor& out, const Tensor& a);
   static TensorIterator reduce_op(Tensor& out1, Tensor& out2, const Tensor& a);

+  const Tensor& maybe_get_output(int64_t output_idx) override;
   void set_output(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) override;
 };

aten/src/ATen/TensorMeta.cpp

Lines changed: 0 additions & 16 deletions
@@ -1,21 +1,5 @@
 #include <ATen/TensorMeta.h>
-#include <ATen/ATen.h>

 namespace at {

-Tensor meta_tensor_from_meta(const TensorMeta& meta) {
-  // TODO: eliminate indirection
-  return at::empty_meta(meta.sizes, meta.options);
-}
-
-Tensor tensor_from_meta(const TensorMeta& meta) {
-  // TODO: eliminate indirection
-  return at::empty(meta.sizes, meta.options);
-}
-
-// Analogous to self.new_empty(sizes)
-TensorMeta new_meta(const Tensor& self, IntArrayRef sizes) {
-  return TensorMeta(sizes, self.options());
-}
-
 } // namespace at

aten/src/ATen/TensorMeta.h

Lines changed: 41 additions & 15 deletions
@@ -10,28 +10,54 @@ class Tensor;

 namespace impl {

-struct MetaBase {
+// Use this to define the prototype for a meta function. There are two
+// versions; one that takes one argument (just the operator name), or FUNC2
+// variant that takes two arguments (operator name and overload name).
+//
+// Example usage:
+//
+//    TORCH_META_FUNC2(add, Tensor) (
+//      const Tensor& self, const Tensor& other
+//    ) {
+//      ... compute sizes and options ...
+//      set_output(sizes, options);
+//    }
+//
+#define TORCH_META_FUNC(name) void name::meta
+#define TORCH_META_FUNC2(name, overload) void name##_##overload::meta
+
+// Use this to define the prototype for an implementation. This takes only
+// one argument, which is the name of the dispatch key entry you're
+// implementing.
+//
+// Example usage:
+//
+//    TORCH_IMPL_FUNC(add_cpu) (
+//      Tensor& result, const Tensor& self, const Tensor& other
+//    ) {
+//      ... do the actual implementation ...
+//    }
+//
+#define TORCH_IMPL_FUNC(name) void structured_##name::impl
+
+// Base class for all structured kernel classes. The set_output virtual
+// method is varied depending whether or not the operator is
+// functional/out/inplace, and could also be specialized for CPU/CUDA/etc
+// (although presently it isn't).
+//
+// A notable subclass of this interface is TensorIteratorBase.
+struct CAFFE2_API MetaBase {
   virtual void set_output(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) = 0;
+  virtual const Tensor& maybe_get_output(int64_t output_idx) = 0;
   void set_output(IntArrayRef sizes, TensorOptions options) {
     set_output(0, sizes, {}, options, {});
   }
+  // Returns a reference to an undefined tensor if there is no presupplied
+  // output
+  const Tensor& maybe_get_output() { return maybe_get_output(0); }
   virtual ~MetaBase() {}
 };

 } // namespace impl

-struct TensorMeta {
-  DimVector sizes;
-  // TODO: DimVector strides;
-  TensorOptions options;
-
-  TensorMeta(IntArrayRef _sizes, TensorOptions _options)
-    : sizes(_sizes), options(_options) {}
-};
-
-CAFFE2_API Tensor meta_tensor_from_meta(const TensorMeta& meta);
-CAFFE2_API Tensor tensor_from_meta(const TensorMeta& meta);
-// Analogous to self.new_empty(sizes)
-CAFFE2_API TensorMeta new_meta(const Tensor& self, IntArrayRef sizes);
-
 } // namespace at
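The TORCH_META_FUNC / TORCH_IMPL_FUNC macros added in TensorMeta.h are just shorthand for out-of-line method definitions on the codegen-emitted classes. A minimal, self-contained re-creation of the scheme (the DEMO_ macros and the relu_demo classes are hypothetical stand-ins for what gen.py would emit, not the real generated code):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Re-creation of the macro scheme: the macro expands to the method
// header, and the kernel author supplies the parameter list and body.
#define DEMO_META_FUNC(name) void name::meta
#define DEMO_IMPL_FUNC(name) void structured_##name::impl

// Codegen would emit classes like these per operator:
struct relu_demo {
  std::vector<int64_t> out_sizes;
  void meta(const std::vector<int64_t>& in_sizes);
};
struct structured_relu_demo : relu_demo {
  void impl(const std::vector<float>& in, std::vector<float>& out);
};

// The kernel author writes only the bodies, via the macros.
// DEMO_META_FUNC(relu_demo) expands to `void relu_demo::meta`:
DEMO_META_FUNC(relu_demo)(const std::vector<int64_t>& in_sizes) {
  out_sizes = in_sizes;  // relu is shape-preserving
}
// DEMO_IMPL_FUNC(relu_demo) expands to `void structured_relu_demo::impl`:
DEMO_IMPL_FUNC(relu_demo)(const std::vector<float>& in,
                          std::vector<float>& out) {
  out.clear();
  for (float v : in) out.push_back(v > 0 ? v : 0.f);
}
```

This is why TORCH_IMPL_FUNC prefixes the name with `structured_`: each dispatch key's implementation lives on its own subclass of the operator's meta class.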
