
Commit ddfa80b

Update on "Fix NJT linear_backward() memory usage"

Fixes #141112. The formula we use for `linear_backward()` is inefficient for higher-dim input sizes, even when the input is only trivially higher-dim (e.g. via `unsqueeze()`). This PR updates the formula to match the more efficient version employed by NST; specifically, note the leading dim collapse for `grad_output`'s values before the various matmuls are computed:

https://github.com/pytorch/pytorch/blob/d5ee1d1b581da8399d604bd661ea5fe454b485d6/aten/src/ATen/native/nested/NestedTensorBackward.cpp#L37-L70

Correctness is covered by existing gradcheck tests (e.g. `test_backward_nn_functional_linear`). A memory usage test is also added, though there is likely a better way to structure it.

[ghstack-poisoned]
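For intuition, here is a minimal self-contained sketch of the trick (illustrative names and shapes only, not the PR's code): folding the leading dims and doing one 2D matmul yields the same weight grad as the batched matmul followed by a sum, without ever materializing the batched intermediate.

import torch

# Illustrative shapes: "total" stands in for the packed ragged dim of an NJT.
total, extra, in_f, out_f = 64, 4, 32, 16
inp_values = torch.randn(total, extra, in_f)    # stand-in for inp._values
grad_values = torch.randn(total, extra, out_f)  # stand-in for grad_output._values

# Naive weight grad: materializes a (total, out_f, in_f) intermediate,
# then immediately reduces over the leading dim.
dw_naive = torch.matmul(grad_values.transpose(-2, -1), inp_values).sum(0)

# Folded weight grad: collapse leading dims first; a single 2D matmul whose
# only output is the final (out_f, in_f) result.
dw_folded = torch.matmul(
    grad_values.reshape(-1, out_f).t(),
    inp_values.reshape(-1, in_f),
)
torch.testing.assert_close(dw_naive, dw_folded)

# Bias grad uses the same fold-then-reduce idea; the PR instead returns the
# unreduced grad values and lets the autograd engine sum over leading dims.
db = grad_values.reshape(-1, out_f).sum(0)
assert db.shape == (out_f,)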
Merge commit ddfa80b (2 parents: 5b66b88 + ee17028)

1 file changed (+11, -5 lines)

torch/nested/_internal/ops.py

@@ -544,16 +544,22 @@ def linear_backward_default(func, *args, **kwargs):
     ds, dw, db = None, None, None
     check_ragged_dim_same(func, inp, "self", grad_output, "grad_output")
-    reshaped_grad = grad_output._values.reshape(-1, weight.size(0))
     if output_mask[0]:
         ds = NestedTensor(
-            torch.matmul(reshaped_grad, weight).view_as(inp._values),
-            **extract_kwargs(grad_output),
+            torch.matmul(grad_output._values, weight), **extract_kwargs(grad_output)
         )
     if output_mask[1]:
-        dw = torch.matmul(reshaped_grad.t(), inp._values.reshape(-1, weight.size(1)))
+        # NB: Fold dims of values for input and grad_output to treat them as 2D. This
+        # trick avoids materializing large intermediates and immediately reducing over
+        # them via sum(). This is equivalent to computing:
+        #     torch.matmul(grad_output._values.transpose(-2, -1), inp._values)
+        # and then summing over the leading dimensions to get a 2D weight grad.
+        grad_2d = grad_output._values.reshape(-1, weight.size(0))
+        input_2d = inp._values.reshape(-1, weight.size(1))
+        dw = torch.matmul(grad_2d.t(), input_2d)
     if output_mask[2]:
-        db = reshaped_grad.sum(0)
+        # NB: autograd engine will sum over all but the last dim to get a 1D bias grad.
+        db = grad_output._values
     return (ds, dw, db)
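As a quick usage sketch (my own, not from the PR; it assumes a recent PyTorch build where the `torch.jagged` layout, `unsqueeze()` on NJT, and `F.linear` over NJT are all supported), this exercises the patched backward with a trivially higher-dim input, mirroring the `unsqueeze()` case described above:

import torch
import torch.nn.functional as F

# Two variable-length sequences of 16-dim features, jagged layout.
nt = torch.nested.nested_tensor(
    [torch.randn(3, 16), torch.randn(5, 16)],
    layout=torch.jagged,
    requires_grad=True,
)
weight = torch.randn(32, 16, requires_grad=True)
bias = torch.randn(32, requires_grad=True)

# unsqueeze() makes the input trivially higher-dim: (B, j1, 16) -> (B, j1, 1, 16).
out = F.linear(nt.unsqueeze(2), weight, bias)  # (B, j1, 1, 32)

# Reduce to a scalar and backprop; this routes through linear_backward_default.
out.values().sum().backward()
print(weight.grad.shape, bias.grad.shape)  # torch.Size([32, 16]) torch.Size([32])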