
Commit 5b51a4f

Update on "Support generic stream/event on XPU backend"
# Motivation
According to #123611, we support generic stream/event on the XPU backend.

# Additional Context
New methods/attributes on `torch.Event`:
- torch.Event.event_id
- torch.Event.elapsed_time
- torch.Event.synchronize

New methods on `c10::Event`:
- c10::Event::event_id
- c10::Event::elapsed_time
- c10::Event::synchronize

cc jgong5 mingfeima XiaobingSuper sanchitintel ashokei jingxu10 gujinghui EikanWang fengyuan14

[ghstack-poisoned]
2 parents 76999d5 + c295579 commit 5b51a4f
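For readers of the commit message above, here is a minimal, hypothetical sketch of how the new `torch.Event` surface might be exercised. It is not part of this diff; the constructor arguments are assumptions based on the generic Stream/Event API, and it requires a torch build with XPU support.

    import torch

    # Assumed generic API: a stream and two timing-enabled events on the XPU backend.
    stream = torch.Stream(device="xpu")
    start = torch.Event(device="xpu", enable_timing=True)
    end = torch.Event(device="xpu", enable_timing=True)

    start.record(stream)
    # ... enqueue work on `stream` here ...
    end.record(stream)

    end.synchronize()                  # new: block the host until the event completes
    print(start.elapsed_time(end))     # new: elapsed milliseconds between the two events
    print(end.event_id)                # new: backend-specific id of the underlying event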

File tree

22 files changed: +246, -1403 lines

.github/ci_commit_pins/vision.txt

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-06ad737628abc3a1e617571dc03cbdd5b36ea96a
+d23a6e1664d20707c11781299611436e1f0c104f

aten/src/ATen/cpu/Utils.cpp

Lines changed: 0 additions & 16 deletions
@@ -5,22 +5,6 @@
 
 namespace at::cpu {
 
-bool is_cpu_support_avx2() {
-#if !defined(__s390x__) && !defined(__powerpc__)
-  return cpuinfo_initialize() && cpuinfo_has_x86_avx2();
-#else
-  return false;
-#endif
-}
-
-bool is_cpu_support_avx512() {
-#if !defined(__s390x__) && !defined(__powerpc__)
-  return cpuinfo_initialize() && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512vl() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq();
-#else
-  return false;
-#endif
-}
-
 bool is_cpu_support_vnni() {
 #if !defined(__s390x__) && !defined(__powerpc__)
   return cpuinfo_initialize() && cpuinfo_has_x86_avx512vnni();

aten/src/ATen/cpu/Utils.h

Lines changed: 0 additions & 3 deletions
@@ -4,9 +4,6 @@
 
 namespace at::cpu {
 
-TORCH_API bool is_cpu_support_avx2();
-TORCH_API bool is_cpu_support_avx512();
-
 // Detect if CPU support Vector Neural Network Instruction.
 TORCH_API bool is_cpu_support_vnni();

c10/xpu/impl/XPUGuardImpl.h

Lines changed: 1 addition & 2 deletions
@@ -158,8 +158,7 @@ struct XPUGuardImpl final : public c10::impl::DeviceGuardImplInterface {
     const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
     if (C10_UNLIKELY(interp)) {
       (*interp)->trace_gpu_event_synchronization(
-          c10::kXPU,
-          reinterpret_cast<uintptr_t>(xpu_event));
+          c10::kXPU, reinterpret_cast<uintptr_t>(xpu_event));
     }
     xpu_event->wait_and_throw();
   }

test/distributed/_tensor/test_dtensor_compile.py

Lines changed: 27 additions & 1 deletion
@@ -60,7 +60,7 @@ def forward(self, input):
 
 
 def extract_graph(fx_g, _, graph_cell):
-    graph_cell[0] = fx_g
+    graph_cell[0] = fx_g.code
     return fx_g
 
 
@@ -481,6 +481,32 @@ def fn(x_dt):
         res = opt_fn(x_dt)
         self.assertEqual(ref, res)
 
+    def test_graph_input_is_async(self):
+        mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
+
+        def fn(x):
+            return x.sin().sin()
+
+        opt_fn = torch.compile(fn, backend=aot_eager_graph, fullgraph=True)
+
+        x = torch.randn(4, 4, requires_grad=True)
+        x_dt = DTensor.from_local(x, mesh, [Shard(0)], run_check=False)
+        x2 = x_dt.redistribute(mesh, [Replicate()], async_op=True)
+        x2 = x2.to_local()
+        out = opt_fn(x2)
+        # The important part: we get a wait_tensor() in the graph.
+        # At runtime, the input to the graph is an AsyncCollectiveTensor,
+        # and inside the graph we need to issue a wait() to synchronize.
+        self.assertExpectedInline(
+            str(fw_graph_cell[0]).strip(),
+            """\
+def forward(self, primals_1):
+    wait_tensor = torch.ops._c10d_functional.wait_tensor.default(primals_1)
+    sin = torch.ops.aten.sin.default(wait_tensor)
+    sin_1 = torch.ops.aten.sin.default(sin); sin = None
+    return [sin_1, primals_1, wait_tensor]""",
+        )
+
     @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
     def test_dtensor_partial_placement_graph_output(self):
         mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
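The new test above hinges on one behavior worth spelling out: with `async_op=True`, `redistribute` returns before the collective finishes, so the compiled graph receives an `AsyncCollectiveTensor` and must insert an explicit `wait_tensor()`. A rough eager-mode sketch of that flow, reusing the names from the test (illustrative only, not part of the diff):

    # redistribute(..., async_op=True) issues the collective without waiting,
    # so to_local() hands back an AsyncCollectiveTensor rather than a plain tensor.
    x2 = x_dt.redistribute(mesh, [Replicate()], async_op=True).to_local()

    # The first op that consumes x2 must synchronize first; torch.compile makes
    # that explicit by placing wait_tensor() ahead of sin() in the traced graph.
    y = x2.sin().sin()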

test/inductor/test_debug_trace.py

Lines changed: 25 additions & 0 deletions
@@ -9,6 +9,8 @@
 
 import torch
 from torch._inductor import config, test_operators
+from torch.testing._internal.common_cuda import TEST_CUDA
+from torch.utils._triton import has_triton
 
 try:
     try:
@@ -168,6 +170,29 @@ def body(self, ops):
         # intentionally only cleanup on success so debugging test is easier
         shutil.rmtree(filename)
 
+    @unittest.skipIf(not TEST_CUDA or not has_triton(), "requires cuda")
+    def test_debug_multi_tempalte(self):
+        class ToyModel(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.l = torch.nn.Linear(100, 100)
+                self.relu = torch.nn.ReLU()
+
+            def forward(self, x):
+                return self.relu(self.l(x))
+
+        # no failure
+
+        from torch._inductor.utils import fresh_inductor_cache
+
+        with self.assertLogs(
+            logging.getLogger("torch._inductor.debug"), level=logging.WARNING
+        ), fresh_inductor_cache():
+            m = ToyModel().to(device="cuda:0")
+            m = torch.compile(m, mode="max-autotune")
+            input_tensor = torch.randn(100).to(device="cuda:0")
+            m(input_tensor)
+
 
 if __name__ == "__main__":
     from torch._inductor.test_case import run_tests

test/inductor/test_torchinductor.py

Lines changed: 0 additions & 5 deletions
@@ -6245,11 +6245,6 @@ def fn(x):
 
         self.common(fn, [torch.randn(64, 64)])
 
-    def test_new_cpp_build_logical(self):
-        from torch._inductor.codecache import validate_new_cpp_commands
-
-        validate_new_cpp_commands()
-
     def test_as_strided(self):
         def fn(x):
             return (

test/run_test.py

Lines changed: 0 additions & 2 deletions
@@ -229,9 +229,7 @@ def __contains__(self, item):
     "nn/test_pooling",
     "nn/test_convolution",  # Doesn't respect set_per_process_memory_fraction, results in OOM for other tests in slow gradcheck
     "distributions/test_distributions",
-    "functorch/test_vmap",  # OOM
     "test_fx",  # gets SIGKILL
-    "test_dataloader",  # frequently hangs for ROCm
     "functorch/test_memory_efficient_fusion",  # Cause CUDA OOM on ROCm
     "test_utils",  # OOM
     "test_sort_and_select",  # OOM

test/test_linalg.py

Lines changed: 2 additions & 1 deletion
@@ -18,7 +18,7 @@
     TEST_WITH_ROCM, IS_FBCODE, IS_REMOTE_GPU, iter_indices,
     make_fullrank_matrices_with_distinct_singular_values,
     freeze_rng_state, IS_ARM64, IS_SANDCASTLE, TEST_OPT_EINSUM, parametrize, skipIfTorchDynamo,
-    setBlasBackendsToDefaultFinally, setLinalgBackendsToDefaultFinally)
+    setBlasBackendsToDefaultFinally, setLinalgBackendsToDefaultFinally, serialTest)
 from torch.testing._internal.common_device_type import \
     (instantiate_device_type_tests, dtypes, has_cusolver, has_hipsolver,
      onlyCPU, skipCUDAIf, skipCUDAIfNoMagma, skipCPUIfNoLapack, precisionOverride,
@@ -2485,6 +2485,7 @@ def run_subtest(actual_rank, matrix_size, batches, device, svd_lowrank, **option
     @precisionOverride({torch.float: 1e-4, torch.cfloat: 2e-4})
     @setLinalgBackendsToDefaultFinally
     @dtypes(*floating_and_complex_types())
+    @serialTest()
     def test_svd(self, device, dtype):
         # tests linalg.svd, svd, linalg.svdvals
         make_arg = partial(make_tensor, dtype=dtype, device=device)

torch/_C/_cpu.pyi

Lines changed: 0 additions & 2 deletions
@@ -2,6 +2,4 @@ from torch.types import _bool
 
 # Defined in torch/csrc/cpu/Module.cpp
 
-def _is_cpu_support_avx2() -> _bool: ...
-def _is_cpu_support_avx512() -> _bool: ...
 def _is_cpu_support_vnni() -> _bool: ...
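With the AVX2/AVX512 stubs gone, `_is_cpu_support_vnni` is the only capability probe left in this module. A minimal sketch of calling the surviving private binding (assuming a build that still exposes it):

    import torch

    # Private binding kept by this commit; the avx2/avx512 variants were removed.
    print(torch._C._cpu._is_cpu_support_vnni())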
