
Commit 378edb0

karthickai authored and pytorchmergebot committed
[Inductor] Add DeviceAssert op to enable device-side assertion in torch.compile (#160677)
This PR introduces a device_assert op to trigger device-side assertions within torch.compile. The implementation follows the suggestion in this comment on #147282.

Changes included:
- Implemented the device_assert op and overrode has_side_effect to return True so dead-code elimination does not remove it.
- Removed the assert_async_msg_decomp and functional_assert_async_msg_decomp decompositions to disable the default no-op assert decomposition inside Inductor.
- Added a lowering for torch.ops.aten._assert_async.msg that converts assert calls into the ops handler.
- Implemented the codegen method for the device_assert op, generating both C++ and Triton code.
- Added test cases covering both "should throw" and "should not throw" scenarios.

Fixes #147282

Pull Request resolved: #160677
Approved by: https://github.com/mlazos
1 parent d2db6c8 commit 378edb0
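
In practical terms, a minimal usage sketch (the function below is illustrative, not part of the PR; it assumes a CUDA device and the Triton backend): a plain Python assert on a tensor condition inside a compiled function is now lowered to a device-side assertion instead of being silently dropped by the old no-op decompositions.

import torch

def guarded(x):
    # Dynamo rewrites this assert into aten._assert_async.msg in the graph;
    # with this PR, Inductor lowers it to a device-side assert (tl.device_assert
    # on CUDA, a C++ throw with the cpp_wrapper backend) instead of a no-op.
    assert torch.all(x > 0), "should throw"
    return x * 2

compiled = torch.compile(guarded, backend="inductor")
compiled(torch.tensor([1.0, 2.0], device="cuda"))       # passes
try:
    compiled(torch.tensor([1.0, -2.0], device="cuda"))
    torch.cuda.synchronize()  # device-side asserts surface asynchronously
except Exception as e:
    print(f"assertion fired: {e}")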

11 files changed (+298, -16 lines)

Lines changed: 204 additions & 0 deletions (new test file)
@@ -0,0 +1,204 @@
# Owner(s): ["module: inductor"]
import os
import subprocess
import sys

import torch
import torch._inductor.config
from torch._inductor import metrics
from torch._inductor.compiler_bisector import BisectionResult, CompilerBisector
from torch._inductor.test_case import run_tests, TestCase
from torch.testing._internal.common_utils import skipIfRocm
from torch.testing._internal.triton_utils import requires_cuda_and_triton


class TestTorchDeviceAssertTrigger(TestCase):
    def _run_assert_should_throw(self, device):
        def func():
            a = torch.tensor([1.0, -2.0], device=device)
            result = torch.all(a > 0)
            assert result, "should throw"

        def test_fn():
            torch._dynamo.reset()
            f_c = torch.compile(func)

            try:
                f_c()
                return False
            except Exception:
                return True

        bisect_result = CompilerBisector.do_bisect(test_fn)
        # do_bisect returns None if every subsystem passes, otherwise a BisectionResult
        self.assertNotIsInstance(bisect_result, BisectionResult)

    def _run_assert_should_not_throw(self, device):
        def func():
            a = torch.tensor([1.0, 2.0], device=device)
            result = torch.all(a > 0)
            assert result, "should throw"

        def test_fn():
            torch._dynamo.reset()
            f_c = torch.compile(func)

            try:
                f_c()
                return True
            except Exception:
                return False

        bisect_result = CompilerBisector.do_bisect(test_fn)
        self.assertNotIsInstance(bisect_result, BisectionResult)

    def _run_assert_inline_expression_should_throw(self, device):
        def func():
            a = torch.tensor([1.0, -2.0], device=device)
            assert torch.all(a > 0), "should throw"

        def test_fn():
            torch._dynamo.reset()
            f_c = torch.compile(func)

            try:
                f_c()
                return False
            except Exception:
                return True

        bisect_result = CompilerBisector.do_bisect(test_fn)
        self.assertNotIsInstance(bisect_result, BisectionResult)

    def _run_assert_inline_expression_should_not_throw(self, device):
        def func():
            a = torch.tensor([1.0, 2.0], device=device)
            assert torch.all(a > 0), "should throw"

        def test_fn():
            torch._dynamo.reset()
            f_c = torch.compile(func)

            try:
                f_c()
                return True
            except Exception:
                return False

        bisect_result = CompilerBisector.do_bisect(test_fn)
        self.assertNotIsInstance(bisect_result, BisectionResult)

    @torch._inductor.config.patch(force_disable_caches=True)
    def test_assert_should_throw(self):
        device = "cpu"
        self._run_assert_should_throw(device)
        self._run_assert_inline_expression_should_throw(device)

    @torch._inductor.config.patch(force_disable_caches=True)
    def test_assert_should_not_throw(self):
        device = "cpu"
        self._run_assert_should_not_throw(device)
        self._run_assert_inline_expression_should_not_throw(device)

    @torch._inductor.config.patch(force_disable_caches=True, cpp_wrapper=True)
    def test_assert_should_throw_cpp_wrapper(self):
        device = "cpu"
        self._run_assert_should_throw(device)
        self._run_assert_inline_expression_should_throw(device)

    @torch._inductor.config.patch(force_disable_caches=True, cpp_wrapper=True)
    def test_assert_should_not_throw_cpp_wrapper(self):
        device = "cpu"
        self._run_assert_should_not_throw(device)
        self._run_assert_inline_expression_should_not_throw(device)

    @requires_cuda_and_triton
    @skipIfRocm
    @torch._inductor.config.patch(force_disable_caches=True)
    def test_assert_fusion(self):
        torch._logging.set_logs(inductor_metrics=True)

        def func():
            a = torch.tensor([1.0, 2.0], device="cuda")
            result = torch.all(a > 0)
            assert result, "should throw"

        torch._dynamo.reset()
        f_c = torch.compile(func, backend="inductor")
        metrics.reset()
        self.assertEqual(metrics.generated_kernel_count, 0)
        f_c()
        self.assertEqual(metrics.generated_kernel_count, 1)
        torch._logging.set_logs()

    @requires_cuda_and_triton
    @skipIfRocm
    @torch._inductor.config.patch(force_disable_caches=True)
    def test_run_assert_triton(self):
        should_throw = """
import torch
import torch._dynamo

def func_should_throw():
    a = torch.tensor([1.0, -2.0], device='cuda')
    result = torch.all(a > 0)
    assert result, "should throw"

def test_fn():
    torch._dynamo.reset()
    f_c = torch.compile(func_should_throw, backend="inductor")

    try:
        f_c()
        torch.cuda.synchronize()
        return False
    except Exception:
        return True

result = test_fn()
print(f"Test result: {result}")
"""

        should_not_throw = """
import torch
import torch._dynamo

def func_should_not_throw():
    a = torch.tensor([1.0, 2.0], device='cuda')
    result = torch.all(a > 0)
    assert result, "should throw"

def test_fn():
    torch._dynamo.reset()
    f_c = torch.compile(func_should_not_throw, backend="inductor")

    try:
        f_c()
        torch.cuda.synchronize()
        return True
    except Exception:
        return False

result = test_fn()
print(f"Test result: {result}")
"""
        for script in [should_not_throw, should_throw]:
            p = subprocess.run(
                [sys.executable, "-c", script],
                cwd=os.path.dirname(os.path.realpath(__file__)),
                capture_output=True,
                text=True,
            )

            output = p.stdout + "\n" + p.stderr

            self.assertIn("Test result: True", output)

            if p.returncode != 0:
                self.fail(
                    f"Subprocess failed with return code {p.returncode}. Output: {output}"
                )


if __name__ == "__main__":
    run_tests()

torch/_inductor/codegen/cpp.py

Lines changed: 4 additions & 0 deletions
@@ -1119,6 +1119,10 @@ def sign(x):
         code.writeline("()")
         return code

+    @staticmethod
+    def device_assert_async(cond, msg):
+        return f'({cond} ? 0 : (throw std::runtime_error("{msg}"), 0))'
+

 CppOverrides._initialize_pointwise_overrides("cpp")

torch/_inductor/codegen/halide.py

Lines changed: 4 additions & 0 deletions
@@ -566,6 +566,10 @@ def masked(mask, body, other):
     def frexp(x):
         raise NotImplementedError("frexp")

+    @staticmethod
+    def device_assert_async(cond, msg):
+        raise NotImplementedError("device_assert_async")
+

 HalideOverrides._initialize_pointwise_overrides("halide")

torch/_inductor/codegen/triton.py

Lines changed: 4 additions & 0 deletions
@@ -1592,6 +1592,10 @@ def frexp(x):
         V.kernel.cse.put(cache_key, (mantissa, exponent))
         return (mantissa, exponent)

+    @staticmethod
+    def device_assert_async(cond, msg):
+        return f"tl.device_assert({cond}, {repr(msg)})"
+

 class HelperFunctions:
     """An ordered set of helper functions."""

torch/_inductor/decomposition.py

Lines changed: 0 additions & 13 deletions
@@ -158,19 +158,6 @@ def _embedding_dense_backward(
     )


-# TODO: for now, inductor doesn't handle asserts
-# because the condition is symbol -> tensor in the graph.
-@register_decomposition([aten._assert_async.msg])
-def assert_async_msg_decomp(tensor: torch.Tensor, msg: str) -> None:
-    return
-
-
-# Following `assert_async_msg_decomp` and implement as non-op.
-@register_decomposition([aten._functional_assert_async.msg])
-def functional_assert_async_msg_decomp(tensor: torch.Tensor, msg: str) -> None:
-    return
-
-
 @register_decomposition([aten.sym_constrain_range_for_size.default])
 def sym_constrain_range_for_size(
     symbol: torch.SymInt,

torch/_inductor/dtype_propagation.py

Lines changed: 4 additions & 0 deletions
@@ -373,6 +373,10 @@ def placeholder(self, index: int) -> torch.dtype:
             f"{type(self).__name__}: ops.placeholder should not appear here"
         )

+    @staticmethod
+    def device_assert_async(cond, msg: str) -> torch.dtype:
+        return torch.bool
+

 if TYPE_CHECKING:

torch/_inductor/ir.py

Lines changed: 16 additions & 1 deletion
@@ -1094,7 +1094,10 @@ def constant_to_device(self, device: torch.device) -> IRNode:
         loader = self.make_loader()
         loader = patch.object(ConstantBuffer, "override_device", device)(loader)
         return Pointwise(
-            device=device, dtype=self.dtype, inner_fn=loader, ranges=self.ranges
+            device=device,
+            dtype=self.dtype,
+            inner_fn=loader,
+            ranges=self.ranges,
         )

@@ -4423,6 +4426,17 @@ class ComputedBuffer(OperationBuffer):
     """

     data: Loops
+    _force_realize: ClassVar[bool] = False
+
+    @staticmethod
+    @contextlib.contextmanager
+    def force_realize() -> Iterator[None]:
+        old_value = ComputedBuffer._force_realize
+        try:
+            ComputedBuffer._force_realize = True
+            yield
+        finally:
+            ComputedBuffer._force_realize = old_value

     def get_computed_buffer_name(self) -> Optional[str]:
         """
@@ -4497,6 +4511,7 @@ def make_loader(self) -> Callable[[Sequence[Expr]], OpsValue]:
             not self.get_reduction_type()
             and self.name not in V.graph.mutated_buffers
             and self.num_reads() == 0
+            and not self._force_realize
         ):
             # inline this op rather than generating ops.load()
             return self.data.make_loader()
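
The ComputedBuffer changes exist so the boolean condition buffer is not inlined away: as the hunk above shows, make_loader() normally inlines a non-reduction buffer with zero reads, while the assert lowering (next file) needs the condition to stay materialized. A minimal standalone sketch of the same class-level-flag-plus-contextmanager pattern (hypothetical Buffer class, not the Inductor one):

import contextlib

class Buffer:
    _force_realize = False  # class-level switch, mirroring ComputedBuffer._force_realize

    @staticmethod
    @contextlib.contextmanager
    def force_realize():
        old = Buffer._force_realize
        try:
            Buffer._force_realize = True
            yield
        finally:
            Buffer._force_realize = old

    def can_inline(self) -> bool:
        # mirrors the extra "and not self._force_realize" guard in make_loader()
        return not self._force_realize

buf = Buffer()
print(buf.can_inline())        # True: normally eligible for inlining
with Buffer.force_realize():
    print(buf.can_inline())    # False: kept realized while the assert lowering runs
print(buf.can_inline())        # True: flag restored on exit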

torch/_inductor/lowering.py

Lines changed: 33 additions & 0 deletions
@@ -1329,6 +1329,39 @@ def inner_fn(idx):
     )


+def _assert_async(cond, msg):
+    cond.realize()
+    cond = to_dtype(cond, torch.bool)
+
+    def inner_fn(index):
+        if hasattr(cond.data, "data") and hasattr(cond.data.data, "force_realize"):
+            with cond.data.data.force_realize():
+                cond_loader = cond.make_loader()
+                return ops.device_assert_async(cond_loader(index), msg)
+        else:
+            cond_loader = cond.make_loader()
+            return ops.device_assert_async(cond_loader(index), msg)
+
+    assertion_op = Pointwise.create(
+        device=cond.get_device(),
+        dtype=cond.get_dtype(),
+        inner_fn=inner_fn,
+        ranges=list(cond.get_size()),
+    )
+    assertion_op.realize()
+    return assertion_op
+
+
+@register_lowering(aten._assert_async.msg)
+def lower_assert_async(cond, msg):
+    return _assert_async(cond, msg)
+
+
+@register_lowering(aten._functional_assert_async.msg)
+def lower_assert_functional_async(cond, msg):
+    return _assert_async(cond, msg)
+
+
 @register_lowering(
     quantized_decomposed.dequantize_per_channel, type_promotion_kind=None
 )
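
Both registered lowerings funnel into _assert_async, which realizes the condition, casts it to bool, and emits one ops.device_assert_async call per element of the condition via a Pointwise node. A small sketch of exercising the aten._assert_async.msg path directly (assuming torch._assert_async accepts a message argument, which dispatches to the .msg overload):

import torch

def check(x):
    cond = torch.all(x > 0)
    # This is the op the lowering above is registered for; under torch.compile
    # it now produces an in-kernel assertion rather than decomposing to a no-op.
    torch._assert_async(cond, "x must be positive")
    return x + 1

compiled = torch.compile(check, backend="inductor")
print(compiled(torch.tensor([1.0, 2.0])))   # passes on CPU
# compiled(torch.tensor([1.0, -2.0]))       # would raise from the generated kernel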

torch/_inductor/ops_handler.py

Lines changed: 12 additions & 0 deletions
@@ -706,6 +706,9 @@ def placeholder(self, index: int) -> T:
         """This is a fake op used in analysis but not codegen"""
         raise NotImplementedError

+    def device_assert_async(self, cond: T, msg: str) -> T:
+        raise NotImplementedError
+

 _ignore_op_re = re.compile(r"_.*|paren").fullmatch

@@ -788,6 +791,9 @@ def {target}(self, {", ".join(args)}):
             if target in OP_NAMES:
                 setattr(cls, target, impl)

+    def device_assert_async(self, cond, msg):
+        return None
+

 DefaultHandler._init_cls()

@@ -933,6 +939,9 @@ def sort(dtypes, values, stable, descending):
     def indirect_indexing(index_var, size, check=True, wrap_neg=True) -> sympy.Symbol:
         return sympy_index_symbol(str(index_var))

+    def device_assert_async(self, cond, msg):
+        return None
+

 class KernelFormatterHandler(DefaultHandler):
     def __init__(self, parent_handler: OpsHandler[Any]):

@@ -999,6 +1008,9 @@ def getvalue(self, result):
         self._output.writeline(f"return {result}")
         return self._output.getvalue()

+    def device_assert_async(self, cond, msg: str):
+        return f"ops.device_assert_async({cond}, {msg})"
+

 class WrapperHandler(DefaultHandler):
     def __init__(self, inner: OpsHandler[Any]):
