
Commit 2267787

YUNQIUGUO authored and facebook-github-bot committed
[AOTI] Fix a special case compile time data type codegen for sym int variables (#138106)
Summary: This change unblocks the CFR AOTI lowering runtime error. TL;DR: In this model, one Triton kernel expects a scalar input dtype of i64 but receives an i32. The reason is that `auto` can infer a smaller data type when the variable passed in is, e.g., an i32, which causes a CUDA illegal memory access (IMA). The original problematic kernel is `triton_poi_fused_add_ge_logical_and_logical_or_lt_46_grid_100`, whose third input was declared as `auto var_402 = u0`. This diff explicitly declares symbolic arguments as i64 at compile time for i64 Triton kernel inputs, instead of using `auto var_x = {arg}` in the cpp wrapper code.

Test Plan: Verified in FLB locally:

```
PYTORCH_NO_CUDA_MEMORY_CACHING=1 AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER=3 TORCH_LOGS="output_code" TORCHINDUCTOR_MAX_AUTOTUNE=1 TORCH_SHOW_CPP_STACKTRACES=1 CUDA_LAUNCH_BLOCKING=1 ~/fbsource/buck-out/v2/gen/fbcode/98e643f8bb44fe9d/hpc/new/models/feed/benchmark/__feed_lower_benchmark__/feed_lower_benchmark.par --skip-eager --skip-flop-estimation --lower-backend="AOT_INDUCTOR" --sync-mode=0 --precision bf16 --output-precision bf16 --lower-presets="ifr_cint;disable_new_lowering_weights;disable_dper_passes:passes=fuse_parallel_linear_no_weight_change" --remove-unexpected-type-cast=False --load="manifold://ads_storage_fblearner/tree/user/facebook/fblearner/predictor/924293663/0/gpu_lowering/input.merge"
```

Differential Revision: D64490039
1 parent 620039c commit 2267787
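
To make the summary concrete, here is a minimal Python sketch of the codegen idea; the `decl_line` helper below is hypothetical and only illustrates the behavior, while the real change lives in `generate_args_decl` in `torch/_inductor/codegen/cpp_wrapper_gpu.py`:

```
# Minimal sketch, assuming a hypothetical decl_line helper (not the actual Inductor API).
signature2dtype = {"i32": "int32_t", "i64": "int64_t", "fp32": "float"}

def decl_line(var_name: str, expr: str, arg_signature=None) -> str:
    # Declare symbolic scalar args with the dtype from the Triton signature
    # instead of `auto`, so a 32-bit value cannot narrow an i64 kernel input.
    if arg_signature in signature2dtype:
        return f"{signature2dtype[arg_signature]} {var_name} = {expr};"
    return f"auto {var_name} = {expr};"  # previous behavior, kept as the fallback

print(decl_line("var_402", "u0", "i64"))  # -> int64_t var_402 = u0;
print(decl_line("var_402", "u0"))         # -> auto var_402 = u0;
```

With an explicit `int64_t` declaration, a `u0` that arrives as a 32-bit value is widened before being passed to the kernel, rather than silently kept at 32 bits by `auto`.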

File tree

2 files changed: +87 -6 lines changed

test/inductor/test_aot_inductor.py

Lines changed: 49 additions & 0 deletions
@@ -14,6 +14,7 @@
 import torch._inductor
 import torch._inductor.config
 import torch.nn as nn
+from torch._dynamo import config as dynamo_config
 from torch._dynamo.testing import rand_strided, same
 from torch._dynamo.utils import counters
 from torch._inductor import config
@@ -3608,6 +3609,54 @@ def forward(self, x):
         example_inputs = (torch.randn(8, device=self.device),)
         self.check_model(Model(), example_inputs)
 
+    @dynamo_config.patch({"capture_scalar_outputs": True})
+    def test_sym_i64_input_codegen(self):
+        if self.device != "cuda":
+            raise unittest.SkipTest("requires CUDA")
+
+        from torch.testing._internal.triton_utils import add_kernel
+
+        class Model(torch.nn.Module):
+            def __init__(self) -> None:
+                super().__init__()
+
+            def forward(self, x):
+                x_symint = x.item()
+                a = torch.ones(x_symint, device="cuda")
+                b = torch.ones(x_symint, device="cuda")
+                out = torch.zeros_like(a)
+                # unbacked symint in grid
+                add_kernel[(1, 1, x_symint)](a, b, out, x_symint, 32)
+                return out
+
+        example_inputs = (
+            torch.randint(high=1024, size=(1,), device=self.device, dtype=torch.int32),
+        )
+        # This simple unit test case model generates two triton kernels:
+        # 1. triton_poi_fused_ones_1:
+        #    triton_meta={'signature': {'out_ptr0': '*fp32', 'xnumel': 'i64'}
+        # 2. add_kernel:
+        #    triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'out_ptr': '*fp32', 'n_elements': 'i64'}
+        # Input u0 is initially defined as int32_t; verify that every kernel var arg downstream
+        # is explicitly declared with its dtype in the cpp wrapper codegen code.
+        expected_scalar_args = [
+            "int64_t var_1 = u0;",
+            "int64_t var_3 = u0;",
+            "int64_t var_5 = u0;",
+            "int64_t var_9 = u0;",
+        ]
+        # Check that the new codegen behavior matches expectations.
+        result, code = run_and_get_cpp_code(
+            AOTIRunnerUtil.compile, Model(), example_inputs
+        )
+        for scalar_line in expected_scalar_args:
+            FileCheck().check_count(
+                scalar_line,
+                1,
+            ).run(code)
+
+        self.check_model(Model(), example_inputs)
 
 
 common_utils.instantiate_parametrized_tests(AOTInductorTestsTemplate)

torch/_inductor/codegen/cpp_wrapper_gpu.py

Lines changed: 38 additions & 6 deletions
@@ -1,7 +1,7 @@
 # mypy: allow-untyped-defs
 import functools
 import os
-from itertools import chain, count
+from itertools import chain, count, zip_longest
 from typing import Any, Callable, List, Optional, Tuple, TYPE_CHECKING, Union
 
 import sympy
@@ -286,9 +286,17 @@ def generate_load_kernel_once(
             self.writeline("}")
         return kernel_var_name
 
-    def generate_args_decl(self, call_args, arg_types):
+    def generate_args_decl(self, call_args, arg_types, arg_signatures):
         new_args = []
-        for arg, arg_type in zip(call_args, arg_types):
+
+        # Add more cases for other types as needed
+        signature2dtype = {
+            "i32": "int32_t",
+            "i64": "int64_t",
+            "fp32": "float",
+        }
+
+        def process_args(arg, arg_type, arg_signature=None):
             var_name = f"var_{next(self.arg_var_id)}"
             if isinstance(arg_type, torch_dtype):
                 if arg.endswith(".item()"):
@@ -312,10 +320,26 @@ def generate_args_decl(self, call_args, arg_types):
                 self.writeline(f"int {var_name} = {self.expr_printer(arg)};")
             elif arg_type in (sympy.Float, float):
                 self.writeline(f"float {var_name} = {self.expr_printer(arg)};")
+            # For symbolic call arguments, examine the arg signatures from triton meta
+            # to explicitly cast to the right type.
+            # Reason: `auto` can infer an unexpected type that mismatches the kernel
+            # input signature.
+            elif (
+                isinstance(arg_type, type(SymbolicCallArg))
+                and arg_signature is not None
+                and arg_signature in signature2dtype.keys()
+            ):
+                self.writeline(
+                    f"{signature2dtype[arg_signature]} {var_name} = {self.expr_printer(arg)};"
+                )
             else:
                 self.writeline(f"auto {var_name} = {self.expr_printer(arg)};")
             new_args.append(f"&{var_name}")
 
+        for arg, arg_type, arg_signature in zip_longest(
+            call_args, arg_types, arg_signatures
+        ):
+            process_args(arg, arg_type, arg_signature)
+
         return ", ".join(new_args)
 
     def generate_default_grid(
@@ -392,18 +416,26 @@ def generate_kernel_call(
             # args with value 1 are added into equal_to_1 and constants
             # in triton_meta (in the Python codegen) which makes them
             # inlined in the PTX and compiled CUBIN
+            arg_signatures = []
             if (
                 triton_meta is not None
-                and "configs" in triton_meta
-                and triton_meta["configs"]
+                and triton_meta.get("configs")
+                and triton_meta.get("signature")
             ):
                 equal_to_1 = triton_meta["configs"][0].equal_to_1
                 call_args = [
                     arg for i, arg in enumerate(call_args) if i not in equal_to_1
                 ]
                 arg_types = [t for i, t in enumerate(arg_types) if i not in equal_to_1]
+                # extract the arg signatures from triton_meta
+                arg_signatures = triton_meta["signature"].values()
+                arg_signatures = [
+                    v for i, v in enumerate(arg_signatures) if i not in equal_to_1
+                ]
 
-            call_args_str = self.generate_args_decl(call_args, arg_types)
+            call_args_str = self.generate_args_decl(
+                call_args, arg_types, arg_signatures
+            )
             kernel_args_var = f"kernel_args_var_{next(self.kernel_callsite_id)}"
             self.writeline(f"void* {kernel_args_var}[] = {{{call_args_str}}};")
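
One design note on the `zip_longest` loop above: when a kernel has no usable `triton_meta` (so `arg_signatures` stays an empty list), the missing signatures are padded with `None` and `process_args` falls back to the old `auto` declaration. A small standalone sketch with illustrative values only:

```
from itertools import zip_longest

# Illustrative values only; in the wrapper codegen these come from the kernel
# call site and from triton_meta["signature"].
call_args = ["buf0", "buf1", "out0", "u0"]
arg_types = ["*fp32", "*fp32", "*fp32", "SymbolicCallArg"]
arg_signatures = []  # e.g. no usable triton_meta for this kernel

# zip_longest pads the missing signatures with None, so every argument is
# still processed and takes the `auto var_x = ...;` fallback path.
for arg, arg_type, sig in zip_longest(call_args, arg_types, arg_signatures):
    print(arg, arg_type, sig)
```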

0 commit comments
