Skip to content

Commit b20b51c

Browse files
yushangdi
authored and facebook-github-bot committed
Fix too big to optimize in test, actually use O0 when aot_inductor.compile_wrapper_with_O0 is set (#148714)
Summary: 1. Check against the "0" char instead. 2. We got the following error when using anything other than the O0 flag: `error: Function _ZN5torch12aot_inductorL22__check_inputs_outputsEPP16AtenTensorOpaqueS3_ is too big to optimize [-Werror,-Wignored-optimization-argument]`. So we use the O0 flag in wrapper code when `aot_inductor.compile_wrapper_opt_level` is set to `O0`. Test Plan: ``` buck run 'fbcode//mode/opt' fbcode//deeplearning/aot_inductor/cpu/test:ads_second_stage_dsnn_models_aoti_lowering_test -- -r AdsSecondStageDSNNModelsAOTILoweringTest ``` Reviewed By: desertfire Differential Revision: D70670957
1 parent 1e37e5b commit b20b51c

File tree

5 files changed

+8
-45
lines changed

5 files changed

+8
-45
lines changed

test/inductor/test_aot_inductor.py

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -139,24 +139,6 @@ def forward(self, x, y):
139139
model, example_inputs, "AOTInductorModelRunMinimalArrayrefInterface(", 1
140140
)
141141

142-
def test_compile_wrapper_with_O0(self):
143-
class Model(torch.nn.Module):
144-
def __init__(self) -> None:
145-
super().__init__()
146-
self.linear = torch.nn.Linear(10, 10)
147-
148-
def forward(self, x, y):
149-
return x + self.linear(y)
150-
151-
example_inputs = (
152-
torch.randn(10, 10, device=self.device),
153-
torch.randn(10, 10, device=self.device),
154-
)
155-
model = Model()
156-
with config.patch("aot_inductor.compile_wrapper_with_O0", True):
157-
self.check_model(model, example_inputs)
158-
self.code_check_count(model, example_inputs, "__attribute__((", 2)
159-
160142
def test_small_constant(self):
161143
class Model(torch.nn.Module):
162144
def __init__(self) -> None:

torch/_inductor/codegen/cpp_wrapper_cpu.py

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -407,7 +407,7 @@ def gen_check(handle_kind, idx, name, tensor):
407407
"""
408408
bool _check_aoti_runtime_check_inputs_env() {
409409
const static char* env_var_value = getenv("AOTI_RUNTIME_CHECK_INPUTS");
410-
const static bool result = env_var_value != nullptr && env_var_value[0] != 0;
410+
const static bool result = env_var_value != nullptr && env_var_value[0] != '0';
411411
return result;
412412
}
413413
@@ -461,17 +461,7 @@ def write_wrapper_decl(self):
461461
"""
462462
)
463463

464-
run_impl_proto = ""
465-
if config.aot_inductor.compile_wrapper_with_O0:
466-
run_impl_proto += """
467-
#ifdef __clang__
468-
__attribute__((optnone))
469-
#else
470-
__attribute__((optimize("O0")))
471-
#endif
472-
"""
473-
474-
run_impl_proto += """
464+
run_impl_proto = """
475465
void AOTInductorModel::run_impl(
476466
AtenTensorHandle*
477467
input_handles, // array of input AtenTensorHandle; handles

torch/_inductor/codegen/cpp_wrapper_cpu_array_ref.py

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -189,17 +189,7 @@ def write_wrapper_decl(self):
189189
"""
190190
)
191191

192-
run_impl_proto = ""
193-
if config.aot_inductor.compile_wrapper_with_O0:
194-
run_impl_proto += """
195-
#ifdef __clang__
196-
__attribute__((optnone))
197-
#else
198-
__attribute__((optimize("O0")))
199-
#endif
200-
"""
201-
202-
run_impl_proto += """
192+
run_impl_proto = """
203193
void AOTInductorModel::run_impl(
204194
AtenTensorHandle*
205195
input_handles, // array of input AtenTensorHandle; handles

torch/_inductor/config.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1168,9 +1168,9 @@ class aot_inductor:
11681168
debug_compile = os.environ.get("AOT_INDUCTOR_DEBUG_COMPILE", "0") == "1"
11691169

11701170
# Annotate generated main wrapper function, i.e. AOTInductorModel::run_impl,
1171-
# to skip cpp compiler optimizations for faster compilation.
1172-
compile_wrapper_with_O0 = (
1173-
os.environ.get("AOT_INDUCTOR_COMPILE_WRAPPER_WITH_O0", "0") == "1"
1171+
# to use which cpp compiler optimization level, default to O1
1172+
compile_wrapper_opt_level = os.environ.get(
1173+
"AOT_INDUCTOR_COMPILE_WRAPPER_OPT_LEVEL", "O1"
11741174
)
11751175

11761176
# option for debug printing/saving for intermediate tensor values for aot inductor

torch/_inductor/cpp_builder.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -542,10 +542,11 @@ def _get_optimization_cflags(
542542
if _IS_WINDOWS:
543543
return ["O1" if min_optimize else "O2"]
544544
else:
545+
wrapper_opt_level = config.aot_inductor.compile_wrapper_opt_level
545546
cflags = (
546547
["O0", "g"]
547548
if config.aot_inductor.debug_compile
548-
else ["O1" if min_optimize else "O3", "DNDEBUG"]
549+
else [wrapper_opt_level if min_optimize else "O3", "DNDEBUG"]
549550
)
550551
cflags += _get_ffast_math_flags()
551552
cflags.append("fno-finite-math-only")

0 commit comments

Comments (0)