
Commit 44186a0

isuruf authored and pytorchmergebot committed
1 parent 29ca448 commit 44186a0

20 files changed (+589 −579 lines)
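
Note: the diffs below all follow one theme — Inductor's sympy-expression printers now emit `//` and `%` with precedence-aware parenthesization, so expected strings across the test suite drop redundant parentheses, and guards print as evaluable Python relations instead of `Eq`/`Ne` calls. As a quick plain-Python sanity check (illustrative, not part of the commit), the removed parentheses carry no meaning:

    # The parentheses dropped in the expected strings below do not change values.
    s0 = 7
    assert (s0 // 2) == s0 // 2                        # bare '//' needs no parens
    assert ((2 * s0) % 3) == 2 * s0 % 3                # '*' and '%' share precedence, left-assoc
    assert (((2 * s0) % 3) == 0) == (2 * s0 % 3 == 0)  # '%' binds tighter than '=='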

test/distributed/test_inductor_collectives.py

Lines changed: 2 additions & 2 deletions
@@ -580,8 +580,8 @@ def example(inp, *, tag, ranks, group_size):
             .check_regex(
                 "torch.ops._c10d_functional.all_to_all_single.default\\("
                 "arg\\d+_\\d+, "
-                "\\[\\(s\\d+ // \\d\\), \\(s\\d+ // \\d\\)\\], "
-                "\\[\\(s\\d+ // \\d\\), \\(s\\d+ // \\d\\)\\]"
+                "\\[s\\d+ // \\d, s\\d+ // \\d\\], "
+                "\\[s\\d+ // \\d, s\\d+ // \\d\\]"
             )
             .run(code)
         )
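
A quick illustration (assumed example output, not from the test run): the updated pattern matches the new unparenthesized split-size printout and rejects the old form. The test doubles each backslash because its pattern lives inside a regular Python string; a raw string is used here instead.

    import re

    pat = re.compile(r"\[s\d+ // \d, s\d+ // \d\]")
    new_out = "torch.ops._c10d_functional.all_to_all_single.default(arg0_1, [s0 // 2, s0 // 2], [s0 // 2, s0 // 2])"
    old_out = "torch.ops._c10d_functional.all_to_all_single.default(arg0_1, [(s0 // 2), (s0 // 2)], [(s0 // 2), (s0 // 2)])"
    assert pat.search(new_out) is not None  # new printer output matches
    assert pat.search(old_out) is None      # old parenthesized form does not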

test/dynamo/test_export.py

Lines changed: 1 addition & 1 deletion
@@ -3570,7 +3570,7 @@ def forward(self, pred, x):
             "cast_symbool_to_symint_guardless(L['pred']) == 1",
         ]
         false_guard_code = [
-            "Ne(cast_symbool_to_symint_guardless(L['pred']), 1)",
+            "cast_symbool_to_symint_guardless(L['pred']) != 1",
         ]
         test_symbool_guards(
             f,
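
The old expected guard used sympy's default rendering of `Ne`, which is a function call and not directly evaluable as Python guard source; the new one is an ordinary relational expression. A minimal sympy sketch of the difference (assuming only stock sympy; the symbol name is illustrative):

    import sympy

    p = sympy.Symbol("p", integer=True)
    guard = sympy.Ne(p, 1)
    print(sympy.sstr(guard))  # -> Ne(p, 1): a call, not valid guard source
    guard_src = f"{sympy.sstr(guard.lhs)} != {sympy.sstr(guard.rhs)}"
    print(guard_src)          # -> p != 1: eval()s against a scope providing p
    assert eval(guard_src, {"p": 0}) is True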

test/dynamo/test_logging.py

Lines changed: 1 addition & 1 deletion
@@ -668,7 +668,7 @@ def f(x, y, z):
             """\
 +- LAMBDA_GUARD: L['x'].size()[0] == 2*L['z'].size()[0] # return x + torch.cat([y, z]) # #:# in # #:# in #
 +- LAMBDA_GUARD: L['y'].size()[0] == L['z'].size()[0] # duck sizing added this equality because these variables had the same size 3 (to avoid this specialization, set torch.fx.experimental._config.use_duck_shape = False)
-+- LAMBDA_GUARD: Eq(Mod(2*L['z'].size()[0], 3), 0) # if x.size(0) % 3 == 0: # #:# in # #:# in #
++- LAMBDA_GUARD: ((2*L['z'].size()[0]) % 3) == 0 # if x.size(0) % 3 == 0: # #:# in # #:# in #
 +- LAMBDA_GUARD: 2 <= L['z'].size()[0] # return x + torch.cat([y, z]) # #:# in # (user code shown is first use of this value--the guard itself is not due user code but due to 0/1 specialization in the framework; to avoid specialization try torch._dynamo.mark_unbacked(tensor, dim))""", # noqa: B950
         )
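
Same pattern for the divisibility guard: `Eq(Mod(...), 0)` is sympy call syntax, while the new text is plain Python that evaluates directly. A small sketch (the symbol name `n` is illustrative):

    import sympy

    n = sympy.Symbol("n", integer=True)
    print(sympy.sstr(sympy.Eq(sympy.Mod(2 * n, 3), 0)))  # -> Eq(Mod(2*n, 3), 0)
    guard_src = "((2*n) % 3) == 0"                        # the new printed form
    assert eval(guard_src, {"n": 3}) and not eval(guard_src, {"n": 4})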

test/dynamo/test_misc.py

Lines changed: 2 additions & 2 deletions
@@ -10457,7 +10457,7 @@ def test_shape_env_equal_evaluate_expr_divisible(self):
 ShapeEnv not equal: field values don't match:
 
 ==> axioms: values don't match.
->  Left: {0 < Mod(s0, 3): False, 0 <= Mod(s0, 3): True, Eq(0, Mod(s0, 3)): True, Eq(Mod(s0, 3), 0): True, Mod(s0, 3) < 0: False, Mod(s0, 3) <= 0: True, Ne(0, Mod(s0, 3)): False, Ne(Mod(s0, 3), 0): False}
+>  Left: {(Mod(s0, 3)) < 0: False, (Mod(s0, 3)) <= 0: True, 0 < (Mod(s0, 3)): False, 0 <= (Mod(s0, 3)): True, Eq(0, Mod(s0, 3)): True, Eq(Mod(s0, 3), 0): True, Ne(0, Mod(s0, 3)): False, Ne(Mod(s0, 3), 0): False}
 >  Right: {}
 ==> divisible: values don't match.
 >  Left: {Mod(s0, 3)}
@@ -10576,7 +10576,7 @@ def test_shape_env_equal_runtime_assert(self):
 ShapeEnv not equal: field values don't match:
 
 ==> axioms: values don't match.
->  Left: {0 < PythonMod(u0, 3): False, 0 <= PythonMod(u0, 3): True, Eq(0, PythonMod(u0, 3)): True, Eq(PythonMod(u0, 3), 0): True, Ne(0, PythonMod(u0, 3)): False, Ne(PythonMod(u0, 3), 0): False, PythonMod(u0, 3) < 0: False, PythonMod(u0, 3) <= 0: True}
+>  Left: {(PythonMod(u0, 3)) < 0: False, (PythonMod(u0, 3)) <= 0: True, 0 < (PythonMod(u0, 3)): False, 0 <= (PythonMod(u0, 3)): True, Eq(0, PythonMod(u0, 3)): True, Eq(PythonMod(u0, 3), 0): True, Ne(0, PythonMod(u0, 3)): False, Ne(PythonMod(u0, 3), 0): False}
 >  Right: {}
 ==> deferred_runtime_asserts: values don't match.
 >  Left: {u0: [Eq(PythonMod(u0, 3), 0)]}
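
For orientation (not part of the diff): every entry in the `axioms` table above follows from the recorded divisibility fact `Mod(s0, 3) == 0`; only the printed key format, and hence the string-sorted order, changed. With `m` standing in for the Mod term:

    m = 0  # what Mod(s0, 3) equals once s0 is known divisible by 3
    assert (m < 0) is False and (m <= 0) is True
    assert (0 < m) is False and (0 <= m) is True
    assert (0 == m) is True and (m == 0) is True
    assert (0 != m) is False and (m != 0) is False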

test/export/test_export.py

Lines changed: 2 additions & 2 deletions
@@ -3259,9 +3259,9 @@ def forward(self, x, fixes):
             (torch.tensor(20),),
             fixes=[
                 # Could not guard on data-dependent expression Eq((u0//2), 0)
-                "torch._check(((i//2)) != 0)",
+                "torch._check((i // 2) != 0)",
                 # Could not guard on data-dependent expression Eq((u0//2), 1)
-                "torch._check(((i//2)) != 1)",
+                "torch._check((i // 2) != 1)",
             ],
         )
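
`torch._check` is the helper these fixes rely on: it asserts a condition at runtime and, during export, lets the symbolic shape machinery discharge the data-dependent guards named in the comments (`Eq(u0//2, 0)` / `Eq(u0//2, 1)`). In eager mode it behaves like a plain assertion; a minimal sketch (the concrete value mirrors `torch.tensor(20)` above):

    import torch

    i = 20
    torch._check((i // 2) != 0)  # passes; would raise if i // 2 == 0
    torch._check((i // 2) != 1)  # passes; would raise if i // 2 == 1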

test/inductor/test_cuda_repro.py

Lines changed: 5 additions & 5 deletions
@@ -1426,12 +1426,12 @@ def triton_poi_fused_add_reflection_pad2d_0(in_ptr0, in_ptr1, out_ptr0, xnumel,
     xoffset = tl.program_id(0) * XBLOCK
     xindex = xoffset + tl.arange(0, XBLOCK)[:]
     xmask = xindex < xnumel
-    x0 = xindex % 20
-    x1 = (xindex // 20) % 20
-    x2 = (xindex // 400)
+    x0 = (xindex % 20)
+    x1 = ((xindex // 20) % 20)
+    x2 = xindex // 400
     x3 = xindex
-    tmp0 = tl.load(in_ptr0 + (99 + ((-1)*(tl_math.abs((-9) + (tl_math.abs((-5) + x0))))) + ((-10)*(tl_math.abs((-9) + (tl_math.abs((-5) + x1))))) + (100*x2)), xmask, eviction_policy='evict_last')
-    tmp1 = tl.load(in_ptr1 + (99 + ((-1)*(tl_math.abs((-9) + (tl_math.abs((-5) + x0))))) + ((-10)*(tl_math.abs((-9) + (tl_math.abs((-5) + x1))))) + (100*x2)), xmask, eviction_policy='evict_last')
+    tmp0 = tl.load(in_ptr0 + (99 + ((-1)*tl_math.abs((-9) + tl_math.abs((-5) + x0))) + ((-10)*tl_math.abs((-9) + tl_math.abs((-5) + x1))) + 100*x2), xmask, eviction_policy='evict_last')
+    tmp1 = tl.load(in_ptr1 + (99 + ((-1)*tl_math.abs((-9) + tl_math.abs((-5) + x0))) + ((-10)*tl_math.abs((-9) + tl_math.abs((-5) + x1))) + 100*x2), xmask, eviction_policy='evict_last')
     tmp2 = tmp0 + tmp1
     tl.store(out_ptr0 + (x3), tmp2, xmask)""", # noqa: B950
 )
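
The reparenthesized index arithmetic is value-identical: `x0`, `x1`, `x2` still decompose the flat `xindex` over a 20×20 inner tile of the reflection-padded output. A plain-Python check of the decomposition (illustrative bounds):

    for xindex in range(800):  # any multiple of 400 works for this check
        x0 = xindex % 20            # fastest-moving coordinate
        x1 = (xindex // 20) % 20    # middle coordinate
        x2 = xindex // 400          # slowest-moving coordinate
        assert xindex == 400 * x2 + 20 * x1 + x0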

test/inductor/test_indexing.py

Lines changed: 37 additions & 2 deletions
@@ -20,7 +20,9 @@
 from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_CPU, HAS_GPU
 from torch.utils._sympy.functions import (
     FloorDiv,
+    Mod,
     ModularIndexing,
+    PythonMod,
     RoundDecimal,
     RoundToInt,
 )
@@ -236,7 +238,7 @@ def f(x):
         triton_code = run_and_get_triton_code(f, x)
         # Make sure the 2 load uses simpified indexing rather than something like
         # tl.load(in_ptr0 + ((5504*x1) + (x0 // 2)),
-        self.assertEqual(2, triton_code.count("tl.load(in_ptr0 + ((x2 // 2)),"))
+        self.assertEqual(2, triton_code.count("tl.load(in_ptr0 + (x2 // 2),"))
         if DO_PERF_TEST:
             ms = benchmarker.benchmark_gpu(lambda: f(x))
             print(f"{ms=:.03f}")
@@ -313,6 +315,39 @@ def test_print_round(self):
         self.assertExpectedInline(cexpr(expr), """std::lrint((1.0/2.0)*x)""")
         self.assertExpectedInline(texpr(expr), """libdevice.llrint((1/2)*x)""")
 
+    def test_print_mod(self):
+        x = sympy.Symbol("x", integer=True)
+        expr = Mod(x - 1, 2)
+        self.assertExpectedInline(pexpr(expr), """((-1) + x) % 2""")
+        self.assertExpectedInline(cexpr(expr), """((-1L) + x) % 2L""")
+        self.assertExpectedInline(texpr(expr), """((-1) + x) % 2""")
+
+        expr = (x - 10) % x
+        self.assertExpectedInline(pexpr(expr), """(-10) % x""")
+        self.assertExpectedInline(cexpr(expr), """(-10L) % x""")
+        self.assertExpectedInline(texpr(expr), """(-10) % x""")
+
+    def test_print_mod_index(self):
+        x = sympy.Symbol("x", integer=True)
+        ks = sympy.Symbol("ks", integer=True)
+        expr = ModularIndexing(x - 10, ks, ks)
+        self.assertExpectedInline(pexpr(expr), """((((-10) + x) // ks) % ks)""")
+        self.assertExpectedInline(
+            cexpr(expr),
+            """(static_cast<int64_t>(c10::div_floor_integer("""
+            """static_cast<int64_t>((-10L) + x), static_cast<int64_t>(ks))) % static_cast<int64_t>(ks))""",
+        )
+        self.assertExpectedInline(texpr(expr), """((((-10) + x) // ks) % ks)""")
+
+    def test_print_python_mod(self):
+        x = sympy.Symbol("x", integer=True)
+        expr = PythonMod(x - 10, x)
+        self.assertExpectedInline(pexpr(expr), """((-10) + x) % x""")
+        self.assertExpectedInline(cexpr(expr), """((-10L) + x) % x""")
+        self.assertExpectedInline(
+            texpr(expr), """triton_helpers.remainder_integer((-10) + x, x)"""
+        )
+
     @parametrize("ndigits", [-1, 0, 1])
     def test_print_round_decimal(self, ndigits):
         expr = RoundDecimal(sympy.Symbol("x", integer=True) / 2, ndigits)
@@ -330,7 +365,7 @@ def test_print_floor_div(self):
         s1 = sympy.Symbol("s1", integer=True)
         s2 = sympy.Symbol("s2", integer=True)
         expr = FloorDiv(s1, s2)
-        self.assertEqual(pexpr(expr), "(s1 // s2)")
+        self.assertEqual(pexpr(expr), "s1 // s2")
         self.assertEqual(
             cexpr(expr),
             "c10::div_floor_integer(static_cast<int64_t>(s1), static_cast<int64_t>(s2))",

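Background for the new `test_print_python_mod` expectations: Python's `%` takes the sign of the divisor, while C-family (and Triton) integer `%` takes the sign of the dividend, which is why the Triton printer routes `PythonMod` through `triton_helpers.remainder_integer` rather than a bare `%`. A plain-Python illustration of the mismatch:

    import math

    a, b = -10, 3
    assert a % b == 2                  # Python: result has the divisor's sign
    assert int(math.fmod(a, b)) == -1  # C-style truncating remainder
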
test/inductor/test_memory_planning.py

Lines changed: 3 additions & 5 deletions
@@ -58,13 +58,11 @@ def test_python_wrapper(self):
         result, code = run_and_get_cpp_code(compiled, *args)
 
         FileCheck().check(
-            "pool1 = empty_strided_"
-            + GPU_TYPE
-            + "(((4*s0*s1) + (align(4*(s0*s0))), ), (1, )"
+            "pool1 = empty_strided_" + GPU_TYPE + "((4*s0*s1 + align(4*s0*s0), ), (1, )"
         ).check_next(
             "buf0 = alloc_from_pool(pool1, 0, torch.float32, (s0, s0), (s0, 1))"
         ).check(
-            "buf1 = alloc_from_pool(pool1, align(4*(s0*s0)),"
+            "buf1 = alloc_from_pool(pool1, align(4*s0*s0),"
         ).run(
             code
         )
@@ -103,7 +101,7 @@ def test_aoti(self):
         )
 
         FileCheck().check(
-            "int64_t int_array_2[] = {24L + (align(12L*s0)), };"
+            "int64_t int_array_2[] = {24L + align(12L*s0), };"
         ).check_next("int64_t int_array_3[] = {1L, };").check_next(
             "AtenTensorHandle pool1_handle;"
         ).check_next(
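
A sketch of the pool arithmetic being checked (the `align` below is a stand-in with an assumed 64-byte alignment; the real helper lives in Inductor's generated wrapper code): buf0, a float32 `(s0, s0)` tensor, sits at pool offset 0, and buf1 starts at the next aligned offset, so the pool needs `4*s0*s1 + align(4*s0*s0)` bytes.

    def align(nbytes: int, alignment: int = 64) -> int:
        # round up to the next multiple of `alignment` (assumed value)
        return (nbytes + alignment - 1) // alignment * alignment

    s0, s1 = 10, 3
    buf0_bytes = 4 * s0 * s0         # float32 (s0, s0) at pool offset 0
    buf1_offset = align(buf0_bytes)  # buf1 starts on an aligned boundary
    pool_bytes = 4 * s0 * s1 + buf1_offset
    assert pool_bytes >= buf0_bytes + 4 * s0 * s1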

test/inductor/test_padding.py

Lines changed: 1 addition & 1 deletion
@@ -487,7 +487,7 @@ def test_LinearAndSoftmax_codegen(self, bias=True):
 
         # make sure the load for softmax is aligned
         self.assertTrue(
-            "tl.load(in_ptr0 + (r1 + (30528*x0))" in forward_wrapper,
+            "tl.load(in_ptr0 + (r1 + 30528*x0)" in forward_wrapper,
             f"forward_wrapper: {forward_wrapper}",
        )
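
The stride 30528 in the expected load is what makes the softmax rows aligned: assuming Inductor's usual 128-byte alignment target, each float32 row then starts on a 128-byte boundary. A one-line arithmetic check:

    assert (30528 * 4) % 128 == 0  # row stride in bytes is 128-byte aligned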

test/inductor/test_torchinductor.py

Lines changed: 3 additions & 3 deletions
@@ -12505,8 +12505,8 @@ def f(a, b):
         self.assertExpectedInline(
             "\n".join(lines),
             """\
-tmp0 = tl.load(in_ptr0 + (x1 + (512*x0) + (262144*r2)), rmask, eviction_policy='evict_last', other=0.0)
-tmp1 = tl.load(in_ptr1 + (x3 + (262144*r2)), rmask, eviction_policy='evict_first', other=0.0)""",
+tmp0 = tl.load(in_ptr0 + (x1 + 512*x0 + 262144*r2), rmask, eviction_policy='evict_last', other=0.0)
+tmp1 = tl.load(in_ptr1 + (x3 + 262144*r2), rmask, eviction_policy='evict_first', other=0.0)""",
         )
 
     @config.patch("triton.use_block_ptr", True)
@@ -12538,7 +12538,7 @@ def f(a, b):
         self.assertExpectedInline(
             "\n".join(lines),
             """\
-tmp0 = tl.reshape(tl.broadcast_to(tl.load(block_ptr0, boundary_check=[2], padding_option='zero', eviction_policy='evict_last')[:, None, :, :], [((511 + XBLOCK) // 512), ((1) * ((1) <= (((511 + XBLOCK) // 512))) + (((511 + XBLOCK) // 512)) * ((((511 + XBLOCK) // 512)) < (1))), ((512) * ((512) <= (XBLOCK)) + (XBLOCK) * ((XBLOCK) < (512))), RBLOCK]), [XBLOCK, RBLOCK])
+tmp0 = tl.reshape(tl.broadcast_to(tl.load(block_ptr0, boundary_check=[2], padding_option='zero', eviction_policy='evict_last')[:, None, :, :], [(511 + XBLOCK) // 512, ((1) * ((1) <= ((511 + XBLOCK) // 512)) + ((511 + XBLOCK) // 512) * (((511 + XBLOCK) // 512) < (1))), ((512) * ((512) <= (XBLOCK)) + (XBLOCK) * ((XBLOCK) < (512))), RBLOCK]), [XBLOCK, RBLOCK])
 tmp1 = tl.load(block_ptr1, boundary_check=[1], padding_option='zero', eviction_policy='evict_first')""", # noqa: B950 line too long
         )
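
The dense expression in the broadcast shape uses a branch-free min, `a*(a <= b) + b*(b < a)`, to clamp block dimensions (e.g. min(512, XBLOCK)); only the parentheses around the `//` terms changed here. A quick check that the idiom really computes min (booleans coerce to 0/1):

    def branchless_min(a: int, b: int) -> int:
        return a * (a <= b) + b * (b < a)

    for a in (1, 7, 512):
        for b in (1, 8, 511, 512, 1024):
            assert branchless_min(a, b) == min(a, b)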
