[inductor][cpp] support nested kernel with indirect indexing #129223

jgong5 · 2024-06-21T08:56:16Z

Stack from ghstack (oldest at bottom):

-> [inductor][cpp] support nested kernel with indirect indexing #129223

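This PR makes sure the current kernel is used for generating CSE variables when nested kernel codegen is involved, e.g., when a nested CppKernel is used to generate the epilogue of a CppTemplateKernel. Without the fix, an epilogue with indirect indexing would fail to run: in the "Before" code below, the `< 0` comparisons that select the wrapped index are missing from the generated epilogue, whereas they are present in the "After" code.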

pytest -k test_linear_with_embedding_bias_False_cpu test_cpu_select_algorithm.py

Epilogue code Before:

                {
                    #pragma GCC ivdep
                    for(long x0=static_cast<long>(0L); x0<static_cast<long>(m_end + ((-1L)*m_start)); x0+=static_cast<long>(1L))
                    {
                        for(long x1=static_cast<long>(0L); x1<static_cast<long>(16L*(c10::div_floor_integer(N0, 16L))); x1+=static_cast<long>(16L))
                        {
                            auto tmp0 = in_ptr2[static_cast<long>(m_start + x0)];
                            auto tmp11 = at::vec::Vectorized<float>::loadu(local_acc_buf + static_cast<long>(x1 + (N0*x0)), 16);
                            auto tmp1 = 64L;
                            auto tmp2 = c10::convert<int64_t>(tmp1);
                            auto tmp3 = decltype(tmp0)(tmp0 + tmp2);
                            auto tmp4 = tmp0 ? tmp3 : tmp0;
                            auto tmp5 = decltype(tmp4)(tmp4 + tmp2);
                            auto tmp6 = tmp1 ? tmp5 : tmp4;
                            auto tmp7 = tmp6;
                            auto tmp8 = c10::convert<int64_t>(tmp7);
                            TORCH_CHECK((0 <= tmp8) & (tmp8 < 64L), "index out of bounds: 0 <= tmp8 < 64L");
                            auto tmp10 = at::vec::Vectorized<float>::loadu(in_ptr3 + static_cast<long>(n_start + x1 + (384L*tmp6)), 16);
                            auto tmp12 = (tmp11);
                            auto tmp13 = tmp10 + tmp12;
                            tmp13.store(Y + static_cast<long>(n_start + x1 + (384L*m_start) + (384L*x0)));
                        }
                        #pragma omp simd simdlen(8) 
                        for(long x1=static_cast<long>(16L*(c10::div_floor_integer(N0, 16L))); x1<static_cast<long>(N0); x1+=static_cast<long>(1L))
                        {
                            auto tmp0 = in_ptr2[static_cast<long>(m_start + x0)];
                            auto tmp11 = local_acc_buf[static_cast<long>(x1 + (N0*x0))];
                            auto tmp1 = 64L;
                            auto tmp2 = c10::convert<int64_t>(tmp1);
                            auto tmp3 = decltype(tmp0)(tmp0 + tmp2);
                            auto tmp4 = tmp0 ? tmp3 : tmp0;
                            auto tmp5 = decltype(tmp4)(tmp4 + tmp2);
                            auto tmp6 = tmp1 ? tmp5 : tmp4;
                            auto tmp7 = tmp6;
                            auto tmp8 = c10::convert<int64_t>(tmp7);
                            TORCH_CHECK((0 <= tmp8) & (tmp8 < 64L), "index out of bounds: 0 <= tmp8 < 64L");
                            TORCH_CHECK((0 <= tmp8) & (tmp8 < 64L), "index out of bounds: 0 <= tmp8 < 64L");
                            auto tmp10 = in_ptr3[static_cast<long>(n_start + x1 + (384L*tmp6))];
                            auto tmp12 = c10::convert<float>(tmp11);
                            auto tmp13 = decltype(tmp10)(tmp10 + tmp12);
                            Y[static_cast<long>(n_start + x1 + (384L*m_start) + (384L*x0))] = tmp13;
                        }
                    }
                }

Epilogue code After:

                {
                    #pragma GCC ivdep
                    for(long x0=static_cast<long>(0L); x0<static_cast<long>(m_end + ((-1L)*m_start)); x0+=static_cast<long>(1L))
                    {
                        for(long x1=static_cast<long>(0L); x1<static_cast<long>(16L*(c10::div_floor_integer(N0, 16L))); x1+=static_cast<long>(16L))
                        {
                            auto tmp0 = in_ptr2[static_cast<long>(m_start + x0)];
                            auto tmp13 = at::vec::Vectorized<float>::loadu(local_acc_buf + static_cast<long>(x1 + (N0*x0)), 16);
                            auto tmp1 = 64L;
                            auto tmp2 = c10::convert<int64_t>(tmp1);
                            auto tmp3 = decltype(tmp0)(tmp0 + tmp2);
                            auto tmp4 = tmp0 < 0;
                            auto tmp5 = tmp4 ? tmp3 : tmp0;
                            auto tmp6 = decltype(tmp5)(tmp5 + tmp2);
                            auto tmp7 = tmp5 < 0;
                            auto tmp8 = tmp7 ? tmp6 : tmp5;
                            auto tmp9 = tmp8;
                            auto tmp10 = c10::convert<int64_t>(tmp9);
                            TORCH_CHECK((0 <= tmp10) & (tmp10 < 64L), "index out of bounds: 0 <= tmp10 < 64L");
                            auto tmp12 = at::vec::Vectorized<float>::loadu(in_ptr3 + static_cast<long>(n_start + x1 + (384L*tmp8)), 16);
                            auto tmp14 = (tmp13);
                            auto tmp15 = tmp12 + tmp14;
                            tmp15.store(Y + static_cast<long>(n_start + x1 + (384L*m_start) + (384L*x0)));
                        }
                        #pragma omp simd simdlen(8) 
                        for(long x1=static_cast<long>(16L*(c10::div_floor_integer(N0, 16L))); x1<static_cast<long>(N0); x1+=static_cast<long>(1L))
                        {
                            auto tmp0 = in_ptr2[static_cast<long>(m_start + x0)];
                            auto tmp13 = local_acc_buf[static_cast<long>(x1 + (N0*x0))];
                            auto tmp1 = 64L;
                            auto tmp2 = c10::convert<int64_t>(tmp1);
                            auto tmp3 = decltype(tmp0)(tmp0 + tmp2);
                            auto tmp4 = tmp0 < 0;
                            auto tmp5 = tmp4 ? tmp3 : tmp0;
                            auto tmp6 = decltype(tmp5)(tmp5 + tmp2);
                            auto tmp7 = tmp5 < 0;
                            auto tmp8 = tmp7 ? tmp6 : tmp5;
                            auto tmp9 = tmp8;
                            auto tmp10 = c10::convert<int64_t>(tmp9);
                            TORCH_CHECK((0 <= tmp10) & (tmp10 < 64L), "index out of bounds: 0 <= tmp10 < 64L");
                            TORCH_CHECK((0 <= tmp10) & (tmp10 < 64L), "index out of bounds: 0 <= tmp10 < 64L");
                            auto tmp12 = in_ptr3[static_cast<long>(n_start + x1 + (384L*tmp8))];
                            auto tmp14 = c10::convert<float>(tmp13);
                            auto tmp15 = decltype(tmp12)(tmp12 + tmp14);
                            Y[static_cast<long>(n_start + x1 + (384L*m_start) + (384L*x0))] = tmp15;
                        }
                    }
                }

cc @voznesenskym @penguinwu @EikanWang @Guobing-Chen @XiaobingSuper @zhuhaozhe @blzheng @wenzhe-nrv @jiayisunx @peterbell10 @ipiszy @yf225 @chenyang78 @kadeng @muchulee8 @ColinPeppler @amjames @desertfire @chauhang

[ghstack-poisoned]

pytorch-bot · 2024-06-21T08:56:18Z

🔗 Helpful Links

🧪 See artifacts and rendered test results at hud.pytorch.org/pr/129223

📄 Preview Python docs built from this PR
📄 Preview C++ docs built from this PR
❓ Need help or want to give feedback on the CI? Visit the bot commands wiki or our office hours

Note: Links to docs will display an error until the docs builds have been completed.

✅ You can merge normally! (2 Unrelated Failures)

As of commit 4e309e4 with merge base c012013 ():

BROKEN TRUNK - The following job failed but was also failing on the merge base:

👉 Rebase onto the `viable/strict` branch to avoid these failures

trunk / linux-focal-rocm6.1-py3.8 / test (default, 2, 2, linux.rocm.gpu) (gh) (trunk failure)

UNSTABLE - The following job failed but was likely due to flakiness present on trunk and has been marked as unstable:

inductor / rocm6.1-py3.8-inductor / test (inductor, 1, 1, linux.rocm.gpu.2, unstable) (gh) (#128871)
'test/inductor/test_max_autotune.py::TestTuningProcess::test_tuning_pool_multiple_devices'

This comment was automatically generated by Dr. CI and updates every 15 minutes.

[ghstack-poisoned]

jgong5 · 2024-06-21T08:59:19Z

test/inductor/test_cpu_select_algorithm.py

-        if dtype == torch.half or dtype == torch.bfloat16:
-            atol, rtol = 1e-2, 1e-2
-        with patch.object(select_algorithm, "VERIFY", dict(atol=atol, rtol=rtol)):
+        with verify(dtype) as (atol, rtol):


Simplify the tolerance-checking code here.

jgong5 · 2024-06-21T09:01:22Z

torch/_inductor/codegen/common.py

+                        csevar = V.kernel.cse.generate(
+                            V.kernel.compute, v, bounds=bounds
+                        )


Make sure we generate the CSE variable on the current kernel (`V.kernel`) when a nested kernel is being generated, e.g., a CppKernel generated as the epilogue of a CppTemplateKernel.

[ghstack-poisoned]

ghstack-source-id: 162c6ed Pull Request resolved: #129223

leslie-fang-intel

LGTM

leslie-fang-intel · 2024-06-22T02:27:54Z

torch/_inductor/codegen/common.py

                def inner(*args, **kwargs):
                    bounds = CSEProxy._bound_variable(name, *args, **kwargs)

                    value = getattr(parent_handler, name)(*args, **kwargs)  # type: ignore[has-type]


Thanks for the fix. For op names like `lt`, `getattr(parent_handler, name)` falls back through the handler chain until it reaches MockHandler, which does the codegen. So, in this nested-kernel case, the call traces back to the CSEProxy instance bound to the parent kernel (CppTemplateKernel in this case).

Attached is the list of scalar ops that use MockHandler to do the codegen:

pytorch/torch/_inductor/ops_handler.py

Lines 798 to 819 in 5b14943

for name, format_string in {

"add": "{} + {}",

"sub": "{} - {}",

"mul": "{} * {}",

"floordiv": "{} // {}",

"truediv": "{} / {}",

"mod": "{} % {}", # careful, depending on target semantics varies

"pow": "{} ** {}",

"lshift": "{} << {}",

"rshift": "{} >> {}",

"and_": "{} & {}",

"or_": "{} | {}",

"xor": "{} ^ {}",

"eq": "{} == {}",

"ne": "{} != {}",

"lt": "{} < {}",

"gt": "{} > {}",

"le": "{} <= {}",

"ge": "{} >= {}",

"neg": "-{}",

}.items():

setattr(cls, name, make_handler(format_string))

[ghstack-poisoned]

ghstack-source-id: 53b5606 Pull Request resolved: #129223

[ghstack-poisoned]

ghstack-source-id: 84d0157 Pull Request resolved: #129223

jgong5 · 2024-06-25T01:00:46Z

@pytorchbot merge

pytorchmergebot · 2024-06-25T01:03:06Z

Merge started

Your change will be merged once all checks pass (ETA 0-4 Hours).

Learn more about merging in the wiki.

Questions? Feedback? Please reach out to the PyTorch DevX Team

Advanced Debugging

Check the merge workflow status
here

Update

ae71f55

[ghstack-poisoned]

pytorch-bot bot added ciflow/inductor module: inductor labels Jun 21, 2024

jgong5 mentioned this pull request Jun 21, 2024

[inductor][cpp] refactor CppTemplateKernel to inherit CppKernel #129101

Closed

Update

61ff968

[ghstack-poisoned]

pytorchbot added the open source label Jun 21, 2024

jgong5 commented Jun 21, 2024

View reviewed changes

jgong5 requested a review from leslie-fang-intel June 21, 2024 09:02

jgong5 added the topic: not user facing topic category label Jun 21, 2024

Update

003be49

[ghstack-poisoned]

jgong5 pushed a commit that referenced this pull request Jun 21, 2024

[inductor][cpp] support nested kernel with indirect indexing

fc86ced

ghstack-source-id: 162c6ed Pull Request resolved: #129223

leslie-fang-intel approved these changes Jun 22, 2024

View reviewed changes

jgong5 requested review from jansel and peterbell10 June 22, 2024 04:03

Update

872ab34

[ghstack-poisoned]

jgong5 pushed a commit that referenced this pull request Jun 22, 2024

[inductor][cpp] support nested kernel with indirect indexing

18718b6

ghstack-source-id: 53b5606 Pull Request resolved: #129223

jansel approved these changes Jun 24, 2024

View reviewed changes

Update

4e309e4

[ghstack-poisoned]

jgong5 pushed a commit that referenced this pull request Jun 25, 2024

[inductor][cpp] support nested kernel with indirect indexing

33758d3

ghstack-source-id: 84d0157 Pull Request resolved: #129223

pytorch-bot bot added the ciflow/trunk Trigger trunk jobs on your pull request label Jun 25, 2024

pytorchmergebot added the merging label Jun 25, 2024

pytorchmergebot added the Merged label Jun 25, 2024

pytorchmergebot closed this in 533c419 Jun 25, 2024

pytorchmergebot removed the merging label Jun 25, 2024

github-actions bot deleted the gh/jgong5/55/head branch July 26, 2024 01:56

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[inductor][cpp] support nested kernel with indirect indexing #129223

[inductor][cpp] support nested kernel with indirect indexing #129223

Uh oh!

jgong5 commented Jun 21, 2024 •

edited

Loading

Uh oh!

pytorch-bot bot commented Jun 21, 2024 •

edited

Loading

Uh oh!

jgong5 Jun 21, 2024

Uh oh!

jgong5 Jun 21, 2024

Uh oh!

leslie-fang-intel left a comment

Uh oh!

leslie-fang-intel Jun 22, 2024 •

edited

Loading

Uh oh!

jgong5 commented Jun 25, 2024

Uh oh!

pytorchmergebot commented Jun 25, 2024

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

6 participants

	for name, format_string in {
	"add": "{} + {}",
	"sub": "{} - {}",
	"mul": "{} * {}",
	"floordiv": "{} // {}",
	"truediv": "{} / {}",
	"mod": "{} % {}", # careful, depending on target semantics varies
	"pow": "{} ** {}",
	"lshift": "{} << {}",
	"rshift": "{} >> {}",
	"and_": "{} & {}",
	"or_": "{} | {}",
	"xor": "{} ^ {}",
	"eq": "{} == {}",
	"ne": "{} != {}",
	"lt": "{} < {}",
	"gt": "{} > {}",
	"le": "{} <= {}",
	"ge": "{} >= {}",
	"neg": "-{}",
	}.items():
	setattr(cls, name, make_handler(format_string))

[inductor][cpp] support nested kernel with indirect indexing #129223

[inductor][cpp] support nested kernel with indirect indexing #129223

Uh oh!

Conversation

jgong5 commented Jun 21, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

pytorch-bot bot commented Jun 21, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

🔗 Helpful Links

🧪 See artifacts and rendered test results at hud.pytorch.org/pr/129223

✅ You can merge normally! (2 Unrelated Failures)

Uh oh!

jgong5 Jun 21, 2024

Choose a reason for hiding this comment

Uh oh!

jgong5 Jun 21, 2024

Choose a reason for hiding this comment

Uh oh!

leslie-fang-intel left a comment

Choose a reason for hiding this comment

Uh oh!

leslie-fang-intel Jun 22, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

jgong5 commented Jun 25, 2024

Uh oh!

pytorchmergebot commented Jun 25, 2024

Merge started

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

6 participants

jgong5 commented Jun 21, 2024 •

edited

Loading

pytorch-bot bot commented Jun 21, 2024 •

edited

Loading

leslie-fang-intel Jun 22, 2024 •

edited

Loading