[LoopUnroll] Support parallel reductions for minmax#182473
Conversation
|
@llvm/pr-subscribers-llvm-analysis @llvm/pr-subscribers-llvm-transforms Author: Madhur Amilkanthwar (madhur13490) ChangesThis patch
Planning to take support for vector types in the next patch. Patch is 79.96 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/182473.diff 3 Files Affected:
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index d9422afe5e82a..315f919060bea 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -108,9 +108,9 @@ UnrollVerifyLoopInfo("unroll-verify-loopinfo", cl::Hidden,
#endif
);
-static cl::opt<bool> UnrollAddParallelReductions(
- "unroll-add-parallel-reductions", cl::init(false), cl::Hidden,
- cl::desc("Allow unrolling to add parallel reduction phis."));
+static cl::opt<bool> UnrollParallelReductions(
+ "unroll-parallel-reductions", cl::init(false), cl::Hidden,
+ cl::desc("Allow unrolling to parallelize reductions."));
/// Check if unrolling created a situation where we need to insert phi nodes to
/// preserve LCSSA form.
@@ -670,8 +670,8 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
// to not exit.
DenseMap<PHINode *, RecurrenceDescriptor> Reductions;
bool CanAddAdditionalAccumulators =
- (UnrollAddParallelReductions.getNumOccurrences() > 0
- ? UnrollAddParallelReductions
+ (UnrollParallelReductions.getNumOccurrences() > 0
+ ? UnrollParallelReductions
: ULO.AddAdditionalAccumulators) &&
!CompletelyUnroll && L->getNumBlocks() == 1 &&
(ULO.Runtime ||
@@ -1102,9 +1102,12 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
Builder.setFastMathFlags(Reductions.begin()->second.getFastMathFlags());
RecurKind RK = Reductions.begin()->second.getRecurrenceKind();
for (Instruction *RdxPart : drop_begin(PartialReductions)) {
- RdxResult = Builder.CreateBinOp(
- (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(RK),
- RdxPart, RdxResult, "bin.rdx");
+ if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK))
+ RdxResult = createMinMaxOp(Builder, RK, RdxResult, RdxPart);
+ else
+ RdxResult = Builder.CreateBinOp(
+ (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(RK),
+ RdxPart, RdxResult, "bin.rdx");
}
NeedToFixLCSSA = true;
for (Instruction *RdxPart : PartialReductions)
@@ -1263,12 +1266,10 @@ llvm::canParallelizeReductionWhenUnrolling(PHINode &Phi, Loop *L,
if (RdxDesc.hasUsesOutsideReductionChain())
return std::nullopt;
RecurKind RK = RdxDesc.getRecurrenceKind();
- // Skip unsupported reductions.
- // TODO: Handle additional reductions, including FP and min-max
- // reductions.
+ // Skip unsupported reductions. Min/max (integer and FP) are supported.
+ // TODO: Handle additional reductions.
if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
- RecurrenceDescriptor::isFindRecurrenceKind(RK) ||
- RecurrenceDescriptor::isMinMaxRecurrenceKind(RK))
+ RecurrenceDescriptor::isFindRecurrenceKind(RK))
return std::nullopt;
if (RdxDesc.hasExactFPMath())
diff --git a/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll b/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll
index e5e969d638acd..5aaabb7b73675 100644
--- a/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll
+++ b/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -p loop-unroll -unroll-add-parallel-reductions -unroll-allow-partial -unroll-max-count=4 -S %s | FileCheck %s
+; RUN: opt -p loop-unroll -unroll-parallel-reductions -unroll-allow-partial -unroll-max-count=4 -S %s | FileCheck %s
define i32 @test_add(ptr %src, i64 %n, i32 %start) {
; CHECK-LABEL: define i32 @test_add(
diff --git a/llvm/test/Transforms/LoopUnroll/runtime-unroll-reductions.ll b/llvm/test/Transforms/LoopUnroll/runtime-unroll-reductions.ll
index 840cc6c507c3d..f7cab64ef568e 100644
--- a/llvm/test/Transforms/LoopUnroll/runtime-unroll-reductions.ll
+++ b/llvm/test/Transforms/LoopUnroll/runtime-unroll-reductions.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -p loop-unroll -unroll-add-parallel-reductions -S %s | FileCheck %s
+; RUN: opt -p loop-unroll -unroll-parallel-reductions -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -S %s | FileCheck %s
define i32 @test_add_reduction(ptr %a, i64 %n) {
; CHECK-LABEL: define i32 @test_add_reduction(
-; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) {
+; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1
; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[N]], 1
@@ -70,7 +70,7 @@ exit:
define i32 @test_add_reduction_constant_op(ptr %a, i64 %n) {
; CHECK-LABEL: define i32 @test_add_reduction_constant_op(
-; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) {
+; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1
; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[N]], 1
@@ -123,7 +123,7 @@ exit:
define i32 @test_add_reduction_8x_unroll(ptr %a, i64 %n) {
; CHECK-LABEL: define i32 @test_add_reduction_8x_unroll(
-; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) {
+; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1
; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[N]], 7
@@ -222,7 +222,7 @@ exit:
define <4 x i32> @test_vector_add_reduction(ptr %a, i64 %n) {
; CHECK-LABEL: define <4 x i32> @test_vector_add_reduction(
-; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) {
+; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1
; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[N]], 1
@@ -287,9 +287,77 @@ exit:
ret <4 x i32> %res
}
+; TODO: Add parallel reduction support for vector reduction phis (e.g. <4 x float>
+; fmin via llvm.minnum.v4f32). Currently they are unrolled with a chained
+; reduction only.
+define <4 x float> @test_vector_fmin_reduction_intrinsic(ptr %a, i64 %n) {
+; CHECK-LABEL: define <4 x float> @test_vector_fmin_reduction_intrinsic(
+; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1
+; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[N]], 1
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 1
+; CHECK-NEXT: br i1 [[TMP1]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[ENTRY_NEW:.*]]
+; CHECK: [[ENTRY_NEW]]:
+; CHECK-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_1:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[RDX:%.*]] = phi <4 x float> [ splat (float 0x7FF0000000000000), %[[ENTRY_NEW]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_1:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[GEP_A]], align 16
+; CHECK-NEXT: [[RDX_NEXT:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[RDX]], <4 x float> [[TMP2]])
+; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[A]], i64 [[IV_NEXT]]
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[GEP_A_1]], align 16
+; CHECK-NEXT: [[RDX_NEXT_1]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[RDX_NEXT]], <4 x float> [[TMP3]])
+; CHECK-NEXT: [[IV_NEXT_1]] = add nuw nsw i64 [[IV]], 2
+; CHECK-NEXT: [[NITER_NEXT_1]] = add i64 [[NITER]], 2
+; CHECK-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]]
+; CHECK-NEXT: br i1 [[NITER_NCMP_1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[LOOP]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: [[EXIT_UNR_LCSSA]]:
+; CHECK-NEXT: [[RES_PH:%.*]] = phi <4 x float> [ [[RDX_NEXT_1]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV_UNR:%.*]] = phi i64 [ [[IV_NEXT_1]], %[[LOOP]] ]
+; CHECK-NEXT: [[RDX_UNR:%.*]] = phi <4 x float> [ [[RDX_NEXT_1]], %[[LOOP]] ]
+; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; CHECK-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER]], label %[[EXIT:.*]]
+; CHECK: [[LOOP_EPIL_PREHEADER]]:
+; CHECK-NEXT: [[IV_EPIL_INIT:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR]], %[[EXIT_UNR_LCSSA]] ]
+; CHECK-NEXT: [[RDX_EPIL_INIT:%.*]] = phi <4 x float> [ splat (float 0x7FF0000000000000), %[[ENTRY]] ], [ [[RDX_UNR]], %[[EXIT_UNR_LCSSA]] ]
+; CHECK-NEXT: [[LCMP_MOD2:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; CHECK-NEXT: call void @llvm.assume(i1 [[LCMP_MOD2]])
+; CHECK-NEXT: br label %[[LOOP_EPIL:.*]]
+; CHECK: [[LOOP_EPIL]]:
+; CHECK-NEXT: [[GEP_A_EPIL:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[A]], i64 [[IV_EPIL_INIT]]
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[GEP_A_EPIL]], align 16
+; CHECK-NEXT: [[RDX_NEXT_EPIL:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[RDX_EPIL_INIT]], <4 x float> [[TMP4]])
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[RES:%.*]] = phi <4 x float> [ [[RES_PH]], %[[EXIT_UNR_LCSSA]] ], [ [[RDX_NEXT_EPIL]], %[[LOOP_EPIL]] ]
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %rdx = phi <4 x float> [ <float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000>, %entry ], [ %rdx.next, %loop ]
+ %gep.a = getelementptr inbounds nuw <4 x float>, ptr %a, i64 %iv
+ %1 = load <4 x float>, ptr %gep.a, align 16
+ %rdx.next = call <4 x float> @llvm.minnum.v4f32(<4 x float> %rdx, <4 x float> %1)
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop, !llvm.loop !0
+
+exit:
+ %res = phi <4 x float> [ %rdx.next, %loop ]
+ ret <4 x float> %res
+}
+
define float @test_fadd_reduction(ptr %a, i64 %n) {
; CHECK-LABEL: define float @test_fadd_reduction(
-; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) {
+; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1
; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[N]], 1
@@ -313,7 +381,7 @@ define float @test_fadd_reduction(ptr %a, i64 %n) {
; CHECK-NEXT: [[IV_NEXT_1]] = add nuw nsw i64 [[IV]], 2
; CHECK-NEXT: [[NITER_NEXT_1]] = add i64 [[NITER]], 2
; CHECK-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]]
-; CHECK-NEXT: br i1 [[NITER_NCMP_1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[LOOP]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[NITER_NCMP_1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: [[EXIT_UNR_LCSSA]]:
; CHECK-NEXT: [[RES_PH:%.*]] = phi float [ [[RDX_NEXT_1]], %[[LOOP]] ]
; CHECK-NEXT: [[IV_UNR:%.*]] = phi i64 [ [[IV_NEXT_1]], %[[LOOP]] ]
@@ -356,7 +424,7 @@ exit:
define float @test_fadd_no_reassoc(ptr %a, i64 %n) {
; CHECK-LABEL: define float @test_fadd_no_reassoc(
-; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) {
+; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1
; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[N]], 1
@@ -379,7 +447,7 @@ define float @test_fadd_no_reassoc(ptr %a, i64 %n) {
; CHECK-NEXT: [[IV_NEXT_1]] = add nuw nsw i64 [[IV]], 2
; CHECK-NEXT: [[NITER_NEXT_1]] = add i64 [[NITER]], 2
; CHECK-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]]
-; CHECK-NEXT: br i1 [[NITER_NCMP_1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[NITER_NCMP_1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[EXIT_UNR_LCSSA]]:
; CHECK-NEXT: [[RES_PH:%.*]] = phi float [ [[RDX_NEXT_1]], %[[LOOP]] ]
; CHECK-NEXT: [[IV_UNR:%.*]] = phi i64 [ [[IV_NEXT_1]], %[[LOOP]] ]
@@ -421,7 +489,7 @@ exit:
define float @test_fadd_other_fastmath(ptr %a, i64 %n) {
; CHECK-LABEL: define float @test_fadd_other_fastmath(
-; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) {
+; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1
; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[N]], 1
@@ -444,7 +512,7 @@ define float @test_fadd_other_fastmath(ptr %a, i64 %n) {
; CHECK-NEXT: [[IV_NEXT_1]] = add nuw nsw i64 [[IV]], 2
; CHECK-NEXT: [[NITER_NEXT_1]] = add i64 [[NITER]], 2
; CHECK-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]]
-; CHECK-NEXT: br i1 [[NITER_NCMP_1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: br i1 [[NITER_NCMP_1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: [[EXIT_UNR_LCSSA]]:
; CHECK-NEXT: [[RES_PH:%.*]] = phi float [ [[RDX_NEXT_1]], %[[LOOP]] ]
; CHECK-NEXT: [[IV_UNR:%.*]] = phi i64 [ [[IV_NEXT_1]], %[[LOOP]] ]
@@ -484,6 +552,1061 @@ exit:
ret float %res
}
+define i32 @test_smin_reduction(ptr %a, i64 %n) {
+; CHECK-LABEL: define i32 @test_smin_reduction(
+; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1
+; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[N]], 1
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 1
+; CHECK-NEXT: br i1 [[TMP1]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[ENTRY_NEW:.*]]
+; CHECK: [[ENTRY_NEW]]:
+; CHECK-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_1:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_1:%.*]] = phi i32 [ 2147483647, %[[ENTRY_NEW]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN:%.*]] = phi i32 [ 2147483647, %[[ENTRY_NEW]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_1:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[GEP_A]], align 4
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[MIN]], [[TMP2]]
+; CHECK-NEXT: [[RDX_NEXT]] = select i1 [[CMP]], i32 [[MIN]], i32 [[TMP2]]
+; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV_NEXT]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[GEP_A_1]], align 4
+; CHECK-NEXT: [[CMP_1:%.*]] = icmp slt i32 [[MIN_1]], [[TMP3]]
+; CHECK-NEXT: [[RDX_NEXT_1]] = select i1 [[CMP_1]], i32 [[MIN_1]], i32 [[TMP3]]
+; CHECK-NEXT: [[IV_NEXT_1]] = add nuw nsw i64 [[IV]], 2
+; CHECK-NEXT: [[NITER_NEXT_1]] = add i64 [[NITER]], 2
+; CHECK-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]]
+; CHECK-NEXT: br i1 [[NITER_NCMP_1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK: [[EXIT_UNR_LCSSA]]:
+; CHECK-NEXT: [[RES_PH:%.*]] = phi i32 [ [[RDX_NEXT_1]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV_UNR:%.*]] = phi i64 [ [[IV_NEXT_1]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_UNR:%.*]] = phi i32 [ [[RDX_NEXT_1]], %[[LOOP]] ]
+; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call i32 @llvm.smin.i32(i32 [[RDX_NEXT]], i32 [[RDX_NEXT_1]])
+; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; CHECK-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER]], label %[[EXIT:.*]]
+; CHECK: [[LOOP_EPIL_PREHEADER]]:
+; CHECK-NEXT: [[IV_EPIL_INIT:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR]], %[[EXIT_UNR_LCSSA]] ]
+; CHECK-NEXT: [[MIN_EPIL_INIT:%.*]] = phi i32 [ 2147483647, %[[ENTRY]] ], [ [[RDX_MINMAX]], %[[EXIT_UNR_LCSSA]] ]
+; CHECK-NEXT: [[LCMP_MOD2:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; CHECK-NEXT: call void @llvm.assume(i1 [[LCMP_MOD2]])
+; CHECK-NEXT: br label %[[LOOP_EPIL:.*]]
+; CHECK: [[LOOP_EPIL]]:
+; CHECK-NEXT: [[GEP_A_EPIL:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV_EPIL_INIT]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[GEP_A_EPIL]], align 4
+; CHECK-NEXT: [[CMP_EPIL:%.*]] = icmp slt i32 [[MIN_EPIL_INIT]], [[TMP4]]
+; CHECK-NEXT: [[RDX_NEXT_EPIL:%.*]] = select i1 [[CMP_EPIL]], i32 [[MIN_EPIL_INIT]], i32 [[TMP4]]
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[RDX_MINMAX]], %[[EXIT_UNR_LCSSA]] ], [ [[RDX_NEXT_EPIL]], %[[LOOP_EPIL]] ]
+; CHECK-NEXT: ret i32 [[RES]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %min = phi i32 [ 2147483647, %entry ], [ %rdx.next, %loop ]
+ %gep.a = getelementptr inbounds i32, ptr %a, i64 %iv
+ %1 = load i32, ptr %gep.a, align 4
+ %cmp = icmp slt i32 %min, %1
+ %rdx.next = select i1 %cmp, i32 %min, i32 %1
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop, !llvm.loop !0
+
+exit:
+ %res = phi i32 [ %rdx.next, %loop ]
+ ret i32 %res
+}
+
+define i32 @test_smax_reduction(ptr %a, i64 %n) {
+; CHECK-LABEL: define i32 @test_smax_reduction(
+; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1
+; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[N]], 1
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 1
+; CHECK-NEXT: br i1 [[TMP1]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[ENTRY_NEW:.*]]
+; CHECK: [[ENTRY_NEW]]:
+; CHECK-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_1:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MAX_1:%.*]] = phi i32 [ -2147483648, %[[ENTRY_NEW]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MAX:%.*]] = phi i32 [ -2147483648, %[[ENTRY_NEW]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_1:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[GEP_A]], align 4
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[MAX]], [[TMP2]]
+; CHECK-NEXT: [[RDX_NEXT]] = select i1 [[CMP]], i32 [[MAX]], i32 [[TMP2]]
+; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV_NEXT]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[GEP_A_1]], align 4
+; CHECK-NEXT: [[CMP_1:%.*]] = icmp sgt i32 [[MAX_1]], [[TMP3]]
+; CHECK-NEXT: [[RDX_NEXT_1]] = select i1 [[CMP_1]], i32 [[MAX_1]], i32 [[TMP3]]
+; CHECK-NEXT: [[IV_NEXT_1]] = add nuw nsw i64 [[IV]], 2
+; CHECK-NEXT: [[NITER_NEXT_1]] = add i64 [[NITER]], 2
+; CHECK-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]]
+; CHECK-NEXT: br i1 [[NITER_NCMP_1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK: [[EXIT_UNR_LCSSA]]:
+; CHECK-NEXT: [[RES_PH:%.*]] = phi i32 [ [[RDX_NEXT_1]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV_UNR:%.*]] = phi i64 [ [[IV_NEXT_1]], %[[LOOP]] ]
+; CHECK-NEXT: [[MAX_UNR:%.*]] = phi i32 [ [[RDX_NEXT_1]], %[[LOOP]] ]
+; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[RDX_NEXT]], i32 [[RDX_NEXT_1]])
+; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; CHECK-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER]], label %[[EXIT:.*]]
+; CHECK: [[LOOP_EPIL_PREHEADER]]:
+; CHECK-NEXT: [[IV_EPIL_INIT:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR]], %[[EXIT_UNR_LCSSA]] ]
+; CHECK-NEXT: [[MAX_EPIL_INIT:%.*]] = phi i32 [ -2147483648, %[[ENTRY]] ], [ [[RDX_MINMAX]], %[[EXIT_UNR_LCSSA]] ]
+; CHECK-NEXT: [[LCMP_MOD2:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; CHECK-NEXT: call void @llvm.assume(i1 [[LCMP_MOD2]])
+; CHECK-NEXT: br label %[[LOOP_EPIL:.*]]
+; CHECK...
[truncated]
|
| @@ -484,6 +552,1061 @@ exit: | |||
| ret float %res | |||
| } | |||
|
|
|||
| define i32 @test_smin_reduction(ptr %a, i64 %n) { | |||
There was a problem hiding this comment.
probably best to move the min/max reductiosn to a separate test file, as this one is growing quite big
| @@ -1,9 +1,9 @@ | |||
| ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 | |||
| ; RUN: opt -p loop-unroll -unroll-add-parallel-reductions -S %s | FileCheck %s | |||
| ; RUN: opt -p loop-unroll -unroll-parallel-reductions -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -S %s | FileCheck %s | |||
There was a problem hiding this comment.
Why use -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math? Can we instead have tests with and without the needed fast-math flags, to make sure we handle them correctly?
There was a problem hiding this comment.
Do you mean to use nnan ninf nsz on select and instrinsics?
There was a problem hiding this comment.
Thanks. BTW, in case you didn't notice, tests with intrinsics are still chained. Either I can add TODO to handle them or try harder to fix in this patch. (I don't see a clear way in the code to fix this unless I am missing something already there)
There was a problem hiding this comment.
Ping @fhahn! Are you okay to have FIXME/TODO comments for min/max intrinsic tests in this patch and let me handle them in a separate patch?
There was a problem hiding this comment.
It's better to have the necessary tests in place here, in this patch.
There was a problem hiding this comment.
yes, please update with the split tests
|
Taking a first look at this, catching up on this.... looks like there's one unresolved comment from Florian:
|
Yes, I have this addressed locally; I don't have any objections. I am waiting for @fhahn to comment on the other comment. Based on the response, will update the PR. |
Ok, I think it's better just to upload the latest revision, and I've left a remark above about the test. |
🐧 Linux x64 Test Results
✅ The build succeeded and all tests passed. |
🪟 Windows x64 Test Results
✅ The build succeeded and all tests passed. |
01a956a to
6bcc8a0
Compare
| auto *I = dyn_cast<Instruction>(U); | ||
| if (!I || (!TheLoop->contains(I) && | ||
| (V != BackedgeValue || ++OutOfLoopUses > 1))) | ||
| if (!I || (!TheLoop->contains(I) && V != BackedgeValue)) |
There was a problem hiding this comment.
Relaxing the strict check to allow out-of-loop uses.
There was a problem hiding this comment.
This is unrelated to the patch, right? If so, please split off with separate tests. It will also need LoopVectorize test to make sure this is handled correctly there.
There was a problem hiding this comment.
Unfortunately not. Without this isReductionPHI() returns false as getMinMaxRecurrence() fails to return true.
There was a problem hiding this comment.
Ping @fhahn. Can you please let me know what you think about the patch?
There was a problem hiding this comment.
Right, this is needed because we query after we already (partial) transformed the input so there are extra uses? Can we check earlier?
If not, would be good to split this off + a loop-vectorize test
There was a problem hiding this comment.
I prefer to split. Please have a look at #189906
For more context of this patch, please see llvm#182473. By relaxing, OutOfLoopUses check, we can recognize more case for minmax reduction and thus vectorize.
For more context of this patch, please see llvm#182473. By relaxing, `OutOfLoopUses` check, we can recognize more case for minmax reduction and thus vectorize.
For more context of this patch, please see #182473. By relaxing, `OutOfLoopUses` check, we can recognize more case for minmax reduction and thus vectorize.
For more context of this patch, please see llvm#182473. By relaxing, `OutOfLoopUses` check, we can recognize more case for minmax reduction and thus vectorize.
For more context of this patch, please see llvm#182473. By relaxing, `OutOfLoopUses` check, we can recognize more case for minmax reduction and thus vectorize.
This patch * Supports parallel reductions for min/max operations in LoopUnroller. * Adds relevant test (including intrinsics) and their 8x unroll version. * Renames flag -unroll-add-parallel-reduction to -unroll-parallel-reduction. Planning to take support for vector types in the next patch.
Thanks for resurrecting this PR. It was buried in my list. I rebased and everything looks as expected. |
| static cl::opt<bool> UnrollParallelReductions( | ||
| "unroll-parallel-reductions", cl::init(false), cl::Hidden, | ||
| cl::desc("Allow unrolling to parallelize reductions.")); |
There was a problem hiding this comment.
Renaming/changing option is unrelated to support for min/max, would be good to split off, although renaming the option may break some users.
There was a problem hiding this comment.
Hmm..but we no longer support only add and that is the reason I changed the flag. I am keeping it unchanged but I don't mind if this is appropriate for a new patch.
| // Skip unsupported reductions. Min/max (integer and FP) are supported. | ||
| // TODO: Handle additional reductions. |
There was a problem hiding this comment.
| // Skip unsupported reductions. Min/max (integer and FP) are supported. | |
| // TODO: Handle additional reductions. | |
| // Skip unsupported reductions. | |
| // TODO: Handle any-of and find-last reductions. |
We support all bin-op reductions and min/max (with the patch).
Remaining cases are any-of and findlast, not sure if there is anything that makes them difficult to support though
|
|
||
| ; ------------------------------------------------------------- | ||
| ; floating-point min and max reductions without fast-math flags. | ||
| ; These are currently chained together. |
There was a problem hiding this comment.
I think without fast math flags we cannot parallelize them, better drop the currently to avoid potential confusion
| ; These are currently chained together. | |
| ; These are chained together sequentially. |
| %min = phi float [ 0x7FF0000000000000, %entry ], [ %rdx.next, %loop ] | ||
| %gep.a = getelementptr inbounds float, ptr %a, i64 %iv | ||
| %1 = load float, ptr %gep.a, align 4 | ||
| %rdx.next = call nnan ninf nsz float @llvm.minnum.f32(float %min, float %1) |
There was a problem hiding this comment.
for completeness, could you also add a test with umin/umax/smin intrinsics?
There was a problem hiding this comment.
done. (We may need to revisit this file in future to have better logical groupings of the tests)
| ;. | ||
| ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]} | ||
| ; CHECK: [[META1]] = !{!"llvm.loop.unroll.disable"} | ||
| ; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]} | ||
| ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]} | ||
| ; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]]} | ||
| ; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]} | ||
| ; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]]} | ||
| ; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]} | ||
| ; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]]} | ||
| ; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]]} | ||
| ;. |
There was a problem hiding this comment.
can generate checks with --check-globals=none to remove those
| %max = phi i32 [ 0, %entry ], [ %rdx.next, %loop ] | ||
| %gep.a = getelementptr inbounds i32, ptr %a, i64 %iv | ||
| %1 = load i32, ptr %gep.a, align 4 | ||
| %rdx.next = call i32 @llvm.umax.i32(i32 %max, i32 %1) |
There was a problem hiding this comment.
hmm, surprised we do not support this form yet, as I thought the IVDescriptors logic would match it.
…max.ll Co-authored-by: Florian Hahn <[email protected]>
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/223/builds/4214 Here is the relevant piece of the build log for the reference |
This patch
getMinMaxRecurrence) to handle out-of-loop uses.Planning to take support for vector types in the next patch.