[LV] Reduce register usage for scaled reductions#133090
[LV] Reduce register usage for scaled reductions#133090SamTebbs33 merged 19 commits intollvm:mainfrom
Conversation
|
@llvm/pr-subscribers-vectorizers Author: Sam Tebbs (SamTebbs33) ChangesThis PR accounts for scaled reductions in Patch is 56.56 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/133090.diff 5 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index c9f314c0ba481..da701ef9ff1a2 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5026,10 +5026,23 @@ calculateRegisterUsage(VPlan &Plan, ArrayRef<ElementCount> VFs,
// even in the scalar case.
RegUsage[ClassID] += 1;
} else {
+ // The output from scaled phis and scaled reductions actually have
+ // fewer lanes than the VF.
+ auto VF = VFs[J];
+ if (auto *ReductionR = dyn_cast<VPReductionPHIRecipe>(R))
+ VF = VF.divideCoefficientBy(ReductionR->getVFScaleFactor());
+ else if (auto *PartialReductionR =
+ dyn_cast<VPPartialReductionRecipe>(R))
+ VF = VF.divideCoefficientBy(PartialReductionR->getScaleFactor());
+ if (VF != VFs[J])
+ LLVM_DEBUG(dbgs() << "LV(REG): Scaled down VF from " << VFs[J]
+ << " to " << VF << " for ";
+ R->dump(););
+
for (VPValue *DefV : R->definedValues()) {
Type *ScalarTy = TypeInfo.inferScalarType(DefV);
unsigned ClassID = TTI.getRegisterClassForType(true, ScalarTy);
- RegUsage[ClassID] += GetRegUsage(ScalarTy, VFs[J]);
+ RegUsage[ClassID] += GetRegUsage(ScalarTy, VF);
}
}
}
@@ -8963,8 +8976,8 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(
if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
return tryToWidenMemory(Instr, Operands, Range);
- if (getScalingForReduction(Instr))
- return tryToCreatePartialReduction(Instr, Operands);
+ if (auto ScaleFactor = getScalingForReduction(Instr))
+ return tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value());
if (!shouldWiden(Instr, Range))
return nullptr;
@@ -8988,7 +9001,8 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(
VPRecipeBase *
VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
- ArrayRef<VPValue *> Operands) {
+ ArrayRef<VPValue *> Operands,
+ unsigned ScaleFactor) {
assert(Operands.size() == 2 &&
"Unexpected number of operands for partial reduction");
@@ -9021,7 +9035,7 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
BinOp = Builder.createSelect(Mask, BinOp, Zero, Reduction->getDebugLoc());
}
return new VPPartialReductionRecipe(ReductionOpcode, BinOp, Accumulator,
- Reduction);
+ ScaleFactor, Reduction);
}
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 334cfbad8bd7c..fd0064a34c4c9 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -178,7 +178,8 @@ class VPRecipeBuilder {
/// Create and return a partial reduction recipe for a reduction instruction
/// along with binary operation and reduction phi operands.
VPRecipeBase *tryToCreatePartialReduction(Instruction *Reduction,
- ArrayRef<VPValue *> Operands);
+ ArrayRef<VPValue *> Operands,
+ unsigned ScaleFactor);
/// Set the recipe created for given ingredient.
void setRecipe(Instruction *I, VPRecipeBase *R) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 80b3d2a760293..d84efb1bd6850 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2001,6 +2001,8 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
/// Generate the phi/select nodes.
void execute(VPTransformState &State) override;
+ unsigned getVFScaleFactor() const { return VFScaleFactor; }
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
@@ -2031,17 +2033,19 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
/// scalar value.
class VPPartialReductionRecipe : public VPSingleDefRecipe {
unsigned Opcode;
+ unsigned ScaleFactor;
public:
VPPartialReductionRecipe(Instruction *ReductionInst, VPValue *Op0,
- VPValue *Op1)
+ VPValue *Op1, unsigned ScaleFactor)
: VPPartialReductionRecipe(ReductionInst->getOpcode(), Op0, Op1,
- ReductionInst) {}
+ ScaleFactor, ReductionInst) {}
VPPartialReductionRecipe(unsigned Opcode, VPValue *Op0, VPValue *Op1,
+ unsigned ScaleFactor,
Instruction *ReductionInst = nullptr)
: VPSingleDefRecipe(VPDef::VPPartialReductionSC,
ArrayRef<VPValue *>({Op0, Op1}), ReductionInst),
- Opcode(Opcode) {
+ Opcode(Opcode), ScaleFactor(ScaleFactor) {
[[maybe_unused]] auto *AccumulatorRecipe =
getOperand(1)->getDefiningRecipe();
assert((isa<VPReductionPHIRecipe>(AccumulatorRecipe) ||
@@ -2052,7 +2056,7 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe {
VPPartialReductionRecipe *clone() override {
return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1),
- getUnderlyingInstr());
+ ScaleFactor, getUnderlyingInstr());
}
VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)
@@ -2067,6 +2071,8 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe {
/// Get the binary op's opcode.
unsigned getOpcode() const { return Opcode; }
+ unsigned getScaleFactor() const { return ScaleFactor; }
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
index de710bfbf8561..ab1cf84dba67d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
@@ -780,10 +780,10 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-LABEL: define i32 @dotp_unrolled(
; CHECK-INTERLEAVED-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVED-NEXT: entry:
-; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 32
; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 32
; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
@@ -792,6 +792,10 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE1:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE11:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
@@ -805,45 +809,81 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i8>, ptr [[TMP38]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = sext <16 x i8> [[WIDE_LOAD14]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP42]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = sext <16 x i8> [[WIDE_LOAD10]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]])
+; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = mul nsw <16 x i32> [[TMP44]], [[TMP40]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE1]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP16]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE11]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP46]])
; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i8>, ptr [[TMP48]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = sext <16 x i8> [[WIDE_LOAD13]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD15:%.*]] = load <16 x i8>, ptr [[TMP52]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = sext <16 x i8> [[WIDE_LOAD15]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]])
+; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = mul nsw <16 x i32> [[TMP50]], [[TMP53]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP21]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP54]])
; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD19:%.*]] = load <16 x i8>, ptr [[TMP55]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = sext <16 x i8> [[WIDE_LOAD19]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load <16 x i8>, ptr [[TMP37]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = sext <16 x i8> [[WIDE_LOAD21]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]])
+; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = mul nsw <16 x i32> [[TMP56]], [[TMP39]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP26]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP41]])
; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD25:%.*]] = load <16 x i8>, ptr [[TMP43]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = sext <16 x i8> [[WIDE_LOAD25]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP29]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD27:%.*]] = load <16 x i8>, ptr [[TMP47]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = sext <16 x i8> [[WIDE_LOAD27]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]]
+; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = mul nsw <16 x i32> [[TMP45]], [[TMP49]]
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]])
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP51]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE13]])
-; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]])
-; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE7]])
-; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE10]], [[PARTIAL_REDUCE13]]
+; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX30:%.*]] = add <4 x i32> [[PARTIAL_REDUCE]], [[PARTIAL_REDUCE7]]
+; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX30]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX31:%.*]] = add <4 x i32> [[PARTIAL_REDUCE17]], [[PARTIAL_REDUCE16]]
+; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX31]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX32:%.*]] = add <4 x i32> [[PARTIAL_REDUCE11]], [[PARTIAL_REDUCE1]]
+; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX32]])
; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVED: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index 75705fdfc23e5..9eaec9353589c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -3177,6 +3177,420 @@ for.exit: ; preds = %for.body
ret i32 %add
}
+define dso_local void @dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, i32 %n) #1 {
+; CHECK-INTERLEAVE1-LABEL: define dso_local void @dotp_high_register_pressure(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[SUM:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[CMP100:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP100]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK-INTERLEAVE1: for.body.lr.ph:
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 4
+; CHECK-INTERLEAVE1-NEXT: [[GEP_B_12:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 8
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 12
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX40:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 16
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX58:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 20
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX59:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 24
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX67:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 28
+; CHECK-INTERLEAVE1-NEXT: [[SUM_PROMOTED:%.*]] = load i32, ptr [[SUM]], align 4
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX13_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4
+; CHECK-INTERLEAVE1-NEXT: [[GEP_B_12_PROMOTED:%.*]] = load i32, ptr [[GEP_B_12]], align 4
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX31_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX31]], align 4
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX40_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX40]], align 4
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX58_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX58]], align 4
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX58_PROMOTED1:%.*]] = load i32, ptr [[ARRAYIDX5...
[truncated]
|
|
@llvm/pr-subscribers-llvm-transforms Author: Sam Tebbs (SamTebbs33) ChangesThis PR accounts for scaled reductions in Patch is 56.56 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/133090.diff 5 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index c9f314c0ba481..da701ef9ff1a2 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5026,10 +5026,23 @@ calculateRegisterUsage(VPlan &Plan, ArrayRef<ElementCount> VFs,
// even in the scalar case.
RegUsage[ClassID] += 1;
} else {
+ // The output from scaled phis and scaled reductions actually have
+ // fewer lanes than the VF.
+ auto VF = VFs[J];
+ if (auto *ReductionR = dyn_cast<VPReductionPHIRecipe>(R))
+ VF = VF.divideCoefficientBy(ReductionR->getVFScaleFactor());
+ else if (auto *PartialReductionR =
+ dyn_cast<VPPartialReductionRecipe>(R))
+ VF = VF.divideCoefficientBy(PartialReductionR->getScaleFactor());
+ if (VF != VFs[J])
+ LLVM_DEBUG(dbgs() << "LV(REG): Scaled down VF from " << VFs[J]
+ << " to " << VF << " for ";
+ R->dump(););
+
for (VPValue *DefV : R->definedValues()) {
Type *ScalarTy = TypeInfo.inferScalarType(DefV);
unsigned ClassID = TTI.getRegisterClassForType(true, ScalarTy);
- RegUsage[ClassID] += GetRegUsage(ScalarTy, VFs[J]);
+ RegUsage[ClassID] += GetRegUsage(ScalarTy, VF);
}
}
}
@@ -8963,8 +8976,8 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(
if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
return tryToWidenMemory(Instr, Operands, Range);
- if (getScalingForReduction(Instr))
- return tryToCreatePartialReduction(Instr, Operands);
+ if (auto ScaleFactor = getScalingForReduction(Instr))
+ return tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value());
if (!shouldWiden(Instr, Range))
return nullptr;
@@ -8988,7 +9001,8 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(
VPRecipeBase *
VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
- ArrayRef<VPValue *> Operands) {
+ ArrayRef<VPValue *> Operands,
+ unsigned ScaleFactor) {
assert(Operands.size() == 2 &&
"Unexpected number of operands for partial reduction");
@@ -9021,7 +9035,7 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
BinOp = Builder.createSelect(Mask, BinOp, Zero, Reduction->getDebugLoc());
}
return new VPPartialReductionRecipe(ReductionOpcode, BinOp, Accumulator,
- Reduction);
+ ScaleFactor, Reduction);
}
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 334cfbad8bd7c..fd0064a34c4c9 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -178,7 +178,8 @@ class VPRecipeBuilder {
/// Create and return a partial reduction recipe for a reduction instruction
/// along with binary operation and reduction phi operands.
VPRecipeBase *tryToCreatePartialReduction(Instruction *Reduction,
- ArrayRef<VPValue *> Operands);
+ ArrayRef<VPValue *> Operands,
+ unsigned ScaleFactor);
/// Set the recipe created for given ingredient.
void setRecipe(Instruction *I, VPRecipeBase *R) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 80b3d2a760293..d84efb1bd6850 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2001,6 +2001,8 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
/// Generate the phi/select nodes.
void execute(VPTransformState &State) override;
+ unsigned getVFScaleFactor() const { return VFScaleFactor; }
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
@@ -2031,17 +2033,19 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
/// scalar value.
class VPPartialReductionRecipe : public VPSingleDefRecipe {
unsigned Opcode;
+ unsigned ScaleFactor;
public:
VPPartialReductionRecipe(Instruction *ReductionInst, VPValue *Op0,
- VPValue *Op1)
+ VPValue *Op1, unsigned ScaleFactor)
: VPPartialReductionRecipe(ReductionInst->getOpcode(), Op0, Op1,
- ReductionInst) {}
+ ScaleFactor, ReductionInst) {}
VPPartialReductionRecipe(unsigned Opcode, VPValue *Op0, VPValue *Op1,
+ unsigned ScaleFactor,
Instruction *ReductionInst = nullptr)
: VPSingleDefRecipe(VPDef::VPPartialReductionSC,
ArrayRef<VPValue *>({Op0, Op1}), ReductionInst),
- Opcode(Opcode) {
+ Opcode(Opcode), ScaleFactor(ScaleFactor) {
[[maybe_unused]] auto *AccumulatorRecipe =
getOperand(1)->getDefiningRecipe();
assert((isa<VPReductionPHIRecipe>(AccumulatorRecipe) ||
@@ -2052,7 +2056,7 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe {
VPPartialReductionRecipe *clone() override {
return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1),
- getUnderlyingInstr());
+ ScaleFactor, getUnderlyingInstr());
}
VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)
@@ -2067,6 +2071,8 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe {
/// Get the binary op's opcode.
unsigned getOpcode() const { return Opcode; }
+ unsigned getScaleFactor() const { return ScaleFactor; }
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
index de710bfbf8561..ab1cf84dba67d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
@@ -780,10 +780,10 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-LABEL: define i32 @dotp_unrolled(
; CHECK-INTERLEAVED-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVED-NEXT: entry:
-; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 32
; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 32
; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
@@ -792,6 +792,10 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE1:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE11:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
@@ -805,45 +809,81 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i8>, ptr [[TMP38]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = sext <16 x i8> [[WIDE_LOAD14]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP42]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = sext <16 x i8> [[WIDE_LOAD10]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]])
+; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = mul nsw <16 x i32> [[TMP44]], [[TMP40]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE1]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP16]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE11]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP46]])
; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i8>, ptr [[TMP48]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = sext <16 x i8> [[WIDE_LOAD13]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD15:%.*]] = load <16 x i8>, ptr [[TMP52]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = sext <16 x i8> [[WIDE_LOAD15]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]])
+; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = mul nsw <16 x i32> [[TMP50]], [[TMP53]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP21]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP54]])
; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD19:%.*]] = load <16 x i8>, ptr [[TMP55]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = sext <16 x i8> [[WIDE_LOAD19]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load <16 x i8>, ptr [[TMP37]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = sext <16 x i8> [[WIDE_LOAD21]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]])
+; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = mul nsw <16 x i32> [[TMP56]], [[TMP39]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP26]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP41]])
; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD25:%.*]] = load <16 x i8>, ptr [[TMP43]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = sext <16 x i8> [[WIDE_LOAD25]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP29]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD27:%.*]] = load <16 x i8>, ptr [[TMP47]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = sext <16 x i8> [[WIDE_LOAD27]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]]
+; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = mul nsw <16 x i32> [[TMP45]], [[TMP49]]
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]])
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP51]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE13]])
-; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]])
-; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE7]])
-; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE10]], [[PARTIAL_REDUCE13]]
+; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX30:%.*]] = add <4 x i32> [[PARTIAL_REDUCE]], [[PARTIAL_REDUCE7]]
+; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX30]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX31:%.*]] = add <4 x i32> [[PARTIAL_REDUCE17]], [[PARTIAL_REDUCE16]]
+; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX31]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX32:%.*]] = add <4 x i32> [[PARTIAL_REDUCE11]], [[PARTIAL_REDUCE1]]
+; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX32]])
; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVED: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index 75705fdfc23e5..9eaec9353589c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -3177,6 +3177,420 @@ for.exit: ; preds = %for.body
ret i32 %add
}
+define dso_local void @dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, i32 %n) #1 {
+; CHECK-INTERLEAVE1-LABEL: define dso_local void @dotp_high_register_pressure(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[SUM:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[CMP100:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP100]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK-INTERLEAVE1: for.body.lr.ph:
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 4
+; CHECK-INTERLEAVE1-NEXT: [[GEP_B_12:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 8
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 12
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX40:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 16
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX58:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 20
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX59:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 24
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX67:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 28
+; CHECK-INTERLEAVE1-NEXT: [[SUM_PROMOTED:%.*]] = load i32, ptr [[SUM]], align 4
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX13_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4
+; CHECK-INTERLEAVE1-NEXT: [[GEP_B_12_PROMOTED:%.*]] = load i32, ptr [[GEP_B_12]], align 4
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX31_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX31]], align 4
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX40_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX40]], align 4
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX58_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX58]], align 4
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX58_PROMOTED1:%.*]] = load i32, ptr [[ARRAYIDX5...
[truncated]
|
NickGuy-Arm
left a comment
There was a problem hiding this comment.
Looks generally good to me so far, with a few nitpicks.
| /// scalar value. | ||
| class VPPartialReductionRecipe : public VPSingleDefRecipe { | ||
| unsigned Opcode; | ||
| unsigned ScaleFactor; |
There was a problem hiding this comment.
Nit: Could this be VFScaleFactor to match the equivalent in VPReductionPHIRecipe?
| else if (auto *PartialReductionR = | ||
| dyn_cast<VPPartialReductionRecipe>(R)) | ||
| VF = VF.divideCoefficientBy(PartialReductionR->getScaleFactor()); | ||
| if (VF != VFs[J]) |
There was a problem hiding this comment.
Nit: If the condition is only used for debug output then can it be moved to inside the LLVM_DEBUG
There was a problem hiding this comment.
Could you pre-commit this test, so we can see how the output changes before and after the changes in LoopVectorize.cpp
There was a problem hiding this comment.
Good idea, done.
| // The output from scaled phis and scaled reductions actually have | ||
| // fewer lanes than the VF. | ||
| auto VF = VFs[J]; | ||
| if (auto *ReductionR = dyn_cast<VPReductionPHIRecipe>(R)) |
There was a problem hiding this comment.
[Idle thought, feel free to ignore]
I wonder if there's precedent to add a getVFScaleFactor or equivalent to the base recipe class (or one of the other subclasses), and allow any recipe to override it instead of explicitly checking for every type that could scale the VF.
Likely not yet, and almost certainly not in this patch, but maybe something to consider in the future?
There was a problem hiding this comment.
Yeah that's a nice idea. We could add a VPScaledRecipe class. I agree with doing it afterwards.
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
There was a problem hiding this comment.
nit: please don't use auto when it's not obvious from the context what the type is. (I'm reviewing this code and can't see from the limited context that Github gives me what the type of VF is)
There was a problem hiding this comment.
Is it possible to add a new RUN line that prints out the debug output from LV and have a CHECK line for the register pressure?
If so, then I wonder if there's a need to add another test, and perhaps the CHECK lines for lower register pressure are sufficient.
b11c763 to
c72adbe
Compare
There was a problem hiding this comment.
I realise it may be less efficient, but perhaps it's better to commonise these into the same block? If for some reason we need to update this logic in future it's easier to fix it only once, i.e.
if (isa<VPReductionPHIRecipe, VPPartialReductionRecipe>(R)) {
auto *ReductionR = dyn_cast<VPReductionPHIRecipe>(R);
auto *PartialReductionR = dyn_cast<VPPartialReductionRecipe>(R);
unsigned ScaleFactor = ReductionR ? ReductionR->getVFScaleFactor() : PartialReductionR->getVFScaleFactor();
VF = VF.divideCoefficientBy(ScaleFactor);
}
If getVFScaleFactor becomes available to a common base class then it should simplify further. What do you think?
There was a problem hiding this comment.
Thanks for that. I've done this but made it a little more efficient by not casting the partial reduction if it was a reduction phi.
There was a problem hiding this comment.
Perhaps good to have comments on both new functions added?
There was a problem hiding this comment.
Do you know why this has changed? It doesn't look like a partial reduction.
There was a problem hiding this comment.
Rebasing on top of Florian's merged PR has fixed this and the exit-branch-cost.ll diff, thanks for having a look.
There was a problem hiding this comment.
I'm a bit surprised these are the only CHECK lines that have changed.
There was a problem hiding this comment.
Still missing a REQUIRES: asserts
There was a problem hiding this comment.
minor nit: Is it worth creating a operator<< for VPDef, so that you can write:
| << VF << " for "; | |
| R->dump(); | |
| << VF << " for " << *R << "\n"; |
?
There was a problem hiding this comment.
It is a good idea, but best done separately, I think.
There was a problem hiding this comment.
I would expect it to be a pretty small and uncontroversial change (it would merely add an extra function) and doesn't require any particular tests. Are you sure you don't want to include it in this PR?
sdesmalen-arm
left a comment
There was a problem hiding this comment.
You'll probably want to rebase on top of #126437 again, as some of the code has changed and will cause a merge conflict.
There was a problem hiding this comment.
I would expect it to be a pretty small and uncontroversial change (it would merely add an extra function) and doesn't require any particular tests. Are you sure you don't want to include it in this PR?
There was a problem hiding this comment.
Maybe create a utility function that returns the scaling factor a Recipe, which returns 1 for any recipe other than the VPPartialReductionRecipe/VPReductionPHIRecipe.
Also, please run clang-format on your code.
| if (isa<VPPartialReductionRecipe, VPReductionPHIRecipe>(R)) { | ||
| auto *ReductionR = dyn_cast<VPReductionPHIRecipe>(R); | ||
| auto *PartialReductionR = | ||
| ReductionR ? nullptr : dyn_cast<VPPartialReductionRecipe>(R); | ||
| unsigned ScaleFactor = ReductionR ? ReductionR->getVFScaleFactor() | ||
| : PartialReductionR->getVFScaleFactor(); | ||
| return ScaleFactor; | ||
| } |
There was a problem hiding this comment.
Am I missing something, or can this just be:
| if (isa<VPPartialReductionRecipe, VPReductionPHIRecipe>(R)) { | |
| auto *ReductionR = dyn_cast<VPReductionPHIRecipe>(R); | |
| auto *PartialReductionR = | |
| ReductionR ? nullptr : dyn_cast<VPPartialReductionRecipe>(R); | |
| unsigned ScaleFactor = ReductionR ? ReductionR->getVFScaleFactor() | |
| : PartialReductionR->getVFScaleFactor(); | |
| return ScaleFactor; | |
| } | |
| if (auto *RR = dyn_cast<VPReductionPHIRecipe>(R)) | |
| return RR->getVFScaleFactor(); | |
| if (auto *RR =dyn_cast<VPPartialReductionRecipe>(R)) | |
| return RR->getVFScaleFactor(); |
?
| unsigned ScaleFactor = getVFScaleFactor(R); | ||
| ElementCount VF = VFs[J].divideCoefficientBy(ScaleFactor); | ||
| LLVM_DEBUG(if (VF != VFs[J]) { | ||
| dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << " to " << VF |
There was a problem hiding this comment.
nit: for the tests that check the debug output, can a check for this line be added?
There was a problem hiding this comment.
Unfortunately the VF that has partial reductions is pruned before the register usage is calculated. I can add one as part of #132190 once this is merged, though.
| V.print(OS, SlotTracker); | ||
| return OS; | ||
| } | ||
| raw_ostream &llvm::operator<<(raw_ostream &OS, const VPRecipeBase &R) { |
There was a problem hiding this comment.
Instead of adding this operator, you can instead replace the one above operator<<(raw_ostem &OS, const VPValue &V). If this operator supports VPRecipeBase then that also removes the need for a cast in SLPVectorizer.cpp and the one you added in VPlanTest.cpp.
| /// scalar value. | ||
| class VPPartialReductionRecipe : public VPSingleDefRecipe { | ||
| unsigned Opcode; | ||
| unsigned VFScaleFactor; |
There was a problem hiding this comment.
wuld be good to add a comment here
| return OS; | ||
| } | ||
| raw_ostream &llvm::operator<<(raw_ostream &OS, const VPRecipeBase &R) { | ||
| VPSlotTracker SlotTracker(R.getParent()->getPlan()); |
There was a problem hiding this comment.
Parent may be nullptr if the recipe hasn't been added to a plan yet. Would be good to account for that in the general util
| if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe, | ||
| VPBranchOnMaskRecipe>(R)) | ||
| continue; | ||
|
|
There was a problem hiding this comment.
This is just checkoing the debug output for a single function, right? Might be better to move to the existing llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll if possible
|
|
||
| LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " " | ||
| << *cast<VPInstruction>(Values[0]) << "\n"); | ||
| LLVM_DEBUG(dbgs() << "Create VPInstruction " << cast<VPValue>(*VPI) << " " |
There was a problem hiding this comment.
This seems unfortunate, due to VPI being both a VPValue & VPInstruction?
There was a problem hiding this comment.
Exactly, it was an ambiguous call to operator<<. Sander's latest suggestion to replace the VPValue print function with the VPRecipeBase one makes it unnecessary though.
|
|
||
| if (getScalingForReduction(Instr)) | ||
| return tryToCreatePartialReduction(Instr, Operands); | ||
| if (auto ScaleFactor = getScalingForReduction(Instr)) |
There was a problem hiding this comment.
| if (auto ScaleFactor = getScalingForReduction(Instr)) | |
| if (unsigned ScaleFactor = getScalingForReduction(Instr)) |
sdesmalen-arm
left a comment
There was a problem hiding this comment.
LGTM, thanks @SamTebbs33
| /// Generate the phi/select nodes. | ||
| void execute(VPTransformState &State) override; | ||
|
|
||
| /// Get the factor that the VF of this recipe's output should be scaled by |
There was a problem hiding this comment.
| /// Get the factor that the VF of this recipe's output should be scaled by | |
| /// Get the factor that the VF of this recipe's output should be scaled by. |
| /// Get the binary op's opcode. | ||
| unsigned getOpcode() const { return Opcode; } | ||
|
|
||
| /// Get the factor that the VF of this recipe's output should be scaled by |
There was a problem hiding this comment.
| /// Get the factor that the VF of this recipe's output should be scaled by | |
| /// Get the factor that the VF of this recipe's output should be scaled by. |
This PR accounts for scaled reductions in
calculateRegisterUsageto reflect the fact that the number of lanes in their output is smaller than the VF.Depends on #126437