[LV] Support binary and unary operations with EVL-vectorization#93854
[LV] Support binary and unary operations with EVL-vectorization#93854
Conversation
|
@llvm/pr-subscribers-llvm-transforms Author: Kolya Panchenko (nikolaypanchenko). Changes: The patch adds … [description truncated in this capture]. Patch is 138.14 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/93854.diff — 8 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index e75a1de548f7d..f1381b99ed68f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -855,6 +855,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPWidenCastSC:
case VPRecipeBase::VPWidenGEPSC:
case VPRecipeBase::VPWidenSC:
+ case VPRecipeBase::VPWidenEVLSC:
case VPRecipeBase::VPWidenSelectSC:
case VPRecipeBase::VPBlendSC:
case VPRecipeBase::VPPredInstPHISC:
@@ -1039,6 +1040,7 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
static inline bool classof(const VPRecipeBase *R) {
return R->getVPDefID() == VPRecipeBase::VPInstructionSC ||
R->getVPDefID() == VPRecipeBase::VPWidenSC ||
+ R->getVPDefID() == VPRecipeBase::VPWidenEVLSC ||
R->getVPDefID() == VPRecipeBase::VPWidenGEPSC ||
R->getVPDefID() == VPRecipeBase::VPWidenCastSC ||
R->getVPDefID() == VPRecipeBase::VPReplicateSC ||
@@ -1333,13 +1335,18 @@ class VPInstruction : public VPRecipeWithIRFlags {
/// ingredient. This recipe covers most of the traditional vectorization cases
/// where each ingredient transforms into a vectorized version of itself.
class VPWidenRecipe : public VPRecipeWithIRFlags {
+protected:
unsigned Opcode;
+ template <typename IterT>
+ VPWidenRecipe(unsigned VPDefOpcode, Instruction &I,
+ iterator_range<IterT> Operands)
+ : VPRecipeWithIRFlags(VPDefOpcode, Operands, I), Opcode(I.getOpcode()) {}
+
public:
template <typename IterT>
VPWidenRecipe(Instruction &I, iterator_range<IterT> Operands)
- : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, I),
- Opcode(I.getOpcode()) {}
+ : VPWidenRecipe(VPDef::VPWidenSC, I, Operands) {}
~VPWidenRecipe() override = default;
@@ -1363,6 +1370,49 @@ class VPWidenRecipe : public VPRecipeWithIRFlags {
#endif
};
+class VPWidenEVLRecipe : public VPWidenRecipe {
+private:
+ using VPRecipeWithIRFlags::transferFlags;
+
+public:
+ template <typename IterT>
+ VPWidenEVLRecipe(Instruction &I, iterator_range<IterT> Operands, VPValue &EVL)
+ : VPWidenRecipe(VPDef::VPWidenEVLSC, I, Operands) {
+ addOperand(&EVL);
+ }
+
+ ~VPWidenEVLRecipe() override = default;
+
+ VPWidenRecipe *clone() override final {
+ SmallVector<VPValue *> Ops(operands());
+ VPValue *EVL = Ops.pop_back_val();
+ auto *R = new VPWidenEVLRecipe(*getUnderlyingInstr(),
+ make_range(Ops.begin(), Ops.end()), *EVL);
+ R->transferFlags(*this);
+ return R;
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPWidenEVLSC);
+
+ VPValue *getEVL() { return getOperand(getNumOperands() - 1); }
+ const VPValue *getEVL() const { return getOperand(getNumOperands() - 1); }
+
+ /// A helper function to create widen EVL recipe from regular widen recipe.
+ static VPWidenEVLRecipe *create(VPWidenRecipe *W, VPValue &EVL);
+
+ /// Produce widened copies of all Ingredients.
+ void execute(VPTransformState &State) override final;
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ bool onlyFirstLaneUsed(const VPValue *Op) const override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override final;
+#endif
+};
+
/// VPWidenCastRecipe is a recipe to create vector cast instructions.
class VPWidenCastRecipe : public VPRecipeWithIRFlags {
/// Cast instruction opcode.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 5eb99ffd1e10e..a6fd26b666501 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -23,6 +23,7 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
+#include "llvm/IR/VectorBuilder.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -71,6 +72,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
case VPWidenLoadSC:
case VPWidenPHISC:
case VPWidenSC:
+ case VPWidenEVLSC:
case VPWidenSelectSC: {
const Instruction *I =
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
@@ -110,6 +112,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
case VPWidenIntOrFpInductionSC:
case VPWidenPHISC:
case VPWidenSC:
+ case VPWidenEVLSC:
case VPWidenSelectSC: {
const Instruction *I =
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
@@ -157,6 +160,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPWidenPHISC:
case VPWidenPointerInductionSC:
case VPWidenSC:
+ case VPWidenEVLSC:
case VPWidenSelectSC: {
const Instruction *I =
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
@@ -993,6 +997,64 @@ void VPWidenRecipe::execute(VPTransformState &State) {
#endif
}
+VPWidenEVLRecipe *VPWidenEVLRecipe::create(VPWidenRecipe *W, VPValue &EVL) {
+ auto *R = new VPWidenEVLRecipe(*W->getUnderlyingInstr(), W->operands(), EVL);
+ R->transferFlags(*W);
+ return R;
+}
+
+void VPWidenEVLRecipe::execute(VPTransformState &State) {
+ assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
+ "explicit vector length.");
+ VPValue *Op0 = getOperand(0);
+
+ // If it's scalar operation, hand translation over to VPWidenRecipe
+ if (!State.get(Op0, 0)->getType()->isVectorTy())
+ return VPWidenRecipe::execute(State);
+
+ VPValue *EVL = getEVL();
+ Value *EVLArg = State.get(EVL, 0, /*NeedsScalar=*/true);
+ unsigned Opcode = getOpcode();
+ Instruction *I = getUnderlyingInstr();
+ IRBuilderBase &BuilderIR = State.Builder;
+ VectorBuilder Builder(BuilderIR);
+ Value *Mask = BuilderIR.CreateVectorSplat(State.VF, BuilderIR.getTrue());
+ Value *VPInst = nullptr;
+
+ //===------------------- Binary and Unary Ops ---------------------===//
+ if (Instruction::isBinaryOp(Opcode) || Instruction::isUnaryOp(Opcode)) {
+ // Just widen unops and binops.
+
+ SmallVector<Value *, 4> Ops;
+ for (unsigned I = 0, E = getNumOperands() - 1; I < E; ++I) {
+ VPValue *VPOp = getOperand(I);
+ Ops.push_back(State.get(VPOp, 0));
+ }
+
+ Builder.setMask(Mask).setEVL(EVLArg);
+ VPInst = Builder.createVectorInstruction(Opcode, Ops[0]->getType(), Ops,
+ "vp.op");
+
+ if (I)
+ if (auto *VecOp = dyn_cast<Instruction>(VPInst))
+ VecOp->copyIRFlags(I);
+ } else {
+ llvm_unreachable("Unsupported opcode in VPWidenEVLRecipe::execute");
+ }
+ State.set(this, VPInst, 0);
+ State.addMetadata(VPInst, I);
+}
+
+bool VPWidenEVLRecipe::onlyFirstLaneUsed(const VPValue *Op) const {
+ assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
+ // EVL in that recipe is always the last operand, thus any use before means
+ // the VPValue should be vectorized.
+ for (unsigned I = 0, E = getNumOperands() - 1; I != E; ++I)
+ if (getOperand(I) == Op)
+ return false;
+ return true;
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
@@ -1002,6 +1064,15 @@ void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
printFlags(O);
printOperands(O, SlotTracker);
}
+
+void VPWidenEVLRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "WIDEN vp ";
+ printAsOperand(O, SlotTracker);
+ O << " = " << Instruction::getOpcodeName(Opcode);
+ printFlags(O);
+ printOperands(O, SlotTracker);
+}
#endif
void VPWidenCastRecipe::execute(VPTransformState &State) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 422579ea8b84f..39ebd44909ea6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -20,6 +20,7 @@
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Intrinsics.h"
@@ -1219,7 +1220,7 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
/// WideCanonicalIV, backedge-taken-count) pattern.
/// TODO: Introduce explicit recipe for header-mask instead of searching
/// for the header-mask pattern manually.
-static SmallVector<VPValue *> collectAllHeaderMasks(VPlan &Plan) {
+static DenseSet<VPValue *> collectAllHeaderMasks(VPlan &Plan) {
SmallVector<VPValue *> WideCanonicalIVs;
auto *FoundWidenCanonicalIVUser =
find_if(Plan.getCanonicalIV()->users(),
@@ -1245,7 +1246,7 @@ static SmallVector<VPValue *> collectAllHeaderMasks(VPlan &Plan) {
// Walk users of wide canonical IVs and collect to all compares of the form
// (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
- SmallVector<VPValue *> HeaderMasks;
+ DenseSet<VPValue *> HeaderMasks;
VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
for (auto *Wide : WideCanonicalIVs) {
for (VPUser *U : SmallVector<VPUser *>(Wide->users())) {
@@ -1257,7 +1258,7 @@ static SmallVector<VPValue *> collectAllHeaderMasks(VPlan &Plan) {
assert(HeaderMask->getOperand(0) == Wide &&
"WidenCanonicalIV must be the first operand of the compare");
- HeaderMasks.push_back(HeaderMask);
+ HeaderMasks.insert(HeaderMask);
}
}
return HeaderMasks;
@@ -1296,6 +1297,55 @@ void VPlanTransforms::addActiveLaneMask(
HeaderMask->replaceAllUsesWith(LaneMask);
}
+/// Replace recipes with their EVL variants.
+static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
+ DenseSet<VPRecipeBase *> ToRemove;
+
+ ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
+ Plan.getEntry());
+ DenseSet<VPValue *> HeaderMasks = collectAllHeaderMasks(Plan);
+ for (VPBasicBlock *VPBB : reverse(VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT))) {
+ // The recipes in the block are processed in reverse order, to catch chains
+ // of dead recipes.
+ for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
+ TypeSwitch<VPRecipeBase *>(&R)
+ .Case<VPWidenLoadRecipe>([&](VPWidenLoadRecipe *L) {
+ VPValue *NewMask =
+ HeaderMasks.contains(L->getMask()) ? nullptr : L->getMask();
+ auto *N = new VPWidenLoadEVLRecipe(L, &EVL, NewMask);
+ N->insertBefore(L);
+ L->replaceAllUsesWith(N);
+ ToRemove.insert(L);
+ })
+ .Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) {
+ VPValue *NewMask =
+ HeaderMasks.contains(S->getMask()) ? nullptr : S->getMask();
+ auto *N = new VPWidenStoreEVLRecipe(S, &EVL, NewMask);
+ N->insertBefore(S);
+ ToRemove.insert(S);
+ })
+ .Case<VPWidenRecipe>([&](VPWidenRecipe *W) {
+ unsigned Opcode = W->getOpcode();
+ if (!Instruction::isBinaryOp(Opcode) &&
+ !Instruction::isUnaryOp(Opcode))
+ return;
+ auto *N = VPWidenEVLRecipe::create(W, EVL);
+ N->insertBefore(W);
+ W->replaceAllUsesWith(N);
+ ToRemove.insert(W);
+ });
+ }
+ }
+
+ for (VPRecipeBase *R : ToRemove)
+ R->eraseFromParent();
+
+ for (VPValue *HeaderMask : HeaderMasks)
+ recursivelyDeleteDeadRecipes(HeaderMask);
+}
+
+
+
/// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
/// replaces all uses except the canonical IV increment of
/// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe. VPCanonicalIVPHIRecipe
@@ -1356,29 +1406,8 @@ bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
NextEVLIV->insertBefore(CanonicalIVIncrement);
EVLPhi->addOperand(NextEVLIV);
- for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
- for (VPUser *U : collectUsersRecursively(HeaderMask)) {
- auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U);
- if (!MemR)
- continue;
- VPValue *OrigMask = MemR->getMask();
- assert(OrigMask && "Unmasked widen memory recipe when folding tail");
- VPValue *NewMask = HeaderMask == OrigMask ? nullptr : OrigMask;
- if (auto *L = dyn_cast<VPWidenLoadRecipe>(MemR)) {
- auto *N = new VPWidenLoadEVLRecipe(L, VPEVL, NewMask);
- N->insertBefore(L);
- L->replaceAllUsesWith(N);
- L->eraseFromParent();
- } else if (auto *S = dyn_cast<VPWidenStoreRecipe>(MemR)) {
- auto *N = new VPWidenStoreEVLRecipe(S, VPEVL, NewMask);
- N->insertBefore(S);
- S->eraseFromParent();
- } else {
- llvm_unreachable("unsupported recipe");
- }
- }
- recursivelyDeleteDeadRecipes(HeaderMask);
- }
+ transformRecipestoEVLRecipes(Plan, *VPEVL);
+
// Replace all uses of VPCanonicalIVPHIRecipe by
// VPEVLBasedIVPHIRecipe except for the canonical IV increment.
CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 8d945f6f2b8ea..a90d42289e5f1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -356,6 +356,7 @@ class VPDef {
VPWidenStoreEVLSC,
VPWidenStoreSC,
VPWidenSC,
+ VPWidenEVLSC,
VPWidenSelectSC,
VPBlendSC,
// START: Phi-like recipes. Need to be kept together.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll
new file mode 100644
index 0000000000000..77689fc8a76ee
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll
@@ -0,0 +1,1799 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=IF-EVL
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=NO-VP
+
+
+define void @test_and(ptr nocapture %a, ptr nocapture readonly %b) {
+; IF-EVL-LABEL: define void @test_and(
+; IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]:
+; IF-EVL-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; IF-EVL-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
+; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
+; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; IF-EVL: [[VECTOR_PH]]:
+; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 16
+; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 100, [[TMP7]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 16
+; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]]
+; IF-EVL: [[VECTOR_BODY]]:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true)
+; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP15:%.*]] = call <vscale x 16 x i8> @llvm.vp.and.nxv16i8(<vscale x 16 x i8> [[VP_OP_LOAD]], <vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP15]], ptr align 1 [[TMP17]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
+; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL: [[MIDDLE_BLOCK]]:
+; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; IF-EVL: [[SCALAR_PH]]:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; IF-EVL-NEXT: br label %[[LOOP:.*]]
+; IF-EVL: [[LOOP]]:
+; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; IF-EVL-NEXT: [[TMP:%.*]] = and i8 [[TMP20]], 1
+; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
+; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
+; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL: [[FINISH_LOOPEXIT]]:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: define void @test_and(
+; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: br label %[[LOOP:.*]]
+; NO-VP: [[LOOP]]:
+; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-VP-NEXT: [[TMP:%.*]] = and i8 [[TMP0]], 1
+; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
+; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1...
[truncated]
|
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
fa0d938 to
a42f974
Compare
|
CC: @ppenzin |
| for (unsigned I = 0, E = getNumOperands() - 1; I != E; ++I) | ||
| if (getOperand(I) == Op) | ||
| return false; | ||
| return true; |
There was a problem hiding this comment.
can this be simplified to return getEVL() == Op; ?
There was a problem hiding this comment.
It's technically possible to use EVL as first or second operand in the operation itself:
%evl = ...
%0 = add %evl, %0
i.e. that check makes sure onlyFirstLaneUsed returns false if Op == EVL for the example above
There was a problem hiding this comment.
Yes, it looks like this corresponds to fhahn's comment about `if (!State.get(Op0, 0)->getType()->isVectorTy())`, regarding whether operands can be scalar.
There was a problem hiding this comment.
If EVL would be the first/second operand, it would implicitly be splatted. In any case, no such plans can be constructed at the moment I think. Better to verify EVL is only used in supported places?
There was a problem hiding this comment.
I will add verification
| ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT( | ||
| Plan.getEntry()); |
There was a problem hiding this comment.
Consider utilizing Plan.getVectorLoopRegion()->getEntryBasicBlock() to replace recipes that are within the loop.
There was a problem hiding this comment.
That's a good point. We really need to traverse the entire VPlan, but replace instructions in the post-exit with the initial VL, like for a reduction.
That said, I need to add a special check that EVL is reachable before replacing.
There was a problem hiding this comment.
You're right; it reminds me that we must explicitly express InitVL in VPlan once we support reduction. This is because EVL from the last two iterations cannot be used in the post-exit for correctness. For this PR, can we temporarily skip the preheader and post-exit?
fhahn
left a comment
There was a problem hiding this comment.
Just to double check, this isn't needed for correctness, right?
that's correct. |
fhahn
left a comment
There was a problem hiding this comment.
Just to double check, this isn't needed for correctness, right?
that's correct.
Great, could you add more detail about the motivation of the change to the PR description, i.e. why it is preferable to use the EVL version instead of regular wide ops?
| for (unsigned I = 0, E = getNumOperands() - 1; I != E; ++I) | ||
| if (getOperand(I) == Op) | ||
| return false; | ||
| return true; |
There was a problem hiding this comment.
If EVL would be the first/second operand, it would implicitly be splatted. In any case, no such plans can be constructed at the moment I think. Better to verify EVL is only used in supported places?
b47b6a3 to
5311f92
Compare
5311f92 to
7ad2abe
Compare
| // Verify that \p EVL is used correctly. The user must be either in EVL-based | ||
| // recipes as a last operand or VPInstruction::Add which is incoming value | ||
| // into EVL's recipe. | ||
| bool verifyEVLRecipe(const VPInstruction &EVL) const; |
There was a problem hiding this comment.
Could this be split off, and submitted first?
There was a problem hiding this comment.
This is independent of the unary/binary support, right? Can be split off + a test that forces VF=1 and IC>1?
There was a problem hiding this comment.
It does relate to that change. Without it we create EVL recipes for scalar VPlan. For load/store that is not a problem, but it's a problem for WidenEVL since it cannot handle VF=1
There was a problem hiding this comment.
Hm, does it make sense to create VPlans with EVL in general? IIUC it doesn't and this could be done independent of the patch.
widen loads and stores shouldn't be created in a scalar plan, I guess at the moment this just works as there are no recipes that can be in scalar plans and get converted to an EVL version without this patch.
There was a problem hiding this comment.
Hm, does it make sense to create VPlans with EVL in general? IIUC it doesn't and this could be done independent of the patch.
Not sure I follow.
widen loads and stores shouldn't be created in a scalar plan, I guess at the moment this just works as there are no recipes that can be in scalar plans and get converted to an EVL version without this patch.
I double checked the code and looks like I got confused with VPInstruction. So I agree VPWidenRecipe won't be a problem. Will move it to a separate patch
52b9548 to
e0c7e7c
Compare
| static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { | ||
| VPDominatorTree VPDT; | ||
| VPDT.recalculate(Plan); | ||
| DenseSet<VPRecipeBase *> ToRemove; |
There was a problem hiding this comment.
SmallVector should be sufficient?
| for (VPBasicBlock *VPBB : | ||
| reverse(VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT))) { | ||
| for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) { | ||
| if (!properlyDominates(EVL.getDefiningRecipe(), &R, VPDT)) |
There was a problem hiding this comment.
Can this be an assert? We just earlier inserted EVL at a place that should dominate all recipes handled below?
There was a problem hiding this comment.
It's related to the comment above: there are some cases where the preheader or post-exit may technically contain EVL-based recipes; they do require EVL, but it should be constructed beforehand.
There was a problem hiding this comment.
will remove it for now as it does assert work for VF=1.
| Plan.getEntry()); | ||
| DenseSet<VPValue *> HeaderMasks = collectAllHeaderMasks(Plan); | ||
| for (VPBasicBlock *VPBB : | ||
| reverse(VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT))) { |
There was a problem hiding this comment.
Is an RPOT & reverse needed?
There was a problem hiding this comment.
Yes, otherwise recipes won't be removed correctly.
BTW, the comment I removed was copied with a loop from removeDeadRecipes
There was a problem hiding this comment.
I am not sure, I think the reason removeDeadRecipes does it in that order to remove later dead recipes before their users, but the current patch removes all replaced recipes, then removes the masks. The only operand that isn't used any longer when removing the original recipes is the mask, so I am not sure in what cases the order matters?
There was a problem hiding this comment.
agree. removed
There was a problem hiding this comment.
Hm, does it make sense to create VPlans with EVL in general? IIUC it doesn't and this could be done independent of the patch.
widen loads and stores shouldn't be created in a scalar plan, I guess at the moment this just works as there are no recipes that can be in scalar plans and get converted to an EVL version without this patch.
e0c7e7c to
dd1f74f
Compare
dd1f74f to
5098471
Compare
| ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT( | ||
| Plan.getEntry()); | ||
| DenseSet<VPValue *> HeaderMasks = collectAllHeaderMasks(Plan); | ||
| for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) { |
| ; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> | ||
| ; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]> | ||
| ; IF-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]> | ||
| ; IF-EVL-NEXT: WIDEN-VP ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]> |
There was a problem hiding this comment.
Should this also match the EVL operand?
There was a problem hiding this comment.
agree. Fixed.
5098471 to
7312b20
Compare
|
@fhahn ping |
| VPWidenRecipe(Instruction &I, iterator_range<IterT> Operands) | ||
| : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, I), | ||
| Opcode(I.getOpcode()) {} | ||
| : VPWidenRecipe(VPDef::VPWidenSC, I, Operands) {} |
There was a problem hiding this comment.
This should probably have a custom classof is isa<VPWidenRecipe> returns true for both VPWidenRecipe and VPWidenEVLRecipe?
There was a problem hiding this comment.
Why? Was the goal of having dedicated EVL recipes to prevent treating them as non-EVL?
Should VPWidenLoad also return true for VPWidenLoadEVL?
There was a problem hiding this comment.
The main benefit from having a shared base-class is so analyses don't have to handle all recipes when it makes sense.
I think analyses that apply to VPWidenRecipe should also conservatively apply to WPWidenEVLRecipe, as the later only possibly operates on fewer values. If that's not sound, we probably shouldn't inherit from VPWidenRecipe without also implementing the corresponding isa relationship.
VPWidenLoad/VPWidenLoadEVL only share VPWidenMemoryRecipe as common base-class, for which all VPWidenLoad|Store? return true.
There was a problem hiding this comment.
I assume than you really meant to introduce base class for VPWidenRecipe and VPWidenEVLRecipe that will return true for both of them, right ? In this case class hierarchy will be similar to VPWiden[Load|Store][|EVL].
If so, same should go to all future EVL-recipes:
VPSomeRecipeBase
/ \
VPSomeRecipe VPSomeEVLRecipe
VPSomeRecipeBase::classof(...) { return Opcode == VPSomeRecipeSC || Opcode == VPSomeEVLRecipeSC; }
There was a problem hiding this comment.
Ideally VPWidenRecipe could serve as such a base class as mentioned above, unless there’s a compelling reason not to. That way we automatically benefit from all folds and analysis already implemented for VPWidenRecipe
There was a problem hiding this comment.
Not sure I understand the added hierarchy then. If the goal is to allow reuse of existing code, then what is different about VPWidenStoreEVL or VPWidenLoadEVL, as they won't be treated as VPWidenStore or VPWidenLoad respectively:
isa<VPWidenMemoryRecipe>(EVLStore) => true
isa<VPWidenStoreEVLRecipe>(EVLStore) => true
isa<VPWidenStoreRecipe>(EVLStore) => false
Anyway, I'm ok to extend classof
There was a problem hiding this comment.
Thanks for the update. Adjusting VPWidenStoreEVLRecipe/VPWidenLoadEVLRecipe would probably make sense separately, although the reasoning may be a bit more complicated.
| VPWidenRecipe(Instruction &I, iterator_range<IterT> Operands) | ||
| : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, I), | ||
| Opcode(I.getOpcode()) {} | ||
| : VPWidenRecipe(VPDef::VPWidenSC, I, Operands) {} |
There was a problem hiding this comment.
The main benefit from having a shared base-class is so analyses don't have to handle all recipes when it makes sense.
I think analyses that apply to VPWidenRecipe should also conservatively apply to WPWidenEVLRecipe, as the later only possibly operates on fewer values. If that's not sound, we probably shouldn't inherit from VPWidenRecipe without also implementing the corresponding isa relationship.
VPWidenLoad/VPWidenLoadEVL only share VPWidenMemoryRecipe as common base-class, for which all VPWidenLoad|Store? return true.
| /// mask. | ||
| struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue { | ||
| VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue &EVL, VPValue *Mask) | ||
| VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue *Mask, VPValue &EVL) |
There was a problem hiding this comment.
are those changes intentional? If so, would be good to split off, but as the mask is the optional last operand, the current order seems to be intentional (Mask could have nullptr as default arg)
80c43db to
661356c
Compare
fhahn
left a comment
There was a problem hiding this comment.
Thanks for the latest updates, a few final comments inline
| ; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 | ||
| ; IF-EVL-NEXT: [[TMP4:%.*]] = sub i64 [[TMP1]], 1 | ||
| ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] | ||
| ; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 |
There was a problem hiding this comment.
Are the changes here caused by the patch? After a quick spot check, it seems only related to CHECK variable renames? If so would be good to remove them from the patch
There was a problem hiding this comment.
most likely is an artifact from some of my previous commits
| VPWidenRecipe(Instruction &I, iterator_range<IterT> Operands) | ||
| : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, I), | ||
| Opcode(I.getOpcode()) {} | ||
| : VPWidenRecipe(VPDef::VPWidenSC, I, Operands) {} |
There was a problem hiding this comment.
Thanks for the update. Adjusting VPWidenStoreEVLRecipe/VPWidenLoadEVLRecipe would probably make sense separately, although the reasoning may be a bit more complicated.
c8afdb5 to
6af8c2e
Compare
| bool onlyFirstLaneUsed(const VPValue *Op) const override { | ||
| assert(is_contained(operands(), Op) && | ||
| "Op must be an operand of the recipe"); | ||
| // EVL in that recipe is always the last operand, thus any use before means |
There was a problem hiding this comment.
Would be good to post a follow-up patch to enforce EVL only used as last operand of various recipes in the verifier
There was a problem hiding this comment.
Sure. I will create PR with verification I added and removed previously
71c9936 to
da6c1f2
Compare
fhahn
left a comment
There was a problem hiding this comment.
LGTM, thanks! A few small additional suggestions inline
| bool onlyFirstLaneUsed(const VPValue *Op) const override { | ||
| assert(is_contained(operands(), Op) && | ||
| "Op must be an operand of the recipe"); | ||
| // EVL in that recipe is always the last operand, thus any use before means |
The patch adds `VPWidenEVLRecipe` which represents `VPWidenRecipe` + EVL argument. The new recipe replaces `VPWidenRecipe` in `tryAddExplicitVectorLength` for each binary and unary operations. Follow up patches will extend support for remaining cases, like `FCmp` and `ICmp`
supported by vp intrinsics
VP intrinsics can only accept FMFs at this moment, thus trying to set other flags will lead to ICE
234479d to
1f8726e
Compare
|
@nikolaypanchenko can merge this patch? |
|
I've fixed an unused variable warning from this PR with ce192b8. |
thanks. I'll double check why I missed it in the first place |
The patch adds
VPWidenEVLRecipewhich representsVPWidenRecipe+ EVL argument. The new recipe replacesVPWidenRecipeintryAddExplicitVectorLengthfor each binary and unary operations. Follow up patches will extend support for remaining cases, likeFCmpandICmp