diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 5a3c4a514a5dd..5f560530400ad 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -525,14 +525,13 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue { static inline bool classof(const VPRecipeBase *R) { switch (R->getVPDefID()) { + case VPRecipeBase::VPBundleSC: case VPRecipeBase::VPDerivedIVSC: case VPRecipeBase::VPEVLBasedIVPHISC: case VPRecipeBase::VPExpandSCEVSC: case VPRecipeBase::VPInstructionSC: case VPRecipeBase::VPReductionEVLSC: case VPRecipeBase::VPReductionSC: - case VPRecipeBase::VPMulAccumulateReductionSC: - case VPRecipeBase::VPExtendedReductionSC: case VPRecipeBase::VPReplicateSC: case VPRecipeBase::VPScalarIVStepsSC: case VPRecipeBase::VPVectorPointerSC: @@ -852,9 +851,7 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags { R->getVPDefID() == VPRecipeBase::VPReductionEVLSC || R->getVPDefID() == VPRecipeBase::VPReplicateSC || R->getVPDefID() == VPRecipeBase::VPVectorEndPointerSC || - R->getVPDefID() == VPRecipeBase::VPVectorPointerSC || - R->getVPDefID() == VPRecipeBase::VPExtendedReductionSC || - R->getVPDefID() == VPRecipeBase::VPMulAccumulateReductionSC; + R->getVPDefID() == VPRecipeBase::VPVectorPointerSC; } static inline bool classof(const VPUser *U) { @@ -2432,28 +2429,6 @@ class VPReductionRecipe : public VPRecipeWithIRFlags { setUnderlyingValue(I); } - /// For VPExtendedReductionRecipe. - /// Note that the debug location is from the extend. - VPReductionRecipe(const unsigned char SC, const RecurKind RdxKind, - ArrayRef Operands, VPValue *CondOp, - bool IsOrdered, DebugLoc DL) - : VPRecipeWithIRFlags(SC, Operands, DL), RdxKind(RdxKind), - IsOrdered(IsOrdered), IsConditional(CondOp) { - if (CondOp) - addOperand(CondOp); - } - - /// For VPMulAccumulateReductionRecipe. - /// Note that the NUW/NSW flags and the debug location are from the Mul. 
- VPReductionRecipe(const unsigned char SC, const RecurKind RdxKind, - ArrayRef Operands, VPValue *CondOp, - bool IsOrdered, WrapFlagsTy WrapFlags, DebugLoc DL) - : VPRecipeWithIRFlags(SC, Operands, WrapFlags, DL), RdxKind(RdxKind), - IsOrdered(IsOrdered), IsConditional(CondOp) { - if (CondOp) - addOperand(CondOp); - } - public: VPReductionRecipe(RecurKind RdxKind, FastMathFlags FMFs, Instruction *I, VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp, @@ -2479,9 +2454,7 @@ class VPReductionRecipe : public VPRecipeWithIRFlags { static inline bool classof(const VPRecipeBase *R) { return R->getVPDefID() == VPRecipeBase::VPReductionSC || - R->getVPDefID() == VPRecipeBase::VPReductionEVLSC || - R->getVPDefID() == VPRecipeBase::VPExtendedReductionSC || - R->getVPDefID() == VPRecipeBase::VPMulAccumulateReductionSC; + R->getVPDefID() == VPRecipeBase::VPReductionEVLSC; } static inline bool classof(const VPUser *U) { @@ -2620,190 +2593,6 @@ class VPReductionEVLRecipe : public VPReductionRecipe { } }; -/// A recipe to represent inloop extended reduction operations, performing a -/// reduction on a extended vector operand into a scalar value, and adding the -/// result to a chain. This recipe is abstract and needs to be lowered to -/// concrete recipes before codegen. The operands are {ChainOp, VecOp, -/// [Condition]}. -class VPExtendedReductionRecipe : public VPReductionRecipe { - /// Opcode of the extend for VecOp. - Instruction::CastOps ExtOp; - - /// The scalar type after extending. - Type *ResultTy; - - /// For cloning VPExtendedReductionRecipe. 
- VPExtendedReductionRecipe(VPExtendedReductionRecipe *ExtRed) - : VPReductionRecipe( - VPDef::VPExtendedReductionSC, ExtRed->getRecurrenceKind(), - {ExtRed->getChainOp(), ExtRed->getVecOp()}, ExtRed->getCondOp(), - ExtRed->isOrdered(), ExtRed->getDebugLoc()), - ExtOp(ExtRed->getExtOpcode()), ResultTy(ExtRed->getResultType()) { - transferFlags(*ExtRed); - setUnderlyingValue(ExtRed->getUnderlyingValue()); - } - -public: - VPExtendedReductionRecipe(VPReductionRecipe *R, VPWidenCastRecipe *Ext) - : VPReductionRecipe(VPDef::VPExtendedReductionSC, R->getRecurrenceKind(), - {R->getChainOp(), Ext->getOperand(0)}, R->getCondOp(), - R->isOrdered(), Ext->getDebugLoc()), - ExtOp(Ext->getOpcode()), ResultTy(Ext->getResultType()) { - assert((ExtOp == Instruction::CastOps::ZExt || - ExtOp == Instruction::CastOps::SExt) && - "VPExtendedReductionRecipe only supports zext and sext."); - - transferFlags(*Ext); - setUnderlyingValue(R->getUnderlyingValue()); - } - - ~VPExtendedReductionRecipe() override = default; - - VPExtendedReductionRecipe *clone() override { - return new VPExtendedReductionRecipe(this); - } - - VP_CLASSOF_IMPL(VPDef::VPExtendedReductionSC); - - void execute(VPTransformState &State) override { - llvm_unreachable("VPExtendedReductionRecipe should be transform to " - "VPExtendedRecipe + VPReductionRecipe before execution."); - }; - - /// Return the cost of VPExtendedReductionRecipe. - InstructionCost computeCost(ElementCount VF, - VPCostContext &Ctx) const override; - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; -#endif - - /// The scalar type after extending. - Type *getResultType() const { return ResultTy; } - - /// Is the extend ZExt? - bool isZExt() const { return getExtOpcode() == Instruction::ZExt; } - - /// Get the opcode of the extend for VecOp. 
- Instruction::CastOps getExtOpcode() const { return ExtOp; } -}; - -/// A recipe to represent inloop MulAccumulateReduction operations, multiplying -/// the vector operands (which may be extended), performing a reduction.add on -/// the result, and adding the scalar result to a chain. This recipe is abstract -/// and needs to be lowered to concrete recipes before codegen. The operands are -/// {ChainOp, VecOp1, VecOp2, [Condition]}. -class VPMulAccumulateReductionRecipe : public VPReductionRecipe { - /// Opcode of the extend for VecOp1 and VecOp2. - Instruction::CastOps ExtOp; - - /// Non-neg flag of the extend recipe. - bool IsNonNeg = false; - - /// The scalar type after extending. - Type *ResultTy = nullptr; - - /// For cloning VPMulAccumulateReductionRecipe. - VPMulAccumulateReductionRecipe(VPMulAccumulateReductionRecipe *MulAcc) - : VPReductionRecipe( - VPDef::VPMulAccumulateReductionSC, MulAcc->getRecurrenceKind(), - {MulAcc->getChainOp(), MulAcc->getVecOp0(), MulAcc->getVecOp1()}, - MulAcc->getCondOp(), MulAcc->isOrdered(), - WrapFlagsTy(MulAcc->hasNoUnsignedWrap(), MulAcc->hasNoSignedWrap()), - MulAcc->getDebugLoc()), - ExtOp(MulAcc->getExtOpcode()), IsNonNeg(MulAcc->isNonNeg()), - ResultTy(MulAcc->getResultType()) { - transferFlags(*MulAcc); - setUnderlyingValue(MulAcc->getUnderlyingValue()); - } - -public: - VPMulAccumulateReductionRecipe(VPReductionRecipe *R, VPWidenRecipe *Mul, - VPWidenCastRecipe *Ext0, - VPWidenCastRecipe *Ext1, Type *ResultTy) - : VPReductionRecipe( - VPDef::VPMulAccumulateReductionSC, R->getRecurrenceKind(), - {R->getChainOp(), Ext0->getOperand(0), Ext1->getOperand(0)}, - R->getCondOp(), R->isOrdered(), - WrapFlagsTy(Mul->hasNoUnsignedWrap(), Mul->hasNoSignedWrap()), - R->getDebugLoc()), - ExtOp(Ext0->getOpcode()), ResultTy(ResultTy) { - assert(RecurrenceDescriptor::getOpcode(getRecurrenceKind()) == - Instruction::Add && - "The reduction instruction in MulAccumulateteReductionRecipe must " - "be Add"); - assert((ExtOp == 
Instruction::CastOps::ZExt || - ExtOp == Instruction::CastOps::SExt) && - "VPMulAccumulateReductionRecipe only supports zext and sext."); - setUnderlyingValue(R->getUnderlyingValue()); - // Only set the non-negative flag if the original recipe contains. - if (Ext0->hasNonNegFlag()) - IsNonNeg = Ext0->isNonNeg(); - } - - VPMulAccumulateReductionRecipe(VPReductionRecipe *R, VPWidenRecipe *Mul, - Type *ResultTy) - : VPReductionRecipe( - VPDef::VPMulAccumulateReductionSC, R->getRecurrenceKind(), - {R->getChainOp(), Mul->getOperand(0), Mul->getOperand(1)}, - R->getCondOp(), R->isOrdered(), - WrapFlagsTy(Mul->hasNoUnsignedWrap(), Mul->hasNoSignedWrap()), - R->getDebugLoc()), - ExtOp(Instruction::CastOps::CastOpsEnd), ResultTy(ResultTy) { - assert(RecurrenceDescriptor::getOpcode(getRecurrenceKind()) == - Instruction::Add && - "The reduction instruction in MulAccumulateReductionRecipe must be " - "Add"); - setUnderlyingValue(R->getUnderlyingValue()); - } - - ~VPMulAccumulateReductionRecipe() override = default; - - VPMulAccumulateReductionRecipe *clone() override { - return new VPMulAccumulateReductionRecipe(this); - } - - VP_CLASSOF_IMPL(VPDef::VPMulAccumulateReductionSC); - - void execute(VPTransformState &State) override { - llvm_unreachable("VPMulAccumulateReductionRecipe should transform to " - "VPWidenCastRecipe + " - "VPWidenRecipe + VPReductionRecipe before execution"); - } - - /// Return the cost of VPMulAccumulateReductionRecipe. - InstructionCost computeCost(ElementCount VF, - VPCostContext &Ctx) const override; - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; -#endif - - Type *getResultType() const { return ResultTy; } - - /// The first vector value to be extended and reduced. - VPValue *getVecOp0() const { return getOperand(1); } - - /// The second vector value to be extended and reduced. 
-  VPValue *getVecOp1() const { return getOperand(2); }
-
-  /// Return true if this recipe contains extended operands.
-  bool isExtended() const { return ExtOp != Instruction::CastOps::CastOpsEnd; }
-
-  /// Return the opcode of the extends for the operands.
-  Instruction::CastOps getExtOpcode() const { return ExtOp; }
-
-  /// Return if the operands are zero-extended.
-  bool isZExt() const { return ExtOp == Instruction::CastOps::ZExt; }
-
-  /// Return true if the operand extends have the non-negative flag.
-  bool isNonNeg() const { return IsNonNeg; }
-};
-
 /// VPReplicateRecipe replicates a given instruction producing multiple scalar
 /// copies of the original scalar type, one per lane, instead of producing a
 /// single copy of widened type for all lanes. If the instruction is known to be
@@ -2922,6 +2711,127 @@ class VPBranchOnMaskRecipe : public VPRecipeBase {
   }
 };
 
+/// A recipe to combine multiple recipes into a 'bundle' recipe, which should be
+/// considered as a single entity for cost-modeling and transforms. The recipe
+/// needs to be 'unbundled', i.e. replaced by its individual recipes before
+/// execute.
+class VPBundleRecipe : public VPSingleDefRecipe {
+  enum class BundleTypes {
+    ExtendedReduction,
+    MulAccumulateReduction,
+  };
+
+  /// Recipes bundled together in this VPBundleRecipe.
+  SmallVector<VPSingleDefRecipe *> BundledOps;
+
+  /// Temporary VPValues used for external operands of the bundle, i.e. operands
+  /// not defined by recipes in the bundle.
+  SmallVector<VPValue *> TmpValues;
+
+  /// Type of the bundle.
+  BundleTypes BundleType;
+
+  VPBundleRecipe(BundleTypes BundleType, ArrayRef<VPSingleDefRecipe *> ToBundle)
+      : VPSingleDefRecipe(VPDef::VPBundleSC, {}, {}), BundledOps(ToBundle),
+        BundleType(BundleType) {
+    // Bundle up the operand recipes.
+    SmallPtrSet<VPUser *, 4> BundledUsers;
+    for (auto *R : BundledOps)
+      BundledUsers.insert(R);
+
+    // Recipes in the bundle, except the last one, must only be used inside the
+    // bundle. If there are other external users, clone the recipes for the bundle.
+    for (const auto &[Idx, R] : enumerate(drop_end(ToBundle))) {
+      if (all_of(R->users(), [&BundledUsers](VPUser *U) {
+            return BundledUsers.contains(U);
+          })) {
+        if (R->getParent())
+          R->removeFromParent();
+        continue;
+      }
+      // R has users external to the bundle. Clone the recipe for use in the
+      // bundle and update all its in-bundle users.
+      VPSingleDefRecipe *Copy = R->clone();
+      BundledOps[Idx] = Copy;
+      BundledUsers.insert(Copy);
+      R->replaceUsesWithIf(Copy, [&BundledUsers](VPUser &U, unsigned) {
+        return BundledUsers.contains(&U);
+      });
+    }
+    BundledOps.back()->removeFromParent();
+
+    // Internalize all external operands to the bundled operations. To do so,
+    // create new temporary VPValues for all operands not defined by a recipe
+    // in the bundle. The original operands are added as operands of the
+    // VPBundleRecipe.
+    for (auto *R : BundledOps) {
+      for (const auto &[Idx, Op] : enumerate(R->operands())) {
+        auto *Def = Op->getDefiningRecipe();
+        if (Def && BundledUsers.contains(Def))
+          continue;
+        addOperand(Op);
+        TmpValues.push_back(new VPValue());
+        R->setOperand(Idx, TmpValues.back());
+      }
+    }
+  }
+
+public:
+  VPBundleRecipe(VPWidenCastRecipe *Ext, VPReductionRecipe *Red)
+      : VPBundleRecipe(BundleTypes::ExtendedReduction, {Ext, Red}) {}
+  VPBundleRecipe(VPWidenRecipe *Mul, VPReductionRecipe *Red)
+      : VPBundleRecipe(BundleTypes::MulAccumulateReduction, {Mul, Red}) {}
+  VPBundleRecipe(VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
+                 VPWidenRecipe *Mul, VPReductionRecipe *Red)
+      : VPBundleRecipe(BundleTypes::MulAccumulateReduction,
+                       {Ext0, Ext1, Mul, Red}) {}
+  VPBundleRecipe(VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
+                 VPWidenRecipe *Mul, VPWidenCastRecipe *Ext2,
+                 VPReductionRecipe *Red)
+      : VPBundleRecipe(BundleTypes::MulAccumulateReduction,
+                       {Ext0, Ext1, Mul, Ext2, Red}) {}
+
+  ~VPBundleRecipe() override {
+    SmallPtrSet<VPSingleDefRecipe *, 4> Seen;
+    for (auto *R : reverse(BundledOps))
+      if (Seen.insert(R).second)
+        delete R;
+    for (VPValue *T : TmpValues)
+      delete T;
+  }
+
+  VP_CLASSOF_IMPL(VPDef::VPBundleSC)
+
+  VPBundleRecipe *clone() override {
+    assert(!BundledOps.empty() && "empty bundles should be removed");
+    return new VPBundleRecipe(BundleType, BundledOps);
+  }
+
+  /// Return the VPSingleDefRecipe producing the final result of the bundled
+  /// recipe.
+  VPSingleDefRecipe *getResultOp() const { return BundledOps.back(); }
+
+  /// Insert the bundled recipes back into the VPlan, directly before the
+  /// current recipe. Leaves the bundle recipe empty and the recipe must be
+  /// removed before codegen.
+  void unbundle();
+
+  /// This recipe is abstract and must be unbundled and removed before codegen;
+  /// reaching execute() is therefore a bug.
+  void execute(VPTransformState &State) override {
+    llvm_unreachable("recipe must be removed before execute");
+  }
+
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
 /// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when
 /// control converges back from a Branch-on-Mask. The phi nodes are needed in
 /// order to merge values that are set under such a branch and feed their uses.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 76da5b0314a8e..1e2961c5beb56 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -267,6 +267,13 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
   Type *ResultTy =
       TypeSwitch<const VPRecipeBase *, Type *>(V->getDefiningRecipe())
+          .Case<VPBundleRecipe>([this](const auto *R) {
+            unsigned RdxOpIdxOffset =
+                cast<VPReductionRecipe>(R->getResultOp())->isConditional() ? 2
+                                                                           : 1;
+            return inferScalarType(
+                R->getOperand(R->getNumOperands() - RdxOpIdxOffset));
+          })
           .Case(
@@ -296,8 +303,6 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
             // TODO: Use info from interleave group.
             return V->getUnderlyingValue()->getType();
           })
-          .Case<VPExtendedReductionRecipe, VPMulAccumulateReductionRecipe>(
-              [](const auto *R) { return R->getResultType(); })
           .Case<VPExpandSCEVRecipe>([](const VPExpandSCEVRecipe *R) {
             return R->getSCEV()->getType();
           })
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 048286d7a97bc..c6bcb1491ee4f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -73,8 +73,6 @@ bool VPRecipeBase::mayWriteToMemory() const {
   case VPBlendSC:
   case VPReductionEVLSC:
   case VPReductionSC:
-  case VPExtendedReductionSC:
-  case VPMulAccumulateReductionSC:
   case VPVectorPointerSC:
   case VPWidenCanonicalIVSC:
   case VPWidenCastSC:
@@ -123,8 +121,6 @@ bool VPRecipeBase::mayReadFromMemory() const {
   case VPBlendSC:
   case VPReductionEVLSC:
   case VPReductionSC:
-  case VPExtendedReductionSC:
-  case VPMulAccumulateReductionSC:
   case VPVectorPointerSC:
   case VPWidenCanonicalIVSC:
   case VPWidenCastSC:
@@ -163,8 +159,6 @@ bool VPRecipeBase::mayHaveSideEffects() const {
   case VPBlendSC:
   case VPReductionEVLSC:
   case VPReductionSC:
-  case VPExtendedReductionSC:
-  case VPMulAccumulateReductionSC:
   case VPScalarIVStepsSC:
   case VPVectorPointerSC:
   case VPWidenCanonicalIVSC:
@@ -2582,30 +2576,143 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
                                  Ctx.CostKind);
 }
 
-InstructionCost
-VPExtendedReductionRecipe::computeCost(ElementCount VF,
-                                       VPCostContext &Ctx) const {
-  unsigned Opcode = RecurrenceDescriptor::getOpcode(getRecurrenceKind());
-  Type *RedTy = Ctx.Types.inferScalarType(this);
-  auto *SrcVecTy =
-      cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(getVecOp()), VF));
-  assert(RedTy->isIntegerTy() &&
-         "ExtendedReduction only support integer type currently.");
-  return Ctx.TTI.getExtendedReductionCost(Opcode, isZExt(), RedTy, SrcVecTy,
-                                          std::nullopt, Ctx.CostKind);
+void VPBundleRecipe::unbundle() {
+  for (auto *Op : BundledOps)
+    if (!Op->getParent())
+      Op->insertBefore(this);
+
+  for (const auto &[Idx, Op] : enumerate(operands()))
+    TmpValues[Idx]->replaceAllUsesWith(Op);
+
+  replaceAllUsesWith(getResultOp());
+
+  if (BundleType == BundleTypes::MulAccumulateReduction &&
+      BundledOps.size() == 5) {
+    // Note that we will drop the extend after mul which transforms
+    // reduce.add(ext(mul(ext, ext))) to reduce.add(mul(ext, ext)).
+    // TODO: This transform should be done separately from bundling/unbundling.
+    auto *Ext0 = cast<VPWidenCastRecipe>(BundledOps[0]);
+    auto *Ext1 = cast<VPWidenCastRecipe>(BundledOps[1]);
+    auto *Ext2 = cast<VPWidenCastRecipe>(BundledOps[3]);
+    auto *Op0 =
+        new VPWidenCastRecipe(Ext0->getOpcode(), Ext0->getOperand(0),
+                              Ext2->getResultType(), *Ext0, getDebugLoc());
+    Op0->insertBefore(Ext0);
+
+    VPSingleDefRecipe *Op1 = Op0;
+    if (Ext0 != Ext1) {
+      Op1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
+                                  Ext2->getResultType(), *Ext1, getDebugLoc());
+      Op1->insertBefore(Ext1);
+    }
+    auto *Mul = cast<VPWidenRecipe>(BundledOps[2]);
+    auto *Red = cast<VPReductionRecipe>(BundledOps[4]);
+    Mul->setOperand(0, Op0);
+    Mul->setOperand(1, Op1);
+    Red->setOperand(1, Mul);
+    Ext0->eraseFromParent();
+    Ext2->eraseFromParent();
+    if (Ext0 != Ext1)
+      Ext1->eraseFromParent();
+  }
+  BundledOps.clear();
 }
 
-InstructionCost
-VPMulAccumulateReductionRecipe::computeCost(ElementCount VF,
+InstructionCost VPBundleRecipe::computeCost(ElementCount VF,
                                             VPCostContext &Ctx) const {
   Type *RedTy = Ctx.Types.inferScalarType(this);
-  auto *SrcVecTy =
-      cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(getVecOp0()), VF));
-  return Ctx.TTI.getMulAccReductionCost(isZExt(), RedTy, SrcVecTy,
-                                        Ctx.CostKind);
+  auto *SrcVecTy = cast<VectorType>(
+      toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
+  assert(RedTy->isIntegerTy() &&
+         "ExtendedReduction only support integer type currently.");
+  switch (BundleType) {
+  case BundleTypes::ExtendedReduction: {
+    unsigned Opcode = RecurrenceDescriptor::getOpcode(
+        cast<VPReductionRecipe>(BundledOps[1])->getRecurrenceKind());
+    return Ctx.TTI.getExtendedReductionCost(
+        Opcode,
+        cast<VPWidenCastRecipe>(BundledOps.front())->getOpcode() ==
+            Instruction::ZExt,
+        RedTy, SrcVecTy, std::nullopt, Ctx.CostKind);
+  }
+  case BundleTypes::MulAccumulateReduction:
+    return Ctx.TTI.getMulAccReductionCost(
+        BundledOps.size() > 2
+            ? cast<VPWidenCastRecipe>(BundledOps.front())->getOpcode() ==
+                  Instruction::ZExt
+            : false,
+        RedTy, SrcVecTy, Ctx.CostKind);
+  }
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+
+void VPBundleRecipe::print(raw_ostream &O, const Twine &Indent,
+                           VPSlotTracker &SlotTracker) const {
+  O << Indent << "BUNDLE ";
+  printAsOperand(O, SlotTracker);
+  O << " = ";
+  auto *Red = cast<VPReductionRecipe>(BundledOps.back());
+  unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
+
+  switch (BundleType) {
+  case BundleTypes::ExtendedReduction: {
+    getOperand(1)->printAsOperand(O, SlotTracker);
+    O << " +";
+    O << " reduce." << Instruction::getOpcodeName(Opcode) << " (";
+    getOperand(0)->printAsOperand(O, SlotTracker);
+    Red->printFlags(O);
+
+    auto *Ext0 = cast<VPWidenCastRecipe>(BundledOps[0]);
+    O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
+      << *Ext0->getResultType();
+    if (Red->isConditional()) {
+      O << ", ";
+      Red->getCondOp()->printAsOperand(O, SlotTracker);
+    }
+    O << ")";
+    break;
+  }
+  case BundleTypes::MulAccumulateReduction: {
+    getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
+    O << " + ";
+    O << "reduce."
+      << Instruction::getOpcodeName(
+             RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
+      << " (";
+    O << "mul";
+    auto *Mul = cast<VPWidenRecipe>(BundledOps.size() == 2 ? BundledOps[0]
+                                                           : BundledOps[2]);
+    Mul->printFlags(O);
+    bool IsExtended = BundledOps.size() > 2;
+    if (IsExtended)
+      O << "(";
+    getOperand(0)->printAsOperand(O, SlotTracker);
+    if (IsExtended) {
+      auto *Ext0 = cast<VPWidenCastRecipe>(
+          BundledOps.size() == 5 ? BundledOps[3] : BundledOps[0]);
+      O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
+        << *Ext0->getResultType() << "), (";
+    } else {
+      O << ", ";
+    }
+    getOperand(1)->printAsOperand(O, SlotTracker);
+    if (IsExtended) {
+      auto *Ext1 = cast<VPWidenCastRecipe>(
+          BundledOps.size() == 5 ? BundledOps[3] : BundledOps[1]);
+      O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
+        << *Ext1->getResultType() << ")";
+    }
+    if (Red->isConditional()) {
+      O << ", ";
+      Red->getCondOp()->printAsOperand(O, SlotTracker);
+    }
+    O << ")";
+    break;
+  }
+  }
+}
+
 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
   O << Indent << "REDUCE ";
@@ -2648,58 +2755,6 @@ void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent,
   O << ")";
 }
 
-void VPExtendedReductionRecipe::print(raw_ostream &O, const Twine &Indent,
-                                      VPSlotTracker &SlotTracker) const {
-  O << Indent << "EXTENDED-REDUCE ";
-  printAsOperand(O, SlotTracker);
-  O << " = ";
-  getChainOp()->printAsOperand(O, SlotTracker);
-  O << " +";
-  O << " reduce."
-    << Instruction::getOpcodeName(
-           RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
-    << " (";
-  getVecOp()->printAsOperand(O, SlotTracker);
-  printFlags(O);
-  O << Instruction::getOpcodeName(ExtOp) << " to " << *getResultType();
-  if (isConditional()) {
-    O << ", ";
-    getCondOp()->printAsOperand(O, SlotTracker);
-  }
-  O << ")";
-}
-
-void VPMulAccumulateReductionRecipe::print(raw_ostream &O, const Twine &Indent,
-                                           VPSlotTracker &SlotTracker) const {
-  O << Indent << "MULACC-REDUCE ";
-  printAsOperand(O, SlotTracker);
-  O << " = ";
-  getChainOp()->printAsOperand(O, SlotTracker);
-  O << " + ";
-  O << "reduce."
- << Instruction::getOpcodeName( - RecurrenceDescriptor::getOpcode(getRecurrenceKind())) - << " ("; - O << "mul"; - printFlags(O); - if (isExtended()) - O << "("; - getVecOp0()->printAsOperand(O, SlotTracker); - if (isExtended()) - O << " " << Instruction::getOpcodeName(ExtOp) << " to " << *getResultType() - << "), ("; - else - O << ", "; - getVecOp1()->printAsOperand(O, SlotTracker); - if (isExtended()) - O << " " << Instruction::getOpcodeName(ExtOp) << " to " << *getResultType() - << ")"; - if (isConditional()) { - O << ", "; - getCondOp()->printAsOperand(O, SlotTracker); - } - O << ")"; -} #endif /// A helper function to scalarize a single Instruction in the innermost loop. diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 05a0e15f9a199..0b4cd10f35252 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1779,9 +1779,9 @@ void VPlanTransforms::truncateToMinimalBitwidths( for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( vp_depth_first_deep(Plan.getVectorLoopRegion()))) { for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { - if (!isa( - &R)) + if (!isa(&R)) continue; VPValue *ResultVPV = R.getVPSingleValue(); @@ -2530,83 +2530,6 @@ void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) { R->dissolveToCFGLoop(); } -// Expand VPExtendedReductionRecipe to VPWidenCastRecipe + VPReductionRecipe. -static void expandVPExtendedReduction(VPExtendedReductionRecipe *ExtRed) { - VPWidenCastRecipe *Ext; - // Only ZExt contains non-neg flags. 
- if (ExtRed->isZExt()) - Ext = new VPWidenCastRecipe(ExtRed->getExtOpcode(), ExtRed->getVecOp(), - ExtRed->getResultType(), *ExtRed, - ExtRed->getDebugLoc()); - else - Ext = new VPWidenCastRecipe(ExtRed->getExtOpcode(), ExtRed->getVecOp(), - ExtRed->getResultType(), {}, - ExtRed->getDebugLoc()); - - auto *Red = new VPReductionRecipe( - ExtRed->getRecurrenceKind(), FastMathFlags(), ExtRed->getChainOp(), Ext, - ExtRed->getCondOp(), ExtRed->isOrdered(), ExtRed->getDebugLoc()); - Ext->insertBefore(ExtRed); - Red->insertBefore(ExtRed); - ExtRed->replaceAllUsesWith(Red); - ExtRed->eraseFromParent(); -} - -// Expand VPMulAccumulateReductionRecipe to VPWidenRecipe (mul) + -// VPReductionRecipe (reduce.add) -// + VPWidenCastRecipe (optional). -static void -expandVPMulAccumulateReduction(VPMulAccumulateReductionRecipe *MulAcc) { - // Generate inner VPWidenCastRecipes if necessary. - // Note that we will drop the extend after mul which transforms - // reduce.add(ext(mul(ext, ext))) to reduce.add(mul(ext, ext)). - VPValue *Op0, *Op1; - if (MulAcc->isExtended()) { - Type *RedTy = MulAcc->getResultType(); - if (MulAcc->isZExt()) - Op0 = new VPWidenCastRecipe( - MulAcc->getExtOpcode(), MulAcc->getVecOp0(), RedTy, - VPIRFlags::NonNegFlagsTy(MulAcc->isNonNeg()), MulAcc->getDebugLoc()); - else - Op0 = new VPWidenCastRecipe(MulAcc->getExtOpcode(), MulAcc->getVecOp0(), - RedTy, {}, MulAcc->getDebugLoc()); - Op0->getDefiningRecipe()->insertBefore(MulAcc); - // Prevent reduce.add(mul(ext(A), ext(A))) generate duplicate - // VPWidenCastRecipe. 
- if (MulAcc->getVecOp0() == MulAcc->getVecOp1()) { - Op1 = Op0; - } else { - if (MulAcc->isZExt()) - Op1 = new VPWidenCastRecipe( - MulAcc->getExtOpcode(), MulAcc->getVecOp1(), RedTy, - VPIRFlags::NonNegFlagsTy(MulAcc->isNonNeg()), - MulAcc->getDebugLoc()); - else - Op1 = new VPWidenCastRecipe(MulAcc->getExtOpcode(), MulAcc->getVecOp1(), - RedTy, {}, MulAcc->getDebugLoc()); - Op1->getDefiningRecipe()->insertBefore(MulAcc); - } - } else { - // No extends in this MulAccRecipe. - Op0 = MulAcc->getVecOp0(); - Op1 = MulAcc->getVecOp1(); - } - - std::array MulOps = {Op0, Op1}; - auto *Mul = new VPWidenRecipe( - Instruction::Mul, ArrayRef(MulOps), MulAcc->hasNoUnsignedWrap(), - MulAcc->hasNoSignedWrap(), MulAcc->getDebugLoc()); - Mul->insertBefore(MulAcc); - - auto *Red = new VPReductionRecipe( - MulAcc->getRecurrenceKind(), FastMathFlags(), MulAcc->getChainOp(), Mul, - MulAcc->getCondOp(), MulAcc->isOrdered(), MulAcc->getDebugLoc()); - Red->insertBefore(MulAcc); - - MulAcc->replaceAllUsesWith(Red); - MulAcc->eraseFromParent(); -} - void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan, Type &CanonicalIVTy) { using namespace llvm::VPlanPatternMatch; @@ -2666,12 +2589,10 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan, ToRemove.push_back(VPI); } for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { - if (auto *ExtRed = dyn_cast(&R)) { - expandVPExtendedReduction(ExtRed); - continue; + if (auto *Bundle = dyn_cast(&R)) { + Bundle->unbundle(); + Bundle->eraseFromParent(); } - if (auto *MulAcc = dyn_cast(&R)) - expandVPMulAccumulateReduction(MulAcc); } } @@ -2771,10 +2692,10 @@ void VPlanTransforms::handleUncountableEarlyExit( } /// This function tries convert extended in-loop reductions to -/// VPExtendedReductionRecipe and clamp the \p Range if it is beneficial and -/// valid. The created recipe must be lowered to concrete +/// VPBundleRecipe and clamp the \p Range if it is beneficial and +/// valid. 
The created recipe must be unbundled to its constituent /// recipes before execution. -static VPExtendedReductionRecipe * +static VPBundleRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range) { using namespace VPlanPatternMatch; @@ -2808,19 +2729,19 @@ tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, cast(VecOp)->getOpcode() == Instruction::CastOps::ZExt, Ctx.Types.inferScalarType(A))) - return new VPExtendedReductionRecipe(Red, cast(VecOp)); + return new VPBundleRecipe(cast(VecOp), Red); return nullptr; } /// This function tries convert extended in-loop reductions to -/// VPMulAccumulateReductionRecipe and clamp the \p Range if it is beneficial -/// and valid. The created VPExtendedReductionRecipe must be lower to concrete -/// recipes before execution. Patterns of MulAccumulateReduction: +/// VPBundleRecipe and clamp the \p Range if it is beneficial +/// and valid. The created VPBundleRecipe must be unbundled to its constituent +/// recipes before execution. Patterns of the VPBundleRecipe: /// reduce.add(mul(...)), /// reduce.add(mul(ext(A), ext(B))), /// reduce.add(ext(mul(ext(A), ext(B)))). -static VPMulAccumulateReductionRecipe * +static VPBundleRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range) { using namespace VPlanPatternMatch; @@ -2876,12 +2797,13 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, match(RecipeB, m_ZExtOrSExt(m_VPValue())) && IsMulAccValidAndClampRange(RecipeA->getOpcode() == Instruction::CastOps::ZExt, - Mul, RecipeA, RecipeB, nullptr)) - return new VPMulAccumulateReductionRecipe(Red, Mul, RecipeA, RecipeB, - RecipeA->getResultType()); + Mul, RecipeA, RecipeB, nullptr)) { + return new VPBundleRecipe(RecipeA, RecipeB, Mul, Red); + } // Match reduce.add(mul). 
-    if (IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr))
-      return new VPMulAccumulateReductionRecipe(Red, Mul, RedTy);
+    if (IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr)) {
+      return new VPBundleRecipe(Mul, Red);
+    }
   }
 
   // Match reduce.add(ext(mul(ext(A), ext(B)))).
   // All extend recipes must have same opcode or A == B
@@ -2898,9 +2820,9 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
         Ext0->getOpcode() == Ext1->getOpcode() &&
         IsMulAccValidAndClampRange(Ext0->getOpcode() ==
                                        Instruction::CastOps::ZExt,
-                                   Mul, Ext0, Ext1, Ext))
-      return new VPMulAccumulateReductionRecipe(Red, Mul, Ext0, Ext1,
-                                                Ext->getResultType());
+                                   Mul, Ext0, Ext1, Ext)) {
+      return new VPBundleRecipe(Ext0, Ext1, Mul, Ext, Red);
+    }
   }
   return nullptr;
 }
@@ -2910,8 +2832,9 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
 static void
 tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx,
                                    VFRange &Range) {
-  VPReductionRecipe *AbstractR = nullptr;
-
+  VPBundleRecipe *AbstractR = nullptr;
+  auto IP = std::next(Red->getIterator());
+  auto *VPBB = Red->getParent();
   if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
     AbstractR = MulAcc;
   else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
@@ -2920,7 +2843,7 @@ static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
   if (!AbstractR)
     return;
 
-  AbstractR->insertBefore(Red);
+  AbstractR->insertBefore(*VPBB, IP);
   Red->replaceAllUsesWith(AbstractR);
 }
 
@@ -2928,7 +2851,7 @@ void VPlanTransforms::convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
                                                VFRange &Range) {
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock *>(
            vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
-    for (VPRecipeBase &R : *VPBB) {
+    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
       if (auto *Red = dyn_cast<VPReductionRecipe>(&R))
         tryToCreateAbstractReductionRecipe(Red, Ctx, Range);
     }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index a0d3dc9b934cc..7246cb9a75ed8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -32,6 +32,7 @@ namespace llvm {
 // Forward declarations.
 class raw_ostream;
 class Value;
+class VPBundleRecipe;
 class VPDef;
 struct VPDoubleValueDef;
 class VPSlotTracker;
@@ -49,6 +50,7 @@ class VPValue {
   friend struct VPDoubleValueDef;
   friend class VPInterleaveRecipe;
   friend class VPlan;
+  friend class VPBundleRecipe;
 
   const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
@@ -328,6 +330,7 @@ class VPDef {
   /// type identification.
   using VPRecipeTy = enum {
     VPBranchOnMaskSC,
+    VPBundleSC,
     VPDerivedIVSC,
     VPExpandSCEVSC,
     VPIRInstructionSC,
@@ -335,8 +338,6 @@ class VPDef {
     VPInstructionSC,
     VPInterleaveSC,
     VPReductionEVLSC,
     VPReductionSC,
-    VPMulAccumulateReductionSC,
-    VPExtendedReductionSC,
     VPPartialReductionSC,
     VPReplicateSC,
     VPScalarIVStepsSC,
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
index f179a3ae04d23..e8af144498659 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
@@ -1526,7 +1526,8 @@ define i64 @mla_and_add_together_16_64(ptr nocapture noundef readonly %x, i32 no
 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]]
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2
 ; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
-; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <8 x i64> [[TMP1]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <8 x i64> [[TMP1]], [[TMP4]]
 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP2]])
 ; CHECK-NEXT: [[TMP5]] = add i64 [[TMP3]], [[VEC_PHI]]
 ; CHECK-NEXT: [[TMP10:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
index 978f1b80d26da..3cd37851ec725 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
@@ -287,12 +287,12 @@ define i64 @print_extended_reduction(ptr nocapture readonly %x, ptr nocapture re
 ; CHECK-NEXT: vector loop: {
 ; CHECK-NEXT: vector.body:
 ; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, ir<[[RDX_NEXT:%.+]]>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, vp<[[RDX_NEXT:%.+]]>
 ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
 ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
 ; CHECK-NEXT: vp<[[ADDR:%.+]]> = vector-pointer ir<%arrayidx>
 ; CHECK-NEXT: WIDEN ir<[[LOAD:%.+]]> = load vp<[[ADDR]]>
-; CHECK-NEXT: EXTENDED-REDUCE ir<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.add (ir<[[LOAD]]> zext to i64)
+; CHECK-NEXT: BUNDLE vp<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (ir<[[LOAD]]> zext to i64)
 ; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
 ; CHECK-NEXT: No successors
@@ -332,7 +332,7 @@ define i64 @print_mulacc(ptr nocapture readonly %x, ptr nocapture readonly %y, i
 ; CHECK-NEXT: vector loop: {
 ; CHECK-NEXT: vector.body:
 ; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, ir<[[RDX_NEXT:%.+]]>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, vp<[[RDX_NEXT:%.+]]>
 ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
 ; CHECK-NEXT: CLONE ir<[[ARRAYIDX0:%.+]]> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
 ; CHECK-NEXT: vp<[[ADDR0:%.+]]> = vector-pointer ir<[[ARRAYIDX0]]>
@@ -340,7 +340,7 @@ define i64 @print_mulacc(ptr nocapture readonly %x, ptr nocapture readonly %y, i
 ; CHECK-NEXT: CLONE ir<[[ARRAYIDX1:%.+]]> = getelementptr inbounds ir<%y>, vp<[[STEPS]]>
 ; CHECK-NEXT: vp<[[ADDR1:%.+]]> = vector-pointer ir<[[ARRAYIDX1]]>
 ; CHECK-NEXT: WIDEN ir<[[LOAD1:%.+]]> = load vp<[[ADDR1]]>
-; CHECK-NEXT: MULACC-REDUCE ir<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (mul nsw ir<[[LOAD0]]>, ir<[[LOAD1]]>)
+; CHECK-NEXT: BUNDLE vp<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (mul nsw ir<[[LOAD0]]>, ir<[[LOAD1]]>)
 ; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
 ; CHECK-NEXT: No successors
@@ -382,7 +382,7 @@ define i64 @print_mulacc_extended(ptr nocapture readonly %x, ptr nocapture reado
 ; CHECK-NEXT: vector loop: {
 ; CHECK-NEXT: vector.body:
 ; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, ir<[[RDX_NEXT:%.+]]>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, vp<[[RDX_NEXT:%.+]]>
 ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
 ; CHECK-NEXT: CLONE ir<[[ARRAYIDX0:%.+]]> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
 ; CHECK-NEXT: vp<[[ADDR0:%.+]]> = vector-pointer ir<[[ARRAYIDX0]]>
@@ -390,7 +390,7 @@ define i64 @print_mulacc_extended(ptr nocapture readonly %x, ptr nocapture reado
 ; CHECK-NEXT: CLONE ir<[[ARRAYIDX1:%.+]]> = getelementptr inbounds ir<%y>, vp<[[STEPS]]>
 ; CHECK-NEXT: vp<[[ADDR1:%.+]]> = vector-pointer ir<[[ARRAYIDX1]]>
 ; CHECK-NEXT: WIDEN ir<[[LOAD1:%.+]]> = load vp<[[ADDR1]]>
-; CHECK-NEXT: MULACC-REDUCE ir<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.add (mul nsw (ir<[[LOAD0]]> sext to i64), (ir<[[LOAD1]]> sext to i64))
+; CHECK-NEXT: BUNDLE vp<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.add (mul nsw (ir<[[LOAD0]]> sext to i64), (ir<[[LOAD1]]> sext to i64))
 ; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
 ; CHECK-NEXT: No successors