From 9ab5474e560292d15cb53ed94d248d7e5f54787d Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Mon, 23 Dec 2024 23:33:44 +0000 Subject: [PATCH 001/567] [LV] Rename `ToVectorTy` to `toVectorTy` (NFC) (#120404) This is for consistency with other helpers (and also follows the LLVM naming conventions). --- llvm/include/llvm/IR/VectorTypeUtils.h | 10 ++-- .../Transforms/Vectorize/LoopVectorize.cpp | 58 +++++++++---------- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 34 +++++------ 3 files changed, 51 insertions(+), 51 deletions(-) diff --git a/llvm/include/llvm/IR/VectorTypeUtils.h b/llvm/include/llvm/IR/VectorTypeUtils.h index f30bf9ee9240b..d24c714f99cb2 100644 --- a/llvm/include/llvm/IR/VectorTypeUtils.h +++ b/llvm/include/llvm/IR/VectorTypeUtils.h @@ -16,14 +16,14 @@ namespace llvm { /// A helper function for converting Scalar types to vector types. If /// the incoming type is void, we return void. If the EC represents a /// scalar, we return the scalar type. -inline Type *ToVectorTy(Type *Scalar, ElementCount EC) { +inline Type *toVectorTy(Type *Scalar, ElementCount EC) { if (Scalar->isVoidTy() || Scalar->isMetadataTy() || EC.isScalar()) return Scalar; return VectorType::get(Scalar, EC); } -inline Type *ToVectorTy(Type *Scalar, unsigned VF) { - return ToVectorTy(Scalar, ElementCount::getFixed(VF)); +inline Type *toVectorTy(Type *Scalar, unsigned VF) { + return toVectorTy(Scalar, ElementCount::getFixed(VF)); } /// A helper for converting structs of scalar types to structs of vector types. @@ -41,7 +41,7 @@ Type *toScalarizedStructTy(StructType *StructTy); bool isVectorizedStructTy(StructType *StructTy); /// A helper for converting to vectorized types. For scalar types, this is -/// equivalent to calling `ToVectorTy`. For struct types, this returns a new +/// equivalent to calling `toVectorTy`. For struct types, this returns a new /// struct where each element type has been widened to a vector type. 
/// Note: /// - If the incoming type is void, we return void @@ -50,7 +50,7 @@ bool isVectorizedStructTy(StructType *StructTy); inline Type *toVectorizedTy(Type *Ty, ElementCount EC) { if (StructType *StructTy = dyn_cast(Ty)) return toVectorizedStructTy(StructTy, EC); - return ToVectorTy(Ty, EC); + return toVectorTy(Ty, EC); } /// A helper for converting vectorized types to scalarized (non-vector) types. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 1f6996cd9c1f4..10b998fff02b7 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1251,8 +1251,8 @@ class LoopVectorizationCostModel { return false; // Get the source and destination types of the truncate. - Type *SrcTy = ToVectorTy(cast(I)->getSrcTy(), VF); - Type *DestTy = ToVectorTy(cast(I)->getDestTy(), VF); + Type *SrcTy = toVectorTy(cast(I)->getSrcTy(), VF); + Type *DestTy = toVectorTy(cast(I)->getDestTy(), VF); // If the truncate is free for the given types, return false. Replacing a // free truncate with an induction variable would add an induction variable @@ -3535,14 +3535,14 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, } InstructionCost SafeDivisorCost = 0; - auto *VecTy = ToVectorTy(I->getType(), VF); + auto *VecTy = toVectorTy(I->getType(), VF); // The cost of the select guard to ensure all lanes are well defined // after we speculate above any internal control flow. - SafeDivisorCost += TTI.getCmpSelInstrCost( - Instruction::Select, VecTy, - ToVectorTy(Type::getInt1Ty(I->getContext()), VF), - CmpInst::BAD_ICMP_PREDICATE, CostKind); + SafeDivisorCost += + TTI.getCmpSelInstrCost(Instruction::Select, VecTy, + toVectorTy(Type::getInt1Ty(I->getContext()), VF), + CmpInst::BAD_ICMP_PREDICATE, CostKind); // Certain instructions can be cheaper to vectorize if they have a constant // second vector operand. One example of this are shifts on x86. 
@@ -4662,7 +4662,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, } auto WillWiden = [&TTI, VF](Type *ScalarTy) { - Type *VectorTy = ToVectorTy(ScalarTy, VF); + Type *VectorTy = toVectorTy(ScalarTy, VF); unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy); if (!NumLegalParts) return false; @@ -5653,7 +5653,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { ScalarCost += TTI.getScalarizationOverhead( - cast(ToVectorTy(I->getType(), VF)), + cast(toVectorTy(I->getType(), VF)), APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true, /*Extract*/ false, CostKind); ScalarCost += @@ -5672,7 +5672,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( Worklist.push_back(J); else if (needsExtract(J, VF)) { ScalarCost += TTI.getScalarizationOverhead( - cast(ToVectorTy(J->getType(), VF)), + cast(toVectorTy(J->getType(), VF)), APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false, /*Extract*/ true, CostKind); } @@ -5783,7 +5783,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, unsigned AS = getLoadStoreAddressSpace(I); Value *Ptr = getLoadStorePointerOperand(I); - Type *PtrTy = ToVectorTy(Ptr->getType(), VF); + Type *PtrTy = toVectorTy(Ptr->getType(), VF); // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` // that it is being called from this specific place. 
@@ -5834,7 +5834,7 @@ InstructionCost LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, ElementCount VF) { Type *ValTy = getLoadStoreType(I); - auto *VectorTy = cast(ToVectorTy(ValTy, VF)); + auto *VectorTy = cast(toVectorTy(ValTy, VF)); Value *Ptr = getLoadStorePointerOperand(I); unsigned AS = getLoadStoreAddressSpace(I); int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); @@ -5866,7 +5866,7 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, assert(Legal->isUniformMemOp(*I, VF)); Type *ValTy = getLoadStoreType(I); - auto *VectorTy = cast(ToVectorTy(ValTy, VF)); + auto *VectorTy = cast(toVectorTy(ValTy, VF)); const Align Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; @@ -5892,7 +5892,7 @@ InstructionCost LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, ElementCount VF) { Type *ValTy = getLoadStoreType(I); - auto *VectorTy = cast(ToVectorTy(ValTy, VF)); + auto *VectorTy = cast(toVectorTy(ValTy, VF)); const Align Alignment = getLoadStoreAlignment(I); const Value *Ptr = getLoadStorePointerOperand(I); @@ -5910,7 +5910,7 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, Instruction *InsertPos = Group->getInsertPos(); Type *ValTy = getLoadStoreType(InsertPos); - auto *VectorTy = cast(ToVectorTy(ValTy, VF)); + auto *VectorTy = cast(toVectorTy(ValTy, VF)); unsigned AS = getLoadStoreAddressSpace(InsertPos); enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; @@ -6155,7 +6155,7 @@ InstructionCost LoopVectorizationCostModel::getScalarizationOverhead( return 0; InstructionCost Cost = 0; - Type *RetTy = ToVectorTy(I->getType(), VF); + Type *RetTy = toVectorTy(I->getType(), VF); if (!RetTy->isVoidTy() && (!isa(I) || !TTI.supportsEfficientVectorElementLoadStore())) Cost += TTI.getScalarizationOverhead( @@ -6421,9 +6421,9 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) 
{ bool MaskRequired = Legal->isMaskRequired(CI); // Compute corresponding vector type for return value and arguments. - Type *RetTy = ToVectorTy(ScalarRetTy, VF); + Type *RetTy = toVectorTy(ScalarRetTy, VF); for (Type *ScalarTy : ScalarTys) - Tys.push_back(ToVectorTy(ScalarTy, VF)); + Tys.push_back(toVectorTy(ScalarTy, VF)); // An in-loop reduction using an fmuladd intrinsic is a special case; // we don't want the normal cost for that intrinsic. @@ -6613,7 +6613,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, HasSingleCopyAfterVectorization(I, VF)); VectorTy = RetTy; } else - VectorTy = ToVectorTy(RetTy, VF); + VectorTy = toVectorTy(RetTy, VF); if (VF.isVector() && VectorTy->isVectorTy() && !TTI.getNumberOfParts(VectorTy)) @@ -6673,8 +6673,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, return Switch->getNumCases() * TTI.getCmpSelInstrCost( Instruction::ICmp, - ToVectorTy(Switch->getCondition()->getType(), VF), - ToVectorTy(Type::getInt1Ty(I->getContext()), VF), + toVectorTy(Switch->getCondition()->getType(), VF), + toVectorTy(Type::getInt1Ty(I->getContext()), VF), CmpInst::ICMP_EQ, CostKind); } case Instruction::PHI: { @@ -6719,8 +6719,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, } return (Phi->getNumIncomingValues() - 1) * TTI.getCmpSelInstrCost( - Instruction::Select, ToVectorTy(ResultTy, VF), - ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), + Instruction::Select, toVectorTy(ResultTy, VF), + toVectorTy(Type::getInt1Ty(Phi->getContext()), VF), CmpInst::BAD_ICMP_PREDICATE, CostKind); } @@ -6729,8 +6729,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, if (VF.isVector() && foldTailWithEVL() && Legal->getReductionVars().contains(Phi) && !isInLoopReduction(Phi)) { IntrinsicCostAttributes ICA( - Intrinsic::vp_merge, ToVectorTy(Phi->getType(), VF), - {ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF)}); + Intrinsic::vp_merge, toVectorTy(Phi->getType(), VF), + 
{toVectorTy(Type::getInt1Ty(Phi->getContext()), VF)}); return TTI.getIntrinsicInstrCost(ICA, CostKind); } @@ -6870,7 +6870,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]); } - VectorTy = ToVectorTy(ValTy, VF); + VectorTy = toVectorTy(ValTy, VF); return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, cast(I)->getPredicate(), CostKind, {TTI::OK_AnyValue, TTI::OP_None}, @@ -6888,7 +6888,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, if (Decision == CM_Scalarize) Width = ElementCount::getFixed(1); } - VectorTy = ToVectorTy(getLoadStoreType(I), Width); + VectorTy = toVectorTy(getLoadStoreType(I), Width); return getMemoryInstructionCost(I, VF); } case Instruction::BitCast: @@ -6969,7 +6969,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, SrcScalarTy = IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]); Type *SrcVecTy = - VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; + VectorTy->isVectorTy() ? toVectorTy(SrcScalarTy, VF) : SrcScalarTy; if (canTruncateToMinimalBitwidth(I, VF)) { // If the result type is <= the source type, there will be no extend @@ -7498,7 +7498,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, // Pre-compute the cost for I, if it has a reduction pattern cost. 
for (Instruction *I : ChainOpsAndOperands) { auto ReductionCost = CM.getReductionPatternCost( - I, VF, ToVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput); + I, VF, toVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput); if (!ReductionCost) continue; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index f82711141419c..8be2b894acd40 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1031,11 +1031,11 @@ InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF, Arguments.push_back(V); } - Type *RetTy = ToVectorTy(Ctx.Types.inferScalarType(this), VF); + Type *RetTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); SmallVector ParamTys; for (unsigned I = 0; I != getNumOperands(); ++I) ParamTys.push_back( - ToVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF)); + toVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF)); // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst. FastMathFlags FMF = hasFastMathFlags() ? 
getFastMathFlags() : FastMathFlags(); @@ -1203,7 +1203,7 @@ InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF, SelectInst *SI = cast(getUnderlyingValue()); bool ScalarCond = getOperand(0)->isDefinedOutsideLoopRegions(); Type *ScalarTy = Ctx.Types.inferScalarType(this); - Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(this), VF); + Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; VPValue *Op0, *Op1; @@ -1384,7 +1384,7 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; switch (Opcode) { case Instruction::FNeg: { - Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(this), VF); + Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); return Ctx.TTI.getArithmeticInstrCost( Opcode, VectorTy, CostKind, {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, @@ -1422,7 +1422,7 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF, if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue && getOperand(1)->isDefinedOutsideLoopRegions()) RHSInfo.Kind = TargetTransformInfo::OK_UniformValue; - Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(this), VF); + Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); Instruction *CtxI = dyn_cast_or_null(getUnderlyingValue()); SmallVector Operands; @@ -1435,13 +1435,13 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF, } case Instruction::Freeze: { // This opcode is unknown. Assume that it is the same as 'mul'. 
- Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(this), VF); + Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); } case Instruction::ICmp: case Instruction::FCmp: { Instruction *CtxI = dyn_cast_or_null(getUnderlyingValue()); - Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF); + Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF); return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(), CostKind, {TTI::OK_AnyValue, TTI::OP_None}, @@ -1569,8 +1569,8 @@ InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF, } auto *SrcTy = - cast(ToVectorTy(Ctx.Types.inferScalarType(Operand), VF)); - auto *DestTy = cast(ToVectorTy(getResultType(), VF)); + cast(toVectorTy(Ctx.Types.inferScalarType(Operand), VF)); + auto *DestTy = cast(toVectorTy(getResultType(), VF)); // Arm TTI will use the underlying instruction to determine the cost. 
return Ctx.TTI.getCastInstrCost( Opcode, DestTy, SrcTy, CCH, TTI::TCK_RecipThroughput, @@ -2078,8 +2078,8 @@ InstructionCost VPBlendRecipe::computeCost(ElementCount VF, if (vputils::onlyFirstLaneUsed(this)) return Ctx.TTI.getCFInstrCost(Instruction::PHI, CostKind); - Type *ResultTy = ToVectorTy(Ctx.Types.inferScalarType(this), VF); - Type *CmpTy = ToVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF); + Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); + Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF); return (getNumIncomingValues() - 1) * Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy, CmpInst::BAD_ICMP_PREDICATE, CostKind); @@ -2200,7 +2200,7 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF, VPCostContext &Ctx) const { RecurKind RdxKind = RdxDesc.getRecurrenceKind(); Type *ElementTy = Ctx.Types.inferScalarType(this); - auto *VectorTy = cast(ToVectorTy(ElementTy, VF)); + auto *VectorTy = cast(toVectorTy(ElementTy, VF)); TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; unsigned Opcode = RdxDesc.getOpcode(); @@ -2452,7 +2452,7 @@ void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, VPCostContext &Ctx) const { - Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF); + Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF); const Align Alignment = getLoadStoreAlignment(const_cast(&Ingredient)); unsigned AS = @@ -2599,7 +2599,7 @@ InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF, // legacy model, it will always calculate the cost of mask. // TODO: Using getMemoryOpCost() instead of getMaskedMemoryOpCost when we // don't need to compare to the legacy cost model. 
- Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF); + Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF); const Align Alignment = getLoadStoreAlignment(const_cast(&Ingredient)); unsigned AS = @@ -2720,7 +2720,7 @@ InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF, // legacy model, it will always calculate the cost of mask. // TODO: Using getMemoryOpCost() instead of getMaskedMemoryOpCost when we // don't need to compare to the legacy cost model. - Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF); + Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF); const Align Alignment = getLoadStoreAlignment(const_cast(&Ingredient)); unsigned AS = @@ -3088,7 +3088,7 @@ InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF, Type *ValTy = Ctx.Types.inferScalarType( getNumDefinedValues() > 0 ? getVPValue(InsertPosIdx) : getStoredValues()[InsertPosIdx]); - auto *VectorTy = cast(ToVectorTy(ValTy, VF)); + auto *VectorTy = cast(toVectorTy(ValTy, VF)); unsigned AS = getLoadStoreAddressSpace(InsertPos); enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; @@ -3331,7 +3331,7 @@ VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF, SmallVector Mask(VF.getKnownMinValue()); std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1); Type *VectorTy = - ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF); + toVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF); return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Splice, cast(VectorTy), Mask, CostKind, From 9d0a5d4620a2aa5dc01b150e5ebe7613238cae1c Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Mon, 23 Dec 2024 16:29:28 -0800 Subject: [PATCH 002/567] [Telemetry] Add missing virtual destructors (#121015) Fixes warnings after #121003. 
--- llvm/include/llvm/Telemetry/Telemetry.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/llvm/include/llvm/Telemetry/Telemetry.h b/llvm/include/llvm/Telemetry/Telemetry.h index bbf0ac84d667a..344a49df5cbf0 100644 --- a/llvm/include/llvm/Telemetry/Telemetry.h +++ b/llvm/include/llvm/Telemetry/Telemetry.h @@ -30,6 +30,8 @@ namespace telemetry { class Serializer { public: + virtual ~Serializer() = default; + virtual Error init() = 0; virtual void write(StringRef KeyName, bool Value) = 0; virtual void write(StringRef KeyName, StringRef Value) = 0; @@ -62,6 +64,8 @@ class Serializer { /// This struct can be extended as needed to add additional configuration /// points specific to a vendor's implementation. struct Config { + virtual ~Config() = default; + // If true, telemetry will be enabled. const bool EnableTelemetry; Config(bool E) : EnableTelemetry(E) {} @@ -132,6 +136,8 @@ class Destination { /// monitored and transmitting the data elsewhere. class Manager { public: + virtual ~Manager() = default; + // Optional callback for subclasses to perform additional tasks before // dispatching to Destinations. virtual Error preDispatch(TelemetryInfo *Entry) = 0; From 030829a7e53fad0eab9b87b5dd49427e9fb13303 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Mon, 23 Dec 2024 16:53:37 -0800 Subject: [PATCH 003/567] [SLP]Drop samesign flag if the vector node has reduced bitwidth If the operands of the icmp instructions has reduced bitwidth after MinBitwidth analysis, need to drop samesign flag to preserve correctness of the transformation. 
Fixes #120823 --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 ++ .../SLPVectorizer/X86/buildvector-schedule-for-subvector.ll | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index d22379429d007..b5d68c075b986 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -15483,6 +15483,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { CmpInst::Predicate P0 = cast(VL0)->getPredicate(); Value *V = Builder.CreateCmp(P0, L, R); propagateIRFlags(V, E->Scalars, VL0); + if (auto *ICmp = dyn_cast(V); ICmp && It == MinBWs.end()) + ICmp->setSameSign(/*B=*/false); // Do not cast for cmps. VecTy = cast(V->getType()); V = FinalShuffle(V, E); diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll index 3bf13b76a9332..b659c10bb2fbf 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll @@ -6,7 +6,7 @@ define void @test() { ; CHECK-NEXT: [[BB:.*:]] ; CHECK-NEXT: [[ADD:%.*]] = add i32 1, 0 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> , i32 [[ADD]], i32 3 -; CHECK-NEXT: [[TMP1:%.*]] = icmp samesign ult <4 x i32> [[TMP0]], zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i32> [[TMP0]], zeroinitializer ; CHECK-NEXT: [[ICMP:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 ; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[ICMP]], i32 0, i32 0 ; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[SELECT]] to i64 From 0d6a584f69f07cfb900cdf6c83a10e872a5861f9 Mon Sep 17 00:00:00 2001 From: Ryotaro Kasuga Date: Tue, 24 Dec 2024 10:02:15 +0900 Subject: [PATCH 004/567] =?UTF-8?q?[MachinePipeliner]=20Add=20an=20abstrac?= 
=?UTF-8?q?t=20layer=20to=20manipulate=20Data=20Dependenc=E2=80=A6=20(#109?= =?UTF-8?q?918)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …e Graph In MachinePipeliner, a DAG class is used to represent the Data Dependence Graph. Data Dependence Graph generally contains cycles, so it's not appropriate to use DAG classes. In fact, some "hacks" are used to express back-edges in the current implementation. This patch adds a new class to provide a better interface for manipulating dependencies. Our approach is as follows: - To build the graph, we use the ScheduleDAGInstrs class as it is, because it has powerful functions and the current implementation depends heavily on it. - After the graph construction is finished (i.e., during scheduling), we use the new class DataDependenceGraph to manipulate the dependencies. Since we don't change the dependencies during scheduling, the new class only provides functions to read them. Also, this patch is just a refactoring, i.e., scheduling results should not change with or without this patch. --- llvm/include/llvm/CodeGen/MachinePipeliner.h | 159 ++++-- llvm/lib/CodeGen/MachinePipeliner.cpp | 532 +++++++++++-------- 2 files changed, 429 insertions(+), 262 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MachinePipeliner.h b/llvm/include/llvm/CodeGen/MachinePipeliner.h index 0cc862590d0c0..8e47d0cead757 100644 --- a/llvm/include/llvm/CodeGen/MachinePipeliner.h +++ b/llvm/include/llvm/CodeGen/MachinePipeliner.h @@ -40,6 +40,7 @@ #ifndef LLVM_CODEGEN_MACHINEPIPELINER_H #define LLVM_CODEGEN_MACHINEPIPELINER_H +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/CodeGen/DFAPacketizer.h" #include "llvm/CodeGen/MachineDominators.h" @@ -114,10 +115,123 @@ class MachinePipeliner : public MachineFunctionPass { bool useWindowScheduler(bool Changed); }; +/// Represents a dependence between two instruction. 
+class SwingSchedulerDDGEdge { + SUnit *Dst = nullptr; + SDep Pred; + unsigned Distance = 0; + +public: + /// Creates an edge corresponding to an edge represented by \p PredOrSucc and + /// \p Dep in the original DAG. This pair has no information about the + /// direction of the edge, so we need to pass an additional argument \p + /// IsSucc. + SwingSchedulerDDGEdge(SUnit *PredOrSucc, const SDep &Dep, bool IsSucc) + : Dst(PredOrSucc), Pred(Dep), Distance(0u) { + SUnit *Src = Dep.getSUnit(); + + if (IsSucc) { + std::swap(Src, Dst); + Pred.setSUnit(Src); + } + + // An anti-dependence to PHI means loop-carried dependence. + if (Pred.getKind() == SDep::Anti && Src->getInstr()->isPHI()) { + Distance = 1; + std::swap(Src, Dst); + auto Reg = Pred.getReg(); + Pred = SDep(Src, SDep::Kind::Data, Reg); + } + } + + /// Returns the SUnit from which the edge comes (source node). + SUnit *getSrc() const { return Pred.getSUnit(); } + + /// Returns the SUnit to which the edge points (destination node). + SUnit *getDst() const { return Dst; } + + /// Returns the latency value for the edge. + unsigned getLatency() const { return Pred.getLatency(); } + + /// Sets the latency for the edge. + void setLatency(unsigned Latency) { Pred.setLatency(Latency); } + + /// Returns the distance value for the edge. + unsigned getDistance() const { return Distance; } + + /// Sets the distance value for the edge. + void setDistance(unsigned D) { Distance = D; } + + /// Returns the register associated with the edge. + Register getReg() const { return Pred.getReg(); } + + /// Returns true if the edge represents anti dependence. + bool isAntiDep() const { return Pred.getKind() == SDep::Kind::Anti; } + + /// Returns true if the edge represents output dependence. + bool isOutputDep() const { return Pred.getKind() == SDep::Kind::Output; } + + /// Returns true if the edge represents a dependence that is not data, anti or + /// output dependence. 
+ bool isOrderDep() const { return Pred.getKind() == SDep::Kind::Order; } + + /// Returns true if the edge represents unknown scheduling barrier. + bool isBarrier() const { return Pred.isBarrier(); } + + /// Returns true if the edge represents an artificial dependence. + bool isArtificial() const { return Pred.isArtificial(); } + + /// Tests if this is a Data dependence that is associated with a register. + bool isAssignedRegDep() const { return Pred.isAssignedRegDep(); } + + /// Returns true for DDG nodes that we ignore when computing the cost + /// functions. We ignore the back-edge recurrence in order to avoid unbounded + /// recursion in the calculation of the ASAP, ALAP, etc functions. + bool ignoreDependence(bool IgnoreAnti) const; +}; + +/// Represents dependencies between instructions. This class is a wrapper of +/// `SUnits` and its dependencies to manipulate back-edges in a natural way. +/// Currently it only supports back-edges via PHI, which are expressed as +/// anti-dependencies in the original DAG. +/// FIXME: Support any other loop-carried dependencies +class SwingSchedulerDDG { + using EdgesType = SmallVector; + + struct SwingSchedulerDDGEdges { + EdgesType Preds; + EdgesType Succs; + }; + + void initEdges(SUnit *SU); + + SUnit *EntrySU; + SUnit *ExitSU; + + std::vector EdgesVec; + SwingSchedulerDDGEdges EntrySUEdges; + SwingSchedulerDDGEdges ExitSUEdges; + + void addEdge(const SUnit *SU, const SwingSchedulerDDGEdge &Edge); + + SwingSchedulerDDGEdges &getEdges(const SUnit *SU); + const SwingSchedulerDDGEdges &getEdges(const SUnit *SU) const; + +public: + SwingSchedulerDDG(std::vector &SUnits, SUnit *EntrySU, SUnit *ExitSU); + + const EdgesType &getInEdges(const SUnit *SU) const; + + const EdgesType &getOutEdges(const SUnit *SU) const; +}; + /// This class builds the dependence graph for the instructions in a loop, /// and attempts to schedule the instructions using the SMS algorithm. 
class SwingSchedulerDAG : public ScheduleDAGInstrs { MachinePipeliner &Pass; + + std::unique_ptr DDG; + /// The minimum initiation interval between iterations for this schedule. unsigned MII = 0; /// The maximum initiation interval between iterations for this schedule. @@ -130,7 +244,7 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs { unsigned II_setByPragma = 0; TargetInstrInfo::PipelinerLoopInfo *LoopPipelinerInfo = nullptr; - /// A toplogical ordering of the SUnits, which is needed for changing + /// A topological ordering of the SUnits, which is needed for changing /// dependences and iterating over the SUnits. ScheduleDAGTopologicalSort Topo; @@ -252,27 +366,7 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs { return ScheduleInfo[Node->NodeNum].ZeroLatencyHeight; } - /// Return true if the dependence is a back-edge in the data dependence graph. - /// Since the DAG doesn't contain cycles, we represent a cycle in the graph - /// using an anti dependence from a Phi to an instruction. - bool isBackedge(SUnit *Source, const SDep &Dep) { - if (Dep.getKind() != SDep::Anti) - return false; - return Source->getInstr()->isPHI() || Dep.getSUnit()->getInstr()->isPHI(); - } - - bool isLoopCarriedDep(SUnit *Source, const SDep &Dep, - bool isSucc = true) const; - - /// The distance function, which indicates that operation V of iteration I - /// depends on operations U of iteration I-distance. - unsigned getDistance(SUnit *U, SUnit *V, const SDep &Dep) { - // Instructions that feed a Phi have a distance of 1. Computing larger - // values for arrays requires data dependence information. 
- if (V->getInstr()->isPHI() && Dep.getKind() == SDep::Anti) - return 1; - return 0; - } + bool isLoopCarriedDep(const SwingSchedulerDDGEdge &Edge) const; void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule); @@ -294,6 +388,8 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs { static bool classof(const ScheduleDAGInstrs *DAG) { return true; } + const SwingSchedulerDDG *getDDG() const { return DDG.get(); } + private: void addLoopCarriedDependences(AAResults *AA); void updatePhiDependences(); @@ -357,6 +453,7 @@ class NodeSet { // // Hold a map from each SUnit in the circle to the maximum distance from the // source node by only considering the nodes. + const SwingSchedulerDDG *DDG = DAG->getDDG(); DenseMap SUnitToDistance; for (auto *Node : Nodes) SUnitToDistance[Node] = 0; @@ -364,8 +461,8 @@ class NodeSet { for (unsigned I = 1, E = Nodes.size(); I <= E; ++I) { SUnit *U = Nodes[I - 1]; SUnit *V = Nodes[I % Nodes.size()]; - for (const SDep &Succ : U->Succs) { - SUnit *SuccSUnit = Succ.getSUnit(); + for (const SwingSchedulerDDGEdge &Succ : DDG->getOutEdges(U)) { + SUnit *SuccSUnit = Succ.getDst(); if (V != SuccSUnit) continue; if (SUnitToDistance[U] + Succ.getLatency() > SUnitToDistance[V]) { @@ -377,13 +474,13 @@ class NodeSet { SUnit *FirstNode = Nodes[0]; SUnit *LastNode = Nodes[Nodes.size() - 1]; - for (auto &PI : LastNode->Preds) { + for (auto &PI : DDG->getInEdges(LastNode)) { // If we have an order dep that is potentially loop carried then a // back-edge exists between the last node and the first node that isn't // modeled in the DAG. Handle it manually by adding 1 to the distance of // the last node. 
- if (PI.getSUnit() != FirstNode || PI.getKind() != SDep::Order || - !DAG->isLoopCarriedDep(LastNode, PI, false)) + if (PI.getSrc() != FirstNode || !PI.isOrderDep() || + !DAG->isLoopCarriedDep(PI)) continue; SUnitToDistance[FirstNode] = std::max(SUnitToDistance[FirstNode], SUnitToDistance[LastNode] + 1); @@ -627,11 +724,13 @@ class SMSchedule { /// Return the cycle of the earliest scheduled instruction in the dependence /// chain. - int earliestCycleInChain(const SDep &Dep); + int earliestCycleInChain(const SwingSchedulerDDGEdge &Dep, + const SwingSchedulerDDG *DDG); /// Return the cycle of the latest scheduled instruction in the dependence /// chain. - int latestCycleInChain(const SDep &Dep); + int latestCycleInChain(const SwingSchedulerDDGEdge &Dep, + const SwingSchedulerDDG *DDG); void computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart, int II, SwingSchedulerDAG *DAG); @@ -694,7 +793,7 @@ class SMSchedule { MachineOperand &MO) const; bool onlyHasLoopCarriedOutputOrOrderPreds(SUnit *SU, - SwingSchedulerDAG *DAG) const; + const SwingSchedulerDDG *DDG) const; void print(raw_ostream &os) const; void dump() const; }; diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp index b7d03a10266b0..acd42aa497c6f 100644 --- a/llvm/lib/CodeGen/MachinePipeliner.cpp +++ b/llvm/lib/CodeGen/MachinePipeliner.cpp @@ -567,6 +567,7 @@ void SwingSchedulerDAG::schedule() { Topo.InitDAGTopologicalSorting(); changeDependences(); postProcessDAG(); + DDG = std::make_unique(SUnits, &EntrySU, &ExitSU); LLVM_DEBUG(dump()); NodeSetType NodeSets; @@ -1583,29 +1584,6 @@ unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) { return RecMII; } -/// Swap all the anti dependences in the DAG. That means it is no longer a DAG, -/// but we do this to find the circuits, and then change them back. 
-static void swapAntiDependences(std::vector &SUnits) {
-  SmallVector, 8> DepsAdded;
-  for (SUnit &SU : SUnits) {
-    for (SDep &Pred : SU.Preds)
-      if (Pred.getKind() == SDep::Anti)
-        DepsAdded.push_back(std::make_pair(&SU, Pred));
-  }
-  for (std::pair &P : DepsAdded) {
-    // Remove this anti dependency and add one in the reverse direction.
-    SUnit *SU = P.first;
-    SDep &D = P.second;
-    SUnit *TargetSU = D.getSUnit();
-    unsigned Reg = D.getReg();
-    unsigned Lat = D.getLatency();
-    SU->removePred(D);
-    SDep Dep(SU, SDep::Anti, Reg);
-    Dep.setLatency(Lat);
-    TargetSU->addPred(Dep);
-  }
-}
-
 /// Create the adjacency structure of the nodes in the graph.
 void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
     SwingSchedulerDAG *DAG) {
@@ -1614,11 +1592,11 @@ void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
   for (int i = 0, e = SUnits.size(); i != e; ++i) {
     Added.reset();
     // Add any successor to the adjacency matrix and exclude duplicates.
-    for (auto &SI : SUnits[i].Succs) {
+    for (auto &OE : DAG->DDG->getOutEdges(&SUnits[i])) {
       // Only create a back-edge on the first and last nodes of a dependence
       // chain. This records any chains and adds them later.
-      if (SI.getKind() == SDep::Output) {
-        int N = SI.getSUnit()->NodeNum;
+      if (OE.isOutputDep()) {
+        int N = OE.getDst()->NodeNum;
         int BackEdge = i;
         auto Dep = OutputDeps.find(BackEdge);
         if (Dep != OutputDeps.end()) {
@@ -1628,11 +1606,19 @@ void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
         OutputDeps[N] = BackEdge;
       }
       // Do not process a boundary node, an artificial node.
-      // A back-edge is processed only if it goes to a Phi.
-      if (SI.getSUnit()->isBoundaryNode() || SI.isArtificial() ||
-          (SI.getKind() == SDep::Anti && !SI.getSUnit()->getInstr()->isPHI()))
+      if (OE.getDst()->isBoundaryNode() || OE.isArtificial())
+        continue;
+
+      // This code is retained to preserve previous behavior and prevent
+      // regression. 
This condition means that anti-dependencies within an
+      // iteration are ignored when searching circuits. Therefore it's natural
+      // to consider this dependence as well.
+      // FIXME: Remove this code if it doesn't have significant impact on
+      // performance.
+      if (OE.isAntiDep())
         continue;
-      int N = SI.getSUnit()->NodeNum;
+
+      int N = OE.getDst()->NodeNum;
       if (!Added.test(N)) {
         AdjK[i].push_back(N);
         Added.set(N);
@@ -1640,12 +1626,13 @@ void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
     }
     // A chain edge between a store and a load is treated as a back-edge in the
     // adjacency matrix.
-    for (auto &PI : SUnits[i].Preds) {
-      if (!SUnits[i].getInstr()->mayStore() ||
-          !DAG->isLoopCarriedDep(&SUnits[i], PI, false))
+    for (auto &IE : DAG->DDG->getInEdges(&SUnits[i])) {
+      SUnit *Src = IE.getSrc();
+      SUnit *Dst = IE.getDst();
+      if (!Dst->getInstr()->mayStore() || !DAG->isLoopCarriedDep(IE))
         continue;
-      if (PI.getKind() == SDep::Order && PI.getSUnit()->getInstr()->mayLoad()) {
-        int N = PI.getSUnit()->NodeNum;
+      if (IE.isOrderDep() && Src->getInstr()->mayLoad()) {
+        int N = Src->NodeNum;
         if (!Added.test(N)) {
           AdjK[i].push_back(N);
           Added.set(N);
@@ -1720,10 +1707,6 @@ void SwingSchedulerDAG::Circuits::unblock(int U) {
 /// Identify all the elementary circuits in the dependence graph using
 /// Johnson's circuit algorithm.
 void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
-  // Swap all the anti dependences in the DAG. That means it is no longer a DAG,
-  // but we do this to find the circuits, and then change them back.
-  swapAntiDependences(SUnits);
-
   Circuits Cir(SUnits, Topo);
   // Create the adjacency structure.
   Cir.createAdjacencyStructure(this);
@@ -1731,9 +1714,6 @@ void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
     Cir.reset();
     Cir.circuit(I, I, NodeSets, this);
   }
-
-  // Change the dependences back so that we've created a DAG again.
- swapAntiDependences(SUnits); } // Create artificial dependencies between the source of COPY/REG_SEQUENCE that @@ -1816,15 +1796,6 @@ void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) { } } -/// Return true for DAG nodes that we ignore when computing the cost functions. -/// We ignore the back-edge recurrence in order to avoid unbounded recursion -/// in the calculation of the ASAP, ALAP, etc functions. -static bool ignoreDependence(const SDep &D, bool isPred) { - if (D.isArtificial() || D.getSUnit()->isBoundaryNode()) - return true; - return D.getKind() == SDep::Anti && isPred; -} - /// Compute several functions need to order the nodes for scheduling. /// ASAP - Earliest time to schedule a node. /// ALAP - Latest time to schedule a node. @@ -1847,15 +1818,15 @@ void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) { int asap = 0; int zeroLatencyDepth = 0; SUnit *SU = &SUnits[I]; - for (const SDep &P : SU->Preds) { - SUnit *pred = P.getSUnit(); - if (P.getLatency() == 0) + for (const auto &IE : DDG->getInEdges(SU)) { + SUnit *Pred = IE.getSrc(); + if (IE.getLatency() == 0) zeroLatencyDepth = - std::max(zeroLatencyDepth, getZeroLatencyDepth(pred) + 1); - if (ignoreDependence(P, true)) + std::max(zeroLatencyDepth, getZeroLatencyDepth(Pred) + 1); + if (IE.ignoreDependence(true)) continue; - asap = std::max(asap, (int)(getASAP(pred) + P.getLatency() - - getDistance(pred, SU, P) * MII)); + asap = std::max(asap, (int)(getASAP(Pred) + IE.getLatency() - + IE.getDistance() * MII)); } maxASAP = std::max(maxASAP, asap); ScheduleInfo[I].ASAP = asap; @@ -1867,17 +1838,17 @@ void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) { int alap = maxASAP; int zeroLatencyHeight = 0; SUnit *SU = &SUnits[I]; - for (const SDep &S : SU->Succs) { - SUnit *succ = S.getSUnit(); - if (succ->isBoundaryNode()) + for (const auto &OE : DDG->getOutEdges(SU)) { + SUnit *Succ = OE.getDst(); + if (Succ->isBoundaryNode()) continue; - if 
(S.getLatency() == 0) + if (OE.getLatency() == 0) zeroLatencyHeight = - std::max(zeroLatencyHeight, getZeroLatencyHeight(succ) + 1); - if (ignoreDependence(S, true)) + std::max(zeroLatencyHeight, getZeroLatencyHeight(Succ) + 1); + if (OE.ignoreDependence(true)) continue; - alap = std::min(alap, (int)(getALAP(succ) - S.getLatency() + - getDistance(SU, succ, S) * MII)); + alap = std::min(alap, (int)(getALAP(Succ) - OE.getLatency() + + OE.getDistance() * MII)); } ScheduleInfo[I].ALAP = alap; @@ -1906,26 +1877,33 @@ void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) { /// as the predecessors of the elements of NodeOrder that are not also in /// NodeOrder. static bool pred_L(SetVector &NodeOrder, - SmallSetVector &Preds, + SmallSetVector &Preds, SwingSchedulerDDG *DDG, const NodeSet *S = nullptr) { Preds.clear(); - for (const SUnit *SU : NodeOrder) { - for (const SDep &Pred : SU->Preds) { - if (S && S->count(Pred.getSUnit()) == 0) + + for (SUnit *SU : NodeOrder) { + for (const auto &IE : DDG->getInEdges(SU)) { + SUnit *PredSU = IE.getSrc(); + if (S && S->count(PredSU) == 0) continue; - if (ignoreDependence(Pred, true)) + if (IE.ignoreDependence(true)) continue; - if (NodeOrder.count(Pred.getSUnit()) == 0) - Preds.insert(Pred.getSUnit()); + if (NodeOrder.count(PredSU) == 0) + Preds.insert(PredSU); } - // Back-edges are predecessors with an anti-dependence. - for (const SDep &Succ : SU->Succs) { - if (Succ.getKind() != SDep::Anti) + + // FIXME: The following loop-carried dependencies may also need to be + // considered. + // - Physical register dependencies (true-dependence and WAW). + // - Memory dependencies. 
+    for (const auto &OE : DDG->getOutEdges(SU)) {
+      SUnit *SuccSU = OE.getDst();
+      if (!OE.isAntiDep())
         continue;
-      if (S && S->count(Succ.getSUnit()) == 0)
+      if (S && S->count(SuccSU) == 0)
         continue;
-      if (NodeOrder.count(Succ.getSUnit()) == 0)
-        Preds.insert(Succ.getSUnit());
+      if (NodeOrder.count(SuccSU) == 0)
+        Preds.insert(SuccSU);
     }
   }
   return !Preds.empty();
@@ -1935,25 +1913,33 @@ static bool pred_L(SetVector &NodeOrder,
 /// as the successors of the elements of NodeOrder that are not also in
 /// NodeOrder.
 static bool succ_L(SetVector &NodeOrder,
-                   SmallSetVector &Succs,
+                   SmallSetVector &Succs, SwingSchedulerDDG *DDG,
                    const NodeSet *S = nullptr) {
   Succs.clear();
-  for (const SUnit *SU : NodeOrder) {
-    for (const SDep &Succ : SU->Succs) {
-      if (S && S->count(Succ.getSUnit()) == 0)
+
+  for (SUnit *SU : NodeOrder) {
+    for (const auto &OE : DDG->getOutEdges(SU)) {
+      SUnit *SuccSU = OE.getDst();
+      if (S && S->count(SuccSU) == 0)
         continue;
-      if (ignoreDependence(Succ, false))
+      if (OE.ignoreDependence(false))
         continue;
-      if (NodeOrder.count(Succ.getSUnit()) == 0)
-        Succs.insert(Succ.getSUnit());
+      if (NodeOrder.count(SuccSU) == 0)
+        Succs.insert(SuccSU);
     }
-    for (const SDep &Pred : SU->Preds) {
-      if (Pred.getKind() != SDep::Anti)
+
+    // FIXME: The following loop-carried dependencies may also need to be
+    // considered.
+    // - Physical register dependencies (true-dependence and WAW).
+    // - Memory dependencies.
+ for (const auto &IE : DDG->getInEdges(SU)) { + SUnit *PredSU = IE.getSrc(); + if (!IE.isAntiDep()) continue; - if (S && S->count(Pred.getSUnit()) == 0) + if (S && S->count(PredSU) == 0) continue; - if (NodeOrder.count(Pred.getSUnit()) == 0) - Succs.insert(Pred.getSUnit()); + if (NodeOrder.count(PredSU) == 0) + Succs.insert(PredSU); } } return !Succs.empty(); @@ -1964,7 +1950,8 @@ static bool succ_L(SetVector &NodeOrder, static bool computePath(SUnit *Cur, SetVector &Path, SetVector &DestNodes, SetVector &Exclude, - SmallPtrSet &Visited) { + SmallPtrSet &Visited, + SwingSchedulerDDG *DDG) { if (Cur->isBoundaryNode()) return false; if (Exclude.contains(Cur)) @@ -1974,14 +1961,14 @@ static bool computePath(SUnit *Cur, SetVector &Path, if (!Visited.insert(Cur).second) return Path.contains(Cur); bool FoundPath = false; - for (auto &SI : Cur->Succs) - if (!ignoreDependence(SI, false)) + for (const auto &OE : DDG->getOutEdges(Cur)) + if (!OE.ignoreDependence(false)) FoundPath |= - computePath(SI.getSUnit(), Path, DestNodes, Exclude, Visited); - for (auto &PI : Cur->Preds) - if (PI.getKind() == SDep::Anti) + computePath(OE.getDst(), Path, DestNodes, Exclude, Visited, DDG); + for (const auto &IE : DDG->getInEdges(Cur)) + if (IE.isAntiDep() && IE.getDistance() == 0) FoundPath |= - computePath(PI.getSUnit(), Path, DestNodes, Exclude, Visited); + computePath(IE.getSrc(), Path, DestNodes, Exclude, Visited, DDG); if (FoundPath) Path.insert(Cur); return FoundPath; @@ -2078,14 +2065,14 @@ void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) { for (int i = 0, e = NodeSets.size(); i < e; ++i) { NodeSet &N1 = NodeSets[i]; SmallSetVector S1; - if (N1.empty() || !succ_L(N1, S1)) + if (N1.empty() || !succ_L(N1, S1, DDG.get())) continue; for (int j = i + 1; j < e; ++j) { NodeSet &N2 = NodeSets[j]; if (N1.compareRecMII(N2) != 0) continue; SmallSetVector S2; - if (N2.empty() || !succ_L(N2, S2)) + if (N2.empty() || !succ_L(N2, S2, DDG.get())) continue; if 
(llvm::set_is_subset(S1, S2) && S1.size() == S2.size()) { N1.setColocate(++Colocate); @@ -2126,22 +2113,22 @@ void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) { for (NodeSet &I : NodeSets) { SmallSetVector N; // Add the nodes from the current node set to the previous node set. - if (succ_L(I, N)) { + if (succ_L(I, N, DDG.get())) { SetVector Path; for (SUnit *NI : N) { Visited.clear(); - computePath(NI, Path, NodesAdded, I, Visited); + computePath(NI, Path, NodesAdded, I, Visited, DDG.get()); } if (!Path.empty()) I.insert(Path.begin(), Path.end()); } // Add the nodes from the previous node set to the current node set. N.clear(); - if (succ_L(NodesAdded, N)) { + if (succ_L(NodesAdded, N, DDG.get())) { SetVector Path; for (SUnit *NI : N) { Visited.clear(); - computePath(NI, Path, I, NodesAdded, Visited); + computePath(NI, Path, I, NodesAdded, Visited, DDG.get()); } if (!Path.empty()) I.insert(Path.begin(), Path.end()); @@ -2153,7 +2140,7 @@ void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) { // in a recurrent set. NodeSet NewSet; SmallSetVector N; - if (succ_L(NodesAdded, N)) + if (succ_L(NodesAdded, N, DDG.get())) for (SUnit *I : N) addConnectedNodes(I, NewSet, NodesAdded); if (!NewSet.empty()) @@ -2162,7 +2149,7 @@ void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) { // Create a new node set with the connected nodes of any predecessor of a node // in a recurrent set. 
NewSet.clear(); - if (pred_L(NodesAdded, N)) + if (pred_L(NodesAdded, N, DDG.get())) for (SUnit *I : N) addConnectedNodes(I, NewSet, NodesAdded); if (!NewSet.empty()) @@ -2185,15 +2172,15 @@ void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet, SetVector &NodesAdded) { NewSet.insert(SU); NodesAdded.insert(SU); - for (auto &SI : SU->Succs) { - SUnit *Successor = SI.getSUnit(); - if (!SI.isArtificial() && !Successor->isBoundaryNode() && + for (auto &OE : DDG->getOutEdges(SU)) { + SUnit *Successor = OE.getDst(); + if (!OE.isArtificial() && !Successor->isBoundaryNode() && NodesAdded.count(Successor) == 0) addConnectedNodes(Successor, NewSet, NodesAdded); } - for (auto &PI : SU->Preds) { - SUnit *Predecessor = PI.getSUnit(); - if (!PI.isArtificial() && NodesAdded.count(Predecessor) == 0) + for (auto &IE : DDG->getInEdges(SU)) { + SUnit *Predecessor = IE.getSrc(); + if (!IE.isArtificial() && NodesAdded.count(Predecessor) == 0) addConnectedNodes(Predecessor, NewSet, NodesAdded); } } @@ -2259,11 +2246,12 @@ void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) { LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n"); OrderKind Order; SmallSetVector N; - if (pred_L(NodeOrder, N) && llvm::set_is_subset(N, Nodes)) { + if (pred_L(NodeOrder, N, DDG.get()) && llvm::set_is_subset(N, Nodes)) { R.insert(N.begin(), N.end()); Order = BottomUp; LLVM_DEBUG(dbgs() << " Bottom up (preds) "); - } else if (succ_L(NodeOrder, N) && llvm::set_is_subset(N, Nodes)) { + } else if (succ_L(NodeOrder, N, DDG.get()) && + llvm::set_is_subset(N, Nodes)) { R.insert(N.begin(), N.end()); Order = TopDown; LLVM_DEBUG(dbgs() << " Top down (succs) "); @@ -2313,30 +2301,36 @@ void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) { NodeOrder.insert(maxHeight); LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " "); R.remove(maxHeight); - for (const auto &I : maxHeight->Succs) { - if (Nodes.count(I.getSUnit()) == 0) + for (const auto &OE : DDG->getOutEdges(maxHeight)) { + 
SUnit *SU = OE.getDst();
+        if (Nodes.count(SU) == 0)
           continue;
-        if (NodeOrder.contains(I.getSUnit()))
+        if (NodeOrder.contains(SU))
           continue;
-        if (ignoreDependence(I, false))
+        if (OE.ignoreDependence(false))
           continue;
-        R.insert(I.getSUnit());
+        R.insert(SU);
       }
-      // Back-edges are predecessors with an anti-dependence.
-      for (const auto &I : maxHeight->Preds) {
-        if (I.getKind() != SDep::Anti)
+
+      // FIXME: The following loop-carried dependencies may also need to be
+      // considered.
+      // - Physical register dependencies (true-dependence and WAW).
+      // - Memory dependencies.
+      for (const auto &IE : DDG->getInEdges(maxHeight)) {
+        SUnit *SU = IE.getSrc();
+        if (!IE.isAntiDep())
           continue;
-        if (Nodes.count(I.getSUnit()) == 0)
+        if (Nodes.count(SU) == 0)
           continue;
-        if (NodeOrder.contains(I.getSUnit()))
+        if (NodeOrder.contains(SU))
           continue;
-        R.insert(I.getSUnit());
+        R.insert(SU);
       }
     }
     Order = BottomUp;
     LLVM_DEBUG(dbgs() << "\n   Switching order to bottom up ");
     SmallSetVector N;
-    if (pred_L(NodeOrder, N, &Nodes))
+    if (pred_L(NodeOrder, N, DDG.get(), &Nodes))
       R.insert(N.begin(), N.end());
   } else {
     // Choose the node with the maximum depth. If more than one, choose
@@ -2364,28 +2358,34 @@ void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
         R.insert(Nodes.getNode(0));
         break;
       }
-      for (const auto &I : maxDepth->Preds) {
-        if (Nodes.count(I.getSUnit()) == 0)
+      for (const auto &IE : DDG->getInEdges(maxDepth)) {
+        SUnit *SU = IE.getSrc();
+        if (Nodes.count(SU) == 0)
           continue;
-        if (NodeOrder.contains(I.getSUnit()))
+        if (NodeOrder.contains(SU))
          continue;
-        R.insert(I.getSUnit());
+        R.insert(SU);
       }
-      // Back-edges are predecessors with an anti-dependence.
-      for (const auto &I : maxDepth->Succs) {
-        if (I.getKind() != SDep::Anti)
+
+      // FIXME: The following loop-carried dependencies may also need to be
+      // considered.
+      // - Physical register dependencies (true-dependence and WAW).
+      // - Memory dependencies.
+ for (const auto &OE : DDG->getOutEdges(maxDepth)) { + SUnit *SU = OE.getDst(); + if (!OE.isAntiDep()) continue; - if (Nodes.count(I.getSUnit()) == 0) + if (Nodes.count(SU) == 0) continue; - if (NodeOrder.contains(I.getSUnit())) + if (NodeOrder.contains(SU)) continue; - R.insert(I.getSUnit()); + R.insert(SU); } } Order = TopDown; LLVM_DEBUG(dbgs() << "\n Switching order to top down "); SmallSetVector N; - if (succ_L(NodeOrder, N, &Nodes)) + if (succ_L(NodeOrder, N, DDG.get(), &Nodes)) R.insert(N.begin(), N.end()); } } @@ -2458,7 +2458,7 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) { // loop-carried output/order dependencies. Empirically, there are also // cases where scheduling becomes possible with backward search. if (SU->getInstr()->isPHI() || - Schedule.onlyHasLoopCarriedOutputOrOrderPreds(SU, this)) + Schedule.onlyHasLoopCarriedOutputOrOrderPreds(SU, this->getDDG())) scheduleFound = Schedule.insert(SU, LateStart, EarlyStart, II); else scheduleFound = Schedule.insert(SU, EarlyStart, LateStart, II); @@ -2678,22 +2678,20 @@ MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) { /// Return true for an order or output dependence that is loop carried /// potentially. A dependence is loop carried if the destination defines a value /// that may be used or defined by the source in a subsequent iteration. 
-bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep, - bool isSucc) const { - if ((Dep.getKind() != SDep::Order && Dep.getKind() != SDep::Output) || - Dep.isArtificial() || Dep.getSUnit()->isBoundaryNode()) +bool SwingSchedulerDAG::isLoopCarriedDep( + const SwingSchedulerDDGEdge &Edge) const { + if ((!Edge.isOrderDep() && !Edge.isOutputDep()) || Edge.isArtificial() || + Edge.getDst()->isBoundaryNode()) return false; if (!SwpPruneLoopCarried) return true; - if (Dep.getKind() == SDep::Output) + if (Edge.isOutputDep()) return true; - MachineInstr *SI = Source->getInstr(); - MachineInstr *DI = Dep.getSUnit()->getInstr(); - if (!isSucc) - std::swap(SI, DI); + MachineInstr *SI = Edge.getSrc()->getInstr(); + MachineInstr *DI = Edge.getDst()->getInstr(); assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI."); // Assume ordered loads and stores may have a loop carried dependence. @@ -2815,46 +2813,48 @@ bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) { } // Return the cycle of the earliest scheduled instruction in the chain. 
-int SMSchedule::earliestCycleInChain(const SDep &Dep) { +int SMSchedule::earliestCycleInChain(const SwingSchedulerDDGEdge &Dep, + const SwingSchedulerDDG *DDG) { SmallPtrSet Visited; - SmallVector Worklist; + SmallVector Worklist; Worklist.push_back(Dep); int EarlyCycle = INT_MAX; while (!Worklist.empty()) { - const SDep &Cur = Worklist.pop_back_val(); - SUnit *PrevSU = Cur.getSUnit(); + const SwingSchedulerDDGEdge &Cur = Worklist.pop_back_val(); + SUnit *PrevSU = Cur.getSrc(); if (Visited.count(PrevSU)) continue; std::map::const_iterator it = InstrToCycle.find(PrevSU); if (it == InstrToCycle.end()) continue; EarlyCycle = std::min(EarlyCycle, it->second); - for (const auto &PI : PrevSU->Preds) - if (PI.getKind() == SDep::Order || PI.getKind() == SDep::Output) - Worklist.push_back(PI); + for (const auto &IE : DDG->getInEdges(PrevSU)) + if (IE.isOrderDep() || IE.isOutputDep()) + Worklist.push_back(IE); Visited.insert(PrevSU); } return EarlyCycle; } // Return the cycle of the latest scheduled instruction in the chain. 
-int SMSchedule::latestCycleInChain(const SDep &Dep) { +int SMSchedule::latestCycleInChain(const SwingSchedulerDDGEdge &Dep, + const SwingSchedulerDDG *DDG) { SmallPtrSet Visited; - SmallVector Worklist; + SmallVector Worklist; Worklist.push_back(Dep); int LateCycle = INT_MIN; while (!Worklist.empty()) { - const SDep &Cur = Worklist.pop_back_val(); - SUnit *SuccSU = Cur.getSUnit(); + const SwingSchedulerDDGEdge &Cur = Worklist.pop_back_val(); + SUnit *SuccSU = Cur.getDst(); if (Visited.count(SuccSU) || SuccSU->isBoundaryNode()) continue; std::map::const_iterator it = InstrToCycle.find(SuccSU); if (it == InstrToCycle.end()) continue; LateCycle = std::max(LateCycle, it->second); - for (const auto &SI : SuccSU->Succs) - if (SI.getKind() == SDep::Order || SI.getKind() == SDep::Output) - Worklist.push_back(SI); + for (const auto &OE : DDG->getOutEdges(SuccSU)) + if (OE.isOrderDep() || OE.isOutputDep()) + Worklist.push_back(OE); Visited.insert(SuccSU); } return LateCycle; @@ -2865,7 +2865,7 @@ int SMSchedule::latestCycleInChain(const SDep &Dep) { /// to a Phi, which contains a reference to another Phi. static SUnit *multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG) { for (auto &P : SU->Preds) - if (DAG->isBackedge(SU, P) && P.getSUnit()->getInstr()->isPHI()) + if (P.getKind() == SDep::Anti && P.getSUnit()->getInstr()->isPHI()) for (auto &S : P.getSUnit()->Succs) if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI()) return P.getSUnit(); @@ -2876,57 +2876,47 @@ static SUnit *multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG) { /// depends on any predecessor or successor nodes scheduled already. void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart, int II, SwingSchedulerDAG *DAG) { + const SwingSchedulerDDG *DDG = DAG->getDDG(); + // Iterate over each instruction that has been scheduled already. 
The start // slot computation depends on whether the previously scheduled instruction // is a predecessor or successor of the specified instruction. for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) { - - // Iterate over each instruction in the current cycle. for (SUnit *I : getInstructions(cycle)) { - // Because we're processing a DAG for the dependences, we recognize - // the back-edge in recurrences by anti dependences. - for (unsigned i = 0, e = (unsigned)SU->Preds.size(); i != e; ++i) { - const SDep &Dep = SU->Preds[i]; - if (Dep.getSUnit() == I) { - if (!DAG->isBackedge(SU, Dep)) { - int EarlyStart = cycle + Dep.getLatency() - - DAG->getDistance(Dep.getSUnit(), SU, Dep) * II; - *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart); - if (DAG->isLoopCarriedDep(SU, Dep, false)) { - int End = earliestCycleInChain(Dep) + (II - 1); - *MinLateStart = std::min(*MinLateStart, End); - } - } else { - int LateStart = cycle - Dep.getLatency() + - DAG->getDistance(SU, Dep.getSUnit(), Dep) * II; - *MinLateStart = std::min(*MinLateStart, LateStart); + for (const auto &IE : DDG->getInEdges(SU)) { + if (IE.getSrc() == I) { + // FIXME: Add reverse edge to `DDG` instead of calling + // `isLoopCarriedDep` + if (DAG->isLoopCarriedDep(IE)) { + int End = earliestCycleInChain(IE, DDG) + (II - 1); + *MinLateStart = std::min(*MinLateStart, End); } + int EarlyStart = cycle + IE.getLatency() - IE.getDistance() * II; + *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart); } + } + + for (const auto &OE : DDG->getOutEdges(SU)) { + if (OE.getDst() == I) { + // FIXME: Add reverse edge to `DDG` instead of calling + // `isLoopCarriedDep` + if (DAG->isLoopCarriedDep(OE)) { + int Start = latestCycleInChain(OE, DDG) + 1 - II; + *MaxEarlyStart = std::max(*MaxEarlyStart, Start); + } + int LateStart = cycle - OE.getLatency() + OE.getDistance() * II; + *MinLateStart = std::min(*MinLateStart, LateStart); + } + } + + SUnit *BE = multipleIterations(I, DAG); + for (const auto &Dep : 
SU->Preds) { // For instruction that requires multiple iterations, make sure that // the dependent instruction is not scheduled past the definition. - SUnit *BE = multipleIterations(I, DAG); if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() && !SU->isPred(I)) *MinLateStart = std::min(*MinLateStart, cycle); } - for (unsigned i = 0, e = (unsigned)SU->Succs.size(); i != e; ++i) { - if (SU->Succs[i].getSUnit() == I) { - const SDep &Dep = SU->Succs[i]; - if (!DAG->isBackedge(SU, Dep)) { - int LateStart = cycle - Dep.getLatency() + - DAG->getDistance(SU, Dep.getSUnit(), Dep) * II; - *MinLateStart = std::min(*MinLateStart, LateStart); - if (DAG->isLoopCarriedDep(SU, Dep)) { - int Start = latestCycleInChain(Dep) + 1 - II; - *MaxEarlyStart = std::max(*MaxEarlyStart, Start); - } - } else { - int EarlyStart = cycle + Dep.getLatency() - - DAG->getDistance(Dep.getSUnit(), SU, Dep) * II; - *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart); - } - } - } } } } @@ -2943,6 +2933,7 @@ void SMSchedule::orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU, unsigned MoveDef = 0; unsigned MoveUse = 0; int StageInst1 = stageScheduled(SU); + const SwingSchedulerDDG *DDG = SSD->getDDG(); unsigned Pos = 0; for (std::deque::iterator I = Insts.begin(), E = Insts.end(); I != E; @@ -3000,10 +2991,10 @@ void SMSchedule::orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU, } // Check for order dependences between instructions. Make sure the source // is ordered before the destination. 
- for (auto &S : SU->Succs) { - if (S.getSUnit() != *I) + for (auto &OE : DDG->getOutEdges(SU)) { + if (OE.getDst() != *I) continue; - if (S.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) { + if (OE.isOrderDep() && stageScheduled(*I) == StageInst1) { OrderBeforeUse = true; if (Pos < MoveUse) MoveUse = Pos; @@ -3011,18 +3002,17 @@ void SMSchedule::orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU, // We did not handle HW dependences in previous for loop, // and we normally set Latency = 0 for Anti/Output deps, // so may have nodes in same cycle with Anti/Output dependent on HW regs. - else if ((S.getKind() == SDep::Anti || S.getKind() == SDep::Output) && + else if ((OE.isAntiDep() || OE.isOutputDep()) && stageScheduled(*I) == StageInst1) { OrderBeforeUse = true; if ((MoveUse == 0) || (Pos < MoveUse)) MoveUse = Pos; } } - for (auto &P : SU->Preds) { - if (P.getSUnit() != *I) + for (auto &IE : DDG->getInEdges(SU)) { + if (IE.getSrc() != *I) continue; - if ((P.getKind() == SDep::Order || P.getKind() == SDep::Anti || - P.getKind() == SDep::Output) && + if ((IE.isAntiDep() || IE.isOutputDep() || IE.isOrderDep()) && stageScheduled(*I) == StageInst1) { OrderAfterDef = true; MoveDef = Pos; @@ -3117,12 +3107,9 @@ bool SMSchedule::isLoopCarriedDefOfUse(const SwingSchedulerDAG *SSD, /// Return true if all scheduled predecessors are loop-carried output/order /// dependencies. 
bool SMSchedule::onlyHasLoopCarriedOutputOrOrderPreds( - SUnit *SU, SwingSchedulerDAG *DAG) const { - for (const SDep &Pred : SU->Preds) - if (InstrToCycle.count(Pred.getSUnit()) && !DAG->isBackedge(SU, Pred)) - return false; - for (const SDep &Succ : SU->Succs) - if (InstrToCycle.count(Succ.getSUnit()) && DAG->isBackedge(SU, Succ)) + SUnit *SU, const SwingSchedulerDDG *DDG) const { + for (const auto &IE : DDG->getInEdges(SU)) + if (InstrToCycle.count(IE.getSrc())) return false; return true; } @@ -3137,18 +3124,21 @@ SmallSet SMSchedule::computeUnpipelineableNodes( if (SU.isInstr() && PLI->shouldIgnoreForPipelining(SU.getInstr())) Worklist.push_back(&SU); + const SwingSchedulerDDG *DDG = SSD->getDDG(); while (!Worklist.empty()) { auto SU = Worklist.pop_back_val(); if (DoNotPipeline.count(SU)) continue; LLVM_DEBUG(dbgs() << "Do not pipeline SU(" << SU->NodeNum << ")\n"); DoNotPipeline.insert(SU); - for (auto &Dep : SU->Preds) - Worklist.push_back(Dep.getSUnit()); - if (SU->getInstr()->isPHI()) - for (auto &Dep : SU->Succs) - if (Dep.getKind() == SDep::Anti) - Worklist.push_back(Dep.getSUnit()); + for (const auto &IE : DDG->getInEdges(SU)) + Worklist.push_back(IE.getSrc()); + + // To preserve previous behavior and prevent regression + // FIXME: Remove if this doesn't have significant impact on + for (const auto &OE : DDG->getOutEdges(SU)) + if (OE.getDistance() == 1) + Worklist.push_back(OE.getDst()); } return DoNotPipeline; } @@ -3170,8 +3160,15 @@ bool SMSchedule::normalizeNonPipelinedInstructions( // Put the non-pipelined instruction as early as possible in the schedule int NewCycle = getFirstCycle(); - for (auto &Dep : SU.Preds) - NewCycle = std::max(InstrToCycle[Dep.getSUnit()], NewCycle); + for (const auto &IE : SSD->getDDG()->getInEdges(&SU)) + if (IE.getDistance() == 0) + NewCycle = std::max(InstrToCycle[IE.getSrc()], NewCycle); + + // To preserve previous behavior and prevent regression + // FIXME: Remove if this doesn't have significant impact on 
performance + for (auto &OE : SSD->getDDG()->getOutEdges(&SU)) + if (OE.getDistance() == 1) + NewCycle = std::max(InstrToCycle[OE.getDst()], NewCycle); int OldCycle = InstrToCycle[&SU]; if (OldCycle != NewCycle) { @@ -3204,14 +3201,16 @@ bool SMSchedule::isValidSchedule(SwingSchedulerDAG *SSD) { int StageDef = stageScheduled(&SU); int CycleDef = InstrToCycle[&SU]; assert(StageDef != -1 && "Instruction should have been scheduled."); - for (auto &SI : SU.Succs) - if (SI.isAssignedRegDep() && !SI.getSUnit()->isBoundaryNode()) - if (Register::isPhysicalRegister(SI.getReg())) { - if (stageScheduled(SI.getSUnit()) != StageDef) + for (auto &OE : SSD->getDDG()->getOutEdges(&SU)) { + SUnit *Dst = OE.getDst(); + if (OE.isAssignedRegDep() && !Dst->isBoundaryNode()) + if (Register::isPhysicalRegister(OE.getReg())) { + if (stageScheduled(Dst) != StageDef) return false; - if (InstrToCycle[SI.getSUnit()] <= CycleDef) + if (InstrToCycle[Dst] <= CycleDef) return false; } + } } return true; } @@ -3223,7 +3222,7 @@ bool SMSchedule::isValidSchedule(SwingSchedulerDAG *SSD) { /// The method below checks whether the property is met. /// If not, debug information is printed and statistics information updated. /// Note that we do not use an assert statement. -/// The reason is that although an invalid node oder may prevent +/// The reason is that although an invalid node order may prevent /// the pipeliner from finding a pipelined schedule for arbitrary II, /// it does not lead to the generation of incorrect code. 
void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const { @@ -3261,8 +3260,8 @@ void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const { (void)Succ; (void)Pred; - for (SDep &PredEdge : SU->Preds) { - SUnit *PredSU = PredEdge.getSUnit(); + for (const auto &IE : DDG->getInEdges(SU)) { + SUnit *PredSU = IE.getSrc(); unsigned PredIndex = std::get<1>( *llvm::lower_bound(Indices, std::make_pair(PredSU, 0), CompareKey)); if (!PredSU->getInstr()->isPHI() && PredIndex < Index) { @@ -3272,8 +3271,8 @@ void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const { } } - for (SDep &SuccEdge : SU->Succs) { - SUnit *SuccSU = SuccEdge.getSUnit(); + for (const auto &OE : DDG->getOutEdges(SU)) { + SUnit *SuccSU = OE.getDst(); // Do not process a boundary node, it was not included in NodeOrder, // hence not in Indices either, call to std::lower_bound() below will // return Indices.end(). @@ -3750,3 +3749,72 @@ void ResourceManager::init(int II) { NumScheduledMops.clear(); NumScheduledMops.resize(II); } + +bool SwingSchedulerDDGEdge::ignoreDependence(bool IgnoreAnti) const { + if (Pred.isArtificial() || Dst->isBoundaryNode()) + return true; + // Currently, dependence that is an anti-dependences but not a loop-carried is + // also ignored. This behavior is preserved to prevent regression. 
+ // FIXME: Remove if this doesn't have significant impact on performance + return IgnoreAnti && (Pred.getKind() == SDep::Kind::Anti || Distance != 0); +} + +SwingSchedulerDDG::SwingSchedulerDDGEdges & +SwingSchedulerDDG::getEdges(const SUnit *SU) { + if (SU == EntrySU) + return EntrySUEdges; + if (SU == ExitSU) + return ExitSUEdges; + return EdgesVec[SU->NodeNum]; +} + +const SwingSchedulerDDG::SwingSchedulerDDGEdges & +SwingSchedulerDDG::getEdges(const SUnit *SU) const { + if (SU == EntrySU) + return EntrySUEdges; + if (SU == ExitSU) + return ExitSUEdges; + return EdgesVec[SU->NodeNum]; +} + +void SwingSchedulerDDG::addEdge(const SUnit *SU, + const SwingSchedulerDDGEdge &Edge) { + auto &Edges = getEdges(SU); + if (Edge.getSrc() == SU) + Edges.Succs.push_back(Edge); + else + Edges.Preds.push_back(Edge); +} + +void SwingSchedulerDDG::initEdges(SUnit *SU) { + for (const auto &PI : SU->Preds) { + SwingSchedulerDDGEdge Edge(SU, PI, false); + addEdge(SU, Edge); + } + + for (const auto &SI : SU->Succs) { + SwingSchedulerDDGEdge Edge(SU, SI, true); + addEdge(SU, Edge); + } +} + +SwingSchedulerDDG::SwingSchedulerDDG(std::vector &SUnits, SUnit *EntrySU, + SUnit *ExitSU) + : EntrySU(EntrySU), ExitSU(ExitSU) { + EdgesVec.resize(SUnits.size()); + + initEdges(EntrySU); + initEdges(ExitSU); + for (auto &SU : SUnits) + initEdges(&SU); +} + +const SwingSchedulerDDG::EdgesType & +SwingSchedulerDDG::getInEdges(const SUnit *SU) const { + return getEdges(SU).Preds; +} + +const SwingSchedulerDDG::EdgesType & +SwingSchedulerDDG::getOutEdges(const SUnit *SU) const { + return getEdges(SU).Succs; +} From 7d166fa38470a23f3134a3793b9236b2a5c68fcf Mon Sep 17 00:00:00 2001 From: khaki3 <47756807+khaki3@users.noreply.github.com> Date: Mon, 23 Dec 2024 17:14:38 -0800 Subject: [PATCH 005/567] [flang][cuda] Correct the number of blocks when setting the grid to `*` (#121000) We set the `gridX` argument of `_FortranACUFLaunchKernel` to `-1` when `*` is passed to the grid parameter. 
We store it in one of `dim3` members. However, `dim3` members are unsigned, so positive-value checks we use later, such as `gridDim.x > 0`, are invalid. This PR utilizes the original grid-size arguments to compute the number of blocks. --- flang/runtime/CUDA/kernel.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/flang/runtime/CUDA/kernel.cpp b/flang/runtime/CUDA/kernel.cpp index 88cdf3cf42622..bdc04ccb17672 100644 --- a/flang/runtime/CUDA/kernel.cpp +++ b/flang/runtime/CUDA/kernel.cpp @@ -48,13 +48,13 @@ void RTDEF(CUFLaunchKernel)(const void *kernel, intptr_t gridX, intptr_t gridY, maxBlocks = multiProcCount * maxBlocks; } if (maxBlocks > 0) { - if (gridDim.x > 0) { + if (gridX > 0) { maxBlocks = maxBlocks / gridDim.x; } - if (gridDim.y > 0) { + if (gridY > 0) { maxBlocks = maxBlocks / gridDim.y; } - if (gridDim.z > 0) { + if (gridZ > 0) { maxBlocks = maxBlocks / gridDim.z; } if (maxBlocks < 1) { @@ -113,13 +113,13 @@ void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX, maxBlocks = multiProcCount * maxBlocks; } if (maxBlocks > 0) { - if (config.gridDim.x > 0) { + if (gridX > 0) { maxBlocks = maxBlocks / config.gridDim.x; } - if (config.gridDim.y > 0) { + if (gridY > 0) { maxBlocks = maxBlocks / config.gridDim.y; } - if (config.gridDim.z > 0) { + if (gridZ > 0) { maxBlocks = maxBlocks / config.gridDim.z; } if (maxBlocks < 1) { From 2c95e60df53ba1a5765b3fad9e8ddaff70f21994 Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Tue, 24 Dec 2024 11:34:32 +0900 Subject: [PATCH 006/567] TelemetryTest.cpp: Suppress a warning in #121003 [-Wunused-private-field] --- llvm/unittests/Telemetry/TelemetryTest.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/unittests/Telemetry/TelemetryTest.cpp b/llvm/unittests/Telemetry/TelemetryTest.cpp index f06cee04277fd..6b37e89f16435 100644 --- a/llvm/unittests/Telemetry/TelemetryTest.cpp +++ b/llvm/unittests/Telemetry/TelemetryTest.cpp @@ -193,6 +193,7 @@ class TestManager : 
public Manager { Error preDispatch(TelemetryInfo *Entry) override { Entry->SessionId = SessionId; + (void)CurrentContext; return Error::success(); } From 7ec139ad4bc09857ab2b93926feef0d110071668 Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Tue, 24 Dec 2024 12:25:06 +0900 Subject: [PATCH 007/567] Revert "Reapply "[llvm-jitlink] Use concurrent linking by default." with fixes. (#120958)" Caused random failures. This reverts commit 93d4b1f7a72f366c1ea91b2d65991266053be8d9. (llvmorg-20-init-16299-g93d4b1f7a72f) --- llvm/tools/llvm-jitlink/llvm-jitlink-coff.cpp | 2 - llvm/tools/llvm-jitlink/llvm-jitlink-elf.cpp | 2 - .../tools/llvm-jitlink/llvm-jitlink-macho.cpp | 2 - llvm/tools/llvm-jitlink/llvm-jitlink.cpp | 56 +++---------------- 4 files changed, 8 insertions(+), 54 deletions(-) diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink-coff.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink-coff.cpp index 6db78926101fd..5271fdb556590 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink-coff.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink-coff.cpp @@ -66,8 +66,6 @@ static Expected getCOFFStubTarget(LinkGraph &G, Block &B) { namespace llvm { Error registerCOFFGraphInfo(Session &S, LinkGraph &G) { - std::lock_guard Lock(S.M); - auto FileName = sys::path::filename(G.getName()); if (S.FileInfos.count(FileName)) { return make_error("When -check is passed, file names must be " diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink-elf.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink-elf.cpp index 6aa89413b7230..a8c804a459e3c 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink-elf.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink-elf.cpp @@ -101,8 +101,6 @@ static Error registerSymbol(LinkGraph &G, Symbol &Sym, Session::FileInfo &FI, namespace llvm { Error registerELFGraphInfo(Session &S, LinkGraph &G) { - std::lock_guard Lock(S.M); - auto FileName = sys::path::filename(G.getName()); if (S.FileInfos.count(FileName)) { return make_error("When -check is passed, file names must be " diff --git 
a/llvm/tools/llvm-jitlink/llvm-jitlink-macho.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink-macho.cpp index 2fc56c9fcc72a..2c60c802293a1 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink-macho.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink-macho.cpp @@ -69,8 +69,6 @@ static Expected getMachOStubTarget(LinkGraph &G, Block &B) { namespace llvm { Error registerMachOGraphInfo(Session &S, LinkGraph &G) { - std::lock_guard Lock(S.M); - auto FileName = sys::path::filename(G.getName()); if (S.FileInfos.count(FileName)) { return make_error("When -check is passed, file names must be " diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp index 5b23823317279..96a3e5b2acdf4 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp @@ -91,10 +91,6 @@ static cl::list InputFiles(cl::Positional, cl::OneOrMore, cl::desc("input files"), cl::cat(JITLinkCategory)); -static cl::opt MaterializationThreads( - "num-threads", cl::desc("Number of materialization threads to use"), - cl::init(std::numeric_limits::max()), cl::cat(JITLinkCategory)); - static cl::list LibrarySearchPaths("L", cl::desc("Add dir to the list of library search paths"), @@ -404,7 +400,6 @@ bool lazyLinkingRequested() { } static Error applyHarnessPromotions(Session &S, LinkGraph &G) { - std::lock_guard Lock(S.M); // If this graph is part of the test harness there's nothing to do. 
if (S.HarnessFiles.empty() || S.HarnessFiles.count(G.getName())) @@ -455,11 +450,7 @@ static Error applyHarnessPromotions(Session &S, LinkGraph &G) { return Error::success(); } -static void dumpSectionContents(raw_ostream &OS, Session &S, LinkGraph &G) { - std::lock_guard Lock(S.M); - - outs() << "Relocated section contents for " << G.getName() << ":\n"; - +static void dumpSectionContents(raw_ostream &OS, LinkGraph &G) { constexpr orc::ExecutorAddrDiff DumpWidth = 16; static_assert(isPowerOf2_64(DumpWidth), "DumpWidth must be a power of two"); @@ -851,7 +842,7 @@ static Expected> launchExecutor() { S.CreateMemoryManager = createSharedMemoryManager; return SimpleRemoteEPC::Create( - std::make_unique(MaterializationThreads), + std::make_unique(std::nullopt), std::move(S), FromExecutor[ReadEnd], ToExecutor[WriteEnd]); #endif } @@ -993,16 +984,10 @@ Expected> Session::Create(Triple TT, auto PageSize = sys::Process::getPageSize(); if (!PageSize) return PageSize.takeError(); - std::unique_ptr Dispatcher; - if (MaterializationThreads == 0) - Dispatcher = std::make_unique(); - else - Dispatcher = std::make_unique( - MaterializationThreads); - EPC = std::make_unique( - std::make_shared(), std::move(Dispatcher), - std::move(TT), *PageSize, createInProcessMemoryManager()); + std::make_shared(), + std::make_unique(), std::move(TT), *PageSize, + createInProcessMemoryManager()); } Error Err = Error::success(); @@ -1236,7 +1221,6 @@ void Session::modifyPassConfig(LinkGraph &G, PassConfiguration &PassConfig) { if (ShowGraphsRegex) PassConfig.PostFixupPasses.push_back([this](LinkGraph &G) -> Error { - std::lock_guard Lock(M); // Print graph if ShowLinkGraphs is specified-but-empty, or if // it contains the given graph. 
if (ShowGraphsRegex->match(G.getName())) { @@ -1255,8 +1239,9 @@ void Session::modifyPassConfig(LinkGraph &G, PassConfiguration &PassConfig) { [this](LinkGraph &G) { return applyHarnessPromotions(*this, G); }); if (ShowRelocatedSectionContents) - PassConfig.PostFixupPasses.push_back([this](LinkGraph &G) -> Error { - dumpSectionContents(outs(), *this, G); + PassConfig.PostFixupPasses.push_back([](LinkGraph &G) -> Error { + outs() << "Relocated section contents for " << G.getName() << ":\n"; + dumpSectionContents(outs(), G); return Error::success(); }); @@ -1628,31 +1613,6 @@ static Error sanitizeArguments(const Triple &TT, const char *ArgV0) { } } - if (MaterializationThreads == std::numeric_limits::max()) { - if (auto HC = std::thread::hardware_concurrency()) - MaterializationThreads = HC; - else { - errs() << "Warning: std::thread::hardware_concurrency() returned 0, " - "defaulting to -threads=1.\n"; - MaterializationThreads = 1; - } - } - - if (!!OutOfProcessExecutor.getNumOccurrences() || - !!OutOfProcessExecutorConnect.getNumOccurrences()) { - if (NoExec) - return make_error("-noexec cannot be used with " + - OutOfProcessExecutor.ArgStr + " or " + - OutOfProcessExecutorConnect.ArgStr, - inconvertibleErrorCode()); - - if (MaterializationThreads == 0) - return make_error("-threads=0 cannot be used with " + - OutOfProcessExecutor.ArgStr + " or " + - OutOfProcessExecutorConnect.ArgStr, - inconvertibleErrorCode()); - } - // Only one of -oop-executor and -oop-executor-connect can be used. 
if (!!OutOfProcessExecutor.getNumOccurrences() && !!OutOfProcessExecutorConnect.getNumOccurrences()) From 4cb2a519db10f54815c8a4ccd5accbedc1cdfd07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Mon, 23 Dec 2024 21:27:34 -0800 Subject: [PATCH 008/567] Revert "Reland '[flang] Allow to pass an async id to allocate the descriptor (#118713)' and #118733" (#121029) This still cause issue for device runtime build. --- flang/include/flang/Runtime/CUDA/allocator.h | 8 +++--- flang/include/flang/Runtime/CUDA/common.h | 3 --- flang/include/flang/Runtime/allocatable.h | 6 ++--- .../flang/Runtime/allocator-registry.h | 10 +++---- flang/include/flang/Runtime/descriptor.h | 2 +- flang/lib/Lower/Allocatable.cpp | 11 +++----- .../Optimizer/Builder/Runtime/Allocatable.cpp | 9 +++---- flang/runtime/CUDA/allocatable.cpp | 2 +- flang/runtime/CUDA/allocator.cpp | 15 ++++------- flang/runtime/CUDA/descriptor.cpp | 3 +-- flang/runtime/allocatable.cpp | 10 +++---- flang/runtime/array-constructor.cpp | 8 +++--- flang/runtime/descriptor.cpp | 4 +-- flang/test/HLFIR/elemental-codegen.fir | 6 ++--- flang/test/Lower/OpenACC/acc-declare.f90 | 4 +-- flang/test/Lower/allocatable-polymorphic.f90 | 26 +++++++++---------- flang/test/Lower/allocatable-runtime.f90 | 4 +-- flang/test/Lower/allocate-mold.f90 | 4 +-- flang/test/Lower/polymorphic.f90 | 6 ++--- flang/unittests/Runtime/CUDA/Allocatable.cpp | 3 +-- flang/unittests/Runtime/CUDA/AllocatorCUF.cpp | 21 +-------------- flang/unittests/Runtime/CUDA/Memory.cpp | 3 +-- 22 files changed, 63 insertions(+), 105 deletions(-) diff --git a/flang/include/flang/Runtime/CUDA/allocator.h b/flang/include/flang/Runtime/CUDA/allocator.h index b6f0e7f303176..4fb4c94c5e9b0 100644 --- a/flang/include/flang/Runtime/CUDA/allocator.h +++ b/flang/include/flang/Runtime/CUDA/allocator.h @@ -20,16 +20,16 @@ extern "C" { void 
RTDECL(CUFRegisterAllocator)(); } -void *CUFAllocPinned(std::size_t, std::int64_t = kCudaNoStream); +void *CUFAllocPinned(std::size_t); void CUFFreePinned(void *); -void *CUFAllocDevice(std::size_t, std::int64_t); +void *CUFAllocDevice(std::size_t); void CUFFreeDevice(void *); -void *CUFAllocManaged(std::size_t, std::int64_t = kCudaNoStream); +void *CUFAllocManaged(std::size_t); void CUFFreeManaged(void *); -void *CUFAllocUnified(std::size_t, std::int64_t = kCudaNoStream); +void *CUFAllocUnified(std::size_t); void CUFFreeUnified(void *); } // namespace Fortran::runtime::cuda diff --git a/flang/include/flang/Runtime/CUDA/common.h b/flang/include/flang/Runtime/CUDA/common.h index 9c95f727ee673..474f8e6578b89 100644 --- a/flang/include/flang/Runtime/CUDA/common.h +++ b/flang/include/flang/Runtime/CUDA/common.h @@ -23,9 +23,6 @@ static constexpr unsigned kHostToDevice = 0; static constexpr unsigned kDeviceToHost = 1; static constexpr unsigned kDeviceToDevice = 2; -/// Value used for asyncId when no specific stream is specified. -static constexpr std::int64_t kCudaNoStream = -1; - #define CUDA_REPORT_IF_ERROR(expr) \ [](cudaError_t err) { \ if (err == cudaSuccess) \ diff --git a/flang/include/flang/Runtime/allocatable.h b/flang/include/flang/Runtime/allocatable.h index 121c31af963aa..58061d9862095 100644 --- a/flang/include/flang/Runtime/allocatable.h +++ b/flang/include/flang/Runtime/allocatable.h @@ -94,9 +94,9 @@ int RTDECL(AllocatableCheckLengthParameter)(Descriptor &, // Successfully allocated memory is initialized if the allocatable has a // derived type, and is always initialized by AllocatableAllocateSource(). // Performs all necessary coarray synchronization and validation actions. 
-int RTDECL(AllocatableAllocate)(Descriptor &, std::int64_t asyncId = -1, - bool hasStat = false, const Descriptor *errMsg = nullptr, - const char *sourceFile = nullptr, int sourceLine = 0); +int RTDECL(AllocatableAllocate)(Descriptor &, bool hasStat = false, + const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr, + int sourceLine = 0); int RTDECL(AllocatableAllocateSource)(Descriptor &, const Descriptor &source, bool hasStat = false, const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr, int sourceLine = 0); diff --git a/flang/include/flang/Runtime/allocator-registry.h b/flang/include/flang/Runtime/allocator-registry.h index 4c3295edf13d9..29302c5d825bc 100644 --- a/flang/include/flang/Runtime/allocator-registry.h +++ b/flang/include/flang/Runtime/allocator-registry.h @@ -11,7 +11,6 @@ #include "flang/Common/api-attrs.h" #include "flang/Runtime/allocator-registry-consts.h" -#include #include #include @@ -19,7 +18,7 @@ namespace Fortran::runtime { -using AllocFct = void *(*)(std::size_t, std::int64_t); +using AllocFct = void *(*)(std::size_t); using FreeFct = void (*)(void *); typedef struct Allocator_t { @@ -27,11 +26,10 @@ typedef struct Allocator_t { FreeFct free{nullptr}; } Allocator_t; -static RT_API_ATTRS void *MallocWrapper( - std::size_t size, [[maybe_unused]] std::int64_t) { +#ifdef RT_DEVICE_COMPILATION +static RT_API_ATTRS void *MallocWrapper(std::size_t size) { return std::malloc(size); } -#ifdef RT_DEVICE_COMPILATION static RT_API_ATTRS void FreeWrapper(void *p) { return std::free(p); } #endif @@ -41,7 +39,7 @@ struct AllocatorRegistry { : allocators{{&MallocWrapper, &FreeWrapper}} {} #else constexpr AllocatorRegistry() { - allocators[kDefaultAllocator] = {&MallocWrapper, &std::free}; + allocators[kDefaultAllocator] = {&std::malloc, &std::free}; }; #endif RT_API_ATTRS void Register(int, Allocator_t); diff --git a/flang/include/flang/Runtime/descriptor.h b/flang/include/flang/Runtime/descriptor.h index 
44e82c6a25687..dd36fba157ca9 100644 --- a/flang/include/flang/Runtime/descriptor.h +++ b/flang/include/flang/Runtime/descriptor.h @@ -369,7 +369,7 @@ class Descriptor { // before calling. It (re)computes the byte strides after // allocation. Does not allocate automatic components or // perform default component initialization. - RT_API_ATTRS int Allocate(std::int64_t asyncId = -1); + RT_API_ATTRS int Allocate(); RT_API_ATTRS void SetByteStrides(); // Deallocates storage; does not call FINAL subroutines or diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp index f1436564aabaa..fb8380ac7e8c5 100644 --- a/flang/lib/Lower/Allocatable.cpp +++ b/flang/lib/Lower/Allocatable.cpp @@ -184,14 +184,9 @@ static mlir::Value genRuntimeAllocate(fir::FirOpBuilder &builder, ? fir::runtime::getRuntimeFunc(loc, builder) : fir::runtime::getRuntimeFunc(loc, builder); - llvm::SmallVector args{box.getAddr()}; - if (!box.isPointer()) - args.push_back( - builder.createIntegerConstant(loc, builder.getI64Type(), -1)); - args.push_back(errorManager.hasStat); - args.push_back(errorManager.errMsgAddr); - args.push_back(errorManager.sourceFile); - args.push_back(errorManager.sourceLine); + llvm::SmallVector args{ + box.getAddr(), errorManager.hasStat, errorManager.errMsgAddr, + errorManager.sourceFile, errorManager.sourceLine}; llvm::SmallVector operands; for (auto [fst, snd] : llvm::zip(args, callee.getFunctionType().getInputs())) operands.emplace_back(builder.createConvert(loc, snd, fst)); diff --git a/flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp b/flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp index 28452d3b486da..70a88ff18cb1d 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp @@ -76,19 +76,16 @@ void fir::runtime::genAllocatableAllocate(fir::FirOpBuilder &builder, mlir::func::FuncOp func{ fir::runtime::getRuntimeFunc(loc, builder)}; mlir::FunctionType fTy{func.getFunctionType()}; - 
mlir::Value asyncId = - builder.createIntegerConstant(loc, builder.getI64Type(), -1); mlir::Value sourceFile{fir::factory::locationToFilename(builder, loc)}; mlir::Value sourceLine{ - fir::factory::locationToLineNo(builder, loc, fTy.getInput(5))}; + fir::factory::locationToLineNo(builder, loc, fTy.getInput(4))}; if (!hasStat) hasStat = builder.createBool(loc, false); if (!errMsg) { mlir::Type boxNoneTy = fir::BoxType::get(builder.getNoneType()); errMsg = builder.create(loc, boxNoneTy).getResult(); } - llvm::SmallVector args{ - fir::runtime::createArguments(builder, loc, fTy, desc, asyncId, hasStat, - errMsg, sourceFile, sourceLine)}; + llvm::SmallVector args{fir::runtime::createArguments( + builder, loc, fTy, desc, hasStat, errMsg, sourceFile, sourceLine)}; builder.create(loc, func, args); } diff --git a/flang/runtime/CUDA/allocatable.cpp b/flang/runtime/CUDA/allocatable.cpp index 3f6f8f3d6d5de..9be54e8906903 100644 --- a/flang/runtime/CUDA/allocatable.cpp +++ b/flang/runtime/CUDA/allocatable.cpp @@ -52,7 +52,7 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t stream, } // Perform the standard allocation. 
int stat{RTNAME(AllocatableAllocate)( - desc, stream, hasStat, errMsg, sourceFile, sourceLine)}; + desc, hasStat, errMsg, sourceFile, sourceLine)}; return stat; } diff --git a/flang/runtime/CUDA/allocator.cpp b/flang/runtime/CUDA/allocator.cpp index d848f1811dcf3..85b3daf65a8ba 100644 --- a/flang/runtime/CUDA/allocator.cpp +++ b/flang/runtime/CUDA/allocator.cpp @@ -33,7 +33,7 @@ void RTDEF(CUFRegisterAllocator)() { } } -void *CUFAllocPinned(std::size_t sizeInBytes, std::int64_t) { +void *CUFAllocPinned(std::size_t sizeInBytes) { void *p; CUDA_REPORT_IF_ERROR(cudaMallocHost((void **)&p, sizeInBytes)); return p; @@ -41,20 +41,15 @@ void *CUFAllocPinned(std::size_t sizeInBytes, std::int64_t) { void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cudaFreeHost(p)); } -void *CUFAllocDevice(std::size_t sizeInBytes, std::int64_t stream) { +void *CUFAllocDevice(std::size_t sizeInBytes) { void *p; - if (stream >= 0) { - CUDA_REPORT_IF_ERROR( - cudaMallocAsync(&p, sizeInBytes, (cudaStream_t)stream)); - } else { - CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes)); - } + CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes)); return p; } void CUFFreeDevice(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); } -void *CUFAllocManaged(std::size_t sizeInBytes, std::int64_t) { +void *CUFAllocManaged(std::size_t sizeInBytes) { void *p; CUDA_REPORT_IF_ERROR( cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal)); @@ -63,7 +58,7 @@ void *CUFAllocManaged(std::size_t sizeInBytes, std::int64_t) { void CUFFreeManaged(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); } -void *CUFAllocUnified(std::size_t sizeInBytes, std::int64_t) { +void *CUFAllocUnified(std::size_t sizeInBytes) { // Call alloc managed for the time being. 
return CUFAllocManaged(sizeInBytes); } diff --git a/flang/runtime/CUDA/descriptor.cpp b/flang/runtime/CUDA/descriptor.cpp index 816b1458ee52c..391c47e84241d 100644 --- a/flang/runtime/CUDA/descriptor.cpp +++ b/flang/runtime/CUDA/descriptor.cpp @@ -20,8 +20,7 @@ RT_EXT_API_GROUP_BEGIN Descriptor *RTDEF(CUFAllocDescriptor)( std::size_t sizeInBytes, const char *sourceFile, int sourceLine) { - return reinterpret_cast( - CUFAllocManaged(sizeInBytes, kCudaNoStream)); + return reinterpret_cast(CUFAllocManaged(sizeInBytes)); } void RTDEF(CUFFreeDescriptor)( diff --git a/flang/runtime/allocatable.cpp b/flang/runtime/allocatable.cpp index b65cec8d51cf8..5e065f47636a8 100644 --- a/flang/runtime/allocatable.cpp +++ b/flang/runtime/allocatable.cpp @@ -133,17 +133,15 @@ void RTDEF(AllocatableApplyMold)( } } -int RTDEF(AllocatableAllocate)(Descriptor &descriptor, std::int64_t asyncId, - bool hasStat, const Descriptor *errMsg, const char *sourceFile, - int sourceLine) { +int RTDEF(AllocatableAllocate)(Descriptor &descriptor, bool hasStat, + const Descriptor *errMsg, const char *sourceFile, int sourceLine) { Terminator terminator{sourceFile, sourceLine}; if (!descriptor.IsAllocatable()) { return ReturnError(terminator, StatInvalidDescriptor, errMsg, hasStat); } else if (descriptor.IsAllocated()) { return ReturnError(terminator, StatBaseNotNull, errMsg, hasStat); } else { - int stat{ - ReturnError(terminator, descriptor.Allocate(asyncId), errMsg, hasStat)}; + int stat{ReturnError(terminator, descriptor.Allocate(), errMsg, hasStat)}; if (stat == StatOk) { if (const DescriptorAddendum * addendum{descriptor.Addendum()}) { if (const auto *derived{addendum->derivedType()}) { @@ -162,7 +160,7 @@ int RTDEF(AllocatableAllocateSource)(Descriptor &alloc, const Descriptor &source, bool hasStat, const Descriptor *errMsg, const char *sourceFile, int sourceLine) { int stat{RTNAME(AllocatableAllocate)( - alloc, /*asyncId=*/-1, hasStat, errMsg, sourceFile, sourceLine)}; + alloc, hasStat, errMsg, 
sourceFile, sourceLine)}; if (stat == StatOk) { Terminator terminator{sourceFile, sourceLine}; DoFromSourceAssign(alloc, source, terminator); diff --git a/flang/runtime/array-constructor.cpp b/flang/runtime/array-constructor.cpp index 0d677d7cc63aa..c6953167f5fb2 100644 --- a/flang/runtime/array-constructor.cpp +++ b/flang/runtime/array-constructor.cpp @@ -50,8 +50,8 @@ static RT_API_ATTRS void AllocateOrReallocateVectorIfNeeded( initialAllocationSize(fromElements, to.ElementBytes())}; to.GetDimension(0).SetBounds(1, allocationSize); RTNAME(AllocatableAllocate) - (to, /*asyncId=*/-1, /*hasStat=*/false, /*errMsg=*/nullptr, - vector.sourceFile, vector.sourceLine); + (to, /*hasStat=*/false, /*errMsg=*/nullptr, vector.sourceFile, + vector.sourceLine); to.GetDimension(0).SetBounds(1, fromElements); vector.actualAllocationSize = allocationSize; } else { @@ -59,8 +59,8 @@ static RT_API_ATTRS void AllocateOrReallocateVectorIfNeeded( // first value: there should be no reallocation. RUNTIME_CHECK(terminator, previousToElements >= fromElements); RTNAME(AllocatableAllocate) - (to, /*asyncId=*/-1, /*hasStat=*/false, /*errMsg=*/nullptr, - vector.sourceFile, vector.sourceLine); + (to, /*hasStat=*/false, /*errMsg=*/nullptr, vector.sourceFile, + vector.sourceLine); vector.actualAllocationSize = previousToElements; } } else { diff --git a/flang/runtime/descriptor.cpp b/flang/runtime/descriptor.cpp index f43c96bed7d00..32f43e89dc7a3 100644 --- a/flang/runtime/descriptor.cpp +++ b/flang/runtime/descriptor.cpp @@ -163,7 +163,7 @@ RT_API_ATTRS static inline int MapAllocIdx(const Descriptor &desc) { #endif } -RT_API_ATTRS int Descriptor::Allocate(std::int64_t asyncId) { +RT_API_ATTRS int Descriptor::Allocate() { std::size_t elementBytes{ElementBytes()}; if (static_cast(elementBytes) < 0) { // F'2023 7.4.4.2 p5: "If the character length parameter value evaluates @@ -175,7 +175,7 @@ RT_API_ATTRS int Descriptor::Allocate(std::int64_t asyncId) { // Zero size allocation is possible in Fortran 
and the resulting // descriptor must be allocated/associated. Since std::malloc(0) // result is implementation defined, always allocate at least one byte. - void *p{alloc(byteSize ? byteSize : 1, asyncId)}; + void *p{alloc(byteSize ? byteSize : 1)}; if (!p) { return CFI_ERROR_MEM_ALLOCATION; } diff --git a/flang/test/HLFIR/elemental-codegen.fir b/flang/test/HLFIR/elemental-codegen.fir index 3c33bf8fca2d1..0d5f343cb1771 100644 --- a/flang/test/HLFIR/elemental-codegen.fir +++ b/flang/test/HLFIR/elemental-codegen.fir @@ -192,7 +192,7 @@ func.func @test_polymorphic(%arg0: !fir.class> {fir.bindc_ // CHECK: %[[VAL_35:.*]] = fir.absent !fir.box // CHECK: %[[VAL_36:.*]] = fir.convert %[[VAL_4]] : (!fir.ref>>>>) -> !fir.ref> // CHECK: %[[VAL_37:.*]] = fir.convert %[[VAL_31]] : (!fir.ref>) -> !fir.ref -// CHECK: %[[VAL_38:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_36]], %{{.*}}, %[[VAL_34]], %[[VAL_35]], %[[VAL_37]], %[[VAL_33]]) : (!fir.ref>, i64, i1, !fir.box, !fir.ref, i32) -> i32 +// CHECK: %[[VAL_38:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_36]], %[[VAL_34]], %[[VAL_35]], %[[VAL_37]], %[[VAL_33]]) : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 // CHECK: %[[VAL_39:.*]] = fir.load %[[VAL_13]]#0 : !fir.ref>>>> // CHECK: %[[VAL_40:.*]] = arith.constant 1 : index // CHECK: fir.do_loop %[[VAL_41:.*]] = %[[VAL_40]] to %[[EX1]] step %[[VAL_40]] unordered { @@ -276,7 +276,7 @@ func.func @test_polymorphic_expr(%arg0: !fir.class> {fir.b // CHECK: %[[VAL_36:.*]] = fir.absent !fir.box // CHECK: %[[VAL_37:.*]] = fir.convert %[[VAL_5]] : (!fir.ref>>>>) -> !fir.ref> // CHECK: %[[VAL_38:.*]] = fir.convert %[[VAL_32]] : (!fir.ref>) -> !fir.ref -// CHECK: %[[VAL_39:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_37]], %{{.*}}, %[[VAL_35]], %[[VAL_36]], %[[VAL_38]], %[[VAL_34]]) : (!fir.ref>, i64, i1, !fir.box, !fir.ref, i32) -> i32 +// CHECK: %[[VAL_39:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_37]], %[[VAL_35]], %[[VAL_36]], %[[VAL_38]], 
%[[VAL_34]]) : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 // CHECK: %[[VAL_40:.*]] = fir.load %[[VAL_14]]#0 : !fir.ref>>>> // CHECK: %[[VAL_41:.*]] = arith.constant 1 : index // CHECK: fir.do_loop %[[VAL_42:.*]] = %[[VAL_41]] to %[[VAL_3]] step %[[VAL_41]] unordered { @@ -329,7 +329,7 @@ func.func @test_polymorphic_expr(%arg0: !fir.class> {fir.b // CHECK: %[[VAL_85:.*]] = fir.absent !fir.box // CHECK: %[[VAL_86:.*]] = fir.convert %[[VAL_4]] : (!fir.ref>>>>) -> !fir.ref> // CHECK: %[[VAL_87:.*]] = fir.convert %[[VAL_81]] : (!fir.ref>) -> !fir.ref -// CHECK: %[[VAL_88:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_86]], %{{.*}}, %[[VAL_84]], %[[VAL_85]], %[[VAL_87]], %[[VAL_83]]) : (!fir.ref>, i64, i1, !fir.box, !fir.ref, i32) -> i32 +// CHECK: %[[VAL_88:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_86]], %[[VAL_84]], %[[VAL_85]], %[[VAL_87]], %[[VAL_83]]) : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 // CHECK: %[[VAL_89:.*]] = fir.load %[[VAL_63]]#0 : !fir.ref>>>> // CHECK: %[[VAL_90:.*]] = arith.constant 1 : index // CHECK: fir.do_loop %[[VAL_91:.*]] = %[[VAL_90]] to %[[VAL_3]] step %[[VAL_90]] unordered { diff --git a/flang/test/Lower/OpenACC/acc-declare.f90 b/flang/test/Lower/OpenACC/acc-declare.f90 index 9fe51a8db55e3..0066e712fbdcc 100644 --- a/flang/test/Lower/OpenACC/acc-declare.f90 +++ b/flang/test/Lower/OpenACC/acc-declare.f90 @@ -469,6 +469,6 @@ subroutine init() end module ! CHECK-LABEL: func.func @_QMacc_declare_post_action_statPinit() -! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath {acc.declare_action = #acc.declare_action} : (!fir.ref>, i64, i1, !fir.box, !fir.ref, i32) -> i32 +! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath {acc.declare_action = #acc.declare_action} : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 ! CHECK: fir.if -! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath {acc.declare_action = #acc.declare_action} : (!fir.ref>, i64, i1, !fir.box, !fir.ref, i32) -> i32 +! 
CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath {acc.declare_action = #acc.declare_action} : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 diff --git a/flang/test/Lower/allocatable-polymorphic.f90 b/flang/test/Lower/allocatable-polymorphic.f90 index fba0c12fb889c..bbc54754ca1ab 100644 --- a/flang/test/Lower/allocatable-polymorphic.f90 +++ b/flang/test/Lower/allocatable-polymorphic.f90 @@ -267,7 +267,7 @@ subroutine test_allocatable() ! CHECK: %[[C0:.*]] = arith.constant 0 : i32 ! CHECK: fir.call @_FortranAAllocatableInitDerivedForAllocate(%[[P_CAST]], %[[TYPE_DESC_P1_CAST]], %[[RANK]], %[[C0]]) {{.*}}: (!fir.ref>, !fir.ref, i32, i32) -> none ! CHECK: %[[P_CAST:.*]] = fir.convert %[[P_DECL]]#1 : (!fir.ref>>>) -> !fir.ref> -! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[P_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, i64, i1, !fir.box, !fir.ref, i32) -> i32 +! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[P_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 ! CHECK: %[[TYPE_DESC_P1:.*]] = fir.type_desc !fir.type<_QMpolyTp1{a:i32,b:i32}> ! CHECK: %[[C1_CAST:.*]] = fir.convert %[[C1_DECL]]#1 : (!fir.ref>>>) -> !fir.ref> @@ -276,7 +276,7 @@ subroutine test_allocatable() ! CHECK: %[[C0:.*]] = arith.constant 0 : i32 ! CHECK: fir.call @_FortranAAllocatableInitDerivedForAllocate(%[[C1_CAST]], %[[TYPE_DESC_P1_CAST]], %[[RANK]], %[[C0]]) {{.*}}: (!fir.ref>, !fir.ref, i32, i32) -> none ! CHECK: %[[C1_CAST:.*]] = fir.convert %[[C1_DECL]]#1 : (!fir.ref>>>) -> !fir.ref> -! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C1_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, i64, i1, !fir.box, !fir.ref, i32) -> i32 +! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C1_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 ! 
CHECK: %[[TYPE_DESC_P2:.*]] = fir.type_desc !fir.type<_QMpolyTp2{p1:!fir.type<_QMpolyTp1{a:i32,b:i32}>,c:i32}> ! CHECK: %[[C2_CAST:.*]] = fir.convert %[[C2_DECL]]#1 : (!fir.ref>>>) -> !fir.ref> @@ -285,7 +285,7 @@ subroutine test_allocatable() ! CHECK: %[[C0:.*]] = arith.constant 0 : i32 ! CHECK: fir.call @_FortranAAllocatableInitDerivedForAllocate(%[[C2_CAST]], %[[TYPE_DESC_P2_CAST]], %[[RANK]], %[[C0]]) {{.*}}: (!fir.ref>, !fir.ref, i32, i32) -> none ! CHECK: %[[C2_CAST:.*]] = fir.convert %[[C2_DECL]]#1 : (!fir.ref>>>) -> !fir.ref> -! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C2_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, i64, i1, !fir.box, !fir.ref, i32) -> i32 +! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C2_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 ! CHECK: %[[TYPE_DESC_P1:.*]] = fir.type_desc !fir.type<_QMpolyTp1{a:i32,b:i32}> ! CHECK: %[[C3_CAST:.*]] = fir.convert %[[C3_DECL]]#1 : (!fir.ref>>>>) -> !fir.ref> @@ -300,7 +300,7 @@ subroutine test_allocatable() ! CHECK: %[[C10_I64:.*]] = fir.convert %[[C10]] : (i32) -> i64 ! CHECK: %{{.*}} = fir.call @_FortranAAllocatableSetBounds(%[[C3_CAST]], %[[C0]], %[[C1_I64]], %[[C10_I64]]) {{.*}}: (!fir.ref>, i32, i64, i64) -> none ! CHECK: %[[C3_CAST:.*]] = fir.convert %[[C3_DECL]]#1 : (!fir.ref>>>>) -> !fir.ref> -! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C3_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, i64, i1, !fir.box, !fir.ref, i32) -> i32 +! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C3_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 ! CHECK: %[[TYPE_DESC_P2:.*]] = fir.type_desc !fir.type<_QMpolyTp2{p1:!fir.type<_QMpolyTp1{a:i32,b:i32}>,c:i32}> ! CHECK: %[[C4_CAST:.*]] = fir.convert %[[C4_DECL]]#1 : (!fir.ref>>>>) -> !fir.ref> @@ -316,7 +316,7 @@ subroutine test_allocatable() ! 
CHECK: %[[C20_I64:.*]] = fir.convert %[[C20]] : (i32) -> i64 ! CHECK: %{{.*}} = fir.call @_FortranAAllocatableSetBounds(%[[C4_CAST]], %[[C0]], %[[C1_I64]], %[[C20_I64]]) {{.*}}: (!fir.ref>, i32, i64, i64) -> none ! CHECK: %[[C4_CAST:.*]] = fir.convert %[[C4_DECL]]#1 : (!fir.ref>>>>) -> !fir.ref> -! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C4_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, i64, i1, !fir.box, !fir.ref, i32) -> i32 +! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C4_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 ! CHECK: %[[C1_LOAD1:.*]] = fir.load %[[C1_DECL]]#0 : !fir.ref>>> ! CHECK: fir.dispatch "proc1"(%[[C1_LOAD1]] : !fir.class>>) @@ -390,7 +390,7 @@ subroutine test_unlimited_polymorphic_with_intrinsic_type_spec() ! CHECK: %[[CORANK:.*]] = arith.constant 0 : i32 ! CHECK: %{{.*}} = fir.call @_FortranAAllocatableInitIntrinsicForAllocate(%[[BOX_NONE]], %[[CAT]], %[[KIND]], %[[RANK]], %[[CORANK]]) {{.*}} : (!fir.ref>, i32, i32, i32, i32) -> none ! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[P_DECL]]#1 : (!fir.ref>>) -> !fir.ref> -! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, i64, i1, !fir.box, !fir.ref, i32) -> i32 +! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 ! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[PTR_DECL]]#1 : (!fir.ref>>) -> !fir.ref> ! CHECK: %[[CAT:.*]] = arith.constant 2 : i32 @@ -573,7 +573,7 @@ subroutine test_allocatable_up_character() ! CHECK: %[[CORANK:.*]] = arith.constant 0 : i32 ! CHECK: %{{.*}} = fir.call @_FortranAAllocatableInitCharacterForAllocate(%[[A_NONE]], %[[LEN]], %[[KIND]], %[[RANK]], %[[CORANK]]) {{.*}} : (!fir.ref>, i64, i32, i32, i32) -> none ! 
CHECK: %[[A_NONE:.*]] = fir.convert %[[A_DECL]]#1 : (!fir.ref>>) -> !fir.ref> -! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[A_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, i64, i1, !fir.box, !fir.ref, i32) -> i32 +! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[A_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 end module @@ -592,17 +592,17 @@ program test_alloc ! LLVM-LABEL: define void @_QMpolyPtest_allocatable() ! LLVM: %{{.*}} = call {} @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyEXdtXp1, i32 0, i32 0) -! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i64 -1, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}}) +! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}}) ! LLVM: %{{.*}} = call {} @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyEXdtXp1, i32 0, i32 0) -! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i64 -1, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}}) +! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}}) ! LLVM: %{{.*}} = call {} @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyEXdtXp2, i32 0, i32 0) -! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i64 -1, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}}) +! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}}) ! LLVM: %{{.*}} = call {} @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyEXdtXp1, i32 1, i32 0) ! LLVM: %{{.*}} = call {} @_FortranAAllocatableSetBounds(ptr %{{.*}}, i32 0, i64 1, i64 10) -! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i64 -1, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}}) +! 
LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}}) ! LLVM: %{{.*}} = call {} @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyEXdtXp2, i32 1, i32 0) ! LLVM: %{{.*}} = call {} @_FortranAAllocatableSetBounds(ptr %{{.*}}, i32 0, i64 1, i64 20) -! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i64 -1, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}}) +! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}}) ! LLVM-COUNT-2: call void %{{[0-9]*}}() ! LLVM: call void @llvm.memcpy.p0.p0.i32 @@ -683,5 +683,5 @@ program test_alloc ! LLVM: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } { ptr null, i64 ptrtoint (ptr getelementptr (%_QMpolyTp1, ptr null, i32 1) to i64), i32 20240719, i8 0, i8 42, i8 2, i8 1, ptr @_QMpolyEXdtXp1, [1 x i64] zeroinitializer }, ptr %[[ALLOCA1:[0-9]*]] ! LLVM: call void @llvm.memcpy.p0.p0.i32(ptr %[[ALLOCA2:[0-9]+]], ptr %[[ALLOCA1]], i32 40, i1 false) ! LLVM: %{{.*}} = call {} @_FortranAAllocatableInitDerivedForAllocate(ptr %[[ALLOCA2]], ptr @_QMpolyEXdtXp1, i32 0, i32 0) -! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %[[ALLOCA2]], i64 -1, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}}) +! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %[[ALLOCA2]], i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}}) ! LLVM: %{{.*}} = call i32 @_FortranAAllocatableDeallocatePolymorphic(ptr %[[ALLOCA2]], ptr {{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}}) diff --git a/flang/test/Lower/allocatable-runtime.f90 b/flang/test/Lower/allocatable-runtime.f90 index effd0a3a93b84..3f1f8a86b7d07 100644 --- a/flang/test/Lower/allocatable-runtime.f90 +++ b/flang/test/Lower/allocatable-runtime.f90 @@ -31,7 +31,7 @@ subroutine foo() ! 
CHECK: fir.call @{{.*}}AllocatableSetBounds(%[[xBoxCast2]], %c0{{.*}}, %[[xlbCast]], %[[xubCast]]) {{.*}}: (!fir.ref>, i32, i64, i64) -> none ! CHECK-DAG: %[[xBoxCast3:.*]] = fir.convert %[[xBoxAddr]] : (!fir.ref>>>) -> !fir.ref> ! CHECK-DAG: %[[sourceFile:.*]] = fir.convert %{{.*}} -> !fir.ref - ! CHECK: fir.call @{{.*}}AllocatableAllocate(%[[xBoxCast3]], %c-1{{.*}}, %false{{.*}}, %[[errMsg]], %[[sourceFile]], %{{.*}}) {{.*}}: (!fir.ref>, i64, i1, !fir.box, !fir.ref, i32) -> i32 + ! CHECK: fir.call @{{.*}}AllocatableAllocate(%[[xBoxCast3]], %false{{.*}}, %[[errMsg]], %[[sourceFile]], %{{.*}}) {{.*}}: (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 ! Simply check that we are emitting the right numebr of set bound for y and z. Otherwise, this is just like x. ! CHECK: fir.convert %[[yBoxAddr]] : (!fir.ref>>>) -> !fir.ref> @@ -180,4 +180,4 @@ subroutine mold_allocation() ! CHECK: %[[M_BOX_NONE:.*]] = fir.convert %[[EMBOX_M]] : (!fir.box>) -> !fir.box ! CHECK: %{{.*}} = fir.call @_FortranAAllocatableApplyMold(%[[A_BOX_NONE]], %[[M_BOX_NONE]], %[[RANK]]) {{.*}} : (!fir.ref>, !fir.box, i32) -> none ! CHECK: %[[A_BOX_NONE:.*]] = fir.convert %[[A]] : (!fir.ref>>>) -> !fir.ref> -! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[A_BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, i64, i1, !fir.box, !fir.ref, i32) -> i32 +! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[A_BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 diff --git a/flang/test/Lower/allocate-mold.f90 b/flang/test/Lower/allocate-mold.f90 index 831b26022dd46..0cc10fc9016de 100644 --- a/flang/test/Lower/allocate-mold.f90 +++ b/flang/test/Lower/allocate-mold.f90 @@ -16,7 +16,7 @@ subroutine scalar_mold_allocation() ! CHECK: %[[A_REF_BOX_NONE1:.*]] = fir.convert %[[A]] : (!fir.ref>>) -> !fir.ref> ! 
CHECK: %{{.*}} = fir.call @_FortranAAllocatableApplyMold(%[[A_REF_BOX_NONE1]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, i32) -> none ! CHECK: %[[A_REF_BOX_NONE2:.*]] = fir.convert %[[A]] : (!fir.ref>>) -> !fir.ref> -! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[A_REF_BOX_NONE2]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, i64, i1, !fir.box, !fir.ref, i32) -> i32 +! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[A_REF_BOX_NONE2]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 subroutine array_scalar_mold_allocation() real, allocatable :: a(:) @@ -40,4 +40,4 @@ end subroutine array_scalar_mold_allocation ! CHECK: %[[REF_BOX_A1:.*]] = fir.convert %1 : (!fir.ref>>>) -> !fir.ref> ! CHECK: %{{.*}} = fir.call @_FortranAAllocatableSetBounds(%[[REF_BOX_A1]], {{.*}},{{.*}}, {{.*}}) fastmath : (!fir.ref>, i32, i64, i64) -> none ! CHECK: %[[REF_BOX_A2:.*]] = fir.convert %[[A]] : (!fir.ref>>>) -> !fir.ref> -! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[REF_BOX_A2]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, i64, i1, !fir.box, !fir.ref, i32) -> i32 +! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[REF_BOX_A2]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 diff --git a/flang/test/Lower/polymorphic.f90 b/flang/test/Lower/polymorphic.f90 index 8c212ce05a8c7..8c40c91bc3baa 100644 --- a/flang/test/Lower/polymorphic.f90 +++ b/flang/test/Lower/polymorphic.f90 @@ -1154,11 +1154,11 @@ program test ! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "test"} { ! CHECK: %[[ADDR_O:.*]] = fir.address_of(@_QFEo) : !fir.ref}>>>> ! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[ADDR_O]] : (!fir.ref}>>>>) -> !fir.ref> -! 
CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, i64, i1, !fir.box, !fir.ref, i32) -> i32 +! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 ! CHECK: %[[O:.*]] = fir.load %[[ADDR_O]] : !fir.ref}>>>> ! CHECK: %[[FIELD_INNER:.*]] = fir.field_index inner, !fir.type<_QMpolymorphic_testTouter{inner:!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>}> ! CHECK: %[[COORD_INNER:.*]] = fir.coordinate_of %[[O]], %[[FIELD_INNER]] : (!fir.box}>>>, !fir.field) -> !fir.ref> -! CHECK: %{{.*}} = fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} unordered iter_args(%arg1 = %{{.*}}) -> (!fir.array<5x!fir.logical<4>>) { +! CHECK: %{{.*}} = fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} unordered iter_args(%arg1 = %9) -> (!fir.array<5x!fir.logical<4>>) { ! CHECK: %[[EMBOXED:.*]] = fir.embox %[[COORD_INNER]] : (!fir.ref>) -> !fir.class> -! CHECK: %{{.*}} = fir.call @_QMpolymorphic_testPlt(%{{.*}}, %[[EMBOXED]]) {{.*}} : (!fir.ref, !fir.class>) -> !fir.logical<4> +! CHECK: %{{.*}} = fir.call @_QMpolymorphic_testPlt(%17, %[[EMBOXED]]) {{.*}} : (!fir.ref, !fir.class>) -> !fir.logical<4> ! 
CHECK: } diff --git a/flang/unittests/Runtime/CUDA/Allocatable.cpp b/flang/unittests/Runtime/CUDA/Allocatable.cpp index 171ca982a04f1..0f7eb27789316 100644 --- a/flang/unittests/Runtime/CUDA/Allocatable.cpp +++ b/flang/unittests/Runtime/CUDA/Allocatable.cpp @@ -42,8 +42,7 @@ TEST(AllocatableCUFTest, SimpleDeviceAllocatable) { CUDA_REPORT_IF_ERROR(cudaMalloc(&device_desc, a->SizeInBytes())); RTNAME(AllocatableAllocate) - (*a, /*asyncId=*/-1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, - __LINE__); + (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__); EXPECT_TRUE(a->IsAllocated()); RTNAME(CUFDescriptorSync)(device_desc, a.get(), __FILE__, __LINE__); cudaDeviceSynchronize(); diff --git a/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp b/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp index 2cc49b6a63af5..7cb25787e7797 100644 --- a/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp +++ b/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp @@ -35,25 +35,7 @@ TEST(AllocatableCUFTest, SimpleDeviceAllocate) { EXPECT_FALSE(a->HasAddendum()); RTNAME(AllocatableSetBounds)(*a, 0, 1, 10); RTNAME(AllocatableAllocate) - (*a, /*asyncId=*/-1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, - __LINE__); - EXPECT_TRUE(a->IsAllocated()); - RTNAME(AllocatableDeallocate) (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__); - EXPECT_FALSE(a->IsAllocated()); -} - -TEST(AllocatableCUFTest, SimpleStreamDeviceAllocate) { - using Fortran::common::TypeCategory; - RTNAME(CUFRegisterAllocator)(); - // REAL(4), DEVICE, ALLOCATABLE :: a(:) - auto a{createAllocatable(TypeCategory::Real, 4)}; - a->SetAllocIdx(kDeviceAllocatorPos); - EXPECT_EQ((int)kDeviceAllocatorPos, a->GetAllocIdx()); - EXPECT_FALSE(a->HasAddendum()); - RTNAME(AllocatableSetBounds)(*a, 0, 1, 10); - RTNAME(AllocatableAllocate) - (*a, 1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__); EXPECT_TRUE(a->IsAllocated()); RTNAME(AllocatableDeallocate) (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, 
__LINE__); @@ -71,8 +53,7 @@ TEST(AllocatableCUFTest, SimplePinnedAllocate) { EXPECT_FALSE(a->HasAddendum()); RTNAME(AllocatableSetBounds)(*a, 0, 1, 10); RTNAME(AllocatableAllocate) - (*a, /*asyncId=*/-1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, - __LINE__); + (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__); EXPECT_TRUE(a->IsAllocated()); RTNAME(AllocatableDeallocate) (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__); diff --git a/flang/unittests/Runtime/CUDA/Memory.cpp b/flang/unittests/Runtime/CUDA/Memory.cpp index 2f40915af3867..7c8b7aa5a4d78 100644 --- a/flang/unittests/Runtime/CUDA/Memory.cpp +++ b/flang/unittests/Runtime/CUDA/Memory.cpp @@ -51,8 +51,7 @@ TEST(MemoryCUFTest, CUFDataTransferDescDesc) { EXPECT_EQ((int)kDeviceAllocatorPos, dev->GetAllocIdx()); RTNAME(AllocatableSetBounds)(*dev, 0, 1, 10); RTNAME(AllocatableAllocate) - (*dev, /*asyncId=*/-1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, - __LINE__); + (*dev, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__); EXPECT_TRUE(dev->IsAllocated()); // Create temp array to transfer to device. From df12983610dfb4f33ab4fa406a267f39d4c65248 Mon Sep 17 00:00:00 2001 From: vdonaldson <37090318+vdonaldson@users.noreply.github.com> Date: Tue, 24 Dec 2024 03:19:29 -0500 Subject: [PATCH 009/567] [flang] build fix (#121032) Place floating point environment calls under '#ifdef __USE_GNU'. --- flang/runtime/exceptions.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flang/runtime/exceptions.cpp b/flang/runtime/exceptions.cpp index 1ed00538fef39..2fa2baa2ec84a 100644 --- a/flang/runtime/exceptions.cpp +++ b/flang/runtime/exceptions.cpp @@ -84,7 +84,7 @@ uint32_t RTNAME(MapException)(uint32_t excepts) { // Check if the processor has the ability to control whether to halt or // continue execution when a given exception is raised. 
bool RTNAME(SupportHalting)([[maybe_unused]] uint32_t except) { -#if (defined(__arm__) || defined(__aarch64__)) && !defined(_WIN32) +#ifdef __USE_GNU except = RTNAME(MapException)(except); int currentSet = fegetexcept(), flipSet, ok; if (currentSet & except) { From ccbbacf0fa98bd386c0a7b3bdfb85c43e7db1a93 Mon Sep 17 00:00:00 2001 From: David Green Date: Tue, 24 Dec 2024 08:48:01 +0000 Subject: [PATCH 010/567] [ARM] Fix MVE incrementing gather offset calculation The code was checking the gep ptr type as opposed to the gep source element type in calculating the offset scale. Fixes #120993 --- .../Target/ARM/MVEGatherScatterLowering.cpp | 2 +- .../CodeGen/Thumb2/mve-gather-increment.ll | 325 ++++++++++-------- 2 files changed, 191 insertions(+), 136 deletions(-) diff --git a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp index ab3a445484d13..7efd2989aa7fa 100644 --- a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp +++ b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp @@ -752,7 +752,7 @@ Instruction *MVEGatherScatterLowering::tryCreateIncrementingGatScat( // The gep was in charge of making sure the offsets are scaled correctly // - calculate that factor so it can be applied by hand int TypeScale = - computeScale(DL->getTypeSizeInBits(GEP->getOperand(0)->getType()), + computeScale(DL->getTypeSizeInBits(GEP->getSourceElementType()), DL->getTypeSizeInBits(GEP->getType()) / cast(GEP->getType())->getNumElements()); if (TypeScale == -1) diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll index 93cab25c2cb72..e63c62574dafb 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll @@ -14,6 +14,19 @@ define arm_aapcs_vfpcc <4 x i32> @gather_inc_mini_4i32(ptr noalias nocapture rea ret <4 x i32> %wide.masked.gather } +define arm_aapcs_vfpcc <4 x i32> @gather_inc_mini_4i32_i8(i32* noalias nocapture readonly 
%data, i32* noalias nocapture %dst, <4 x i32> %offs) { +; CHECK-LABEL: gather_inc_mini_4i32_i8: +; CHECK: @ %bb.0: +; CHECK-NEXT: movs r1, #16 +; CHECK-NEXT: vadd.i32 q1, q0, r1 +; CHECK-NEXT: vldrw.u32 q0, [r0, q1] +; CHECK-NEXT: bx lr + %1 = add <4 x i32> %offs, + %2 = getelementptr inbounds i8, i32* %data, <4 x i32> %1 + %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %2, i32 4, <4 x i1> , <4 x i32> undef) + ret <4 x i32> %wide.masked.gather +} + define arm_aapcs_vfpcc <4 x i32> @gather_inc_minipred_4i32(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, <4 x i32> %offs) { ; CHECK-LABEL: gather_inc_minipred_4i32: ; CHECK: @ %bb.0: @@ -207,20 +220,20 @@ define arm_aapcs_vfpcc <16 x i8> @gather_inc_minipred_16i8(ptr noalias nocapture define arm_aapcs_vfpcc void @gather_pre_inc(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, i32 %n.vec) { ; CHECK-LABEL: gather_pre_inc: ; CHECK: @ %bb.0: @ %vector.ph -; CHECK-NEXT: adr r3, .LCPI6_0 +; CHECK-NEXT: adr r3, .LCPI7_0 ; CHECK-NEXT: vldrw.u32 q0, [r3] ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: .LBB6_1: @ %vector.body +; CHECK-NEXT: .LBB7_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [q0, #96]! 
; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vstrb.8 q1, [r1], #16 -; CHECK-NEXT: bne .LBB6_1 +; CHECK-NEXT: bne .LBB7_1 ; CHECK-NEXT: @ %bb.2: @ %end ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: .LCPI6_0: +; CHECK-NEXT: .LCPI7_0: ; CHECK-NEXT: .long 4294967224 @ 0xffffffb8 ; CHECK-NEXT: .long 4294967248 @ 0xffffffd0 ; CHECK-NEXT: .long 4294967272 @ 0xffffffe8 @@ -246,23 +259,65 @@ end: ret void; } +define arm_aapcs_vfpcc void @gather_pre_inc_i8(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, i32 %n.vec) { +; CHECK-LABEL: gather_pre_inc_i8: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: adr r3, .LCPI8_0 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: .LBB8_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [q0, #24]! +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vstrb.8 q1, [r1], #16 +; CHECK-NEXT: bne .LBB8_1 +; CHECK-NEXT: @ %bb.2: @ %end +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: .LCPI8_0: +; CHECK-NEXT: .long 4294967278 @ 0xffffffee +; CHECK-NEXT: .long 4294967284 @ 0xfffffff4 +; CHECK-NEXT: .long 4294967290 @ 0xfffffffa +; CHECK-NEXT: .long 0 @ 0x0 +vector.ph: ; preds = %for.body.preheader + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.ind = phi <4 x i32> [ , %vector.ph ], [ %vec.ind.next, %vector.body ] + %0 = mul <4 x i32> %vec.ind, + %1 = add <4 x i32> %0, + %2 = getelementptr inbounds i8, ptr %data, <4 x i32> %1 + %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %2, i32 4, <4 x i1> , <4 x i32> undef) + %3 = getelementptr inbounds i32, ptr %dst, i32 %index + store <4 x i32> %wide.masked.gather, ptr %3, align 4 + %index.next = add i32 %index, 4 + %vec.ind.next = add <4 x i32> %vec.ind, + %4 = icmp eq i32 %index.next, %n.vec + br i1 %4, label %end, label 
%vector.body + +end: + ret void; +} + define arm_aapcs_vfpcc void @gather_post_inc(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, i32 %n.vec43) { ; CHECK-LABEL: gather_post_inc: ; CHECK: @ %bb.0: @ %vector.ph41 -; CHECK-NEXT: adr r3, .LCPI7_0 +; CHECK-NEXT: adr r3, .LCPI9_0 ; CHECK-NEXT: vldrw.u32 q0, [r3] ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: .LBB7_1: @ %vector.body39 +; CHECK-NEXT: .LBB9_1: @ %vector.body39 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [q0, #96]! ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vstrb.8 q1, [r1], #16 -; CHECK-NEXT: bne .LBB7_1 +; CHECK-NEXT: bne .LBB9_1 ; CHECK-NEXT: @ %bb.2: @ %end ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: .LCPI7_0: +; CHECK-NEXT: .LCPI9_0: ; CHECK-NEXT: .long 4294967200 @ 0xffffffa0 ; CHECK-NEXT: .long 4294967224 @ 0xffffffb8 ; CHECK-NEXT: .long 4294967248 @ 0xffffffd0 @@ -293,38 +348,38 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_simple(ptr noalias nocapture reado ; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr -; CHECK-NEXT: .LBB8_1: @ %vector.ph.preheader +; CHECK-NEXT: .LBB10_1: @ %vector.ph.preheader ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: bic r12, r2, #3 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: sub.w lr, r12, #4 ; CHECK-NEXT: add.w r4, r3, lr, lsr #2 -; CHECK-NEXT: adr r3, .LCPI8_0 +; CHECK-NEXT: adr r3, .LCPI10_0 ; CHECK-NEXT: vldrw.u32 q0, [r3] ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: .LBB8_2: @ %vector.ph +; CHECK-NEXT: .LBB10_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 -; CHECK-NEXT: @ Child Loop BB8_3 Depth 2 +; CHECK-NEXT: @ Child Loop BB10_3 Depth 2 ; CHECK-NEXT: dls lr, r4 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: .LBB8_3: @ %vector.body -; CHECK-NEXT: @ Parent Loop BB8_2 Depth=1 +; CHECK-NEXT: .LBB10_3: @ %vector.body +; CHECK-NEXT: @ Parent Loop BB10_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: 
Depth=2 ; CHECK-NEXT: vldrw.u32 q2, [q1, #16]! ; CHECK-NEXT: vstrb.8 q2, [r0], #16 -; CHECK-NEXT: le lr, .LBB8_3 +; CHECK-NEXT: le lr, .LBB10_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block -; CHECK-NEXT: @ in Loop: Header=BB8_2 Depth=1 +; CHECK-NEXT: @ in Loop: Header=BB10_2 Depth=1 ; CHECK-NEXT: cmp r12, r2 -; CHECK-NEXT: bne .LBB8_2 +; CHECK-NEXT: bne .LBB10_2 ; CHECK-NEXT: @ %bb.5: ; CHECK-NEXT: pop.w {r4, lr} ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.6: -; CHECK-NEXT: .LCPI8_0: +; CHECK-NEXT: .LCPI10_0: ; CHECK-NEXT: .long 4294967280 @ 0xfffffff0 ; CHECK-NEXT: .long 4294967284 @ 0xfffffff4 ; CHECK-NEXT: .long 4294967288 @ 0xfffffff8 @@ -363,7 +418,7 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_complex(ptr noalias nocapture read ; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr -; CHECK-NEXT: .LBB9_1: @ %vector.ph.preheader +; CHECK-NEXT: .LBB11_1: @ %vector.ph.preheader ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} @@ -371,26 +426,26 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_complex(ptr noalias nocapture read ; CHECK-NEXT: bic r12, r2, #3 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: sub.w lr, r12, #4 -; CHECK-NEXT: adr r4, .LCPI9_1 -; CHECK-NEXT: adr r5, .LCPI9_2 +; CHECK-NEXT: adr r4, .LCPI11_1 +; CHECK-NEXT: adr r5, .LCPI11_2 ; CHECK-NEXT: vldrw.u32 q1, [r4] ; CHECK-NEXT: add.w r3, r3, lr, lsr #2 -; CHECK-NEXT: adr.w lr, .LCPI9_0 +; CHECK-NEXT: adr.w lr, .LCPI11_0 ; CHECK-NEXT: vldrw.u32 q0, [r5] ; CHECK-NEXT: vldrw.u32 q2, [lr] ; CHECK-NEXT: vadd.i32 q1, q1, r0 ; CHECK-NEXT: vadd.i32 q0, q0, r0 ; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: .LBB9_2: @ %vector.ph +; CHECK-NEXT: .LBB11_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 -; CHECK-NEXT: @ Child Loop BB9_3 Depth 2 +; CHECK-NEXT: @ Child Loop BB11_3 Depth 2 ; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: vmov q3, q1 ; CHECK-NEXT: vmov q4, q0 ; 
CHECK-NEXT: vmov q5, q2 -; CHECK-NEXT: .LBB9_3: @ %vector.body -; CHECK-NEXT: @ Parent Loop BB9_2 Depth=1 +; CHECK-NEXT: .LBB11_3: @ %vector.body +; CHECK-NEXT: @ Parent Loop BB11_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vldrw.u32 q6, [q5, #48]! ; CHECK-NEXT: vldrw.u32 q7, [q3, #48]! @@ -398,28 +453,28 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_complex(ptr noalias nocapture read ; CHECK-NEXT: vldrw.u32 q7, [q4, #48]! ; CHECK-NEXT: vadd.i32 q6, q6, q7 ; CHECK-NEXT: vstrb.8 q6, [r0], #16 -; CHECK-NEXT: le lr, .LBB9_3 +; CHECK-NEXT: le lr, .LBB11_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block -; CHECK-NEXT: @ in Loop: Header=BB9_2 Depth=1 +; CHECK-NEXT: @ in Loop: Header=BB11_2 Depth=1 ; CHECK-NEXT: cmp r12, r2 -; CHECK-NEXT: bne .LBB9_2 +; CHECK-NEXT: bne .LBB11_2 ; CHECK-NEXT: @ %bb.5: ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop.w {r4, r5, r7, lr} ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.6: -; CHECK-NEXT: .LCPI9_0: +; CHECK-NEXT: .LCPI11_0: ; CHECK-NEXT: .long 4294967248 @ 0xffffffd0 ; CHECK-NEXT: .long 4294967260 @ 0xffffffdc ; CHECK-NEXT: .long 4294967272 @ 0xffffffe8 ; CHECK-NEXT: .long 4294967284 @ 0xfffffff4 -; CHECK-NEXT: .LCPI9_1: +; CHECK-NEXT: .LCPI11_1: ; CHECK-NEXT: .long 4294967252 @ 0xffffffd4 ; CHECK-NEXT: .long 4294967264 @ 0xffffffe0 ; CHECK-NEXT: .long 4294967276 @ 0xffffffec ; CHECK-NEXT: .long 4294967288 @ 0xfffffff8 -; CHECK-NEXT: .LCPI9_2: +; CHECK-NEXT: .LCPI11_2: ; CHECK-NEXT: .long 4294967256 @ 0xffffffd8 ; CHECK-NEXT: .long 4294967268 @ 0xffffffe4 ; CHECK-NEXT: .long 4294967280 @ 0xfffffff0 @@ -467,38 +522,38 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_large(ptr noalias nocapture readon ; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr -; CHECK-NEXT: .LBB10_1: @ %vector.ph.preheader +; CHECK-NEXT: .LBB12_1: @ %vector.ph.preheader ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: bic r12, r2, #3 ; CHECK-NEXT: 
movs r3, #1 ; CHECK-NEXT: sub.w lr, r12, #4 ; CHECK-NEXT: add.w r4, r3, lr, lsr #2 -; CHECK-NEXT: adr r3, .LCPI10_0 +; CHECK-NEXT: adr r3, .LCPI12_0 ; CHECK-NEXT: vldrw.u32 q0, [r3] ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: .LBB10_2: @ %vector.ph +; CHECK-NEXT: .LBB12_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 -; CHECK-NEXT: @ Child Loop BB10_3 Depth 2 +; CHECK-NEXT: @ Child Loop BB12_3 Depth 2 ; CHECK-NEXT: dls lr, r4 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: .LBB10_3: @ %vector.body -; CHECK-NEXT: @ Parent Loop BB10_2 Depth=1 +; CHECK-NEXT: .LBB12_3: @ %vector.body +; CHECK-NEXT: @ Parent Loop BB12_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vldrw.u32 q2, [q1, #508]! ; CHECK-NEXT: vstrb.8 q2, [r0], #16 -; CHECK-NEXT: le lr, .LBB10_3 +; CHECK-NEXT: le lr, .LBB12_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block -; CHECK-NEXT: @ in Loop: Header=BB10_2 Depth=1 +; CHECK-NEXT: @ in Loop: Header=BB12_2 Depth=1 ; CHECK-NEXT: cmp r12, r2 -; CHECK-NEXT: bne .LBB10_2 +; CHECK-NEXT: bne .LBB12_2 ; CHECK-NEXT: @ %bb.5: ; CHECK-NEXT: pop.w {r4, lr} ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.6: -; CHECK-NEXT: .LCPI10_0: +; CHECK-NEXT: .LCPI12_0: ; CHECK-NEXT: .long 4294966788 @ 0xfffffe04 ; CHECK-NEXT: .long 4294966792 @ 0xfffffe08 ; CHECK-NEXT: .long 4294966796 @ 0xfffffe0c @@ -543,7 +598,7 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_simple(ptr noalias nocapture reado ; CHECK-NEXT: sub sp, #28 ; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: strd r1, r2, [sp, #4] @ 8-byte Folded Spill -; CHECK-NEXT: blt .LBB11_5 +; CHECK-NEXT: blt .LBB13_5 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader ; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: movs r6, #1 @@ -553,16 +608,16 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_simple(ptr noalias nocapture reado ; CHECK-NEXT: str r1, [sp] @ 4-byte Spill ; CHECK-NEXT: sub.w r3, r1, #8 ; CHECK-NEXT: add.w r8, r6, r3, lsr #3 -; CHECK-NEXT: adr r3, 
.LCPI11_0 +; CHECK-NEXT: adr r3, .LCPI13_0 ; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: .LBB11_2: @ %vector.ph +; CHECK-NEXT: .LBB13_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 -; CHECK-NEXT: @ Child Loop BB11_3 Depth 2 +; CHECK-NEXT: @ Child Loop BB13_3 Depth 2 ; CHECK-NEXT: dls lr, r8 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: .LBB11_3: @ %vector.body -; CHECK-NEXT: @ Parent Loop BB11_2 Depth=1 +; CHECK-NEXT: .LBB13_3: @ %vector.body +; CHECK-NEXT: @ Parent Loop BB13_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vstrw.32 q1, [r2] ; CHECK-NEXT: mov r12, r2 @@ -595,19 +650,19 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_simple(ptr noalias nocapture reado ; CHECK-NEXT: vmov.16 q2[6], r7 ; CHECK-NEXT: vmov.16 q2[7], r5 ; CHECK-NEXT: vstrb.8 q2, [r6], #16 -; CHECK-NEXT: le lr, .LBB11_3 +; CHECK-NEXT: le lr, .LBB13_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block -; CHECK-NEXT: @ in Loop: Header=BB11_2 Depth=1 +; CHECK-NEXT: @ in Loop: Header=BB13_2 Depth=1 ; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: ldr r3, [sp] @ 4-byte Reload ; CHECK-NEXT: cmp r3, r1 -; CHECK-NEXT: bne .LBB11_2 -; CHECK-NEXT: .LBB11_5: @ %for.cond.cleanup +; CHECK-NEXT: bne .LBB13_2 +; CHECK-NEXT: .LBB13_5: @ %for.cond.cleanup ; CHECK-NEXT: add sp, #28 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.6: -; CHECK-NEXT: .LCPI11_0: +; CHECK-NEXT: .LCPI13_0: ; CHECK-NEXT: .short 0 @ 0x0 ; CHECK-NEXT: .short 1 @ 0x1 ; CHECK-NEXT: .short 2 @ 0x2 @@ -660,10 +715,10 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read ; CHECK-NEXT: sub sp, #136 ; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: strd r1, r2, [sp, #64] @ 8-byte Folded Spill -; CHECK-NEXT: blt.w .LBB12_5 +; CHECK-NEXT: blt.w .LBB14_5 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader ; CHECK-NEXT: ldr r1, [sp, #68] @ 4-byte Reload -; CHECK-NEXT: adr r3, .LCPI12_2 +; 
CHECK-NEXT: adr r3, .LCPI14_2 ; CHECK-NEXT: vldrw.u32 q0, [r3] ; CHECK-NEXT: movs r2, #1 ; CHECK-NEXT: bic r1, r1, #7 @@ -673,17 +728,17 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read ; CHECK-NEXT: vmov.i16 q2, #0x18 ; CHECK-NEXT: add.w r1, r2, r1, lsr #3 ; CHECK-NEXT: str r1, [sp, #60] @ 4-byte Spill -; CHECK-NEXT: adr r1, .LCPI12_0 -; CHECK-NEXT: adr r2, .LCPI12_1 +; CHECK-NEXT: adr r1, .LCPI14_0 +; CHECK-NEXT: adr r2, .LCPI14_1 ; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: vstrw.32 q2, [sp, #72] @ 16-byte Spill ; CHECK-NEXT: vstrw.32 q0, [sp, #24] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: add r2, sp, #120 ; CHECK-NEXT: vstrw.32 q0, [sp, #8] @ 16-byte Spill -; CHECK-NEXT: .LBB12_2: @ %vector.ph +; CHECK-NEXT: .LBB14_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 -; CHECK-NEXT: @ Child Loop BB12_3 Depth 2 +; CHECK-NEXT: @ Child Loop BB14_3 Depth 2 ; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload ; CHECK-NEXT: add.w r10, sp, #104 ; CHECK-NEXT: dls lr, r1 @@ -691,8 +746,8 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read ; CHECK-NEXT: vldrw.u32 q4, [sp, #24] @ 16-byte Reload ; CHECK-NEXT: vldrw.u32 q5, [sp, #40] @ 16-byte Reload ; CHECK-NEXT: vldrw.u32 q6, [sp, #8] @ 16-byte Reload -; CHECK-NEXT: .LBB12_3: @ %vector.body -; CHECK-NEXT: @ Parent Loop BB12_2 Depth=1 +; CHECK-NEXT: .LBB14_3: @ %vector.body +; CHECK-NEXT: @ Parent Loop BB14_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vstrw.32 q5, [r2] ; CHECK-NEXT: mov r8, r2 @@ -786,21 +841,21 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read ; CHECK-NEXT: vadd.i16 q0, q3, q1 ; CHECK-NEXT: vadd.i16 q0, q0, q7 ; CHECK-NEXT: vstrb.8 q0, [r7], #16 -; CHECK-NEXT: le lr, .LBB12_3 +; CHECK-NEXT: le lr, .LBB14_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block -; CHECK-NEXT: @ in Loop: Header=BB12_2 Depth=1 +; CHECK-NEXT: @ in Loop: Header=BB14_2 Depth=1 ; CHECK-NEXT: 
ldr r1, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: ldr r3, [sp, #68] @ 4-byte Reload ; CHECK-NEXT: cmp r1, r3 -; CHECK-NEXT: bne.w .LBB12_2 -; CHECK-NEXT: .LBB12_5: @ %for.cond.cleanup +; CHECK-NEXT: bne.w .LBB14_2 +; CHECK-NEXT: .LBB14_5: @ %for.cond.cleanup ; CHECK-NEXT: add sp, #136 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.6: -; CHECK-NEXT: .LCPI12_0: +; CHECK-NEXT: .LCPI14_0: ; CHECK-NEXT: .short 1 @ 0x1 ; CHECK-NEXT: .short 4 @ 0x4 ; CHECK-NEXT: .short 7 @ 0x7 @@ -809,7 +864,7 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read ; CHECK-NEXT: .short 16 @ 0x10 ; CHECK-NEXT: .short 19 @ 0x13 ; CHECK-NEXT: .short 22 @ 0x16 -; CHECK-NEXT: .LCPI12_1: +; CHECK-NEXT: .LCPI14_1: ; CHECK-NEXT: .short 0 @ 0x0 ; CHECK-NEXT: .short 3 @ 0x3 ; CHECK-NEXT: .short 6 @ 0x6 @@ -818,7 +873,7 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read ; CHECK-NEXT: .short 15 @ 0xf ; CHECK-NEXT: .short 18 @ 0x12 ; CHECK-NEXT: .short 21 @ 0x15 -; CHECK-NEXT: .LCPI12_2: +; CHECK-NEXT: .LCPI14_2: ; CHECK-NEXT: .short 2 @ 0x2 ; CHECK-NEXT: .short 5 @ 0x5 ; CHECK-NEXT: .short 8 @ 0x8 @@ -880,21 +935,21 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(ptr noalias nocapture read ; CHECK-NEXT: sub sp, #312 ; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: str r1, [sp, #116] @ 4-byte Spill -; CHECK-NEXT: blt.w .LBB13_5 +; CHECK-NEXT: blt.w .LBB15_5 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader -; CHECK-NEXT: adr r1, .LCPI13_0 -; CHECK-NEXT: adr r6, .LCPI13_8 +; CHECK-NEXT: adr r1, .LCPI15_0 +; CHECK-NEXT: adr r6, .LCPI15_8 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: adr r1, .LCPI13_1 -; CHECK-NEXT: adr r7, .LCPI13_7 -; CHECK-NEXT: adr r3, .LCPI13_6 +; CHECK-NEXT: adr r1, .LCPI15_1 +; CHECK-NEXT: adr r7, .LCPI15_7 +; CHECK-NEXT: adr r3, .LCPI15_6 ; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte 
Spill ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: adr r1, .LCPI13_5 +; CHECK-NEXT: adr r1, .LCPI15_5 ; CHECK-NEXT: bic r10, r2, #7 ; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r6] -; CHECK-NEXT: adr r6, .LCPI13_9 +; CHECK-NEXT: adr r6, .LCPI15_9 ; CHECK-NEXT: vmov.i32 q2, #0x30 ; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r7] @@ -905,22 +960,22 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(ptr noalias nocapture read ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r3] ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: .LBB13_2: @ %vector.ph +; CHECK-NEXT: .LBB15_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 -; CHECK-NEXT: @ Child Loop BB13_3 Depth 2 -; CHECK-NEXT: adr r1, .LCPI13_3 +; CHECK-NEXT: @ Child Loop BB15_3 Depth 2 +; CHECK-NEXT: adr r1, .LCPI15_3 ; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: adr r1, .LCPI13_4 +; CHECK-NEXT: adr r1, .LCPI15_4 ; CHECK-NEXT: vldrw.u32 q5, [r1] -; CHECK-NEXT: adr r1, .LCPI13_2 +; CHECK-NEXT: adr r1, .LCPI15_2 ; CHECK-NEXT: vldrw.u32 q3, [r1] -; CHECK-NEXT: adr r1, .LCPI13_10 +; CHECK-NEXT: adr r1, .LCPI15_10 ; CHECK-NEXT: vstrw.32 q6, [sp, #280] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q6, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: vstrw.32 q3, [sp, #296] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q3, [r1] -; CHECK-NEXT: adr r1, .LCPI13_11 +; CHECK-NEXT: adr r1, .LCPI15_11 ; CHECK-NEXT: ldr.w r8, [sp, #116] @ 4-byte Reload ; CHECK-NEXT: vstrw.32 q3, [sp, #248] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q3, [sp, #80] @ 16-byte Reload @@ -935,8 +990,8 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(ptr noalias nocapture read ; CHECK-NEXT: mov r11, r10 ; CHECK-NEXT: vstrw.32 q6, [sp, #232] @ 16-byte Spill ; CHECK-NEXT: vstrw.32 q3, [sp, #184] @ 16-byte Spill -; CHECK-NEXT: .LBB13_3: @ %vector.body -; CHECK-NEXT: @ Parent Loop 
BB13_2 Depth=1 +; CHECK-NEXT: .LBB15_3: @ %vector.body +; CHECK-NEXT: @ Parent Loop BB15_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vadd.i32 q4, q1, r0 ; CHECK-NEXT: vstrw.32 q7, [sp, #136] @ 16-byte Spill @@ -1114,74 +1169,74 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(ptr noalias nocapture read ; CHECK-NEXT: vldrw.u32 q0, [sp, #168] @ 16-byte Reload ; CHECK-NEXT: vadd.i32 q7, q7, q2 ; CHECK-NEXT: vadd.i32 q0, q0, q2 -; CHECK-NEXT: bne.w .LBB13_3 +; CHECK-NEXT: bne.w .LBB15_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block -; CHECK-NEXT: @ in Loop: Header=BB13_2 Depth=1 +; CHECK-NEXT: @ in Loop: Header=BB15_2 Depth=1 ; CHECK-NEXT: cmp r10, r2 -; CHECK-NEXT: bne.w .LBB13_2 -; CHECK-NEXT: .LBB13_5: @ %for.cond.cleanup +; CHECK-NEXT: bne.w .LBB15_2 +; CHECK-NEXT: .LBB15_5: @ %for.cond.cleanup ; CHECK-NEXT: add sp, #312 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.6: -; CHECK-NEXT: .LCPI13_0: +; CHECK-NEXT: .LCPI15_0: ; CHECK-NEXT: .long 38 @ 0x26 ; CHECK-NEXT: .long 41 @ 0x29 ; CHECK-NEXT: .long 44 @ 0x2c ; CHECK-NEXT: .long 47 @ 0x2f -; CHECK-NEXT: .LCPI13_1: +; CHECK-NEXT: .LCPI15_1: ; CHECK-NEXT: .long 14 @ 0xe ; CHECK-NEXT: .long 17 @ 0x11 ; CHECK-NEXT: .long 20 @ 0x14 ; CHECK-NEXT: .long 23 @ 0x17 -; CHECK-NEXT: .LCPI13_2: +; CHECK-NEXT: .LCPI15_2: ; CHECK-NEXT: .long 24 @ 0x18 ; CHECK-NEXT: .long 27 @ 0x1b ; CHECK-NEXT: .long 30 @ 0x1e ; CHECK-NEXT: .long 33 @ 0x21 -; CHECK-NEXT: .LCPI13_3: +; CHECK-NEXT: .LCPI15_3: ; CHECK-NEXT: .long 1 @ 0x1 ; CHECK-NEXT: .long 4 @ 0x4 ; CHECK-NEXT: .long 7 @ 0x7 ; CHECK-NEXT: .long 10 @ 0xa -; CHECK-NEXT: .LCPI13_4: +; CHECK-NEXT: .LCPI15_4: ; CHECK-NEXT: .long 36 @ 0x24 ; CHECK-NEXT: .long 39 @ 0x27 ; CHECK-NEXT: .long 42 @ 0x2a ; CHECK-NEXT: .long 45 @ 0x2d -; CHECK-NEXT: .LCPI13_5: +; CHECK-NEXT: .LCPI15_5: ; CHECK-NEXT: .long 25 @ 0x19 ; 
CHECK-NEXT: .long 28 @ 0x1c ; CHECK-NEXT: .long 31 @ 0x1f ; CHECK-NEXT: .long 34 @ 0x22 -; CHECK-NEXT: .LCPI13_6: +; CHECK-NEXT: .LCPI15_6: ; CHECK-NEXT: .long 13 @ 0xd ; CHECK-NEXT: .long 16 @ 0x10 ; CHECK-NEXT: .long 19 @ 0x13 ; CHECK-NEXT: .long 22 @ 0x16 -; CHECK-NEXT: .LCPI13_7: +; CHECK-NEXT: .LCPI15_7: ; CHECK-NEXT: .long 2 @ 0x2 ; CHECK-NEXT: .long 5 @ 0x5 ; CHECK-NEXT: .long 8 @ 0x8 ; CHECK-NEXT: .long 11 @ 0xb -; CHECK-NEXT: .LCPI13_8: +; CHECK-NEXT: .LCPI15_8: ; CHECK-NEXT: .long 26 @ 0x1a ; CHECK-NEXT: .long 29 @ 0x1d ; CHECK-NEXT: .long 32 @ 0x20 ; CHECK-NEXT: .long 35 @ 0x23 -; CHECK-NEXT: .LCPI13_9: +; CHECK-NEXT: .LCPI15_9: ; CHECK-NEXT: .long 37 @ 0x25 ; CHECK-NEXT: .long 40 @ 0x28 ; CHECK-NEXT: .long 43 @ 0x2b ; CHECK-NEXT: .long 46 @ 0x2e -; CHECK-NEXT: .LCPI13_10: +; CHECK-NEXT: .LCPI15_10: ; CHECK-NEXT: .long 12 @ 0xc ; CHECK-NEXT: .long 15 @ 0xf ; CHECK-NEXT: .long 18 @ 0x12 ; CHECK-NEXT: .long 21 @ 0x15 -; CHECK-NEXT: .LCPI13_11: +; CHECK-NEXT: .LCPI15_11: ; CHECK-NEXT: .long 0 @ 0x0 ; CHECK-NEXT: .long 3 @ 0x3 ; CHECK-NEXT: .long 6 @ 0x6 @@ -1238,14 +1293,14 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado ; CHECK-NEXT: sub sp, #64 ; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: strd r1, r2, [sp, #56] @ 8-byte Folded Spill -; CHECK-NEXT: blt.w .LBB14_5 +; CHECK-NEXT: blt.w .LBB16_5 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader -; CHECK-NEXT: adr r5, .LCPI14_3 -; CHECK-NEXT: adr r7, .LCPI14_1 +; CHECK-NEXT: adr r5, .LCPI16_3 +; CHECK-NEXT: adr r7, .LCPI16_1 ; CHECK-NEXT: vldrw.u32 q0, [r5] ; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload -; CHECK-NEXT: adr r3, .LCPI14_0 -; CHECK-NEXT: adr r6, .LCPI14_2 +; CHECK-NEXT: adr r3, .LCPI16_0 +; CHECK-NEXT: adr r6, .LCPI16_2 ; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r7] ; CHECK-NEXT: bic r9, r1, #7 @@ -1255,16 +1310,16 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado ; CHECK-NEXT: mov.w lr, #16 ; 
CHECK-NEXT: str.w r9, [sp, #52] @ 4-byte Spill ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: .LBB14_2: @ %vector.ph +; CHECK-NEXT: .LBB16_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 -; CHECK-NEXT: @ Child Loop BB14_3 Depth 2 +; CHECK-NEXT: @ Child Loop BB16_3 Depth 2 ; CHECK-NEXT: ldr.w r8, [sp, #56] @ 4-byte Reload ; CHECK-NEXT: vldrw.u32 q5, [sp] @ 16-byte Reload ; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: vmov q4, q3 -; CHECK-NEXT: .LBB14_3: @ %vector.body -; CHECK-NEXT: @ Parent Loop BB14_2 Depth=1 +; CHECK-NEXT: .LBB16_3: @ %vector.body +; CHECK-NEXT: @ Parent Loop BB16_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vadd.i32 q1, q5, r0 ; CHECK-NEXT: vadd.i32 q2, q4, r0 @@ -1318,36 +1373,36 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado ; CHECK-NEXT: vmov.8 q0[15], r12 ; CHECK-NEXT: vstrb.8 q0, [r8], #16 ; CHECK-NEXT: vmov q0, q6 -; CHECK-NEXT: bne .LBB14_3 +; CHECK-NEXT: bne .LBB16_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block -; CHECK-NEXT: @ in Loop: Header=BB14_2 Depth=1 +; CHECK-NEXT: @ in Loop: Header=BB16_2 Depth=1 ; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload ; CHECK-NEXT: ldr.w r9, [sp, #52] @ 4-byte Reload ; CHECK-NEXT: cmp r9, r1 -; CHECK-NEXT: bne .LBB14_2 -; CHECK-NEXT: .LBB14_5: @ %for.cond.cleanup +; CHECK-NEXT: bne .LBB16_2 +; CHECK-NEXT: .LBB16_5: @ %for.cond.cleanup ; CHECK-NEXT: add sp, #64 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.6: -; CHECK-NEXT: .LCPI14_0: +; CHECK-NEXT: .LCPI16_0: ; CHECK-NEXT: .long 0 @ 0x0 ; CHECK-NEXT: .long 1 @ 0x1 ; CHECK-NEXT: .long 2 @ 0x2 ; CHECK-NEXT: .long 3 @ 0x3 -; CHECK-NEXT: .LCPI14_1: +; CHECK-NEXT: .LCPI16_1: ; CHECK-NEXT: .long 8 @ 0x8 ; CHECK-NEXT: .long 9 @ 0x9 ; CHECK-NEXT: .long 10 @ 
0xa ; CHECK-NEXT: .long 11 @ 0xb -; CHECK-NEXT: .LCPI14_2: +; CHECK-NEXT: .LCPI16_2: ; CHECK-NEXT: .long 4 @ 0x4 ; CHECK-NEXT: .long 5 @ 0x5 ; CHECK-NEXT: .long 6 @ 0x6 ; CHECK-NEXT: .long 7 @ 0x7 -; CHECK-NEXT: .LCPI14_3: +; CHECK-NEXT: .LCPI16_3: ; CHECK-NEXT: .long 12 @ 0xc ; CHECK-NEXT: .long 13 @ 0xd ; CHECK-NEXT: .long 14 @ 0xe @@ -1390,21 +1445,21 @@ define void @shl(ptr nocapture %x, ptr noalias nocapture readonly %y, i32 %n) { ; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} -; CHECK-NEXT: .LBB15_1: @ %vector.ph -; CHECK-NEXT: adr r3, .LCPI15_0 +; CHECK-NEXT: .LBB17_1: @ %vector.ph +; CHECK-NEXT: adr r3, .LCPI17_0 ; CHECK-NEXT: vldrw.u32 q0, [r3] ; CHECK-NEXT: vadd.i32 q0, q0, r1 ; CHECK-NEXT: dlstp.32 lr, r2 -; CHECK-NEXT: .LBB15_2: @ %vector.body +; CHECK-NEXT: .LBB17_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [q0, #64]! ; CHECK-NEXT: vstrw.32 q1, [r0], #16 -; CHECK-NEXT: letp lr, .LBB15_2 +; CHECK-NEXT: letp lr, .LBB17_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.4: -; CHECK-NEXT: .LCPI15_0: +; CHECK-NEXT: .LCPI17_0: ; CHECK-NEXT: .long 4294967232 @ 0xffffffc0 ; CHECK-NEXT: .long 4294967248 @ 0xffffffd0 ; CHECK-NEXT: .long 4294967264 @ 0xffffffe0 @@ -1444,12 +1499,12 @@ define void @shlor(ptr nocapture %x, ptr noalias nocapture readonly %y, i32 %n) ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: blt .LBB16_3 +; CHECK-NEXT: blt .LBB18_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph -; CHECK-NEXT: adr.w lr, .LCPI16_0 -; CHECK-NEXT: adr r4, .LCPI16_1 -; CHECK-NEXT: adr r5, .LCPI16_2 -; CHECK-NEXT: adr r6, .LCPI16_3 +; CHECK-NEXT: adr.w lr, .LCPI18_0 +; CHECK-NEXT: adr r4, .LCPI18_1 +; CHECK-NEXT: adr r5, .LCPI18_2 +; CHECK-NEXT: adr r6, .LCPI18_3 ; CHECK-NEXT: vldrw.u32 q0, [r6] ; CHECK-NEXT: vldrw.u32 q1, [r5] ; CHECK-NEXT: 
vldrw.u32 q2, [r4] @@ -1459,7 +1514,7 @@ define void @shlor(ptr nocapture %x, ptr noalias nocapture readonly %y, i32 %n) ; CHECK-NEXT: vadd.i32 q2, q2, r1 ; CHECK-NEXT: vadd.i32 q3, q3, r1 ; CHECK-NEXT: dlstp.32 lr, r2 -; CHECK-NEXT: .LBB16_2: @ %vector.body +; CHECK-NEXT: .LBB18_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q4, [q3, #128]! ; CHECK-NEXT: vldrw.u32 q5, [q2, #128]! @@ -1469,28 +1524,28 @@ define void @shlor(ptr nocapture %x, ptr noalias nocapture readonly %y, i32 %n) ; CHECK-NEXT: vadd.i32 q4, q4, q5 ; CHECK-NEXT: vadd.i32 q4, q4, q6 ; CHECK-NEXT: vstrw.32 q4, [r0], #16 -; CHECK-NEXT: letp lr, .LBB16_2 -; CHECK-NEXT: .LBB16_3: @ %for.cond.cleanup +; CHECK-NEXT: letp lr, .LBB18_2 +; CHECK-NEXT: .LBB18_3: @ %for.cond.cleanup ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: pop {r4, r5, r6, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.4: -; CHECK-NEXT: .LCPI16_0: +; CHECK-NEXT: .LCPI18_0: ; CHECK-NEXT: .long 4294967168 @ 0xffffff80 ; CHECK-NEXT: .long 4294967200 @ 0xffffffa0 ; CHECK-NEXT: .long 4294967232 @ 0xffffffc0 ; CHECK-NEXT: .long 4294967264 @ 0xffffffe0 -; CHECK-NEXT: .LCPI16_1: +; CHECK-NEXT: .LCPI18_1: ; CHECK-NEXT: .long 4294967176 @ 0xffffff88 ; CHECK-NEXT: .long 4294967208 @ 0xffffffa8 ; CHECK-NEXT: .long 4294967240 @ 0xffffffc8 ; CHECK-NEXT: .long 4294967272 @ 0xffffffe8 -; CHECK-NEXT: .LCPI16_2: +; CHECK-NEXT: .LCPI18_2: ; CHECK-NEXT: .long 4294967184 @ 0xffffff90 ; CHECK-NEXT: .long 4294967216 @ 0xffffffb0 ; CHECK-NEXT: .long 4294967248 @ 0xffffffd0 ; CHECK-NEXT: .long 4294967280 @ 0xfffffff0 -; CHECK-NEXT: .LCPI16_3: +; CHECK-NEXT: .LCPI18_3: ; CHECK-NEXT: .long 4294967192 @ 0xffffff98 ; CHECK-NEXT: .long 4294967224 @ 0xffffffb8 ; CHECK-NEXT: .long 4294967256 @ 0xffffffd8 From c84f5a9e00c02e6a4349846ed59ec85154b65e3f Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Tue, 24 Dec 2024 01:09:34 -0800 Subject: [PATCH 011/567] [Github] Skip MIR files for undef check (#120919) 
This patch skips checking files with a .mir extension for the presence of undef. This was creating false positives that got reported on discourse. --- llvm/utils/git/code-format-helper.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llvm/utils/git/code-format-helper.py b/llvm/utils/git/code-format-helper.py index 36fc5ee4c3d52..48a338aca9c8e 100755 --- a/llvm/utils/git/code-format-helper.py +++ b/llvm/utils/git/code-format-helper.py @@ -379,6 +379,10 @@ def format_run(self, changed_files: List[str], args: FormatArgs) -> Optional[str # Each file is prefixed like: # diff --git a/file b/file for file in re.split("^diff --git ", stdout, 0, re.MULTILINE): + # We skip checking in MIR files as undef is a valid token and not + # going away. + if file.endswith(".mir"): + continue # search for additions of undef if re.search(r"^[+](?!\s*#\s*).*(\bundef\b|UndefValue::get)", file, re.MULTILINE): files.append(re.match("a/([^ ]+)", file.splitlines()[0])[1]) From b2073fb9b9282c0f59861a137660f6a0782d7468 Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Tue, 24 Dec 2024 10:18:48 +0000 Subject: [PATCH 012/567] [AArch64] Prefer SVE2.2 zeroing forms of certain instructions with an all-true predicate (#120595) When the predicate of a destructive operation is known to be all-true, for example fabs z0.s, p0/m, z1.s then the entire output register is written and we can use a zeroing (instead of a merging) form of the instruction, for example fabs z0.s, p0/z, z1.s thus eliminate the dependency on the input-output destination register without the need to insert a `movprfx`. 
This patch complements (and in the case of https://github.com/llvm/llvm-project/commit/2b3266c1701f315d7e89c81977800001563afacb, fixes a regression) the following: https://github.com/llvm/llvm-project/commit/7f4414b2a1a4d9f802a03f56894c406f0fe3e9a9 [AArch64] Generate zeroing forms of certain SVE2.2 instructions (4/11) (https://github.com/llvm/llvm-project/pull/116830) https://github.com/llvm/llvm-project/commit/2474cf7ad123ea14308293a2237e3552cddb1136 [AArch64] Generate zeroing forms of certain SVE2.2 instructions (3/11) (https://github.com/llvm/llvm-project/pull/116829) https://github.com/llvm/llvm-project/commit/6f285d31159501050de5563b1a844a3e1ac79a03 [AArch64] Generate zeroing forms of certain SVE2.2 instructions (2/11) (https://github.com/llvm/llvm-project/pull/116828) https://github.com/llvm/llvm-project/commit/2b3266c1701f315d7e89c81977800001563afacb [AArch64] Generate zeroing forms of certain SVE2.2 instructions (1/11) (https://github.com/llvm/llvm-project/pull/116259) --- .../Target/AArch64/AArch64ISelDAGToDAG.cpp | 2 + llvm/lib/Target/AArch64/AArch64InstrInfo.td | 3 - .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 8 - llvm/lib/Target/AArch64/SVEInstrFormats.td | 102 ++- .../CodeGen/AArch64/zeroing-forms-abs-neg.ll | 600 +++++++++++++++++- .../AArch64/zeroing-forms-fcvt-bfcvt.ll | 287 ++++++++- .../AArch64/zeroing-forms-fcvtlt-fcvtx.ll | 123 +++- .../CodeGen/AArch64/zeroing-forms-fcvtzsu.ll | 580 ++++++++++++++++- 8 files changed, 1577 insertions(+), 128 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index ff3ca8a24fc04..6aa8cd4f0232a 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -228,6 +228,8 @@ class AArch64DAGToDAGISel : public SelectionDAGISel { return false; } + bool SelectAny(SDValue) { return true; } + bool SelectDupZero(SDValue N) { switch(N->getOpcode()) { case AArch64ISD::DUP: diff --git 
a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index b37f4a08755c5..629098cda0c4e 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -381,9 +381,6 @@ def NoUseScalarIncVL : Predicate<"!Subtarget->useScalarIncVL()">; def UseSVEFPLD1R : Predicate<"!Subtarget->noSVEFPLD1R()">; -def UseUnaryUndefPseudos - : Predicate<"!(Subtarget->isSVEorStreamingSVEAvailable() && (Subtarget->hasSVE2p2() || Subtarget->hasSME2p2()))">; - def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER", SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisInt<1>]>>; diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index c8892de647437..7dd6d49bf2022 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -675,14 +675,6 @@ let Predicates = [HasSVEorSME] in { defm FABS_ZPmZ : sve_int_un_pred_arit_bitwise_fp<0b100, "fabs", AArch64fabs_mt>; defm FNEG_ZPmZ : sve_int_un_pred_arit_bitwise_fp<0b101, "fneg", AArch64fneg_mt>; - let Predicates = [HasSVEorSME, UseUnaryUndefPseudos] in { - defm FABS_ZPmZ : sve_fp_un_pred_arit_hsd; - defm FNEG_ZPmZ : sve_fp_un_pred_arit_hsd; - - defm ABS_ZPmZ : sve_int_un_pred_arit_bhsd; - defm NEG_ZPmZ : sve_int_un_pred_arit_bhsd; - } - foreach VT = [nxv2bf16, nxv4bf16, nxv8bf16] in { // No dedicated instruction, so just clear the sign bit. 
def : Pat<(VT (fabs VT:$op)), diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index a831de878a910..0ef862fc1a27c 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -484,6 +484,7 @@ let Predicates = [HasSVEorSME] in { //===----------------------------------------------------------------------===// def SVEDup0 : ComplexPattern; def SVEDup0Undef : ComplexPattern; +def SVEAny : ComplexPattern; class SVE_1_Op_Pat @@ -504,10 +505,15 @@ multiclass SVE_1_Op_PassthruUndef_Pat; } -class SVE_1_Op_PassthruUndefZero_Pat - : Pat<(vtd (op pg:$Op1, vts:$Op2, (vtd (SVEDup0Undef)))), - (inst $Op1, $Op2)>; +multiclass SVE_1_Op_PassthruUndefZero_Pat { + let AddedComplexity = 1 in { + def : Pat<(vtd (op pg:$Op1, vts:$Op2, (vtd (SVEDup0Undef)))), + (inst $Op1, $Op2)>; + def : Pat<(vtd (op (pg (SVEAllActive:$Op1)), vts:$Op2, (vtd (SVEAny)))), + (inst $Op1, $Op2)>; + } +} // Used to match FP_ROUND_MERGE_PASSTHRU, which has an additional flag for the // type of rounding. This is matched by timm0_1 in pattern below and ignored. 
@@ -576,10 +582,15 @@ multiclass SVE_3_Op_Undef_Pat; } -class SVE_3_Op_UndefZero_Pat - : Pat<(vtd (op (vt1 (SVEDup0Undef)), vt2:$Op1, vt3:$Op2)), - (inst $Op1, $Op2)>; +multiclass SVE_3_Op_UndefZero_Pat { + let AddedComplexity = 1 in { + def : Pat<(vtd (op (vt1 (SVEDup0Undef)), vt2:$Op1, vt3:$Op2)), + (inst $Op1, $Op2)>; + def : Pat<(vtd (op (vt1 (SVEAny)), (vt2 (SVEAllActive:$Op2)), vt3:$Op3)), + (inst $Op2, $Op3)>; + } +} class SVE_4_Op_Pat { def _HtoS : sve2_fp_convert_precision<0b1001, 0b0, asm, ZPR32, ZPR16>; def _StoD : sve2_fp_convert_precision<0b1111, 0b0, asm, ZPR64, ZPR32>; - def : SVE_3_Op_UndefZero_Pat(op # _f32f16), nxv4f32, nxv4i1, nxv8f16, !cast(NAME # _HtoS)>; - def : SVE_3_Op_UndefZero_Pat(op # _f64f32), nxv2f64, nxv2i1, nxv4f32, !cast(NAME # _StoD)>; + defm : SVE_3_Op_UndefZero_Pat(op # _f32f16), nxv4f32, nxv4i1, nxv8f16, !cast(NAME # _HtoS)>; + defm : SVE_3_Op_UndefZero_Pat(op # _f64f32), nxv2f64, nxv2i1, nxv4f32, !cast(NAME # _StoD)>; } multiclass sve2_fp_convert_down_narrow_z { @@ -3256,7 +3267,7 @@ class sve_fp_z2op_p_zd opc,string asm, RegisterOperand i_zprtype, multiclass sve_fp_z2op_p_zd { def _DtoS : sve_fp_z2op_p_zd<0b0001010, asm, ZPR64, ZPR32>; - def : SVE_3_Op_UndefZero_Pat(NAME # _DtoS)>; + defm : SVE_3_Op_UndefZero_Pat(NAME # _DtoS)>; } multiclass sve_fp_z2op_p_zd_hsd opc, string asm> { @@ -3273,7 +3284,7 @@ multiclass sve_fp_z2op_p_zd_frint opc, string asm> { multiclass sve_fp_z2op_p_zd_bfcvt { def NAME : sve_fp_z2op_p_zd<0b1001010, asm, ZPR32, ZPR16>; - def : SVE_3_Op_UndefZero_Pat(NAME)>; + defm : SVE_3_Op_UndefZero_Pat(NAME)>; } multiclass sve_fp_z2op_p_zd_d { @@ -3285,14 +3296,14 @@ multiclass sve_fp_z2op_p_zd_d; def _DtoD : sve_fp_z2op_p_zd<{ 0b111111, U }, asm, ZPR64, ZPR64>; - def : SVE_3_Op_UndefZero_Pat(int_op # _i32f64), nxv4i32, nxv2i1, nxv2f64, !cast(NAME # _DtoS)>; - def : SVE_3_Op_UndefZero_Pat(int_op # _i64f32), nxv2i64, nxv2i1, nxv4f32, !cast(NAME # _StoD)>; - def : SVE_3_Op_UndefZero_Pat(int_op # _i32f16), nxv4i32, 
nxv4i1, nxv8f16, !cast(NAME # _HtoS)>; - def : SVE_3_Op_UndefZero_Pat(int_op # _i64f16), nxv2i64, nxv2i1, nxv8f16, !cast(NAME # _HtoD)>; + defm : SVE_3_Op_UndefZero_Pat(int_op # _i32f64), nxv4i32, nxv2i1, nxv2f64, !cast(NAME # _DtoS)>; + defm : SVE_3_Op_UndefZero_Pat(int_op # _i64f32), nxv2i64, nxv2i1, nxv4f32, !cast(NAME # _StoD)>; + defm : SVE_3_Op_UndefZero_Pat(int_op # _i32f16), nxv4i32, nxv4i1, nxv8f16, !cast(NAME # _HtoS)>; + defm : SVE_3_Op_UndefZero_Pat(int_op # _i64f16), nxv2i64, nxv2i1, nxv8f16, !cast(NAME # _HtoD)>; - def : SVE_1_Op_PassthruUndefZero_Pat(NAME # _HtoH)>; - def : SVE_1_Op_PassthruUndefZero_Pat(NAME # _StoS)>; - def : SVE_1_Op_PassthruUndefZero_Pat(NAME # _DtoD)>; + defm : SVE_1_Op_PassthruUndefZero_Pat(NAME # _HtoH)>; + defm : SVE_1_Op_PassthruUndefZero_Pat(NAME # _StoS)>; + defm : SVE_1_Op_PassthruUndefZero_Pat(NAME # _DtoD)>; } multiclass sve_fp_z2op_p_zd_c { @@ -3319,12 +3330,12 @@ multiclass sve_fp_z2op_p_zd_b_0 { def _DtoS : sve_fp_z2op_p_zd<0b1101010, asm, ZPR64, ZPR32>; def _StoD : sve_fp_z2op_p_zd<0b1101011, asm, ZPR32, ZPR64>; - def : SVE_3_Op_UndefZero_Pat(op # _f16f32), nxv8f16, nxv4i1, nxv4f32, !cast(NAME # _StoH)>; - def : SVE_3_Op_UndefZero_Pat(op # _f16f64), nxv8f16, nxv2i1, nxv2f64, !cast(NAME # _DtoH)>; - def : SVE_3_Op_UndefZero_Pat(op # _f32f64), nxv4f32, nxv2i1, nxv2f64, !cast(NAME # _DtoS)>; - def : SVE_3_Op_UndefZero_Pat(op # _f32f16), nxv4f32, nxv4i1, nxv8f16, !cast(NAME # _HtoS)>; - def : SVE_3_Op_UndefZero_Pat(op # _f64f16), nxv2f64, nxv2i1, nxv8f16, !cast(NAME # _HtoD)>; - def : SVE_3_Op_UndefZero_Pat(op # _f64f32), nxv2f64, nxv2i1, nxv4f32, !cast(NAME # _StoD)>; + defm : SVE_3_Op_UndefZero_Pat(op # _f16f32), nxv8f16, nxv4i1, nxv4f32, !cast(NAME # _StoH)>; + defm : SVE_3_Op_UndefZero_Pat(op # _f16f64), nxv8f16, nxv2i1, nxv2f64, !cast(NAME # _DtoH)>; + defm : SVE_3_Op_UndefZero_Pat(op # _f32f64), nxv4f32, nxv2i1, nxv2f64, !cast(NAME # _DtoS)>; + defm : SVE_3_Op_UndefZero_Pat(op # _f32f16), nxv4f32, nxv4i1, nxv8f16, 
!cast(NAME # _HtoS)>; + defm : SVE_3_Op_UndefZero_Pat(op # _f64f16), nxv2f64, nxv2i1, nxv8f16, !cast(NAME # _HtoD)>; + defm : SVE_3_Op_UndefZero_Pat(op # _f64f32), nxv2f64, nxv2i1, nxv4f32, !cast(NAME # _StoD)>; } //===----------------------------------------------------------------------===// @@ -4842,6 +4853,16 @@ multiclass sve_int_un_pred_arit opc, string asm, def : SVE_1_Op_Passthru_Pat(NAME # _H)>; def : SVE_1_Op_Passthru_Pat(NAME # _S)>; def : SVE_1_Op_Passthru_Pat(NAME # _D)>; + + def _B_UNDEF : PredOneOpPassthruPseudo; + def _H_UNDEF : PredOneOpPassthruPseudo; + def _S_UNDEF : PredOneOpPassthruPseudo; + def _D_UNDEF : PredOneOpPassthruPseudo; + + defm : SVE_1_Op_PassthruUndef_Pat(NAME # _B_UNDEF)>; + defm : SVE_1_Op_PassthruUndef_Pat(NAME # _H_UNDEF)>; + defm : SVE_1_Op_PassthruUndef_Pat(NAME # _S_UNDEF)>; + defm : SVE_1_Op_PassthruUndef_Pat(NAME # _D_UNDEF)>; } multiclass sve_int_un_pred_arit_z opc, string asm, SDPatternOperator op> { @@ -4850,10 +4871,10 @@ multiclass sve_int_un_pred_arit_z opc, string asm, SDPatternOperator op> def _S : sve_int_un_pred_arit_z<0b10, { opc, 0b0 }, asm, ZPR32>; def _D : sve_int_un_pred_arit_z<0b11, { opc, 0b0 }, asm, ZPR64>; - def : SVE_1_Op_PassthruUndefZero_Pat(NAME # _B)>; - def : SVE_1_Op_PassthruUndefZero_Pat(NAME # _H)>; - def : SVE_1_Op_PassthruUndefZero_Pat(NAME # _S)>; - def : SVE_1_Op_PassthruUndefZero_Pat(NAME # _D)>; + defm : SVE_1_Op_PassthruUndefZero_Pat(NAME # _B)>; + defm : SVE_1_Op_PassthruUndefZero_Pat(NAME # _H)>; + defm : SVE_1_Op_PassthruUndefZero_Pat(NAME # _S)>; + defm : SVE_1_Op_PassthruUndefZero_Pat(NAME # _D)>; } multiclass sve_int_un_pred_arit_h opc, string asm, @@ -4967,6 +4988,17 @@ multiclass sve_int_un_pred_arit_bitwise_fp opc, string asm, def : SVE_1_Op_Passthru_Pat(NAME # _S)>; def : SVE_1_Op_Passthru_Pat(NAME # _S)>; def : SVE_1_Op_Passthru_Pat(NAME # _D)>; + + def _H_UNDEF : PredOneOpPassthruPseudo; + def _S_UNDEF : PredOneOpPassthruPseudo; + def _D_UNDEF : PredOneOpPassthruPseudo; + + 
defm : SVE_1_Op_PassthruUndef_Pat(NAME # _H_UNDEF)>; + defm : SVE_1_Op_PassthruUndef_Pat(NAME # _H_UNDEF)>; + defm : SVE_1_Op_PassthruUndef_Pat(NAME # _H_UNDEF)>; + defm : SVE_1_Op_PassthruUndef_Pat(NAME # _S_UNDEF)>; + defm : SVE_1_Op_PassthruUndef_Pat(NAME # _S_UNDEF)>; + defm : SVE_1_Op_PassthruUndef_Pat(NAME # _D_UNDEF)>; } multiclass sve_int_un_pred_arit_bitwise_fp_z opc, string asm, SDPatternOperator op> { @@ -4974,12 +5006,12 @@ multiclass sve_int_un_pred_arit_bitwise_fp_z opc, string asm, SDPatternO def _S : sve_int_un_pred_arit_z<0b10, { opc, 0b1 }, asm, ZPR32>; def _D : sve_int_un_pred_arit_z<0b11, { opc, 0b1 }, asm, ZPR64>; - def : SVE_1_Op_PassthruUndefZero_Pat(NAME # _H)>; - def : SVE_1_Op_PassthruUndefZero_Pat(NAME # _H)>; - def : SVE_1_Op_PassthruUndefZero_Pat(NAME # _H)>; - def : SVE_1_Op_PassthruUndefZero_Pat(NAME # _S)>; - def : SVE_1_Op_PassthruUndefZero_Pat(NAME # _S)>; - def : SVE_1_Op_PassthruUndefZero_Pat(NAME # _D)>; + defm : SVE_1_Op_PassthruUndefZero_Pat(NAME # _H)>; + defm : SVE_1_Op_PassthruUndefZero_Pat(NAME # _H)>; + defm : SVE_1_Op_PassthruUndefZero_Pat(NAME # _H)>; + defm : SVE_1_Op_PassthruUndefZero_Pat(NAME # _S)>; + defm : SVE_1_Op_PassthruUndefZero_Pat(NAME # _S)>; + defm : SVE_1_Op_PassthruUndefZero_Pat(NAME # _D)>; } multiclass sve_fp_un_pred_arit_hsd { diff --git a/llvm/test/CodeGen/AArch64/zeroing-forms-abs-neg.ll b/llvm/test/CodeGen/AArch64/zeroing-forms-abs-neg.ll index 1caee994220f0..510d4576646f1 100644 --- a/llvm/test/CodeGen/AArch64/zeroing-forms-abs-neg.ll +++ b/llvm/test/CodeGen/AArch64/zeroing-forms-abs-neg.ll @@ -18,7 +18,7 @@ define @test_svabs_f64_x_1( %pg, @llvm.aarch64.sve.fabs.nxv2f64( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fabs.nxv2f64( poison, %pg, %x) ret %0 } @@ -34,7 +34,7 @@ define @test_svabs_f64_x_2( %pg, double % ; CHECK-2p2-NEXT: fabs z0.d, p0/z, z1.d ; CHECK-2p2-NEXT: ret entry: - %0 = tail call @llvm.aarch64.sve.fabs.nxv2f64( undef, %pg, %x) + %0 = tail call 
@llvm.aarch64.sve.fabs.nxv2f64( poison, %pg, %x) ret %0 } @@ -65,7 +65,7 @@ define @test_svabs_f32_x_1( %pg, @llvm.aarch64.sve.fabs.nxv4f32( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fabs.nxv4f32( poison, %pg, %x) ret %0 } @@ -81,7 +81,7 @@ define @test_svabs_f32_x_2( %pg, double %z ; CHECK-2p2-NEXT: fabs z0.s, p0/z, z1.s ; CHECK-2p2-NEXT: ret entry: - %0 = tail call @llvm.aarch64.sve.fabs.nxv4f32( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fabs.nxv4f32( poison, %pg, %x) ret %0 } @@ -112,7 +112,7 @@ define @test_svabs_f16_x_1( %pg, @llvm.aarch64.sve.fabs.nxv8f16( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fabs.nxv8f16( poison, %pg, %x) ret %0 } @@ -128,7 +128,7 @@ define @test_svabs_f16_x_2( %pg, double %z0 ; CHECK-2p2-NEXT: fabs z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret entry: - %0 = tail call @llvm.aarch64.sve.fabs.nxv8f16( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fabs.nxv8f16( poison, %pg, %x) ret %0 } @@ -159,7 +159,7 @@ define @test_svabs_s8_x_1( %pg, @llvm.aarch64.sve.abs.nxv16i8( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.abs.nxv16i8( poison, %pg, %x) ret %0 } @@ -175,8 +175,8 @@ define @test_svabs_s8_x_2( %pg, double %z0, ; CHECK-2p2-NEXT: abs z0.b, p0/z, z1.b ; CHECK-2p2-NEXT: ret entry: - %1 = tail call @llvm.aarch64.sve.abs.nxv16i8( undef, %pg, %x) - ret %1 + %0 = tail call @llvm.aarch64.sve.abs.nxv16i8( poison, %pg, %x) + ret %0 } define @test_svabs_s8_z( %pg, double %z0, %x) { @@ -191,8 +191,8 @@ define @test_svabs_s8_z( %pg, double %z0, < ; CHECK-2p2-NEXT: abs z0.b, p0/z, z1.b ; CHECK-2p2-NEXT: ret entry: - %1 = tail call @llvm.aarch64.sve.abs.nxv16i8( zeroinitializer, %pg, %x) - ret %1 + %0 = tail call @llvm.aarch64.sve.abs.nxv16i8( zeroinitializer, %pg, %x) + ret %0 } define @test_svabs_s16_x_1( %pg, %x) { @@ -206,7 +206,7 @@ define @test_svabs_s16_x_1( %pg, @llvm.aarch64.sve.abs.nxv8i16( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.abs.nxv8i16( poison, %pg, %x) ret %0 } @@ -222,7 +222,7 @@ define 
@test_svabs_s16_x_2( %pg, double %z0, ; CHECK-2p2-NEXT: abs z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret entry: - %0 = tail call @llvm.aarch64.sve.abs.nxv8i16( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.abs.nxv8i16( poison, %pg, %x) ret %0 } @@ -253,7 +253,7 @@ define @test_svabs_s32_x_1( %pg, @llvm.aarch64.sve.abs.nxv4i32( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.abs.nxv4i32( poison, %pg, %x) ret %0 } @@ -269,7 +269,7 @@ define @test_svabs_s32_x_2( %pg, double %z0, ; CHECK-2p2-NEXT: abs z0.s, p0/z, z1.s ; CHECK-2p2-NEXT: ret entry: - %0 = tail call @llvm.aarch64.sve.abs.nxv4i32( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.abs.nxv4i32( poison, %pg, %x) ret %0 } @@ -300,7 +300,7 @@ define @test_svabs_s64_x_1( %pg, @llvm.aarch64.sve.abs.nxv2i64( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.abs.nxv2i64( poison, %pg, %x) ret %0 } @@ -316,7 +316,7 @@ define @test_svabs_s64_x_2( %pg, double %z0, ; CHECK-2p2-NEXT: abs z0.d, p0/z, z1.d ; CHECK-2p2-NEXT: ret entry: - %0 = tail call @llvm.aarch64.sve.abs.nxv2i64( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.abs.nxv2i64( poison, %pg, %x) ret %0 } @@ -347,7 +347,7 @@ define @test_svneg_f64_x_1( %pg, @llvm.aarch64.sve.fneg.nxv2f64( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fneg.nxv2f64( poison, %pg, %x) ret %0 } @@ -363,7 +363,7 @@ define @test_svneg_f64_x_2( %pg, double % ; CHECK-2p2-NEXT: fneg z0.d, p0/z, z1.d ; CHECK-2p2-NEXT: ret entry: - %0 = tail call @llvm.aarch64.sve.fneg.nxv2f64( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fneg.nxv2f64( poison, %pg, %x) ret %0 } @@ -394,7 +394,7 @@ define @test_svneg_f32_x_1( %pg, @llvm.aarch64.sve.fneg.nxv4f32( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fneg.nxv4f32( poison, %pg, %x) ret %0 } @@ -410,7 +410,7 @@ define @test_svneg_f32_x_2( %pg, double %z ; CHECK-2p2-NEXT: fneg z0.s, p0/z, z1.s ; CHECK-2p2-NEXT: ret entry: - %0 = tail call @llvm.aarch64.sve.fneg.nxv4f32( undef, %pg, %x) + %0 = tail call 
@llvm.aarch64.sve.fneg.nxv4f32( poison, %pg, %x) ret %0 } @@ -441,7 +441,7 @@ define @test_svneg_f16_x_1( %pg, @llvm.aarch64.sve.fneg.nxv8f16( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fneg.nxv8f16( poison, %pg, %x) ret %0 } @@ -457,7 +457,7 @@ define @test_svneg_f16_x_2( %pg, double %z0 ; CHECK-2p2-NEXT: fneg z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret entry: - %0 = tail call @llvm.aarch64.sve.fneg.nxv8f16( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fneg.nxv8f16( poison, %pg, %x) ret %0 } @@ -488,7 +488,7 @@ define @test_svneg_s8_x_1( %pg, @llvm.aarch64.sve.neg.nxv16i8( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.neg.nxv16i8( poison, %pg, %x) ret %0 } @@ -504,8 +504,8 @@ define @test_svneg_s8_x_2( %pg, double %z0, ; CHECK-2p2-NEXT: neg z0.b, p0/z, z1.b ; CHECK-2p2-NEXT: ret entry: - %1 = tail call @llvm.aarch64.sve.neg.nxv16i8( undef, %pg, %x) - ret %1 + %0 = tail call @llvm.aarch64.sve.neg.nxv16i8( poison, %pg, %x) + ret %0 } define @test_svneg_s8_z( %pg, double %z0, %x) { @@ -520,8 +520,8 @@ define @test_svneg_s8_z( %pg, double %z0, < ; CHECK-2p2-NEXT: neg z0.b, p0/z, z1.b ; CHECK-2p2-NEXT: ret entry: - %1 = tail call @llvm.aarch64.sve.neg.nxv16i8( zeroinitializer, %pg, %x) - ret %1 + %0 = tail call @llvm.aarch64.sve.neg.nxv16i8( zeroinitializer, %pg, %x) + ret %0 } define @test_svneg_s16_x_1( %pg, %x) { @@ -535,7 +535,7 @@ define @test_svneg_s16_x_1( %pg, @llvm.aarch64.sve.neg.nxv8i16( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.neg.nxv8i16( poison, %pg, %x) ret %0 } @@ -551,7 +551,7 @@ define @test_svneg_s16_x_2( %pg, double %z0, ; CHECK-2p2-NEXT: neg z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret entry: - %0 = tail call @llvm.aarch64.sve.neg.nxv8i16( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.neg.nxv8i16( poison, %pg, %x) ret %0 } @@ -582,7 +582,7 @@ define @test_svneg_s32_x_1( %pg, @llvm.aarch64.sve.neg.nxv4i32( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.neg.nxv4i32( poison, %pg, %x) ret %0 } @@ -598,7 +598,7 @@ define 
@test_svneg_s32_x_2( %pg, double %z0, ; CHECK-2p2-NEXT: neg z0.s, p0/z, z1.s ; CHECK-2p2-NEXT: ret entry: - %0 = tail call @llvm.aarch64.sve.neg.nxv4i32( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.neg.nxv4i32( poison, %pg, %x) ret %0 } @@ -629,7 +629,7 @@ define @test_svneg_s64_x_1( %pg, @llvm.aarch64.sve.neg.nxv2i64( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.neg.nxv2i64( poison, %pg, %x) ret %0 } @@ -645,7 +645,7 @@ define @test_svneg_s64_x_2( %pg, double %z0, ; CHECK-2p2-NEXT: neg z0.d, p0/z, z1.d ; CHECK-2p2-NEXT: ret entry: - %0 = tail call @llvm.aarch64.sve.neg.nxv2i64( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.neg.nxv2i64( poison, %pg, %x) ret %0 } @@ -664,3 +664,535 @@ entry: %0 = tail call @llvm.aarch64.sve.neg.nxv2i64( zeroinitializer, %pg, %x) ret %0 } + +define @test_svfabs_f64_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfabs_f64_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fabs z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfabs_f64_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: fabs z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fabs.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svfabs_f64_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfabs_f64_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: fabs z0.d, p0/m, z2.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfabs_f64_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: fabs z0.d, p0/z, z2.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fabs.nxv2f64( %x, %pg, %y) + ret %0 +} + +define @test_svfabs_f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfabs_f32_ptrue_u: +; CHECK: // 
%bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fabs z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfabs_f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: fabs z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fabs.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svfabs_f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfabs_f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: fabs z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfabs_f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: fabs z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fabs.nxv4f32( %x, %pg, %y) + ret %0 +} + +define @test_svfabs_f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfabs_f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fabs z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfabs_f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: fabs z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fabs.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svfabs_f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfabs_f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: fabs z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfabs_f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: fabs z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call 
@llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fabs.nxv8f16( %x, %pg, %y) + ret %0 +} + +define @test_svabs_s8_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svabs_s8_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: abs z0.b, p0/m, z1.b +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svabs_s8_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.b +; CHECK-2p2-NEXT: abs z0.b, p0/z, z1.b +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %0 = tail call @llvm.aarch64.sve.abs.nxv16i8( poison, %pg, %x) + ret %0 +} + +define @test_svabs_s8_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svabs_s8_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: abs z0.b, p0/m, z2.b +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svabs_s8_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.b +; CHECK-2p2-NEXT: abs z0.b, p0/z, z2.b +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %0 = tail call @llvm.aarch64.sve.abs.nxv16i8( %x, %pg, %y) + ret %0 +} + +define @test_svabs_s16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svabs_s16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: abs z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svabs_s16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: abs z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.abs.nxv8i16( poison, %pg, %x) + ret %0 +} + +define @test_svabs_s16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svabs_s16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: abs z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: 
test_svabs_s16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: abs z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.abs.nxv8i16( %x, %pg, %y) + ret %0 +} + +define @test_svabs_s32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svabs_s32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: abs z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svabs_s32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: abs z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.abs.nxv4i32( poison, %pg, %x) + ret %0 +} + +define @test_svabs_s32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svabs_s32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: abs z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svabs_s32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: abs z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.abs.nxv4i32( %x, %pg, %y) + ret %0 +} + +define @test_svabs_s64_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svabs_s64_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: abs z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svabs_s64_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: abs z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.abs.nxv2i64( poison, %pg, %x) + ret %0 +} + +define @test_svabs_s64_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svabs_s64_ptrue: 
+; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: abs z0.d, p0/m, z2.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svabs_s64_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: abs z0.d, p0/z, z2.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.abs.nxv2i64( %x, %pg, %y) + ret %0 +} + +define @test_svfneg_f64_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfneg_f64_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fneg z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfneg_f64_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: fneg z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fneg.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svfneg_f64_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfneg_f64_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: fneg z0.d, p0/m, z2.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfneg_f64_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: fneg z0.d, p0/z, z2.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fneg.nxv2f64( %x, %pg, %y) + ret %0 +} + +define @test_svfneg_f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfneg_f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fneg z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfneg_f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: fneg z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call 
@llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fneg.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svfneg_f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfneg_f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: fneg z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfneg_f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: fneg z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fneg.nxv4f32( %x, %pg, %y) + ret %0 +} + +define @test_svfneg_f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfneg_f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fneg z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfneg_f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: fneg z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fneg.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svfneg_f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfneg_f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: fneg z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfneg_f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: fneg z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fneg.nxv8f16( %x, %pg, %y) + ret %0 +} + +define @test_svneg_s8_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svneg_s8_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: neg z0.b, p0/m, z1.b +; CHECK-NEXT: ret +; +; 
CHECK-2p2-LABEL: test_svneg_s8_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.b +; CHECK-2p2-NEXT: neg z0.b, p0/z, z1.b +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %0 = tail call @llvm.aarch64.sve.neg.nxv16i8( poison, %pg, %x) + ret %0 +} + +define @test_svneg_s8_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svneg_s8_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: neg z0.b, p0/m, z2.b +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svneg_s8_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.b +; CHECK-2p2-NEXT: neg z0.b, p0/z, z2.b +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %0 = tail call @llvm.aarch64.sve.neg.nxv16i8( %x, %pg, %y) + ret %0 +} + +define @test_svneg_s16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svneg_s16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: neg z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svneg_s16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: neg z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.neg.nxv8i16( poison, %pg, %x) + ret %0 +} + +define @test_svneg_s16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svneg_s16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: neg z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svneg_s16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: neg z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.neg.nxv8i16( %x, %pg, %y) + ret %0 +} + +define @test_svneg_s32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: 
test_svneg_s32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: neg z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svneg_s32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: neg z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.neg.nxv4i32( poison, %pg, %x) + ret %0 +} + +define @test_svneg_s32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svneg_s32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: neg z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svneg_s32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: neg z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.neg.nxv4i32( %x, %pg, %y) + ret %0 +} + +define @test_svneg_s64_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svneg_s64_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: neg z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svneg_s64_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: neg z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.neg.nxv2i64( poison, %pg, %x) + ret %0 +} + +define @test_svneg_s64_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svneg_s64_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: neg z0.d, p0/m, z2.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svneg_s64_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: neg z0.d, p0/z, z2.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call 
@llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.neg.nxv2i64( %x, %pg, %y) + ret %0 +} diff --git a/llvm/test/CodeGen/AArch64/zeroing-forms-fcvt-bfcvt.ll b/llvm/test/CodeGen/AArch64/zeroing-forms-fcvt-bfcvt.ll index cf9ac49ca7b23..855bf9a3b3c49 100644 --- a/llvm/test/CodeGen/AArch64/zeroing-forms-fcvt-bfcvt.ll +++ b/llvm/test/CodeGen/AArch64/zeroing-forms-fcvt-bfcvt.ll @@ -18,7 +18,7 @@ define @test_svcvt_f16_f32_x_1( %pg, @llvm.aarch64.sve.fcvt.f16f32( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvt.f16f32( poison, %pg, %x) ret %0 } @@ -33,7 +33,7 @@ define @test_svcvt_f16_f32_x_2( %pg, double ; CHECK-2p2-NEXT: fcvt z0.h, p0/z, z1.s ; CHECK-2p2-NEXT: ret entry: - %0 = tail call @llvm.aarch64.sve.fcvt.f16f32( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvt.f16f32( poison, %pg, %x) ret %0 } @@ -64,7 +64,7 @@ define @test_svcvt_bf16_f32_x_1( %pg, @llvm.aarch64.sve.fcvt.bf16f32.v2( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvt.bf16f32.v2( poison, %pg, %x) ret %0 } @@ -79,7 +79,7 @@ define @test_svcvt_bf16_f32_x_2( %pg, dou ; CHECK-2p2-NEXT: bfcvt z0.h, p0/z, z1.s ; CHECK-2p2-NEXT: ret entry: - %0 = tail call @llvm.aarch64.sve.fcvt.bf16f32.v2( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvt.bf16f32.v2( poison, %pg, %x) ret %0 } @@ -110,7 +110,7 @@ define @test_svcvt_f16_f64_x_1( %pg, @llvm.aarch64.sve.fcvt.f16f64( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvt.f16f64( poison, %pg, %x) ret %0 } @@ -125,7 +125,7 @@ define @test_svcvt_f16_f64_x_2( %pg, double ; CHECK-2p2-NEXT: fcvt z0.h, p0/z, z1.d ; CHECK-2p2-NEXT: ret entry: - %0 = tail call @llvm.aarch64.sve.fcvt.f16f64( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvt.f16f64( poison, %pg, %x) ret %0 } @@ -156,7 +156,7 @@ define @test_svcvt_f32_f64_x_1( %pg, @llvm.aarch64.sve.fcvt.f32f64( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvt.f32f64( poison, %pg, %x) ret %0 } @@ -171,7 +171,7 @@ define @test_svcvt_f32_f64_x_2( %pg, doubl ; 
CHECK-2p2-NEXT: fcvt z0.s, p0/z, z1.d ; CHECK-2p2-NEXT: ret entry: - %0 = tail call @llvm.aarch64.sve.fcvt.f32f64( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvt.f32f64( poison, %pg, %x) ret %0 } @@ -202,7 +202,7 @@ define @test_svcvt_f32_f16_x_1( %pg, @llvm.aarch64.sve.fcvt.f32f16( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvt.f32f16( poison, %pg, %x) ret %0 } @@ -217,7 +217,7 @@ define @test_svcvt_f32_f16_x_2( %pg, doubl ; CHECK-2p2-NEXT: fcvt z0.s, p0/z, z1.h ; CHECK-2p2-NEXT: ret entry: - %0 = tail call @llvm.aarch64.sve.fcvt.f32f16( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvt.f32f16( poison, %pg, %x) ret %0 } @@ -248,7 +248,7 @@ define @test_svcvt_f64_f16_x_1( %pg, @llvm.aarch64.sve.fcvt.f64f16( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvt.f64f16( poison, %pg, %x) ret %0 } @@ -263,7 +263,7 @@ define @test_svcvt_f64_f16_x_2( %pg, doub ; CHECK-2p2-NEXT: fcvt z0.d, p0/z, z1.h ; CHECK-2p2-NEXT: ret entry: - %0 = tail call @llvm.aarch64.sve.fcvt.f64f16( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvt.f64f16( poison, %pg, %x) ret %0 } @@ -294,7 +294,7 @@ define @test_svcvt_f64_f32_x_1( %pg, @llvm.aarch64.sve.fcvt.f64f32( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvt.f64f32( poison, %pg, %x) ret %0 } @@ -309,7 +309,7 @@ define @test_svcvt_f64_f32_x_2( %pg, doub ; CHECK-2p2-NEXT: fcvt z0.d, p0/z, z1.s ; CHECK-2p2-NEXT: ret entry: - %0 = tail call @llvm.aarch64.sve.fcvt.f64f32( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvt.f64f32( poison, %pg, %x) ret %0 } @@ -328,3 +328,262 @@ entry: %0 = tail call @llvm.aarch64.sve.fcvt.f64f32( zeroinitializer, %pg, %x) ret %0 } + +define @test_svcvt_f16_f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svcvt_f16_f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcvt z0.h, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcvt_f16_f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; 
CHECK-2p2-NEXT: fcvt z0.h, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvt.f16f32( poison, %pg, %x) + ret %0 +} + +define @test_svcvt_f16_f32_ptrue(double %z0, %x, %y ) { +; CHECK-LABEL: test_svcvt_f16_f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcvt z0.h, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcvt_f16_f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: fcvt z0.h, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvt.f16f32( %x, %pg, %y) + ret %0 +} + +define @test_svcvt_bf16_f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svcvt_bf16_f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: bfcvt z0.h, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcvt_bf16_f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: bfcvt z0.h, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvt.bf16f32.v2( poison, %pg, %x) + ret %0 +} + +define @test_svcvt_bf16_f32_ptrue(double %z0, %x, %y ) { +; CHECK-LABEL: test_svcvt_bf16_f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: bfcvt z0.h, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcvt_bf16_f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: bfcvt z0.h, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvt.bf16f32.v2( %x, %pg, %y) + ret %0 +} + +define @test_svcvt_f16_f64_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svcvt_f16_f64_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: 
ptrue p0.d +; CHECK-NEXT: fcvt z0.h, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcvt_f16_f64_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: fcvt z0.h, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvt.f16f64( poison, %pg, %x) + ret %0 +} + +define @test_svcvt_f16_f64_ptrue(double %z0, %x, %y ) { +; CHECK-LABEL: test_svcvt_f16_f64_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z0.h, p0/m, z2.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcvt_f16_f64_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: fcvt z0.h, p0/z, z2.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvt.f16f64( %x, %pg, %y) + ret %0 +} + +define @test_svcvt_f32_f64_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svcvt_f32_f64_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z0.s, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcvt_f32_f64_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: fcvt z0.s, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvt.f32f64( poison, %pg, %x) + ret %0 +} + +define @test_svcvt_f32_f64_ptrue(double %z0, %x, %y ) { +; CHECK-LABEL: test_svcvt_f32_f64_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z0.s, p0/m, z2.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcvt_f32_f64_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: fcvt z0.s, p0/z, z2.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvt.f32f64( %x, 
%pg, %y) + ret %0 +} + +define @test_svcvt_f32_f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svcvt_f32_f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcvt z0.s, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcvt_f32_f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: fcvt z0.s, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvt.f32f16( poison, %pg, %x) + ret %0 +} + +define @test_svcvt_f32_f16_ptrue(double %z0, %x, %y ) { +; CHECK-LABEL: test_svcvt_f32_f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcvt z0.s, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcvt_f32_f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: fcvt z0.s, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvt.f32f16( %x, %pg, %y) + ret %0 +} + +define @test_svcvt_f64_f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svcvt_f64_f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z0.d, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcvt_f64_f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: fcvt z0.d, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvt.f64f16( poison, %pg, %x) + ret %0 +} + +define @test_svcvt_f64_f16_ptrue(double %z0, %x, %y ) { +; CHECK-LABEL: test_svcvt_f64_f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z0.d, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcvt_f64_f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; 
CHECK-2p2-NEXT: fcvt z0.d, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvt.f64f16( %x, %pg, %y) + ret %0 +} + +define @test_svcvt_f64_f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svcvt_f64_f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z0.d, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcvt_f64_f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: fcvt z0.d, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvt.f64f32( poison, %pg, %x) + ret %0 +} + +define @test_svcvt_f64_f32_ptrue(double %z0, %x, %y ) { +; CHECK-LABEL: test_svcvt_f64_f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z0.d, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcvt_f64_f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: fcvt z0.d, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvt.f64f32( %x, %pg, %y) + ret %0 +} diff --git a/llvm/test/CodeGen/AArch64/zeroing-forms-fcvtlt-fcvtx.ll b/llvm/test/CodeGen/AArch64/zeroing-forms-fcvtlt-fcvtx.ll index 60879b1529230..c7431e11c21ca 100644 --- a/llvm/test/CodeGen/AArch64/zeroing-forms-fcvtlt-fcvtx.ll +++ b/llvm/test/CodeGen/AArch64/zeroing-forms-fcvtlt-fcvtx.ll @@ -18,7 +18,7 @@ define @test_svcvtlt_f32_f16_x_1( %pg, @llvm.aarch64.sve.fcvtlt.f32f16( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvtlt.f32f16( poison, %pg, %x) ret %0 } @@ -33,7 +33,7 @@ define @test_svcvtlt_f32_f16_x_2( %pg, dou ; CHECK-2p2-NEXT: fcvtlt z0.s, p0/z, z1.h ; CHECK-2p2-NEXT: ret entry: - %0 = tail call @llvm.aarch64.sve.fcvtlt.f32f16( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvtlt.f32f16( 
poison, %pg, %x) ret %0 } @@ -64,7 +64,7 @@ define @test_svcvtlt_f64_f32_x_1( %pg, @llvm.aarch64.sve.fcvtlt.f64f32( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvtlt.f64f32( poison, %pg, %x) ret %0 } @@ -79,7 +79,7 @@ define @test_svcvtlt_f64_f32_x_2( %pg, do ; CHECK-2p2-NEXT: fcvtlt z0.d, p0/z, z1.s ; CHECK-2p2-NEXT: ret entry: - %0 = tail call @llvm.aarch64.sve.fcvtlt.f64f32( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvtlt.f64f32( poison, %pg, %x) ret %0 } @@ -110,7 +110,7 @@ define @test_svcvtx_f32_f64_x_1( %pg, @llvm.aarch64.sve.fcvtx.f32f64( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvtx.f32f64( poison, %pg, %x) ret %0 } @@ -125,7 +125,7 @@ define @test_svcvtx_f32_f64_x_2( %pg, doub ; CHECK-2p2-NEXT: fcvtx z0.s, p0/z, z1.d ; CHECK-2p2-NEXT: ret entry: - %0 = tail call @llvm.aarch64.sve.fcvtx.f32f64( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvtx.f32f64( poison, %pg, %x) ret %0 } @@ -144,3 +144,114 @@ entry: %0 = tail call @llvm.aarch64.sve.fcvtx.f32f64( zeroinitializer, %pg, %x) ret %0 } + +define @test_svcvtlt_f32_f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svcvtlt_f32_f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcvtlt z0.s, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcvtlt_f32_f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: fcvtlt z0.s, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvtlt.f32f16( poison, %pg, %x) + ret %0 +} + +define @test_svcvtlt_f32_f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svcvtlt_f32_f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcvtlt z0.s, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcvtlt_f32_f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: fcvtlt z0.s, p0/z, z2.h +; 
CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvtlt.f32f16( %x, %pg, %y) + ret %0 +} + +define @test_svcvtlt_f64_f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svcvtlt_f64_f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtlt z0.d, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcvtlt_f64_f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: fcvtlt z0.d, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvtlt.f64f32( poison, %pg, %x) + ret %0 +} + +define @test_svcvtlt_f64_f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svcvtlt_f64_f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtlt z0.d, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcvtlt_f64_f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: fcvtlt z0.d, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvtlt.f64f32( %x, %pg, %y) + ret %0 +} + +define @test_svcvtx_f32_f64_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svcvtx_f32_f64_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtx z0.s, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcvtx_f32_f64_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: fcvtx z0.s, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvtx.f32f64( poison, %pg, %x) + ret %0 +} + +define @test_svcvtx_f32_f64_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svcvtx_f32_f64_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: 
fcvtx z0.s, p0/m, z2.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcvtx_f32_f64_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: fcvtx z0.s, p0/z, z2.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvtx.f32f64( %x, %pg, %y) + ret %0 +} diff --git a/llvm/test/CodeGen/AArch64/zeroing-forms-fcvtzsu.ll b/llvm/test/CodeGen/AArch64/zeroing-forms-fcvtzsu.ll index b8b36d390330a..7259502bf4400 100644 --- a/llvm/test/CodeGen/AArch64/zeroing-forms-fcvtzsu.ll +++ b/llvm/test/CodeGen/AArch64/zeroing-forms-fcvtzsu.ll @@ -18,7 +18,7 @@ define @test_fcvtzs_s32_f64_x_1( %pg, @llvm.aarch64.sve.fcvtzs.i32f64( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvtzs.i32f64( poison, %pg, %x) ret %0 } @@ -33,7 +33,7 @@ define @test_fcvtzs_s32_f64_x_2( %pg, double ; CHECK-2p2-NEXT: fcvtzs z0.s, p0/z, z1.d ; CHECK-2p2-NEXT: ret entry: - %0 = tail call @llvm.aarch64.sve.fcvtzs.i32f64( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvtzs.i32f64( poison, %pg, %x) ret %0 } @@ -64,7 +64,7 @@ define @test_fcvtzs_s64_f32_x_1( %pg, @llvm.aarch64.sve.fcvtzs.i64f32( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvtzs.i64f32( poison, %pg, %x) ret %0 } @@ -79,7 +79,7 @@ define @test_fcvtzs_s64_f32_x_2( %pg, double ; CHECK-2p2-NEXT: fcvtzs z0.d, p0/z, z1.s ; CHECK-2p2-NEXT: ret entry: - %0 = tail call @llvm.aarch64.sve.fcvtzs.i64f32( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvtzs.i64f32( poison, %pg, %x) ret %0 } @@ -110,7 +110,7 @@ define @test_fcvtzs_s32_f16_x_1( %pg, @llvm.aarch64.sve.fcvtzs.i32f16( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvtzs.i32f16( poison, %pg, %x) ret %0 } @@ -125,7 +125,7 @@ define @test_fcvtzs_s32_f16_x_2( %pg, double ; CHECK-2p2-NEXT: fcvtzs z0.s, p0/z, z1.h ; CHECK-2p2-NEXT: ret entry: - %0 = tail call @llvm.aarch64.sve.fcvtzs.i32f16( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvtzs.i32f16( poison, %pg, %x) ret 
%0 } @@ -156,7 +156,7 @@ define @test_fcvtzs_s64_f16_x_1( %pg, @llvm.aarch64.sve.fcvtzs.i64f16( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvtzs.i64f16( poison, %pg, %x) ret %0 } @@ -171,7 +171,7 @@ define @test_fcvtzs_s64_f16_x_2( %pg, double ; CHECK-2p2-NEXT: fcvtzs z0.d, p0/z, z1.h ; CHECK-2p2-NEXT: ret entry: - %0 = tail call @llvm.aarch64.sve.fcvtzs.i64f16( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvtzs.i64f16( poison, %pg, %x) ret %0 } @@ -202,7 +202,7 @@ define @test_fcvtzu_u32_f64_x_1( %pg, @llvm.aarch64.sve.fcvtzu.i32f64( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvtzu.i32f64( poison, %pg, %x) ret %0 } @@ -217,7 +217,7 @@ define @test_fcvtzu_u32_f64_x_2( %pg, double ; CHECK-2p2-NEXT: fcvtzu z0.s, p0/z, z1.d ; CHECK-2p2-NEXT: ret entry: - %0 = tail call @llvm.aarch64.sve.fcvtzu.i32f64( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvtzu.i32f64( poison, %pg, %x) ret %0 } @@ -248,7 +248,7 @@ define @test_fcvtzu_u64_f32_x_1( %pg, @llvm.aarch64.sve.fcvtzu.i64f32( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvtzu.i64f32( poison, %pg, %x) ret %0 } @@ -263,7 +263,7 @@ define @test_fcvtzu_u64_f32_x_2( %pg, double ; CHECK-2p2-NEXT: fcvtzu z0.d, p0/z, z1.s ; CHECK-2p2-NEXT: ret entry: - %0 = tail call @llvm.aarch64.sve.fcvtzu.i64f32( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvtzu.i64f32( poison, %pg, %x) ret %0 } @@ -294,7 +294,7 @@ define @test_fcvtzu_u32_f16_x_1( %pg, @llvm.aarch64.sve.fcvtzu.i32f16( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvtzu.i32f16( poison, %pg, %x) ret %0 } @@ -309,7 +309,7 @@ define @test_fcvtzu_u32_f16_x_2( %pg, double ; CHECK-2p2-NEXT: fcvtzu z0.s, p0/z, z1.h ; CHECK-2p2-NEXT: ret entry: - %0 = tail call @llvm.aarch64.sve.fcvtzu.i32f16( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvtzu.i32f16( poison, %pg, %x) ret %0 } @@ -340,7 +340,7 @@ define @test_fcvtzu_u64_f16_x_1( %pg, @llvm.aarch64.sve.fcvtzu.i64f16( undef, %pg, %x) + %0 = tail call 
@llvm.aarch64.sve.fcvtzu.i64f16( poison, %pg, %x) ret %0 } @@ -355,7 +355,7 @@ define @test_fcvtzu_u64_f16_x_2( %pg, double ; CHECK-2p2-NEXT: fcvtzu z0.d, p0/z, z1.h ; CHECK-2p2-NEXT: ret entry: - %0 = tail call @llvm.aarch64.sve.fcvtzu.i64f16( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvtzu.i64f16( poison, %pg, %x) ret %0 } @@ -387,7 +387,7 @@ define @test_svcvt_s16_f16_x_1( %pg, @llvm.aarch64.sve.fcvtzs.nxv8i16.nxv8f16( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvtzs.nxv8i16.nxv8f16( poison, %pg, %x) ret %0 } @@ -403,7 +403,7 @@ define @test_svcvt_s16_f16_x_2( %pg, double ; CHECK-2p2-NEXT: fcvtzs z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret entry: - %0 = tail call @llvm.aarch64.sve.fcvtzs.nxv8i16.nxv8f16( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvtzs.nxv8i16.nxv8f16( poison, %pg, %x) ret %0 } @@ -434,7 +434,7 @@ define @test_svcvt_u16_f16_x_1( %pg, @llvm.aarch64.sve.fcvtzu.nxv8i16.nxv8f16( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvtzu.nxv8i16.nxv8f16( poison, %pg, %x) ret %0 } @@ -450,7 +450,7 @@ define @test_svcvt_u16_f16_x_2( %pg, double ; CHECK-2p2-NEXT: fcvtzu z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret entry: - %0 = tail call @llvm.aarch64.sve.fcvtzu.nxv8i16.nxv8f16( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvtzu.nxv8i16.nxv8f16( poison, %pg, %x) ret %0 } @@ -481,7 +481,7 @@ define @test_svcvt_s32_f32_x_1( %pg, @llvm.aarch64.sve.fcvtzs.nxv4i32.nxv4f32( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvtzs.nxv4i32.nxv4f32( poison, %pg, %x) ret %0 } @@ -497,7 +497,7 @@ define @test_svcvt_s32_f32_x_2( %pg, double ; CHECK-2p2-NEXT: fcvtzs z0.s, p0/z, z1.s ; CHECK-2p2-NEXT: ret entry: - %0 = tail call @llvm.aarch64.sve.fcvtzs.nxv4i32.nxv4f32( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvtzs.nxv4i32.nxv4f32( poison, %pg, %x) ret %0 } @@ -528,7 +528,7 @@ define @test_svcvt_u32_f32_x_1( %pg, @llvm.aarch64.sve.fcvtzu.nxv4i32.nxv4f32( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvtzu.nxv4i32.nxv4f32( 
poison, %pg, %x) ret %0 } @@ -544,7 +544,7 @@ define @test_svcvt_u32_f32_x_2( %pg, double ; CHECK-2p2-NEXT: fcvtzu z0.s, p0/z, z1.s ; CHECK-2p2-NEXT: ret entry: - %0 = tail call @llvm.aarch64.sve.fcvtzu.nxv4i32.nxv4f32( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvtzu.nxv4i32.nxv4f32( poison, %pg, %x) ret %0 } @@ -575,7 +575,7 @@ define @test_svcvt_s64_f64_x_1( %pg, @llvm.aarch64.sve.fcvtzs.nxv2i64.nxv2f64( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvtzs.nxv2i64.nxv2f64( poison, %pg, %x) ret %0 } @@ -591,7 +591,7 @@ define @test_svcvt_s64_f64_x_2( %pg, double ; CHECK-2p2-NEXT: fcvtzs z0.d, p0/z, z1.d ; CHECK-2p2-NEXT: ret entry: - %0 = tail call @llvm.aarch64.sve.fcvtzs.nxv2i64.nxv2f64( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvtzs.nxv2i64.nxv2f64( poison, %pg, %x) ret %0 } @@ -622,7 +622,7 @@ define @test_svcvt_u64_f64_x_1( %pg, @llvm.aarch64.sve.fcvtzu.nxv2i64.nxv2f64( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvtzu.nxv2i64.nxv2f64( poison, %pg, %x) ret %0 } @@ -638,7 +638,7 @@ define @test_svcvt_u64_f64_x_2( %pg, double ; CHECK-2p2-NEXT: fcvtzu z0.d, p0/z, z1.d ; CHECK-2p2-NEXT: ret entry: - %0 = tail call @llvm.aarch64.sve.fcvtzu.nxv2i64.nxv2f64( undef, %pg, %x) + %0 = tail call @llvm.aarch64.sve.fcvtzu.nxv2i64.nxv2f64( poison, %pg, %x) ret %0 } @@ -657,3 +657,527 @@ entry: %0 = tail call @llvm.aarch64.sve.fcvtzu.nxv2i64.nxv2f64( zeroinitializer, %pg, %x) ret %0 } + +define @test_fcvtzs_i32_f64_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_fcvtzs_i32_f64_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzs z0.s, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_fcvtzs_i32_f64_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: fcvtzs z0.s, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvtzs.i32f64( poison, %pg, %x) + ret %0 +} + +define 
@test_fcvtzs_i32_f64_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_fcvtzs_i32_f64_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzs z0.s, p0/m, z2.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_fcvtzs_i32_f64_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: fcvtzs z0.s, p0/z, z2.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvtzs.i32f64( %x, %pg, %y) + ret %0 +} + +define @test_fcvtzu_i32_f64_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_fcvtzu_i32_f64_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzu z0.s, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_fcvtzu_i32_f64_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: fcvtzu z0.s, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvtzu.i32f64( poison, %pg, %x) + ret %0 +} + +define @test_fcvtzu_i32_f64_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_fcvtzu_i32_f64_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzu z0.s, p0/m, z2.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_fcvtzu_i32_f64_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: fcvtzu z0.s, p0/z, z2.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvtzu.i32f64( %x, %pg, %y) + ret %0 +} + +define @test_fcvtzs_i64_f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_fcvtzs_i64_f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzs z0.d, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_fcvtzs_i64_f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: 
fcvtzs z0.d, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvtzs.i64f32( poison, %pg, %x) + ret %0 +} + +define @test_fcvtzs_i64_f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_fcvtzs_i64_f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzs z0.d, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_fcvtzs_i64_f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: fcvtzs z0.d, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvtzs.i64f32( %x, %pg, %y) + ret %0 +} + +define @test_fcvtzu_i64_f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_fcvtzu_i64_f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzu z0.d, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_fcvtzu_i64_f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: fcvtzu z0.d, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvtzu.i64f32( poison, %pg, %x) + ret %0 +} + +define @test_fcvtzu_i64_f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_fcvtzu_i64_f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzu z0.d, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_fcvtzu_i64_f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: fcvtzu z0.d, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvtzu.i64f32( %x, %pg, %y) + ret %0 +} + +define @test_fcvtzs_i32_f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_fcvtzs_i32_f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue 
p0.s +; CHECK-NEXT: fcvtzs z0.s, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_fcvtzs_i32_f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: fcvtzs z0.s, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvtzs.i32f16( poison, %pg, %x) + ret %0 +} + +define @test_fcvtzs_i32_f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_fcvtzs_i32_f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcvtzs z0.s, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_fcvtzs_i32_f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: fcvtzs z0.s, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvtzs.i32f16( %x, %pg, %y) + ret %0 +} + +define @test_fcvtzu_i32_f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_fcvtzu_i32_f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcvtzu z0.s, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_fcvtzu_i32_f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: fcvtzu z0.s, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvtzu.i32f16( poison, %pg, %x) + ret %0 +} + +define @test_fcvtzu_i32_f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_fcvtzu_i32_f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcvtzu z0.s, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_fcvtzu_i32_f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: fcvtzu z0.s, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call 
@llvm.aarch64.sve.fcvtzu.i32f16( %x, %pg, %y) + ret %0 +} + +define @test_fcvtzs_i64_f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_fcvtzs_i64_f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzs z0.d, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_fcvtzs_i64_f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: fcvtzs z0.d, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvtzs.i64f16( poison, %pg, %x) + ret %0 +} + +define @test_fcvtzs_i64_f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_fcvtzs_i64_f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzs z0.d, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_fcvtzs_i64_f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: fcvtzs z0.d, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvtzs.i64f16( %x, %pg, %y) + ret %0 +} + +define @test_fcvtzu_i64_f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_fcvtzu_i64_f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzu z0.d, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_fcvtzu_i64_f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: fcvtzu z0.d, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvtzu.i64f16( poison, %pg, %x) + ret %0 +} + +define @test_fcvtzu_i64_f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_fcvtzu_i64_f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzu z0.d, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_fcvtzu_i64_f16_ptrue: +; 
CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: fcvtzu z0.d, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvtzu.i64f16( %x, %pg, %y) + ret %0 +} + +define @test_fcvtzs_i16_f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_fcvtzs_i16_f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fcvtzs z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_fcvtzs_i16_f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: fcvtzs z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvtzs.nxv8i16.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_fcvtzs_i16_f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_fcvtzs_i16_f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: fcvtzs z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_fcvtzs_i16_f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: fcvtzs z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvtzs.nxv8i16.nxv8f16( %x, %pg, %y) + ret %0 +} + +define @test_fcvtzu_i16_f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_fcvtzu_i16_f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fcvtzu z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_fcvtzu_i16_f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: fcvtzu z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvtzu.nxv8i16.nxv8f16( poison, %pg, %x) + ret %0 +} + +define 
@test_fcvtzu_i16_f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_fcvtzu_i16_f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: fcvtzu z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_fcvtzu_i16_f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: fcvtzu z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvtzu.nxv8i16.nxv8f16( %x, %pg, %y) + ret %0 +} + +define @test_fcvtzs_i32_f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_fcvtzs_i32_f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fcvtzs z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_fcvtzs_i32_f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: fcvtzs z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvtzs.nxv4i32.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_fcvtzs_i32_f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_fcvtzs_i32_f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: fcvtzs z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_fcvtzs_i32_f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: fcvtzs z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvtzs.nxv4i32.nxv4f32( %x, %pg, %y) + ret %0 +} + +define @test_fcvtzu_i32_f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_fcvtzu_i32_f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fcvtzu z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: 
test_fcvtzu_i32_f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: fcvtzu z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvtzu.nxv4i32.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_fcvtzu_i32_f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_fcvtzu_i32_f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: fcvtzu z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_fcvtzu_i32_f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: fcvtzu z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvtzu.nxv4i32.nxv4f32( %x, %pg, %y) + ret %0 +} + +define @test_fcvtzs_i64_f64_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_fcvtzs_i64_f64_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fcvtzs z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_fcvtzs_i64_f64_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: fcvtzs z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvtzs.nxv2i64.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_fcvtzs_i64_f64_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_fcvtzs_i64_f64_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: fcvtzs z0.d, p0/m, z2.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_fcvtzs_i64_f64_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: fcvtzs z0.d, p0/z, z2.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvtzs.nxv2i64.nxv2f64( 
%x, %pg, %y) + ret %0 +} + +define @test_fcvtzu_i64_f64_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_fcvtzu_i64_f64_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fcvtzu z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_fcvtzu_i64_f64_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: fcvtzu z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvtzu.nxv2i64.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_fcvtzu_i64_f64_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_fcvtzu_i64_f64_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: fcvtzu z0.d, p0/m, z2.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_fcvtzu_i64_f64_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: fcvtzu z0.d, p0/z, z2.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fcvtzu.nxv2i64.nxv2f64( %x, %pg, %y) + ret %0 +} From c858bf620c3ab2a4db53e84b9365b553c3ad1aa6 Mon Sep 17 00:00:00 2001 From: Sam Tebbs Date: Tue, 24 Dec 2024 12:08:17 +0000 Subject: [PATCH 013/567] Reland "[LoopVectorizer] Add support for partial reductions" (#120721) This re-lands the reverted #92418 When the VF is small enough so that dividing the VF by the scaling factor results in 1, the reduction phi execution thinks the VF is scalar and sets the reduction's output as a scalar value, tripping assertions expecting a vector value. The latest commit in this PR fixes that by using `State.VF` in the scalar check, rather than the divided VF. 
--------- Co-authored-by: Nicholas Guy --- .../llvm/Analysis/TargetTransformInfo.h | 39 + .../llvm/Analysis/TargetTransformInfoImpl.h | 9 + llvm/lib/Analysis/TargetTransformInfo.cpp | 17 + .../AArch64/AArch64TargetTransformInfo.h | 56 + .../Transforms/Vectorize/LoopVectorize.cpp | 136 +- .../Transforms/Vectorize/VPRecipeBuilder.h | 59 +- llvm/lib/Transforms/Vectorize/VPlan.h | 63 +- .../Transforms/Vectorize/VPlanAnalysis.cpp | 8 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 74 +- llvm/lib/Transforms/Vectorize/VPlanValue.h | 1 + .../AArch64/fully-unrolled-cost.ll | 20 +- .../partial-reduce-dot-product-epilogue.ll | 213 ++ .../partial-reduce-dot-product-neon.ll | 1375 +++++++++++++ .../AArch64/partial-reduce-dot-product.ll | 1733 +++++++++++++++++ .../AArch64/partial-reduce-no-dotprod.ll | 61 + .../LoopVectorize/AArch64/vplan-printing.ll | 93 + 16 files changed, 3927 insertions(+), 30 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 752313ab15858..c6b846f96f162 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -211,6 +211,12 @@ typedef TargetTransformInfo TTI; /// for IR-level transformations. class TargetTransformInfo { public: + enum PartialReductionExtendKind { PR_None, PR_SignExtend, PR_ZeroExtend }; + + /// Get the kind of extension that an instruction represents. 
+ static PartialReductionExtendKind + getPartialReductionExtendKind(Instruction *I); + /// Construct a TTI object using a type implementing the \c Concept /// API below. /// @@ -1280,6 +1286,18 @@ class TargetTransformInfo { /// \return if target want to issue a prefetch in address space \p AS. bool shouldPrefetchAddressSpace(unsigned AS) const; + /// \return The cost of a partial reduction, which is a reduction from a + /// vector to another vector with fewer elements of larger size. They are + /// represented by the llvm.experimental.partial.reduce.add intrinsic, which + /// takes an accumulator and a binary operation operand that itself is fed by + /// two extends. An example of an operation that uses a partial reduction is a + /// dot product, which reduces a vector to another of 4 times fewer elements. + InstructionCost + getPartialReductionCost(unsigned Opcode, Type *InputType, Type *AccumType, + ElementCount VF, PartialReductionExtendKind OpAExtend, + PartialReductionExtendKind OpBExtend, + std::optional BinOp = std::nullopt) const; + /// \return The maximum interleave factor that any transform should try to /// perform for this target. This number depends on the level of parallelism /// and the number of execution units in the CPU. @@ -2107,6 +2125,18 @@ class TargetTransformInfo::Concept { /// \return if target want to issue a prefetch in address space \p AS. virtual bool shouldPrefetchAddressSpace(unsigned AS) const = 0; + /// \return The cost of a partial reduction, which is a reduction from a + /// vector to another vector with fewer elements of larger size. They are + /// represented by the llvm.experimental.partial.reduce.add intrinsic, which + /// takes an accumulator and a binary operation operand that itself is fed by + /// two extends. An example of an operation that uses a partial reduction is a + /// dot product, which reduces a vector to another of 4 times fewer elements. 
+ virtual InstructionCost + getPartialReductionCost(unsigned Opcode, Type *InputType, Type *AccumType, + ElementCount VF, PartialReductionExtendKind OpAExtend, + PartialReductionExtendKind OpBExtend, + std::optional BinOp) const = 0; + virtual unsigned getMaxInterleaveFactor(ElementCount VF) = 0; virtual InstructionCost getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, @@ -2786,6 +2816,15 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { return Impl.shouldPrefetchAddressSpace(AS); } + InstructionCost getPartialReductionCost( + unsigned Opcode, Type *InputType, Type *AccumType, ElementCount VF, + PartialReductionExtendKind OpAExtend, + PartialReductionExtendKind OpBExtend, + std::optional BinOp = std::nullopt) const override { + return Impl.getPartialReductionCost(Opcode, InputType, AccumType, VF, + OpAExtend, OpBExtend, BinOp); + } + unsigned getMaxInterleaveFactor(ElementCount VF) override { return Impl.getMaxInterleaveFactor(VF); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 9c74b2a0c31df..5fa0c46ad292d 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -585,6 +585,15 @@ class TargetTransformInfoImplBase { bool enableWritePrefetching() const { return false; } bool shouldPrefetchAddressSpace(unsigned AS) const { return !AS; } + InstructionCost + getPartialReductionCost(unsigned Opcode, Type *InputType, Type *AccumType, + ElementCount VF, + TTI::PartialReductionExtendKind OpAExtend, + TTI::PartialReductionExtendKind OpBExtend, + std::optional BinOp = std::nullopt) const { + return InstructionCost::getInvalid(); + } + unsigned getMaxInterleaveFactor(ElementCount VF) const { return 1; } InstructionCost getArithmeticInstrCost( diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 
b32dffa9f0fe8..c62e40db0c577 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -863,6 +863,14 @@ bool TargetTransformInfo::shouldPrefetchAddressSpace(unsigned AS) const { return TTIImpl->shouldPrefetchAddressSpace(AS); } +InstructionCost TargetTransformInfo::getPartialReductionCost( + unsigned Opcode, Type *InputType, Type *AccumType, ElementCount VF, + PartialReductionExtendKind OpAExtend, PartialReductionExtendKind OpBExtend, + std::optional BinOp) const { + return TTIImpl->getPartialReductionCost(Opcode, InputType, AccumType, VF, + OpAExtend, OpBExtend, BinOp); +} + unsigned TargetTransformInfo::getMaxInterleaveFactor(ElementCount VF) const { return TTIImpl->getMaxInterleaveFactor(VF); } @@ -974,6 +982,15 @@ InstructionCost TargetTransformInfo::getShuffleCost( return Cost; } +TargetTransformInfo::PartialReductionExtendKind +TargetTransformInfo::getPartialReductionExtendKind(Instruction *I) { + if (isa(I)) + return PR_SignExtend; + if (isa(I)) + return PR_ZeroExtend; + return PR_None; +} + TTI::CastContextHint TargetTransformInfo::getCastContextHint(const Instruction *I) { if (!I) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 83b86e31565e4..2a31cacc203f4 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -23,6 +23,7 @@ #include "llvm/CodeGen/BasicTTIImpl.h" #include "llvm/IR/Function.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/Support/InstructionCost.h" #include #include @@ -357,6 +358,61 @@ class AArch64TTIImpl : public BasicTTIImplBase { return BaseT::isLegalNTLoad(DataType, Alignment); } + InstructionCost + getPartialReductionCost(unsigned Opcode, Type *InputType, Type *AccumType, + ElementCount VF, + TTI::PartialReductionExtendKind OpAExtend, + TTI::PartialReductionExtendKind OpBExtend, + std::optional BinOp) const { + + InstructionCost 
Invalid = InstructionCost::getInvalid(); + InstructionCost Cost(TTI::TCC_Basic); + + if (Opcode != Instruction::Add) + return Invalid; + + EVT InputEVT = EVT::getEVT(InputType); + EVT AccumEVT = EVT::getEVT(AccumType); + + if (VF.isScalable() && !ST->isSVEorStreamingSVEAvailable()) + return Invalid; + if (VF.isFixed() && (!ST->isNeonAvailable() || !ST->hasDotProd())) + return Invalid; + + if (InputEVT == MVT::i8) { + switch (VF.getKnownMinValue()) { + default: + return Invalid; + case 8: + if (AccumEVT == MVT::i32) + Cost *= 2; + else if (AccumEVT != MVT::i64) + return Invalid; + break; + case 16: + if (AccumEVT == MVT::i64) + Cost *= 2; + else if (AccumEVT != MVT::i32) + return Invalid; + break; + } + } else if (InputEVT == MVT::i16) { + // FIXME: Allow i32 accumulator but increase cost, as we would extend + // it to i64. + if (VF.getKnownMinValue() != 8 || AccumEVT != MVT::i64) + return Invalid; + } else + return Invalid; + + if (OpAExtend == TTI::PR_None || OpBExtend == TTI::PR_None) + return Invalid; + + if (!BinOp || (*BinOp) != Instruction::Mul) + return Invalid; + + return Cost; + } + bool enableOrderedReductions() const { return true; } InstructionCost getInterleavedMemoryOpCost( diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 10b998fff02b7..cb828b738d310 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7605,6 +7605,10 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan, } continue; } + // The VPlan-based cost model is more accurate for partial reduction and + // comparing against the legacy cost isn't desirable. 
+ if (isa(&R)) + return true; if (Instruction *UI = GetInstructionForCost(&R)) SeenInstrs.insert(UI); } @@ -8827,6 +8831,103 @@ VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I, return Recipe; } +/// Find all possible partial reductions in the loop and track all of those that +/// are valid so recipes can be formed later. +void VPRecipeBuilder::collectScaledReductions(VFRange &Range) { + // Find all possible partial reductions. + SmallVector, 1> + PartialReductionChains; + for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) + if (std::optional> Pair = + getScaledReduction(Phi, RdxDesc, Range)) + PartialReductionChains.push_back(*Pair); + + // A partial reduction is invalid if any of its extends are used by + // something that isn't another partial reduction. This is because the + // extends are intended to be lowered along with the reduction itself. + + // Build up a set of partial reduction bin ops for efficient use checking. + SmallSet PartialReductionBinOps; + for (const auto &[PartialRdx, _] : PartialReductionChains) + PartialReductionBinOps.insert(PartialRdx.BinOp); + + auto ExtendIsOnlyUsedByPartialReductions = + [&PartialReductionBinOps](Instruction *Extend) { + return all_of(Extend->users(), [&](const User *U) { + return PartialReductionBinOps.contains(U); + }); + }; + + // Check if each use of a chain's two extends is a partial reduction + // and only add those that don't have non-partial reduction users. + for (auto Pair : PartialReductionChains) { + PartialReductionChain Chain = Pair.first; + if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) && + ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB)) + ScaledReductionExitInstrs.insert(std::make_pair(Chain.Reduction, Pair)); + } +} + +std::optional> +VPRecipeBuilder::getScaledReduction(PHINode *PHI, + const RecurrenceDescriptor &Rdx, + VFRange &Range) { + // TODO: Allow scaling reductions when predicating. 
The select at + // the end of the loop chooses between the phi value and most recent + // reduction result, both of which have different VFs to the active lane + // mask when scaling. + if (CM.blockNeedsPredicationForAnyReason(Rdx.getLoopExitInstr()->getParent())) + return std::nullopt; + + auto *Update = dyn_cast(Rdx.getLoopExitInstr()); + if (!Update) + return std::nullopt; + + Value *Op = Update->getOperand(0); + if (Op == PHI) + Op = Update->getOperand(1); + + auto *BinOp = dyn_cast(Op); + if (!BinOp || !BinOp->hasOneUse()) + return std::nullopt; + + using namespace llvm::PatternMatch; + Value *A, *B; + if (!match(BinOp->getOperand(0), m_ZExtOrSExt(m_Value(A))) || + !match(BinOp->getOperand(1), m_ZExtOrSExt(m_Value(B)))) + return std::nullopt; + + Instruction *ExtA = cast(BinOp->getOperand(0)); + Instruction *ExtB = cast(BinOp->getOperand(1)); + + // Check that the extends extend from the same type. + if (A->getType() != B->getType()) + return std::nullopt; + + TTI::PartialReductionExtendKind OpAExtend = + TargetTransformInfo::getPartialReductionExtendKind(ExtA); + TTI::PartialReductionExtendKind OpBExtend = + TargetTransformInfo::getPartialReductionExtendKind(ExtB); + + PartialReductionChain Chain(Rdx.getLoopExitInstr(), ExtA, ExtB, BinOp); + + unsigned TargetScaleFactor = + PHI->getType()->getPrimitiveSizeInBits().getKnownScalarFactor( + A->getType()->getPrimitiveSizeInBits()); + + if (LoopVectorizationPlanner::getDecisionAndClampRange( + [&](ElementCount VF) { + InstructionCost Cost = TTI->getPartialReductionCost( + Update->getOpcode(), A->getType(), PHI->getType(), VF, + OpAExtend, OpBExtend, std::make_optional(BinOp->getOpcode())); + return Cost.isValid(); + }, + Range)) + return std::make_pair(Chain, TargetScaleFactor); + + return std::nullopt; +} + VPRecipeBase * VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, ArrayRef Operands, @@ -8851,9 +8952,14 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 
Legal->getReductionVars().find(Phi)->second; assert(RdxDesc.getRecurrenceStartValue() == Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); - PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, - CM.isInLoopReduction(Phi), - CM.useOrderedReductions(RdxDesc)); + + // If the PHI is used by a partial reduction, set the scale factor. + std::optional> Pair = + getScaledReductionForInstr(RdxDesc.getLoopExitInstr()); + unsigned ScaleFactor = Pair ? Pair->second : 1; + PhiRecipe = new VPReductionPHIRecipe( + Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi), + CM.useOrderedReductions(RdxDesc), ScaleFactor); } else { // TODO: Currently fixed-order recurrences are modeled as chains of // first-order recurrences. If there are no users of the intermediate @@ -8885,6 +8991,9 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, if (isa(Instr) || isa(Instr)) return tryToWidenMemory(Instr, Operands, Range); + if (getScaledReductionForInstr(Instr)) + return tryToCreatePartialReduction(Instr, Operands); + if (!shouldWiden(Instr, Range)) return nullptr; @@ -8905,6 +9014,21 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, return tryToWiden(Instr, Operands, VPBB); } +VPRecipeBase * +VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction, + ArrayRef Operands) { + assert(Operands.size() == 2 && + "Unexpected number of operands for partial reduction"); + + VPValue *BinOp = Operands[0]; + VPValue *Phi = Operands[1]; + if (isa(BinOp->getDefiningRecipe())) + std::swap(BinOp, Phi); + + return new VPPartialReductionRecipe(Reduction->getOpcode(), BinOp, Phi, + Reduction); +} + void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF) { assert(OrigLoop->isInnermost() && "Inner loop expected."); @@ -9222,7 +9346,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None; addCanonicalIVRecipes(*Plan, 
Legal->getWidestInductionType(), HasNUW, DL); - VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder); + VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE, + Builder); // --------------------------------------------------------------------------- // Pre-construction: record ingredients whose recipes we'll need to further @@ -9268,6 +9393,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { bool NeedsBlends = BB != HeaderBB && !BB->phis().empty(); return Legal->blockNeedsPredication(BB) || NeedsBlends; }); + + RecipeBuilder.collectScaledReductions(Range); + auto *MiddleVPBB = Plan->getMiddleBlock(); VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi(); for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index 5d4a3b555981c..cf653e2d3e658 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -21,8 +21,28 @@ namespace llvm { class LoopVectorizationLegality; class LoopVectorizationCostModel; class TargetLibraryInfo; +class TargetTransformInfo; struct HistogramInfo; +/// A chain of instructions that form a partial reduction. +/// Designed to match: reduction_bin_op (bin_op (extend (A), (extend (B))), +/// accumulator). +struct PartialReductionChain { + PartialReductionChain(Instruction *Reduction, Instruction *ExtendA, + Instruction *ExtendB, Instruction *BinOp) + : Reduction(Reduction), ExtendA(ExtendA), ExtendB(ExtendB), BinOp(BinOp) { + } + /// The top-level binary operation that forms the reduction to a scalar + /// after the loop body. + Instruction *Reduction; + /// The extension of each of the inner binary operation's operands. + Instruction *ExtendA; + Instruction *ExtendB; + + /// The binary operation using the extends that is then reduced. 
+ Instruction *BinOp; +}; + /// Helper class to create VPRecipies from IR instructions. class VPRecipeBuilder { /// The VPlan new recipes are added to. @@ -34,6 +54,9 @@ class VPRecipeBuilder { /// Target Library Info. const TargetLibraryInfo *TLI; + // Target Transform Info. + const TargetTransformInfo *TTI; + /// The legality analysis. LoopVectorizationLegality *Legal; @@ -63,6 +86,11 @@ class VPRecipeBuilder { /// created. SmallVector PhisToFix; + /// The set of reduction exit instructions that will be scaled to + /// a smaller VF via partial reductions, paired with the scaling factor. + DenseMap> + ScaledReductionExitInstrs; + /// Check if \p I can be widened at the start of \p Range and possibly /// decrease the range such that the returned value holds for the entire \p /// Range. The function should not be called for memory instructions or calls. @@ -111,13 +139,35 @@ class VPRecipeBuilder { VPHistogramRecipe *tryToWidenHistogram(const HistogramInfo *HI, ArrayRef Operands); + /// Examines reduction operations to see if the target can use a cheaper + /// operation with a wider per-iteration input VF and narrower PHI VF. + /// Returns null if no scaled reduction was found, otherwise a pair with a + /// struct containing reduction information and the scaling factor between the + /// number of elements in the input and output. 
+ std::optional> + getScaledReduction(PHINode *PHI, const RecurrenceDescriptor &Rdx, + VFRange &Range); + public: VPRecipeBuilder(VPlan &Plan, Loop *OrigLoop, const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, LoopVectorizationLegality *Legal, LoopVectorizationCostModel &CM, PredicatedScalarEvolution &PSE, VPBuilder &Builder) - : Plan(Plan), OrigLoop(OrigLoop), TLI(TLI), Legal(Legal), CM(CM), - PSE(PSE), Builder(Builder) {} + : Plan(Plan), OrigLoop(OrigLoop), TLI(TLI), TTI(TTI), Legal(Legal), + CM(CM), PSE(PSE), Builder(Builder) {} + + std::optional> + getScaledReductionForInstr(const Instruction *ExitInst) { + auto It = ScaledReductionExitInstrs.find(ExitInst); + return It == ScaledReductionExitInstrs.end() + ? std::nullopt + : std::make_optional(It->second); + } + + /// Find all possible partial reductions in the loop and track all of those + /// that are valid so recipes can be formed later. + void collectScaledReductions(VFRange &Range); /// Create and return a widened recipe for \p I if one can be created within /// the given VF \p Range. @@ -125,6 +175,11 @@ class VPRecipeBuilder { ArrayRef Operands, VFRange &Range, VPBasicBlock *VPBB); + /// Create and return a partial reduction recipe for a reduction instruction + /// along with binary operation and reduction phi operands. + VPRecipeBase *tryToCreatePartialReduction(Instruction *Reduction, + ArrayRef Operands); + /// Set the recipe created for given ingredient. 
void setRecipe(Instruction *I, VPRecipeBase *R) { assert(!Ingredient2Recipe.contains(I) && diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 6486c6745a680..606780fa7dd5c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -889,6 +889,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue { case VPRecipeBase::VPWidenPointerInductionSC: case VPRecipeBase::VPReductionPHISC: case VPRecipeBase::VPScalarCastSC: + case VPRecipeBase::VPPartialReductionSC: return true; case VPRecipeBase::VPBranchOnMaskSC: case VPRecipeBase::VPInterleaveSC: @@ -2376,23 +2377,28 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe, /// The phi is part of an ordered reduction. Requires IsInLoop to be true. bool IsOrdered; + /// When expanding the reduction PHI, the plan's VF element count is divided + /// by this factor to form the reduction phi's VF. + unsigned VFScaleFactor = 1; + public: /// Create a new VPReductionPHIRecipe for the reduction \p Phi described by \p /// RdxDesc. 
VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc, VPValue &Start, bool IsInLoop = false, - bool IsOrdered = false) + bool IsOrdered = false, unsigned VFScaleFactor = 1) : VPHeaderPHIRecipe(VPDef::VPReductionPHISC, Phi, &Start), - RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered) { + RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered), + VFScaleFactor(VFScaleFactor) { assert((!IsOrdered || IsInLoop) && "IsOrdered requires IsInLoop"); } ~VPReductionPHIRecipe() override = default; VPReductionPHIRecipe *clone() override { - auto *R = - new VPReductionPHIRecipe(cast(getUnderlyingInstr()), RdxDesc, - *getOperand(0), IsInLoop, IsOrdered); + auto *R = new VPReductionPHIRecipe(cast(getUnderlyingInstr()), + RdxDesc, *getOperand(0), IsInLoop, + IsOrdered, VFScaleFactor); R->addOperand(getBackedgeValue()); return R; } @@ -2423,6 +2429,51 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe, bool isInLoop() const { return IsInLoop; } }; +/// A recipe for forming partial reductions. In the loop, an accumulator and +/// vector operand are added together and passed to the next iteration as the +/// next accumulator. After the loop body, the accumulator is reduced to a +/// scalar value. 
+class VPPartialReductionRecipe : public VPSingleDefRecipe { + unsigned Opcode; + +public: + VPPartialReductionRecipe(Instruction *ReductionInst, VPValue *Op0, + VPValue *Op1) + : VPPartialReductionRecipe(ReductionInst->getOpcode(), Op0, Op1, + ReductionInst) {} + VPPartialReductionRecipe(unsigned Opcode, VPValue *Op0, VPValue *Op1, + Instruction *ReductionInst = nullptr) + : VPSingleDefRecipe(VPDef::VPPartialReductionSC, + ArrayRef({Op0, Op1}), ReductionInst), + Opcode(Opcode) { + assert(isa(getOperand(1)->getDefiningRecipe()) && + "Unexpected operand order for partial reduction recipe"); + } + ~VPPartialReductionRecipe() override = default; + + VPPartialReductionRecipe *clone() override { + return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1)); + } + + VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC) + + /// Generate the reduction in the loop. + void execute(VPTransformState &State) override; + + /// Return the cost of this VPPartialReductionRecipe. + InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override; + + /// Get the binary op's opcode. + unsigned getOpcode() const { return Opcode; } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif +}; + /// A recipe for vectorizing a phi-node as a sequence of mask-based select /// instructions. class VPBlendRecipe : public VPSingleDefRecipe { @@ -2630,7 +2681,7 @@ class VPReductionRecipe : public VPSingleDefRecipe { return R && classof(R); } - /// Generate the reduction in the loop + /// Generate the reduction in the loop. void execute(VPTransformState &State) override; /// Return the cost of VPReductionRecipe. 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 35497a7431f76..8fea2c6fd33b6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -231,10 +231,10 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) { [](const auto *R) { return R->getScalarType(); }) .Case( - [this](const VPRecipeBase *R) { - return inferScalarType(R->getOperand(0)); - }) + VPReverseVectorPointerRecipe, VPWidenCanonicalIVRecipe, + VPPartialReductionRecipe>([this](const VPRecipeBase *R) { + return inferScalarType(R->getOperand(0)); + }) .Case( diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 8be2b894acd40..bbee9b0125206 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -292,6 +292,66 @@ InstructionCost VPRecipeBase::computeCost(ElementCount VF, llvm_unreachable("subclasses should implement computeCost"); } +InstructionCost +VPPartialReductionRecipe::computeCost(ElementCount VF, + VPCostContext &Ctx) const { + std::optional Opcode = std::nullopt; + VPRecipeBase *BinOpR = getOperand(0)->getDefiningRecipe(); + if (auto *WidenR = dyn_cast(BinOpR)) + Opcode = std::make_optional(WidenR->getOpcode()); + + VPRecipeBase *ExtAR = BinOpR->getOperand(0)->getDefiningRecipe(); + VPRecipeBase *ExtBR = BinOpR->getOperand(1)->getDefiningRecipe(); + + auto GetExtendKind = [](VPRecipeBase *R) { + auto *WidenCastR = dyn_cast(R); + if (!WidenCastR) + return TargetTransformInfo::PR_None; + if (WidenCastR->getOpcode() == Instruction::CastOps::ZExt) + return TargetTransformInfo::PR_ZeroExtend; + if (WidenCastR->getOpcode() == Instruction::CastOps::SExt) + return TargetTransformInfo::PR_SignExtend; + return TargetTransformInfo::PR_None; + }; + + auto *PhiType = Ctx.Types.inferScalarType(getOperand(1)); + auto *ExtTy = 
Ctx.Types.inferScalarType(ExtAR->getOperand(0)); + + return Ctx.TTI.getPartialReductionCost(getOpcode(), ExtTy, PhiType, VF, + GetExtendKind(ExtAR), + GetExtendKind(ExtBR), Opcode); +} + +void VPPartialReductionRecipe::execute(VPTransformState &State) { + State.setDebugLocFrom(getDebugLoc()); + auto &Builder = State.Builder; + + assert(getOpcode() == Instruction::Add && + "Unhandled partial reduction opcode"); + + Value *BinOpVal = State.get(getOperand(0)); + Value *PhiVal = State.get(getOperand(1)); + assert(PhiVal && BinOpVal && "Phi and Mul must be set"); + + Type *RetTy = PhiVal->getType(); + + CallInst *V = Builder.CreateIntrinsic( + RetTy, Intrinsic::experimental_vector_partial_reduce_add, + {PhiVal, BinOpVal}, nullptr, "partial.reduce"); + + State.set(this, V); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "PARTIAL-REDUCE "; + printAsOperand(O, SlotTracker); + O << " = " << Instruction::getOpcodeName(getOpcode()) << " "; + printOperands(O, SlotTracker); +} +#endif + FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const { assert(OpType == OperationType::FPMathOp && "recipe doesn't have fast math flags"); @@ -3351,6 +3411,10 @@ void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent, void VPReductionPHIRecipe::execute(VPTransformState &State) { auto &Builder = State.Builder; + // If this phi is fed by a scaled reduction then it should output a + // vector with fewer elements than the VF. + ElementCount VF = State.VF.divideCoefficientBy(VFScaleFactor); + // Reductions do not have to start at zero. They can start with // any loop invariant values. VPValue *StartVPV = getStartValue(); @@ -3361,8 +3425,8 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) { // stage #1: We create a new vector PHI node with no incoming edges. 
We'll use // this value when we vectorize all of the instructions that use the PHI. bool ScalarPHI = State.VF.isScalar() || IsInLoop; - Type *VecTy = ScalarPHI ? StartV->getType() - : VectorType::get(StartV->getType(), State.VF); + Type *VecTy = + ScalarPHI ? StartV->getType() : VectorType::get(StartV->getType(), VF); BasicBlock *HeaderBB = State.CFG.PrevBB; assert(State.CurrentVectorLoop->getHeader() == HeaderBB && @@ -3412,13 +3476,13 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) { // Create start and identity vector values for the reduction in the // preheader. // TODO: Introduce recipes in VPlan preheader to create initial values. - Iden = Builder.CreateVectorSplat(State.VF, Iden); + Iden = Builder.CreateVectorSplat(VF, Iden); IRBuilderBase::InsertPointGuard IPBuilder(Builder); Builder.SetInsertPoint(VectorPH->getTerminator()); Constant *Zero = Builder.getInt32(0); StartV = Builder.CreateInsertElement(Iden, StartV, Zero); } else { - Iden = Builder.CreateVectorSplat(State.VF, Iden); + Iden = Builder.CreateVectorSplat(VF, Iden); } } } @@ -3436,6 +3500,8 @@ void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent, printAsOperand(O, SlotTracker); O << " = phi "; printOperands(O, SlotTracker); + if (VFScaleFactor != 1) + O << " (VF scaled by 1/" << VFScaleFactor << ")"; } #endif diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 957a602091c73..7aaf4002b8b3e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -329,6 +329,7 @@ class VPDef { VPInterleaveSC, VPReductionEVLSC, VPReductionSC, + VPPartialReductionSC, VPReplicateSC, VPScalarCastSC, VPScalarIVStepsSC, diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll index 1cfb507a74344..c3e8c895fce24 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll +++ 
b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll @@ -11,10 +11,10 @@ define i64 @test(ptr %a, ptr %b) #0 { ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16 ; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK: Cost for VF 8: 26 +; CHECK: Cost for VF 8: 30 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK: Cost for VF 16: 48 +; CHECK: Cost for VF 16: 56 ; CHECK: LV: Selecting VF: 16 entry: br label %for.body @@ -31,8 +31,8 @@ for.body: ; preds = %entry, %for.body %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %i.iv %1 = load i8, ptr %arrayidx2, align 1 %conv3 = zext i8 %1 to i64 - %mul = mul nuw nsw i64 %conv3, %conv - %add = add i64 %mul, %sum + %div = udiv i64 %conv3, %conv + %add = add i64 %div, %sum %i.iv.next = add nuw nsw i64 %i.iv, 1 %exitcond.not = icmp eq i64 %i.iv.next, 16 br i1 %exitcond.not, label %exit, label %for.body @@ -45,11 +45,11 @@ define i64 @test_external_iv_user(ptr %a, ptr %b) #0 { ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16 ; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK: Cost for VF 8: 26 +; CHECK: Cost for VF 8: 30 ; CHECK-NEXT: Cost of 1 for VF 16: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK: 
Cost for VF 16: 49 +; CHECK: Cost for VF 16: 57 ; CHECK: LV: Selecting VF: vscale x 2 entry: br label %for.body @@ -64,8 +64,8 @@ for.body: ; preds = %entry, %for.body %arrayidx2 = getelementptr inbounds nuw i8, ptr %b, i64 %i.iv.next %1 = load i8, ptr %arrayidx2, align 1 %conv3 = zext i8 %1 to i64 - %mul = mul nuw nsw i64 %conv3, %conv - %add = add i64 %sum, %mul + %div = udiv i64 %conv3, %conv + %add = add i64 %sum, %div %exitcond.not = icmp eq i64 %i.iv.next, 16 br i1 %exitcond.not, label %exit, label %for.body @@ -82,11 +82,11 @@ define i64 @test_two_ivs(ptr %a, ptr %b, i64 %start) #0 { ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ] ; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16 ; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK: Cost for VF 8: 27 +; CHECK: Cost for VF 8: 24 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK: Cost for VF 16: 48 +; CHECK: Cost for VF 16: 42 ; CHECK: LV: Selecting VF: 16 entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll new file mode 100644 index 0000000000000..5cc00daab7ce5 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll @@ -0,0 +1,213 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -mattr=+dotprod -passes=loop-vectorize -force-vector-interleave=1 -S < %s | FileCheck %s + +target datalayout = 
"e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-none-unknown-elf" + +define i32 @dotp(ptr %a, ptr %b) #0 { +; CHECK-LABEL: define i32 @dotp( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: iter.check: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP9]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP10]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP13]] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP15]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0 +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi [ [[TMP18]], [[VEC_EPILOG_PH]] ], [ [[TMP27:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX2]], 0 +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP21]], align 1 +; CHECK-NEXT: [[TMP22:%.*]] = zext [[WIDE_LOAD4]] to +; CHECK-NEXT: 
[[TMP23:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP19]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[TMP23]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP24]], align 1 +; CHECK-NEXT: [[TMP25:%.*]] = zext [[WIDE_LOAD5]] to +; CHECK-NEXT: [[TMP26:%.*]] = mul [[TMP25]], [[TMP22]] +; CHECK-NEXT: [[TMP27]] = add [[TMP26]], [[VEC_PHI3]] +; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX2]], [[TMP17]] +; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP27]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + +define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 { +; CHECK-LABEL: define void @dotp_small_epilogue_vf( +; CHECK-SAME: i64 [[IDX_NEG:%.*]], i8 [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: iter.check: +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 1, [[IDX_NEG]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[ENTRY:%.*]] +; CHECK: 
vector.main.loop.iter.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16 +; CHECK-NEXT: [[IV_NEXT:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = sext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i64> +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr null, align 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <16 x i8> poison, i8 [[TMP2]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT2]], <16 x i8> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[BROADCAST_SPLAT3]] to <16 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = mul <16 x i64> [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP4]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[IV_NEXT]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[ADD:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[IV_NEXT]] +; CHECK-NEXT: br i1 [[CMP_N]], label 
[[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[IND_END6:%.*]] = add i64 [[IDX_NEG]], [[IV_NEXT]] +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[IV_NEXT]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT]], [[WHILE_BODY]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[ACCUM:%.*]] = phi i64 [ [[ADD]], [[WHILE_BODY]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[N_MOD_VF4:%.*]] = urem i64 [[TMP0]], 8 +; CHECK-NEXT: [[N_VEC5:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF4]] +; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[IDX_NEG]], [[N_VEC5]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT7]], <8 x i8> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = sext <8 x i8> [[BROADCAST_SPLAT8]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <1 x i64> zeroinitializer, i64 [[ACCUM]], i32 0 +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ [[IV]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI10:%.*]] = phi <1 x i64> [ [[TMP8]], [[VEC_EPILOG_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr null, align 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <8 x i8> poison, i8 [[TMP9]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT11]], <8 x i8> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = sext <8 x i8> [[BROADCAST_SPLAT12]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = mul <8 x i64> [[TMP10]], [[TMP7]] +; 
CHECK-NEXT: [[PARTIAL_REDUCE13]] = call <1 x i64> @llvm.experimental.vector.partial.reduce.add.v1i64.v8i64(<1 x i64> [[VEC_PHI10]], <8 x i64> [[TMP11]]) +; CHECK-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX9]], 8 +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC5]] +; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> [[PARTIAL_REDUCE13]]) +; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]] +; CHECK-NEXT: br i1 [[CMP_N15]], label [[WHILE_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IDX_NEG]], [[ITER_CHECK:%.*]] ], [ [[IND_END6]], [[WHILE_BODY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL16:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[IV_NEXT]], [[WHILE_BODY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX17:%.*]] = phi i64 [ [[TMP13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[ADD]], [[WHILE_BODY]] ] +; CHECK-NEXT: br label [[WHILE_BODY1:%.*]] +; CHECK: while.body: +; CHECK-NEXT: [[IV_NEG:%.*]] = phi i64 [ [[IV_NEG_NEXT:%.*]], [[WHILE_BODY1]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[IV_NEXT1:%.*]], [[WHILE_BODY1]] ], [ [[BC_RESUME_VAL16]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[ACCUM1:%.*]] = phi i64 [ [[ADD1:%.*]], [[WHILE_BODY1]] ], [ [[BC_MERGE_RDX17]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[IV_NEG_NEXT]] = add i64 [[IV_NEG]], 1 +; CHECK-NEXT: [[EXT_A:%.*]] = sext i8 [[A]] to i64 +; CHECK-NEXT: [[IV_NEXT1]] = add i64 [[IV1]], 1 +; CHECK-NEXT: [[B:%.*]] = load i8, ptr null, align 1 +; CHECK-NEXT: [[EXT_B:%.*]] = sext i8 [[B]] to i64 +; CHECK-NEXT: [[MUL:%.*]] = mul i64 [[EXT_B]], [[EXT_A]] +; CHECK-NEXT: [[ADD1]] = add 
i64 [[MUL]], [[ACCUM1]] +; CHECK-NEXT: [[CMP_IV_NEG:%.*]] = icmp ugt i64 [[IV_NEG]], 0 +; CHECK-NEXT: [[CMP_IV:%.*]] = icmp ne i64 [[IV1]], -1 +; CHECK-NEXT: [[EXITCOND:%.*]] = and i1 [[CMP_IV_NEG]], [[CMP_IV]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_BODY1]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: while.end.loopexit: +; CHECK-NEXT: [[RESULT:%.*]] = phi i64 [ [[ADD1]], [[WHILE_BODY1]] ], [ [[ADD]], [[MIDDLE_BLOCK]] ], [ [[TMP13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret void +; +entry: + br label %while.body + +while.body: ; preds = %while.body, %entry + %iv.neg = phi i64 [ %iv.neg.next, %while.body ], [ %idx.neg, %entry ] + %iv = phi i64 [ %iv.next, %while.body ], [ 0, %entry ] + %accum = phi i64 [ %add, %while.body ], [ 0, %entry ] + %iv.neg.next = add i64 %iv.neg, 1 + %ext.a = sext i8 %a to i64 + %iv.next = add i64 %iv, 1 + %b = load i8, ptr null, align 1 + %ext.b = sext i8 %b to i64 + %mul = mul i64 %ext.b, %ext.a + %add = add i64 %mul, %accum + %cmp.iv.neg = icmp ugt i64 %iv.neg, 0 + %cmp.iv = icmp ne i64 %iv, -1 + %exitcond = and i1 %cmp.iv.neg, %cmp.iv + br i1 %exitcond, label %while.body, label %while.end.loopexit + +while.end.loopexit: ; preds = %while.body + %result = phi i64 [ %add, %while.body ] + ret void +} + +attributes #0 = { vscale_range(1,16) "target-features"="+sve" } +attributes #1 = { "target-cpu"="apple-m1" } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll new file mode 100644 index 0000000000000..c66695f1b50f0 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll @@ -0,0 +1,1375 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon,+dotprod -force-vector-interleave=1 -S < %s | FileCheck %s 
--check-prefixes=CHECK-INTERLEAVE1 +; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon,+dotprod -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVED +; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon,+dotprod -force-vector-interleave=1 -vectorizer-maximize-bandwidth -S < %s | FileCheck %s --check-prefixes=CHECK-MAXBW + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-none-unknown-elf" + +define i32 @dotp(ptr %a, ptr %b) { +; CHECK-INTERLEAVE1-LABEL: define i32 @dotp( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] +; 
CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]]) +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @dotp( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; 
CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]] +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-MAXBW-LABEL: define i32 @dotp( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: 
vector.ph: +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]]) +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-MAXBW-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-MAXBW-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 
+ %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + +define i32 @not_dotp_different_types(ptr %a, ptr %b) { +; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_different_types( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP69:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 +; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 +; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]] +; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]] +; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]] +; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]] +; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] +; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]] +; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]] +; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]] +; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]] +; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]] +; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]] +; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = load i16, ptr [[TMP19]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = load i16, ptr [[TMP20]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = load i16, ptr [[TMP21]], align 2 
+; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = load i16, ptr [[TMP22]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = load i16, ptr [[TMP23]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP40:%.*]] = load i16, ptr [[TMP24]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP41:%.*]] = load i16, ptr [[TMP25]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = load i16, ptr [[TMP26]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP43:%.*]] = load i16, ptr [[TMP27]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP44:%.*]] = load i16, ptr [[TMP28]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = load i16, ptr [[TMP29]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP46:%.*]] = load i16, ptr [[TMP30]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP47:%.*]] = load i16, ptr [[TMP31]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP48:%.*]] = load i16, ptr [[TMP32]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP49:%.*]] = load i16, ptr [[TMP33]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP50:%.*]] = load i16, ptr [[TMP34]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP51:%.*]] = insertelement <16 x i16> poison, i16 [[TMP35]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP52:%.*]] = insertelement <16 x i16> [[TMP51]], i16 [[TMP36]], i32 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP53:%.*]] = insertelement <16 x i16> [[TMP52]], i16 [[TMP37]], i32 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP54:%.*]] = insertelement <16 x i16> [[TMP53]], i16 [[TMP38]], i32 3 +; CHECK-INTERLEAVE1-NEXT: [[TMP55:%.*]] = insertelement <16 x i16> [[TMP54]], i16 [[TMP39]], i32 4 +; CHECK-INTERLEAVE1-NEXT: [[TMP56:%.*]] = insertelement <16 x i16> [[TMP55]], i16 [[TMP40]], i32 5 +; CHECK-INTERLEAVE1-NEXT: [[TMP57:%.*]] = insertelement <16 x i16> [[TMP56]], i16 [[TMP41]], i32 6 +; CHECK-INTERLEAVE1-NEXT: [[TMP58:%.*]] = insertelement <16 x i16> [[TMP57]], i16 [[TMP42]], i32 7 +; CHECK-INTERLEAVE1-NEXT: [[TMP59:%.*]] = insertelement <16 x i16> [[TMP58]], i16 [[TMP43]], i32 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP60:%.*]] = insertelement <16 x i16> [[TMP59]], i16 [[TMP44]], i32 9 +; 
CHECK-INTERLEAVE1-NEXT: [[TMP61:%.*]] = insertelement <16 x i16> [[TMP60]], i16 [[TMP45]], i32 10 +; CHECK-INTERLEAVE1-NEXT: [[TMP62:%.*]] = insertelement <16 x i16> [[TMP61]], i16 [[TMP46]], i32 11 +; CHECK-INTERLEAVE1-NEXT: [[TMP63:%.*]] = insertelement <16 x i16> [[TMP62]], i16 [[TMP47]], i32 12 +; CHECK-INTERLEAVE1-NEXT: [[TMP64:%.*]] = insertelement <16 x i16> [[TMP63]], i16 [[TMP48]], i32 13 +; CHECK-INTERLEAVE1-NEXT: [[TMP65:%.*]] = insertelement <16 x i16> [[TMP64]], i16 [[TMP49]], i32 14 +; CHECK-INTERLEAVE1-NEXT: [[TMP66:%.*]] = insertelement <16 x i16> [[TMP65]], i16 [[TMP50]], i32 15 +; CHECK-INTERLEAVE1-NEXT: [[TMP67:%.*]] = zext <16 x i16> [[TMP66]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP68:%.*]] = mul <16 x i32> [[TMP67]], [[TMP18]] +; CHECK-INTERLEAVE1-NEXT: [[TMP69]] = add <16 x i32> [[TMP68]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_different_types( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP137:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP138:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 17 +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 18 +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], 19 +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 20 +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 21 +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 22 +; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 23 +; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = add i64 [[INDEX]], 24 +; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], 25 +; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = add i64 [[INDEX]], 26 +; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = add i64 [[INDEX]], 27 +; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = add i64 [[INDEX]], 28 +; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = add i64 [[INDEX]], 29 +; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = add i64 [[INDEX]], 30 +; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 31 +; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr 
[[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[TMP32]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[TMP32]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP33]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP34]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] +; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]] +; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]] +; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]] +; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] +; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]] +; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]] +; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]] +; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]] +; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP16]] +; 
CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP17]] +; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP18]] +; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP19]] +; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP20]] +; CHECK-INTERLEAVED-NEXT: [[TMP58:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP21]] +; CHECK-INTERLEAVED-NEXT: [[TMP59:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP22]] +; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP23]] +; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP24]] +; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP25]] +; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP26]] +; CHECK-INTERLEAVED-NEXT: [[TMP64:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP27]] +; CHECK-INTERLEAVED-NEXT: [[TMP65:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP28]] +; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP29]] +; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP30]] +; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP31]] +; CHECK-INTERLEAVED-NEXT: [[TMP69:%.*]] = load i16, ptr [[TMP37]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP70:%.*]] = load i16, ptr [[TMP38]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP71:%.*]] = load i16, ptr [[TMP39]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = load i16, ptr [[TMP40]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = load i16, ptr [[TMP41]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP74:%.*]] = load i16, ptr [[TMP42]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP75:%.*]] = load i16, ptr [[TMP43]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP76:%.*]] = load i16, ptr [[TMP44]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP77:%.*]] = load i16, ptr [[TMP45]], align 2 +; CHECK-INTERLEAVED-NEXT: 
[[TMP78:%.*]] = load i16, ptr [[TMP46]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP79:%.*]] = load i16, ptr [[TMP47]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP80:%.*]] = load i16, ptr [[TMP48]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP81:%.*]] = load i16, ptr [[TMP49]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP82:%.*]] = load i16, ptr [[TMP50]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP83:%.*]] = load i16, ptr [[TMP51]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP84:%.*]] = load i16, ptr [[TMP52]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP85:%.*]] = insertelement <16 x i16> poison, i16 [[TMP69]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP86:%.*]] = insertelement <16 x i16> [[TMP85]], i16 [[TMP70]], i32 1 +; CHECK-INTERLEAVED-NEXT: [[TMP87:%.*]] = insertelement <16 x i16> [[TMP86]], i16 [[TMP71]], i32 2 +; CHECK-INTERLEAVED-NEXT: [[TMP88:%.*]] = insertelement <16 x i16> [[TMP87]], i16 [[TMP72]], i32 3 +; CHECK-INTERLEAVED-NEXT: [[TMP89:%.*]] = insertelement <16 x i16> [[TMP88]], i16 [[TMP73]], i32 4 +; CHECK-INTERLEAVED-NEXT: [[TMP90:%.*]] = insertelement <16 x i16> [[TMP89]], i16 [[TMP74]], i32 5 +; CHECK-INTERLEAVED-NEXT: [[TMP91:%.*]] = insertelement <16 x i16> [[TMP90]], i16 [[TMP75]], i32 6 +; CHECK-INTERLEAVED-NEXT: [[TMP92:%.*]] = insertelement <16 x i16> [[TMP91]], i16 [[TMP76]], i32 7 +; CHECK-INTERLEAVED-NEXT: [[TMP93:%.*]] = insertelement <16 x i16> [[TMP92]], i16 [[TMP77]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[TMP94:%.*]] = insertelement <16 x i16> [[TMP93]], i16 [[TMP78]], i32 9 +; CHECK-INTERLEAVED-NEXT: [[TMP95:%.*]] = insertelement <16 x i16> [[TMP94]], i16 [[TMP79]], i32 10 +; CHECK-INTERLEAVED-NEXT: [[TMP96:%.*]] = insertelement <16 x i16> [[TMP95]], i16 [[TMP80]], i32 11 +; CHECK-INTERLEAVED-NEXT: [[TMP97:%.*]] = insertelement <16 x i16> [[TMP96]], i16 [[TMP81]], i32 12 +; CHECK-INTERLEAVED-NEXT: [[TMP98:%.*]] = insertelement <16 x i16> [[TMP97]], i16 [[TMP82]], i32 13 +; CHECK-INTERLEAVED-NEXT: [[TMP99:%.*]] = insertelement <16 x i16> [[TMP98]], i16 [[TMP83]], 
i32 14 +; CHECK-INTERLEAVED-NEXT: [[TMP100:%.*]] = insertelement <16 x i16> [[TMP99]], i16 [[TMP84]], i32 15 +; CHECK-INTERLEAVED-NEXT: [[TMP101:%.*]] = load i16, ptr [[TMP53]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP102:%.*]] = load i16, ptr [[TMP54]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP103:%.*]] = load i16, ptr [[TMP55]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP104:%.*]] = load i16, ptr [[TMP56]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP105:%.*]] = load i16, ptr [[TMP57]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP106:%.*]] = load i16, ptr [[TMP58]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP107:%.*]] = load i16, ptr [[TMP59]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP108:%.*]] = load i16, ptr [[TMP60]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP109:%.*]] = load i16, ptr [[TMP61]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP110:%.*]] = load i16, ptr [[TMP62]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP111:%.*]] = load i16, ptr [[TMP63]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP112:%.*]] = load i16, ptr [[TMP64]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP113:%.*]] = load i16, ptr [[TMP65]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP114:%.*]] = load i16, ptr [[TMP66]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP115:%.*]] = load i16, ptr [[TMP67]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP116:%.*]] = load i16, ptr [[TMP68]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP117:%.*]] = insertelement <16 x i16> poison, i16 [[TMP101]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP118:%.*]] = insertelement <16 x i16> [[TMP117]], i16 [[TMP102]], i32 1 +; CHECK-INTERLEAVED-NEXT: [[TMP119:%.*]] = insertelement <16 x i16> [[TMP118]], i16 [[TMP103]], i32 2 +; CHECK-INTERLEAVED-NEXT: [[TMP120:%.*]] = insertelement <16 x i16> [[TMP119]], i16 [[TMP104]], i32 3 +; CHECK-INTERLEAVED-NEXT: [[TMP121:%.*]] = insertelement <16 x i16> [[TMP120]], i16 [[TMP105]], i32 4 +; CHECK-INTERLEAVED-NEXT: [[TMP122:%.*]] = insertelement <16 x i16> [[TMP121]], i16 [[TMP106]], i32 5 +; CHECK-INTERLEAVED-NEXT: [[TMP123:%.*]] = 
insertelement <16 x i16> [[TMP122]], i16 [[TMP107]], i32 6 +; CHECK-INTERLEAVED-NEXT: [[TMP124:%.*]] = insertelement <16 x i16> [[TMP123]], i16 [[TMP108]], i32 7 +; CHECK-INTERLEAVED-NEXT: [[TMP125:%.*]] = insertelement <16 x i16> [[TMP124]], i16 [[TMP109]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[TMP126:%.*]] = insertelement <16 x i16> [[TMP125]], i16 [[TMP110]], i32 9 +; CHECK-INTERLEAVED-NEXT: [[TMP127:%.*]] = insertelement <16 x i16> [[TMP126]], i16 [[TMP111]], i32 10 +; CHECK-INTERLEAVED-NEXT: [[TMP128:%.*]] = insertelement <16 x i16> [[TMP127]], i16 [[TMP112]], i32 11 +; CHECK-INTERLEAVED-NEXT: [[TMP129:%.*]] = insertelement <16 x i16> [[TMP128]], i16 [[TMP113]], i32 12 +; CHECK-INTERLEAVED-NEXT: [[TMP130:%.*]] = insertelement <16 x i16> [[TMP129]], i16 [[TMP114]], i32 13 +; CHECK-INTERLEAVED-NEXT: [[TMP131:%.*]] = insertelement <16 x i16> [[TMP130]], i16 [[TMP115]], i32 14 +; CHECK-INTERLEAVED-NEXT: [[TMP132:%.*]] = insertelement <16 x i16> [[TMP131]], i16 [[TMP116]], i32 15 +; CHECK-INTERLEAVED-NEXT: [[TMP133:%.*]] = zext <16 x i16> [[TMP100]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP134:%.*]] = zext <16 x i16> [[TMP132]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP135:%.*]] = mul <16 x i32> [[TMP133]], [[TMP35]] +; CHECK-INTERLEAVED-NEXT: [[TMP136:%.*]] = mul <16 x i32> [[TMP134]], [[TMP36]] +; CHECK-INTERLEAVED-NEXT: [[TMP137]] = add <16 x i32> [[TMP135]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP139:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP139]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; +; CHECK-MAXBW-LABEL: define i32 @not_dotp_different_types( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; 
CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP69:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 +; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 +; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 +; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] +; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]] +; CHECK-MAXBW-NEXT: 
[[TMP24:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]] +; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]] +; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] +; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]] +; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]] +; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]] +; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]] +; CHECK-MAXBW-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]] +; CHECK-MAXBW-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]] +; CHECK-MAXBW-NEXT: [[TMP35:%.*]] = load i16, ptr [[TMP19]], align 2 +; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = load i16, ptr [[TMP20]], align 2 +; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = load i16, ptr [[TMP21]], align 2 +; CHECK-MAXBW-NEXT: [[TMP38:%.*]] = load i16, ptr [[TMP22]], align 2 +; CHECK-MAXBW-NEXT: [[TMP39:%.*]] = load i16, ptr [[TMP23]], align 2 +; CHECK-MAXBW-NEXT: [[TMP40:%.*]] = load i16, ptr [[TMP24]], align 2 +; CHECK-MAXBW-NEXT: [[TMP41:%.*]] = load i16, ptr [[TMP25]], align 2 +; CHECK-MAXBW-NEXT: [[TMP42:%.*]] = load i16, ptr [[TMP26]], align 2 +; CHECK-MAXBW-NEXT: [[TMP43:%.*]] = load i16, ptr [[TMP27]], align 2 +; CHECK-MAXBW-NEXT: [[TMP44:%.*]] = load i16, ptr [[TMP28]], align 2 +; CHECK-MAXBW-NEXT: [[TMP45:%.*]] = load i16, ptr [[TMP29]], align 2 +; CHECK-MAXBW-NEXT: [[TMP46:%.*]] = load i16, ptr [[TMP30]], align 2 +; CHECK-MAXBW-NEXT: [[TMP47:%.*]] = load i16, ptr [[TMP31]], align 2 +; CHECK-MAXBW-NEXT: [[TMP48:%.*]] = load i16, ptr [[TMP32]], align 2 +; CHECK-MAXBW-NEXT: [[TMP49:%.*]] = load i16, ptr [[TMP33]], align 2 +; CHECK-MAXBW-NEXT: [[TMP50:%.*]] = load i16, ptr [[TMP34]], align 2 +; CHECK-MAXBW-NEXT: 
[[TMP51:%.*]] = insertelement <16 x i16> poison, i16 [[TMP35]], i32 0 +; CHECK-MAXBW-NEXT: [[TMP52:%.*]] = insertelement <16 x i16> [[TMP51]], i16 [[TMP36]], i32 1 +; CHECK-MAXBW-NEXT: [[TMP53:%.*]] = insertelement <16 x i16> [[TMP52]], i16 [[TMP37]], i32 2 +; CHECK-MAXBW-NEXT: [[TMP54:%.*]] = insertelement <16 x i16> [[TMP53]], i16 [[TMP38]], i32 3 +; CHECK-MAXBW-NEXT: [[TMP55:%.*]] = insertelement <16 x i16> [[TMP54]], i16 [[TMP39]], i32 4 +; CHECK-MAXBW-NEXT: [[TMP56:%.*]] = insertelement <16 x i16> [[TMP55]], i16 [[TMP40]], i32 5 +; CHECK-MAXBW-NEXT: [[TMP57:%.*]] = insertelement <16 x i16> [[TMP56]], i16 [[TMP41]], i32 6 +; CHECK-MAXBW-NEXT: [[TMP58:%.*]] = insertelement <16 x i16> [[TMP57]], i16 [[TMP42]], i32 7 +; CHECK-MAXBW-NEXT: [[TMP59:%.*]] = insertelement <16 x i16> [[TMP58]], i16 [[TMP43]], i32 8 +; CHECK-MAXBW-NEXT: [[TMP60:%.*]] = insertelement <16 x i16> [[TMP59]], i16 [[TMP44]], i32 9 +; CHECK-MAXBW-NEXT: [[TMP61:%.*]] = insertelement <16 x i16> [[TMP60]], i16 [[TMP45]], i32 10 +; CHECK-MAXBW-NEXT: [[TMP62:%.*]] = insertelement <16 x i16> [[TMP61]], i16 [[TMP46]], i32 11 +; CHECK-MAXBW-NEXT: [[TMP63:%.*]] = insertelement <16 x i16> [[TMP62]], i16 [[TMP47]], i32 12 +; CHECK-MAXBW-NEXT: [[TMP64:%.*]] = insertelement <16 x i16> [[TMP63]], i16 [[TMP48]], i32 13 +; CHECK-MAXBW-NEXT: [[TMP65:%.*]] = insertelement <16 x i16> [[TMP64]], i16 [[TMP49]], i32 14 +; CHECK-MAXBW-NEXT: [[TMP66:%.*]] = insertelement <16 x i16> [[TMP65]], i16 [[TMP50]], i32 15 +; CHECK-MAXBW-NEXT: [[TMP67:%.*]] = zext <16 x i16> [[TMP66]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP68:%.*]] = mul <16 x i32> [[TMP67]], [[TMP18]] +; CHECK-MAXBW-NEXT: [[TMP69]] = add <16 x i32> [[TMP68]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-MAXBW-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-MAXBW-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; +entry: + br label %for.body + 
+for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i16, ptr %gep.b, align 2 + %ext.b = zext i16 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + +define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) { +; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_not_loop_carried( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ , [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext 
<16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP7]] = mul <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP7]], [[TMP8]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_loop_carried( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ , [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP7]] = mul <16 x i32> [[TMP6]], [[TMP3]] +; 
CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP7]], [[TMP8]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; +; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_loop_carried( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ , [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP7]] = mul <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP7]], [[TMP8]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 
[[INDEX]], 16 +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-MAXBW-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %mul, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + +define i32 @not_dotp_not_phi(ptr %a, ptr %b) { +; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_not_phi( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ , [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] 
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[TMP6]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_phi( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ , [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to 
<16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; +; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_phi( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ , [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[TMP6]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-MAXBW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label 
[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, %ext.b + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + +define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { +; CHECK-INTERLEAVE1-LABEL: define i32 @dotp_unrolled( +; CHECK-INTERLEAVE1-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16 +; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16 +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: 
[[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP0]], 3 +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP29]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x 
i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]]) +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE13]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE7]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @dotp_unrolled( +; CHECK-INTERLEAVED-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16 +; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16 +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = 
phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 2 +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP0]], 3 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1 +; 
CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]]) +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]]) +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]]) +; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 +; 
CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP29]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE13]]) +; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]]) +; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE7]]) +; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-MAXBW-LABEL: define i32 @dotp_unrolled( +; CHECK-MAXBW-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16 +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: 
vector.ph: +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16 +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1 +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 2 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP0]], 3 +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1 +; 
CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1 +; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]]) +; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 +; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1 +; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]]) +; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1 +; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1 +; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> 
[[VEC_PHI1]], <16 x i32> [[TMP26]]) +; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1 +; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP29]], align 1 +; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]]) +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE13]]) +; CHECK-MAXBW-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]]) +; CHECK-MAXBW-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE7]]) +; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum3 = phi i32 [ 0, %entry ], [ %add.a3, %for.body ] + %accum2 = phi i32 [ 0, %entry ], [ %add.a2, %for.body ] + %accum1 = phi i32 [ 0, %entry ], [ %add.a1, %for.body ] + %accum0 = phi i32 [ 0, %entry ], [ %add.a0, %for.body ] 
+ %gep.a0 = getelementptr inbounds i8, ptr %a, i64 %iv + %gep.b0 = getelementptr inbounds i8, ptr %b, i64 %iv + %offset.1 = or disjoint i64 %iv, 1 + %gep.a1 = getelementptr inbounds i8, ptr %a, i64 %offset.1 + %gep.b1 = getelementptr inbounds i8, ptr %b, i64 %offset.1 + %offset.2 = or disjoint i64 %iv, 2 + %gep.a2 = getelementptr inbounds i8, ptr %a, i64 %offset.2 + %gep.b2 = getelementptr inbounds i8, ptr %b, i64 %offset.2 + %offset.3 = or disjoint i64 %iv, 3 + %gep.a3 = getelementptr inbounds i8, ptr %a, i64 %offset.3 + %gep.b3 = getelementptr inbounds i8, ptr %b, i64 %offset.3 + %load.a0 = load i8, ptr %gep.a0, align 1 + %ext.a0 = sext i8 %load.a0 to i32 + %load.b0 = load i8, ptr %gep.b0, align 1 + %ext.b0 = sext i8 %load.b0 to i32 + %mul.a0 = mul nsw i32 %ext.b0, %ext.a0 + %add.a0 = add nsw i32 %mul.a0, %accum0 + %load.a1 = load i8, ptr %gep.a1, align 1 + %ext.a1 = sext i8 %load.a1 to i32 + %load.b1 = load i8, ptr %gep.b1, align 1 + %ext.b1 = sext i8 %load.b1 to i32 + %mul.a1 = mul nsw i32 %ext.a1, %ext.b1 + %add.a1 = add nsw i32 %mul.a1, %accum1 + %load.a2 = load i8, ptr %gep.a2, align 1 + %ext.a2 = sext i8 %load.a2 to i32 + %load.b2 = load i8, ptr %gep.b2, align 1 + %ext.b2 = sext i8 %load.b2 to i32 + %mul.a2 = mul nsw i32 %ext.a2, %ext.b2 + %add.a2 = add nsw i32 %mul.a2, %accum2 + %load.a3 = load i8, ptr %gep.a3, align 1 + %ext.a3 = sext i8 %load.a3 to i32 + %load.b3 = load i8, ptr %gep.b3, align 1 + %ext.b3 = sext i8 %load.b3 to i32 + %mul.a3 = mul nsw i32 %ext.a3, %ext.b3 + %add.a3 = add nsw i32 %mul.a3, %accum3 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %num_in + br i1 %exitcond.not, label %exit, label %for.body + +exit: ; preds = %for.body + %result0 = add nsw i32 %add.a0, %add.a1 + %result1 = add nsw i32 %add.a2, %add.a3 + %result = add nsw i32 %result0, %result1 + ret i32 %result +} + +define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) { +; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_predicated( +; 
CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16 +; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul nsw <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]]) +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: 
br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated( +; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 32 +; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 32 +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 
x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP9]], [[TMP4]] +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated( +; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr 
[[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16 +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul nsw <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]]) +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = call i32 
@llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = sext i8 %load.a to i32 + %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = sext i8 %load.b to i32 + %mul = mul nsw i32 %ext.b, %ext.a + %add = add nsw i32 %mul, %accum + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %N + br i1 %exitcond.not, label %exit, label %for.body + +exit: ; preds = %for.body + ret i32 %add +} + +define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) { +; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_predicated_pragma( +; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 15 +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16 +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ 
[[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP180:%.*]], [[PRED_LOAD_CONTINUE62]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 +; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 +; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 +; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated_pragma( +; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[N_RND_UP:%.*]] = add i64 
[[N]], 15 +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16 +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP180:%.*]], [[PRED_LOAD_CONTINUE62]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = 
add i64 [[INDEX]], 15 +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; +; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated_pragma( +; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 15 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16 +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ] +; CHECK-MAXBW-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP180:%.*]], [[PRED_LOAD_CONTINUE62]] ] +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 +; CHECK-MAXBW-NEXT: 
[[TMP7:%.*]] = add i64 [[INDEX]], 7 +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 +; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 +; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 +; CHECK-MAXBW-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr inbounds i8, ptr %b, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = sext i8 %load.a to i32 + %gep.a2 = getelementptr inbounds i8, ptr %a, i64 %iv + %load.b = load i8, ptr %gep.a2, align 1 + %ext.b = sext i8 %load.b to i32 + %mul = mul nsw i32 %ext.b, %ext.a + %add = add nsw i32 %mul, %accum + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %N + br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !7 + +exit: ; preds = %for.body + ret i32 %add +} + +define i32 @not_dotp_extend_user(ptr %a, ptr %b) { +; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_extend_user( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ 
[[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15 +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = 
phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: [[RESULT:%.*]] = add i32 [[ADD_LCSSA]], [[EXT_B_LCSSA]] +; CHECK-INTERLEAVE1-NEXT: ret i32 [[RESULT]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_extend_user( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ 
zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]] +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[TMP13]] = add <16 x i32> [[TMP11]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP14]] = add <16 x i32> [[TMP12]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop 
[[LOOP17:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP14]], [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = extractelement <16 x i32> [[TMP10]], i32 15 +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVED: for.body: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] +; 
CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: [[RESULT:%.*]] = add i32 [[ADD_LCSSA]], [[EXT_B_LCSSA]] +; CHECK-INTERLEAVED-NEXT: ret i32 [[RESULT]] +; +; CHECK-MAXBW-LABEL: define i32 @not_dotp_extend_user( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-MAXBW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> 
[[TMP8]]) +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15 +; CHECK-MAXBW-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: [[RESULT:%.*]] = add i32 [[ADD_LCSSA]], [[EXT_B_LCSSA]] +; CHECK-MAXBW-NEXT: ret i32 [[RESULT]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], 
[ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + %result = add i32 %add, %ext.b + ret i32 %result +} + +!7 = distinct !{!7, !8, !9, !10} +!8 = !{!"llvm.loop.mustprogress"} +!9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} +!10 = !{!"llvm.loop.vectorize.enable", i1 true} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll new file mode 100644 index 0000000000000..9530947232192 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll @@ -0,0 +1,1733 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVE1 +; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVED +; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -vectorizer-maximize-bandwidth -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-MAXBW + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-none-unknown-elf" + +define i32 @dotp(ptr %a, ptr %b) #0 { +; CHECK-INTERLEAVE1-LABEL: define i32 @dotp( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; 
CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY1:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY1]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP14:%.*]], [[VECTOR_BODY1]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = add i64 [[INDEX1]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP12]] +; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP17]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]] +; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP21]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = zext [[WIDE_LOAD1]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul [[TMP18]], [[TMP9]] +; CHECK-INTERLEAVE1-NEXT: [[TMP14]] = add [[TMP13]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP15]], label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_BODY1]], 
!llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP14]]) +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP27]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[ADD:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[VECTOR_BODY]] ], [ [[TMP27]], [[VEC_EPILOG_ITER_CHECK]] ] +; 
CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @dotp( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY1:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY1]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP23:%.*]], [[VECTOR_BODY1]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP24:%.*]], [[VECTOR_BODY1]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX1]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP17]] +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP14]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP20]], i64 [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP21]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext [[WIDE_LOAD]] to +; 
CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext [[WIDE_LOAD2]] to +; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP17]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP28]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP28]], i64 [[TMP27]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP9]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP18]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = zext [[WIDE_LOAD3]] to +; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = zext [[WIDE_LOAD4]] to +; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul [[TMP19]], [[TMP12]] +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul [[TMP29]], [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add [[TMP30]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add [[TMP22]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP25]], label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add [[TMP24]], [[TMP23]] +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ] +; CHECK-INTERLEAVED-NEXT: br label 
[[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: for.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[ADD:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[VECTOR_BODY]] ], [ [[TMP16]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-INTERLEAVED-NEXT: ret i32 [[ADD_LCSSA]] +; +; CHECK-MAXBW-LABEL: define i32 @dotp( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 0, 
[[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP8]], align 1 +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext [[WIDE_LOAD2]] to +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP14]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP15]], align 1 +; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = zext [[WIDE_LOAD4]] to +; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = mul [[TMP20]], [[TMP13]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE5]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI1]], [[TMP22]]) +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add 
i32 %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + +define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 { +; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_different_types( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP69:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 +; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 +; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 +; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; 
CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]] +; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]] +; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]] +; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]] +; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] +; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]] +; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]] +; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]] +; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]] +; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]] +; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]] +; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = load i16, ptr [[TMP19]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = load i16, ptr [[TMP20]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = load i16, ptr [[TMP21]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = load i16, ptr [[TMP22]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP41:%.*]] = load i16, ptr [[TMP23]], align 2 +; 
CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = load i16, ptr [[TMP24]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP43:%.*]] = load i16, ptr [[TMP25]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP44:%.*]] = load i16, ptr [[TMP26]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = load i16, ptr [[TMP27]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP46:%.*]] = load i16, ptr [[TMP28]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP55:%.*]] = load i16, ptr [[TMP29]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP56:%.*]] = load i16, ptr [[TMP30]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP47:%.*]] = load i16, ptr [[TMP31]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP48:%.*]] = load i16, ptr [[TMP32]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP49:%.*]] = load i16, ptr [[TMP33]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP50:%.*]] = load i16, ptr [[TMP34]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP51:%.*]] = insertelement <16 x i16> poison, i16 [[TMP35]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP52:%.*]] = insertelement <16 x i16> [[TMP51]], i16 [[TMP36]], i32 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP53:%.*]] = insertelement <16 x i16> [[TMP52]], i16 [[TMP37]], i32 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP54:%.*]] = insertelement <16 x i16> [[TMP53]], i16 [[TMP38]], i32 3 +; CHECK-INTERLEAVE1-NEXT: [[TMP57:%.*]] = insertelement <16 x i16> [[TMP54]], i16 [[TMP41]], i32 4 +; CHECK-INTERLEAVE1-NEXT: [[TMP58:%.*]] = insertelement <16 x i16> [[TMP57]], i16 [[TMP42]], i32 5 +; CHECK-INTERLEAVE1-NEXT: [[TMP59:%.*]] = insertelement <16 x i16> [[TMP58]], i16 [[TMP43]], i32 6 +; CHECK-INTERLEAVE1-NEXT: [[TMP60:%.*]] = insertelement <16 x i16> [[TMP59]], i16 [[TMP44]], i32 7 +; CHECK-INTERLEAVE1-NEXT: [[TMP61:%.*]] = insertelement <16 x i16> [[TMP60]], i16 [[TMP45]], i32 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP96:%.*]] = insertelement <16 x i16> [[TMP61]], i16 [[TMP46]], i32 9 +; CHECK-INTERLEAVE1-NEXT: [[TMP97:%.*]] = insertelement <16 x i16> [[TMP96]], i16 [[TMP55]], i32 10 +; CHECK-INTERLEAVE1-NEXT: [[TMP62:%.*]] = insertelement <16 x i16> 
[[TMP97]], i16 [[TMP56]], i32 11 +; CHECK-INTERLEAVE1-NEXT: [[TMP63:%.*]] = insertelement <16 x i16> [[TMP62]], i16 [[TMP47]], i32 12 +; CHECK-INTERLEAVE1-NEXT: [[TMP64:%.*]] = insertelement <16 x i16> [[TMP63]], i16 [[TMP48]], i32 13 +; CHECK-INTERLEAVE1-NEXT: [[TMP65:%.*]] = insertelement <16 x i16> [[TMP64]], i16 [[TMP49]], i32 14 +; CHECK-INTERLEAVE1-NEXT: [[TMP66:%.*]] = insertelement <16 x i16> [[TMP65]], i16 [[TMP50]], i32 15 +; CHECK-INTERLEAVE1-NEXT: [[TMP67:%.*]] = zext <16 x i16> [[TMP66]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP68:%.*]] = mul <16 x i32> [[TMP67]], [[TMP18]] +; CHECK-INTERLEAVE1-NEXT: [[TMP69]] = add <16 x i32> [[TMP68]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP70]], label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]]) +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP71]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; 
CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i16, ptr [[GEP_B]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i16 [[LOAD_B]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP71]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_different_types( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP137:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP138:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 +; CHECK-INTERLEAVED-NEXT: 
[[TMP5:%.*]] = add i64 [[INDEX]], 5 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 17 +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 18 +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], 19 +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 20 +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 21 +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 22 +; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 23 +; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = add i64 [[INDEX]], 24 +; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], 25 +; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = add i64 [[INDEX]], 26 +; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = add i64 [[INDEX]], 27 +; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = add i64 [[INDEX]], 28 +; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = add i64 [[INDEX]], 29 +; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = add i64 [[INDEX]], 30 +; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 31 +; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[TMP32]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[TMP32]], i32 16 +; CHECK-INTERLEAVED-NEXT: 
[[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP33]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP34]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] +; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]] +; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]] +; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]] +; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] +; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]] +; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]] +; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]] +; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]] +; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP16]] +; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP17]] +; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP18]] +; CHECK-INTERLEAVED-NEXT: [[TMP58:%.*]] = getelementptr 
i8, ptr [[B]], i64 [[TMP19]] +; CHECK-INTERLEAVED-NEXT: [[TMP59:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP20]] +; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP21]] +; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP22]] +; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP23]] +; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP24]] +; CHECK-INTERLEAVED-NEXT: [[TMP64:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP25]] +; CHECK-INTERLEAVED-NEXT: [[TMP65:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP26]] +; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP27]] +; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP28]] +; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP29]] +; CHECK-INTERLEAVED-NEXT: [[TMP139:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP30]] +; CHECK-INTERLEAVED-NEXT: [[TMP140:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP31]] +; CHECK-INTERLEAVED-NEXT: [[TMP69:%.*]] = load i16, ptr [[TMP39]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP70:%.*]] = load i16, ptr [[TMP40]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP71:%.*]] = load i16, ptr [[TMP41]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = load i16, ptr [[TMP42]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = load i16, ptr [[TMP43]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP74:%.*]] = load i16, ptr [[TMP44]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP75:%.*]] = load i16, ptr [[TMP45]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP76:%.*]] = load i16, ptr [[TMP46]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP77:%.*]] = load i16, ptr [[TMP47]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP78:%.*]] = load i16, ptr [[TMP48]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP79:%.*]] = load i16, ptr [[TMP49]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP80:%.*]] = load i16, ptr [[TMP50]], align 2 +; 
CHECK-INTERLEAVED-NEXT: [[TMP81:%.*]] = load i16, ptr [[TMP51]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP82:%.*]] = load i16, ptr [[TMP52]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP83:%.*]] = load i16, ptr [[TMP53]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP84:%.*]] = load i16, ptr [[TMP54]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP85:%.*]] = insertelement <16 x i16> poison, i16 [[TMP69]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP86:%.*]] = insertelement <16 x i16> [[TMP85]], i16 [[TMP70]], i32 1 +; CHECK-INTERLEAVED-NEXT: [[TMP87:%.*]] = insertelement <16 x i16> [[TMP86]], i16 [[TMP71]], i32 2 +; CHECK-INTERLEAVED-NEXT: [[TMP88:%.*]] = insertelement <16 x i16> [[TMP87]], i16 [[TMP72]], i32 3 +; CHECK-INTERLEAVED-NEXT: [[TMP89:%.*]] = insertelement <16 x i16> [[TMP88]], i16 [[TMP73]], i32 4 +; CHECK-INTERLEAVED-NEXT: [[TMP90:%.*]] = insertelement <16 x i16> [[TMP89]], i16 [[TMP74]], i32 5 +; CHECK-INTERLEAVED-NEXT: [[TMP91:%.*]] = insertelement <16 x i16> [[TMP90]], i16 [[TMP75]], i32 6 +; CHECK-INTERLEAVED-NEXT: [[TMP92:%.*]] = insertelement <16 x i16> [[TMP91]], i16 [[TMP76]], i32 7 +; CHECK-INTERLEAVED-NEXT: [[TMP93:%.*]] = insertelement <16 x i16> [[TMP92]], i16 [[TMP77]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[TMP94:%.*]] = insertelement <16 x i16> [[TMP93]], i16 [[TMP78]], i32 9 +; CHECK-INTERLEAVED-NEXT: [[TMP95:%.*]] = insertelement <16 x i16> [[TMP94]], i16 [[TMP79]], i32 10 +; CHECK-INTERLEAVED-NEXT: [[TMP96:%.*]] = insertelement <16 x i16> [[TMP95]], i16 [[TMP80]], i32 11 +; CHECK-INTERLEAVED-NEXT: [[TMP97:%.*]] = insertelement <16 x i16> [[TMP96]], i16 [[TMP81]], i32 12 +; CHECK-INTERLEAVED-NEXT: [[TMP98:%.*]] = insertelement <16 x i16> [[TMP97]], i16 [[TMP82]], i32 13 +; CHECK-INTERLEAVED-NEXT: [[TMP99:%.*]] = insertelement <16 x i16> [[TMP98]], i16 [[TMP83]], i32 14 +; CHECK-INTERLEAVED-NEXT: [[TMP100:%.*]] = insertelement <16 x i16> [[TMP99]], i16 [[TMP84]], i32 15 +; CHECK-INTERLEAVED-NEXT: [[TMP101:%.*]] = load i16, ptr [[TMP55]], align 2 +; 
CHECK-INTERLEAVED-NEXT: [[TMP102:%.*]] = load i16, ptr [[TMP56]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP103:%.*]] = load i16, ptr [[TMP57]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP104:%.*]] = load i16, ptr [[TMP58]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP105:%.*]] = load i16, ptr [[TMP59]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP106:%.*]] = load i16, ptr [[TMP60]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP107:%.*]] = load i16, ptr [[TMP61]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP108:%.*]] = load i16, ptr [[TMP62]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP109:%.*]] = load i16, ptr [[TMP63]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP110:%.*]] = load i16, ptr [[TMP64]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP111:%.*]] = load i16, ptr [[TMP65]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP112:%.*]] = load i16, ptr [[TMP66]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP113:%.*]] = load i16, ptr [[TMP67]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP114:%.*]] = load i16, ptr [[TMP68]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP115:%.*]] = load i16, ptr [[TMP139]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP116:%.*]] = load i16, ptr [[TMP140]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP117:%.*]] = insertelement <16 x i16> poison, i16 [[TMP101]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP118:%.*]] = insertelement <16 x i16> [[TMP117]], i16 [[TMP102]], i32 1 +; CHECK-INTERLEAVED-NEXT: [[TMP119:%.*]] = insertelement <16 x i16> [[TMP118]], i16 [[TMP103]], i32 2 +; CHECK-INTERLEAVED-NEXT: [[TMP120:%.*]] = insertelement <16 x i16> [[TMP119]], i16 [[TMP104]], i32 3 +; CHECK-INTERLEAVED-NEXT: [[TMP121:%.*]] = insertelement <16 x i16> [[TMP120]], i16 [[TMP105]], i32 4 +; CHECK-INTERLEAVED-NEXT: [[TMP122:%.*]] = insertelement <16 x i16> [[TMP121]], i16 [[TMP106]], i32 5 +; CHECK-INTERLEAVED-NEXT: [[TMP123:%.*]] = insertelement <16 x i16> [[TMP122]], i16 [[TMP107]], i32 6 +; CHECK-INTERLEAVED-NEXT: [[TMP124:%.*]] = insertelement <16 x i16> [[TMP123]], i16 [[TMP108]], i32 7 +; 
CHECK-INTERLEAVED-NEXT: [[TMP125:%.*]] = insertelement <16 x i16> [[TMP124]], i16 [[TMP109]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[TMP126:%.*]] = insertelement <16 x i16> [[TMP125]], i16 [[TMP110]], i32 9 +; CHECK-INTERLEAVED-NEXT: [[TMP127:%.*]] = insertelement <16 x i16> [[TMP126]], i16 [[TMP111]], i32 10 +; CHECK-INTERLEAVED-NEXT: [[TMP128:%.*]] = insertelement <16 x i16> [[TMP127]], i16 [[TMP112]], i32 11 +; CHECK-INTERLEAVED-NEXT: [[TMP129:%.*]] = insertelement <16 x i16> [[TMP128]], i16 [[TMP113]], i32 12 +; CHECK-INTERLEAVED-NEXT: [[TMP130:%.*]] = insertelement <16 x i16> [[TMP129]], i16 [[TMP114]], i32 13 +; CHECK-INTERLEAVED-NEXT: [[TMP131:%.*]] = insertelement <16 x i16> [[TMP130]], i16 [[TMP115]], i32 14 +; CHECK-INTERLEAVED-NEXT: [[TMP132:%.*]] = insertelement <16 x i16> [[TMP131]], i16 [[TMP116]], i32 15 +; CHECK-INTERLEAVED-NEXT: [[TMP133:%.*]] = zext <16 x i16> [[TMP100]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP134:%.*]] = zext <16 x i16> [[TMP132]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP135:%.*]] = mul <16 x i32> [[TMP133]], [[TMP35]] +; CHECK-INTERLEAVED-NEXT: [[TMP136:%.*]] = mul <16 x i32> [[TMP134]], [[TMP36]] +; CHECK-INTERLEAVED-NEXT: [[TMP137]] = add <16 x i32> [[TMP135]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP141:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP141]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP138]], [[TMP137]] +; CHECK-INTERLEAVED-NEXT: [[TMP142:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-MAXBW-LABEL: define i32 @not_dotp_different_types( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr 
[[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP138:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 +; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 +; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[TMP32]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP33]], align 1 +; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP38:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] +; CHECK-MAXBW-NEXT: [[TMP39:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; CHECK-MAXBW-NEXT: [[TMP40:%.*]] = 
getelementptr i8, ptr [[B]], i64 [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]] +; CHECK-MAXBW-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP44:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]] +; CHECK-MAXBW-NEXT: [[TMP45:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]] +; CHECK-MAXBW-NEXT: [[TMP46:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] +; CHECK-MAXBW-NEXT: [[TMP47:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]] +; CHECK-MAXBW-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]] +; CHECK-MAXBW-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]] +; CHECK-MAXBW-NEXT: [[TMP50:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]] +; CHECK-MAXBW-NEXT: [[TMP51:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]] +; CHECK-MAXBW-NEXT: [[TMP52:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]] +; CHECK-MAXBW-NEXT: [[TMP101:%.*]] = load i16, ptr [[TMP37]], align 2 +; CHECK-MAXBW-NEXT: [[TMP102:%.*]] = load i16, ptr [[TMP38]], align 2 +; CHECK-MAXBW-NEXT: [[TMP103:%.*]] = load i16, ptr [[TMP39]], align 2 +; CHECK-MAXBW-NEXT: [[TMP104:%.*]] = load i16, ptr [[TMP40]], align 2 +; CHECK-MAXBW-NEXT: [[TMP105:%.*]] = load i16, ptr [[TMP41]], align 2 +; CHECK-MAXBW-NEXT: [[TMP106:%.*]] = load i16, ptr [[TMP42]], align 2 +; CHECK-MAXBW-NEXT: [[TMP107:%.*]] = load i16, ptr [[TMP43]], align 2 +; CHECK-MAXBW-NEXT: [[TMP108:%.*]] = load i16, ptr [[TMP44]], align 2 +; CHECK-MAXBW-NEXT: [[TMP109:%.*]] = load i16, ptr [[TMP45]], align 2 +; CHECK-MAXBW-NEXT: [[TMP110:%.*]] = load i16, ptr [[TMP46]], align 2 +; CHECK-MAXBW-NEXT: [[TMP111:%.*]] = load i16, ptr [[TMP47]], align 2 +; CHECK-MAXBW-NEXT: [[TMP112:%.*]] = load i16, ptr [[TMP48]], align 2 +; CHECK-MAXBW-NEXT: [[TMP113:%.*]] = load i16, ptr [[TMP49]], align 2 +; CHECK-MAXBW-NEXT: [[TMP114:%.*]] = load i16, ptr [[TMP50]], align 2 +; 
CHECK-MAXBW-NEXT: [[TMP115:%.*]] = load i16, ptr [[TMP51]], align 2 +; CHECK-MAXBW-NEXT: [[TMP116:%.*]] = load i16, ptr [[TMP52]], align 2 +; CHECK-MAXBW-NEXT: [[TMP117:%.*]] = insertelement <16 x i16> poison, i16 [[TMP101]], i32 0 +; CHECK-MAXBW-NEXT: [[TMP118:%.*]] = insertelement <16 x i16> [[TMP117]], i16 [[TMP102]], i32 1 +; CHECK-MAXBW-NEXT: [[TMP119:%.*]] = insertelement <16 x i16> [[TMP118]], i16 [[TMP103]], i32 2 +; CHECK-MAXBW-NEXT: [[TMP120:%.*]] = insertelement <16 x i16> [[TMP119]], i16 [[TMP104]], i32 3 +; CHECK-MAXBW-NEXT: [[TMP121:%.*]] = insertelement <16 x i16> [[TMP120]], i16 [[TMP105]], i32 4 +; CHECK-MAXBW-NEXT: [[TMP122:%.*]] = insertelement <16 x i16> [[TMP121]], i16 [[TMP106]], i32 5 +; CHECK-MAXBW-NEXT: [[TMP123:%.*]] = insertelement <16 x i16> [[TMP122]], i16 [[TMP107]], i32 6 +; CHECK-MAXBW-NEXT: [[TMP124:%.*]] = insertelement <16 x i16> [[TMP123]], i16 [[TMP108]], i32 7 +; CHECK-MAXBW-NEXT: [[TMP125:%.*]] = insertelement <16 x i16> [[TMP124]], i16 [[TMP109]], i32 8 +; CHECK-MAXBW-NEXT: [[TMP126:%.*]] = insertelement <16 x i16> [[TMP125]], i16 [[TMP110]], i32 9 +; CHECK-MAXBW-NEXT: [[TMP127:%.*]] = insertelement <16 x i16> [[TMP126]], i16 [[TMP111]], i32 10 +; CHECK-MAXBW-NEXT: [[TMP128:%.*]] = insertelement <16 x i16> [[TMP127]], i16 [[TMP112]], i32 11 +; CHECK-MAXBW-NEXT: [[TMP129:%.*]] = insertelement <16 x i16> [[TMP128]], i16 [[TMP113]], i32 12 +; CHECK-MAXBW-NEXT: [[TMP130:%.*]] = insertelement <16 x i16> [[TMP129]], i16 [[TMP114]], i32 13 +; CHECK-MAXBW-NEXT: [[TMP131:%.*]] = insertelement <16 x i16> [[TMP130]], i16 [[TMP115]], i32 14 +; CHECK-MAXBW-NEXT: [[TMP132:%.*]] = insertelement <16 x i16> [[TMP131]], i16 [[TMP116]], i32 15 +; CHECK-MAXBW-NEXT: [[TMP134:%.*]] = zext <16 x i16> [[TMP132]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP136:%.*]] = mul <16 x i32> [[TMP134]], [[TMP36]] +; CHECK-MAXBW-NEXT: [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; 
CHECK-MAXBW-NEXT: [[TMP139:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-MAXBW-NEXT: br i1 [[TMP139]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i16, ptr %gep.b, align 2 + %ext.b = zext i16 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + +define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 { +; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_not_loop_carried( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1 +; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement 
poison, i32 0, i32 [[TMP8]] +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]] +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] +; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP14]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = zext [[WIDE_LOAD1]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP16]] = mul [[TMP15]], [[TMP12]] +; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call @llvm.vector.splice.nxv8i32( [[VECTOR_RECUR]], [[TMP16]], i32 -1) +; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = add [[TMP16]], [[TMP17]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_loop_carried( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: 
vector.ph: +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1 +; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 0, i32 [[TMP8]] +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]] +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP14]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = zext [[WIDE_LOAD1]] to +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = call i64 
@llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP17]], i64 [[TMP20]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP18]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP21]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = zext [[WIDE_LOAD2]] to +; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = zext [[WIDE_LOAD3]] to +; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = mul [[TMP22]], [[TMP15]] +; CHECK-INTERLEAVED-NEXT: [[TMP25]] = mul [[TMP23]], [[TMP16]] +; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call @llvm.vector.splice.nxv8i32( [[TMP24]], [[TMP25]], i32 -1) +; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = add [[TMP25]], [[TMP26]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; +; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_loop_carried( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8 +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1 +; CHECK-MAXBW-NEXT: 
[[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 0, i32 [[TMP8]] +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]] +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = zext [[WIDE_LOAD1]] to +; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] +; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP18]], align 1 +; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = zext [[WIDE_LOAD3]] to +; CHECK-MAXBW-NEXT: [[TMP25]] = mul [[TMP23]], [[TMP16]] +; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = call @llvm.vector.splice.nxv8i32( [[VECTOR_RECUR]], [[TMP25]], i32 -1) +; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = add [[TMP25]], [[TMP26]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %mul, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, %accum + %iv.next = 
add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + +define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 { +; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_not_phi( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1 +; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 0, i32 [[TMP8]] +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]] +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-INTERLEAVE1-NEXT: 
[[TMP12:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] +; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP14]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = zext [[WIDE_LOAD1]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul [[TMP15]], [[TMP12]] +; CHECK-INTERLEAVE1-NEXT: [[TMP17]] = add [[TMP16]], [[TMP15]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_phi( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1 +; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 0, i32 [[TMP8]] +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: 
[[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]] +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP14]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP16]], i64 [[TMP19]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP20]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = zext [[WIDE_LOAD1]] to +; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul [[TMP22]], [[TMP15]] +; CHECK-INTERLEAVED-NEXT: [[TMP21]] = add [[TMP30]], [[TMP22]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; +; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_phi( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: 
[[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8 +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1 +; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 0, i32 [[TMP8]] +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]] +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP13]], align 1 +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] +; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP15]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP18]], align 1 +; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = zext [[WIDE_LOAD1]] to +; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = mul [[TMP19]], [[TMP14]] +; CHECK-MAXBW-NEXT: [[TMP21]] = add [[TMP20]], [[TMP19]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, 
%entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, %ext.b + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + +define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { +; CHECK-INTERLEAVE1-LABEL: define i32 @dotp_unrolled( +; CHECK-INTERLEAVE1-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = mul i64 [[TMP13]], 4 +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP15]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], 4 +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP18]] +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = mul i64 [[TMP20]], 4 +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP41:%.*]], [[VECTOR_BODY]] ] +; 
CHECK-INTERLEAVE1-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP35:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI2:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI3:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP0]], 3 +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP14]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = sext [[WIDE_LOAD4]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = mul nsw [[TMP21]], [[TMP36]] +; CHECK-INTERLEAVE1-NEXT: [[TMP23]] = add [[TMP38]], [[VEC_PHI3]] +; 
CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP17]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = sext [[WIDE_LOAD5]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP19]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = sext [[WIDE_LOAD6]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = mul nsw [[TMP25]], [[TMP42]] +; CHECK-INTERLEAVE1-NEXT: [[TMP30]] = add [[TMP28]], [[VEC_PHI2]] +; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD7:%.*]] = load , ptr [[TMP22]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = sext [[WIDE_LOAD7]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD8:%.*]] = load , ptr [[TMP24]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = sext [[WIDE_LOAD8]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = mul nsw [[TMP31]], [[TMP33]] +; CHECK-INTERLEAVE1-NEXT: [[TMP35]] = add [[TMP34]], [[VEC_PHI1]] +; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD9:%.*]] = load , ptr [[TMP27]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = sext [[WIDE_LOAD9]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD10:%.*]] = load , ptr [[TMP29]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = sext [[WIDE_LOAD10]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP40:%.*]] = mul nsw [[TMP37]], [[TMP39]] +; CHECK-INTERLEAVE1-NEXT: [[TMP41]] = add [[TMP40]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP26]] +; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; 
CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @dotp_unrolled( +; CHECK-INTERLEAVED-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = mul i64 [[TMP13]], 8 +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP15]] +; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], 8 +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP18]] +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = mul i64 [[TMP34]], 8 +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP64:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP65:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI4:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI5:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] 
], [ [[TMP49:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI6:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP50:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI7:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 2 +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP0]], 3 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul i64 [[TMP56]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP20]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load , ptr [[TMP21]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext [[WIDE_LOAD8]] to +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 
+; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 [[TMP26]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load , ptr [[TMP14]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load , ptr [[TMP72]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext [[WIDE_LOAD9]] to +; CHECK-INTERLEAVED-NEXT: [[TMP82:%.*]] = sext [[WIDE_LOAD10]] to +; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul nsw [[TMP28]], [[TMP66]] +; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul nsw [[TMP82]], [[TMP23]] +; CHECK-INTERLEAVED-NEXT: [[TMP50]] = add [[TMP30]], [[VEC_PHI6]] +; CHECK-INTERLEAVED-NEXT: [[TMP33]] = add [[TMP31]], [[VEC_PHI7]] +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = mul i64 [[TMP35]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 [[TMP36]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD11:%.*]] = load , ptr [[TMP17]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load , ptr [[TMP37]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = sext [[WIDE_LOAD11]] to +; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = sext [[WIDE_LOAD12]] to +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = mul i64 [[TMP41]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP42]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD13:%.*]] = load , ptr [[TMP19]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD14:%.*]] = load , ptr [[TMP43]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = sext [[WIDE_LOAD13]] to +; CHECK-INTERLEAVED-NEXT: 
[[TMP45:%.*]] = sext [[WIDE_LOAD14]] to +; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = mul nsw [[TMP38]], [[TMP44]] +; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = mul nsw [[TMP39]], [[TMP45]] +; CHECK-INTERLEAVED-NEXT: [[TMP48]] = add [[TMP46]], [[VEC_PHI4]] +; CHECK-INTERLEAVED-NEXT: [[TMP49]] = add [[TMP47]], [[VEC_PHI5]] +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = mul i64 [[TMP51]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP52]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD15:%.*]] = load , ptr [[TMP22]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD16:%.*]] = load , ptr [[TMP53]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = sext [[WIDE_LOAD15]] to +; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = sext [[WIDE_LOAD16]] to +; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP58:%.*]] = mul i64 [[TMP57]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP59:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 [[TMP58]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD17:%.*]] = load , ptr [[TMP24]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD18:%.*]] = load , ptr [[TMP59]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = sext [[WIDE_LOAD17]] to +; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = sext [[WIDE_LOAD18]] to +; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = mul nsw [[TMP54]], [[TMP60]] +; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = mul nsw [[TMP55]], [[TMP61]] +; CHECK-INTERLEAVED-NEXT: [[TMP64]] = add [[TMP62]], [[VEC_PHI2]] +; CHECK-INTERLEAVED-NEXT: [[TMP65]] = add [[TMP63]], [[VEC_PHI3]] +; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = call i64 @llvm.vscale.i64() +; 
CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = mul i64 [[TMP67]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP69:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 [[TMP68]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD19:%.*]] = load , ptr [[TMP27]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD20:%.*]] = load , ptr [[TMP69]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP70:%.*]] = sext [[WIDE_LOAD19]] to +; CHECK-INTERLEAVED-NEXT: [[TMP71:%.*]] = sext [[WIDE_LOAD20]] to +; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP74:%.*]] = mul i64 [[TMP73]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP75:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 [[TMP74]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load , ptr [[TMP29]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD22:%.*]] = load , ptr [[TMP75]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP76:%.*]] = sext [[WIDE_LOAD21]] to +; CHECK-INTERLEAVED-NEXT: [[TMP77:%.*]] = sext [[WIDE_LOAD22]] to +; CHECK-INTERLEAVED-NEXT: [[TMP78:%.*]] = mul nsw [[TMP70]], [[TMP76]] +; CHECK-INTERLEAVED-NEXT: [[TMP79:%.*]] = mul nsw [[TMP71]], [[TMP77]] +; CHECK-INTERLEAVED-NEXT: [[TMP80]] = add [[TMP78]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP81]] = add [[TMP79]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP40]] +; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; +; CHECK-MAXBW-LABEL: define i32 @dotp_unrolled( +; CHECK-MAXBW-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 
[[NUM_IN]], [[TMP1]] +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI4:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE16:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI5:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE17:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI6:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI7:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE11:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP6]], 1 +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = or disjoint i64 [[TMP6]], 2 +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]] +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]] +; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = or disjoint i64 [[TMP6]], 3 +; 
CHECK-MAXBW-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]] +; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]] +; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP18]], align 1 +; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD9:%.*]] = load , ptr [[TMP24]], align 1 +; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = sext [[WIDE_LOAD9]] to +; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = mul nsw [[TMP29]], [[TMP23]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE11]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI7]], [[TMP31]]) +; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD12:%.*]] = load , ptr [[TMP32]], align 1 +; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = sext [[WIDE_LOAD12]] to +; CHECK-MAXBW-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD14:%.*]] = load , ptr [[TMP38]], align 1 +; CHECK-MAXBW-NEXT: [[TMP43:%.*]] = sext [[WIDE_LOAD14]] to +; CHECK-MAXBW-NEXT: [[TMP45:%.*]] = mul nsw [[TMP37]], [[TMP43]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI6]], [[TMP45]]) +; CHECK-MAXBW-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD18:%.*]] = load , ptr [[TMP46]], align 1 +; CHECK-MAXBW-NEXT: [[TMP51:%.*]] = sext [[WIDE_LOAD18]] to +; CHECK-MAXBW-NEXT: [[TMP52:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD20:%.*]] = load , ptr [[TMP52]], align 1 +; CHECK-MAXBW-NEXT: [[TMP57:%.*]] = sext [[WIDE_LOAD20]] to +; CHECK-MAXBW-NEXT: [[TMP59:%.*]] = mul nsw [[TMP51]], [[TMP57]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE17]] = call 
@llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI5]], [[TMP59]]) +; CHECK-MAXBW-NEXT: [[TMP60:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD24:%.*]] = load , ptr [[TMP60]], align 1 +; CHECK-MAXBW-NEXT: [[TMP65:%.*]] = sext [[WIDE_LOAD24]] to +; CHECK-MAXBW-NEXT: [[TMP66:%.*]] = getelementptr inbounds i8, ptr [[TMP17]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD26:%.*]] = load , ptr [[TMP66]], align 1 +; CHECK-MAXBW-NEXT: [[TMP71:%.*]] = sext [[WIDE_LOAD26]] to +; CHECK-MAXBW-NEXT: [[TMP73:%.*]] = mul nsw [[TMP65]], [[TMP71]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE16]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI4]], [[TMP73]]) +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP74:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum3 = phi i32 [ 0, %entry ], [ %add.a3, %for.body ] + %accum2 = phi i32 [ 0, %entry ], [ %add.a2, %for.body ] + %accum1 = phi i32 [ 0, %entry ], [ %add.a1, %for.body ] + %accum0 = phi i32 [ 0, %entry ], [ %add.a0, %for.body ] + %gep.a0 = getelementptr inbounds i8, ptr %a, i64 %iv + %gep.b0 = getelementptr inbounds i8, ptr %b, i64 %iv + %offset.1 = or disjoint i64 %iv, 1 + %gep.a1 = getelementptr inbounds i8, ptr %a, i64 %offset.1 + %gep.b1 = getelementptr inbounds i8, ptr %b, i64 %offset.1 + %offset.2 = or disjoint i64 %iv, 2 + %gep.a2 = getelementptr inbounds i8, ptr %a, i64 %offset.2 + %gep.b2 = getelementptr inbounds i8, ptr %b, i64 %offset.2 + %offset.3 = or disjoint i64 %iv, 3 + %gep.a3 = getelementptr inbounds i8, ptr %a, i64 %offset.3 + %gep.b3 = getelementptr inbounds i8, ptr %b, i64 %offset.3 + %load.a0 = load i8, ptr %gep.a0, align 1 + %ext.a0 = sext 
i8 %load.a0 to i32 + %load.b0 = load i8, ptr %gep.b0, align 1 + %ext.b0 = sext i8 %load.b0 to i32 + %mul.a0 = mul nsw i32 %ext.b0, %ext.a0 + %add.a0 = add nsw i32 %mul.a0, %accum0 + %load.a1 = load i8, ptr %gep.a1, align 1 + %ext.a1 = sext i8 %load.a1 to i32 + %load.b1 = load i8, ptr %gep.b1, align 1 + %ext.b1 = sext i8 %load.b1 to i32 + %mul.a1 = mul nsw i32 %ext.a1, %ext.b1 + %add.a1 = add nsw i32 %mul.a1, %accum1 + %load.a2 = load i8, ptr %gep.a2, align 1 + %ext.a2 = sext i8 %load.a2 to i32 + %load.b2 = load i8, ptr %gep.b2, align 1 + %ext.b2 = sext i8 %load.b2 to i32 + %mul.a2 = mul nsw i32 %ext.a2, %ext.b2 + %add.a2 = add nsw i32 %mul.a2, %accum2 + %load.a3 = load i8, ptr %gep.a3, align 1 + %ext.a3 = sext i8 %load.a3 to i32 + %load.b3 = load i8, ptr %gep.b3, align 1 + %ext.b3 = sext i8 %load.b3 to i32 + %mul.a3 = mul nsw i32 %ext.a3, %ext.b3 + %add.a3 = add nsw i32 %mul.a3, %accum3 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %num_in + br i1 %exitcond.not, label %exit, label %for.body + +exit: ; preds = %for.body + %result0 = add nsw i32 %add.a0, %add.a1 + %result1 = add nsw i32 %add.a2, %add.a3 + %result = add nsw i32 %result0, %result1 + ret i32 %result +} + +define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { +; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_predicated( +; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4 +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], [[TMP7]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP10]], 4 +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem 
i64 [[N]], [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = mul i64 [[TMP11]], 4 +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP5]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = sext [[WIDE_LOAD1]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul nsw [[TMP12]], [[TMP9]] +; CHECK-INTERLEAVE1-NEXT: [[TMP14]] = add [[TMP13]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP14]]) +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_PH]] +; +; 
CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated( +; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul i64 [[TMP8]], 8 +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], [[TMP14]] +; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP15]], 8 +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP2]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr 
[[TMP11]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[TMP17]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP7]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP18]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = sext [[WIDE_LOAD3]] to +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = sext [[WIDE_LOAD4]] to +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul nsw [[TMP19]], [[TMP12]] +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul nsw [[TMP20]], [[TMP25]] +; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add [[TMP21]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add [[TMP22]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add [[TMP24]], [[TMP23]] +; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_PH]] +; +; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated( +; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 
@llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP8]], align 1 +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP15]], align 1 +; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = sext [[WIDE_LOAD4]] to +; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = mul nsw [[TMP20]], [[TMP13]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE5]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI1]], [[TMP22]]) +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 
[[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = sext i8 %load.a to i32 + %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = sext i8 %load.b to i32 + %mul = mul nsw i32 %ext.b, %ext.a + %add = add nsw i32 %mul, %accum + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %N + br i1 %exitcond.not, label %exit, label %for.body + +exit: ; preds = %for.body + ret i32 %add +} + +define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) #0 { +; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_predicated_pragma( +; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4 +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = sub i64 [[TMP10]], 1 +; CHECK-INTERLEAVE1-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP11]] +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP10]] +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = mul i64 [[TMP12]], 4 +; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = mul i64 [[TMP15]], 4 +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[TMP6]] +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[N]], [[TMP6]] +; 
CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 [[TMP0]], i64 0 +; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP5]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sext [[WIDE_MASKED_LOAD]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP8]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = sext [[WIDE_MASKED_LOAD1]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = mul nsw [[TMP16]], [[TMP13]] +; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = add [[TMP17]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[TMP19]] = select [[ACTIVE_LANE_MASK]], [[TMP18]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]] +; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP2]]) +; CHECK-INTERLEAVE1-NEXT: 
[[TMP20:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true) +; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = extractelement [[TMP20]], i32 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated_pragma( +; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = sub i64 [[TMP10]], 1 +; CHECK-INTERLEAVED-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP11]] +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul i64 [[TMP12]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP15]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[N]], [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 [[TMP0]], i64 0 +; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: 
[[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP5]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext [[WIDE_MASKED_LOAD]] to +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP8]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = sext [[WIDE_MASKED_LOAD1]] to +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul nsw [[TMP16]], [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = add [[TMP17]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP19]] = select [[ACTIVE_LANE_MASK]], [[TMP18]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]] +; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP2]]) +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true) +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = extractelement [[TMP20]], i32 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; +; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated_pragma( +; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 
@llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 +; CHECK-MAXBW-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP2]] +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 +; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]] +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext [[WIDE_MASKED_LOAD]] to +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]] +; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = 
call @llvm.masked.load.nxv4i8.p0(ptr [[TMP15]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = sext [[WIDE_MASKED_LOAD1]] to +; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw [[TMP16]], [[TMP13]] +; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = add [[TMP17]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[TMP19]] = select [[ACTIVE_LANE_MASK]], [[TMP18]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] +; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) +; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true) +; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = extractelement [[TMP20]], i32 0 +; CHECK-MAXBW-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr inbounds i8, ptr %b, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = sext i8 %load.a to i32 + %gep.a2 = getelementptr inbounds i8, ptr %a, i64 %iv + %load.b = load i8, ptr %gep.a2, align 1 + %ext.b = sext i8 %load.b to i32 + %mul = mul nsw i32 %ext.b, %ext.a + %add = add nsw i32 %mul, %accum + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %N + br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !7 + +exit: ; preds = %for.body + ret i32 %add +} + +define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 { +; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_extend_user( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: 
vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = mul i64 [[TMP8]], 4 +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP10]] +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP11]], 4 +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]] +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP7]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext [[WIDE_LOAD1]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul [[TMP12]], [[TMP9]] +; CHECK-INTERLEAVE1-NEXT: [[TMP14]] = add [[TMP13]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_extend_user( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; 
CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8 +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP7]] +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP14]], 8 +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP15]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext [[WIDE_LOAD2]] to +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i32 0 
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP17]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP9]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP18]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = zext [[WIDE_LOAD3]] to +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = zext [[WIDE_LOAD4]] to +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul [[TMP19]], [[TMP12]] +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul [[TMP20]], [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add [[TMP21]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add [[TMP22]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; +; CHECK-MAXBW-LABEL: define i32 @not_dotp_extend_user( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; 
CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP8]], align 1 +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext [[WIDE_LOAD2]] to +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP14]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP15]], align 1 +; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = zext [[WIDE_LOAD4]] to +; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = mul [[TMP20]], [[TMP13]] +; CHECK-MAXBW-NEXT: [[TMP24]] = add [[TMP22]], [[VEC_PHI1]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + %result = add i32 %add, %ext.b + ret i32 %result +} + +define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { +; CHECK-INTERLEAVE1-LABEL: define i64 @dotp_cost_disagreement( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr 
[[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[TMP6]], 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP10]] +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP11]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP12]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = zext [[WIDE_LOAD1]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = mul nuw nsw 
[[TMP13]], [[TMP9]] +; CHECK-INTERLEAVE1-NEXT: [[TMP15]] = add [[VEC_PHI]], [[TMP14]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; +; CHECK-INTERLEAVED-LABEL: define i64 @dotp_cost_disagreement( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]] +; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 +; 
CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2 +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i64 [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext [[WIDE_LOAD2]] to +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add nuw nsw i64 [[TMP6]], 1 +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP14]] +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP15]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 2 +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP15]], i64 [[TMP18]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP16]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP19]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = zext [[WIDE_LOAD3]] to +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = zext [[WIDE_LOAD4]] to +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul nuw nsw [[TMP20]], [[TMP12]] +; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = mul nuw nsw [[TMP21]], [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add [[VEC_PHI]], [[TMP22]] +; CHECK-INTERLEAVED-NEXT: [[TMP25]] = add [[VEC_PHI1]], [[TMP23]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; +; CHECK-MAXBW-LABEL: define i64 @dotp_cost_disagreement( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) 
#[[ATTR0:[0-9]+]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]] +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 1 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[TMP6]], 1 +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP10]] +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP11]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP12]], align 1 +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext [[WIDE_LOAD1]] to +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = mul nuw nsw [[TMP13]], [[TMP9]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.experimental.vector.partial.reduce.add.nxv1i64.nxv8i64( [[VEC_PHI]], [[TMP14]]) +; 
CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.add.nxv1i64( [[PARTIAL_REDUCE]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] + %sum = phi i64 [ 0, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds nuw i8, ptr %a, i64 %i.iv + %0 = load i8, ptr %arrayidx, align 1 + %conv = zext i8 %0 to i64 + %i.iv.next = add nuw nsw i64 %i.iv, 1 + %arrayidx2 = getelementptr inbounds nuw i8, ptr %b, i64 %i.iv.next + %1 = load i8, ptr %arrayidx2, align 1 + %conv3 = zext i8 %1 to i64 + %mul = mul nuw nsw i64 %conv3, %conv + %add = add i64 %sum, %mul + %exitcond.not = icmp eq i64 %i.iv.next, 16 + br i1 %exitcond.not, label %exit, label %for.body + +exit: ; preds = %for.body + ret i64 %add +} + +!7 = distinct !{!7, !8, !9, !10} +!8 = !{!"llvm.loop.mustprogress"} +!9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} +!10 = !{!"llvm.loop.vectorize.enable", i1 true} +attributes #0 = { vscale_range(1,16) "target-features"="+sve" } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll new file mode 100644 index 0000000000000..f24b115ab9f99 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll @@ -0,0 +1,61 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon -S < %s | 
FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-none-unknown-elf" + +define i32 @not_dotp(ptr %a, ptr %b) { +; CHECK-LABEL: define i32 @not_dotp( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 true, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = mul <16 x i32> 
[[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP13]] = add <16 x i32> [[TMP11]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP14]] = add <16 x i32> [[TMP12]], [[VEC_PHI1]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll new file mode 100644 index 0000000000000..06aaf29b382a2 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll @@ -0,0 +1,93 @@ +; REQUIRES: asserts +; RUN: opt -mattr=+neon,+dotprod -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-interleave=1 -disable-output %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-none-unknown-elf" + +; Tests for printing VPlans that are enabled under AArch64 + +define i32 @print_partial_reduction(ptr %a, ptr %b) { +; CHECK: VPlan 'Initial VPlan for VF={8,16},UF>=1' { +; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count +; CHECK-NEXT: Live-in ir<0> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: 
ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[ACC:%.+]]> = phi ir<0>, ir<[[REDUCE:%.+]]> (VF scaled by 1/4) +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> +; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[PTR_A:%.+]]> = vector-pointer ir<%gep.a> +; CHECK-NEXT: WIDEN ir<%load.a> = load vp<[[PTR_A]]> +; CHECK-NEXT: WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32 +; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[PTR_B:%.+]]> = vector-pointer ir<%gep.b> +; CHECK-NEXT: WIDEN ir<%load.b> = load vp<[[PTR_B]]> +; CHECK-NEXT: WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32 +; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a> +; CHECK-NEXT: PARTIAL-REDUCE ir<[[REDUCE]]> = add ir<%mul>, ir<[[ACC]]> +; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<[[ACC]]>, ir<[[REDUCE]]> +; CHECK-NEXT: EMIT vp<[[EXTRACT:%.+]]> = extract-from-end vp<[[RED_RESULT]]>, ir<1> +; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<0>, vp<%1> +; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> +; CHECK-NEXT: Successor(s): ir-bb, scalar.ph +; CHECK-EMPTY: +; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: EMIT vp<%bc.merge.rdx> = resume-phi vp<[[RED_RESULT]]>, ir<0> +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] +; CHECK-NEXT: IR 
%accum = phi i32 [ 0, %entry ], [ %add, %for.body ] (extra operand: vp<%bc.merge.rdx> from scalar.ph) +; CHECK-NEXT: IR %gep.a = getelementptr i8, ptr %a, i64 %iv +; CHECK-NEXT: IR %load.a = load i8, ptr %gep.a, align 1 +; CHECK-NEXT: IR %ext.a = zext i8 %load.a to i32 +; CHECK-NEXT: IR %gep.b = getelementptr i8, ptr %b, i64 %iv +; CHECK-NEXT: IR %load.b = load i8, ptr %gep.b, align 1 +; CHECK-NEXT: IR %ext.b = zext i8 %load.b to i32 +; CHECK-NEXT: IR %mul = mul i32 %ext.b, %ext.a +; CHECK-NEXT: IR %add = add i32 %mul, %accum +; CHECK-NEXT: IR %iv.next = add i64 %iv, 1 +; CHECK-NEXT: IR %exitcond.not = icmp eq i64 %iv.next, 0 +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[EXTRACT]]> from middle.block) +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret i32 %add +} From 10c18ab7e6c46d9daeb558d47be1f06c53c5ed0e Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Tue, 24 Dec 2024 08:18:09 -0500 Subject: [PATCH 014/567] [gn] port dbae7176a6ec (LLVMTelemetry) --- llvm/utils/gn/secondary/llvm/lib/Telemetry/BUILD.gn | 5 +++++ llvm/utils/gn/secondary/llvm/unittests/BUILD.gn | 1 + .../gn/secondary/llvm/unittests/Telemetry/BUILD.gn | 10 ++++++++++ 3 files changed, 16 insertions(+) create mode 100644 llvm/utils/gn/secondary/llvm/lib/Telemetry/BUILD.gn create mode 100644 
llvm/utils/gn/secondary/llvm/unittests/Telemetry/BUILD.gn diff --git a/llvm/utils/gn/secondary/llvm/lib/Telemetry/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Telemetry/BUILD.gn new file mode 100644 index 0000000000000..82c1152c1eea3 --- /dev/null +++ b/llvm/utils/gn/secondary/llvm/lib/Telemetry/BUILD.gn @@ -0,0 +1,5 @@ +static_library("Telemetry") { + output_name = "LLVMTelemetry" + deps = [ "//llvm/lib/Support" ] + sources = [ "Telemetry.cpp" ] +} diff --git a/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn index 95279099fa53b..78875ea981022 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn @@ -50,6 +50,7 @@ group("unittests") { "TableGen:TableGenTests", "Target:TargetMachineCTests", "TargetParser:TargetParserTests", + "Telemetry:TelemetryTests", "Testing/ADT:TestingADTTests", "Testing/Support:TestingSupportTests", "TextAPI:TextAPITests", diff --git a/llvm/utils/gn/secondary/llvm/unittests/Telemetry/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Telemetry/BUILD.gn new file mode 100644 index 0000000000000..47ecf352256e6 --- /dev/null +++ b/llvm/utils/gn/secondary/llvm/unittests/Telemetry/BUILD.gn @@ -0,0 +1,10 @@ +import("//third-party/unittest/unittest.gni") + +unittest("TelemetryTests") { + deps = [ + "//llvm/lib/IR", + "//llvm/lib/Support", + "//llvm/lib/Telemetry", + ] + sources = [ "TelemetryTest.cpp" ] +} From 8e1cb96db84a70b2c803c28a359c8bb71395f35e Mon Sep 17 00:00:00 2001 From: vabridgers <58314289+vabridgers@users.noreply.github.com> Date: Tue, 24 Dec 2024 07:21:14 -0600 Subject: [PATCH 015/567] [analyzer] Split alpha core Identical Expression tests (#119543) Split the remnant test from PR #114715, "Remove alpha.core.IdenticalExpr Checker" into seperate tests for misc-redundant-expression and bugprone-branch-clone per review comment requests. 
--------- Co-authored-by: einvbri --- .../checkers/bugprone/branch-clone-2.cpp | 768 ++++++++++++++++++ .../redundant-expression-2.cpp} | 475 +---------- 2 files changed, 804 insertions(+), 439 deletions(-) create mode 100644 clang-tools-extra/test/clang-tidy/checkers/bugprone/branch-clone-2.cpp rename clang-tools-extra/test/clang-tidy/checkers/{bugprone/alpha-core-identicalexpr.cpp => misc/redundant-expression-2.cpp} (67%) diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/branch-clone-2.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/branch-clone-2.cpp new file mode 100644 index 0000000000000..b91ac6a550c5a --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/branch-clone-2.cpp @@ -0,0 +1,768 @@ +// RUN: %check_clang_tidy %s bugprone-branch-clone %t -- + +/* Only one expected warning per function allowed at the very end. */ + +int func(void) +{ + return 0; +} + +int func2(void) +{ + return 0; +} + +int funcParam(int a) +{ + return 0; +} + +/* '!=' operator*/ + + +/* '!=' with int pointer */ + +int checkNotEqualIntPointerLiteralCompare1(void) { + int* p = 0; + return (p != 0); // no warning +} + +int checkNotEqualIntPointerLiteralCompare2(void) { + return (6 != 7); // no warning +} + +int checkNotEqualIntPointerDeclCompare1(void) { + int k = 3; + int* f = &k; + int* g = &k; + return (f != g); // no warning +} + +int checkNotEqualCastIntPointerDeclCompare11(void) { + int k = 7; + int* f = &k; + return ((int*)f != (int*)f); +} +int checkNotEqualCastIntPointerDeclCompare12(void) { + int k = 7; + int* f = &k; + return ((int*)((char*)f) != (int*)f); // no warning +} +int checkNotEqualBinaryOpIntPointerCompare1(void) { + int k = 7; + int res; + int* f= &k; + res = (f + 4 != f + 4); + return (0); +} +int checkNotEqualBinaryOpIntPointerCompare2(void) { + int k = 7; + int* f = &k; + int* g = &k; + return (f + 4 != g + 4); // no warning +} + + +int checkNotEqualBinaryOpIntPointerCompare3(void) { + int k = 7; + int res; + int* f= 
&k; + res = ((int*)f + 4 != (int*)f + 4); + return (0); +} +int checkNotEqualBinaryOpIntPointerCompare4(void) { + int k = 7; + int res; + int* f= &k; + res = ((int*)f + 4 != (int*)((char*)f) + 4); // no warning + return (0); +} + +int checkNotEqualNestedBinaryOpIntPointerCompare1(void) { + int res; + int k = 7; + int t= 1; + int* u= &k+2; + int* f= &k+3; + res = ((f + (3)*t) != (f + (3)*t)); + return (0); +} + +int checkNotEqualNestedBinaryOpIntPointerCompare2(void) { + int res; + int k = 7; + int t= 1; + int* u= &k+2; + int* f= &k+3; + res = (((3)*t + f) != (f + (3)*t)); // no warning + return (0); +} +/* end '!=' int* */ + +/* '!=' with function*/ + +int checkNotEqualSameFunction() { + unsigned a = 0; + unsigned b = 1; + int res = (a+func() != a+func()); // no warning + return (0); +} + +int checkNotEqualDifferentFunction() { + unsigned a = 0; + unsigned b = 1; + int res = (a+func() != a+func2()); // no warning + return (0); +} + +int checkNotEqualSameFunctionSameParam() { + unsigned a = 0; + unsigned b = 1; + int res = (a+funcParam(a) != a+funcParam(a)); // no warning + return (0); +} + +int checkNotEqualSameFunctionDifferentParam() { + unsigned a = 0; + unsigned b = 1; + int res = (a+funcParam(a) != a+funcParam(b)); // no warning + return (0); +} + +/* end '!=' with function*/ + +/* end '!=' */ + + +/* Checking use of identical expressions in conditional operator*/ + +unsigned test_unsigned(unsigned a) { + unsigned b = 1; + a = a > 5 ? b : b; +// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone] + return a; +} + +void test_signed() { + int a = 0; + a = a > 5 ? a : a; +// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone] +} + +void test_bool(bool a) { + a = a > 0 ? 
a : a; +// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone] +} + +void test_float() { + float a = 0; + float b = 0; + a = a > 5 ? a : a; +// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone] +} + +const char *test_string() { + float a = 0; + return a > 5 ? "abc" : "abc"; +// CHECK-MESSAGES: :[[@LINE-1]]:16: warning: conditional operator with identical true and false expressions [bugprone-branch-clone] +} + +void test_unsigned_expr() { + unsigned a = 0; + unsigned b = 0; + a = a > 5 ? a+b : a+b; +// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone] +} + +void test_signed_expr() { + int a = 0; + int b = 1; + a = a > 5 ? a+b : a+b; +// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone] +} + +void test_bool_expr(bool a) { + bool b = 0; + a = a > 0 ? a&&b : a&&b; +// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone] +} + +void test_unsigned_expr_negative() { + unsigned a = 0; + unsigned b = 0; + a = a > 5 ? a+b : b+a; // no warning +} + +void test_signed_expr_negative() { + int a = 0; + int b = 1; + a = a > 5 ? b+a : a+b; // no warning +} + +void test_bool_expr_negative(bool a) { + bool b = 0; + a = a > 0 ? a&&b : b&&a; // no warning +} + +void test_float_expr_positive() { + float a = 0; + float b = 0; + a = a > 5 ? a+b : a+b; +// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone] +} + +void test_expr_positive_func() { + unsigned a = 0; + unsigned b = 1; + a = a > 5 ? 
a+func() : a+func(); +// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone] +} + +void test_expr_negative_func() { + unsigned a = 0; + unsigned b = 1; + a = a > 5 ? a+func() : a+func2(); // no warning +} + +void test_expr_positive_funcParam() { + unsigned a = 0; + unsigned b = 1; + a = a > 5 ? a+funcParam(b) : a+funcParam(b); +// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone] +} + +void test_expr_negative_funcParam() { + unsigned a = 0; + unsigned b = 1; + a = a > 5 ? a+funcParam(a) : a+funcParam(b); // no warning +} + +void test_expr_positive_inc() { + unsigned a = 0; + unsigned b = 1; + a = a > 5 ? a++ : a++; +// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone] +} + +void test_expr_negative_inc() { + unsigned a = 0; + unsigned b = 1; + a = a > 5 ? a++ : b++; // no warning +} + +void test_expr_positive_assign() { + unsigned a = 0; + unsigned b = 1; + a = a > 5 ? a=1 : a=1; +// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone] +} + +void test_expr_negative_assign() { + unsigned a = 0; + unsigned b = 1; + a = a > 5 ? a=1 : a=2; // no warning +} + +void test_signed_nested_expr() { + int a = 0; + int b = 1; + int c = 3; + a = a > 5 ? a+b+(c+a)*(a + b*(c+a)) : a+b+(c+a)*(a + b*(c+a)); +// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone] +} + +void test_signed_nested_expr_negative() { + int a = 0; + int b = 1; + int c = 3; + a = a > 5 ? a+b+(c+a)*(a + b*(c+a)) : a+b+(c+a)*(a + b*(a+c)); // no warning +} + +void test_signed_nested_cond_expr_negative() { + int a = 0; + int b = 1; + int c = 3; + a = a > 5 ? (b > 5 ? 1 : 4) : (b > 5 ? 
2 : 4); // no warning +} + +void test_signed_nested_cond_expr() { + int a = 0; + int b = 1; + int c = 3; + a = a > 5 ? (b > 5 ? 1 : 4) : (b > 5 ? 4 : 4); +// CHECK-MESSAGES: :[[@LINE-1]]:40: warning: conditional operator with identical true and false expressions [bugprone-branch-clone] +} + +void test_identical_branches1(bool b) { + int i = 0; + if (b) { +// CHECK-MESSAGES: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone] + ++i; + } else { +// CHECK-MESSAGES: :[[@LINE-1]]:5: note: else branch starts here + ++i; + } +} + +void test_identical_branches2(bool b) { + int i = 0; + if (b) { +// CHECK-MESSAGES: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone] + ++i; + } else +// CHECK-MESSAGES: :[[@LINE-1]]:5: note: else branch starts here + ++i; +} + +void test_identical_branches3(bool b) { + int i = 0; + if (b) { // no warning + ++i; + } else { + i++; + } +} + +void test_identical_branches4(bool b) { + int i = 0; + if (b) { +// CHECK-MESSAGES: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone] + } else { +// CHECK-MESSAGES: :[[@LINE-1]]:5: note: else branch starts here + } +} + +void test_identical_branches_break(bool b) { + while (true) { + if (b) +// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: if with identical then and else branches [bugprone-branch-clone] + break; + else +// CHECK-MESSAGES: :[[@LINE-1]]:5: note: else branch starts here + break; + } +} + +void test_identical_branches_continue(bool b) { + while (true) { + if (b) +// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: if with identical then and else branches [bugprone-branch-clone] + continue; + else +// CHECK-MESSAGES: :[[@LINE-1]]:5: note: else branch starts here + continue; + } +} + +void test_identical_branches_func(bool b) { + if (b) +// CHECK-MESSAGES: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone] + func(); + else +// CHECK-MESSAGES: :[[@LINE-1]]:3: 
note: else branch starts here + func(); +} + +void test_identical_branches_func_arguments(bool b) { + if (b) // no-warning + funcParam(1); + else + funcParam(2); +} + +void test_identical_branches_cast1(bool b) { + long v = -7; + if (b) // no-warning + v = (signed int) v; + else + v = (unsigned int) v; +} + +void test_identical_branches_cast2(bool b) { + long v = -7; + if (b) +// CHECK-MESSAGES: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone] + v = (signed int) v; + else +// CHECK-MESSAGES: :[[@LINE-1]]:3: note: else branch starts here + v = (signed int) v; +} + +int test_identical_branches_return_int(bool b) { + int i = 0; + if (b) { +// CHECK-MESSAGES: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone] + i++; + return i; + } else { +// CHECK-MESSAGES: :[[@LINE-1]]:5: note: else branch starts here + i++; + return i; + } +} + +int test_identical_branches_return_func(bool b) { + if (b) { +// CHECK-MESSAGES: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone] + return func(); + } else { +// CHECK-MESSAGES: :[[@LINE-1]]:5: note: else branch starts here + return func(); + } +} + +void test_identical_branches_for(bool b) { + int i; + int j; + if (b) { +// CHECK-MESSAGES: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone] + for (i = 0, j = 0; i < 10; i++) + j += 4; + } else { +// CHECK-MESSAGES: :[[@LINE-1]]:5: note: else branch starts here + for (i = 0, j = 0; i < 10; i++) + j += 4; + } +} + +void test_identical_branches_while(bool b) { + int i = 10; + if (b) { +// CHECK-MESSAGES: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone] + while (func()) + i--; + } else { +// CHECK-MESSAGES: :[[@LINE-1]]:5: note: else branch starts here + while (func()) + i--; + } +} + +void test_identical_branches_while_2(bool b) { + int i = 10; + if (b) { // no-warning + while (func()) + i--; + } 
else { + while (func()) + i++; + } +} + +void test_identical_branches_do_while(bool b) { + int i = 10; + if (b) { +// CHECK-MESSAGES: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone] + do { + i--; + } while (func()); + } else { +// CHECK-MESSAGES: :[[@LINE-1]]:5: note: else branch starts here + do { + i--; + } while (func()); + } +} + +void test_identical_branches_if(bool b, int i) { + if (b) { +// CHECK-MESSAGES: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone] + if (i < 5) + i += 10; + } else { +// CHECK-MESSAGES: :[[@LINE-1]]:5: note: else branch starts here + if (i < 5) + i += 10; + } +} + +void test_identical_bitwise1() { + int a = 5 | 5; // no-warning +} + +void test_identical_bitwise2() { + int a = 5; + int b = a | a; // no-warning +} + +void test_identical_bitwise3() { + int a = 5; + int b = (a | a); // no-warning +} + +void test_identical_bitwise4() { + int a = 4; + int b = a | 4; // no-warning +} + +void test_identical_bitwise5() { + int a = 4; + int b = 4; + int c = a | b; // no-warning +} + +void test_identical_bitwise6() { + int a = 5; + int b = a | 4 | a; +} + +void test_identical_bitwise7() { + int a = 5; + int b = func() | func(); +} + +void test_identical_logical1(int a) { + if (a == 4 && a == 4) + ; +} + +void test_identical_logical2(int a) { + if (a == 4 || a == 5 || a == 4) + ; +} + +void test_identical_logical3(int a) { + if (a == 4 || a == 5 || a == 6) // no-warning + ; +} + +void test_identical_logical4(int a) { + if (a == func() || a == func()) // no-warning + ; +} + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlogical-op-parentheses" +void test_identical_logical5(int x, int y) { + if (x == 4 && y == 5 || x == 4 && y == 6) // no-warning + ; +} + +void test_identical_logical6(int x, int y) { + if (x == 4 && y == 5 || x == 4 && y == 5) + ; +} + +void test_identical_logical7(int x, int y) { + // FIXME: We should warn here + if (x == 4 && y 
== 5 || x == 4) + ; +} + +void test_identical_logical8(int x, int y) { + // FIXME: We should warn here + if (x == 4 || y == 5 && x == 4) + ; +} + +void test_identical_logical9(int x, int y) { + // FIXME: We should warn here + if (x == 4 || x == 4 && y == 5) + ; +} +#pragma clang diagnostic pop + +void test_warn_chained_if_stmts_1(int x) { + if (x == 1) + ; +// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone] +// CHECK-MESSAGES: :[[@LINE-2]]:6: note: end of the original + else if (x == 1) + ; +// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 1 starts here +} + +void test_warn_chained_if_stmts_2(int x) { + if (x == 1) + ; +// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone] +// CHECK-MESSAGES: :[[@LINE-2]]:6: note: end of the original + else if (x == 1) + ; +// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 1 starts here + else if (x == 1) + ; +// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 2 starts here +} + +void test_warn_chained_if_stmts_3(int x) { + if (x == 1) + ; +// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone] +// CHECK-MESSAGES: :[[@LINE-2]]:6: note: end of the original + else if (x == 2) + ; +// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 1 starts here + else if (x == 1) + ; +// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 2 starts here +} + +void test_warn_chained_if_stmts_4(int x) { + if (x == 1) + ; +// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone] +// CHECK-MESSAGES: :[[@LINE-2]]:6: note: end of the original + else if (func()) + ; +// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 1 starts here + else if (x == 1) + ; +// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 2 starts here +} + +void test_warn_chained_if_stmts_5(int x) { + if (x & 1) + ; +// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain 
[bugprone-branch-clone] +// CHECK-MESSAGES: :[[@LINE-2]]:6: note: end of the original + else if (x & 1) + ; +// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 1 starts here +} + +void test_warn_chained_if_stmts_6(int x) { + if (x == 1) + ; +// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone] +// CHECK-MESSAGES: :[[@LINE-2]]:6: note: end of the original + else if (x == 2) + ; +// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 1 starts here + else if (x == 2) + ; +// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 2 starts here + else if (x == 3) + ; +} + +void test_warn_chained_if_stmts_7(int x) { + if (x == 1) + ; +// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone] +// CHECK-MESSAGES: :[[@LINE-2]]:6: note: end of the original + else if (x == 2) + ; +// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 1 starts here + else if (x == 3) + ; +// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 2 starts here + else if (x == 2) + ; +// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 3 starts here + else if (x == 5) + ; +// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 4 starts here +} + +void test_warn_chained_if_stmts_8(int x) { + if (x == 1) + ; +// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone] +// CHECK-MESSAGES: :[[@LINE-2]]:6: note: end of the original + else if (x == 2) + ; +// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 1 starts here + else if (x == 3) + ; +// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 2 starts here + else if (x == 2) + ; +// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 3 starts here + else if (x == 5) + ; +// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 4 starts here + else if (x == 3) + ; +// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 5 starts here + else if (x == 7) + ; +// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 6 starts here +} + +void test_nowarn_chained_if_stmts_1(int x) { + if (func()) + 
; +// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone] +// CHECK-MESSAGES: :[[@LINE-2]]:6: note: end of the original + else if (func()) + ; +// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 1 starts here +} + +void test_nowarn_chained_if_stmts_2(int x) { + if (func()) + ; +// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone] +// CHECK-MESSAGES: :[[@LINE-2]]:6: note: end of the original + else if (x == 1) + ; +// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 1 starts here + else if (func()) + ; +// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 2 starts here +} + +void test_nowarn_chained_if_stmts_3(int x) { + if (x++) + ; +// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone] +// CHECK-MESSAGES: :[[@LINE-2]]:6: note: end of the original + else if (x++) + ; +// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 1 starts here +} + +void test_warn_wchar() { + const wchar_t * a = 0 ? L"Warning" : L"Warning"; +// CHECK-MESSAGES: :[[@LINE-1]]:25: warning: conditional operator with identical true and false expressions [bugprone-branch-clone] +} +void test_nowarn_wchar() { + const wchar_t * a = 0 ? L"No" : L"Warning"; +} + +void test_nowarn_long() { + int a = 0, b = 0; + long c; + if (0) { + b -= a; + c = 0; + } else { + b -= a; + c = 0LL; + } +} + +// Identical inner conditions + +void test_warn_inner_if_1(int x) { + if (x == 1) { +// CHECK-MESSAGES: :[[@LINE-1]]:3: warning: if with identical inner if statement [bugprone-branch-clone] + if (x == 1) +// CHECK-MESSAGES: :[[@LINE-1]]:5: note: inner if starts here + ; + } + + // FIXME: Should warn here. The warning is currently not emitted because there + // is code between the conditions. + if (x == 1) { + int y = x; + if (x == 1) + ; + } +} + +void test_nowarn_inner_if_1(int x) { + // Don't warn when condition has side effects. 
+ if (x++ == 1) { + if (x++ == 1) + ; + } + + // Don't warn when x is changed before inner condition. + if (x < 10) { + x++; + if (x < 10) + ; + } +} diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/alpha-core-identicalexpr.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/redundant-expression-2.cpp similarity index 67% rename from clang-tools-extra/test/clang-tidy/checkers/bugprone/alpha-core-identicalexpr.cpp rename to clang-tools-extra/test/clang-tidy/checkers/misc/redundant-expression-2.cpp index 8eff3ebc948de..8dcef30a4e754 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/alpha-core-identicalexpr.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/misc/redundant-expression-2.cpp @@ -1,5 +1,4 @@ -// RUN: clang-tidy %s -checks="-*,misc-redundant-expression" -- 2>&1 | FileCheck %s --check-prefix=CHECK-MESSAGES-IDENTEXPR -// RUN: clang-tidy %s -checks="-*,bugprone-branch-clone" -- 2>&1 | FileCheck %s --check-prefix=CHECK-MESSAGES-BUGPRONEBRANCH +// RUN: %check_clang_tidy %s misc-redundant-expression -check-suffix=IDENTEXPR %t /* Only one expected warning per function allowed at the very end. 
*/ @@ -77,6 +76,7 @@ int checkNotEqualBinaryOpFloatCompare1(void) { int res; float f= 3.14F; res = (f + 3.14F != f + 3.14F); // no warning +// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:20: warning: both sides of operator are equivalent [misc-redundant-expression] return (0); } int checkNotEqualBinaryOpFloatCompare2(void) { @@ -88,6 +88,7 @@ int checkNotEqualBinaryOpFloatCompare3(void) { int res; float f= 3.14F; res = ((int)f + 3.14F != (int)f + 3.14F); // no warning +// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:25: warning: both sides of operator are equivalent [misc-redundant-expression] return (0); } int checkNotEqualBinaryOpFloatCompare4(void) { @@ -103,6 +104,7 @@ int checkNotEqualNestedBinaryOpFloatCompare1(void) { int u= 2; float f= 3.14F; res = (((int)f + (3.14F - u)*t) != ((int)f + (3.14F - u)*t)); // no warning +// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:35: warning: both sides of operator are equivalent [misc-redundant-expression] return (0); } @@ -121,12 +123,11 @@ int checkNotEqualNestedBinaryOpFloatCompare3(void) { int u= 2; float f= 3.14F; res = (((int)f + (u - 3.14F)*t) != ((int)f + (3.14F - u)*(f + t != f + t))); // no warning +// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:67: warning: both sides of operator are equivalent [misc-redundant-expression] return (0); } - - /* end '!=' with float*/ /* '!=' with int*/ @@ -238,8 +239,6 @@ int checkNotEqualNestedBinaryOpIntCompare3(void) { /* end '!=' int */ - - /* '!=' with int pointer */ int checkNotEqualIntPointerLiteralCompare1(void) { @@ -329,6 +328,7 @@ int checkNotEqualSameFunction() { unsigned a = 0; unsigned b = 1; int res = (a+func() != a+func()); // no warning +// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:23: warning: both sides of operator are equivalent [misc-redundant-expression] return (0); } @@ -343,6 +343,7 @@ int checkNotEqualSameFunctionSameParam() { unsigned a = 0; unsigned b = 1; int res = (a+funcParam(a) != a+funcParam(a)); // no warning +// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:29: warning: both 
sides of operator are equivalent [misc-redundant-expression] return (0); } @@ -434,7 +435,8 @@ int checkEqualCastFloatDeclCompare12(void) { int checkEqualBinaryOpFloatCompare1(void) { int res; float f= 3.14F; - res = (f + 3.14F == f + 3.14F); // no warning + res = (f + 3.14F == f + 3.14F); +// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:20: warning: both sides of operator are equivalent [misc-redundant-expression] return (0); } int checkEqualBinaryOpFloatCompare2(void) { @@ -445,7 +447,8 @@ int checkEqualBinaryOpFloatCompare2(void) { int checkEqualBinaryOpFloatCompare3(void) { int res; float f= 3.14F; - res = ((int)f + 3.14F == (int)f + 3.14F); // no warning + res = ((int)f + 3.14F == (int)f + 3.14F); +// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:25: warning: both sides of operator are equivalent [misc-redundant-expression] return (0); } int checkEqualBinaryOpFloatCompare4(void) { @@ -460,7 +463,8 @@ int checkEqualNestedBinaryOpFloatCompare1(void) { int t= 1; int u= 2; float f= 3.14F; - res = (((int)f + (3.14F - u)*t) == ((int)f + (3.14F - u)*t)); // no warning + res = (((int)f + (3.14F - u)*t) == ((int)f + (3.14F - u)*t)); +// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:35: warning: both sides of operator are equivalent [misc-redundant-expression] return (0); } @@ -478,14 +482,11 @@ int checkEqualNestedBinaryOpFloatCompare3(void) { int t= 1; int u= 2; float f= 3.14F; - res = (((int)f + (u - 3.14F)*t) == ((int)f + (3.14F - u)*(f + t == f + t))); // no warning + res = (((int)f + (u - 3.14F)*t) == ((int)f + (3.14F - u)*(f + t == f + t))); +// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:67: warning: both sides of operator are equivalent [misc-redundant-expression] return (0); } - - - - /* Equal with int*/ int checkEqualIntLiteralCompare1(void) { @@ -600,7 +601,8 @@ int checkEqualNestedBinaryOpIntCompare3(void) { int checkEqualSameFunction() { unsigned a = 0; unsigned b = 1; - int res = (a+func() == a+func()); // no warning + int res = (a+func() == a+func()); +// 
CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:23: warning: both sides of operator are equivalent [misc-redundant-expression] return (0); } @@ -614,7 +616,8 @@ int checkEqualDifferentFunction() { int checkEqualSameFunctionSameParam() { unsigned a = 0; unsigned b = 1; - int res = (a+funcParam(a) == a+funcParam(a)); // no warning + int res = (a+funcParam(a) == a+funcParam(a)); +// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:29: warning: both sides of operator are equivalent [misc-redundant-expression] return (0); } @@ -692,7 +695,8 @@ int checkLessThanCastFloatDeclCompare12(void) { int checkLessThanBinaryOpFloatCompare1(void) { int res; float f= 3.14F; - res = (f + 3.14F < f + 3.14F); // no warning + res = (f + 3.14F < f + 3.14F); +// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:20: warning: both sides of operator are equivalent [misc-redundant-expression] return (0); } int checkLessThanBinaryOpFloatCompare2(void) { @@ -703,7 +707,8 @@ int checkLessThanBinaryOpFloatCompare2(void) { int checkLessThanBinaryOpFloatCompare3(void) { int res; float f= 3.14F; - res = ((int)f + 3.14F < (int)f + 3.14F); // no warning + res = ((int)f + 3.14F < (int)f + 3.14F); +// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:25: warning: both sides of operator are equivalent [misc-redundant-expression] return (0); } int checkLessThanBinaryOpFloatCompare4(void) { @@ -718,7 +723,8 @@ int checkLessThanNestedBinaryOpFloatCompare1(void) { int t= 1; int u= 2; float f= 3.14F; - res = (((int)f + (3.14F - u)*t) < ((int)f + (3.14F - u)*t)); // no warning + res = (((int)f + (3.14F - u)*t) < ((int)f + (3.14F - u)*t)); +// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:35: warning: both sides of operator are equivalent [misc-redundant-expression] return (0); } @@ -736,7 +742,8 @@ int checkLessThanNestedBinaryOpFloatCompare3(void) { int t= 1; int u= 2; float f= 3.14F; - res = (((int)f + (u - 3.14F)*t) < ((int)f + (3.14F - u)*(f + t < f + t))); // no warning + res = (((int)f + (u - 3.14F)*t) < ((int)f + (3.14F - u)*(f + t < f + t))); +// 
CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:66: warning: both sides of operator are equivalent [misc-redundant-expression] return (0); } @@ -912,7 +919,8 @@ int checkGreaterThanCastFloatDeclCompare12(void) { int checkGreaterThanBinaryOpFloatCompare1(void) { int res; float f= 3.14F; - res = (f + 3.14F > f + 3.14F); // no warning + res = (f + 3.14F > f + 3.14F); +// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:20: warning: both sides of operator are equivalent [misc-redundant-expression] return (0); } int checkGreaterThanBinaryOpFloatCompare2(void) { @@ -923,7 +931,8 @@ int checkGreaterThanBinaryOpFloatCompare2(void) { int checkGreaterThanBinaryOpFloatCompare3(void) { int res; float f= 3.14F; - res = ((int)f + 3.14F > (int)f + 3.14F); // no warning + res = ((int)f + 3.14F > (int)f + 3.14F); +// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:25: warning: both sides of operator are equivalent [misc-redundant-expression] return (0); } int checkGreaterThanBinaryOpFloatCompare4(void) { @@ -938,7 +947,8 @@ int checkGreaterThanNestedBinaryOpFloatCompare1(void) { int t= 1; int u= 2; float f= 3.14F; - res = (((int)f + (3.14F - u)*t) > ((int)f + (3.14F - u)*t)); // no warning + res = (((int)f + (3.14F - u)*t) > ((int)f + (3.14F - u)*t)); +// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:35: warning: both sides of operator are equivalent [misc-redundant-expression] return (0); } @@ -957,6 +967,7 @@ int checkGreaterThanNestedBinaryOpFloatCompare3(void) { int u= 2; float f= 3.14F; res = (((int)f + (u - 3.14F)*t) > ((int)f + (3.14F - u)*(f + t > f + t))); // no warning +// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:66: warning: both sides of operator are equivalent [misc-redundant-expression] return (0); } @@ -1066,7 +1077,6 @@ unsigned test_unsigned(unsigned a) { unsigned b = 1; a = a > 5 ? 
b : b; // CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:17: warning: 'true' and 'false' expressions are equivalent [misc-redundant-expression] -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone] return a; } @@ -1074,13 +1084,11 @@ void test_signed() { int a = 0; a = a > 5 ? a : a; // CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:17: warning: 'true' and 'false' expressions are equivalent [misc-redundant-expression] -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone] } void test_bool(bool a) { a = a > 0 ? a : a; // CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:17: warning: 'true' and 'false' expressions are equivalent [misc-redundant-expression] -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone] } void test_float() { @@ -1088,14 +1096,12 @@ void test_float() { float b = 0; a = a > 5 ? a : a; // CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:17: warning: 'true' and 'false' expressions are equivalent [misc-redundant-expression] -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone] } const char *test_string() { float a = 0; return a > 5 ? "abc" : "abc"; // CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:24: warning: 'true' and 'false' expressions are equivalent [misc-redundant-expression] -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:16: warning: conditional operator with identical true and false expressions [bugprone-branch-clone] } void test_unsigned_expr() { @@ -1103,7 +1109,6 @@ void test_unsigned_expr() { unsigned b = 0; a = a > 5 ? 
a+b : a+b; // CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:19: warning: 'true' and 'false' expressions are equivalent [misc-redundant-expression] -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone] } void test_signed_expr() { @@ -1111,14 +1116,12 @@ void test_signed_expr() { int b = 1; a = a > 5 ? a+b : a+b; // CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:19: warning: 'true' and 'false' expressions are equivalent [misc-redundant-expression] -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone] } void test_bool_expr(bool a) { bool b = 0; a = a > 0 ? a&&b : a&&b; // CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:20: warning: 'true' and 'false' expressions are equivalent [misc-redundant-expression] -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone] } void test_unsigned_expr_negative() { @@ -1143,7 +1146,6 @@ void test_float_expr_positive() { float b = 0; a = a > 5 ? a+b : a+b; // CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:19: warning: 'true' and 'false' expressions are equivalent [misc-redundant-expression] -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone] } void test_expr_positive_func() { @@ -1151,7 +1153,6 @@ void test_expr_positive_func() { unsigned b = 1; a = a > 5 ? a+func() : a+func(); // CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:24: warning: 'true' and 'false' expressions are equivalent [misc-redundant-expression] -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone] } void test_expr_negative_func() { @@ -1165,7 +1166,6 @@ void test_expr_positive_funcParam() { unsigned b = 1; a = a > 5 ? 
a+funcParam(b) : a+funcParam(b); // CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:30: warning: 'true' and 'false' expressions are equivalent [misc-redundant-expression] -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone] } void test_expr_negative_funcParam() { @@ -1174,26 +1174,12 @@ void test_expr_negative_funcParam() { a = a > 5 ? a+funcParam(a) : a+funcParam(b); // no warning } -void test_expr_positive_inc() { - unsigned a = 0; - unsigned b = 1; - a = a > 5 ? a++ : a++; -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone] -} - void test_expr_negative_inc() { unsigned a = 0; unsigned b = 1; a = a > 5 ? a++ : b++; // no warning } -void test_expr_positive_assign() { - unsigned a = 0; - unsigned b = 1; - a = a > 5 ? a=1 : a=1; -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone] -} - void test_expr_negative_assign() { unsigned a = 0; unsigned b = 1; @@ -1206,7 +1192,6 @@ void test_signed_nested_expr() { int c = 3; a = a > 5 ? a+b+(c+a)*(a + b*(c+a)) : a+b+(c+a)*(a + b*(c+a)); // CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:39: warning: 'true' and 'false' expressions are equivalent [misc-redundant-expression] -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone] } void test_signed_nested_expr_negative() { @@ -1229,190 +1214,6 @@ void test_signed_nested_cond_expr() { int c = 3; a = a > 5 ? (b > 5 ? 1 : 4) : (b > 5 ? 
4 : 4); // CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:44: warning: 'true' and 'false' expressions are equivalent [misc-redundant-expression] -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:40: warning: conditional operator with identical true and false expressions [bugprone-branch-clone] -} - -void test_identical_branches1(bool b) { - int i = 0; - if (b) { -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone] - ++i; - } else { -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: else branch starts here - ++i; - } -} - -void test_identical_branches2(bool b) { - int i = 0; - if (b) { -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone] - ++i; - } else -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: else branch starts here - ++i; -} - -void test_identical_branches3(bool b) { - int i = 0; - if (b) { // no warning - ++i; - } else { - i++; - } -} - -void test_identical_branches4(bool b) { - int i = 0; - if (b) { -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone] - } else { -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: else branch starts here - } -} - -void test_identical_branches_break(bool b) { - while (true) { - if (b) -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: warning: if with identical then and else branches [bugprone-branch-clone] - break; - else -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: else branch starts here - break; - } -} - -void test_identical_branches_continue(bool b) { - while (true) { - if (b) -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: warning: if with identical then and else branches [bugprone-branch-clone] - continue; - else -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: else branch starts here - continue; - } -} - -void test_identical_branches_func(bool b) { - if (b) -// 
CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone] - func(); - else -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:3: note: else branch starts here - func(); -} - -void test_identical_branches_func_arguments(bool b) { - if (b) // no-warning - funcParam(1); - else - funcParam(2); -} - -void test_identical_branches_cast1(bool b) { - long v = -7; - if (b) // no-warning - v = (signed int) v; - else - v = (unsigned int) v; -} - -void test_identical_branches_cast2(bool b) { - long v = -7; - if (b) -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone] - v = (signed int) v; - else -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:3: note: else branch starts here - v = (signed int) v; -} - -int test_identical_branches_return_int(bool b) { - int i = 0; - if (b) { -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone] - i++; - return i; - } else { -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: else branch starts here - i++; - return i; - } -} - -int test_identical_branches_return_func(bool b) { - if (b) { -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone] - return func(); - } else { -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: else branch starts here - return func(); - } -} - -void test_identical_branches_for(bool b) { - int i; - int j; - if (b) { -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone] - for (i = 0, j = 0; i < 10; i++) - j += 4; - } else { -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: else branch starts here - for (i = 0, j = 0; i < 10; i++) - j += 4; - } -} - -void test_identical_branches_while(bool b) { - int i = 10; - if (b) { -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:3: 
warning: if with identical then and else branches [bugprone-branch-clone] - while (func()) - i--; - } else { -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: else branch starts here - while (func()) - i--; - } -} - -void test_identical_branches_while_2(bool b) { - int i = 10; - if (b) { // no-warning - while (func()) - i--; - } else { - while (func()) - i++; - } -} - -void test_identical_branches_do_while(bool b) { - int i = 10; - if (b) { -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone] - do { - i--; - } while (func()); - } else { -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: else branch starts here - do { - i--; - } while (func()); - } -} - -void test_identical_branches_if(bool b, int i) { - if (b) { -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone] - if (i < 5) - i += 10; - } else { -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: else branch starts here - if (i < 5) - i += 10; - } } void test_identical_bitwise1() { @@ -1473,7 +1274,8 @@ void test_identical_logical3(int a) { } void test_identical_logical4(int a) { - if (a == func() || a == func()) // no-warning + if (a == func() || a == func()) +// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:19: warning: both sides of operator are equivalent [misc-redundant-expression] ; } @@ -1508,208 +1310,3 @@ void test_identical_logical9(int x, int y) { ; } #pragma clang diagnostic pop - -void test_warn_chained_if_stmts_1(int x) { - if (x == 1) - ; -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone] -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:6: note: end of the original - else if (x == 1) - ; -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 1 starts here -} - -void test_warn_chained_if_stmts_2(int x) { - if (x == 1) - ; -// CHECK-MESSAGES-BUGPRONEBRANCH: 
:[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone] -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:6: note: end of the original - else if (x == 1) - ; -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 1 starts here - else if (x == 1) - ; -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 2 starts here -} - -void test_warn_chained_if_stmts_3(int x) { - if (x == 1) - ; -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone] -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:6: note: end of the original - else if (x == 2) - ; -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 1 starts here - else if (x == 1) - ; -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 2 starts here -} - -void test_warn_chained_if_stmts_4(int x) { - if (x == 1) - ; -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone] -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:6: note: end of the original - else if (func()) - ; -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 1 starts here - else if (x == 1) - ; -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 2 starts here -} - -void test_warn_chained_if_stmts_5(int x) { - if (x & 1) - ; -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone] -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:6: note: end of the original - else if (x & 1) - ; -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 1 starts here -} - -void test_warn_chained_if_stmts_6(int x) { - if (x == 1) - ; -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone] -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:6: note: end of the original - else if (x == 2) - ; -// 
CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 1 starts here - else if (x == 2) - ; -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 2 starts here - else if (x == 3) - ; -} - -void test_warn_chained_if_stmts_7(int x) { - if (x == 1) - ; -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone] -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:6: note: end of the original - else if (x == 2) - ; -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 1 starts here - else if (x == 3) - ; -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 2 starts here - else if (x == 2) - ; -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 3 starts here - else if (x == 5) - ; -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 4 starts here -} - -void test_warn_chained_if_stmts_8(int x) { - if (x == 1) - ; -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone] -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:6: note: end of the original - else if (x == 2) - ; -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 1 starts here - else if (x == 3) - ; -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 2 starts here - else if (x == 2) - ; -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 3 starts here - else if (x == 5) - ; -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 4 starts here - else if (x == 3) - ; -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 5 starts here - else if (x == 7) - ; -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 6 starts here -} - -void test_nowarn_chained_if_stmts_1(int x) { - if (func()) - ; -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone] -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:6: note: end of the 
original - else if (func()) - ; -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 1 starts here -} - -void test_nowarn_chained_if_stmts_2(int x) { - if (func()) - ; -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone] -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:6: note: end of the original - else if (x == 1) - ; -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 1 starts here - else if (func()) - ; -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 2 starts here -} - -void test_nowarn_chained_if_stmts_3(int x) { - if (x++) - ; -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone] -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:6: note: end of the original - else if (x++) - ; -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 1 starts here -} - -void test_warn_wchar() { - const wchar_t * a = 0 ? L"Warning" : L"Warning"; -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:25: warning: conditional operator with identical true and false expressions [bugprone-branch-clone] -} -void test_nowarn_wchar() { - const wchar_t * a = 0 ? L"No" : L"Warning"; -} - -void test_nowarn_long() { - int a = 0, b = 0; - long c; - if (0) { - b -= a; - c = 0; - } else { - b -= a; - c = 0LL; - } -} - -// Identical inner conditions - -void test_warn_inner_if_1(int x) { - if (x == 1) { -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:3: warning: if with identical inner if statement [bugprone-branch-clone] - if (x == 1) -// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: inner if starts here - ; - } - - // FIXME: Should warn here. The warning is currently not emitted because there - // is code between the conditions. - if (x == 1) { - int y = x; - if (x == 1) - ; - } -} - -void test_nowarn_inner_if_1(int x) { - // Don't warn when condition has side effects. 
- if (x++ == 1) { - if (x++ == 1) - ; - } - - // Don't warn when x is changed before inner condition. - if (x < 10) { - x++; - if (x < 10) - ; - } -} From 334a5766d7591bfaadf6990b3d8568c9688e22a5 Mon Sep 17 00:00:00 2001 From: Richard Dzenis Date: Tue, 24 Dec 2024 16:05:10 +0200 Subject: [PATCH 016/567] [llvm-objcopy] Add support of symbol modification flags for MachO (#120895) This patch adds support of the following llvm-objcopy flags for MachO: - `--globalize-symbol`, `--globalize-symbols`, - `--keep-global-symbol`, `-G`, `--keep-global-symbols`, - `--localize-symbol`, `-L`, `--localize-symbols`, - `--skip-symbol`, `--skip-symbols`. Code in `updateAndRemoveSymbols` for MachO is kept similar to its version for ELF. Fixes #120894 --- llvm/docs/CommandGuide/llvm-objcopy.rst | 100 ++++++------ llvm/docs/ReleaseNotes.md | 6 + llvm/lib/ObjCopy/ConfigManager.cpp | 6 +- llvm/lib/ObjCopy/MachO/MachOObjcopy.cpp | 35 ++++- llvm/lib/ObjCopy/MachO/MachOObject.cpp | 13 ++ llvm/lib/ObjCopy/MachO/MachOObject.h | 1 + .../llvm-objcopy/MachO/globalize-symbol.test | 134 ++++++++++++++++ .../MachO/keep-global-symbol.test | 147 +++++++++++++++++ .../llvm-objcopy/MachO/localize-symbol.test | 131 ++++++++++++++++ .../tools/llvm-objcopy/MachO/skip-symbol.test | 148 ++++++++++++++++++ 10 files changed, 659 insertions(+), 62 deletions(-) create mode 100644 llvm/test/tools/llvm-objcopy/MachO/globalize-symbol.test create mode 100644 llvm/test/tools/llvm-objcopy/MachO/keep-global-symbol.test create mode 100644 llvm/test/tools/llvm-objcopy/MachO/localize-symbol.test create mode 100644 llvm/test/tools/llvm-objcopy/MachO/skip-symbol.test diff --git a/llvm/docs/CommandGuide/llvm-objcopy.rst b/llvm/docs/CommandGuide/llvm-objcopy.rst index e6af47ce9710a..be4876cad6760 100644 --- a/llvm/docs/CommandGuide/llvm-objcopy.rst +++ b/llvm/docs/CommandGuide/llvm-objcopy.rst @@ -78,10 +78,47 @@ multiple file formats. Enable deterministic mode when copying archives, i.e. 
use 0 for archive member header UIDs, GIDs and timestamp fields. On by default. +.. option:: --globalize-symbol + + Mark any defined symbols named ```` as global symbols in the output. + Can be specified multiple times to mark multiple symbols. + +.. option:: --globalize-symbols + + Read a list of names from the file ```` and mark defined symbols with + those names as global in the output. In the file, each line represents a single + symbol, with leading and trailing whitespace ignored, as is anything following + a '#'. Can be specified multiple times to read names from multiple files. + .. option:: --help, -h Print a summary of command line options. +.. option:: --keep-global-symbol , -G + + Mark all symbols local in the output, except for symbols with the name + ````. Can be specified multiple times to ignore multiple symbols. + +.. option:: --keep-global-symbols + + Mark all symbols local in the output, except for symbols named in the file + ````. In the file, each line represents a single symbol, with leading + and trailing whitespace ignored, as is anything following a '#'. Can be + specified multiple times to read names from multiple files. + +.. option:: --localize-symbol , -L + + Mark any defined non-common symbol named ```` as a local symbol in the + output. Can be specified multiple times to mark multiple symbols as local. + +.. option:: --localize-symbols + + Read a list of names from the file ```` and mark defined non-common + symbols with those names as local in the output. In the file, each line + represents a single symbol, with leading and trailing whitespace ignored, as is + anything following a '#'. Can be specified multiple times to read names from + multiple files. + .. option:: --only-keep-debug Produce a debug file as the output that only preserves contents of sections @@ -177,6 +214,19 @@ multiple file formats. flags. - `share` = add the `IMAGE_SCN_MEM_SHARED` and `IMAGE_SCN_MEM_READ` flags. +.. 
option:: --skip-symbol + + Do not change the parameters of symbol ```` when executing other + options that can change the symbol's name, binding or visibility. + +.. option:: --skip-symbols + + Do not change the parameters of symbols named in the file ```` when + executing other options that can change the symbol's name, binding or + visibility. In the file, each line represents a single symbol, with leading + and trailing whitespace ignored, as is anything following a '#'. + Can be specified multiple times to read names from multiple files. + .. option:: --strip-all-gnu Remove all symbols, debug sections and relocations from the output. This option @@ -355,18 +405,6 @@ them. For binary outputs, fill the gaps between sections with ```` instead of zero. The value must be an unsigned 8-bit integer. -.. option:: --globalize-symbol - - Mark any defined symbols named ```` as global symbols in the output. - Can be specified multiple times to mark multiple symbols. - -.. option:: --globalize-symbols - - Read a list of names from the file ```` and mark defined symbols with - those names as global in the output. In the file, each line represents a single - symbol, with leading and trailing whitespace ignored, as is anything following - a '#'. Can be specified multiple times to read names from multiple files. - .. option:: --input-target , -I Read the input as the specified format. See `SUPPORTED FORMATS`_ for a list of @@ -377,18 +415,6 @@ them. Keep symbols of type `STT_FILE`, even if they would otherwise be stripped. -.. option:: --keep-global-symbol , -G - - Mark all symbols local in the output, except for symbols with the name - ````. Can be specified multiple times to ignore multiple symbols. - -.. option:: --keep-global-symbols - - Mark all symbols local in the output, except for symbols named in the file - ````. In the file, each line represents a single symbol, with leading - and trailing whitespace ignored, as is anything following a '#'. 
Can be - specified multiple times to read names from multiple files. - .. option:: --keep-section
When removing sections from the output, do not remove sections named @@ -410,19 +436,6 @@ them. Mark all symbols with hidden or internal visibility local in the output. -.. option:: --localize-symbol , -L - - Mark any defined non-common symbol named ```` as a local symbol in the - output. Can be specified multiple times to mark multiple symbols as local. - -.. option:: --localize-symbols - - Read a list of names from the file ```` and mark defined non-common - symbols with those names as local in the output. In the file, each line - represents a single symbol, with leading and trailing whitespace ignored, as is - anything following a '#'. Can be specified multiple times to read names from - multiple files. - .. option:: --new-symbol-visibility Specify the visibility of the symbols automatically created when using binary @@ -489,19 +502,6 @@ them. Read a list of symbols from and change their visibility to the specified value. Visibility values: default, internal, hidden, protected. -.. option:: --skip-symbol - - Do not change the parameters of symbol ```` when executing other - options that can change the symbol's name, binding or visibility. - -.. option:: --skip-symbols - - Do not change the parameters of symbols named in the file ```` when - executing other options that can change the symbol's name, binding or - visibility. In the file, each line represents a single symbol, with leading - and trailing whitespace ignored, as is anything following a '#'. - Can be specified multiple times to read names from multiple files. - .. 
option:: --split-dwo Equivalent to running :program:`llvm-objcopy` with :option:`--extract-dwo` and diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index cc0e21d6c4036..4486218d4f883 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -350,6 +350,12 @@ Changes to the Debug Info Changes to the LLVM tools --------------------------------- +* llvm-objcopy now supports the following options for Mach-O: + `--globalize-symbol`, `--globalize-symbols`, + `--keep-global-symbol`, `--keep-global-symbols`, + `--localize-symbol`, `--localize-symbols`, + `--skip-symbol`, `--skip-symbols`. + Changes to LLDB --------------------------------- diff --git a/llvm/lib/ObjCopy/ConfigManager.cpp b/llvm/lib/ObjCopy/ConfigManager.cpp index 78fc0c451e1a3..79bbb289d1623 100644 --- a/llvm/lib/ObjCopy/ConfigManager.cpp +++ b/llvm/lib/ObjCopy/ConfigManager.cpp @@ -36,11 +36,9 @@ Expected ConfigManager::getCOFFConfig() const { Expected ConfigManager::getMachOConfig() const { if (!Common.SplitDWO.empty() || !Common.SymbolsPrefix.empty() || - !Common.SymbolsPrefixRemove.empty() || !Common.SymbolsToSkip.empty() || + !Common.SymbolsPrefixRemove.empty() || !Common.AllocSectionsPrefix.empty() || !Common.KeepSection.empty() || - !Common.SymbolsToGlobalize.empty() || !Common.SymbolsToKeep.empty() || - !Common.SymbolsToLocalize.empty() || - !Common.SymbolsToKeepGlobal.empty() || !Common.SectionsToRename.empty() || + !Common.SymbolsToKeep.empty() || !Common.SectionsToRename.empty() || !Common.UnneededSymbolsToRemove.empty() || !Common.SetSectionAlignment.empty() || !Common.SetSectionFlags.empty() || !Common.SetSectionType.empty() || Common.ExtractDWO || diff --git a/llvm/lib/ObjCopy/MachO/MachOObjcopy.cpp b/llvm/lib/ObjCopy/MachO/MachOObjcopy.cpp index 91500c2d9dd47..a188425b283fa 100644 --- a/llvm/lib/ObjCopy/MachO/MachOObjcopy.cpp +++ b/llvm/lib/ObjCopy/MachO/MachOObjcopy.cpp @@ -93,19 +93,38 @@ static void markSymbols(const CommonConfig &, Object &Obj) { static 
void updateAndRemoveSymbols(const CommonConfig &Config, const MachOConfig &MachOConfig, Object &Obj) { - for (SymbolEntry &Sym : Obj.SymTable) { - // Weaken symbols first to match ELFObjcopy behavior. - bool IsExportedAndDefined = - (Sym.n_type & llvm::MachO::N_EXT) && - (Sym.n_type & llvm::MachO::N_TYPE) != llvm::MachO::N_UNDF; - if (IsExportedAndDefined && + Obj.SymTable.updateSymbols([&](SymbolEntry &Sym) { + if (Config.SymbolsToSkip.matches(Sym.Name)) + return; + + if (!Sym.isUndefinedSymbol() && Config.SymbolsToLocalize.matches(Sym.Name)) + Sym.n_type &= ~MachO::N_EXT; + + // Note: these two globalize flags have very similar names but different + // meanings: + // + // --globalize-symbol: promote a symbol to global + // --keep-global-symbol: all symbols except for these should be made local + // + // If --globalize-symbol is specified for a given symbol, it will be + // global in the output file even if it is not included via + // --keep-global-symbol. Because of that, make sure to check + // --globalize-symbol second. 
+ if (!Sym.isUndefinedSymbol() && !Config.SymbolsToKeepGlobal.empty() && + !Config.SymbolsToKeepGlobal.matches(Sym.Name)) + Sym.n_type &= ~MachO::N_EXT; + + if (!Sym.isUndefinedSymbol() && Config.SymbolsToGlobalize.matches(Sym.Name)) + Sym.n_type |= MachO::N_EXT; + + if (Sym.isExternalSymbol() && !Sym.isUndefinedSymbol() && (Config.Weaken || Config.SymbolsToWeaken.matches(Sym.Name))) - Sym.n_desc |= llvm::MachO::N_WEAK_DEF; + Sym.n_desc |= MachO::N_WEAK_DEF; auto I = Config.SymbolsToRename.find(Sym.Name); if (I != Config.SymbolsToRename.end()) Sym.Name = std::string(I->getValue()); - } + }); auto RemovePred = [&Config, &MachOConfig, &Obj](const std::unique_ptr &N) { diff --git a/llvm/lib/ObjCopy/MachO/MachOObject.cpp b/llvm/lib/ObjCopy/MachO/MachOObject.cpp index d593d6788e112..8d2c02dc37c99 100644 --- a/llvm/lib/ObjCopy/MachO/MachOObject.cpp +++ b/llvm/lib/ObjCopy/MachO/MachOObject.cpp @@ -33,6 +33,19 @@ SymbolEntry *SymbolTable::getSymbolByIndex(uint32_t Index) { static_cast(this)->getSymbolByIndex(Index)); } +void SymbolTable::updateSymbols(function_ref Callable) { + for (auto &Sym : Symbols) + Callable(*Sym); + + // Partition symbols: local < defined external < undefined external. 
+ auto ExternalBegin = std::stable_partition( + std::begin(Symbols), std::end(Symbols), + [](const auto &Sym) { return Sym->isLocalSymbol(); }); + std::stable_partition(ExternalBegin, std::end(Symbols), [](const auto &Sym) { + return !Sym->isUndefinedSymbol(); + }); +} + void SymbolTable::removeSymbols( function_ref &)> ToRemove) { llvm::erase_if(Symbols, ToRemove); diff --git a/llvm/lib/ObjCopy/MachO/MachOObject.h b/llvm/lib/ObjCopy/MachO/MachOObject.h index b3303fd291c82..a454c4f502fd6 100644 --- a/llvm/lib/ObjCopy/MachO/MachOObject.h +++ b/llvm/lib/ObjCopy/MachO/MachOObject.h @@ -142,6 +142,7 @@ struct SymbolTable { const SymbolEntry *getSymbolByIndex(uint32_t Index) const; SymbolEntry *getSymbolByIndex(uint32_t Index); + void updateSymbols(function_ref Callable); void removeSymbols( function_ref &)> ToRemove); }; diff --git a/llvm/test/tools/llvm-objcopy/MachO/globalize-symbol.test b/llvm/test/tools/llvm-objcopy/MachO/globalize-symbol.test new file mode 100644 index 0000000000000..ea47d44813402 --- /dev/null +++ b/llvm/test/tools/llvm-objcopy/MachO/globalize-symbol.test @@ -0,0 +1,134 @@ +# RUN: yaml2obj %s -o %t +# RUN: llvm-objcopy --wildcard --globalize-symbol="*" %t %t.copy +# RUN: llvm-readobj --symbols %t.copy | FileCheck %s + +# RUN: echo "*" > %t-star.txt +# RUN: llvm-objcopy --wildcard --globalize-symbols="%t-star.txt" %t %t.copy +# RUN: llvm-readobj --symbols %t.copy | FileCheck %s + +# CHECK: Symbols [ +# CHECK-NEXT: Symbol { +# CHECK-NEXT: Name: _PrivateSymbol +# CHECK-NEXT: Extern +# CHECK-NEXT: Type: Section (0xE) +# CHECK-NEXT: Section: __text (0x1) +# CHECK-NEXT: RefType: UndefinedNonLazy (0x0) +# CHECK-NEXT: Flags [ (0x0) +# CHECK-NEXT: ] +# CHECK-NEXT: Value: 0x1 +# CHECK-NEXT: } +# CHECK-NEXT: Symbol { +# CHECK-NEXT: Name: _PrivateExternalSymbol +# CHECK-NEXT: PrivateExtern +# CHECK-NEXT: Extern +# CHECK-NEXT: Type: Section (0xE) +# CHECK-NEXT: Section: __text (0x1) +# CHECK-NEXT: RefType: UndefinedNonLazy (0x0) +# CHECK-NEXT: Flags [ (0x0) 
+# CHECK-NEXT: ] +# CHECK-NEXT: Value: 0x2 +# CHECK-NEXT: } +# CHECK-NEXT: Symbol { +# CHECK-NEXT: Name: _CommonSymbol +# CHECK-NEXT: Extern +# CHECK-NEXT: Type: Section (0xE) +# CHECK-NEXT: Section: __text (0x1) +# CHECK-NEXT: RefType: UndefinedNonLazy (0x0) +# CHECK-NEXT: Flags [ (0x0) +# CHECK-NEXT: ] +# CHECK-NEXT: Value: 0x3 +# CHECK-NEXT: } +# CHECK-NEXT: Symbol { +# CHECK-NEXT: Name: _UndefinedExternalSymbol +# CHECK-NEXT: Extern +# CHECK-NEXT: Type: Undef (0x0) +# CHECK-NEXT: Section: (0x0) +# CHECK-NEXT: RefType: UndefinedNonLazy (0x0) +# CHECK-NEXT: Flags [ (0x0) +# CHECK-NEXT: ] +# CHECK-NEXT: Value: 0x0 +# CHECK-NEXT: } +# CHECK-NEXT: ] + +--- !mach-o +FileHeader: + magic: 0xFEEDFACF + cputype: 0x100000C + cpusubtype: 0x0 + filetype: 0x2 + ncmds: 3 + sizeofcmds: 328 + flags: 0x200085 + reserved: 0x0 +LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 152 + segname: __TEXT + vmaddr: 4294967296 + vmsize: 4096 + fileoff: 0 + filesize: 4096 + maxprot: 5 + initprot: 5 + nsects: 1 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0x100000FF8 + size: 8 + offset: 0xFF8 + align: 2 + reloff: 0x0 + nreloc: 0 + flags: 0x80000400 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 00008052C0035FD6 + - cmd: LC_SEGMENT_64 + cmdsize: 72 + segname: __LINKEDIT + vmaddr: 4294971392 + vmsize: 4096 + fileoff: 4096 + filesize: 67 + maxprot: 1 + initprot: 1 + nsects: 0 + flags: 0 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 4096 + nsyms: 4 + stroff: 4164 + strsize: 79 +LinkEditData: + NameList: + - n_strx: 2 + n_type: 0x0E + n_sect: 1 + n_desc: 0 + n_value: 1 + - n_strx: 17 + n_type: 0x1E + n_sect: 1 + n_desc: 0 + n_value: 2 + - n_strx: 40 + n_type: 0x0F + n_sect: 1 + n_desc: 0 + n_value: 3 + - n_strx: 54 + n_type: 0x01 + n_sect: 0 + n_desc: 0 + n_value: 0 + StringTable: + - ' ' + - _PrivateSymbol + - _PrivateExternalSymbol + - _CommonSymbol + - _UndefinedExternalSymbol +... 
diff --git a/llvm/test/tools/llvm-objcopy/MachO/keep-global-symbol.test b/llvm/test/tools/llvm-objcopy/MachO/keep-global-symbol.test new file mode 100644 index 0000000000000..009a732667a16 --- /dev/null +++ b/llvm/test/tools/llvm-objcopy/MachO/keep-global-symbol.test @@ -0,0 +1,147 @@ +# RUN: yaml2obj %s -o %t +# RUN: llvm-objcopy --keep-global-symbol _CommonSymbol %t %t.copy +# RUN: llvm-readobj --symbols %t.copy | FileCheck %s + +# RUN: echo _CommonSymbol > %t-sym-list.txt +# RUN: llvm-objcopy --wildcard --keep-global-symbols="%t-sym-list.txt" %t %t.copy +# RUN: llvm-readobj --symbols %t.copy | FileCheck %s + +# CHECK: Symbols [ +# CHECK-NEXT: Symbol { +# CHECK-NEXT: Name: _PrivateSymbol +# CHECK-NEXT: Type: Section (0xE) +# CHECK-NEXT: Section: __text (0x1) +# CHECK-NEXT: RefType: UndefinedNonLazy (0x0) +# CHECK-NEXT: Flags [ (0x0) +# CHECK-NEXT: ] +# CHECK-NEXT: Value: 0x1 +# CHECK-NEXT: } +# CHECK-NEXT: Symbol { +# CHECK-NEXT: Name: _PrivateExternalSymbol +# CHECK-NEXT: PrivateExtern +# CHECK-NEXT: Type: Section (0xE) +# CHECK-NEXT: Section: __text (0x1) +# CHECK-NEXT: RefType: UndefinedNonLazy (0x0) +# CHECK-NEXT: Flags [ (0x0) +# CHECK-NEXT: ] +# CHECK-NEXT: Value: 0x2 +# CHECK-NEXT: } +# CHECK-NEXT: Symbol { +# CHECK-NEXT: Name: _CommonSymbol2 +# CHECK-NEXT: Type: Section (0xE) +# CHECK-NEXT: Section: __text (0x1) +# CHECK-NEXT: RefType: UndefinedNonLazy (0x0) +# CHECK-NEXT: Flags [ (0x0) +# CHECK-NEXT: ] +# CHECK-NEXT: Value: 0x4 +# CHECK-NEXT: } +# CHECK-NEXT: Symbol { +# CHECK-NEXT: Name: _CommonSymbol +# CHECK-NEXT: Extern +# CHECK-NEXT: Type: Section (0xE) +# CHECK-NEXT: Section: __text (0x1) +# CHECK-NEXT: RefType: UndefinedNonLazy (0x0) +# CHECK-NEXT: Flags [ (0x0) +# CHECK-NEXT: ] +# CHECK-NEXT: Value: 0x3 +# CHECK-NEXT: } +# CHECK-NEXT: Symbol { +# CHECK-NEXT: Name: _UndefinedExternalSymbol +# CHECK-NEXT: Extern +# CHECK-NEXT: Type: Undef (0x0) +# CHECK-NEXT: Section: (0x0) +# CHECK-NEXT: RefType: UndefinedNonLazy (0x0) +# CHECK-NEXT: Flags [ (0x0) 
+# CHECK-NEXT: ] +# CHECK-NEXT: Value: 0x0 +# CHECK-NEXT: } +# CHECK-NEXT: ] + +--- !mach-o +FileHeader: + magic: 0xFEEDFACF + cputype: 0x100000C + cpusubtype: 0x0 + filetype: 0x2 + ncmds: 3 + sizeofcmds: 328 + flags: 0x200085 + reserved: 0x0 +LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 152 + segname: __TEXT + vmaddr: 4294967296 + vmsize: 4096 + fileoff: 0 + filesize: 4096 + maxprot: 5 + initprot: 5 + nsects: 1 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0x100000FF8 + size: 8 + offset: 0xFF8 + align: 2 + reloff: 0x0 + nreloc: 0 + flags: 0x80000400 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 00008052C0035FD6 + - cmd: LC_SEGMENT_64 + cmdsize: 72 + segname: __LINKEDIT + vmaddr: 4294971392 + vmsize: 4096 + fileoff: 4096 + filesize: 94 + maxprot: 1 + initprot: 1 + nsects: 0 + flags: 0 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 4096 + nsyms: 5 + stroff: 4176 + strsize: 94 +LinkEditData: + NameList: + - n_strx: 2 + n_type: 0x0E + n_sect: 1 + n_desc: 0 + n_value: 1 + - n_strx: 17 + n_type: 0x1E + n_sect: 1 + n_desc: 0 + n_value: 2 + - n_strx: 40 + n_type: 0x0F + n_sect: 1 + n_desc: 0 + n_value: 3 + - n_strx: 54 + n_type: 0x0F + n_sect: 1 + n_desc: 0 + n_value: 4 + - n_strx: 69 + n_type: 0x01 + n_sect: 0 + n_desc: 0 + n_value: 0 + StringTable: + - ' ' + - _PrivateSymbol + - _PrivateExternalSymbol + - _CommonSymbol + - _CommonSymbol2 + - _UndefinedExternalSymbol +... 
diff --git a/llvm/test/tools/llvm-objcopy/MachO/localize-symbol.test b/llvm/test/tools/llvm-objcopy/MachO/localize-symbol.test new file mode 100644 index 0000000000000..131d3bf3b3b7d --- /dev/null +++ b/llvm/test/tools/llvm-objcopy/MachO/localize-symbol.test @@ -0,0 +1,131 @@ +# RUN: yaml2obj %s -o %t +# RUN: llvm-objcopy --wildcard --localize-symbol="*" %t %t.copy +# RUN: llvm-readobj --symbols %t.copy | FileCheck %s + +# RUN: echo "*" > %t-star.txt +# RUN: llvm-objcopy --wildcard --localize-symbols="%t-star.txt" %t %t.copy +# RUN: llvm-readobj --symbols %t.copy | FileCheck %s + +# CHECK: Symbols [ +# CHECK-NEXT: Symbol { +# CHECK-NEXT: Name: _PrivateSymbol +# CHECK-NEXT: Type: Section (0xE) +# CHECK-NEXT: Section: __text (0x1) +# CHECK-NEXT: RefType: UndefinedNonLazy (0x0) +# CHECK-NEXT: Flags [ (0x0) +# CHECK-NEXT: ] +# CHECK-NEXT: Value: 0x1 +# CHECK-NEXT: } +# CHECK-NEXT: Symbol { +# CHECK-NEXT: Name: _PrivateExternalSymbol +# CHECK-NEXT: PrivateExtern +# CHECK-NEXT: Type: Section (0xE) +# CHECK-NEXT: Section: __text (0x1) +# CHECK-NEXT: RefType: UndefinedNonLazy (0x0) +# CHECK-NEXT: Flags [ (0x0) +# CHECK-NEXT: ] +# CHECK-NEXT: Value: 0x2 +# CHECK-NEXT: } +# CHECK-NEXT: Symbol { +# CHECK-NEXT: Name: _CommonSymbol +# CHECK-NEXT: Type: Section (0xE) +# CHECK-NEXT: Section: __text (0x1) +# CHECK-NEXT: RefType: UndefinedNonLazy (0x0) +# CHECK-NEXT: Flags [ (0x0) +# CHECK-NEXT: ] +# CHECK-NEXT: Value: 0x3 +# CHECK-NEXT: } +# CHECK-NEXT: Symbol { +# CHECK-NEXT: Name: _UndefinedExternalSymbol +# CHECK-NEXT: Extern +# CHECK-NEXT: Type: Undef (0x0) +# CHECK-NEXT: Section: (0x0) +# CHECK-NEXT: RefType: UndefinedNonLazy (0x0) +# CHECK-NEXT: Flags [ (0x0) +# CHECK-NEXT: ] +# CHECK-NEXT: Value: 0x0 +# CHECK-NEXT: } +# CHECK-NEXT: ] + +--- !mach-o +FileHeader: + magic: 0xFEEDFACF + cputype: 0x100000C + cpusubtype: 0x0 + filetype: 0x2 + ncmds: 3 + sizeofcmds: 328 + flags: 0x200085 + reserved: 0x0 +LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 152 + segname: __TEXT + 
vmaddr: 4294967296 + vmsize: 4096 + fileoff: 0 + filesize: 4096 + maxprot: 5 + initprot: 5 + nsects: 1 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0x100000FF8 + size: 8 + offset: 0xFF8 + align: 2 + reloff: 0x0 + nreloc: 0 + flags: 0x80000400 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 00008052C0035FD6 + - cmd: LC_SEGMENT_64 + cmdsize: 72 + segname: __LINKEDIT + vmaddr: 4294971392 + vmsize: 4096 + fileoff: 4096 + filesize: 67 + maxprot: 1 + initprot: 1 + nsects: 0 + flags: 0 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 4096 + nsyms: 4 + stroff: 4164 + strsize: 79 +LinkEditData: + NameList: + - n_strx: 2 + n_type: 0x0E + n_sect: 1 + n_desc: 0 + n_value: 1 + - n_strx: 17 + n_type: 0x1E + n_sect: 1 + n_desc: 0 + n_value: 2 + - n_strx: 40 + n_type: 0x0F + n_sect: 1 + n_desc: 0 + n_value: 3 + - n_strx: 54 + n_type: 0x01 + n_sect: 0 + n_desc: 0 + n_value: 0 + StringTable: + - ' ' + - _PrivateSymbol + - _PrivateExternalSymbol + - _CommonSymbol + - _UndefinedExternalSymbol +... 
diff --git a/llvm/test/tools/llvm-objcopy/MachO/skip-symbol.test b/llvm/test/tools/llvm-objcopy/MachO/skip-symbol.test new file mode 100644 index 0000000000000..0991fb3c74cd5 --- /dev/null +++ b/llvm/test/tools/llvm-objcopy/MachO/skip-symbol.test @@ -0,0 +1,148 @@ +# RUN: yaml2obj %s -o %t +# RUN: llvm-objcopy --wildcard --localize-symbol="*" --skip-symbol _CommonSymbol %t %t.copy +# RUN: llvm-readobj --symbols %t.copy | FileCheck %s + +# RUN: echo "*" > %t-star.txt +# RUN: echo _CommonSymbol > %t-sym-list.txt +# RUN: llvm-objcopy --wildcard --localize-symbols="%t-star.txt" --skip-symbols="%t-sym-list.txt" %t %t.copy +# RUN: llvm-readobj --symbols %t.copy | FileCheck %s + +# CHECK: Symbols [ +# CHECK-NEXT: Symbol { +# CHECK-NEXT: Name: _PrivateSymbol +# CHECK-NEXT: Type: Section (0xE) +# CHECK-NEXT: Section: __text (0x1) +# CHECK-NEXT: RefType: UndefinedNonLazy (0x0) +# CHECK-NEXT: Flags [ (0x0) +# CHECK-NEXT: ] +# CHECK-NEXT: Value: 0x1 +# CHECK-NEXT: } +# CHECK-NEXT: Symbol { +# CHECK-NEXT: Name: _PrivateExternalSymbol +# CHECK-NEXT: PrivateExtern +# CHECK-NEXT: Type: Section (0xE) +# CHECK-NEXT: Section: __text (0x1) +# CHECK-NEXT: RefType: UndefinedNonLazy (0x0) +# CHECK-NEXT: Flags [ (0x0) +# CHECK-NEXT: ] +# CHECK-NEXT: Value: 0x2 +# CHECK-NEXT: } +# CHECK-NEXT: Symbol { +# CHECK-NEXT: Name: _CommonSymbol2 +# CHECK-NEXT: Type: Section (0xE) +# CHECK-NEXT: Section: __text (0x1) +# CHECK-NEXT: RefType: UndefinedNonLazy (0x0) +# CHECK-NEXT: Flags [ (0x0) +# CHECK-NEXT: ] +# CHECK-NEXT: Value: 0x4 +# CHECK-NEXT: } +# CHECK-NEXT: Symbol { +# CHECK-NEXT: Name: _CommonSymbol +# CHECK-NEXT: Extern +# CHECK-NEXT: Type: Section (0xE) +# CHECK-NEXT: Section: __text (0x1) +# CHECK-NEXT: RefType: UndefinedNonLazy (0x0) +# CHECK-NEXT: Flags [ (0x0) +# CHECK-NEXT: ] +# CHECK-NEXT: Value: 0x3 +# CHECK-NEXT: } +# CHECK-NEXT: Symbol { +# CHECK-NEXT: Name: _UndefinedExternalSymbol +# CHECK-NEXT: Extern +# CHECK-NEXT: Type: Undef (0x0) +# CHECK-NEXT: Section: (0x0) +# 
CHECK-NEXT: RefType: UndefinedNonLazy (0x0) +# CHECK-NEXT: Flags [ (0x0) +# CHECK-NEXT: ] +# CHECK-NEXT: Value: 0x0 +# CHECK-NEXT: } +# CHECK-NEXT: ] + +--- !mach-o +FileHeader: + magic: 0xFEEDFACF + cputype: 0x100000C + cpusubtype: 0x0 + filetype: 0x2 + ncmds: 3 + sizeofcmds: 328 + flags: 0x200085 + reserved: 0x0 +LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 152 + segname: __TEXT + vmaddr: 4294967296 + vmsize: 4096 + fileoff: 0 + filesize: 4096 + maxprot: 5 + initprot: 5 + nsects: 1 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0x100000FF8 + size: 8 + offset: 0xFF8 + align: 2 + reloff: 0x0 + nreloc: 0 + flags: 0x80000400 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 00008052C0035FD6 + - cmd: LC_SEGMENT_64 + cmdsize: 72 + segname: __LINKEDIT + vmaddr: 4294971392 + vmsize: 4096 + fileoff: 4096 + filesize: 94 + maxprot: 1 + initprot: 1 + nsects: 0 + flags: 0 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 4096 + nsyms: 5 + stroff: 4176 + strsize: 94 +LinkEditData: + NameList: + - n_strx: 2 + n_type: 0x0E + n_sect: 1 + n_desc: 0 + n_value: 1 + - n_strx: 17 + n_type: 0x1E + n_sect: 1 + n_desc: 0 + n_value: 2 + - n_strx: 40 + n_type: 0x0F + n_sect: 1 + n_desc: 0 + n_value: 3 + - n_strx: 54 + n_type: 0x0F + n_sect: 1 + n_desc: 0 + n_value: 4 + - n_strx: 69 + n_type: 0x01 + n_sect: 0 + n_desc: 0 + n_value: 0 + StringTable: + - ' ' + - _PrivateSymbol + - _PrivateExternalSymbol + - _CommonSymbol + - _CommonSymbol2 + - _UndefinedExternalSymbol +... 
From ffc7380ff4808fcc21350a39caf7f34073b41697 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Tue, 24 Dec 2024 16:18:25 +0100 Subject: [PATCH 017/567] [libc++] Avoid including shared_ptr.h in basic_ostream.h (#121049) --- libcxx/include/__fwd/memory.h | 3 +++ libcxx/include/__ostream/basic_ostream.h | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/libcxx/include/__fwd/memory.h b/libcxx/include/__fwd/memory.h index b9e151855ad7d..564000997dec6 100644 --- a/libcxx/include/__fwd/memory.h +++ b/libcxx/include/__fwd/memory.h @@ -20,6 +20,9 @@ _LIBCPP_BEGIN_NAMESPACE_STD template class _LIBCPP_TEMPLATE_VIS allocator; +template +class _LIBCPP_TEMPLATE_VIS shared_ptr; + _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP___FWD_MEMORY_H diff --git a/libcxx/include/__ostream/basic_ostream.h b/libcxx/include/__ostream/basic_ostream.h index 6d24171bc0d6c..cf4d26167aebd 100644 --- a/libcxx/include/__ostream/basic_ostream.h +++ b/libcxx/include/__ostream/basic_ostream.h @@ -14,7 +14,7 @@ #if _LIBCPP_HAS_LOCALIZATION # include <__exception/operations.h> -# include <__memory/shared_ptr.h> +# include <__fwd/memory.h> # include <__memory/unique_ptr.h> # include <__new/exceptions.h> # include <__ostream/put_character_sequence.h> From 2ff614aaa6eb94bc5d02c8f0fb70a1132acb4423 Mon Sep 17 00:00:00 2001 From: Congcong Cai Date: Tue, 24 Dec 2024 23:24:14 +0800 Subject: [PATCH 018/567] [clang-tidy] support parameters file in command line (#120547) Fixes: #103499 --- .../clang-tidy/tool/ClangTidyMain.cpp | 22 +++++++++++++++++++ clang-tools-extra/docs/ReleaseNotes.rst | 2 ++ clang-tools-extra/docs/clang-tidy/index.rst | 11 ++++++++++ .../Inputs/param/parameters.txt | 2 ++ .../read-parameters-from-file-error.cpp | 3 +++ .../read-parameters-from-file.cpp | 5 +++++ 6 files changed, 45 insertions(+) create mode 100644 clang-tools-extra/test/clang-tidy/infrastructure/Inputs/param/parameters.txt create mode 100644 
clang-tools-extra/test/clang-tidy/infrastructure/read-parameters-from-file-error.cpp create mode 100644 clang-tools-extra/test/clang-tidy/infrastructure/read-parameters-from-file.cpp diff --git a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp index b8d843cba7133..3451e1f624257 100644 --- a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp +++ b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp @@ -20,12 +20,14 @@ #include "../GlobList.h" #include "clang/Tooling/CommonOptionsParser.h" #include "llvm/ADT/StringSet.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/InitLLVM.h" #include "llvm/Support/PluginLoader.h" #include "llvm/Support/Process.h" #include "llvm/Support/Signals.h" #include "llvm/Support/TargetSelect.h" #include "llvm/Support/WithColor.h" +#include "llvm/TargetParser/Host.h" #include using namespace clang::tooling; @@ -36,6 +38,11 @@ static cl::desc desc(StringRef description) { return {description.ltrim()}; } static cl::OptionCategory ClangTidyCategory("clang-tidy options"); static cl::extrahelp CommonHelp(CommonOptionsParser::HelpMessage); +static cl::extrahelp ClangTidyParameterFileHelp(R"( +Parameters files: + A large number of options or source files can be passed as parameter files + by use '@parameter-file' in the command line. +)"); static cl::extrahelp ClangTidyHelp(R"( Configuration files: clang-tidy attempts to read configuration for each source file from a @@ -571,6 +578,21 @@ static llvm::IntrusiveRefCntPtr createBaseFS() { int clangTidyMain(int argc, const char **argv) { llvm::InitLLVM X(argc, argv); + SmallVector Args{argv, argv + argc}; + + // expand parameters file to argc and argv. + llvm::BumpPtrAllocator Alloc; + llvm::cl::TokenizerCallback Tokenizer = + llvm::Triple(llvm::sys::getProcessTriple()).isOSWindows() + ? 
llvm::cl::TokenizeWindowsCommandLine + : llvm::cl::TokenizeGNUCommandLine; + llvm::cl::ExpansionContext ECtx(Alloc, Tokenizer); + if (llvm::Error Err = ECtx.expandResponseFiles(Args)) { + llvm::WithColor::error() << llvm::toString(std::move(Err)) << "\n"; + return 1; + } + argc = static_cast(Args.size()); + argv = Args.data(); // Enable help for -load option, if plugins are enabled. if (cl::Option *LoadOpt = cl::getRegisteredOptions().lookup("load")) diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index fa3a8e577a33a..fabd0cc78ac64 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -115,6 +115,8 @@ Improvements to clang-tidy - Improved :program:`run-clang-tidy.py` script. Fixed minor shutdown noise happening on certain platforms when interrupting the script. +- Improved :program:`clang-tidy` by accepting parameters file in command line. + - Removed :program:`clang-tidy`'s global options for most of checks. All options are changed to local options except `IncludeStyle`, `StrictMode` and `IgnoreMacros`. diff --git a/clang-tools-extra/docs/clang-tidy/index.rst b/clang-tools-extra/docs/clang-tidy/index.rst index f053e57e8d4c8..8c79b4dc19393 100644 --- a/clang-tools-extra/docs/clang-tidy/index.rst +++ b/clang-tools-extra/docs/clang-tidy/index.rst @@ -33,6 +33,14 @@ compilation options on the command line after ``--``: $ clang-tidy test.cpp -- -Imy_project/include -DMY_DEFINES ... +If there are too many options or source files to specify on the command line, +you can store them in a parameter file, and use :program:`clang-tidy` with that +parameters file: + +.. code-block:: console + + $ clang-tidy @parameters_file + :program:`clang-tidy` has its own checks and can also run Clang Static Analyzer checks. 
Each check has a name and the checks to run can be chosen using the ``-checks=`` option, which specifies a comma-separated list of positive and @@ -264,6 +272,9 @@ An overview of all the command-line options: automatically removed, but the rest of a relative path must be a suffix of a path in the compile command database. + Parameters files: + A large number of options or source files can be passed as parameter files + by use '@parameter-file' in the command line. Configuration files: clang-tidy attempts to read configuration for each source file from a diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/param/parameters.txt b/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/param/parameters.txt new file mode 100644 index 0000000000000..a6d8fa7ee299f --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/param/parameters.txt @@ -0,0 +1,2 @@ +-checks='-*,llvm-namespace-comment' +--warnings-as-errors=llvm-namespace-comment diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/read-parameters-from-file-error.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/read-parameters-from-file-error.cpp new file mode 100644 index 0000000000000..183f44365137c --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/infrastructure/read-parameters-from-file-error.cpp @@ -0,0 +1,3 @@ +// RUN: echo @%t.param > %t.param && not clang-tidy %s @%t.param -- 2>&1 | FileCheck %s + +// CHECK: recursive expansion of diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/read-parameters-from-file.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/read-parameters-from-file.cpp new file mode 100644 index 0000000000000..9d8c40a2e7d41 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/infrastructure/read-parameters-from-file.cpp @@ -0,0 +1,5 @@ +// RUN: not clang-tidy %s @%S/Inputs/param/parameters.txt -- | FileCheck %s + +namespace i { +} +// CHECK: error: namespace 'i' not terminated with a closing comment 
[llvm-namespace-comment,-warnings-as-errors] From 6bafbc99b0df7d5554af63115d78d0d97065862a Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Tue, 24 Dec 2024 07:32:12 -0800 Subject: [PATCH 019/567] [SLP][NFC]Add a test with incorrect (more poisonous) reduction chain --- .../logical-ops-poisonous-repeated.ll | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 llvm/test/Transforms/SLPVectorizer/logical-ops-poisonous-repeated.ll diff --git a/llvm/test/Transforms/SLPVectorizer/logical-ops-poisonous-repeated.ll b/llvm/test/Transforms/SLPVectorizer/logical-ops-poisonous-repeated.ll new file mode 100644 index 0000000000000..6ef6490d9f830 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/logical-ops-poisonous-repeated.ll @@ -0,0 +1,32 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer < %s | FileCheck %s + +define i1 @test(<4 x i32> %x) { +; CHECK-LABEL: define i1 @test( +; CHECK-SAME: <4 x i32> [[X:%.*]]) { +; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X]], i32 0 +; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 -1 +; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2 +; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 +; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[X0]], 0 +; CHECK-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 0 +; CHECK-NEXT: [[C2:%.*]] = icmp sgt i32 [[X2]], 0 +; CHECK-NEXT: [[C3:%.*]] = icmp slt i32 [[X3]], 0 +; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[C3]], i1 [[C1]], i1 false +; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i1 [[TMP1]], i1 false +; CHECK-NEXT: ret i1 [[OP_RDX1]] +; + %x0 = extractelement <4 x i32> %x, i32 0 + %x1 = extractelement <4 x i32> %x, i32 -1 + %x2 = extractelement <4 x i32> %x, i32 2 + %x3 = extractelement <4 x i32> %x, i32 3 + %2 = icmp ugt i32 %x0, 0 + %c1 = icmp slt i32 %x1, 0 + %c2 = icmp sgt i32 %x2, 0 + %c3 = icmp slt i32 %x3, 0 + %s1 = select i1 %2, i1 
%c1, i1 false + %s2 = select i1 %s1, i1 %c3, i1 false + %s3 = select i1 %s2, i1 %c3, i1 false + ret i1 %s3 +} + From f0f8dab712967b8fca5fcca4d7338b1d25017634 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Tue, 24 Dec 2024 07:40:35 -0800 Subject: [PATCH 020/567] [SLP]Check if the first reduced value requires freeze/swap, if it may be too poisonous If several reduced values are combined and the first reduced value is just the original reduced value of the bool logical op, need to freeze it to prevent the propagation of the poison value. Fixes #114905 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 32 +++++++++++++------ .../logical-ops-poisonous-repeated.ll | 3 +- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index b5d68c075b986..2785d7fb36ffd 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -19679,20 +19679,35 @@ class HorizontalReduction { return cast(ScalarCond); }; + bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) { + return isBoolLogicOp(cast(V)); + }); // Return new VectorizedTree, based on previous value. auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) { if (VectorizedTree) { // Update the final value in the reduction. 
Builder.SetCurrentDebugLocation( cast(ReductionOps.front().front())->getDebugLoc()); - if ((isa(VectorizedTree) && !isa(Res)) || - (isGuaranteedNotToBePoison(Res) && - !isGuaranteedNotToBePoison(VectorizedTree))) { - auto It = ReducedValsToOps.find(Res); - if (It != ReducedValsToOps.end() && - any_of(It->getSecond(), - [](Instruction *I) { return isBoolLogicOp(I); })) + if (AnyBoolLogicOp) { + + if (auto It = ReducedValsToOps.find(VectorizedTree); + It == ReducedValsToOps.end() || + isGuaranteedNotToBePoison(VectorizedTree) || + any_of(It->getSecond(), [&](Instruction *I) { + return isBoolLogicOp(I) && + getRdxOperand(I, 0) == VectorizedTree; + })) { + ; + } else if (auto It = ReducedValsToOps.find(Res); + It == ReducedValsToOps.end() || + isGuaranteedNotToBePoison(Res) || + any_of(It->getSecond(), [&](Instruction *I) { + return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res; + })) { std::swap(VectorizedTree, Res); + } else { + VectorizedTree = Builder.CreateFreeze(VectorizedTree); + } } return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx", @@ -19701,9 +19716,6 @@ class HorizontalReduction { // Initialize the final value in the reduction. 
return Res; }; - bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) { - return isBoolLogicOp(cast(V)); - }); SmallDenseSet IgnoreList(ReductionOps.size() * ReductionOps.front().size()); for (ReductionOpsType &RdxOps : ReductionOps) diff --git a/llvm/test/Transforms/SLPVectorizer/logical-ops-poisonous-repeated.ll b/llvm/test/Transforms/SLPVectorizer/logical-ops-poisonous-repeated.ll index 6ef6490d9f830..101f66f331304 100644 --- a/llvm/test/Transforms/SLPVectorizer/logical-ops-poisonous-repeated.ll +++ b/llvm/test/Transforms/SLPVectorizer/logical-ops-poisonous-repeated.ll @@ -12,7 +12,8 @@ define i1 @test(<4 x i32> %x) { ; CHECK-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 0 ; CHECK-NEXT: [[C2:%.*]] = icmp sgt i32 [[X2]], 0 ; CHECK-NEXT: [[C3:%.*]] = icmp slt i32 [[X3]], 0 -; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[C3]], i1 [[C1]], i1 false +; CHECK-NEXT: [[TMP2:%.*]] = freeze i1 [[C3]] +; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP2]], i1 [[C1]], i1 false ; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i1 [[TMP1]], i1 false ; CHECK-NEXT: ret i1 [[OP_RDX1]] ; From 8dbb33762cfb8d8606d28a71293f437ddffee4af Mon Sep 17 00:00:00 2001 From: Balazs Benics Date: Tue, 24 Dec 2024 17:05:38 +0100 Subject: [PATCH 021/567] [analyzer] Simplify CallEvent castArgToParamTypeIfNeeded (#120981) I noticed recently that this code (that I wrote xD) uses the `getRuntimeDefinition()` which isn't quite necessary for the simple task this function was designed for. Why would it be better not using this API here? I'm experimenting with improving how virtual functions are inlined, where depending on our ability of deducing the dynamic type of the object we may end up with inaccurate type information. Such inaccuracy would mean that we may have multiple runtime definitions. After that, this code would become ambiguous. To resolve this, I decided to refactor this and use a simpler - but equivalent approach. 
--- clang/lib/StaticAnalyzer/Core/CallEvent.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/clang/lib/StaticAnalyzer/Core/CallEvent.cpp b/clang/lib/StaticAnalyzer/Core/CallEvent.cpp index 0fdef7487b981..bb4a39f68280c 100644 --- a/clang/lib/StaticAnalyzer/Core/CallEvent.cpp +++ b/clang/lib/StaticAnalyzer/Core/CallEvent.cpp @@ -435,27 +435,27 @@ static SVal processArgument(SVal Value, const Expr *ArgumentExpr, /// runtime definition don't match in terms of argument and parameter count. static SVal castArgToParamTypeIfNeeded(const CallEvent &Call, unsigned ArgIdx, SVal ArgVal, SValBuilder &SVB) { - const FunctionDecl *RTDecl = - Call.getRuntimeDefinition().getDecl()->getAsFunction(); const auto *CallExprDecl = dyn_cast_or_null(Call.getDecl()); - - if (!RTDecl || !CallExprDecl) + if (!CallExprDecl) return ArgVal; + const FunctionDecl *Definition = CallExprDecl; + Definition->hasBody(Definition); + // The function decl of the Call (in the AST) will not have any parameter // declarations, if it was 'only' declared without a prototype. However, the // engine will find the appropriate runtime definition - basically a // redeclaration, which has a function body (and a function prototype). - if (CallExprDecl->hasPrototype() || !RTDecl->hasPrototype()) + if (CallExprDecl->hasPrototype() || !Definition->hasPrototype()) return ArgVal; // Only do this cast if the number arguments at the callsite matches with // the parameters at the runtime definition. 
- if (Call.getNumArgs() != RTDecl->getNumParams()) + if (Call.getNumArgs() != Definition->getNumParams()) return UnknownVal(); const Expr *ArgExpr = Call.getArgExpr(ArgIdx); - const ParmVarDecl *Param = RTDecl->getParamDecl(ArgIdx); + const ParmVarDecl *Param = Definition->getParamDecl(ArgIdx); return SVB.evalCast(ArgVal, Param->getType(), ArgExpr->getType()); } From 0d6cb0ae9d4ff610f729d0fd1bbd27227e6628cf Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Tue, 24 Dec 2024 08:11:36 -0800 Subject: [PATCH 022/567] [SLP]Fix strict weak ordering criterion in comparators Fixes #121019 --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 2785d7fb36ffd..d3b52da380a9c 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5635,8 +5635,11 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { auto PHICompare = [&](unsigned I1, unsigned I2) { Value *V1 = TE.Scalars[I1]; Value *V2 = TE.Scalars[I2]; - if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0) || - isa(V1) || isa(V2)) + if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0)) + return false; + if (isa(V1)) + return true; + if (isa(V2)) return false; if (V1->getNumUses() < V2->getNumUses()) return true; @@ -21733,9 +21736,6 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { V2->getValueOperand()->getType()->getScalarSizeInBits()) return false; // UndefValues are compatible with all other values. 
- if (isa(V->getValueOperand()) || - isa(V2->getValueOperand())) - return false; if (auto *I1 = dyn_cast(V->getValueOperand())) if (auto *I2 = dyn_cast(V2->getValueOperand())) { DomTreeNodeBase *NodeI1 = @@ -21749,14 +21749,8 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { "Different nodes should have different DFS numbers"); if (NodeI1 != NodeI2) return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn(); - InstructionsState S = getSameOpcode({I1, I2}, *TLI); - if (S.getOpcode()) - return false; return I1->getOpcode() < I2->getOpcode(); } - if (isa(V->getValueOperand()) && - isa(V2->getValueOperand())) - return false; return V->getValueOperand()->getValueID() < V2->getValueOperand()->getValueID(); }; From 852feea820f3f8b2fc44c851cc3ce5fe9576fa64 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Tue, 24 Dec 2024 09:20:26 -0800 Subject: [PATCH 023/567] [SLP]Propagate AssumptionCache where possible --- .../Transforms/Vectorize/SLPVectorizer.cpp | 52 ++++++++++--------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index d3b52da380a9c..fd167b0036e9c 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -702,7 +702,8 @@ static SmallBitVector isUndefVector(const Value *V, /// TODO: Can we split off and reuse the shuffle mask detection from /// ShuffleVectorInst/getShuffleCost? 
static std::optional -isFixedVectorShuffle(ArrayRef VL, SmallVectorImpl &Mask) { +isFixedVectorShuffle(ArrayRef VL, SmallVectorImpl &Mask, + AssumptionCache *AC) { const auto *It = find_if(VL, IsaPred); if (It == VL.end()) return std::nullopt; @@ -719,14 +720,14 @@ isFixedVectorShuffle(ArrayRef VL, SmallVectorImpl &Mask) { Value *Vec1 = nullptr; Value *Vec2 = nullptr; - bool HasNonUndefVec = any_of(VL, [](Value *V) { + bool HasNonUndefVec = any_of(VL, [&](Value *V) { auto *EE = dyn_cast(V); if (!EE) return false; Value *Vec = EE->getVectorOperand(); if (isa(Vec)) return false; - return isGuaranteedNotToBePoison(Vec); + return isGuaranteedNotToBePoison(Vec, AC); }); enum ShuffleMode { Unknown, Select, Permute }; ShuffleMode CommonShuffleMode = Unknown; @@ -11875,7 +11876,7 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const { TE->Scalars.size() < Limit || ((TE->getOpcode() == Instruction::ExtractElement || all_of(TE->Scalars, IsaPred)) && - isFixedVectorShuffle(TE->Scalars, Mask)) || + isFixedVectorShuffle(TE->Scalars, Mask, AC)) || (TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()) || any_of(TE->Scalars, IsaPred)); }; @@ -12940,7 +12941,7 @@ BoUpSLP::tryToGatherSingleRegisterExtractElements( // Check that gather of extractelements can be represented as just a // shuffle of a single/two vectors the scalars are extracted from. std::optional Res = - isFixedVectorShuffle(GatheredExtracts, Mask); + isFixedVectorShuffle(GatheredExtracts, Mask, AC); if (!Res || all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) { // TODO: try to check other subsets if possible. // Restore the original VL if attempt was not successful. @@ -14828,7 +14829,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, // non-poisonous, or by freezing the incoming scalar value first. 
auto *It = find_if(Scalars, [this, E](Value *V) { return !isa(V) && - (getTreeEntry(V) || isGuaranteedNotToBePoison(V) || + (getTreeEntry(V) || isGuaranteedNotToBePoison(V, AC) || (E->UserTreeIndices.size() == 1 && any_of(V->uses(), [E](const Use &U) { // Check if the value already used in the same operation in @@ -14900,11 +14901,11 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, } if (Vec2) { IsUsedInExpr = false; - IsNonPoisoned &= - isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2); + IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) && + isGuaranteedNotToBePoison(Vec2, AC); ShuffleBuilder.add(Vec1, Vec2, ExtractMask); } else if (Vec1) { - bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1); + bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC); IsUsedInExpr &= FindReusedSplat( ExtractMask, cast(Vec1->getType())->getNumElements(), 0, @@ -14935,7 +14936,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, if (TEs.size() == 1) { bool IsNotPoisonedVec = TEs.front()->VectorizedValue - ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) + ? 
isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) : true; IsUsedInExpr &= FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I, @@ -14947,8 +14948,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask); if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue) IsNonPoisoned &= - isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) && - isGuaranteedNotToBePoison(TEs.back()->VectorizedValue); + isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) && + isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC); } } } @@ -15283,7 +15284,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { } if (!IsIdentity || NumElts != NumScalars) { Value *V2 = nullptr; - bool IsVNonPoisonous = isGuaranteedNotToBePoison(V) && !isConstant(V); + bool IsVNonPoisonous = + !isConstant(V) && isGuaranteedNotToBePoison(V, AC); SmallVector InsertMask(Mask); if (NumElts != NumScalars && Offset == 0) { // Follow all insert element instructions from the current buildvector @@ -19638,7 +19640,7 @@ class HorizontalReduction { /// Attempt to vectorize the tree found by matchAssociativeReduction. Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI, - const TargetLibraryInfo &TLI) { + const TargetLibraryInfo &TLI, AssumptionCache *AC) { const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 
3 : 4; constexpr unsigned RegMaxNumber = 4; constexpr unsigned RedValsMaxNumber = 128; @@ -19695,7 +19697,7 @@ class HorizontalReduction { if (auto It = ReducedValsToOps.find(VectorizedTree); It == ReducedValsToOps.end() || - isGuaranteedNotToBePoison(VectorizedTree) || + isGuaranteedNotToBePoison(VectorizedTree, AC) || any_of(It->getSecond(), [&](Instruction *I) { return isBoolLogicOp(I) && getRdxOperand(I, 0) == VectorizedTree; @@ -19703,7 +19705,7 @@ class HorizontalReduction { ; } else if (auto It = ReducedValsToOps.find(Res); It == ReducedValsToOps.end() || - isGuaranteedNotToBePoison(Res) || + isGuaranteedNotToBePoison(Res, AC) || any_of(It->getSecond(), [&](Instruction *I) { return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res; })) { @@ -19795,7 +19797,7 @@ class HorizontalReduction { TrackedToOrig.try_emplace(RdxVal, RV); } SmallVector Mask; - if (isFixedVectorShuffle(CommonCandidates, Mask)) { + if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) { ++I; Candidates.swap(CommonCandidates); ShuffledExtracts = true; @@ -20110,7 +20112,7 @@ class HorizontalReduction { // To prevent poison from leaking across what used to be sequential, // safe, scalar boolean logic operations, the reduction operand must be // frozen. - if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot)) + if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC)) VectorizedRoot = Builder.CreateFreeze(VectorizedRoot); // Emit code to correctly handle reused reduced values, if required. 
@@ -20217,13 +20219,13 @@ class HorizontalReduction { bool InitStep) { if (!AnyBoolLogicOp) return; - if (isBoolLogicOp(RedOp1) && - ((!InitStep && LHS == VectorizedTree) || - getRdxOperand(RedOp1, 0) == LHS || isGuaranteedNotToBePoison(LHS))) + if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) || + getRdxOperand(RedOp1, 0) == LHS || + isGuaranteedNotToBePoison(LHS, AC))) return; if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) || getRdxOperand(RedOp2, 0) == RHS || - isGuaranteedNotToBePoison(RHS))) { + isGuaranteedNotToBePoison(RHS, AC))) { std::swap(LHS, RHS); return; } @@ -20871,7 +20873,7 @@ bool SLPVectorizerPass::vectorizeHorReduction( HorizontalReduction HorRdx; if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI)) return nullptr; - return HorRdx.tryToReduce(R, *DL, TTI, *TLI); + return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC); }; auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) { if (TryOperandsAsNewSeeds && FutureSeed == Root) { @@ -20977,8 +20979,8 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, SmallVector BuildVectorOpds; SmallVector Mask; if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) || - (llvm::all_of(BuildVectorOpds, IsaPred) && - isFixedVectorShuffle(BuildVectorOpds, Mask))) + (all_of(BuildVectorOpds, IsaPred) && + isFixedVectorShuffle(BuildVectorOpds, Mask, AC))) return false; if (MaxVFOnly && BuildVectorInsts.size() == 2) { From 6e3631d0e3316394ff4eae2913013d323e685790 Mon Sep 17 00:00:00 2001 From: Kunwar Grover Date: Tue, 24 Dec 2024 18:01:41 +0000 Subject: [PATCH 024/567] [mlir][scf] Track replacements using a listener in TileAndFuse (#120999) This PR makes TileAndFuse explicitly track replacements using a listener instead of assuming that the results always come from the outer most tiling loop. 
scf::tileUsingInterface can introduce merge operations whose results are the actual replacements to use, instead of the outer most loop results. --- .../SCF/Transforms/TileUsingInterface.cpp | 80 ++++++++++++++----- 1 file changed, 59 insertions(+), 21 deletions(-) diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp index 90db42d479a19..2277989bf8411 100644 --- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp @@ -28,6 +28,7 @@ #include "mlir/Interfaces/TilingInterface.h" #include "mlir/Rewrite/FrozenRewritePatternSet.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/Debug.h" #include @@ -1467,6 +1468,47 @@ void SliceTrackingListener::notifyOperationReplaced(Operation *op, ValueRange replacement) { removeOp(op); } + +//===----------------------------------------------------------------------===// +// ReplacementListener +//===----------------------------------------------------------------------===// + +/// Listener that tracks updates replacements for values which can be mutated. +/// This listener runs on top of the existing listener for the rewriter, +/// to make sure external users can still run listeners. +class ReplacementListener : public RewriterBase::ForwardingListener { +public: + ReplacementListener(DenseMap &replacements, + OpBuilder::Listener *listener) + : ForwardingListener(listener), replacements(replacements) {} + + void updateReplacementValues(ValueRange origValues, + ValueRange replaceValues) { + // This can probably be written better, but just iterates over the map + // and the new replacements for now. 
+ for (auto &[key, val] : replacements) { + for (auto [orig, replace] : llvm::zip_equal(origValues, replaceValues)) { + if (val == orig) { + val = replace; + } + } + } + } + + void notifyOperationReplaced(Operation *op, Operation *newOp) override { + ForwardingListener::notifyOperationReplaced(op, newOp); + updateReplacementValues(op->getResults(), newOp->getResults()); + } + + void notifyOperationReplaced(Operation *op, ValueRange values) override { + ForwardingListener::notifyOperationReplaced(op, values); + updateReplacementValues(op->getResults(), values); + } + +private: + DenseMap &replacements; +}; + } // namespace /// Implementation of tile consumer and fuse producer greedily. @@ -1493,26 +1535,27 @@ mlir::scf::tileConsumerAndFuseProducersUsingSCF( for (auto *tiledOp : tilingResult->tiledOps) tiledAndFusedOps.insert(tiledOp); + DenseMap replacements; + for (auto [origVal, replacement] : llvm::zip_equal( + consumer->getResults(), tilingResult->mergeResult.replacements)) { + replacements[origVal] = replacement; + } + // If there are no loops generated, fusion is immaterial. auto &loops = tilingResult->loops; if (loops.empty()) { - DenseMap replacements; - for (auto [origVal, replacement] : llvm::zip_equal( - consumer->getResults(), tilingResult->mergeResult.replacements)) { - replacements[origVal] = replacement; - } return scf::SCFTileAndFuseResult{fusedProducers, tiledAndFusedOps, loops, replacements}; } - // To keep track of replacements for now just record the map from the - // original untiled value to the result number of the for loop. Since the - // loop gets potentially replaced during fusion, keeping the value directly - // wont work. - DenseMap origValToResultNumber; - for (auto [index, result] : llvm::enumerate(consumer->getResults())) { - origValToResultNumber[result] = index; - } + // Since the loop gets potentially replaced during fusion, we need to track + // the mutation of replacement values. 
To do this, we attach a listener to + // update the replacements as they happen. + OpBuilder::Listener *previousListener = rewriter.getListener(); + auto resetListener = + llvm::make_scope_exit([&]() { rewriter.setListener(previousListener); }); + ReplacementListener replaceListener(replacements, previousListener); + rewriter.setListener(&replaceListener); // 2. Typically, the operands of the tiled operation are slices of the // operands of the untiled operation. These are expressed in IR using @@ -1581,9 +1624,9 @@ mlir::scf::tileConsumerAndFuseProducersUsingSCF( worklistCandidates.append(newSlices.value()); for (auto [index, result] : llvm::enumerate(fusableProducerOp->getResults())) { - origValToResultNumber[result] = loops.front()->getNumResults() - - fusableProducerOp->getNumResults() + - index; + replacements[result] = loops.front()->getResult( + loops.front()->getNumResults() - + fusableProducerOp->getNumResults() + index); } } if (Operation *tiledAndFusedOp = @@ -1597,11 +1640,6 @@ mlir::scf::tileConsumerAndFuseProducersUsingSCF( } } - DenseMap replacements; - for (auto [origVal, resultNumber] : origValToResultNumber) { - replacements[origVal] = loops.front()->getResult(resultNumber); - } - return scf::SCFTileAndFuseResult{fusedProducers, tiledAndFusedOps, loops, replacements}; } From 2d038caeebc8c5e49915c0db7c7eb21116c71de2 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 24 Dec 2024 20:19:34 +0000 Subject: [PATCH 025/567] [VPlan] Remove stray space when printing VPWidenCastRecipe. printFlags() already takes care of printing a single space if there are no flags. Remove the extra space when printing a recipe without flags. 
--- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 2 +- .../AArch64/sve-inductions-unusual-types.ll | 2 +- .../AArch64/sve2-histcnt-vplan.ll | 2 +- .../AArch64/type-shrinkage-zext-costs.ll | 32 +++++++++---------- .../LoopVectorize/ARM/mve-icmpcost.ll | 30 ++++++++--------- .../LoopVectorize/X86/reduction-small-size.ll | 8 ++--- .../X86/uint64_to_fp64-cost-model.ll | 4 +-- .../first-order-recurrence-chains-vplan.ll | 8 ++--- 8 files changed, 44 insertions(+), 44 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index bbee9b0125206..36a5d3be113ba 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1642,7 +1642,7 @@ void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN-CAST "; printAsOperand(O, SlotTracker); - O << " = " << Instruction::getOpcodeName(Opcode) << " "; + O << " = " << Instruction::getOpcodeName(Opcode); printFlags(O); printOperands(O, SlotTracker); O << " to " << *getResultType(); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll index 5f09431b66d47..25d3b3fe3b837 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll @@ -72,7 +72,7 @@ for.end: ; preds = %for.body ; DEBUG: Cost of Invalid for VF vscale x 1: induction instruction %indvars.iv.next1295 = add i3 %indvars.iv1294, 1 ; DEBUG: Cost of Invalid for VF vscale x 1: induction instruction %indvars.iv1294 = phi i3 [ %indvars.iv.next1295, %for.body ], [ 0, %entry ] -; DEBUG: Cost of Invalid for VF vscale x 1: WIDEN-CAST ir<%zexti3> = zext ir<%indvars.iv1294> to i64 +; DEBUG: Cost of Invalid for VF vscale x 1: WIDEN-CAST ir<%zexti3> = zext ir<%indvars.iv1294> to i64 define 
void @induction_i3_zext(ptr %dst) #0 { ; CHECK-LABEL: define void @induction_i3_zext( diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll index 3ef99ff496a68..8037a3a0c0f84 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll @@ -75,7 +75,7 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-NEXT: CLONE [[GEP_IDX:.*]] = getelementptr inbounds ir<%indices>, [[STEPS]] ; CHECK-NEXT: [[VECP_IDX:vp.*]] = vector-pointer [[GEP_IDX]] ; CHECK-NEXT: WIDEN [[IDX:.*]] = load [[VECP_IDX]] -; CHECK-NEXT: WIDEN-CAST [[EXT_IDX:.*]] = zext [[IDX]] to i64 +; CHECK-NEXT: WIDEN-CAST [[EXT_IDX:.*]] = zext [[IDX]] to i64 ; CHECK-NEXT: WIDEN-GEP Inv[Var] [[GEP_BUCKET:.*]] = getelementptr inbounds ir<%buckets>, [[EXT_IDX]] ; CHECK-NEXT: WIDEN-HISTOGRAM buckets: [[GEP_BUCKET]], inc: ir<1> ; CHECK-NEXT: EMIT [[IV_NEXT]] = add nuw [[IV]], [[VFxUF]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll index bd2e5dcb3dba4..0bc3ea94a37e3 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll @@ -8,14 +8,14 @@ target triple = "aarch64-unknown-linux-gnu" define void @zext_i8_i16(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i32 %len) #0 { ; CHECK-COST-LABEL: LV: Checking a loop in 'zext_i8_i16' -; CHECK-COST: Cost of 1 for VF 2: WIDEN-CAST ir<%conv> = zext ir<%0> to i16 -; CHECK-COST: Cost of 1 for VF 4: WIDEN-CAST ir<%conv> = zext ir<%0> to i16 -; CHECK-COST: Cost of 1 for VF 8: WIDEN-CAST ir<%conv> = zext ir<%0> to i16 -; CHECK-COST: Cost of 2 for VF 16: WIDEN-CAST ir<%conv> = zext ir<%0> to i16 -; CHECK-COST: Cost of 1 for VF vscale x 1: WIDEN-CAST ir<%conv> = zext ir<%0> to i16 
-; CHECK-COST: Cost of 1 for VF vscale x 2: WIDEN-CAST ir<%conv> = zext ir<%0> to i16 -; CHECK-COST: Cost of 1 for VF vscale x 4: WIDEN-CAST ir<%conv> = zext ir<%0> to i16 -; CHECK-COST: Cost of 0 for VF vscale x 8: WIDEN-CAST ir<%conv> = zext ir<%0> to i16 +; CHECK-COST: Cost of 1 for VF 2: WIDEN-CAST ir<%conv> = zext ir<%0> to i16 +; CHECK-COST: Cost of 1 for VF 4: WIDEN-CAST ir<%conv> = zext ir<%0> to i16 +; CHECK-COST: Cost of 1 for VF 8: WIDEN-CAST ir<%conv> = zext ir<%0> to i16 +; CHECK-COST: Cost of 2 for VF 16: WIDEN-CAST ir<%conv> = zext ir<%0> to i16 +; CHECK-COST: Cost of 1 for VF vscale x 1: WIDEN-CAST ir<%conv> = zext ir<%0> to i16 +; CHECK-COST: Cost of 1 for VF vscale x 2: WIDEN-CAST ir<%conv> = zext ir<%0> to i16 +; CHECK-COST: Cost of 1 for VF vscale x 4: WIDEN-CAST ir<%conv> = zext ir<%0> to i16 +; CHECK-COST: Cost of 0 for VF vscale x 8: WIDEN-CAST ir<%conv> = zext ir<%0> to i16 ; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction: %conv = zext i8 %0 to i32 ; CHECK-LABEL: define void @zext_i8_i16 ; CHECK-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i32 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] { @@ -85,14 +85,14 @@ exit: ; preds = %for.body define void @sext_i8_i16(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i32 %len) #0 { ; CHECK-COST-LABEL: LV: Checking a loop in 'sext_i8_i16' -; CHECK-COST: Cost of 1 for VF 2: WIDEN-CAST ir<%conv> = sext ir<%0> to i16 -; CHECK-COST: Cost of 1 for VF 4: WIDEN-CAST ir<%conv> = sext ir<%0> to i16 -; CHECK-COST: Cost of 1 for VF 8: WIDEN-CAST ir<%conv> = sext ir<%0> to i16 -; CHECK-COST: Cost of 2 for VF 16: WIDEN-CAST ir<%conv> = sext ir<%0> to i16 -; CHECK-COST: Cost of 1 for VF vscale x 1: WIDEN-CAST ir<%conv> = sext ir<%0> to i16 -; CHECK-COST: Cost of 1 for VF vscale x 2: WIDEN-CAST ir<%conv> = sext ir<%0> to i16 -; CHECK-COST: Cost of 1 for VF vscale x 4: WIDEN-CAST ir<%conv> = sext ir<%0> to i16 -; CHECK-COST: Cost of 0 for VF vscale x 8: 
WIDEN-CAST ir<%conv> = sext ir<%0> to i16 +; CHECK-COST: Cost of 1 for VF 2: WIDEN-CAST ir<%conv> = sext ir<%0> to i16 +; CHECK-COST: Cost of 1 for VF 4: WIDEN-CAST ir<%conv> = sext ir<%0> to i16 +; CHECK-COST: Cost of 1 for VF 8: WIDEN-CAST ir<%conv> = sext ir<%0> to i16 +; CHECK-COST: Cost of 2 for VF 16: WIDEN-CAST ir<%conv> = sext ir<%0> to i16 +; CHECK-COST: Cost of 1 for VF vscale x 1: WIDEN-CAST ir<%conv> = sext ir<%0> to i16 +; CHECK-COST: Cost of 1 for VF vscale x 2: WIDEN-CAST ir<%conv> = sext ir<%0> to i16 +; CHECK-COST: Cost of 1 for VF vscale x 4: WIDEN-CAST ir<%conv> = sext ir<%0> to i16 +; CHECK-COST: Cost of 0 for VF vscale x 8: WIDEN-CAST ir<%conv> = sext ir<%0> to i16 ; CHECK-LABEL: define void @sext_i8_i16 ; CHECK-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i32 [[LEN:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll index 7b18e5cc1da7f..b22910316d7cb 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll @@ -27,7 +27,7 @@ target triple = "thumbv8.1m.main-arm-none-eabi" ; CHECK: Cost of 0 for VF 2: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<%3> ; CHECK: Cost of 0 for VF 2: vp<%4> = vector-pointer ir<%arrayidx> ; CHECK: Cost of 18 for VF 2: WIDEN ir<%1> = load vp<%4> -; CHECK: Cost of 4 for VF 2: WIDEN-CAST ir<%conv> = sext ir<%1> to i32 +; CHECK: Cost of 4 for VF 2: WIDEN-CAST ir<%conv> = sext ir<%1> to i32 ; CHECK: Cost of 20 for VF 2: WIDEN ir<%cmp2> = icmp sgt ir<%conv>, ir<%conv1> ; CHECK: Cost of 26 for VF 2: WIDEN ir<%conv6> = add ir<%1>, ir<%0> ; CHECK: Cost of 0 for VF 2: CLONE ir<%arrayidx7> = getelementptr ir<%d>, vp<%3> @@ -44,7 +44,7 @@ target triple = "thumbv8.1m.main-arm-none-eabi" ; CHECK: Cost of 0 for VF 4: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<%3> ; CHECK: Cost of 0 for VF 4: 
vp<%4> = vector-pointer ir<%arrayidx> ; CHECK: Cost of 2 for VF 4: WIDEN ir<%1> = load vp<%4> -; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv> = sext ir<%1> to i32 +; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv> = sext ir<%1> to i32 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%cmp2> = icmp sgt ir<%conv>, ir<%conv1> ; CHECK: Cost of 2 for VF 4: WIDEN ir<%conv6> = add ir<%1>, ir<%0> ; CHECK: Cost of 0 for VF 4: CLONE ir<%arrayidx7> = getelementptr ir<%d>, vp<%3> @@ -61,7 +61,7 @@ target triple = "thumbv8.1m.main-arm-none-eabi" ; CHECK: Cost of 0 for VF 8: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<%3> ; CHECK: Cost of 0 for VF 8: vp<%4> = vector-pointer ir<%arrayidx> ; CHECK: Cost of 2 for VF 8: WIDEN ir<%1> = load vp<%4> -; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv> = sext ir<%1> to i32 +; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv> = sext ir<%1> to i32 ; CHECK: Cost of 36 for VF 8: WIDEN ir<%cmp2> = icmp sgt ir<%conv>, ir<%conv1> ; CHECK: Cost of 2 for VF 8: WIDEN ir<%conv6> = add ir<%1>, ir<%0> ; CHECK: Cost of 0 for VF 8: CLONE ir<%arrayidx7> = getelementptr ir<%d>, vp<%3> @@ -144,15 +144,15 @@ for.inc: ; preds = %for.body, %if.then ; CHECK: Cost of 0 for VF 2: EMIT vp<%next.gep>.2 = ptradd ir<%pSrcB>, vp<[[STEPS3]]> ; CHECK: Cost of 0 for VF 2: vp<[[VEC_PTR:%.+]]> = vector-pointer vp<%next.gep> ; CHECK: Cost of 18 for VF 2: WIDEN ir<%0> = load vp<[[VEC_PTR]]> -; CHECK: Cost of 4 for VF 2: WIDEN-CAST ir<%conv1> = sext ir<%0> to i32 +; CHECK: Cost of 4 for VF 2: WIDEN-CAST ir<%conv1> = sext ir<%0> to i32 ; CHECK: Cost of 0 for VF 2: vp<[[VEC_PTR2:%.+]]> = vector-pointer vp<%next.gep>.2 ; CHECK: Cost of 18 for VF 2: WIDEN ir<%1> = load vp<[[VEC_PTR2]]> -; CHECK: Cost of 4 for VF 2: WIDEN-CAST ir<%conv3> = sext ir<%1> to i32 +; CHECK: Cost of 4 for VF 2: WIDEN-CAST ir<%conv3> = sext ir<%1> to i32 ; CHECK: Cost of 26 for VF 2: WIDEN ir<%mul> = mul nsw ir<%conv3>, ir<%conv1> ; CHECK: Cost of 18 for VF 2: WIDEN ir<%shr> = ashr ir<%mul>, ir<7> ; 
CHECK: Cost of 0 for VF 2: WIDEN ir<%2> = icmp slt ir<%shr>, ir<127> ; CHECK: Cost of 22 for VF 2: WIDEN-SELECT ir<%spec.select.i> = select ir<%2>, ir<%shr>, ir<127> -; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv4> = trunc ir<%spec.select.i> to i8 +; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv4> = trunc ir<%spec.select.i> to i8 ; CHECK: Cost of 0 for VF 2: vp<[[VEC_PTR3:%.+]]> = vector-pointer vp<%next.gep>.1 ; CHECK: Cost of 18 for VF 2: WIDEN store vp<[[VEC_PTR3]]>, ir<%conv4> ; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<%0> @@ -176,15 +176,15 @@ for.inc: ; preds = %for.body, %if.then ; CHECK: Cost of 0 for VF 4: EMIT vp<%next.gep>.2 = ptradd ir<%pSrcB>, vp<[[STEPS3]]> ; CHECK: Cost of 0 for VF 4: vp<[[VEC_PTR1:%.+]]> = vector-pointer vp<%next.gep> ; CHECK: Cost of 2 for VF 4: WIDEN ir<%0> = load vp<[[VEC_PTR1]]> -; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv1> = sext ir<%0> to i32 +; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv1> = sext ir<%0> to i32 ; CHECK: Cost of 0 for VF 4: vp<[[VEC_PTR2:%.+]]> = vector-pointer vp<%next.gep>.2 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%1> = load vp<[[VEC_PTR2]]> -; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv3> = sext ir<%1> to i32 +; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv3> = sext ir<%1> to i32 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%mul> = mul nsw ir<%conv3>, ir<%conv1> ; CHECK: Cost of 2 for VF 4: WIDEN ir<%shr> = ashr ir<%mul>, ir<7> ; CHECK: Cost of 0 for VF 4: WIDEN ir<%2> = icmp slt ir<%shr>, ir<127> ; CHECK: Cost of 2 for VF 4: WIDEN-SELECT ir<%spec.select.i> = select ir<%2>, ir<%shr>, ir<127> -; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv4> = trunc ir<%spec.select.i> to i8 +; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv4> = trunc ir<%spec.select.i> to i8 ; CHECK: Cost of 0 for VF 4: vp<[[VEC_PTR2:%.+]]> = vector-pointer vp<%next.gep>.1 ; CHECK: Cost of 2 for VF 4: WIDEN store vp<[[VEC_PTR2]]>, ir<%conv4> ; CHECK: Cost of 0 for VF 4: EMIT vp<%index.next> = add 
nuw vp<[[CAN_IV]]>, vp<%0> @@ -208,15 +208,15 @@ for.inc: ; preds = %for.body, %if.then ; CHECK: Cost of 0 for VF 8: EMIT vp<%next.gep>.2 = ptradd ir<%pSrcB>, vp<[[STEPS3]]> ; CHECK: Cost of 0 for VF 8: vp<[[VEC_PTR1:%.+]]> = vector-pointer vp<%next.gep> ; CHECK: Cost of 2 for VF 8: WIDEN ir<%0> = load vp<[[VEC_PTR1]]> -; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv1> = sext ir<%0> to i32 +; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv1> = sext ir<%0> to i32 ; CHECK: Cost of 0 for VF 8: vp<[[VEC_PTR2:%.+]]> = vector-pointer vp<%next.gep>.2 ; CHECK: Cost of 2 for VF 8: WIDEN ir<%1> = load vp<[[VEC_PTR2]]> -; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv3> = sext ir<%1> to i32 +; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv3> = sext ir<%1> to i32 ; CHECK: Cost of 4 for VF 8: WIDEN ir<%mul> = mul nsw ir<%conv3>, ir<%conv1> ; CHECK: Cost of 4 for VF 8: WIDEN ir<%shr> = ashr ir<%mul>, ir<7> ; CHECK: Cost of 0 for VF 8: WIDEN ir<%2> = icmp slt ir<%shr>, ir<127> ; CHECK: Cost of 4 for VF 8: WIDEN-SELECT ir<%spec.select.i> = select ir<%2>, ir<%shr>, ir<127> -; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv4> = trunc ir<%spec.select.i> to i8 +; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv4> = trunc ir<%spec.select.i> to i8 ; CHECK: Cost of 0 for VF 8: vp<[[VEC_PTR3:%.+]]> = vector-pointer vp<%next.gep>.1 ; CHECK: Cost of 2 for VF 8: WIDEN store vp<[[VEC_PTR3]]>, ir<%conv4> ; CHECK: Cost of 0 for VF 8: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<{{.+}} @@ -240,15 +240,15 @@ for.inc: ; preds = %for.body, %if.then ; CHECK: Cost of 0 for VF 16: EMIT vp<%next.gep>.2 = ptradd ir<%pSrcB>, vp<[[STEPS3]]> ; CHECK: Cost of 0 for VF 16: vp<[[VEC_PTR:%.+]]> = vector-pointer vp<%next.gep> ; CHECK: Cost of 2 for VF 16: WIDEN ir<%0> = load vp<[[VEC_PTR]]> -; CHECK: Cost of 6 for VF 16: WIDEN-CAST ir<%conv1> = sext ir<%0> to i32 +; CHECK: Cost of 6 for VF 16: WIDEN-CAST ir<%conv1> = sext ir<%0> to i32 ; CHECK: Cost of 0 for VF 16: vp<[[VEC_PTR1:%.+]]> = vector-pointer 
vp<%next.gep>.2 ; CHECK: Cost of 2 for VF 16: WIDEN ir<%1> = load vp<[[VEC_PTR1]]> -; CHECK: Cost of 6 for VF 16: WIDEN-CAST ir<%conv3> = sext ir<%1> to i32 +; CHECK: Cost of 6 for VF 16: WIDEN-CAST ir<%conv3> = sext ir<%1> to i32 ; CHECK: Cost of 8 for VF 16: WIDEN ir<%mul> = mul nsw ir<%conv3>, ir<%conv1> ; CHECK: Cost of 8 for VF 16: WIDEN ir<%shr> = ashr ir<%mul>, ir<7> ; CHECK: Cost of 0 for VF 16: WIDEN ir<%2> = icmp slt ir<%shr>, ir<127> ; CHECK: Cost of 8 for VF 16: WIDEN-SELECT ir<%spec.select.i> = select ir<%2>, ir<%shr>, ir<127> -; CHECK: Cost of 6 for VF 16: WIDEN-CAST ir<%conv4> = trunc ir<%spec.select.i> to i8 +; CHECK: Cost of 6 for VF 16: WIDEN-CAST ir<%conv4> = trunc ir<%spec.select.i> to i8 ; CHECK: Cost of 0 for VF 16: vp<[[VEC_PTR2:%.+]]> = vector-pointer vp<%next.gep>.1 ; CHECK: Cost of 2 for VF 16: WIDEN store vp<[[VEC_PTR2]]>, ir<%conv4> ; CHECK: Cost of 0 for VF 16: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<{{.+}}> diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll index 052a963f5458b..28f8988bd853a 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll @@ -38,16 +38,16 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; CHECK: Cost of 0 for VF 2: CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<%4> ; CHECK: Cost of 0 for VF 2: vp<%5> = vector-pointer ir<%arrayidx> ; CHECK: Cost of 1 for VF 2: WIDEN ir<%0> = load vp<%5> -; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv> = zext ir<%0> to i32 +; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv> = zext ir<%0> to i32 ; CHECK: Cost of 0 for VF 2: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%b>, vp<%4> ; CHECK: Cost of 0 for VF 2: vp<%6> = vector-pointer ir<%arrayidx2> ; CHECK: Cost of 1 for VF 2: WIDEN ir<%1> = load vp<%6> -; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv3> = zext ir<%1> 
to i32 +; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv3> = zext ir<%1> to i32 ; CHECK: Cost of 0 for VF 2: WIDEN ir<%conv4> = and ir<%sum.013>, ir<255> ; CHECK: Cost of 1 for VF 2: WIDEN ir<%add> = add ir<%conv>, ir<%conv4> ; CHECK: Cost of 1 for VF 2: WIDEN ir<%add5> = add ir<%add>, ir<%conv3> -; CHECK: Cost of 0 for VF 2: WIDEN-CAST vp<%7> = trunc ir<%add5> to i8 -; CHECK: Cost of 0 for VF 2: WIDEN-CAST vp<%8> = zext vp<%7> to i32 +; CHECK: Cost of 0 for VF 2: WIDEN-CAST vp<%7> = trunc ir<%add5> to i8 +; CHECK: Cost of 0 for VF 2: WIDEN-CAST vp<%8> = zext vp<%7> to i32 ; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<%3>, vp<%0> ; CHECK: Cost of 0 for VF 2: EMIT branch-on-count vp<%index.next>, vp<%1> ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll index b8dcfd31bbc4c..8661d86f554b8 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll @@ -6,8 +6,8 @@ target triple = "x86_64-apple-macosx10.8.0" ; CHECK: cost of 4 for VF 1 For instruction: %conv = uitofp i64 %tmp to double -; CHECK: Cost of 5 for VF 2: WIDEN-CAST ir<%conv> = uitofp ir<%tmp> to double -; CHECK: Cost of 10 for VF 4: WIDEN-CAST ir<%conv> = uitofp ir<%tmp> to double +; CHECK: Cost of 5 for VF 2: WIDEN-CAST ir<%conv> = uitofp ir<%tmp> to double +; CHECK: Cost of 10 for VF 4: WIDEN-CAST ir<%conv> = uitofp ir<%tmp> to double define void @uint64_to_double_cost(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) nounwind { entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll index 0eab97b0cc735..fe6121ca3d004 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll +++ 
b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll @@ -186,9 +186,9 @@ define i32 @test_chained_first_order_recurrences_4(ptr %base, i64 %x) { ; CHECK-NEXT: vp<[[SCALAR_STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK-NEXT: CLONE ir<%gep> = getelementptr ir<%base>, vp<[[SCALAR_STEPS]]> ; CHECK-NEXT: EMIT vp<[[SPLICE_X:%.]]> = first-order splice ir<%for.x>, ir<%for.x.next> -; CHECK-NEXT: WIDEN-CAST ir<%for.x.prev> = trunc vp<[[SPLICE_X]]> to i32 +; CHECK-NEXT: WIDEN-CAST ir<%for.x.prev> = trunc vp<[[SPLICE_X]]> to i32 ; CHECK-NEXT: EMIT vp<[[SPLICE_Y:%.+]]> = first-order splice ir<%for.y>, ir<%for.x.prev> -; CHECK-NEXT: WIDEN-CAST ir<%for.y.i64> = sext vp<[[SPLICE_Y]]> to i64 +; CHECK-NEXT: WIDEN-CAST ir<%for.y.i64> = sext vp<[[SPLICE_Y]]> to i64 ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep> ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%for.y.i64> ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> @@ -263,9 +263,9 @@ define i32 @test_chained_first_order_recurrences_5_hoist_to_load(ptr %base) { ; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]> ; CHECK-NEXT: WIDEN ir<%for.x.next> = mul ir<%l>, ir<2> ; CHECK-NEXT: EMIT vp<[[SPLICE_X:%.]]> = first-order splice ir<%for.x>, ir<%for.x.next> -; CHECK-NEXT: WIDEN-CAST ir<%for.x.prev> = trunc vp<[[SPLICE_X]]> to i32 +; CHECK-NEXT: WIDEN-CAST ir<%for.x.prev> = trunc vp<[[SPLICE_X]]> to i32 ; CHECK-NEXT: EMIT vp<[[SPLICE_Y:%.+]]> = first-order splice ir<%for.y>, ir<%for.x.prev> -; CHECK-NEXT: WIDEN-CAST ir<%for.y.i64> = sext vp<[[SPLICE_Y]]> to i64 +; CHECK-NEXT: WIDEN-CAST ir<%for.y.i64> = sext vp<[[SPLICE_Y]]> to i64 ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep> ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%for.y.i64> ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> From 07d284d4ebffd58d4b2934769b4e11fedd0b106e Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Tue, 24 Dec 2024 15:35:29 -0500 Subject: 
[PATCH 026/567] [SLP]Add cost estimation for gather node reshuffling Adds cost estimation for the variants of the permutations of the scalar values, used in gather nodes. Currently, SLP just unconditionally emits shuffles for the reused buildvectors, but in some cases it is better to leave them as buildvectors rather than shuffles, if the cost of such buildvectors is better. X86, AVX512, -O3+LTO Metric: size..text Program size..text results results0 diff test-suite :: External/SPEC/CINT2006/445.gobmk/445.gobmk.test 912998.00 913238.00 0.0% test-suite :: MultiSource/Benchmarks/MiBench/consumer-lame/consumer-lame.test 203070.00 203102.00 0.0% test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test 1396320.00 1396448.00 0.0% test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test 1396320.00 1396448.00 0.0% test-suite :: MultiSource/Benchmarks/Bullet/bullet.test 309790.00 309678.00 -0.0% test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 12477607.00 12470807.00 -0.1% CINT2006/445.gobmk - extra code vectorized MiBench/consumer-lame - small variations CFP2017speed/638.imagick_s CFP2017rate/538.imagick_r - extra vectorized code Benchmarks/Bullet - extra code vectorized CFP2017rate/526.blender_r - extra vector code RISC-V, sifive-p670, -O3+LTO CFP2006/433.milc - regressions, should be fixed by https://github.com/llvm/llvm-project/pull/115173 CFP2006/453.povray - extra vectorized code CFP2017rate/508.namd_r - better vector code CFP2017rate/510.parest_r - extra vectorized code SPEC/CFP2017rate - extra/better vector code CFP2017rate/526.blender_r - extra vectorized code CFP2017rate/538.imagick_r - extra vectorized code CINT2006/403.gcc - extra vectorized code CINT2006/445.gobmk - extra vectorized code CINT2006/464.h264ref - extra vectorized code CINT2006/483.xalancbmk - small variations CINT2017rate/525.x264_r - better vectorization Reviewers: RKSimon Reviewed By: RKSimon Pull Request:
https://github.com/llvm/llvm-project/pull/115201 --- llvm/include/llvm/CodeGen/BasicTTIImpl.h | 32 +- .../Transforms/Vectorize/SLPVectorizer.cpp | 149 +++++- .../CostModel/AMDGPU/shufflevector.ll | 16 +- .../CostModel/RISCV/shuffle-exact-vlen.ll | 18 +- .../X86/shuffle-single-src-latency.ll | 14 +- .../CostModel/X86/shuffle-splat-codesize.ll | 248 ++++------ .../CostModel/X86/shuffle-splat-latency.ll | 284 +++++------- .../X86/shuffle-splat-sizelatency.ll | 248 ++++------ .../Analysis/CostModel/X86/shuffle-splat.ll | 248 ++++------ .../AArch64/reused-scalar-repeated-in-node.ll | 8 +- .../AArch64/scalarization-overhead.ll | 56 +-- .../SLPVectorizer/RISCV/complex-loads.ll | 426 +++++++++--------- .../X86/scatter-vectorize-reorder.ll | 2 +- .../alternate-cmp-swapped-pred-parent.ll | 6 +- .../extract-many-users-buildvector.ll | 75 ++- .../full-overlap-non-schedulable.ll | 4 +- ...hered-consecutive-loads-different-types.ll | 10 +- .../SLPVectorizer/reorder-clustered-node.ll | 72 ++- .../resized-alt-shuffle-after-minbw.ll | 6 +- 19 files changed, 955 insertions(+), 967 deletions(-) diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index ed4541f66740e..c9f142d64ae9e 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -259,6 +259,33 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { return AddrExtractCost + MemoryOpCost + PackingCost + ConditionalCost; } + /// Checks if the provided mask \p is a splat mask, i.e. it contains only -1 + /// or same non -1 index value and this index value contained at least twice. + /// So, mask <0, -1,-1, -1> is not considered splat (it is just identity), + /// same for <-1, 0, -1, -1> (just a slide), while <2, -1, 2, -1> is a splat + /// with \p Index=2. + static bool isSplatMask(ArrayRef Mask, unsigned NumSrcElts, int &Index) { + // Check that the broadcast index meets at least twice. 
+ bool IsCompared = false; + if (int SplatIdx = PoisonMaskElem; + all_of(enumerate(Mask), [&](const auto &P) { + if (P.value() == PoisonMaskElem) + return P.index() != Mask.size() - 1 || IsCompared; + if (static_cast(P.value()) >= NumSrcElts * 2) + return false; + if (SplatIdx == PoisonMaskElem) { + SplatIdx = P.value(); + return P.index() != Mask.size() - 1; + } + IsCompared = true; + return SplatIdx == P.value(); + })) { + Index = SplatIdx; + return true; + } + return false; + } + protected: explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL) : BaseT(DL) {} @@ -1014,17 +1041,20 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { return Kind; int NumSrcElts = Ty->getElementCount().getKnownMinValue(); switch (Kind) { - case TTI::SK_PermuteSingleSrc: + case TTI::SK_PermuteSingleSrc: { if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts)) return TTI::SK_Reverse; if (ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts)) return TTI::SK_Broadcast; + if (isSplatMask(Mask, NumSrcElts, Index)) + return TTI::SK_Broadcast; if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts, Index) && (Index + Mask.size()) <= (size_t)NumSrcElts) { SubTy = FixedVectorType::get(Ty->getElementType(), Mask.size()); return TTI::SK_ExtractSubvector; } break; + } case TTI::SK_PermuteTwoSrc: { int NumSubElts; if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask( diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index fd167b0036e9c..57f3016fbe1e0 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -13199,6 +13199,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( // No perfect match, just shuffle, so choose the first tree node from the // tree. Entries.push_back(FirstEntries.front()); + VF = FirstEntries.front()->getVectorFactor(); } else { // Try to find nodes with the same vector factor. 
assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries."); @@ -13239,6 +13240,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( Entries.push_back(SecondEntries.front()); VF = std::max(Entries.front()->getVectorFactor(), Entries.back()->getVectorFactor()); + } else { + VF = Entries.front()->getVectorFactor(); } } @@ -13350,17 +13353,141 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( : Entries[Pair.first]->findLaneForValue(VL[Pair.second])); IsIdentity &= Mask[Idx] == Pair.second; } - switch (Entries.size()) { - case 1: - if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2) - return TargetTransformInfo::SK_PermuteSingleSrc; - break; - case 2: - if (EntryLanes.size() > 2 || VL.size() <= 2) - return TargetTransformInfo::SK_PermuteTwoSrc; - break; - default: - break; + if (ForOrder || IsIdentity || Entries.empty()) { + switch (Entries.size()) { + case 1: + if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2) + return TargetTransformInfo::SK_PermuteSingleSrc; + break; + case 2: + if (EntryLanes.size() > 2 || VL.size() <= 2) + return TargetTransformInfo::SK_PermuteTwoSrc; + break; + default: + break; + } + } else if (!isa(VL.front()->getType()) && + (EntryLanes.size() > Entries.size() || VL.size() <= 2)) { + // Do the cost estimation if shuffle beneficial than buildvector. 
+ SmallVector SubMask(std::next(Mask.begin(), Part * VL.size()), + std::next(Mask.begin(), (Part + 1) * VL.size())); + int MinElement = SubMask.front(), MaxElement = SubMask.front(); + for (int Idx : SubMask) { + if (Idx == PoisonMaskElem) + continue; + if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF) + MinElement = Idx; + if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF) + MaxElement = Idx; + } + assert(MaxElement >= 0 && MinElement >= 0 && + MaxElement % VF >= MinElement % VF && + "Expected at least single element."); + unsigned NewVF = std::max( + VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(), + (MaxElement % VF) - + (MinElement % VF) + 1)); + if (NewVF < VF) { + for_each(SubMask, [&](int &Idx) { + if (Idx == PoisonMaskElem) + return; + Idx = (Idx % VF) - (MinElement % VF) + + (Idx >= static_cast(VF) ? NewVF : 0); + }); + VF = NewVF; + } + + constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + auto *VecTy = getWidenedType(VL.front()->getType(), VF); + auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size()); + auto GetShuffleCost = [&, + &TTI = *TTI](ArrayRef Mask, + ArrayRef Entries, + VectorType *VecTy) -> InstructionCost { + if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 && + ShuffleVectorInst::isDeInterleaveMaskOfFactor( + Mask, Entries.front()->getInterleaveFactor())) + return TTI::TCC_Free; + return ::getShuffleCost(TTI, + Entries.size() > 1 ? TTI::SK_PermuteTwoSrc + : TTI::SK_PermuteSingleSrc, + VecTy, Mask, CostKind); + }; + InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy); + InstructionCost FirstShuffleCost = 0; + SmallVector FirstMask(SubMask.begin(), SubMask.end()); + if (Entries.size() == 1 || !Entries[0]->isGather()) { + FirstShuffleCost = ShuffleCost; + } else { + // Transform mask to include only first entry. 
+ APInt DemandedElts = APInt::getAllOnes(SubMask.size()); + bool IsIdentity = true; + for (auto [I, Idx] : enumerate(FirstMask)) { + if (Idx >= static_cast(VF)) { + Idx = PoisonMaskElem; + } else { + DemandedElts.clearBit(I); + if (Idx != PoisonMaskElem) + IsIdentity &= static_cast(I) == Idx; + } + } + if (!IsIdentity) + FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy); + FirstShuffleCost += TTI->getScalarizationOverhead( + MaskVecTy, DemandedElts, /*Insert=*/true, + /*Extract=*/false, CostKind); + } + InstructionCost SecondShuffleCost = 0; + SmallVector SecondMask(SubMask.begin(), SubMask.end()); + if (Entries.size() == 1 || !Entries[1]->isGather()) { + SecondShuffleCost = ShuffleCost; + } else { + // Transform mask to include only first entry. + APInt DemandedElts = APInt::getAllOnes(SubMask.size()); + bool IsIdentity = true; + for (auto [I, Idx] : enumerate(SecondMask)) { + if (Idx < static_cast(VF) && Idx >= 0) { + Idx = PoisonMaskElem; + } else { + DemandedElts.clearBit(I); + if (Idx != PoisonMaskElem) { + Idx -= VF; + IsIdentity &= static_cast(I) == Idx; + } + } + } + if (!IsIdentity) + SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy); + SecondShuffleCost += TTI->getScalarizationOverhead( + MaskVecTy, DemandedElts, /*Insert=*/true, + /*Extract=*/false, CostKind); + } + APInt DemandedElts = APInt::getAllOnes(SubMask.size()); + for (auto [I, Idx] : enumerate(SubMask)) + if (Idx == PoisonMaskElem) + DemandedElts.clearBit(I); + InstructionCost BuildVectorCost = + TTI->getScalarizationOverhead(MaskVecTy, DemandedElts, /*Insert=*/true, + /*Extract=*/false, CostKind); + const TreeEntry *BestEntry = nullptr; + if (FirstShuffleCost < ShuffleCost) { + copy(FirstMask, std::next(Mask.begin(), Part * VL.size())); + BestEntry = Entries.front(); + ShuffleCost = FirstShuffleCost; + } + if (SecondShuffleCost < ShuffleCost) { + copy(SecondMask, std::next(Mask.begin(), Part * VL.size())); + BestEntry = Entries[1]; + ShuffleCost = 
SecondShuffleCost; + } + if (BuildVectorCost >= ShuffleCost) { + if (BestEntry) { + Entries.clear(); + Entries.push_back(BestEntry); + } + return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc + : TargetTransformInfo::SK_PermuteSingleSrc; + } } Entries.clear(); // Clear the corresponding mask elements. diff --git a/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll b/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll index a18156744a36b..7107d2be579c6 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll @@ -399,13 +399,13 @@ define amdgpu_kernel void @shufflevector_i8(<2 x i8> %vec1, <2 x i8> %vec2) { ; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf00 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> zeroinitializer ; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf10 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf02 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf20 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf22 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf03 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 4 for 
instruction: %shuf30 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf12 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf21 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf13 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> @@ -436,13 +436,13 @@ define amdgpu_kernel void @shufflevector_i8(<2 x i8> %vec1, <2 x i8> %vec2) { ; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf00_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> zeroinitializer ; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf10_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf02_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf20_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf22_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> ; ALL-NEXT: Cost 
Model: Found an estimated cost of 4 for instruction: %shuf03_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf30_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf12_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf21_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf13_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> @@ -476,13 +476,13 @@ define amdgpu_kernel void @shufflevector_i8(<2 x i8> %vec1, <2 x i8> %vec2) { ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf00 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> zeroinitializer ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf10 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf02 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf20 = 
shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf22 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf03 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf30 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf12 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf21 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf13 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> @@ -513,13 +513,13 @@ define amdgpu_kernel void @shufflevector_i8(<2 x i8> %vec1, <2 x i8> %vec2) { ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf00_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> zeroinitializer ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf10_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x 
i8> %vec2, <2 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf02_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf20_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf22_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf03_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf30_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf12_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf21_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf13_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-exact-vlen.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-exact-vlen.ll index 30bae7e926289..cada8ab240cc7 100644 --- a/llvm/test/Analysis/CostModel/RISCV/shuffle-exact-vlen.ll +++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-exact-vlen.ll @@ -734,7 +734,7 @@ define void @shuffle2() vscale_range(2,2) { define void @multipart() vscale_range(2,2) { ; RV32-LABEL: 'multipart' ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v16a = shufflevector <8 x i16> poison, <8 
x i16> poison, <8 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16b = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16b = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16c = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v16d = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32a = shufflevector <4 x i32> poison, <4 x i32> poison, <4 x i32> @@ -743,18 +743,18 @@ define void @multipart() vscale_range(2,2) { ; RV32-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v32many = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v32many2 = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v323 = shufflevector <3 x i32> poison, <3 x i32> poison, <3 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64a = shufflevector <2 x i64> poison, <2 x i64> poison, <2 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64a = shufflevector <2 x i64> poison, <2 x i64> poison, <2 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64b = shufflevector <2 x i64> poison, <2 x i64> poison, <2 x i32> zeroinitializer ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v64ab = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64d = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 
for instruction: %f64a = shufflevector <2 x double> poison, <2 x double> poison, <2 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64a = shufflevector <2 x double> poison, <2 x double> poison, <2 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64b = shufflevector <2 x double> poison, <2 x double> poison, <2 x i32> zeroinitializer ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f64ab = shufflevector <4 x double> poison, <4 x double> poison, <4 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64-LABEL: 'multipart' ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v16a = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16b = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16b = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16c = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v16d = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32a = shufflevector <4 x i32> poison, <4 x i32> poison, <4 x i32> @@ -763,18 +763,18 @@ define void @multipart() vscale_range(2,2) { ; RV64-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v32many = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v32many2 = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v323 = shufflevector <3 x i32> poison, <3 x i32> poison, <3 x i32> -; 
RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64a = shufflevector <2 x i64> poison, <2 x i64> poison, <2 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64a = shufflevector <2 x i64> poison, <2 x i64> poison, <2 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64b = shufflevector <2 x i64> poison, <2 x i64> poison, <2 x i32> zeroinitializer ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64ab = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v64d = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64a = shufflevector <2 x double> poison, <2 x double> poison, <2 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64a = shufflevector <2 x double> poison, <2 x double> poison, <2 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64b = shufflevector <2 x double> poison, <2 x double> poison, <2 x i32> zeroinitializer ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %f64ab = shufflevector <4 x double> poison, <4 x double> poison, <4 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-SIZE-LABEL: 'multipart' ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v16a = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> -; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16b = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16b = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16c = shufflevector <16 x i16> poison, <16 x i16> poison, 
<16 x i32> ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v16d = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32a = shufflevector <4 x i32> poison, <4 x i32> poison, <4 x i32> @@ -783,11 +783,11 @@ define void @multipart() vscale_range(2,2) { ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v32many = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v32many2 = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v323 = shufflevector <3 x i32> poison, <3 x i32> poison, <3 x i32> -; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64a = shufflevector <2 x i64> poison, <2 x i64> poison, <2 x i32> +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64a = shufflevector <2 x i64> poison, <2 x i64> poison, <2 x i32> ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64b = shufflevector <2 x i64> poison, <2 x i64> poison, <2 x i32> zeroinitializer ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64ab = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64d = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> -; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64a = shufflevector <2 x double> poison, <2 x double> poison, <2 x i32> +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64a = shufflevector <2 x double> poison, <2 x double> poison, <2 x i32> ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64b = shufflevector <2 x double> poison, <2 x 
double> poison, <2 x i32> zeroinitializer ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64ab = shufflevector <4 x double> poison, <4 x double> poison, <4 x i32> ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-single-src-latency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-single-src-latency.ll index 3fa54588625f1..330cbc07bf33a 100644 --- a/llvm/test/Analysis/CostModel/X86/shuffle-single-src-latency.ll +++ b/llvm/test/Analysis/CostModel/X86/shuffle-single-src-latency.ll @@ -47,7 +47,7 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512-LABEL: 'test_vXf64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> undef, <16 x i32> @@ -86,7 +86,7 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512-LABEL: 'test_vXi64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = 
shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void @@ -127,7 +127,7 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512-LABEL: 'test_vXf32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> @@ -174,7 +174,7 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256, ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512-LABEL: 'test_vXi32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> ; 
AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> @@ -336,7 +336,7 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512F-LABEL: 'test_vXi8' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> @@ -345,7 +345,7 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512BW-LABEL: 'test_vXi8' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> ; AVX512BW-NEXT: Cost Model: Found 
an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> @@ -354,7 +354,7 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512VBMI-LABEL: 'test_vXi8' -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> ; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> ; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> ; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-splat-codesize.ll b/llvm/test/Analysis/CostModel/X86/shuffle-splat-codesize.ll index 39c935fff6b76..0215f658cbe40 100644 --- a/llvm/test/Analysis/CostModel/X86/shuffle-splat-codesize.ll +++ b/llvm/test/Analysis/CostModel/X86/shuffle-splat-codesize.ll @@ -4,9 +4,9 @@ ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+sse4.2 | FileCheck %s -check-prefixes=SSE,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx | FileCheck %s -check-prefixes=AVX1 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx2 | FileCheck %s -check-prefixes=AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx512f | FileCheck %s 
--check-prefixes=AVX512,AVX512F -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX512,AVX512VBMI +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX512 ; ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=code-size -mcpu=slm | FileCheck %s --check-prefixes=SSE,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=code-size -mcpu=goldmont | FileCheck %s --check-prefixes=SSE,SSE42 @@ -19,20 +19,20 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> %src512) { ; SSE-LABEL: 'test_vXf64' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> +; SSE-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX1-LABEL: 'test_vXf64' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX2-LABEL: 'test_vXf64' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512-LABEL: 'test_vXf64' @@ -50,20 +50,20 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512) { ; SSE-LABEL: 'test_vXi64' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found 
an estimated cost of 2 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX1-LABEL: 'test_vXi64' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX2-LABEL: 'test_vXi64' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512-LABEL: 'test_vXi64' @@ -82,22 +82,22 @@ define 
void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr ; SSE-LABEL: 'test_vXf32' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX1-LABEL: 'test_vXf32' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: ret void ; ; AVX2-LABEL: 'test_vXf32' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512-LABEL: 'test_vXf32' @@ -118,22 +118,22 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256, ; SSE-LABEL: 'test_vXi32' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX1-LABEL: 'test_vXi32' ; AVX1-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX2-LABEL: 'test_vXi32' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512-LABEL: 'test_vXi32' @@ -154,58 +154,42 @@ define void @test_vXf16(<2 x half> %src32, <4 x half> %src64, <8 x half> %src128 ; SSE2-LABEL: 'test_vXf16' ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> ; 
SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SSSE3-LABEL: 'test_vXf16' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> +; SSSE3-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SSE42-LABEL: 'test_vXf16' ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX2-LABEL: 'test_vXf16' ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = 
shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; -; AVX512F-LABEL: 'test_vXf16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; AVX512BW-LABEL: 'test_vXf16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 
x half> %src512, <32 x half> undef, <32 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; AVX512VBMI-LABEL: 'test_vXf16' -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; AVX512-LABEL: 'test_vXf16' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> @@ -260,66 +244,50 @@ define 
void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, < ; SSE2-LABEL: 'test_vXi16' ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SSSE3-LABEL: 'test_vXi16' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x 
i16> %src512, <32 x i16> undef, <32 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SSE42-LABEL: 'test_vXi16' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX1-LABEL: 'test_vXi16' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> -; AVX1-NEXT: Cost Model: Found an 
estimated cost of 8 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX2-LABEL: 'test_vXi16' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; -; AVX512F-LABEL: 'test_vXi16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, 
<4 x i16> undef, <4 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; AVX512BW-LABEL: 'test_vXi16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; AVX512VBMI-LABEL: 'test_vXi16' -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, 
<16 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; AVX512-LABEL: 'test_vXi16' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> @@ -332,11 +300,11 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, < define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512) { ; SSE2-LABEL: 'test_vXi8' ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> 
undef, <16 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SSSE3-LABEL: 'test_vXi8' @@ -344,8 +312,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector 
<32 x i8> %src256, <32 x i8> undef, <32 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SSE42-LABEL: 'test_vXi8' @@ -353,8 +321,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX1-LABEL: 'test_vXi8' @@ -362,8 +330,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> 
-; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX2-LABEL: 'test_vXi8' @@ -371,36 +339,18 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; -; AVX512F-LABEL: 'test_vXi8' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> 
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; AVX512BW-LABEL: 'test_vXi8' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; AVX512VBMI-LABEL: 'test_vXi8' -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; AVX512-LABEL: 'test_vXi8' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> @@ -415,10 +365,10 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1> ; SSE2-LABEL: 'test_vXi1' ; SSE2-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 960 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SSSE3-LABEL: 'test_vXi1' @@ -426,8 +376,8 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> -; SSSE3-NEXT: Cost Model: Found 
an estimated cost of 160 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SSE42-LABEL: 'test_vXi1' @@ -435,8 +385,8 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX1-LABEL: 'test_vXi1' @@ -444,8 +394,8 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = 
shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX2-LABEL: 'test_vXi1' @@ -453,17 +403,17 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = 
shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512-LABEL: 'test_vXi1' -; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> @@ -483,22 
+433,22 @@ define void @test_upper_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a ; SSE-LABEL: 'test_upper_vXf32' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX1-LABEL: 'test_upper_vXf32' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: ret void ; ; AVX2-LABEL: 'test_upper_vXf32' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512-LABEL: 'test_upper_vXf32' diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-splat-latency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-splat-latency.ll index 2a89924dc7780..b20986ed6657d 100644 --- a/llvm/test/Analysis/CostModel/X86/shuffle-splat-latency.ll +++ b/llvm/test/Analysis/CostModel/X86/shuffle-splat-latency.ll @@ -4,9 +4,9 @@ ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=latency -mattr=+sse4.2 | FileCheck %s -check-prefixes=SSE,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=latency -mattr=+avx | FileCheck %s -check-prefixes=AVX1 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=latency -mattr=+avx2 | FileCheck %s -check-prefixes=AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=latency -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=latency -mattr=+avx512f,+avx512bw | FileCheck %s 
--check-prefixes=AVX512,AVX512BW -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=latency -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX512,AVX512VBMI +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=latency -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=latency -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=latency -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX512 ; ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=latency -mcpu=slm | FileCheck %s --check-prefixes=SSE,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=latency -mcpu=goldmont | FileCheck %s --check-prefixes=SSE,SSE42 @@ -19,26 +19,26 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> %src512) { ; SSE-LABEL: 'test_vXf64' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: ret void ; ; AVX1-LABEL: 'test_vXf64' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX2-LABEL: 'test_vXf64' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512-LABEL: 'test_vXf64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 
x double> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> @@ -50,26 +50,26 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512) { ; SSE-LABEL: 'test_vXi64' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX1-LABEL: 'test_vXi64' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = 
shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX2-LABEL: 'test_vXi64' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512-LABEL: 'test_vXi64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> @@ -82,29 +82,29 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr ; SSE-LABEL: 'test_vXf32' ; SSE-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX1-LABEL: 'test_vXf32' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX2-LABEL: 'test_vXf32' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = 
shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512-LABEL: 'test_vXf32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
ret void ; %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> @@ -118,29 +118,29 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256, ; SSE-LABEL: 'test_vXi32' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX1-LABEL: 'test_vXi32' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> ; 
AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX2-LABEL: 'test_vXi32' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512-LABEL: 'test_vXi32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = 
shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> @@ -154,58 +154,42 @@ define void @test_vXf16(<2 x half> %src32, <4 x half> %src64, <8 x half> %src128 ; SSE2-LABEL: 'test_vXf16' ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SSSE3-LABEL: 'test_vXf16' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x 
half> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SSE42-LABEL: 'test_vXf16' ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX2-LABEL: 'test_vXf16' ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> ; AVX2-NEXT: Cost Model: 
Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; -; AVX512F-LABEL: 'test_vXf16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; AVX512BW-LABEL: 'test_vXf16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; AVX512VBMI-LABEL: 'test_vXf16' -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; AVX512-LABEL: 'test_vXf16' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> @@ -260,66 +244,50 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, < ; SSE2-LABEL: 'test_vXi16' ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SSSE3-LABEL: 'test_vXi16' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> ; SSSE3-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SSE42-LABEL: 'test_vXi16' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret 
void ; ; AVX1-LABEL: 'test_vXi16' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX2-LABEL: 'test_vXi16' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> +; AVX2-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; -; AVX512F-LABEL: 'test_vXi16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; AVX512BW-LABEL: 'test_vXi16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; AVX512VBMI-LABEL: 'test_vXi16' -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = 
shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; AVX512-LABEL: 'test_vXi16' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> @@ -332,11 +300,11 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, < define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512) { ; SSE2-LABEL: 'test_vXi8' ; SSE2-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SSSE3-LABEL: 'test_vXi8' @@ -344,8 +312,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SSE42-LABEL: 'test_vXi8' @@ -353,8 +321,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX1-LABEL: 'test_vXi8' @@ -362,8 +330,8 @@ 
define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX2-LABEL: 'test_vXi8' @@ -371,36 +339,18 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> +; AVX2-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; -; AVX512F-LABEL: 'test_vXi8' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; AVX512BW-LABEL: 'test_vXi8' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> 
%src256, <32 x i8> undef, <32 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; AVX512VBMI-LABEL: 'test_vXi8' -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; AVX512-LABEL: 'test_vXi8' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> +; 
AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> @@ -415,10 +365,10 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1> ; SSE2-LABEL: 'test_vXi1' ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 960 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SSSE3-LABEL: 'test_vXi1' @@ -426,8 +376,8 @@ define void 
@test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SSE42-LABEL: 'test_vXi1' @@ -435,8 +385,8 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX1-LABEL: 'test_vXi1' @@ -444,8 +394,8 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX2-LABEL: 'test_vXi1' @@ -453,17 +403,17 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, 
<16 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512-LABEL: 'test_vXi1' -; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 17 for instruction: 
%V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> @@ -483,29 +433,29 @@ define void @test_upper_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a ; SSE-LABEL: 'test_upper_vXf32' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX1-LABEL: 'test_upper_vXf32' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> 
%b256, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX2-LABEL: 'test_upper_vXf32' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512-LABEL: 'test_upper_vXf32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> +; AVX512-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-splat-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-splat-sizelatency.ll index 848e7b4e611a7..56d8cadb18e48 100644 --- a/llvm/test/Analysis/CostModel/X86/shuffle-splat-sizelatency.ll +++ b/llvm/test/Analysis/CostModel/X86/shuffle-splat-sizelatency.ll @@ -4,9 +4,9 @@ ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mattr=+sse4.2 | FileCheck %s -check-prefixes=SSE,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx | FileCheck %s -check-prefixes=AVX1 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx2 | FileCheck %s -check-prefixes=AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=size-latency 
-mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX512,AVX512VBMI +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX512 ; ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mcpu=slm | FileCheck %s --check-prefixes=SSE,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mcpu=goldmont | FileCheck %s --check-prefixes=SSE,SSE42 @@ -19,20 +19,20 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> %src512) { ; SSE-LABEL: 'test_vXf64' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX1-LABEL: 'test_vXf64' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX2-LABEL: 'test_vXf64' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512-LABEL: 'test_vXf64' @@ -50,20 +50,20 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512) { ; SSE-LABEL: 'test_vXi64' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX1-LABEL: 'test_vXi64' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX2-LABEL: 'test_vXi64' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512-LABEL: 'test_vXi64' @@ -82,22 +82,22 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr ; SSE-LABEL: 'test_vXf32' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX1-LABEL: 'test_vXf32' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX2-LABEL: 'test_vXf32' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> 
%src128, <4 x float> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512-LABEL: 'test_vXf32' @@ -118,22 +118,22 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256, ; SSE-LABEL: 'test_vXi32' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX1-LABEL: 'test_vXi32' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for 
instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX2-LABEL: 'test_vXi32' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512-LABEL: 'test_vXi32' @@ -154,58 +154,42 @@ define void @test_vXf16(<2 x half> %src32, <4 x half> %src64, <8 x half> %src128 ; SSE2-LABEL: 'test_vXf16' ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> -; SSE2-NEXT: Cost 
Model: Found an estimated cost of 10 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SSSE3-LABEL: 'test_vXf16' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SSE42-LABEL: 'test_vXf16' ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for 
instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX2-LABEL: 'test_vXf16' ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = 
shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; -; AVX512F-LABEL: 'test_vXf16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; AVX512BW-LABEL: 'test_vXf16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; AVX512VBMI-LABEL: 'test_vXf16' -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 
x half> undef, <2 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; AVX512-LABEL: 'test_vXf16' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> @@ -260,66 +244,50 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, < ; SSE2-LABEL: 'test_vXi16' ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 
1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SSSE3-LABEL: 'test_vXi16' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x 
i16> %src512, <32 x i16> undef, <32 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SSE42-LABEL: 'test_vXi16' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX1-LABEL: 'test_vXi16' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> +; AVX1-NEXT: Cost Model: Found an 
estimated cost of 3 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX2-LABEL: 'test_vXi16' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; -; AVX512F-LABEL: 'test_vXi16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x i16> %src256, 
<16 x i16> undef, <16 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; AVX512BW-LABEL: 'test_vXi16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; AVX512VBMI-LABEL: 'test_vXi16' -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; AVX512-LABEL: 'test_vXi16' +; 
AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> @@ -332,11 +300,11 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, < define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512) { ; SSE2-LABEL: 'test_vXi8' ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x 
i8> undef, <64 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SSSE3-LABEL: 'test_vXi8' @@ -344,8 +312,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; 
SSE42-LABEL: 'test_vXi8' @@ -353,8 +321,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX1-LABEL: 'test_vXi8' @@ -362,8 +330,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, 
<64 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX2-LABEL: 'test_vXi8' @@ -371,36 +339,18 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; -; AVX512F-LABEL: 'test_vXi8' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 
x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; AVX512BW-LABEL: 'test_vXi8' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; AVX512VBMI-LABEL: 'test_vXi8' -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> -; AVX512VBMI-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; AVX512-LABEL: 'test_vXi8' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> @@ -415,10 +365,10 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1> ; SSE2-LABEL: 'test_vXi1' ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> -; SSE2-NEXT: Cost Model: 
Found an estimated cost of 5 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 960 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SSSE3-LABEL: 'test_vXi1' @@ -426,8 +376,8 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> +; SSSE3-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SSE42-LABEL: 'test_vXi1' @@ -435,8 +385,8 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX1-LABEL: 'test_vXi1' @@ -444,8 +394,8 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = 
shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX2-LABEL: 'test_vXi1' @@ -453,17 +403,17 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512-LABEL: 'test_vXi1' -; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = shufflevector <2 x 
i1> %src2, <2 x i1> undef, <2 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> @@ -483,22 +433,22 @@ define void @test_upper_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a ; SSE-LABEL: 'test_upper_vXf32' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> ; 
SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX1-LABEL: 'test_upper_vXf32' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX2-LABEL: 'test_upper_vXf32' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512-LABEL: 'test_upper_vXf32' diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-splat.ll b/llvm/test/Analysis/CostModel/X86/shuffle-splat.ll index 4c6d1ccd5ca34..56f56c3c2942a 100644 --- a/llvm/test/Analysis/CostModel/X86/shuffle-splat.ll +++ b/llvm/test/Analysis/CostModel/X86/shuffle-splat.ll @@ -4,9 +4,9 @@ ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s -check-prefixes=SSE,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mattr=+avx | FileCheck %s -check-prefixes=AVX1 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mattr=+avx2 | FileCheck %s -check-prefixes=AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX512,AVX512VBMI +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu 
-passes="print" 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX512 ; ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mcpu=slm | FileCheck %s --check-prefixes=SSE,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mcpu=goldmont | FileCheck %s --check-prefixes=SSE,SSE42 @@ -19,20 +19,20 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> %src512) { ; SSE-LABEL: 'test_vXf64' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX1-LABEL: 'test_vXf64' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX2-LABEL: 'test_vXf64' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'test_vXf64' @@ -50,20 +50,20 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512) { ; SSE-LABEL: 'test_vXi64' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX1-LABEL: 'test_vXi64' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX2-LABEL: 'test_vXi64' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'test_vXi64' @@ -82,22 +82,22 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr ; SSE-LABEL: 'test_vXf32' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> +; SSE-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX1-LABEL: 'test_vXf32' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX2-LABEL: 'test_vXf32' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = 
shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'test_vXf32' @@ -118,22 +118,22 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256, ; SSE-LABEL: 'test_vXi32' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX1-LABEL: 'test_vXi32' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost 
of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX2-LABEL: 'test_vXi32' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'test_vXi32' @@ -154,58 +154,42 @@ define void @test_vXf16(<2 x half> %src32, <4 x half> %src64, <8 x half> %src128 ; SSE2-LABEL: 'test_vXf16' ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> 
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSSE3-LABEL: 'test_vXf16' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSE42-LABEL: 'test_vXf16' ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX2-LABEL: 'test_vXf16' ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; -; AVX512F-LABEL: 'test_vXf16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> 
%src64, <4 x half> undef, <4 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; AVX512BW-LABEL: 'test_vXf16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; AVX512VBMI-LABEL: 'test_vXf16' -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x 
half> %src256, <16 x half> undef, <16 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; AVX512-LABEL: 'test_vXf16' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> @@ -260,66 +244,50 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, < ; SSE2-LABEL: 'test_vXi16' ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 
for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSSE3-LABEL: 'test_vXi16' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSE42-LABEL: 'test_vXi16' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x 
i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX1-LABEL: 'test_vXi16' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX2-LABEL: 'test_vXi16' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; -; AVX512F-LABEL: 'test_vXi16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; AVX512BW-LABEL: 'test_vXi16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 
x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; AVX512VBMI-LABEL: 'test_vXi16' -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; AVX512-LABEL: 'test_vXi16' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> +; AVX512-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> @@ -332,11 +300,11 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, < define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512) { ; SSE2-LABEL: 'test_vXi8' ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> 
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSSE3-LABEL: 'test_vXi8' @@ -344,8 +312,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSE42-LABEL: 'test_vXi8' @@ -353,8 +321,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> ; SSE42-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX1-LABEL: 'test_vXi8' @@ -362,8 +330,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX2-LABEL: 'test_vXi8' @@ -371,36 +339,18 @@ define void 
@test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; -; AVX512F-LABEL: 'test_vXi8' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x 
i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; AVX512BW-LABEL: 'test_vXi8' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; AVX512VBMI-LABEL: 'test_vXi8' -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> -; AVX512VBMI-NEXT: Cost Model: 
Found an estimated cost of 0 for instruction: ret void +; AVX512-LABEL: 'test_vXi8' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> @@ -415,10 +365,10 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1> ; SSE2-LABEL: 'test_vXi1' ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost 
of 960 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSSE3-LABEL: 'test_vXi1' @@ -426,8 +376,8 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSE42-LABEL: 'test_vXi1' @@ -435,8 +385,8 @@ define void @test_vXi1(<2 x i1> %src2, <4 x 
i1> %src4, <8 x i1> %src8, <16 x i1> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX1-LABEL: 'test_vXi1' @@ -444,8 +394,8 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = shufflevector <32 x 
i1> %src32, <32 x i1> undef, <32 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX2-LABEL: 'test_vXi1' @@ -453,17 +403,17 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'test_vXi1' -; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> 
undef, <16 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> @@ -483,22 +433,22 @@ define void @test_upper_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a ; SSE-LABEL: 'test_upper_vXf32' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> +; SSE-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX1-LABEL: 'test_upper_vXf32' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX2-LABEL: 'test_upper_vXf32' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = 
shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'test_upper_vXf32' diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll index 289807a808d5d..3cab4a4da3f8e 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll @@ -48,12 +48,12 @@ define void @test() { ; CHECK-NEXT: [[TMP15:%.*]] = phi <8 x float> [ [[TMP17]], %[[BB77]] ], [ [[TMP36:%.*]], %[[BB78]] ] ; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x float> [ [[TMP31]], %[[BB77]] ], [ [[TMP37:%.*]], %[[BB78]] ] ; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> ; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP20]], <16 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <16 x float> [[TMP21]], <16 x float> [[TMP20]], <16 x i32> ; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <16 x float> [[TMP21]], <16 x float> [[TMP22]], <16 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x float> [[TMP23]], <16 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP40:%.*]] = shufflevector <16 x float> [[TMP23]], <16 x float> [[TMP22]], <16 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x float> [[TMP40]], <16 x float> poison, <16 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = fmul fast <16 x float> [[TMP24]], [[TMP13]] ; 
CHECK-NEXT: [[TMP26:%.*]] = fmul fast <16 x float> [[TMP38]], [[TMP25]] ; CHECK-NEXT: [[TMP27:%.*]] = fadd fast <16 x float> [[TMP26]], [[TMP18]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll index 8093285ad8717..a504f3ed02014 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll @@ -8,56 +8,34 @@ define fastcc i64 @zot(float %arg, float %arg1, float %arg2, float %arg3, float %arg4, ptr %arg5, i1 %arg6, i1 %arg7, i1 %arg8) { ; CHECK-LABEL: @zot( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[VAL:%.*]] = fmul fast float 0.000000e+00, 0.000000e+00 ; CHECK-NEXT: [[VAL9:%.*]] = fmul fast float 0.000000e+00, [[ARG:%.*]] -; CHECK-NEXT: [[VAL10:%.*]] = fmul fast float [[ARG3:%.*]], 1.000000e+00 -; CHECK-NEXT: [[VAL11:%.*]] = fmul fast float [[ARG3]], 1.000000e+00 -; CHECK-NEXT: [[VAL12:%.*]] = fadd fast float [[ARG3]], 1.000000e+00 -; CHECK-NEXT: [[VAL13:%.*]] = fadd fast float [[VAL12]], 2.000000e+00 -; CHECK-NEXT: [[VAL14:%.*]] = fadd fast float 0.000000e+00, 0.000000e+00 -; CHECK-NEXT: [[VAL15:%.*]] = fadd fast float [[VAL14]], 1.000000e+00 -; CHECK-NEXT: [[VAL16:%.*]] = fadd fast float [[ARG3]], 1.000000e+00 -; CHECK-NEXT: [[VAL17:%.*]] = fadd fast float [[ARG3]], 1.000000e+00 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> , float [[ARG]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[ARG3:%.*]], i32 2 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> , [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> , float [[ARG3]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <2 x float> [[TMP4]], +; CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP2]], <2 x float> [[TMP5]], i64 0) +; 
CHECK-NEXT: [[TMP7:%.*]] = fadd fast <4 x float> [[TMP6]], ; CHECK-NEXT: br i1 [[ARG6:%.*]], label [[BB18:%.*]], label [[BB57:%.*]] ; CHECK: bb18: -; CHECK-NEXT: [[VAL19:%.*]] = phi float [ [[VAL13]], [[BB:%.*]] ] -; CHECK-NEXT: [[VAL20:%.*]] = phi float [ [[VAL15]], [[BB]] ] -; CHECK-NEXT: [[VAL21:%.*]] = phi float [ [[VAL16]], [[BB]] ] -; CHECK-NEXT: [[VAL22:%.*]] = phi float [ [[VAL17]], [[BB]] ] +; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x float> [ [[TMP7]], [[BB:%.*]] ] +; CHECK-NEXT: [[VAL16:%.*]] = extractelement <4 x float> [[TMP7]], i32 2 ; CHECK-NEXT: [[VAL23:%.*]] = fmul fast float [[VAL16]], 2.000000e+00 +; CHECK-NEXT: [[VAL17:%.*]] = extractelement <4 x float> [[TMP7]], i32 3 ; CHECK-NEXT: [[VAL24:%.*]] = fmul fast float [[VAL17]], 3.000000e+00 ; CHECK-NEXT: br i1 [[ARG7:%.*]], label [[BB25:%.*]], label [[BB57]] ; CHECK: bb25: -; CHECK-NEXT: [[VAL26:%.*]] = phi float [ [[VAL19]], [[BB18]] ] -; CHECK-NEXT: [[VAL27:%.*]] = phi float [ [[VAL20]], [[BB18]] ] -; CHECK-NEXT: [[VAL28:%.*]] = phi float [ [[VAL21]], [[BB18]] ] -; CHECK-NEXT: [[VAL29:%.*]] = phi float [ [[VAL22]], [[BB18]] ] +; CHECK-NEXT: [[TMP11:%.*]] = phi <4 x float> [ [[TMP8]], [[BB18]] ] ; CHECK-NEXT: br label [[BB30:%.*]] ; CHECK: bb30: ; CHECK-NEXT: [[VAL31:%.*]] = phi float [ [[VAL55:%.*]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ] ; CHECK-NEXT: [[VAL32:%.*]] = phi float [ [[VAL9]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ] -; CHECK-NEXT: [[VAL33:%.*]] = load i8, ptr [[ARG5:%.*]], align 1 -; CHECK-NEXT: [[VAL34:%.*]] = uitofp i8 [[VAL33]] to float -; CHECK-NEXT: [[VAL35:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 1 -; CHECK-NEXT: [[VAL36:%.*]] = load i8, ptr [[VAL35]], align 1 -; CHECK-NEXT: [[VAL37:%.*]] = uitofp i8 [[VAL36]] to float -; CHECK-NEXT: [[VAL38:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 2 -; CHECK-NEXT: [[VAL39:%.*]] = load i8, ptr [[VAL38]], align 1 -; CHECK-NEXT: [[VAL40:%.*]] = uitofp i8 [[VAL39]] to float -; CHECK-NEXT: [[VAL41:%.*]] = getelementptr inbounds i8, 
ptr [[ARG5]], i64 3 -; CHECK-NEXT: [[VAL42:%.*]] = load i8, ptr [[VAL41]], align 1 -; CHECK-NEXT: [[VAL43:%.*]] = uitofp i8 [[VAL42]] to float -; CHECK-NEXT: [[VAL44:%.*]] = fsub fast float [[VAL34]], [[VAL]] -; CHECK-NEXT: [[VAL45:%.*]] = fsub fast float [[VAL37]], [[VAL9]] -; CHECK-NEXT: [[VAL46:%.*]] = fsub fast float [[VAL40]], [[VAL10]] -; CHECK-NEXT: [[VAL47:%.*]] = fsub fast float [[VAL43]], [[VAL11]] -; CHECK-NEXT: [[VAL48:%.*]] = fmul fast float [[VAL44]], [[VAL26]] -; CHECK-NEXT: [[VAL49:%.*]] = fmul fast float [[VAL45]], [[VAL27]] -; CHECK-NEXT: [[VAL50:%.*]] = fadd fast float [[VAL49]], [[VAL48]] -; CHECK-NEXT: [[VAL51:%.*]] = fmul fast float [[VAL46]], [[VAL28]] -; CHECK-NEXT: [[VAL52:%.*]] = fadd fast float [[VAL50]], [[VAL51]] -; CHECK-NEXT: [[VAL53:%.*]] = fmul fast float [[VAL47]], [[VAL29]] -; CHECK-NEXT: [[VAL54:%.*]] = fadd fast float [[VAL52]], [[VAL53]] +; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ARG5:%.*]], align 1 +; CHECK-NEXT: [[TMP13:%.*]] = uitofp <4 x i8> [[TMP12]] to <4 x float> +; CHECK-NEXT: [[TMP14:%.*]] = fsub fast <4 x float> [[TMP13]], [[TMP3]] +; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <4 x float> [[TMP14]], [[TMP11]] +; CHECK-NEXT: [[VAL54:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP15]]) ; CHECK-NEXT: [[VAL55]] = tail call fast float @llvm.minnum.f32(float [[VAL31]], float [[ARG1:%.*]]) ; CHECK-NEXT: [[VAL56:%.*]] = tail call fast float @llvm.maxnum.f32(float [[ARG2:%.*]], float [[VAL54]]) ; CHECK-NEXT: call void @ham(float [[VAL55]], float [[VAL56]]) diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll index 912d60d0cc386..257e4660c80aa 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll @@ -30,11 +30,11 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: 
[[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4 ; CHECK-NEXT: [[ARRAYIDX8_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 1 ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1 -; CHECK-NEXT: [[TMP29:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1 +; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1 ; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> ; CHECK-NEXT: [[TMP21:%.*]] = zext <2 x i8> [[TMP19]] to <2 x i32> -; CHECK-NEXT: [[TMP84:%.*]] = zext i8 [[TMP29]] to i32 +; CHECK-NEXT: [[CONV_2:%.*]] = zext i8 [[TMP6]] to i32 ; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 ; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> ; CHECK-NEXT: [[TMP31:%.*]] = zext <2 x i8> [[TMP22]] to <2 x i32> @@ -50,7 +50,7 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[TMP30:%.*]] = add <2 x i32> [[TMP25]], [[TMP23]] ; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> ; CHECK-NEXT: [[TMP51:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32> -; CHECK-NEXT: [[TMP83:%.*]] = zext i8 [[TMP33]] to i32 +; CHECK-NEXT: [[CONV9_2:%.*]] = zext i8 [[TMP7]] to i32 ; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> ; CHECK-NEXT: [[TMP57:%.*]] = zext <2 x i8> [[TMP56]] to <2 x i32> ; CHECK-NEXT: [[TMP35:%.*]] = sub <2 x i32> [[TMP51]], [[TMP57]] @@ -61,14 +61,14 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[TMP36:%.*]] = sub <2 x i32> [[TMP39]], [[TMP61]] ; CHECK-NEXT: [[TMP37:%.*]] = shl <2 x i32> [[TMP36]], splat (i32 16) ; CHECK-NEXT: [[TMP42:%.*]] = add <2 x i32> [[TMP37]], [[TMP35]] -; CHECK-NEXT: [[TMP43:%.*]] = add <2 x i32> [[TMP42]], 
[[TMP30]] +; CHECK-NEXT: [[TMP34:%.*]] = add <2 x i32> [[TMP42]], [[TMP30]] ; CHECK-NEXT: [[TMP44:%.*]] = sub <2 x i32> [[TMP30]], [[TMP42]] -; CHECK-NEXT: [[TMP73:%.*]] = extractelement <2 x i32> [[TMP43]], i32 0 -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x i32> [[TMP43]], i32 1 -; CHECK-NEXT: [[ADD48_2:%.*]] = add i32 [[TMP34]], [[TMP73]] -; CHECK-NEXT: [[TMP47:%.*]] = extractelement <2 x i32> [[TMP44]], i32 0 -; CHECK-NEXT: [[TMP48:%.*]] = extractelement <2 x i32> [[TMP44]], i32 1 -; CHECK-NEXT: [[ADD55_2:%.*]] = add i32 [[TMP48]], [[TMP47]] +; CHECK-NEXT: [[TMP43:%.*]] = extractelement <2 x i32> [[TMP34]], i32 0 +; CHECK-NEXT: [[TMP45:%.*]] = extractelement <2 x i32> [[TMP34]], i32 1 +; CHECK-NEXT: [[ADD48_2:%.*]] = add i32 [[TMP45]], [[TMP43]] +; CHECK-NEXT: [[TMP46:%.*]] = extractelement <2 x i32> [[TMP44]], i32 0 +; CHECK-NEXT: [[TMP47:%.*]] = extractelement <2 x i32> [[TMP44]], i32 1 +; CHECK-NEXT: [[ADD55_2:%.*]] = add i32 [[TMP47]], [[TMP46]] ; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4 ; CHECK-NEXT: [[TMP53:%.*]] = load <2 x i8>, ptr null, align 1 ; CHECK-NEXT: [[TMP52:%.*]] = load i8, ptr null, align 1 @@ -79,12 +79,12 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[TMP59:%.*]] = sub <2 x i32> [[TMP62]], [[TMP55]] ; CHECK-NEXT: [[TMP41:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> splat (i1 true), i32 2) ; CHECK-NEXT: [[TMP58:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32> -; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <2 x i32> [[TMP58]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <2 x i32> [[TMP58]], <2 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP63:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_3]], align 1 ; CHECK-NEXT: [[TMP76:%.*]] = zext <2 x i8> [[TMP63]] to <2 x i32> -; CHECK-NEXT: [[TMP45:%.*]] = sub <2 x i32> [[TMP60]], [[TMP76]] -; CHECK-NEXT: [[TMP46:%.*]] = shl <2 x i32> [[TMP45]], 
splat (i32 16) -; CHECK-NEXT: [[TMP90:%.*]] = add <2 x i32> [[TMP46]], [[TMP59]] +; CHECK-NEXT: [[TMP81:%.*]] = sub <2 x i32> [[TMP48]], [[TMP76]] +; CHECK-NEXT: [[TMP167:%.*]] = shl <2 x i32> [[TMP81]], splat (i32 16) +; CHECK-NEXT: [[TMP75:%.*]] = add <2 x i32> [[TMP167]], [[TMP59]] ; CHECK-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2 ; CHECK-NEXT: [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2 ; CHECK-NEXT: [[ARRAYIDX27_3:%.*]] = getelementptr i8, ptr null, i64 6 @@ -93,236 +93,236 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[TMP82:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_3]], align 1 ; CHECK-NEXT: [[TMP91:%.*]] = zext <2 x i8> [[TMP82]] to <2 x i32> ; CHECK-NEXT: [[TMP65:%.*]] = sub <2 x i32> [[TMP79]], [[TMP91]] -; CHECK-NEXT: [[TMP75:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> zeroinitializer, i32 1, <2 x i1> splat (i1 true), <2 x i8> poison) -; CHECK-NEXT: [[TMP98:%.*]] = zext <2 x i8> [[TMP75]] to <2 x i32> -; CHECK-NEXT: [[TMP100:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_3]], align 1 -; CHECK-NEXT: [[TMP103:%.*]] = zext <2 x i8> [[TMP100]] to <2 x i32> -; CHECK-NEXT: [[TMP69:%.*]] = sub <2 x i32> [[TMP98]], [[TMP103]] -; CHECK-NEXT: [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], splat (i32 16) -; CHECK-NEXT: [[TMP74:%.*]] = add <2 x i32> [[TMP70]], [[TMP65]] -; CHECK-NEXT: [[TMP78:%.*]] = extractelement <2 x i32> [[TMP90]], i32 0 -; CHECK-NEXT: [[TMP71:%.*]] = extractelement <2 x i32> [[TMP90]], i32 1 -; CHECK-NEXT: [[ADD48_3:%.*]] = add i32 [[TMP71]], [[TMP78]] -; CHECK-NEXT: [[SUB51_3:%.*]] = sub i32 [[TMP78]], [[TMP71]] -; CHECK-NEXT: [[TMP80:%.*]] = extractelement <2 x i32> [[TMP74]], i32 0 -; CHECK-NEXT: [[TMP81:%.*]] = extractelement <2 x i32> [[TMP74]], i32 1 -; CHECK-NEXT: [[ADD55_3:%.*]] = add i32 [[TMP81]], [[TMP80]] -; CHECK-NEXT: [[TMP107:%.*]] = sub i32 [[TMP80]], [[TMP81]] -; CHECK-NEXT: [[ADD48_4:%.*]] = add i32 [[ADD55_3]], [[ADD48_3]] -; CHECK-NEXT: 
[[TMP113:%.*]] = shufflevector <2 x i32> [[TMP43]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP122:%.*]] = insertelement <2 x i32> [[TMP113]], i32 [[ADD48_3]], i32 0 -; CHECK-NEXT: [[TMP72:%.*]] = insertelement <2 x i32> [[TMP43]], i32 [[ADD55_3]], i32 0 -; CHECK-NEXT: [[TMP123:%.*]] = sub <2 x i32> [[TMP122]], [[TMP72]] -; CHECK-NEXT: [[ADD55_4:%.*]] = add i32 [[TMP107]], [[SUB51_3]] -; CHECK-NEXT: [[TMP126:%.*]] = shufflevector <2 x i32> [[TMP44]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP129:%.*]] = insertelement <2 x i32> [[TMP126]], i32 [[SUB51_3]], i32 0 -; CHECK-NEXT: [[TMP130:%.*]] = insertelement <2 x i32> [[TMP44]], i32 [[TMP107]], i32 0 -; CHECK-NEXT: [[TMP143:%.*]] = sub <2 x i32> [[TMP129]], [[TMP130]] -; CHECK-NEXT: [[ADD94:%.*]] = add i32 [[ADD48_4]], [[ADD48_2]] -; CHECK-NEXT: [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_4]] -; CHECK-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[TMP77]], 15 -; CHECK-NEXT: [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537 -; CHECK-NEXT: [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535 -; CHECK-NEXT: [[SHR_I_1:%.*]] = lshr i32 [[TMP34]], 15 +; CHECK-NEXT: [[TMP170:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> zeroinitializer, i32 1, <2 x i1> splat (i1 true), <2 x i8> poison) +; CHECK-NEXT: [[TMP171:%.*]] = zext <2 x i8> [[TMP170]] to <2 x i32> +; CHECK-NEXT: [[TMP172:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_3]], align 1 +; CHECK-NEXT: [[TMP173:%.*]] = zext <2 x i8> [[TMP172]] to <2 x i32> +; CHECK-NEXT: [[TMP66:%.*]] = sub <2 x i32> [[TMP171]], [[TMP173]] +; CHECK-NEXT: [[TMP67:%.*]] = shl <2 x i32> [[TMP66]], splat (i32 16) +; CHECK-NEXT: [[TMP69:%.*]] = add <2 x i32> [[TMP67]], [[TMP65]] +; CHECK-NEXT: [[TMP176:%.*]] = extractelement <2 x i32> [[TMP75]], i32 0 +; CHECK-NEXT: [[TMP197:%.*]] = extractelement <2 x i32> [[TMP75]], i32 1 +; CHECK-NEXT: [[SUB59:%.*]] = add i32 [[TMP197]], [[TMP176]] +; CHECK-NEXT: [[SUB45_3:%.*]] = sub i32 [[TMP176]], [[TMP197]] +; CHECK-NEXT: [[ADD112_2:%.*]] = 
extractelement <2 x i32> [[TMP69]], i32 0 +; CHECK-NEXT: [[XOR_I63_2:%.*]] = extractelement <2 x i32> [[TMP69]], i32 1 +; CHECK-NEXT: [[SUB59_1:%.*]] = add i32 [[XOR_I63_2]], [[ADD112_2]] +; CHECK-NEXT: [[SUB47_3:%.*]] = sub i32 [[ADD112_2]], [[XOR_I63_2]] +; CHECK-NEXT: [[ADD94:%.*]] = add i32 [[SUB59_1]], [[SUB59]] +; CHECK-NEXT: [[TMP70:%.*]] = shufflevector <2 x i32> [[TMP34]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP71:%.*]] = insertelement <2 x i32> [[TMP70]], i32 [[SUB59]], i32 0 +; CHECK-NEXT: [[TMP72:%.*]] = insertelement <2 x i32> [[TMP34]], i32 [[SUB59_1]], i32 0 +; CHECK-NEXT: [[TMP222:%.*]] = sub <2 x i32> [[TMP71]], [[TMP72]] +; CHECK-NEXT: [[ADD55_3:%.*]] = add i32 [[SUB47_3]], [[SUB45_3]] +; CHECK-NEXT: [[TMP74:%.*]] = shufflevector <2 x i32> [[TMP44]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP78:%.*]] = insertelement <2 x i32> [[TMP74]], i32 [[SUB45_3]], i32 0 +; CHECK-NEXT: [[TMP80:%.*]] = insertelement <2 x i32> [[TMP44]], i32 [[SUB47_3]], i32 0 +; CHECK-NEXT: [[TMP85:%.*]] = sub <2 x i32> [[TMP78]], [[TMP80]] +; CHECK-NEXT: [[ADD95:%.*]] = add i32 [[ADD94]], [[ADD48_2]] +; CHECK-NEXT: [[SUB86_3:%.*]] = sub i32 [[ADD48_2]], [[ADD94]] +; CHECK-NEXT: [[SHR_I:%.*]] = lshr i32 [[TMP77]], 15 +; CHECK-NEXT: [[AND_I:%.*]] = and i32 [[SHR_I]], 65537 +; CHECK-NEXT: [[MUL_I:%.*]] = mul i32 [[AND_I]], 65535 +; CHECK-NEXT: [[SHR_I49:%.*]] = lshr i32 [[TMP45]], 15 +; CHECK-NEXT: [[AND_I50:%.*]] = and i32 [[SHR_I49]], 65537 +; CHECK-NEXT: [[MUL_I51:%.*]] = mul i32 [[AND_I50]], 65535 +; CHECK-NEXT: [[ADD94_1:%.*]] = add i32 [[ADD55_3]], [[ADD55_2]] +; CHECK-NEXT: [[SUB102_1:%.*]] = sub i32 [[ADD55_2]], [[ADD55_3]] +; CHECK-NEXT: [[SHR_I_1:%.*]] = lshr i32 [[CONV9_2]], 15 ; CHECK-NEXT: [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537 ; CHECK-NEXT: [[MUL_I_1:%.*]] = mul i32 [[AND_I_1]], 65535 -; CHECK-NEXT: [[ADD94_5:%.*]] = add i32 [[ADD55_4]], [[ADD55_2]] -; CHECK-NEXT: [[SUB102_1:%.*]] = sub i32 [[ADD55_2]], [[ADD55_4]] -; CHECK-NEXT: 
[[SHR_I_2:%.*]] = lshr i32 [[TMP83]], 15 -; CHECK-NEXT: [[AND_I_2:%.*]] = and i32 [[SHR_I_2]], 65537 -; CHECK-NEXT: [[MUL_I_2:%.*]] = mul i32 [[AND_I_2]], 65535 -; CHECK-NEXT: [[SHR_I49_1:%.*]] = lshr i32 [[TMP84]], 15 +; CHECK-NEXT: [[SHR_I49_1:%.*]] = lshr i32 [[CONV_2]], 15 ; CHECK-NEXT: [[AND_I50_1:%.*]] = and i32 [[SHR_I49_1]], 65537 -; CHECK-NEXT: [[ADD94_2:%.*]] = mul i32 [[AND_I50_1]], 65535 -; CHECK-NEXT: [[TMP144:%.*]] = extractelement <2 x i32> [[TMP123]], i32 0 -; CHECK-NEXT: [[TMP145:%.*]] = extractelement <2 x i32> [[TMP123]], i32 1 -; CHECK-NEXT: [[ADD94_4:%.*]] = add i32 [[TMP144]], [[TMP145]] -; CHECK-NEXT: [[TMP169:%.*]] = sub i32 [[TMP145]], [[TMP144]] +; CHECK-NEXT: [[MUL_I51_1:%.*]] = mul i32 [[AND_I50_1]], 65535 +; CHECK-NEXT: [[TMP86:%.*]] = extractelement <2 x i32> [[TMP222]], i32 0 +; CHECK-NEXT: [[TMP87:%.*]] = extractelement <2 x i32> [[TMP222]], i32 1 +; CHECK-NEXT: [[ADD94_3:%.*]] = add i32 [[TMP86]], [[TMP87]] +; CHECK-NEXT: [[ADD112_1:%.*]] = sub i32 [[TMP87]], [[TMP86]] ; CHECK-NEXT: [[SHR_I49_2:%.*]] = lshr i32 [[CONV_1]], 15 ; CHECK-NEXT: [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537 ; CHECK-NEXT: [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535 -; CHECK-NEXT: [[TMP146:%.*]] = extractelement <2 x i32> [[TMP143]], i32 0 -; CHECK-NEXT: [[TMP147:%.*]] = extractelement <2 x i32> [[TMP143]], i32 1 -; CHECK-NEXT: [[ADD94_3:%.*]] = add i32 [[TMP146]], [[TMP147]] -; CHECK-NEXT: [[SUB102_3:%.*]] = sub i32 [[TMP147]], [[TMP146]] -; CHECK-NEXT: [[SHR_I49_4:%.*]] = lshr i32 [[CONV1]], 15 -; CHECK-NEXT: [[AND_I50_4:%.*]] = and i32 [[SHR_I49_4]], 65537 -; CHECK-NEXT: [[MUL_I51_4:%.*]] = mul i32 [[AND_I50_4]], 65535 -; CHECK-NEXT: [[TMP66:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1 -; CHECK-NEXT: [[TMP102:%.*]] = zext <2 x i8> [[TMP66]] to <2 x i32> -; CHECK-NEXT: [[TMP148:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1 -; CHECK-NEXT: [[TMP67:%.*]] = shufflevector <4 x i8> [[TMP148]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP85:%.*]] 
= zext <2 x i8> [[TMP67]] to <2 x i32> -; CHECK-NEXT: [[TMP149:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 -; CHECK-NEXT: [[TMP106:%.*]] = shufflevector <4 x i8> [[TMP149]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP108:%.*]] = zext <2 x i8> [[TMP106]] to <2 x i32> -; CHECK-NEXT: [[TMP150:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 -; CHECK-NEXT: [[TMP109:%.*]] = shufflevector <4 x i8> [[TMP150]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP89:%.*]] = zext <2 x i8> [[TMP109]] to <2 x i32> -; CHECK-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP108]], [[TMP89]] -; CHECK-NEXT: [[TMP88:%.*]] = shl <2 x i32> [[TMP87]], splat (i32 16) -; CHECK-NEXT: [[TMP112:%.*]] = shufflevector <4 x i8> [[TMP148]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP120:%.*]] = zext <2 x i8> [[TMP112]] to <2 x i32> -; CHECK-NEXT: [[TMP94:%.*]] = shufflevector <4 x i8> [[TMP149]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP128:%.*]] = zext <2 x i8> [[TMP94]] to <2 x i32> -; CHECK-NEXT: [[TMP131:%.*]] = shufflevector <4 x i8> [[TMP150]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP132:%.*]] = zext <2 x i8> [[TMP131]] to <2 x i32> -; CHECK-NEXT: [[TMP95:%.*]] = sub <2 x i32> [[TMP128]], [[TMP132]] -; CHECK-NEXT: [[TMP96:%.*]] = shl <2 x i32> [[TMP95]], splat (i32 16) -; CHECK-NEXT: [[TMP97:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV33]], i32 1 -; CHECK-NEXT: [[TMP117:%.*]] = sub <2 x i32> [[TMP97]], [[TMP120]] -; CHECK-NEXT: [[TMP105:%.*]] = add <2 x i32> [[TMP96]], [[TMP117]] -; CHECK-NEXT: [[TMP86:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV1]], i32 0 -; CHECK-NEXT: [[TMP119:%.*]] = sub <2 x i32> [[TMP86]], [[TMP85]] -; CHECK-NEXT: [[TMP92:%.*]] = add <2 x i32> [[TMP88]], [[TMP119]] -; CHECK-NEXT: [[TMP93:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> [[TMP92]], <2 x i32> -; CHECK-NEXT: [[TMP101:%.*]] = add <2 x i32> [[TMP105]], [[TMP92]] -; CHECK-NEXT: [[TMP151:%.*]] = sub <2 x i32> [[TMP92]], [[TMP105]] -; CHECK-NEXT: [[TMP111:%.*]] = 
extractelement <2 x i32> [[TMP101]], i32 0 -; CHECK-NEXT: [[TMP99:%.*]] = extractelement <2 x i32> [[TMP101]], i32 1 -; CHECK-NEXT: [[ADD55:%.*]] = add i32 [[TMP99]], [[TMP111]] -; CHECK-NEXT: [[SUB51:%.*]] = sub i32 [[TMP111]], [[TMP99]] -; CHECK-NEXT: [[TMP153:%.*]] = extractelement <2 x i32> [[TMP151]], i32 0 -; CHECK-NEXT: [[TMP157:%.*]] = extractelement <2 x i32> [[TMP151]], i32 1 -; CHECK-NEXT: [[ADD78_1:%.*]] = add i32 [[TMP157]], [[TMP153]] -; CHECK-NEXT: [[SUB59:%.*]] = sub i32 [[TMP153]], [[TMP157]] -; CHECK-NEXT: [[SHR_I59_1:%.*]] = lshr i32 [[TMP99]], 15 +; CHECK-NEXT: [[TMP88:%.*]] = extractelement <2 x i32> [[TMP85]], i32 0 +; CHECK-NEXT: [[TMP89:%.*]] = extractelement <2 x i32> [[TMP85]], i32 1 +; CHECK-NEXT: [[ADD94_4:%.*]] = add i32 [[TMP88]], [[TMP89]] +; CHECK-NEXT: [[SUB102_3:%.*]] = sub i32 [[TMP89]], [[TMP88]] +; CHECK-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[CONV1]], 15 +; CHECK-NEXT: [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537 +; CHECK-NEXT: [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535 +; CHECK-NEXT: [[TMP90:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1 +; CHECK-NEXT: [[TMP102:%.*]] = zext <2 x i8> [[TMP90]] to <2 x i32> +; CHECK-NEXT: [[TMP92:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1 +; CHECK-NEXT: [[TMP93:%.*]] = shufflevector <4 x i8> [[TMP92]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP94:%.*]] = zext <2 x i8> [[TMP93]] to <2 x i32> +; CHECK-NEXT: [[TMP95:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 +; CHECK-NEXT: [[TMP96:%.*]] = shufflevector <4 x i8> [[TMP95]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP97:%.*]] = zext <2 x i8> [[TMP96]] to <2 x i32> +; CHECK-NEXT: [[TMP98:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[TMP99:%.*]] = shufflevector <4 x i8> [[TMP98]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP100:%.*]] = zext <2 x i8> [[TMP99]] to <2 x i32> +; CHECK-NEXT: [[TMP101:%.*]] = sub <2 x i32> [[TMP97]], [[TMP100]] +; CHECK-NEXT: [[TMP224:%.*]] = shl <2 x i32> [[TMP101]], splat (i32 
16) +; CHECK-NEXT: [[TMP103:%.*]] = shufflevector <4 x i8> [[TMP92]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP104:%.*]] = zext <2 x i8> [[TMP103]] to <2 x i32> +; CHECK-NEXT: [[TMP105:%.*]] = shufflevector <4 x i8> [[TMP95]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP106:%.*]] = zext <2 x i8> [[TMP105]] to <2 x i32> +; CHECK-NEXT: [[TMP107:%.*]] = shufflevector <4 x i8> [[TMP98]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP108:%.*]] = zext <2 x i8> [[TMP107]] to <2 x i32> +; CHECK-NEXT: [[TMP109:%.*]] = sub <2 x i32> [[TMP106]], [[TMP108]] +; CHECK-NEXT: [[TMP110:%.*]] = shl <2 x i32> [[TMP109]], splat (i32 16) +; CHECK-NEXT: [[TMP111:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV33]], i32 1 +; CHECK-NEXT: [[TMP112:%.*]] = sub <2 x i32> [[TMP111]], [[TMP104]] +; CHECK-NEXT: [[TMP113:%.*]] = add <2 x i32> [[TMP110]], [[TMP112]] +; CHECK-NEXT: [[TMP114:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV1]], i32 0 +; CHECK-NEXT: [[TMP115:%.*]] = sub <2 x i32> [[TMP114]], [[TMP94]] +; CHECK-NEXT: [[TMP116:%.*]] = add <2 x i32> [[TMP224]], [[TMP115]] +; CHECK-NEXT: [[TMP117:%.*]] = shufflevector <2 x i32> [[TMP113]], <2 x i32> [[TMP116]], <2 x i32> +; CHECK-NEXT: [[TMP126:%.*]] = add <2 x i32> [[TMP113]], [[TMP116]] +; CHECK-NEXT: [[TMP119:%.*]] = sub <2 x i32> [[TMP116]], [[TMP113]] +; CHECK-NEXT: [[TMP120:%.*]] = extractelement <2 x i32> [[TMP126]], i32 0 +; CHECK-NEXT: [[TMP127:%.*]] = extractelement <2 x i32> [[TMP126]], i32 1 +; CHECK-NEXT: [[ADD48:%.*]] = add i32 [[TMP127]], [[TMP120]] +; CHECK-NEXT: [[TMP166:%.*]] = sub i32 [[TMP120]], [[TMP127]] +; CHECK-NEXT: [[TMP128:%.*]] = extractelement <2 x i32> [[TMP119]], i32 0 +; CHECK-NEXT: [[TMP129:%.*]] = extractelement <2 x i32> [[TMP119]], i32 1 +; CHECK-NEXT: [[ADD55:%.*]] = add i32 [[TMP129]], [[TMP128]] +; CHECK-NEXT: [[SUB60:%.*]] = sub i32 [[TMP128]], [[TMP129]] +; CHECK-NEXT: [[SHR_I59:%.*]] = lshr i32 [[TMP127]], 15 +; CHECK-NEXT: [[AND_I60:%.*]] = and i32 [[SHR_I59]], 65537 +; 
CHECK-NEXT: [[MUL_I61:%.*]] = mul i32 [[AND_I60]], 65535 +; CHECK-NEXT: [[SHR_I59_1:%.*]] = lshr i32 [[TMP129]], 15 ; CHECK-NEXT: [[AND_I60_1:%.*]] = and i32 [[SHR_I59_1]], 65537 ; CHECK-NEXT: [[MUL_I61_1:%.*]] = mul i32 [[AND_I60_1]], 65535 -; CHECK-NEXT: [[SHR_I59_4:%.*]] = lshr i32 [[TMP157]], 15 -; CHECK-NEXT: [[AND_I60_4:%.*]] = and i32 [[SHR_I59_4]], 65537 -; CHECK-NEXT: [[MUL_I61_4:%.*]] = mul i32 [[AND_I60_4]], 65535 -; CHECK-NEXT: [[TMP104:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1 -; CHECK-NEXT: [[TMP110:%.*]] = zext <2 x i8> [[TMP104]] to <2 x i32> -; CHECK-NEXT: [[TMP158:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1 -; CHECK-NEXT: [[TMP114:%.*]] = shufflevector <4 x i8> [[TMP158]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP133:%.*]] = zext <2 x i8> [[TMP114]] to <2 x i32> -; CHECK-NEXT: [[TMP121:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 -; CHECK-NEXT: [[TMP116:%.*]] = shufflevector <4 x i8> [[TMP121]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP115:%.*]] = zext <2 x i8> [[TMP116]] to <2 x i32> -; CHECK-NEXT: [[TMP159:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 -; CHECK-NEXT: [[TMP118:%.*]] = shufflevector <4 x i8> [[TMP159]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP134:%.*]] = zext <2 x i8> [[TMP118]] to <2 x i32> -; CHECK-NEXT: [[TMP124:%.*]] = sub <2 x i32> [[TMP115]], [[TMP134]] -; CHECK-NEXT: [[TMP125:%.*]] = shl <2 x i32> [[TMP124]], splat (i32 16) -; CHECK-NEXT: [[TMP127:%.*]] = shufflevector <4 x i8> [[TMP158]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP191:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32> -; CHECK-NEXT: [[TMP160:%.*]] = shufflevector <4 x i8> [[TMP121]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP161:%.*]] = zext <2 x i8> [[TMP160]] to <2 x i32> -; CHECK-NEXT: [[TMP171:%.*]] = shufflevector <4 x i8> [[TMP159]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP172:%.*]] = zext <2 x i8> [[TMP171]] to <2 x i32> -; CHECK-NEXT: [[TMP135:%.*]] = sub <2 x i32> [[TMP161]], [[TMP172]] -; 
CHECK-NEXT: [[TMP136:%.*]] = shl <2 x i32> [[TMP135]], splat (i32 16) -; CHECK-NEXT: [[TMP137:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV33_1]], i32 1 -; CHECK-NEXT: [[TMP173:%.*]] = sub <2 x i32> [[TMP137]], [[TMP191]] -; CHECK-NEXT: [[TMP174:%.*]] = add <2 x i32> [[TMP136]], [[TMP173]] -; CHECK-NEXT: [[TMP140:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV_1]], i32 0 -; CHECK-NEXT: [[TMP141:%.*]] = sub <2 x i32> [[TMP140]], [[TMP133]] -; CHECK-NEXT: [[TMP192:%.*]] = add <2 x i32> [[TMP125]], [[TMP141]] -; CHECK-NEXT: [[TMP156:%.*]] = add <2 x i32> [[TMP174]], [[TMP192]] -; CHECK-NEXT: [[TMP155:%.*]] = sub <2 x i32> [[TMP192]], [[TMP174]] -; CHECK-NEXT: [[TMP139:%.*]] = extractelement <2 x i32> [[TMP156]], i32 0 -; CHECK-NEXT: [[TMP142:%.*]] = extractelement <2 x i32> [[TMP156]], i32 1 -; CHECK-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP142]], [[TMP139]] -; CHECK-NEXT: [[SUB45_1:%.*]] = sub i32 [[TMP139]], [[TMP142]] -; CHECK-NEXT: [[TMP138:%.*]] = extractelement <2 x i32> [[TMP155]], i32 0 -; CHECK-NEXT: [[SUB47_1:%.*]] = extractelement <2 x i32> [[TMP155]], i32 1 -; CHECK-NEXT: [[ADD94_1:%.*]] = add i32 [[SUB47_1]], [[TMP138]] -; CHECK-NEXT: [[SUB59_1:%.*]] = sub i32 [[TMP138]], [[SUB47_1]] -; CHECK-NEXT: [[SHR_I54:%.*]] = lshr i32 [[TMP142]], 15 +; CHECK-NEXT: [[TMP130:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1 +; CHECK-NEXT: [[TMP131:%.*]] = zext <2 x i8> [[TMP130]] to <2 x i32> +; CHECK-NEXT: [[TMP132:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1 +; CHECK-NEXT: [[TMP133:%.*]] = shufflevector <4 x i8> [[TMP132]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP134:%.*]] = zext <2 x i8> [[TMP133]] to <2 x i32> +; CHECK-NEXT: [[TMP135:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 +; CHECK-NEXT: [[TMP136:%.*]] = shufflevector <4 x i8> [[TMP135]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP137:%.*]] = zext <2 x i8> [[TMP136]] to <2 x i32> +; CHECK-NEXT: [[TMP138:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 +; CHECK-NEXT: 
[[TMP139:%.*]] = shufflevector <4 x i8> [[TMP138]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP140:%.*]] = zext <2 x i8> [[TMP139]] to <2 x i32> +; CHECK-NEXT: [[TMP141:%.*]] = sub <2 x i32> [[TMP137]], [[TMP140]] +; CHECK-NEXT: [[TMP142:%.*]] = shl <2 x i32> [[TMP141]], splat (i32 16) +; CHECK-NEXT: [[TMP143:%.*]] = shufflevector <4 x i8> [[TMP132]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP144:%.*]] = zext <2 x i8> [[TMP143]] to <2 x i32> +; CHECK-NEXT: [[TMP145:%.*]] = shufflevector <4 x i8> [[TMP135]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP146:%.*]] = zext <2 x i8> [[TMP145]] to <2 x i32> +; CHECK-NEXT: [[TMP147:%.*]] = shufflevector <4 x i8> [[TMP138]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP148:%.*]] = zext <2 x i8> [[TMP147]] to <2 x i32> +; CHECK-NEXT: [[TMP149:%.*]] = sub <2 x i32> [[TMP146]], [[TMP148]] +; CHECK-NEXT: [[TMP150:%.*]] = shl <2 x i32> [[TMP149]], splat (i32 16) +; CHECK-NEXT: [[TMP151:%.*]] = insertelement <2 x i32> [[TMP131]], i32 [[CONV33_1]], i32 1 +; CHECK-NEXT: [[TMP225:%.*]] = sub <2 x i32> [[TMP151]], [[TMP144]] +; CHECK-NEXT: [[TMP153:%.*]] = add <2 x i32> [[TMP150]], [[TMP225]] +; CHECK-NEXT: [[TMP154:%.*]] = insertelement <2 x i32> [[TMP131]], i32 [[CONV_1]], i32 0 +; CHECK-NEXT: [[TMP155:%.*]] = sub <2 x i32> [[TMP154]], [[TMP134]] +; CHECK-NEXT: [[TMP156:%.*]] = add <2 x i32> [[TMP142]], [[TMP155]] +; CHECK-NEXT: [[TMP157:%.*]] = add <2 x i32> [[TMP153]], [[TMP156]] +; CHECK-NEXT: [[TMP158:%.*]] = sub <2 x i32> [[TMP156]], [[TMP153]] +; CHECK-NEXT: [[TMP159:%.*]] = extractelement <2 x i32> [[TMP157]], i32 0 +; CHECK-NEXT: [[TMP160:%.*]] = extractelement <2 x i32> [[TMP157]], i32 1 +; CHECK-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP160]], [[TMP159]] +; CHECK-NEXT: [[SUB51_1:%.*]] = sub i32 [[TMP159]], [[TMP160]] +; CHECK-NEXT: [[TMP161:%.*]] = extractelement <2 x i32> [[TMP158]], i32 0 +; CHECK-NEXT: [[TMP162:%.*]] = extractelement <2 x i32> [[TMP158]], i32 1 +; CHECK-NEXT: [[ADD55_1:%.*]] = add i32 
[[TMP162]], [[TMP161]] +; CHECK-NEXT: [[SUB59_2:%.*]] = sub i32 [[TMP161]], [[TMP162]] +; CHECK-NEXT: [[SHR_I54:%.*]] = lshr i32 [[TMP160]], 15 ; CHECK-NEXT: [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537 ; CHECK-NEXT: [[MUL_I56:%.*]] = mul i32 [[AND_I55]], 65535 -; CHECK-NEXT: [[SHR_I54_1:%.*]] = lshr i32 [[SUB47_1]], 15 +; CHECK-NEXT: [[SHR_I54_1:%.*]] = lshr i32 [[TMP162]], 15 ; CHECK-NEXT: [[AND_I55_1:%.*]] = and i32 [[SHR_I54_1]], 65537 ; CHECK-NEXT: [[MUL_I56_1:%.*]] = mul i32 [[AND_I55_1]], 65535 -; CHECK-NEXT: [[TMP154:%.*]] = lshr <2 x i32> [[TMP110]], splat (i32 15) -; CHECK-NEXT: [[TMP184:%.*]] = and <2 x i32> [[TMP154]], splat (i32 65537) -; CHECK-NEXT: [[TMP195:%.*]] = mul <2 x i32> [[TMP184]], splat (i32 65535) -; CHECK-NEXT: [[ADD78:%.*]] = add i32 [[ADD48_1]], [[ADD55]] -; CHECK-NEXT: [[SUB86:%.*]] = sub i32 [[ADD55]], [[ADD48_1]] -; CHECK-NEXT: [[ADD103:%.*]] = add i32 [[ADD94]], [[ADD78]] -; CHECK-NEXT: [[SUB104:%.*]] = sub i32 [[ADD78]], [[ADD94]] -; CHECK-NEXT: [[ADD105:%.*]] = add i32 [[SUB102]], [[SUB86]] -; CHECK-NEXT: [[SUB106:%.*]] = sub i32 [[SUB86]], [[SUB102]] -; CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I51_3]], [[ADD103]] +; CHECK-NEXT: [[TMP163:%.*]] = lshr <2 x i32> [[TMP131]], splat (i32 15) +; CHECK-NEXT: [[TMP164:%.*]] = and <2 x i32> [[TMP163]], splat (i32 65537) +; CHECK-NEXT: [[TMP165:%.*]] = mul <2 x i32> [[TMP164]], splat (i32 65535) +; CHECK-NEXT: [[ADD78:%.*]] = add i32 [[ADD48_1]], [[ADD48]] +; CHECK-NEXT: [[SUB86:%.*]] = sub i32 [[ADD48]], [[ADD48_1]] +; CHECK-NEXT: [[ADD103:%.*]] = add i32 [[ADD95]], [[ADD78]] +; CHECK-NEXT: [[SUB104:%.*]] = sub i32 [[ADD78]], [[ADD95]] +; CHECK-NEXT: [[ADD105:%.*]] = add i32 [[SUB86_3]], [[SUB86]] +; CHECK-NEXT: [[SUB106:%.*]] = sub i32 [[SUB86]], [[SUB86_3]] +; CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]] ; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP77]] -; CHECK-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I_1]], [[ADD105]] -; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 
[[ADD_I52]], [[TMP34]] +; CHECK-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I51]], [[ADD105]] +; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP45]] ; CHECK-NEXT: [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]] -; CHECK-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP142]] -; CHECK-NEXT: [[ADD_I62:%.*]] = add i32 [[MUL_I61_1]], [[SUB106]] -; CHECK-NEXT: [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP99]] +; CHECK-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP160]] +; CHECK-NEXT: [[ADD_I62:%.*]] = add i32 [[MUL_I61]], [[SUB106]] +; CHECK-NEXT: [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP127]] ; CHECK-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]] ; CHECK-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]] -; CHECK-NEXT: [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I63]] +; CHECK-NEXT: [[ADD105_3:%.*]] = add i32 [[ADD112]], [[XOR_I63]] +; CHECK-NEXT: [[ADD78_1:%.*]] = add i32 [[ADD55_1]], [[ADD55]] +; CHECK-NEXT: [[SUB86_1:%.*]] = sub i32 [[ADD55]], [[ADD55_1]] ; CHECK-NEXT: [[ADD103_1:%.*]] = add i32 [[ADD94_1]], [[ADD78_1]] ; CHECK-NEXT: [[SUB104_1:%.*]] = sub i32 [[ADD78_1]], [[ADD94_1]] -; CHECK-NEXT: [[ADD103_2:%.*]] = add i32 [[ADD94_5]], [[ADD103_1]] -; CHECK-NEXT: [[SUB104_2:%.*]] = sub i32 [[ADD103_1]], [[ADD94_5]] -; CHECK-NEXT: [[ADD105_1:%.*]] = add i32 [[SUB102_1]], [[SUB104_1]] -; CHECK-NEXT: [[SUB106_1:%.*]] = sub i32 [[SUB104_1]], [[SUB102_1]] -; CHECK-NEXT: [[ADD_I_1:%.*]] = add i32 [[MUL_I_2]], [[ADD103_2]] -; CHECK-NEXT: [[XOR_I_1:%.*]] = xor i32 [[ADD_I_1]], [[TMP83]] -; CHECK-NEXT: [[ADD_I52_1:%.*]] = add i32 [[ADD94_2]], [[ADD105_1]] -; CHECK-NEXT: [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP84]] -; CHECK-NEXT: [[ADD_I57_1:%.*]] = add i32 [[MUL_I56_1]], [[SUB104_2]] -; CHECK-NEXT: [[XOR_I58_1:%.*]] = xor i32 [[ADD_I57_1]], [[SUB47_1]] -; CHECK-NEXT: [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_4]], [[SUB106_1]] -; CHECK-NEXT: [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP157]] -; CHECK-NEXT: [[ADD108_1:%.*]] = add i32 
[[XOR_I53_1]], [[ADD113]] +; CHECK-NEXT: [[ADD105_1:%.*]] = add i32 [[SUB102_1]], [[SUB86_1]] +; CHECK-NEXT: [[SUB106_1:%.*]] = sub i32 [[SUB86_1]], [[SUB102_1]] +; CHECK-NEXT: [[ADD_I_1:%.*]] = add i32 [[MUL_I_1]], [[ADD103_1]] +; CHECK-NEXT: [[XOR_I_1:%.*]] = xor i32 [[ADD_I_1]], [[CONV9_2]] +; CHECK-NEXT: [[ADD_I52_1:%.*]] = add i32 [[MUL_I51_1]], [[ADD105_1]] +; CHECK-NEXT: [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[CONV_2]] +; CHECK-NEXT: [[ADD_I57_1:%.*]] = add i32 [[MUL_I56_1]], [[SUB104_1]] +; CHECK-NEXT: [[XOR_I58_1:%.*]] = xor i32 [[ADD_I57_1]], [[TMP162]] +; CHECK-NEXT: [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_1]] +; CHECK-NEXT: [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP129]] +; CHECK-NEXT: [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD105_3]] ; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[XOR_I_1]] -; CHECK-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[XOR_I58_1]] -; CHECK-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I63_1]] -; CHECK-NEXT: [[ADD78_2:%.*]] = add i32 [[SUB45_1]], [[SUB51]] -; CHECK-NEXT: [[TMP170:%.*]] = sub i32 [[SUB51]], [[SUB45_1]] -; CHECK-NEXT: [[TMP162:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0 -; CHECK-NEXT: [[TMP163:%.*]] = shufflevector <2 x i32> [[TMP162]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP164:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_4]], i32 0 -; CHECK-NEXT: [[TMP165:%.*]] = shufflevector <2 x i32> [[TMP164]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP166:%.*]] = add <2 x i32> [[TMP163]], [[TMP165]] -; CHECK-NEXT: [[TMP167:%.*]] = sub <2 x i32> [[TMP163]], [[TMP165]] -; CHECK-NEXT: [[TMP168:%.*]] = shufflevector <2 x i32> [[TMP166]], <2 x i32> [[TMP167]], <2 x i32> -; CHECK-NEXT: [[ADD105_2:%.*]] = add i32 [[TMP169]], [[TMP170]] -; CHECK-NEXT: [[SUB106_2:%.*]] = sub i32 [[TMP170]], [[TMP169]] -; CHECK-NEXT: [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_2]], [[ADD105_2]] -; CHECK-NEXT: [[XOR_I53_2:%.*]] = xor i32 
[[ADD_I52_2]], [[CONV_1]] -; CHECK-NEXT: [[TMP197:%.*]] = add <2 x i32> [[TMP195]], [[TMP168]] -; CHECK-NEXT: [[TMP152:%.*]] = xor <2 x i32> [[TMP197]], [[TMP110]] -; CHECK-NEXT: [[SHR_I59_2:%.*]] = lshr i32 [[TMP111]], 15 -; CHECK-NEXT: [[AND_I60_2:%.*]] = and i32 [[SHR_I59_2]], 65537 -; CHECK-NEXT: [[MUL_I61_2:%.*]] = mul i32 [[AND_I60_2]], 65535 -; CHECK-NEXT: [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]] -; CHECK-NEXT: [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP111]] -; CHECK-NEXT: [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_1]] -; CHECK-NEXT: [[TMP175:%.*]] = extractelement <2 x i32> [[TMP152]], i32 0 -; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP175]] -; CHECK-NEXT: [[TMP176:%.*]] = extractelement <2 x i32> [[TMP152]], i32 1 -; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP176]] -; CHECK-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]] -; CHECK-NEXT: [[ADD78_3:%.*]] = add i32 [[SUB59_1]], [[SUB59]] -; CHECK-NEXT: [[SUB86_3:%.*]] = sub i32 [[SUB59]], [[SUB59_1]] +; CHECK-NEXT: [[ADD112_5:%.*]] = add i32 [[ADD110_1]], [[XOR_I58_1]] +; CHECK-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_5]], [[XOR_I63_1]] +; CHECK-NEXT: [[ADD78_3:%.*]] = add i32 [[SUB51_1]], [[TMP166]] +; CHECK-NEXT: [[TMP204:%.*]] = sub i32 [[TMP166]], [[SUB51_1]] ; CHECK-NEXT: [[TMP177:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_3]], i32 0 ; CHECK-NEXT: [[TMP178:%.*]] = shufflevector <2 x i32> [[TMP177]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP179:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0 ; CHECK-NEXT: [[TMP180:%.*]] = shufflevector <2 x i32> [[TMP179]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP181:%.*]] = add <2 x i32> [[TMP178]], [[TMP180]] -; CHECK-NEXT: [[TMP182:%.*]] = sub <2 x i32> [[TMP178]], [[TMP180]] -; CHECK-NEXT: [[TMP183:%.*]] = shufflevector <2 x i32> [[TMP181]], <2 x i32> [[TMP182]], <2 x i32> -; CHECK-NEXT: [[ADD105_3:%.*]] = add i32 [[SUB102_3]], 
[[SUB86_3]] -; CHECK-NEXT: [[SUB106_3:%.*]] = sub i32 [[SUB86_3]], [[SUB102_3]] -; CHECK-NEXT: [[ADD_I52_3:%.*]] = add i32 [[MUL_I51_4]], [[ADD105_3]] -; CHECK-NEXT: [[XOR_I53_3:%.*]] = xor i32 [[ADD_I52_3]], [[CONV1]] +; CHECK-NEXT: [[TMP199:%.*]] = add <2 x i32> [[TMP178]], [[TMP180]] +; CHECK-NEXT: [[TMP200:%.*]] = sub <2 x i32> [[TMP178]], [[TMP180]] +; CHECK-NEXT: [[TMP201:%.*]] = shufflevector <2 x i32> [[TMP199]], <2 x i32> [[TMP200]], <2 x i32> +; CHECK-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[TMP204]] +; CHECK-NEXT: [[SUB106_2:%.*]] = sub i32 [[TMP204]], [[ADD112_1]] +; CHECK-NEXT: [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_2]], [[ADD113_1]] +; CHECK-NEXT: [[XOR_I53_2:%.*]] = xor i32 [[ADD_I52_2]], [[CONV_1]] +; CHECK-NEXT: [[TMP208:%.*]] = add <2 x i32> [[TMP165]], [[TMP201]] +; CHECK-NEXT: [[TMP209:%.*]] = xor <2 x i32> [[TMP208]], [[TMP131]] +; CHECK-NEXT: [[SHR_I59_2:%.*]] = lshr i32 [[TMP120]], 15 +; CHECK-NEXT: [[AND_I60_2:%.*]] = and i32 [[SHR_I59_2]], 65537 +; CHECK-NEXT: [[MUL_I61_2:%.*]] = mul i32 [[AND_I60_2]], 65535 +; CHECK-NEXT: [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]] +; CHECK-NEXT: [[XOR_I63_4:%.*]] = xor i32 [[ADD_I62_2]], [[TMP120]] +; CHECK-NEXT: [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_2]] +; CHECK-NEXT: [[TMP211:%.*]] = extractelement <2 x i32> [[TMP209]], i32 0 +; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP211]] +; CHECK-NEXT: [[TMP212:%.*]] = extractelement <2 x i32> [[TMP209]], i32 1 +; CHECK-NEXT: [[ADD112_4:%.*]] = add i32 [[ADD110_2]], [[TMP212]] +; CHECK-NEXT: [[ADD113_4:%.*]] = add i32 [[ADD112_4]], [[XOR_I63_4]] +; CHECK-NEXT: [[ADD78_4:%.*]] = add i32 [[SUB59_2]], [[SUB60]] +; CHECK-NEXT: [[SUB86_4:%.*]] = sub i32 [[SUB60]], [[SUB59_2]] +; CHECK-NEXT: [[TMP213:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_4]], i32 0 +; CHECK-NEXT: [[TMP214:%.*]] = shufflevector <2 x i32> [[TMP213]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP215:%.*]] = insertelement <2 x 
i32> poison, i32 [[ADD94_4]], i32 0 +; CHECK-NEXT: [[TMP216:%.*]] = shufflevector <2 x i32> [[TMP215]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP217:%.*]] = add <2 x i32> [[TMP214]], [[TMP216]] +; CHECK-NEXT: [[TMP218:%.*]] = sub <2 x i32> [[TMP214]], [[TMP216]] +; CHECK-NEXT: [[TMP219:%.*]] = shufflevector <2 x i32> [[TMP217]], <2 x i32> [[TMP218]], <2 x i32> +; CHECK-NEXT: [[ADD105_4:%.*]] = add i32 [[SUB102_3]], [[SUB86_4]] +; CHECK-NEXT: [[SUB106_3:%.*]] = sub i32 [[SUB86_4]], [[SUB102_3]] +; CHECK-NEXT: [[ADD_I52_4:%.*]] = add i32 [[MUL_I51_3]], [[ADD105_4]] +; CHECK-NEXT: [[XOR_I53_3:%.*]] = xor i32 [[ADD_I52_4]], [[CONV1]] ; CHECK-NEXT: [[TMP185:%.*]] = lshr <2 x i32> [[TMP102]], splat (i32 15) ; CHECK-NEXT: [[TMP193:%.*]] = and <2 x i32> [[TMP185]], splat (i32 65537) ; CHECK-NEXT: [[TMP186:%.*]] = mul <2 x i32> [[TMP193]], splat (i32 65535) -; CHECK-NEXT: [[TMP187:%.*]] = add <2 x i32> [[TMP186]], [[TMP183]] +; CHECK-NEXT: [[TMP187:%.*]] = add <2 x i32> [[TMP186]], [[TMP219]] ; CHECK-NEXT: [[TMP188:%.*]] = xor <2 x i32> [[TMP187]], [[TMP102]] ; CHECK-NEXT: [[SHR_I59_3:%.*]] = lshr i32 [[CONV33]], 15 ; CHECK-NEXT: [[AND_I60_3:%.*]] = and i32 [[SHR_I59_3]], 65537 ; CHECK-NEXT: [[MUL_I61_3:%.*]] = mul i32 [[AND_I60_3]], 65535 ; CHECK-NEXT: [[ADD_I62_3:%.*]] = add i32 [[MUL_I61_3]], [[SUB106_3]] ; CHECK-NEXT: [[XOR_I63_3:%.*]] = xor i32 [[ADD_I62_3]], [[CONV33]] -; CHECK-NEXT: [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_2]] +; CHECK-NEXT: [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_4]] ; CHECK-NEXT: [[TMP189:%.*]] = extractelement <2 x i32> [[TMP188]], i32 0 ; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP189]] ; CHECK-NEXT: [[TMP190:%.*]] = extractelement <2 x i32> [[TMP188]], i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll index 360b258f216c5..f875d45db61dd 100644 --- 
a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll @@ -14,7 +14,7 @@ define void @test() { ; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr undef, align 4 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> , float [[TMP2]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP0]], float [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> , <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> , float [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP6]]) ; CHECK-NEXT: br i1 false, label [[BB2:%.*]], label [[BB3:%.*]] ; CHECK: bb2: diff --git a/llvm/test/Transforms/SLPVectorizer/alternate-cmp-swapped-pred-parent.ll b/llvm/test/Transforms/SLPVectorizer/alternate-cmp-swapped-pred-parent.ll index 371b23019498d..afca39ad8938a 100644 --- a/llvm/test/Transforms/SLPVectorizer/alternate-cmp-swapped-pred-parent.ll +++ b/llvm/test/Transforms/SLPVectorizer/alternate-cmp-swapped-pred-parent.ll @@ -12,7 +12,8 @@ define void @test() { ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> , i16 [[CALL37]], i32 3 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> [[TMP0]], i16 [[CALL]], i32 5 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> , <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i16> , i16 [[CALL37]], i32 6 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i16> [[TMP5]], i16 [[CALL]], i32 7 ; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <8 x i16> [[TMP2]], [[TMP3]] ; CHECK-NEXT: ret void ; @@ -43,7 +44,8 @@ define void @test1() { ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> , i16 [[CALL]], i32 3 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> [[TMP0]], i16 [[CALL37]], i32 4 ; CHECK-NEXT: 
[[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> , <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i16> , i16 [[CALL]], i32 6 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i16> [[TMP5]], i16 [[CALL37]], i32 7 ; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <8 x i16> [[TMP2]], [[TMP3]] ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll index 261ec2b3935d7..40568f9c8a509 100644 --- a/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll +++ b/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll @@ -1,31 +1,56 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: %if x86-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %} -; RUN: %if aarch64-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %} +; RUN: %if x86-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefix X86 %} +; RUN: %if aarch64-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefix AARCH64 %} define i1 @test(float %0, double %1) { -; CHECK-LABEL: define i1 @test -; CHECK-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) { -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> , float [[TMP0]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> , double [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> zeroinitializer, [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = 
shufflevector <4 x double> [[TMP7]], <4 x double> , <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> , <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = fmul <4 x double> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]] -; CHECK-NEXT: [[TMP12:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> , <4 x double> [[TMP10]], i64 0) -; CHECK-NEXT: [[TMP13:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> , <4 x double> [[TMP11]], i64 0) -; CHECK-NEXT: [[TMP14:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> [[TMP13]], <2 x double> [[TMP6]], i64 4) -; CHECK-NEXT: [[TMP15:%.*]] = fsub <8 x double> [[TMP12]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = fmul <8 x double> [[TMP12]], [[TMP14]] -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <8 x double> [[TMP15]], <8 x double> [[TMP16]], <8 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = fptrunc <8 x double> [[TMP17]] to <8 x float> -; CHECK-NEXT: [[TMP19:%.*]] = fmul <8 x float> [[TMP18]], zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = fcmp oeq <8 x float> [[TMP19]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = freeze <8 x i1> [[TMP20]] -; CHECK-NEXT: [[TMP22:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP21]]) -; CHECK-NEXT: ret i1 [[TMP22]] +; X86-LABEL: define i1 @test +; X86-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) { +; X86-NEXT: [[TMP3:%.*]] = insertelement <4 x float> , float [[TMP0]], i32 3 +; X86-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double> +; X86-NEXT: [[TMP5:%.*]] = insertelement <2 x double> , double [[TMP1]], i32 0 +; X86-NEXT: [[TMP6:%.*]] = fmul <2 x double> zeroinitializer, [[TMP5]] +; X86-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> +; X86-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> , <4 x i32> +; X86-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> , <4 x 
i32> +; X86-NEXT: [[TMP10:%.*]] = fmul <4 x double> [[TMP8]], [[TMP9]] +; X86-NEXT: [[TMP11:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]] +; X86-NEXT: [[TMP12:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> , <4 x double> [[TMP10]], i64 0) +; X86-NEXT: [[TMP13:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> , <4 x double> [[TMP11]], i64 0) +; X86-NEXT: [[TMP14:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> [[TMP13]], <2 x double> [[TMP6]], i64 4) +; X86-NEXT: [[TMP15:%.*]] = fsub <8 x double> [[TMP12]], [[TMP14]] +; X86-NEXT: [[TMP16:%.*]] = fmul <8 x double> [[TMP12]], [[TMP14]] +; X86-NEXT: [[TMP17:%.*]] = shufflevector <8 x double> [[TMP15]], <8 x double> [[TMP16]], <8 x i32> +; X86-NEXT: [[TMP18:%.*]] = fptrunc <8 x double> [[TMP17]] to <8 x float> +; X86-NEXT: [[TMP19:%.*]] = fmul <8 x float> [[TMP18]], zeroinitializer +; X86-NEXT: [[TMP20:%.*]] = fcmp oeq <8 x float> [[TMP19]], zeroinitializer +; X86-NEXT: [[TMP21:%.*]] = freeze <8 x i1> [[TMP20]] +; X86-NEXT: [[TMP22:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP21]]) +; X86-NEXT: ret i1 [[TMP22]] +; +; AARCH64-LABEL: define i1 @test +; AARCH64-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) { +; AARCH64-NEXT: [[TMP3:%.*]] = insertelement <4 x float> , float [[TMP0]], i32 3 +; AARCH64-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double> +; AARCH64-NEXT: [[TMP5:%.*]] = insertelement <2 x double> , double [[TMP1]], i32 0 +; AARCH64-NEXT: [[TMP6:%.*]] = fmul <2 x double> zeroinitializer, [[TMP5]] +; AARCH64-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> +; AARCH64-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> , <4 x i32> +; AARCH64-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> , <4 x i32> +; AARCH64-NEXT: [[TMP10:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> [[TMP4]], <4 x i32> +; AARCH64-NEXT: [[TMP11:%.*]] 
= fmul <4 x double> [[TMP8]], [[TMP10]] +; AARCH64-NEXT: [[TMP12:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]] +; AARCH64-NEXT: [[TMP13:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> , <4 x double> [[TMP11]], i64 0) +; AARCH64-NEXT: [[TMP14:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> , <4 x double> [[TMP12]], i64 0) +; AARCH64-NEXT: [[TMP15:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> [[TMP14]], <2 x double> [[TMP6]], i64 4) +; AARCH64-NEXT: [[TMP16:%.*]] = fsub <8 x double> [[TMP13]], [[TMP15]] +; AARCH64-NEXT: [[TMP17:%.*]] = fmul <8 x double> [[TMP13]], [[TMP15]] +; AARCH64-NEXT: [[TMP18:%.*]] = shufflevector <8 x double> [[TMP16]], <8 x double> [[TMP17]], <8 x i32> +; AARCH64-NEXT: [[TMP19:%.*]] = fptrunc <8 x double> [[TMP18]] to <8 x float> +; AARCH64-NEXT: [[TMP20:%.*]] = fmul <8 x float> [[TMP19]], zeroinitializer +; AARCH64-NEXT: [[TMP21:%.*]] = fcmp oeq <8 x float> [[TMP20]], zeroinitializer +; AARCH64-NEXT: [[TMP22:%.*]] = freeze <8 x i1> [[TMP21]] +; AARCH64-NEXT: [[TMP23:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP22]]) +; AARCH64-NEXT: ret i1 [[TMP23]] ; %3 = fpext float %0 to double %4 = fpext float 0.000000e+00 to double diff --git a/llvm/test/Transforms/SLPVectorizer/full-overlap-non-schedulable.ll b/llvm/test/Transforms/SLPVectorizer/full-overlap-non-schedulable.ll index dbd91199c24ec..c704baaad6f71 100644 --- a/llvm/test/Transforms/SLPVectorizer/full-overlap-non-schedulable.ll +++ b/llvm/test/Transforms/SLPVectorizer/full-overlap-non-schedulable.ll @@ -24,7 +24,9 @@ define void @test(ptr %p1, ptr %0, i32 %1, i1 %c1, ptr %p2) { ; CHECK: [[L47]]: ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x ptr> [[TMP5]], i32 1 ; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x ptr> [[TMP5]], <4 x ptr> poison, <2 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP6]], i32 0 
+; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <4 x ptr> [[TMP5]], <4 x ptr> poison, <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x ptr> [[TMP25]], <2 x ptr> [[TMP26]], <2 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq <2 x ptr> [[TMP14]], zeroinitializer ; CHECK-NEXT: [[TMP16:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4 ; CHECK-NEXT: [[TMP17:%.*]] = select <2 x i1> [[TMP15]], <2 x i32> zeroinitializer, <2 x i32> [[TMP16]] diff --git a/llvm/test/Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll b/llvm/test/Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll index a854c61db6d28..a42c8f2c650ae 100644 --- a/llvm/test/Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll +++ b/llvm/test/Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll @@ -11,8 +11,8 @@ define i32 @test(i8 %0) { ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <2 x i8> zeroinitializer, [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = load volatile i8, ptr null, align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i8>, ptr addrspace(21) getelementptr inbounds (i8, ptr addrspace(21) null, i64 8), align 8 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i8> [[TMP5]], <2 x i8> poison, <8 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i8> [[TMP6]], <8 x i8> , <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i8> [[TMP5]], <2 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i8> , <8 x i8> [[TMP6]], <8 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <8 x i8> zeroinitializer, [[TMP7]] ; CHECK-NEXT: [[TEST_STRUCTCOPY_14_S14_CM_COERCE_SROA_2_0_COPYLOAD:%.*]] = load i48, ptr addrspace(21) getelementptr inbounds (i8, ptr addrspace(21) null, i64 8), align 8 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i48> , i48 [[TEST_STRUCTCOPY_14_S14_CM_COERCE_SROA_2_0_COPYLOAD]], i32 0 @@ -21,9 +21,9 @@ define i32 @test(i8 %0) { ; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(21) null, align 2 ; CHECK-NEXT: [[TMP13:%.*]] = 
load volatile i8, ptr null, align 2 ; CHECK-NEXT: [[TMP14:%.*]] = load <2 x i8>, ptr addrspace(21) getelementptr inbounds (i8, ptr addrspace(21) null, i64 8), align 8 -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <2 x i8> [[TMP14]], <2 x i8> poison, <8 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <8 x i8> [[TMP15]], <8 x i8> , <8 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x i8> [[TMP16]], i8 [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x i8> , i8 [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x i8> [[TMP14]], <2 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <8 x i8> [[TMP15]], <8 x i8> [[TMP16]], <8 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = insertelement <8 x i8> , i8 [[TMP0]], i32 3 ; CHECK-NEXT: [[TMP19:%.*]] = insertelement <8 x i8> [[TMP18]], i8 [[TMP13]], i32 1 ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq <8 x i8> [[TMP17]], [[TMP19]] diff --git a/llvm/test/Transforms/SLPVectorizer/reorder-clustered-node.ll b/llvm/test/Transforms/SLPVectorizer/reorder-clustered-node.ll index 561182d5e4f49..940ee5b95871d 100644 --- a/llvm/test/Transforms/SLPVectorizer/reorder-clustered-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/reorder-clustered-node.ll @@ -1,30 +1,54 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S < %s -mtriple=x86_64 -slp-threshold=-150 | FileCheck %s %} -; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S < %s -mtriple=aarch64-unknown-linux-gnu -slp-threshold=-150 | FileCheck %s %} +; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S < %s -mtriple=x86_64 -slp-threshold=-150 | FileCheck %s --check-prefix X86 %} +; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S < %s -mtriple=aarch64-unknown-linux-gnu -slp-threshold=-150 | FileCheck %s --check-prefix AARCH64 %} define i1 @test(ptr %arg, ptr %i233, i64 %i241, ptr %i235, ptr %i237, ptr %i227) { 
-; CHECK-LABEL: @test( -; CHECK-NEXT: bb: -; CHECK-NEXT: [[I226:%.*]] = getelementptr ptr, ptr [[ARG:%.*]], i32 7 -; CHECK-NEXT: [[I242:%.*]] = getelementptr double, ptr [[I233:%.*]], i64 [[I241:%.*]] -; CHECK-NEXT: [[I245:%.*]] = getelementptr double, ptr [[I235:%.*]], i64 [[I241]] -; CHECK-NEXT: [[I248:%.*]] = getelementptr double, ptr [[I237:%.*]], i64 [[I241]] -; CHECK-NEXT: [[I250:%.*]] = getelementptr double, ptr [[I227:%.*]], i64 [[I241]] -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x ptr>, ptr [[I226]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x ptr> , ptr [[I242]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> [[TMP2]], ptr [[I250]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = icmp ult <8 x ptr> [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x ptr> [[TMP5]], ptr [[I245]], i32 2 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x ptr> [[TMP6]], ptr [[I248]], i32 3 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x ptr> [[TMP7]], <4 x ptr> poison, <8 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> , <8 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = icmp ult <8 x ptr> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i1> [[TMP4]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP11]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = and i1 [[TMP12]], false -; CHECK-NEXT: ret i1 [[OP_RDX]] +; X86-LABEL: @test( +; X86-NEXT: bb: +; X86-NEXT: [[I226:%.*]] = getelementptr ptr, ptr [[ARG:%.*]], i32 7 +; X86-NEXT: [[I242:%.*]] = getelementptr double, ptr [[I233:%.*]], i64 [[I241:%.*]] +; X86-NEXT: [[I245:%.*]] = getelementptr double, ptr [[I235:%.*]], i64 [[I241]] +; X86-NEXT: [[I248:%.*]] = getelementptr double, ptr [[I237:%.*]], i64 [[I241]] +; X86-NEXT: [[I250:%.*]] = getelementptr double, ptr 
[[I227:%.*]], i64 [[I241]] +; X86-NEXT: [[TMP0:%.*]] = load <4 x ptr>, ptr [[I226]], align 8 +; X86-NEXT: [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <8 x i32> +; X86-NEXT: [[TMP2:%.*]] = insertelement <8 x ptr> , ptr [[I242]], i32 0 +; X86-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> [[TMP2]], ptr [[I250]], i32 2 +; X86-NEXT: [[TMP4:%.*]] = icmp ult <8 x ptr> [[TMP3]], [[TMP1]] +; X86-NEXT: [[TMP5:%.*]] = insertelement <8 x ptr> poison, ptr [[I250]], i32 0 +; X86-NEXT: [[TMP6:%.*]] = insertelement <8 x ptr> [[TMP5]], ptr [[I242]], i32 1 +; X86-NEXT: [[TMP7:%.*]] = insertelement <8 x ptr> [[TMP6]], ptr [[I245]], i32 2 +; X86-NEXT: [[TMP8:%.*]] = insertelement <8 x ptr> [[TMP7]], ptr [[I248]], i32 3 +; X86-NEXT: [[TMP9:%.*]] = shufflevector <8 x ptr> [[TMP8]], <8 x ptr> poison, <8 x i32> +; X86-NEXT: [[TMP10:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> , <8 x i32> +; X86-NEXT: [[TMP11:%.*]] = icmp ult <8 x ptr> [[TMP9]], [[TMP10]] +; X86-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP4]], [[TMP11]] +; X86-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP12]]) +; X86-NEXT: [[OP_RDX:%.*]] = and i1 [[TMP13]], false +; X86-NEXT: ret i1 [[OP_RDX]] +; +; AARCH64-LABEL: @test( +; AARCH64-NEXT: bb: +; AARCH64-NEXT: [[I226:%.*]] = getelementptr ptr, ptr [[ARG:%.*]], i32 7 +; AARCH64-NEXT: [[I242:%.*]] = getelementptr double, ptr [[I233:%.*]], i64 [[I241:%.*]] +; AARCH64-NEXT: [[I245:%.*]] = getelementptr double, ptr [[I235:%.*]], i64 [[I241]] +; AARCH64-NEXT: [[I248:%.*]] = getelementptr double, ptr [[I237:%.*]], i64 [[I241]] +; AARCH64-NEXT: [[I250:%.*]] = getelementptr double, ptr [[I227:%.*]], i64 [[I241]] +; AARCH64-NEXT: [[TMP0:%.*]] = load <4 x ptr>, ptr [[I226]], align 8 +; AARCH64-NEXT: [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <8 x i32> +; AARCH64-NEXT: [[TMP2:%.*]] = insertelement <8 x ptr> , ptr [[I242]], i32 0 +; AARCH64-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> [[TMP2]], ptr [[I250]], i32 
2 +; AARCH64-NEXT: [[TMP4:%.*]] = icmp ult <8 x ptr> [[TMP3]], [[TMP1]] +; AARCH64-NEXT: [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <4 x i32> +; AARCH64-NEXT: [[TMP6:%.*]] = insertelement <4 x ptr> [[TMP5]], ptr [[I245]], i32 2 +; AARCH64-NEXT: [[TMP7:%.*]] = insertelement <4 x ptr> [[TMP6]], ptr [[I248]], i32 3 +; AARCH64-NEXT: [[TMP8:%.*]] = shufflevector <4 x ptr> [[TMP7]], <4 x ptr> poison, <8 x i32> +; AARCH64-NEXT: [[TMP9:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> , <8 x i32> +; AARCH64-NEXT: [[TMP10:%.*]] = icmp ult <8 x ptr> [[TMP8]], [[TMP9]] +; AARCH64-NEXT: [[TMP11:%.*]] = or <8 x i1> [[TMP4]], [[TMP10]] +; AARCH64-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP11]]) +; AARCH64-NEXT: [[OP_RDX:%.*]] = and i1 [[TMP12]], false +; AARCH64-NEXT: ret i1 [[OP_RDX]] ; bb: %i226 = getelementptr ptr, ptr %arg, i32 7 diff --git a/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll b/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll index 61a84a67c9ff1..056b6222cae72 100644 --- a/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll +++ b/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll @@ -13,9 +13,9 @@ define void @func(i32 %0) { ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <32 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP6]] to i64 ; CHECK-NEXT: [[TMP10:%.*]] = or i64 [[TMP9]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <32 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <32 x i32> [[TMP11]], <32 x i32> , <32 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <32 x i32> [[TMP12]], i32 0, i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP9]] to i32 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <32 x i32> , i32 [[TMP11]], i32 30 +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <32 x i32> [[TMP12]], <32 x i32> poison, <32 x i32> ; CHECK-NEXT: 
[[TMP14:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v8i32(<32 x i32> [[TMP13]], <8 x i32> zeroinitializer, i64 16) ; CHECK-NEXT: [[TMP15:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v4i32(<32 x i32> [[TMP14]], <4 x i32> zeroinitializer, i64 24) ; CHECK-NEXT: [[TMP16:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v2i32(<32 x i32> [[TMP15]], <2 x i32> zeroinitializer, i64 14) From f2334c5919ec077e6a8deeaf43a5b5188baf0251 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Tue, 24 Dec 2024 21:43:35 +0000 Subject: [PATCH 027/567] [llvm-exegesis] Make benchmark pinning actually work When originally writing this feature up, I apparently completely forgot to actually make the test exercise it and left an extra exit in the function implementing the functionality without the appropriate preprocessor macros around it, causing things to never work. This patch should fix that. --- llvm/test/tools/llvm-exegesis/X86/latency/cpu-pinning.s | 2 +- llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/test/tools/llvm-exegesis/X86/latency/cpu-pinning.s b/llvm/test/tools/llvm-exegesis/X86/latency/cpu-pinning.s index 0ea3752fc3bb9..e7430e462d5cc 100644 --- a/llvm/test/tools/llvm-exegesis/X86/latency/cpu-pinning.s +++ b/llvm/test/tools/llvm-exegesis/X86/latency/cpu-pinning.s @@ -1,5 +1,5 @@ # REQUIRES: exegesis-can-measure-latency, x86_64-linux -# RUN: llvm-exegesis -mtriple=x86_64-unknown-unknown -mode=latency -opcode-name=ADD64rr -execution-mode=subprocess | FileCheck %s +# RUN: llvm-exegesis -mtriple=x86_64-unknown-unknown -mode=latency -opcode-name=ADD64rr -execution-mode=subprocess --benchmark-process-cpu=0 | FileCheck %s # CHECK: - { key: latency, value: {{[0-9.]*}}, per_snippet_value: {{[0-9.]*}} diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp index 9116b5ced0274..a8226b810c242 100644 --- 
a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp +++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp @@ -422,8 +422,9 @@ class SubProcessFunctionExecutorImpl "Expected getcpu call to succeed."); assert(static_cast(CurrentCPU) == CPUToUse && "Expected current CPU to equal the CPU requested by the user"); +#else + exit(ChildProcessExitCodeE::SetCPUAffinityFailed); #endif // defined(__x86_64__) && defined(SYS_getcpu) - exit(ChildProcessExitCodeE::SetCPUAffinityFailed); } Error createSubProcessAndRunBenchmark( From 4b3d439e7e7b4e794e523caea9863d67ff8cf85f Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 24 Dec 2024 13:53:26 -0800 Subject: [PATCH 028/567] [test] Replace -riscv-no-alises with -M no-aliases The new option from https://reviews.llvm.org/D103004 is preferred. --- llvm/test/MC/RISCV/XVentanaCondOps-valid.s | 2 +- llvm/test/MC/RISCV/Zawrs-valid.s | 4 ++-- llvm/test/MC/RISCV/Ztso.s | 4 ++-- llvm/test/MC/RISCV/compress-cjal.s | 2 +- llvm/test/MC/RISCV/compress-debug-info.s | 4 ++-- llvm/test/MC/RISCV/compress-rv32d.s | 8 ++++---- llvm/test/MC/RISCV/compress-rv32f.s | 4 ++-- llvm/test/MC/RISCV/compress-rv32i.s | 4 ++-- llvm/test/MC/RISCV/compress-rv64i.s | 2 +- llvm/test/MC/RISCV/compressed-relocations.s | 2 +- llvm/test/MC/RISCV/compressed-zicfiss.s | 6 +++--- llvm/test/MC/RISCV/corev/XCValu-valid.s | 2 +- llvm/test/MC/RISCV/corev/XCVmac-valid.s | 2 +- llvm/test/MC/RISCV/debug-valid.s | 4 ++-- llvm/test/MC/RISCV/deprecated-csr-names.s | 4 ++-- llvm/test/MC/RISCV/fixups-binary-expression.s | 2 +- llvm/test/MC/RISCV/fixups.s | 2 +- llvm/test/MC/RISCV/fp-default-rounding-mode.s | 2 +- llvm/test/MC/RISCV/fp-inx-default-rounding-mode.s | 2 +- llvm/test/MC/RISCV/hypervisor-csr-names.s | 4 ++-- llvm/test/MC/RISCV/insn.s | 4 ++-- llvm/test/MC/RISCV/insn_c.s | 4 ++-- llvm/test/MC/RISCV/machine-csr-names.s | 4 ++-- llvm/test/MC/RISCV/option-nopic.s | 8 ++++---- llvm/test/MC/RISCV/option-pic.s | 8 ++++---- llvm/test/MC/RISCV/option-pushpop.s | 4 
++-- llvm/test/MC/RISCV/option-rvc.s | 4 ++-- llvm/test/MC/RISCV/print-imm-hex.s | 4 ++-- llvm/test/MC/RISCV/priv-valid.s | 4 ++-- llvm/test/MC/RISCV/relocations.s | 2 +- llvm/test/MC/RISCV/rv32-hypervisor-csr-names.s | 2 +- llvm/test/MC/RISCV/rv32-machine-csr-names.s | 2 +- llvm/test/MC/RISCV/rv32-supervisor-csr-names.s | 2 +- llvm/test/MC/RISCV/rv32-user-csr-names.s | 2 +- llvm/test/MC/RISCV/rv32c-aliases-valid.s | 4 ++-- llvm/test/MC/RISCV/rv32c-only-valid.s | 8 ++++---- llvm/test/MC/RISCV/rv32dc-valid.s | 8 ++++---- llvm/test/MC/RISCV/rv32fc-aliases-valid.s | 2 +- llvm/test/MC/RISCV/rv32fc-valid.s | 10 +++++----- llvm/test/MC/RISCV/rv32i-aliases-invalid.s | 2 +- llvm/test/MC/RISCV/rv32i-aliases-valid.s | 2 +- llvm/test/MC/RISCV/rv32i-only-valid.s | 2 +- llvm/test/MC/RISCV/rv32zbb-aliases-valid.s | 2 +- llvm/test/MC/RISCV/rv32zbb-only-valid.s | 2 +- llvm/test/MC/RISCV/rv32zbs-aliases-valid.s | 2 +- llvm/test/MC/RISCV/rv32zcmp-invalid.s | 2 +- llvm/test/MC/RISCV/rv32zcmp-valid.s | 2 +- llvm/test/MC/RISCV/rv32zfa-only-valid.s | 4 ++-- llvm/test/MC/RISCV/rv32zicfiss-invalid.s | 2 +- llvm/test/MC/RISCV/rv32zmmul-invaild.s | 2 +- llvm/test/MC/RISCV/rv32zmmul-valid.s | 2 +- llvm/test/MC/RISCV/rv64-machine-csr-names.s | 2 +- llvm/test/MC/RISCV/rv64-user-csr-names.s | 2 +- llvm/test/MC/RISCV/rv64a-aliases-valid.s | 2 +- llvm/test/MC/RISCV/rv64c-aliases-valid.s | 4 ++-- llvm/test/MC/RISCV/rv64c-hints-valid.s | 4 ++-- llvm/test/MC/RISCV/rv64c-valid.s | 8 ++++---- llvm/test/MC/RISCV/rv64d-aliases-valid.s | 2 +- llvm/test/MC/RISCV/rv64d-valid.s | 2 +- llvm/test/MC/RISCV/rv64dc-valid.s | 8 ++++---- llvm/test/MC/RISCV/rv64e-valid.s | 2 +- llvm/test/MC/RISCV/rv64e-zcmp-valid.s | 2 +- llvm/test/MC/RISCV/rv64f-aliases-valid.s | 2 +- llvm/test/MC/RISCV/rv64f-valid.s | 2 +- llvm/test/MC/RISCV/rv64i-aliases-invalid.s | 2 +- llvm/test/MC/RISCV/rv64i-aliases-valid.s | 2 +- llvm/test/MC/RISCV/rv64i-valid.s | 2 +- llvm/test/MC/RISCV/rv64ih-valid.s | 2 +- 
llvm/test/MC/RISCV/rv64m-valid.s | 2 +- llvm/test/MC/RISCV/rv64zaamo-valid.s | 4 ++-- llvm/test/MC/RISCV/rv64zacas-valid.s | 2 +- llvm/test/MC/RISCV/rv64zalasr-valid.s | 4 ++-- llvm/test/MC/RISCV/rv64zalrsc-valid.s | 4 ++-- llvm/test/MC/RISCV/rv64zba-aliases-valid.s | 2 +- llvm/test/MC/RISCV/rv64zbb-aliases-valid.s | 2 +- llvm/test/MC/RISCV/rv64zbb-valid.s | 2 +- llvm/test/MC/RISCV/rv64zbs-aliases-valid.s | 2 +- llvm/test/MC/RISCV/rv64zcb-valid.s | 6 +++--- llvm/test/MC/RISCV/rv64zcmp-invalid.s | 2 +- llvm/test/MC/RISCV/rv64zcmp-valid.s | 2 +- llvm/test/MC/RISCV/rv64zdinx-valid.s | 2 +- llvm/test/MC/RISCV/rv64zfh-valid.s | 2 +- llvm/test/MC/RISCV/rv64zfinx-valid.s | 2 +- llvm/test/MC/RISCV/rv64zhinx-valid.s | 2 +- llvm/test/MC/RISCV/rv64zhinxmin-valid.s | 2 +- llvm/test/MC/RISCV/rv64zicfiss-invalid.s | 2 +- llvm/test/MC/RISCV/rv64zmmul-invalid.s | 2 +- llvm/test/MC/RISCV/rv64zmmul-valid.s | 2 +- llvm/test/MC/RISCV/rva-aliases-valid.s | 4 ++-- llvm/test/MC/RISCV/rvc-aliases-valid.s | 4 ++-- llvm/test/MC/RISCV/rvc-hints-valid.s | 4 ++-- llvm/test/MC/RISCV/rvc-valid.s | 10 +++++----- llvm/test/MC/RISCV/rvd-aliases-valid.s | 4 ++-- llvm/test/MC/RISCV/rvd-valid.s | 4 ++-- llvm/test/MC/RISCV/rvdc-aliases-valid.s | 4 ++-- llvm/test/MC/RISCV/rve-valid.s | 4 ++-- llvm/test/MC/RISCV/rvf-aliases-valid.s | 4 ++-- llvm/test/MC/RISCV/rvf-user-csr-names.s | 4 ++-- llvm/test/MC/RISCV/rvf-valid.s | 4 ++-- llvm/test/MC/RISCV/rvi-valid.s | 4 ++-- llvm/test/MC/RISCV/rvih-valid.s | 4 ++-- llvm/test/MC/RISCV/rvk-user-csr-name.s | 4 ++-- llvm/test/MC/RISCV/rvm-valid.s | 4 ++-- llvm/test/MC/RISCV/rvv-user-csr-names.s | 4 ++-- llvm/test/MC/RISCV/rvv/aliases.s | 2 +- llvm/test/MC/RISCV/rvv/fothers.s | 2 +- llvm/test/MC/RISCV/rvv/freduction.s | 2 +- llvm/test/MC/RISCV/rvv/load.s | 2 +- llvm/test/MC/RISCV/rvv/others.s | 2 +- llvm/test/MC/RISCV/rvv/store.s | 2 +- llvm/test/MC/RISCV/rvv/zvlsseg.s | 2 +- llvm/test/MC/RISCV/rvzaamo-valid.s | 8 ++++---- llvm/test/MC/RISCV/rvzabha-valid.s | 4 ++-- 
llvm/test/MC/RISCV/rvzabha-zacas-valid.s | 4 ++-- llvm/test/MC/RISCV/rvzacas-valid.s | 4 ++-- llvm/test/MC/RISCV/rvzalasr-valid.s | 8 ++++---- llvm/test/MC/RISCV/rvzalrsc-valid.s | 8 ++++---- llvm/test/MC/RISCV/rvzbb-valid.s | 4 ++-- llvm/test/MC/RISCV/rvzcb-invalid.s | 4 ++-- llvm/test/MC/RISCV/rvzcb-valid.s | 8 ++++---- llvm/test/MC/RISCV/rvzcmt-invalid.s | 4 ++-- llvm/test/MC/RISCV/rvzcmt-user-csr-name.s | 4 ++-- llvm/test/MC/RISCV/rvzcmt-valid.s | 8 ++++---- llvm/test/MC/RISCV/rvzdinx-aliases-valid.s | 4 ++-- llvm/test/MC/RISCV/rvzdinx-valid.s | 4 ++-- llvm/test/MC/RISCV/rvzfbfmin-valid.s | 4 ++-- llvm/test/MC/RISCV/rvzfh-aliases-valid.s | 4 ++-- llvm/test/MC/RISCV/rvzfh-valid.s | 4 ++-- llvm/test/MC/RISCV/rvzfhmin-valid.s | 4 ++-- llvm/test/MC/RISCV/rvzfinx-aliases-valid.s | 4 ++-- llvm/test/MC/RISCV/rvzfinx-valid.s | 4 ++-- llvm/test/MC/RISCV/rvzhinx-aliases-valid.s | 4 ++-- llvm/test/MC/RISCV/rvzhinx-valid.s | 4 ++-- llvm/test/MC/RISCV/rvzhinxmin-valid.s | 4 ++-- llvm/test/MC/RISCV/rvzihintntl-valid.s | 4 ++-- llvm/test/MC/RISCV/rvzihintpause-valid.s | 4 ++-- llvm/test/MC/RISCV/smctr-ssctr-valid.s | 12 ++++++------ llvm/test/MC/RISCV/smrnmi-valid.s | 4 ++-- llvm/test/MC/RISCV/supervisor-csr-names.s | 4 ++-- llvm/test/MC/RISCV/user-csr-names.s | 4 ++-- llvm/test/MC/RISCV/xqcia-valid.s | 2 +- llvm/test/MC/RISCV/xqcics-valid.s | 2 +- llvm/test/MC/RISCV/xqcicsr-valid.s | 2 +- llvm/test/MC/RISCV/xqcilsm-aliases-valid.s | 2 +- llvm/test/MC/RISCV/xqcilsm-valid.s | 2 +- llvm/test/MC/RISCV/xqcisls-valid.s | 2 +- llvm/test/MC/RISCV/xsifive-valid.s | 4 ++-- llvm/test/MC/RISCV/xwchc-compress.s | 2 +- llvm/test/MC/RISCV/xwchc-valid.s | 2 +- llvm/test/MC/RISCV/zfa-double-invalid.s | 4 ++-- llvm/test/MC/RISCV/zfa-half-invalid.s | 4 ++-- llvm/test/MC/RISCV/zfa-valid.s | 8 ++++---- llvm/test/MC/RISCV/zfa-zfhmin-zvfh-valid.s | 8 ++++---- llvm/test/MC/RISCV/zicfilp-invalid.s | 4 ++-- llvm/test/MC/RISCV/zicfilp-valid.s | 8 ++++---- llvm/test/MC/RISCV/zicfiss-valid.s | 8 
++++---- 156 files changed, 285 insertions(+), 285 deletions(-) diff --git a/llvm/test/MC/RISCV/XVentanaCondOps-valid.s b/llvm/test/MC/RISCV/XVentanaCondOps-valid.s index 8f4eba5445ac7..9825210f8495f 100644 --- a/llvm/test/MC/RISCV/XVentanaCondOps-valid.s +++ b/llvm/test/MC/RISCV/XVentanaCondOps-valid.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv64 -mattr=+xventanacondops -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+xventanacondops -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+xventanacondops < %s \ # RUN: | llvm-objdump --mattr=+xventanacondops -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/Zawrs-valid.s b/llvm/test/MC/RISCV/Zawrs-valid.s index 0bdc5708896be..29e95d36f4006 100644 --- a/llvm/test/MC/RISCV/Zawrs-valid.s +++ b/llvm/test/MC/RISCV/Zawrs-valid.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+zawrs -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zawrs -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+zawrs -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zawrs -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zawrs < %s \ # RUN: | llvm-objdump --mattr=+zawrs -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/Ztso.s b/llvm/test/MC/RISCV/Ztso.s index 06b1030fca7ef..af613551d0146 100644 --- a/llvm/test/MC/RISCV/Ztso.s +++ b/llvm/test/MC/RISCV/Ztso.s @@ -1,5 +1,5 @@ -# RUN: llvm-mc %s -triple=riscv64 -mattr=+ztso -riscv-no-aliases 2>&1 | FileCheck %s -# RUN: llvm-mc %s -triple=riscv32 -mattr=+ztso -riscv-no-aliases 2>&1 | FileCheck %s +# RUN: llvm-mc %s -triple=riscv64 -mattr=+ztso -M no-aliases 2>&1 | FileCheck %s +# RUN: llvm-mc %s -triple=riscv32 
-mattr=+ztso -M no-aliases 2>&1 | FileCheck %s # Note: Ztso doesn't add or remove any instructions, so this is basically # just checking that a) we accepted the attribute name, and b) codegen did diff --git a/llvm/test/MC/RISCV/compress-cjal.s b/llvm/test/MC/RISCV/compress-cjal.s index d55586b005c7b..cdb6e85facc01 100644 --- a/llvm/test/MC/RISCV/compress-cjal.s +++ b/llvm/test/MC/RISCV/compress-cjal.s @@ -1,7 +1,7 @@ # RUN: llvm-mc -triple riscv32 -mattr=+c -show-encoding < %s \ # RUN: | FileCheck -check-prefixes=CHECK,CHECK-ALIAS %s # RUN: llvm-mc -triple riscv32 -mattr=+c -show-encoding \ -# RUN: -riscv-no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s +# RUN: -M no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s # RUN: llvm-mc -triple riscv32 -mattr=+c -filetype=obj < %s \ # RUN: | llvm-objdump --triple=riscv32 --mattr=+c -d - \ # RUN: | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIASOBJ %s diff --git a/llvm/test/MC/RISCV/compress-debug-info.s b/llvm/test/MC/RISCV/compress-debug-info.s index 70aaefba3f820..134c3d0a876d1 100644 --- a/llvm/test/MC/RISCV/compress-debug-info.s +++ b/llvm/test/MC/RISCV/compress-debug-info.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc -triple riscv32 -mattr=+c %s -g -o - -riscv-no-aliases \ +# RUN: llvm-mc -triple riscv32 -mattr=+c %s -g -o - -M no-aliases \ # RUN: | FileCheck %s -check-prefixes=COMPRESS,BOTH -# RUN: llvm-mc -triple riscv32 %s -g -o - -riscv-no-aliases \ +# RUN: llvm-mc -triple riscv32 %s -g -o - -M no-aliases \ # RUN: | FileCheck %s -check-prefixes=UNCOMPRESS,BOTH diff --git a/llvm/test/MC/RISCV/compress-rv32d.s b/llvm/test/MC/RISCV/compress-rv32d.s index c41a088928624..2bfae212e1fd3 100644 --- a/llvm/test/MC/RISCV/compress-rv32d.s +++ b/llvm/test/MC/RISCV/compress-rv32d.s @@ -1,7 +1,7 @@ # RUN: llvm-mc -triple riscv32 -mattr=+c,+d -show-encoding < %s \ # RUN: | FileCheck -check-prefixes=CHECK,CHECK-ALIAS %s # RUN: llvm-mc -triple riscv32 -mattr=+c,+d -show-encoding \ -# RUN: -riscv-no-aliases < %s | 
FileCheck -check-prefixes=CHECK,CHECK-INST %s +# RUN: -M no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s # RUN: llvm-mc -triple riscv32 -mattr=+c,+d -filetype=obj < %s \ # RUN: | llvm-objdump --no-print-imm-hex --triple=riscv32 --mattr=+c,+d -d - \ # RUN: | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIAS %s @@ -11,7 +11,7 @@ # RUN: llvm-mc -triple riscv32 -mattr=+zcd,+d -show-encoding < %s \ # RUN: | FileCheck -check-prefixes=CHECK,CHECK-ALIAS %s # RUN: llvm-mc -triple riscv32 -mattr=+zcd,+d -show-encoding \ -# RUN: -riscv-no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s +# RUN: -M no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s # RUN: llvm-mc -triple riscv32 -mattr=+zcd,+d -filetype=obj < %s \ # RUN: | llvm-objdump --no-print-imm-hex --triple=riscv32 --mattr=+zcd,+d -d - \ # RUN: | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIAS %s @@ -22,7 +22,7 @@ # RUN: llvm-mc -triple riscv64 -mattr=+c,+d -show-encoding < %s \ # RUN: | FileCheck -check-prefixes=CHECK-ALIAS %s # RUN: llvm-mc -triple riscv64 -mattr=+c,+d -show-encoding \ -# RUN: -riscv-no-aliases < %s | FileCheck -check-prefixes=CHECK-INST %s +# RUN: -M no-aliases < %s | FileCheck -check-prefixes=CHECK-INST %s # RUN: llvm-mc -triple riscv64 -mattr=+c,+d -filetype=obj < %s \ # RUN: | llvm-objdump --no-print-imm-hex --triple=riscv64 --mattr=+c,+d -d - \ # RUN: | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIAS %s @@ -32,7 +32,7 @@ # RUN: llvm-mc -triple riscv64 -mattr=+zcd,+d -show-encoding < %s \ # RUN: | FileCheck -check-prefixes=CHECK-ALIAS %s # RUN: llvm-mc -triple riscv64 -mattr=+zcd,+d -show-encoding \ -# RUN: -riscv-no-aliases < %s | FileCheck -check-prefixes=CHECK-INST %s +# RUN: -M no-aliases < %s | FileCheck -check-prefixes=CHECK-INST %s # RUN: llvm-mc -triple riscv64 -mattr=+zcd,+d -filetype=obj < %s \ # RUN: | llvm-objdump --no-print-imm-hex --triple=riscv64 --mattr=+zcd,+d -d - \ # RUN: | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIAS %s diff 
--git a/llvm/test/MC/RISCV/compress-rv32f.s b/llvm/test/MC/RISCV/compress-rv32f.s index afe15c598bb65..5fc3f41fdf0a5 100644 --- a/llvm/test/MC/RISCV/compress-rv32f.s +++ b/llvm/test/MC/RISCV/compress-rv32f.s @@ -1,7 +1,7 @@ # RUN: llvm-mc -triple riscv32 -mattr=+c,+f -show-encoding < %s \ # RUN: | FileCheck -check-prefixes=CHECK,CHECK-ALIAS %s # RUN: llvm-mc -triple riscv32 -mattr=+c,+f -show-encoding \ -# RUN: -riscv-no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s +# RUN: -M no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s # RUN: llvm-mc -triple riscv32 -mattr=+c,+f -filetype=obj < %s \ # RUN: | llvm-objdump --triple=riscv32 --mattr=+c,+f --no-print-imm-hex -d - \ # RUN: | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIAS %s @@ -11,7 +11,7 @@ # RUN: llvm-mc -triple riscv32 -mattr=+zcf,+f -show-encoding < %s \ # RUN: | FileCheck -check-prefixes=CHECK,CHECK-ALIAS %s # RUN: llvm-mc -triple riscv32 -mattr=+zcf,+f -show-encoding \ -# RUN: -riscv-no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s +# RUN: -M no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s # RUN: llvm-mc -triple riscv32 -mattr=+zcf,+f -filetype=obj < %s \ # RUN: | llvm-objdump --triple=riscv32 --mattr=+zcf,+f --no-print-imm-hex -d - \ # RUN: | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIAS %s diff --git a/llvm/test/MC/RISCV/compress-rv32i.s b/llvm/test/MC/RISCV/compress-rv32i.s index a75bea32ac0cf..5a2812839cbe7 100644 --- a/llvm/test/MC/RISCV/compress-rv32i.s +++ b/llvm/test/MC/RISCV/compress-rv32i.s @@ -1,7 +1,7 @@ # RUN: llvm-mc -triple riscv32 -mattr=+c -show-encoding < %s \ # RUN: | FileCheck -check-prefixes=CHECK,CHECK-ALIAS,CHECK-ALIASASM %s # RUN: llvm-mc -triple riscv32 -mattr=+c -show-encoding \ -# RUN: -riscv-no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST,CHECK-INSTASM %s +# RUN: -M no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST,CHECK-INSTASM %s # RUN: llvm-mc -triple riscv32 -mattr=+c -filetype=obj < %s 
\ # RUN: | llvm-objdump --triple=riscv32 --mattr=+c --no-print-imm-hex -d - \ # RUN: | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIAS,CHECK-ALIASOBJ32 %s @@ -12,7 +12,7 @@ # RUN: llvm-mc -triple riscv64 -mattr=+c -show-encoding < %s \ # RUN: | FileCheck -check-prefixes=CHECK-ALIAS,CHECK-ALIASASM %s # RUN: llvm-mc -triple riscv64 -mattr=+c -show-encoding \ -# RUN: -riscv-no-aliases < %s | FileCheck -check-prefixes=CHECK-INST,CHECK-INSTASM %s +# RUN: -M no-aliases < %s | FileCheck -check-prefixes=CHECK-INST,CHECK-INSTASM %s # RUN: llvm-mc -triple riscv64 -mattr=+c -filetype=obj < %s \ # RUN: | llvm-objdump --triple=riscv64 --mattr=+c --no-print-imm-hex -d - \ # RUN: | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIAS,CHECK-ALIASOBJ64 %s diff --git a/llvm/test/MC/RISCV/compress-rv64i.s b/llvm/test/MC/RISCV/compress-rv64i.s index ab5b24307cd1a..31eb4d984036e 100644 --- a/llvm/test/MC/RISCV/compress-rv64i.s +++ b/llvm/test/MC/RISCV/compress-rv64i.s @@ -1,7 +1,7 @@ # RUN: llvm-mc -triple riscv64 -mattr=+c -show-encoding < %s \ # RUN: | FileCheck -check-prefixes=CHECK-ALIAS %s # RUN: llvm-mc -triple riscv64 -mattr=+c -show-encoding \ -# RUN: -riscv-no-aliases < %s | FileCheck -check-prefixes=CHECK-INST %s +# RUN: -M no-aliases < %s | FileCheck -check-prefixes=CHECK-INST %s # RUN: llvm-mc -triple riscv64 -mattr=+c -filetype=obj < %s \ # RUN: | llvm-objdump --triple=riscv64 --mattr=+c --no-print-imm-hex -d - \ # RUN: | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIAS %s diff --git a/llvm/test/MC/RISCV/compressed-relocations.s b/llvm/test/MC/RISCV/compressed-relocations.s index c7117ab702434..196d9873f40e6 100644 --- a/llvm/test/MC/RISCV/compressed-relocations.s +++ b/llvm/test/MC/RISCV/compressed-relocations.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc -triple riscv32 -mattr=+c -riscv-no-aliases < %s -show-encoding \ +# RUN: llvm-mc -triple riscv32 -mattr=+c -M no-aliases < %s -show-encoding \ # RUN: | FileCheck -check-prefix=INSTR -check-prefix=FIXUP %s # RUN: llvm-mc 
-filetype=obj -triple riscv32 -mattr=+c < %s \ # RUN: | llvm-readobj -r - | FileCheck -check-prefix=RELOC %s diff --git a/llvm/test/MC/RISCV/compressed-zicfiss.s b/llvm/test/MC/RISCV/compressed-zicfiss.s index 2ebf9d3af3be8..7d387b257b7b4 100644 --- a/llvm/test/MC/RISCV/compressed-zicfiss.s +++ b/llvm/test/MC/RISCV/compressed-zicfiss.s @@ -1,15 +1,15 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-zicfiss,+zcmop -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-zicfiss,+zcmop -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+experimental-zicfiss,+zcmop < %s \ # RUN: | llvm-objdump --mattr=+experimental-zicfiss,+zcmop -M no-aliases -d -r - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-zicfiss,+zcmop -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-zicfiss,+zcmop -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+experimental-zicfiss,+zcmop < %s \ # RUN: | llvm-objdump --mattr=+experimental-zicfiss,+zcmop -M no-aliases -d -r - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s # -# RUN: not llvm-mc -triple riscv32 -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: not llvm-mc -triple riscv32 -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-NO-EXT %s # CHECK-ASM-AND-OBJ: c.sspopchk t0 diff --git a/llvm/test/MC/RISCV/corev/XCValu-valid.s b/llvm/test/MC/RISCV/corev/XCValu-valid.s index 1c74e364a1254..2636b34d07735 100644 --- a/llvm/test/MC/RISCV/corev/XCValu-valid.s +++ b/llvm/test/MC/RISCV/corev/XCValu-valid.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc -triple=riscv32 --mattr=+xcvalu -riscv-no-aliases -show-encoding %s \ +# RUN: llvm-mc -triple=riscv32 --mattr=+xcvalu -M no-aliases 
-show-encoding %s \ # RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INSTR # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+xcvalu < %s \ # RUN: | llvm-objdump --mattr=+xcvalu --no-print-imm-hex -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/corev/XCVmac-valid.s b/llvm/test/MC/RISCV/corev/XCVmac-valid.s index 93ed9d7b1291e..a795bc37e53ce 100644 --- a/llvm/test/MC/RISCV/corev/XCVmac-valid.s +++ b/llvm/test/MC/RISCV/corev/XCVmac-valid.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc -triple=riscv32 --mattr=+xcvmac -riscv-no-aliases -show-encoding %s \ +# RUN: llvm-mc -triple=riscv32 --mattr=+xcvmac -M no-aliases -show-encoding %s \ # RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INSTR # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+xcvmac < %s \ # RUN: | llvm-objdump --mattr=+xcvmac --no-print-imm-hex -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/debug-valid.s b/llvm/test/MC/RISCV/debug-valid.s index 89b8f001b2cef..44dd47d25b4cc 100644 --- a/llvm/test/MC/RISCV/debug-valid.s +++ b/llvm/test/MC/RISCV/debug-valid.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK,CHECK-INST %s -# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK,CHECK-INST %s # RUN: llvm-mc -filetype=obj -triple riscv32 < %s \ # RUN: | llvm-objdump -M no-aliases -d - \ diff --git a/llvm/test/MC/RISCV/deprecated-csr-names.s b/llvm/test/MC/RISCV/deprecated-csr-names.s index e895732ae23f6..3bb104a1b0bcc 100644 --- a/llvm/test/MC/RISCV/deprecated-csr-names.s +++ b/llvm/test/MC/RISCV/deprecated-csr-names.s @@ -1,10 +1,10 @@ -# RUN: llvm-mc -triple riscv32 -riscv-no-aliases -show-encoding %s \ +# RUN: llvm-mc -triple riscv32 -M no-aliases -show-encoding %s \ # RUN: | FileCheck -check-prefixes 
CHECK-INST,CHECK-ENC %s # RUN: llvm-mc -filetype obj -triple riscv32 %s \ # RUN: | llvm-objdump -d - \ # RUN: | FileCheck -check-prefix=CHECK-INST-ALIAS %s -# RUN: llvm-mc -triple riscv64 -riscv-no-aliases -show-encoding %s \ +# RUN: llvm-mc -triple riscv64 -M no-aliases -show-encoding %s \ # RUN: | FileCheck -check-prefixes CHECK-INST,CHECK-ENC %s # RUN: llvm-mc -filetype obj -triple riscv64 %s \ # RUN: | llvm-objdump -d - \ diff --git a/llvm/test/MC/RISCV/fixups-binary-expression.s b/llvm/test/MC/RISCV/fixups-binary-expression.s index dc1de5d6b324a..325a54ceeda9e 100644 --- a/llvm/test/MC/RISCV/fixups-binary-expression.s +++ b/llvm/test/MC/RISCV/fixups-binary-expression.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc -triple riscv32 -mattr=+c -riscv-no-aliases < %s -show-encoding \ +# RUN: llvm-mc -triple riscv32 -mattr=+c -M no-aliases < %s -show-encoding \ # RUN: | FileCheck -check-prefix=CHECK-FIXUP %s # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+c < %s \ # RUN: | llvm-objdump -M no-aliases -d - \ diff --git a/llvm/test/MC/RISCV/fixups.s b/llvm/test/MC/RISCV/fixups.s index d0682ed5bbda5..5145dbe8db888 100644 --- a/llvm/test/MC/RISCV/fixups.s +++ b/llvm/test/MC/RISCV/fixups.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc -triple riscv32 -riscv-no-aliases < %s -show-encoding \ +# RUN: llvm-mc -triple riscv32 -M no-aliases < %s -show-encoding \ # RUN: | FileCheck -check-prefix=CHECK-FIXUP %s # RUN: llvm-mc -filetype=obj -triple riscv32 < %s \ # RUN: | llvm-objdump --no-print-imm-hex -M no-aliases -d - \ diff --git a/llvm/test/MC/RISCV/fp-default-rounding-mode.s b/llvm/test/MC/RISCV/fp-default-rounding-mode.s index c91892079161a..88b681ae9cea8 100644 --- a/llvm/test/MC/RISCV/fp-default-rounding-mode.s +++ b/llvm/test/MC/RISCV/fp-default-rounding-mode.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv64 -mattr=+d,+zfh,+zfbfmin -riscv-no-aliases \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+d,+zfh,+zfbfmin -M no-aliases \ # RUN: | FileCheck -check-prefixes=CHECK-INST %s # RUN: llvm-mc 
%s -triple=riscv64 -mattr=+d,+zfh,+zfbfmin \ # RUN: | FileCheck -check-prefixes=CHECK-ALIAS %s diff --git a/llvm/test/MC/RISCV/fp-inx-default-rounding-mode.s b/llvm/test/MC/RISCV/fp-inx-default-rounding-mode.s index d2764ce2ad217..2b628e1ac2a06 100644 --- a/llvm/test/MC/RISCV/fp-inx-default-rounding-mode.s +++ b/llvm/test/MC/RISCV/fp-inx-default-rounding-mode.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv64 -mattr=+zdinx,+zhinx -riscv-no-aliases \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zdinx,+zhinx -M no-aliases \ # RUN: | FileCheck -check-prefixes=CHECK-INST %s # RUN: llvm-mc %s -triple=riscv64 -mattr=+zdinx,+zhinx \ # RUN: | FileCheck -check-prefixes=CHECK-ALIAS %s diff --git a/llvm/test/MC/RISCV/hypervisor-csr-names.s b/llvm/test/MC/RISCV/hypervisor-csr-names.s index 2f29e5dacbeb9..a7e3a57f6584c 100644 --- a/llvm/test/MC/RISCV/hypervisor-csr-names.s +++ b/llvm/test/MC/RISCV/hypervisor-csr-names.s @@ -1,10 +1,10 @@ -# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s # RUN: llvm-mc -filetype=obj -triple riscv32 < %s \ # RUN: | llvm-objdump -d - \ # RUN: | FileCheck -check-prefix=CHECK-INST-ALIAS %s # -# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s # RUN: llvm-mc -filetype=obj -triple riscv64 < %s \ # RUN: | llvm-objdump -d - \ diff --git a/llvm/test/MC/RISCV/insn.s b/llvm/test/MC/RISCV/insn.s index 829364c632884..b1ef2511b96c6 100644 --- a/llvm/test/MC/RISCV/insn.s +++ b/llvm/test/MC/RISCV/insn.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+f -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+f -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM %s -# RUN: llvm-mc %s -triple riscv64 -mattr=+f 
-riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple riscv64 -mattr=+f -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+f < %s \ # RUN: | llvm-objdump --mattr=+f -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/insn_c.s b/llvm/test/MC/RISCV/insn_c.s index c63e8ab33aef9..c52f9b66a96ec 100644 --- a/llvm/test/MC/RISCV/insn_c.s +++ b/llvm/test/MC/RISCV/insn_c.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+f,+c -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+f,+c -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefix=CHECK-ASM %s -# RUN: llvm-mc %s -triple riscv64 -mattr=+f,+c -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple riscv64 -mattr=+f,+c -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefix=CHECK-ASM %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+f,+c < %s \ # RUN: | llvm-objdump --mattr=+f,+c -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/machine-csr-names.s b/llvm/test/MC/RISCV/machine-csr-names.s index 8cfdf7ee116ce..07b948a78e6c2 100644 --- a/llvm/test/MC/RISCV/machine-csr-names.s +++ b/llvm/test/MC/RISCV/machine-csr-names.s @@ -1,10 +1,10 @@ -# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s # RUN: llvm-mc -filetype=obj -triple riscv32 < %s \ # RUN: | llvm-objdump -d - \ # RUN: | FileCheck -check-prefix=CHECK-INST-ALIAS %s # -# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s # RUN: llvm-mc -filetype=obj -triple riscv64 < %s \ # RUN: | llvm-objdump -d - \ diff --git a/llvm/test/MC/RISCV/option-nopic.s b/llvm/test/MC/RISCV/option-nopic.s index db0cf1db8e115..20203f65e7d5d 100644 --- 
a/llvm/test/MC/RISCV/option-nopic.s +++ b/llvm/test/MC/RISCV/option-nopic.s @@ -1,19 +1,19 @@ -# RUN: llvm-mc -triple riscv32 -mattr=-relax -riscv-no-aliases < %s \ +# RUN: llvm-mc -triple riscv32 -mattr=-relax -M no-aliases < %s \ # RUN: | FileCheck -check-prefix=CHECK-INST %s # RUN: llvm-mc -filetype=obj -triple riscv32 < %s \ # RUN: | llvm-readobj -r - | FileCheck -check-prefix=CHECK-RELOC %s -# RUN: llvm-mc -triple riscv32 -mattr=-relax -riscv-no-aliases \ +# RUN: llvm-mc -triple riscv32 -mattr=-relax -M no-aliases \ # RUN: -position-independent < %s | FileCheck -check-prefix=CHECK-INST %s # RUN: llvm-mc -filetype=obj -triple riscv32 -position-independent < %s \ # RUN: | llvm-readobj -r - | FileCheck -check-prefix=CHECK-RELOC %s -# RUN: llvm-mc -triple riscv64 -mattr=-relax -riscv-no-aliases < %s \ +# RUN: llvm-mc -triple riscv64 -mattr=-relax -M no-aliases < %s \ # RUN: | FileCheck -check-prefix=CHECK-INST %s # RUN: llvm-mc -filetype=obj -triple riscv64 < %s \ # RUN: | llvm-readobj -r - | FileCheck -check-prefix=CHECK-RELOC %s -# RUN: llvm-mc -triple riscv64 -mattr=-relax -riscv-no-aliases \ +# RUN: llvm-mc -triple riscv64 -mattr=-relax -M no-aliases \ # RUN: -position-independent < %s | FileCheck -check-prefix=CHECK-INST %s # RUN: llvm-mc -filetype=obj -triple riscv64 -position-independent < %s \ # RUN: | llvm-readobj -r - | FileCheck -check-prefix=CHECK-RELOC %s diff --git a/llvm/test/MC/RISCV/option-pic.s b/llvm/test/MC/RISCV/option-pic.s index ef456e01c4ba8..9c9381cb948d0 100644 --- a/llvm/test/MC/RISCV/option-pic.s +++ b/llvm/test/MC/RISCV/option-pic.s @@ -1,19 +1,19 @@ -# RUN: llvm-mc -triple riscv32 -mattr=-relax -riscv-no-aliases < %s \ +# RUN: llvm-mc -triple riscv32 -mattr=-relax -M no-aliases < %s \ # RUN: | FileCheck -check-prefix=CHECK-INST %s # RUN: llvm-mc -filetype=obj -triple riscv32 < %s \ # RUN: | llvm-readobj -r - | FileCheck -check-prefix=CHECK-RELOC %s -# RUN: llvm-mc -triple riscv32 -mattr=-relax -riscv-no-aliases \ +# RUN: llvm-mc 
-triple riscv32 -mattr=-relax -M no-aliases \ # RUN: -position-independent < %s | FileCheck -check-prefix=CHECK-INST %s # RUN: llvm-mc -filetype=obj -triple riscv32 -position-independent < %s \ # RUN: | llvm-readobj -r - | FileCheck -check-prefix=CHECK-RELOC %s -# RUN: llvm-mc -triple riscv64 -mattr=-relax -riscv-no-aliases < %s \ +# RUN: llvm-mc -triple riscv64 -mattr=-relax -M no-aliases < %s \ # RUN: | FileCheck -check-prefix=CHECK-INST %s # RUN: llvm-mc -filetype=obj -triple riscv64 < %s \ # RUN: | llvm-readobj -r - | FileCheck -check-prefix=CHECK-RELOC %s -# RUN: llvm-mc -triple riscv64 -mattr=-relax -riscv-no-aliases \ +# RUN: llvm-mc -triple riscv64 -mattr=-relax -M no-aliases \ # RUN: -position-independent < %s | FileCheck -check-prefix=CHECK-INST %s # RUN: llvm-mc -filetype=obj -triple riscv64 -position-independent < %s \ # RUN: | llvm-readobj -r - | FileCheck -check-prefix=CHECK-RELOC %s diff --git a/llvm/test/MC/RISCV/option-pushpop.s b/llvm/test/MC/RISCV/option-pushpop.s index 9c61b5dab5f3b..68d60be9f888d 100644 --- a/llvm/test/MC/RISCV/option-pushpop.s +++ b/llvm/test/MC/RISCV/option-pushpop.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc -triple riscv32 -mattr=-relax -riscv-no-aliases < %s \ +# RUN: llvm-mc -triple riscv32 -mattr=-relax -M no-aliases < %s \ # RUN: | FileCheck -check-prefix=CHECK-INST %s # RUN: llvm-mc -filetype=obj -triple riscv32 < %s \ # RUN: | llvm-readobj -r - | FileCheck -check-prefix=CHECK-RELOC %s @@ -6,7 +6,7 @@ # RUN: | llvm-objdump --no-print-imm-hex --triple=riscv32 --mattr=+c -d - \ # RUN: | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIAS %s -# RUN: llvm-mc -triple riscv64 -mattr=-relax -riscv-no-aliases < %s \ +# RUN: llvm-mc -triple riscv64 -mattr=-relax -M no-aliases < %s \ # RUN: | FileCheck -check-prefix=CHECK-INST %s # RUN: llvm-mc -filetype=obj -triple riscv64 < %s \ # RUN: | llvm-readobj -r - | FileCheck -check-prefix=CHECK-RELOC %s diff --git a/llvm/test/MC/RISCV/option-rvc.s b/llvm/test/MC/RISCV/option-rvc.s index 
894fbab562d70..32568b9fc65b8 100644 --- a/llvm/test/MC/RISCV/option-rvc.s +++ b/llvm/test/MC/RISCV/option-rvc.s @@ -1,7 +1,7 @@ # RUN: llvm-mc -triple riscv32 -show-encoding < %s \ # RUN: | FileCheck -check-prefixes=CHECK,CHECK-ALIAS %s # RUN: llvm-mc -triple riscv32 -show-encoding \ -# RUN: -riscv-no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s +# RUN: -M no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s # RUN: llvm-mc -triple riscv32 -filetype=obj < %s \ # RUN: | llvm-objdump --triple=riscv32 --mattr=+c --no-print-imm-hex -d - \ # RUN: | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIAS %s @@ -12,7 +12,7 @@ # RUN: llvm-mc -triple riscv64 -show-encoding < %s \ # RUN: | FileCheck -check-prefixes=CHECK-ALIAS %s # RUN: llvm-mc -triple riscv64 -show-encoding \ -# RUN: -riscv-no-aliases < %s | FileCheck -check-prefixes=CHECK-INST %s +# RUN: -M no-aliases < %s | FileCheck -check-prefixes=CHECK-INST %s # RUN: llvm-mc -triple riscv64 -filetype=obj < %s \ # RUN: | llvm-objdump --triple=riscv64 --mattr=+c --no-print-imm-hex -d - \ # RUN: | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIAS %s diff --git a/llvm/test/MC/RISCV/print-imm-hex.s b/llvm/test/MC/RISCV/print-imm-hex.s index 04d405a0f3cfa..10270de8b8f24 100644 --- a/llvm/test/MC/RISCV/print-imm-hex.s +++ b/llvm/test/MC/RISCV/print-imm-hex.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding -mattr=+v \ +# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding -mattr=+v \ # RUN: | FileCheck -check-prefixes=CHECK-ASM %s -# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding -mattr=+v --print-imm-hex \ +# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding -mattr=+v --print-imm-hex \ # RUN: | FileCheck -check-prefixes=CHECK-ASM-HEX %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+v < %s \ # RUN: | llvm-objdump -M no-aliases --mattr=+v --no-print-imm-hex -d -r - \ diff --git a/llvm/test/MC/RISCV/priv-valid.s 
b/llvm/test/MC/RISCV/priv-valid.s index 561c76bf4fa29..67dc50111aae9 100644 --- a/llvm/test/MC/RISCV/priv-valid.s +++ b/llvm/test/MC/RISCV/priv-valid.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+svinval -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+svinval -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK,CHECK-INST %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+svinval -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+svinval -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK,CHECK-INST %s # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+svinval < %s \ # RUN: | llvm-objdump --mattr=+svinval -M no-aliases -d - \ diff --git a/llvm/test/MC/RISCV/relocations.s b/llvm/test/MC/RISCV/relocations.s index f5f6417487f22..85a25fee118ed 100644 --- a/llvm/test/MC/RISCV/relocations.s +++ b/llvm/test/MC/RISCV/relocations.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc -triple riscv32 -riscv-no-aliases < %s -show-encoding \ +# RUN: llvm-mc -triple riscv32 -M no-aliases < %s -show-encoding \ # RUN: | FileCheck -check-prefix=INSTR -check-prefix=FIXUP %s # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+c < %s \ # RUN: | llvm-readobj -r - | FileCheck -check-prefix=RELOC %s diff --git a/llvm/test/MC/RISCV/rv32-hypervisor-csr-names.s b/llvm/test/MC/RISCV/rv32-hypervisor-csr-names.s index 83e6d21d3067c..aadee4fb4f3ad 100644 --- a/llvm/test/MC/RISCV/rv32-hypervisor-csr-names.s +++ b/llvm/test/MC/RISCV/rv32-hypervisor-csr-names.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s # RUN: llvm-mc -filetype=obj -triple riscv32 < %s \ # RUN: | llvm-objdump -d - \ diff --git a/llvm/test/MC/RISCV/rv32-machine-csr-names.s b/llvm/test/MC/RISCV/rv32-machine-csr-names.s index e7a6d9ce718f2..3d527e382376e 100644 --- 
a/llvm/test/MC/RISCV/rv32-machine-csr-names.s +++ b/llvm/test/MC/RISCV/rv32-machine-csr-names.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s # RUN: llvm-mc -filetype=obj -triple riscv32 < %s \ # RUN: | llvm-objdump -d - \ diff --git a/llvm/test/MC/RISCV/rv32-supervisor-csr-names.s b/llvm/test/MC/RISCV/rv32-supervisor-csr-names.s index 4c1fef446a3d8..ca7887ac1c10a 100644 --- a/llvm/test/MC/RISCV/rv32-supervisor-csr-names.s +++ b/llvm/test/MC/RISCV/rv32-supervisor-csr-names.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s # RUN: llvm-mc -filetype=obj -triple riscv32 < %s \ # RUN: | llvm-objdump -d - \ diff --git a/llvm/test/MC/RISCV/rv32-user-csr-names.s b/llvm/test/MC/RISCV/rv32-user-csr-names.s index acd66467bfea7..6fb9861034738 100644 --- a/llvm/test/MC/RISCV/rv32-user-csr-names.s +++ b/llvm/test/MC/RISCV/rv32-user-csr-names.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s # RUN: llvm-mc -filetype=obj -triple riscv32 < %s \ # RUN: | llvm-objdump -d - \ diff --git a/llvm/test/MC/RISCV/rv32c-aliases-valid.s b/llvm/test/MC/RISCV/rv32c-aliases-valid.s index f159adbb9bd78..de9d0c60a5720 100644 --- a/llvm/test/MC/RISCV/rv32c-aliases-valid.s +++ b/llvm/test/MC/RISCV/rv32c-aliases-valid.s @@ -1,9 +1,9 @@ -# RUN: llvm-mc -triple=riscv32 -mattr=+c -riscv-no-aliases < %s \ +# RUN: llvm-mc -triple=riscv32 -mattr=+c -M no-aliases < %s \ # RUN: | FileCheck -check-prefixes=CHECK-EXPAND,CHECK-INST %s # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+c < %s \ # RUN: | llvm-objdump 
--no-print-imm-hex -d -M no-aliases - \ # RUN: | FileCheck -check-prefixes=CHECK-EXPAND,CHECK-INST %s -# RUN: llvm-mc -triple=riscv32 -mattr=+zca -riscv-no-aliases < %s \ +# RUN: llvm-mc -triple=riscv32 -mattr=+zca -M no-aliases < %s \ # RUN: | FileCheck -check-prefixes=CHECK-EXPAND,CHECK-INST %s # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+zca < %s \ # RUN: | llvm-objdump --no-print-imm-hex --mattr=+zca -d -M no-aliases - \ diff --git a/llvm/test/MC/RISCV/rv32c-only-valid.s b/llvm/test/MC/RISCV/rv32c-only-valid.s index 3321aff115c4d..c4fec69bd7012 100644 --- a/llvm/test/MC/RISCV/rv32c-only-valid.s +++ b/llvm/test/MC/RISCV/rv32c-only-valid.s @@ -1,17 +1,17 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+c -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+c -M no-aliases -show-encoding \ # RUN: | FileCheck --check-prefix=CHECK-ASM %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+c < %s \ # RUN: | llvm-objdump --mattr=+c --no-print-imm-hex -M no-aliases -d -r - \ # RUN: | FileCheck --check-prefix=CHECK-OBJ %s # # RUN: not llvm-mc -triple riscv32 \ -# RUN: -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck --check-prefix=CHECK-NO-EXT %s # RUN: not llvm-mc -triple riscv64 -mattr=+c \ -# RUN: -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck --check-prefix=CHECK-NO-RV32 %s # RUN: not llvm-mc -triple riscv64 \ -# RUN: -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck --check-prefix=CHECK-NO-RV32-AND-EXT %s # CHECK-OBJ: c.jal 0x7fe diff --git a/llvm/test/MC/RISCV/rv32dc-valid.s b/llvm/test/MC/RISCV/rv32dc-valid.s index 201aee545d4a4..495c88448a183 100644 --- a/llvm/test/MC/RISCV/rv32dc-valid.s +++ b/llvm/test/MC/RISCV/rv32dc-valid.s @@ -1,18 +1,18 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+c,+d -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc 
%s -triple=riscv32 -mattr=+c,+d -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+c,+d < %s \ # RUN: | llvm-objdump --no-print-imm-hex --mattr=+c,+d -M no-aliases -d -r - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv32 -mattr=+zcd,+d -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zcd,+d -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zcd,+d < %s \ # RUN: | llvm-objdump --no-print-imm-hex --mattr=+zcd,+d -M no-aliases -d -r - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s # # RUN: not llvm-mc -triple riscv32 -mattr=+c \ -# RUN: -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-NO-EXT-D %s -# RUN: not llvm-mc -triple riscv32 -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: not llvm-mc -triple riscv32 -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-NO-EXT-DC %s # CHECK-ASM-AND-OBJ: c.fldsp fs0, 504(sp) diff --git a/llvm/test/MC/RISCV/rv32fc-aliases-valid.s b/llvm/test/MC/RISCV/rv32fc-aliases-valid.s index d992d07ec677e..f1a1c739677fc 100644 --- a/llvm/test/MC/RISCV/rv32fc-aliases-valid.s +++ b/llvm/test/MC/RISCV/rv32fc-aliases-valid.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+c,+f -riscv-no-aliases \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+c,+f -M no-aliases \ # RUN: | FileCheck -check-prefixes=CHECK-EXPAND %s # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+c,+f < %s \ # RUN: | llvm-objdump --mattr=+c,+f --no-print-imm-hex -M no-aliases -d - \ diff --git a/llvm/test/MC/RISCV/rv32fc-valid.s b/llvm/test/MC/RISCV/rv32fc-valid.s index 936032594457f..af38a636cf804 100644 --- a/llvm/test/MC/RISCV/rv32fc-valid.s +++ 
b/llvm/test/MC/RISCV/rv32fc-valid.s @@ -1,22 +1,22 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+c,+f -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+c,+f -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+c,+f < %s \ # RUN: | llvm-objdump --mattr=+c,+f --no-print-imm-hex -M no-aliases -d -r - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv32 -mattr=+zcf,+f -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zcf,+f -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zcf,+f < %s \ # RUN: | llvm-objdump --mattr=+zcf,+f --no-print-imm-hex -M no-aliases -d -r - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s # # RUN: not llvm-mc -triple riscv32 -mattr=+c \ -# RUN: -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-NO-EXT-F %s # RUN: not llvm-mc -triple riscv32 \ -# RUN: -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-NO-EXT-FC %s # RUN: not llvm-mc -triple riscv64 -mattr=+c,+f \ -# RUN: -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-NO-RV32 %s # FIXME: error messages for rv64fc are misleading diff --git a/llvm/test/MC/RISCV/rv32i-aliases-invalid.s b/llvm/test/MC/RISCV/rv32i-aliases-invalid.s index 9254452e3df42..7f54fe720ea48 100644 --- a/llvm/test/MC/RISCV/rv32i-aliases-invalid.s +++ b/llvm/test/MC/RISCV/rv32i-aliases-invalid.s @@ -1,5 +1,5 @@ # UNSUPPORTED: target={{.*-windows.*}} -# RUN: not llvm-mc -triple=riscv32 -riscv-no-aliases < %s -o /dev/null 2>&1 | FileCheck %s +# RUN: not llvm-mc -triple=riscv32 -M 
no-aliases < %s -o /dev/null 2>&1 | FileCheck %s # RUN: not llvm-mc -triple=riscv32 < %s -o /dev/null 2>&1 | FileCheck %s # TODO ld diff --git a/llvm/test/MC/RISCV/rv32i-aliases-valid.s b/llvm/test/MC/RISCV/rv32i-aliases-valid.s index 93d8cb2eb0e4f..20deda4ec5eaf 100644 --- a/llvm/test/MC/RISCV/rv32i-aliases-valid.s +++ b/llvm/test/MC/RISCV/rv32i-aliases-valid.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases \ +# RUN: llvm-mc %s -triple=riscv32 -M no-aliases \ # RUN: | FileCheck -check-prefixes=CHECK-EXPAND,CHECK-INST,CHECK-ASM-NOALIAS %s # RUN: llvm-mc %s -triple=riscv32 \ # RUN: | FileCheck -check-prefixes=CHECK-EXPAND,CHECK-ALIAS,CHECK-ASM %s diff --git a/llvm/test/MC/RISCV/rv32i-only-valid.s b/llvm/test/MC/RISCV/rv32i-only-valid.s index 74232e3c419f1..afe62ce9d2241 100644 --- a/llvm/test/MC/RISCV/rv32i-only-valid.s +++ b/llvm/test/MC/RISCV/rv32i-only-valid.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 < %s \ # RUN: | llvm-objdump -M no-aliases --no-print-imm-hex -d -r - \ diff --git a/llvm/test/MC/RISCV/rv32zbb-aliases-valid.s b/llvm/test/MC/RISCV/rv32zbb-aliases-valid.s index 26a725a991b27..994b46e2e1680 100644 --- a/llvm/test/MC/RISCV/rv32zbb-aliases-valid.s +++ b/llvm/test/MC/RISCV/rv32zbb-aliases-valid.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+zbb -riscv-no-aliases \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zbb -M no-aliases \ # RUN: | FileCheck -check-prefixes=CHECK-S-OBJ-NOALIAS %s # RUN: llvm-mc %s -triple=riscv32 -mattr=+zbb \ # RUN: | FileCheck -check-prefixes=CHECK-S-OBJ %s diff --git a/llvm/test/MC/RISCV/rv32zbb-only-valid.s b/llvm/test/MC/RISCV/rv32zbb-only-valid.s index 8cee959ed4275..0e18cf20e1371 100644 --- a/llvm/test/MC/RISCV/rv32zbb-only-valid.s +++ 
b/llvm/test/MC/RISCV/rv32zbb-only-valid.s @@ -1,5 +1,5 @@ # With Bitmanip base extension: -# RUN: llvm-mc %s -triple=riscv32 -mattr=+zbb -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zbb -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zbb < %s \ # RUN: | llvm-objdump --mattr=+zbb --no-print-imm-hex -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/rv32zbs-aliases-valid.s b/llvm/test/MC/RISCV/rv32zbs-aliases-valid.s index 3ebb297f0cde2..60599c5d7dd26 100644 --- a/llvm/test/MC/RISCV/rv32zbs-aliases-valid.s +++ b/llvm/test/MC/RISCV/rv32zbs-aliases-valid.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+zbs -riscv-no-aliases \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zbs -M no-aliases \ # RUN: | FileCheck -check-prefixes=CHECK-S-OBJ-NOALIAS %s # RUN: llvm-mc %s -triple=riscv32 -mattr=+zbs \ # RUN: | FileCheck -check-prefixes=CHECK-S-OBJ %s diff --git a/llvm/test/MC/RISCV/rv32zcmp-invalid.s b/llvm/test/MC/RISCV/rv32zcmp-invalid.s index 2ed82bc55be3f..0720a74a9b5c2 100644 --- a/llvm/test/MC/RISCV/rv32zcmp-invalid.s +++ b/llvm/test/MC/RISCV/rv32zcmp-invalid.s @@ -1,4 +1,4 @@ -# RUN: not llvm-mc -triple=riscv32 -mattr=zcmp -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: not llvm-mc -triple=riscv32 -mattr=zcmp -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-ERROR %s # CHECK-ERROR: error: invalid operand for instruction diff --git a/llvm/test/MC/RISCV/rv32zcmp-valid.s b/llvm/test/MC/RISCV/rv32zcmp-valid.s index 31e287b07e705..d144c6f4a3e6d 100644 --- a/llvm/test/MC/RISCV/rv32zcmp-valid.s +++ b/llvm/test/MC/RISCV/rv32zcmp-valid.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=zcmp -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=zcmp -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc 
-filetype=obj -triple=riscv32 -mattr=zcmp < %s \ # RUN: | llvm-objdump --mattr=-c,zcmp -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/rv32zfa-only-valid.s b/llvm/test/MC/RISCV/rv32zfa-only-valid.s index d212659d5208d..a780a9f9ce910 100644 --- a/llvm/test/MC/RISCV/rv32zfa-only-valid.s +++ b/llvm/test/MC/RISCV/rv32zfa-only-valid.s @@ -1,11 +1,11 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfa,+d,+zfh -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfa,+d,+zfh -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zfa,+d,+zfh < %s \ # RUN: | llvm-objdump --mattr=+zfa,+d,+zfh -M no-aliases -d -r - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s # # RUN: not llvm-mc -triple riscv32 -mattr=+d,+zfh \ -# RUN: -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-NO-EXT %s # CHECK-ASM-AND-OBJ: fmvh.x.d a1, fs1 diff --git a/llvm/test/MC/RISCV/rv32zicfiss-invalid.s b/llvm/test/MC/RISCV/rv32zicfiss-invalid.s index 1cedcb97e2e7f..048df67e8a646 100644 --- a/llvm/test/MC/RISCV/rv32zicfiss-invalid.s +++ b/llvm/test/MC/RISCV/rv32zicfiss-invalid.s @@ -1,4 +1,4 @@ -# RUN: not llvm-mc %s -triple=riscv32 -mattr=+experimental-zicfiss,+c -riscv-no-aliases -show-encoding \ +# RUN: not llvm-mc %s -triple=riscv32 -mattr=+experimental-zicfiss,+c -M no-aliases -show-encoding \ # RUN: 2>&1 | FileCheck -check-prefixes=CHECK-ERR %s # CHECK-ERR: error: invalid operand for instruction diff --git a/llvm/test/MC/RISCV/rv32zmmul-invaild.s b/llvm/test/MC/RISCV/rv32zmmul-invaild.s index cf4ced8bfe4ee..b5f1ee64016b8 100644 --- a/llvm/test/MC/RISCV/rv32zmmul-invaild.s +++ b/llvm/test/MC/RISCV/rv32zmmul-invaild.s @@ -1,4 +1,4 @@ -# RUN: not llvm-mc %s -triple=riscv32 -mattr=+zmmul -riscv-no-aliases 2>&1 \ +# RUN: not llvm-mc %s -triple=riscv32 -mattr=+zmmul -M no-aliases 2>&1 \ # 
RUN: | FileCheck -check-prefixes=CHECK-ERROR %s # CHECK-ERROR: 5:1: error: instruction requires the following: 'M' (Integer Multiplication and Division){{$}} diff --git a/llvm/test/MC/RISCV/rv32zmmul-valid.s b/llvm/test/MC/RISCV/rv32zmmul-valid.s index 929dc52e1f5ea..b226685cd702f 100644 --- a/llvm/test/MC/RISCV/rv32zmmul-valid.s +++ b/llvm/test/MC/RISCV/rv32zmmul-valid.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+zmmul -riscv-no-aliases 2>&1 \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zmmul -M no-aliases 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-INST %s # CHECK-INST: mul a4, ra, s0 diff --git a/llvm/test/MC/RISCV/rv64-machine-csr-names.s b/llvm/test/MC/RISCV/rv64-machine-csr-names.s index 3efebf951bcd2..b49eb17f45db5 100644 --- a/llvm/test/MC/RISCV/rv64-machine-csr-names.s +++ b/llvm/test/MC/RISCV/rv64-machine-csr-names.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s # RUN: llvm-mc -filetype=obj -triple riscv64 < %s \ # RUN: | llvm-objdump --no-print-imm-hex -d - \ diff --git a/llvm/test/MC/RISCV/rv64-user-csr-names.s b/llvm/test/MC/RISCV/rv64-user-csr-names.s index fc23b697f9f78..afb7235659b48 100644 --- a/llvm/test/MC/RISCV/rv64-user-csr-names.s +++ b/llvm/test/MC/RISCV/rv64-user-csr-names.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s # RUN: llvm-mc -filetype=obj -triple riscv64 < %s \ # RUN: | llvm-objdump -d - \ diff --git a/llvm/test/MC/RISCV/rv64a-aliases-valid.s b/llvm/test/MC/RISCV/rv64a-aliases-valid.s index 09999536af5a3..577010f97c2e2 100644 --- a/llvm/test/MC/RISCV/rv64a-aliases-valid.s +++ b/llvm/test/MC/RISCV/rv64a-aliases-valid.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv64 -mattr=+a 
-riscv-no-aliases \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+a -M no-aliases \ # RUN: | FileCheck -check-prefix=CHECK-INST %s # RUN: llvm-mc %s -triple=riscv64 -mattr=+a \ # RUN: | FileCheck -check-prefix=CHECK-ALIAS %s diff --git a/llvm/test/MC/RISCV/rv64c-aliases-valid.s b/llvm/test/MC/RISCV/rv64c-aliases-valid.s index ccf9e6a4fc2dd..685669737d9bb 100644 --- a/llvm/test/MC/RISCV/rv64c-aliases-valid.s +++ b/llvm/test/MC/RISCV/rv64c-aliases-valid.s @@ -1,9 +1,9 @@ -# RUN: llvm-mc -triple=riscv64 -mattr=+c -riscv-no-aliases < %s \ +# RUN: llvm-mc -triple=riscv64 -mattr=+c -M no-aliases < %s \ # RUN: | FileCheck -check-prefixes=CHECK-EXPAND,CHECK-INST %s # RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+c < %s \ # RUN: | llvm-objdump --no-print-imm-hex -d -M no-aliases - \ # RUN: | FileCheck -check-prefixes=CHECK-EXPAND,CHECK-INST %s -# RUN: llvm-mc -triple=riscv64 -mattr=+zca -riscv-no-aliases < %s \ +# RUN: llvm-mc -triple=riscv64 -mattr=+zca -M no-aliases < %s \ # RUN: | FileCheck -check-prefixes=CHECK-EXPAND,CHECK-INST %s # RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+zca < %s \ # RUN: | llvm-objdump --no-print-imm-hex --mattr=+zca -d -M no-aliases - \ diff --git a/llvm/test/MC/RISCV/rv64c-hints-valid.s b/llvm/test/MC/RISCV/rv64c-hints-valid.s index 92cbe542e5545..95d093200f9a3 100644 --- a/llvm/test/MC/RISCV/rv64c-hints-valid.s +++ b/llvm/test/MC/RISCV/rv64c-hints-valid.s @@ -1,9 +1,9 @@ -# RUN: llvm-mc %s -triple riscv64 -mattr=+c -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple riscv64 -mattr=+c -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+c < %s \ # RUN: | llvm-objdump --no-print-imm-hex -M no-aliases -d -r - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple riscv64 -mattr=+zca -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple riscv64 -mattr=+zca -M no-aliases -show-encoding \ # RUN: | 
FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zca < %s \ # RUN: | llvm-objdump --no-print-imm-hex --mattr=+zca -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/rv64c-valid.s b/llvm/test/MC/RISCV/rv64c-valid.s index a9f0cf57c1863..f8736e5d5453b 100644 --- a/llvm/test/MC/RISCV/rv64c-valid.s +++ b/llvm/test/MC/RISCV/rv64c-valid.s @@ -1,9 +1,9 @@ -# RUN: llvm-mc %s -triple=riscv64 -mattr=+c -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+c -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+c < %s \ # RUN: | llvm-objdump --mattr=+c --no-print-imm-hex -M no-aliases -d -r - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+zca -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zca -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+c < %s \ # RUN: | llvm-objdump --mattr=+c --no-print-imm-hex -M no-aliases -d -r - \ @@ -11,10 +11,10 @@ # # # RUN: not llvm-mc -triple riscv64 \ -# RUN: -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-NO-EXT %s # RUN: not llvm-mc -triple riscv32 -mattr=+c \ -# RUN: -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-NO-RV64 %s # TODO: more exhaustive testing of immediate encoding. 
diff --git a/llvm/test/MC/RISCV/rv64d-aliases-valid.s b/llvm/test/MC/RISCV/rv64d-aliases-valid.s index 17a44b4f537eb..c23aa84ddea4f 100644 --- a/llvm/test/MC/RISCV/rv64d-aliases-valid.s +++ b/llvm/test/MC/RISCV/rv64d-aliases-valid.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv64 -mattr=+d -riscv-no-aliases \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+d -M no-aliases \ # RUN: | FileCheck -check-prefix=CHECK-INST %s # RUN: llvm-mc %s -triple=riscv64 -mattr=+d \ # RUN: | FileCheck -check-prefix=CHECK-ALIAS %s diff --git a/llvm/test/MC/RISCV/rv64d-valid.s b/llvm/test/MC/RISCV/rv64d-valid.s index e6cc8ec50b27a..8a9f608b664d2 100644 --- a/llvm/test/MC/RISCV/rv64d-valid.s +++ b/llvm/test/MC/RISCV/rv64d-valid.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv64 -mattr=+d -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+d -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+d < %s \ # RUN: | llvm-objdump --mattr=+d -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/rv64dc-valid.s b/llvm/test/MC/RISCV/rv64dc-valid.s index 83225b2c68562..7f2b8c46b7bb6 100644 --- a/llvm/test/MC/RISCV/rv64dc-valid.s +++ b/llvm/test/MC/RISCV/rv64dc-valid.s @@ -1,18 +1,18 @@ -# RUN: llvm-mc %s -triple=riscv64 -mattr=+c,+d -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+c,+d -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+c,+d < %s \ # RUN: | llvm-objdump --no-print-imm-hex --mattr=+c,+d -M no-aliases -d -r - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+zcd,+d -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zcd,+d -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj 
-triple=riscv64 -mattr=+zcd,+d < %s \ # RUN: | llvm-objdump --no-print-imm-hex --mattr=+zcd,+d -M no-aliases -d -r - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s # # RUN: not llvm-mc -triple riscv64 -mattr=+c \ -# RUN: -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-NO-EXT-D %s -# RUN: not llvm-mc -triple riscv64 -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: not llvm-mc -triple riscv64 -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-NO-EXT-DC %s # CHECK-ASM-AND-OBJ: c.fldsp fs0, 504(sp) diff --git a/llvm/test/MC/RISCV/rv64e-valid.s b/llvm/test/MC/RISCV/rv64e-valid.s index f7b66fdcf9227..42f9b2c55261d 100644 --- a/llvm/test/MC/RISCV/rv64e-valid.s +++ b/llvm/test/MC/RISCV/rv64e-valid.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -mattr=+e -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -mattr=+e -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+e < %s \ # RUN: | llvm-objdump --no-print-imm-hex -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/rv64e-zcmp-valid.s b/llvm/test/MC/RISCV/rv64e-zcmp-valid.s index 376edf05d40cb..607a023d8d774 100644 --- a/llvm/test/MC/RISCV/rv64e-zcmp-valid.s +++ b/llvm/test/MC/RISCV/rv64e-zcmp-valid.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv64 -mattr=zcmp,+e -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=zcmp,+e -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=zcmp < %s \ # RUN: | llvm-objdump --mattr=-c,zcmp -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/rv64f-aliases-valid.s b/llvm/test/MC/RISCV/rv64f-aliases-valid.s index 488d52fbb7f90..d04b0b53af046 100644 --- a/llvm/test/MC/RISCV/rv64f-aliases-valid.s +++ 
b/llvm/test/MC/RISCV/rv64f-aliases-valid.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv64 -mattr=+f -riscv-no-aliases \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+f -M no-aliases \ # RUN: | FileCheck -check-prefix=CHECK-INST %s # RUN: llvm-mc %s -triple=riscv64 -mattr=+f \ # RUN: | FileCheck -check-prefix=CHECK-ALIAS %s diff --git a/llvm/test/MC/RISCV/rv64f-valid.s b/llvm/test/MC/RISCV/rv64f-valid.s index 108e1ebfad6c0..1fea7874e30e9 100644 --- a/llvm/test/MC/RISCV/rv64f-valid.s +++ b/llvm/test/MC/RISCV/rv64f-valid.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv64 -mattr=+f -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+f -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+f < %s \ # RUN: | llvm-objdump --mattr=+f -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/rv64i-aliases-invalid.s b/llvm/test/MC/RISCV/rv64i-aliases-invalid.s index 34552b552e54c..1bd4e78007c83 100644 --- a/llvm/test/MC/RISCV/rv64i-aliases-invalid.s +++ b/llvm/test/MC/RISCV/rv64i-aliases-invalid.s @@ -1,5 +1,5 @@ # UNSUPPORTED: target={{.*-windows.*}} -# RUN: not llvm-mc -triple=riscv64 -riscv-no-aliases < %s -o /dev/null 2>&1 | FileCheck %s +# RUN: not llvm-mc -triple=riscv64 -M no-aliases < %s -o /dev/null 2>&1 | FileCheck %s # RUN: not llvm-mc -triple=riscv64 < %s 2>&1 -o /dev/null | FileCheck %s li t5, 0x10000000000000000 # CHECK: :[[@LINE]]:8: error: unknown operand diff --git a/llvm/test/MC/RISCV/rv64i-aliases-valid.s b/llvm/test/MC/RISCV/rv64i-aliases-valid.s index f36446ddfa7e2..dde8dbe43a6ce 100644 --- a/llvm/test/MC/RISCV/rv64i-aliases-valid.s +++ b/llvm/test/MC/RISCV/rv64i-aliases-valid.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases \ +# RUN: llvm-mc %s -triple=riscv64 -M no-aliases \ # RUN: | FileCheck -check-prefixes=CHECK-EXPAND,CHECK-INST,CHECK-ASM-NOALIAS %s # RUN: llvm-mc %s -triple=riscv64 \ # RUN: | FileCheck 
-check-prefixes=CHECK-EXPAND,CHECK-ALIAS,CHECK-ASM %s diff --git a/llvm/test/MC/RISCV/rv64i-valid.s b/llvm/test/MC/RISCV/rv64i-valid.s index ec101f9e8428b..7f94fbf0b5f0b 100644 --- a/llvm/test/MC/RISCV/rv64i-valid.s +++ b/llvm/test/MC/RISCV/rv64i-valid.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv64 < %s \ # RUN: | llvm-objdump --no-print-imm-hex -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/rv64ih-valid.s b/llvm/test/MC/RISCV/rv64ih-valid.s index be8ccf9b4d207..13ca4f0635a70 100644 --- a/llvm/test/MC/RISCV/rv64ih-valid.s +++ b/llvm/test/MC/RISCV/rv64ih-valid.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv64 -mattr=+h -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+h -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK,CHECK-INST %s # RUN: llvm-mc -filetype=obj -mattr=+h -triple riscv64 < %s \ # RUN: | llvm-objdump --mattr=+h -M no-aliases -d - \ diff --git a/llvm/test/MC/RISCV/rv64m-valid.s b/llvm/test/MC/RISCV/rv64m-valid.s index 246f74fe9eb0c..21db06485f20e 100644 --- a/llvm/test/MC/RISCV/rv64m-valid.s +++ b/llvm/test/MC/RISCV/rv64m-valid.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv64 -mattr=+m -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+m -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+m < %s \ # RUN: | llvm-objdump --mattr=+m -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/rv64zaamo-valid.s b/llvm/test/MC/RISCV/rv64zaamo-valid.s index 96d3e619b4c1a..c401ce10cc8f8 100644 --- a/llvm/test/MC/RISCV/rv64zaamo-valid.s +++ b/llvm/test/MC/RISCV/rv64zaamo-valid.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv64 -mattr=+a -riscv-no-aliases 
-show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+a -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+a < %s \ # RUN: | llvm-objdump --mattr=+a -M no-aliases -d -r - \ @@ -7,7 +7,7 @@ # RUN: not llvm-mc -triple riscv32 -mattr=+a < %s 2>&1 \ # RUN: | FileCheck -check-prefix=CHECK-RV32 %s # -# RUN: llvm-mc %s -triple=riscv64 -mattr=+zaamo -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zaamo -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zaamo < %s \ # RUN: | llvm-objdump --mattr=+zaamo -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/rv64zacas-valid.s b/llvm/test/MC/RISCV/rv64zacas-valid.s index 595c70b6e3f5b..d90e77859b6c1 100644 --- a/llvm/test/MC/RISCV/rv64zacas-valid.s +++ b/llvm/test/MC/RISCV/rv64zacas-valid.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv64 -mattr=+a,+zacas -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+a,+zacas -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+a,+zacas < %s \ # RUN: | llvm-objdump --mattr=+a,+zacas -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/rv64zalasr-valid.s b/llvm/test/MC/RISCV/rv64zalasr-valid.s index 2f1e381832175..13d2b21fe6f3d 100644 --- a/llvm/test/MC/RISCV/rv64zalasr-valid.s +++ b/llvm/test/MC/RISCV/rv64zalasr-valid.s @@ -1,11 +1,11 @@ -# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-zalasr -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-zalasr -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+experimental-zalasr < %s \ # RUN: | llvm-objdump --mattr=+experimental-zalasr -M 
no-aliases -d -r - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s # # RUN: not llvm-mc -triple riscv64 \ -# RUN: -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck --check-prefixes=CHECK-NO-EXT %s diff --git a/llvm/test/MC/RISCV/rv64zalrsc-valid.s b/llvm/test/MC/RISCV/rv64zalrsc-valid.s index 2bbde96b6e074..98ac38dcd5396 100644 --- a/llvm/test/MC/RISCV/rv64zalrsc-valid.s +++ b/llvm/test/MC/RISCV/rv64zalrsc-valid.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv64 -mattr=+a -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+a -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+a < %s \ # RUN: | llvm-objdump --mattr=+a -M no-aliases -d -r - \ @@ -7,7 +7,7 @@ # RUN: not llvm-mc -triple riscv32 -mattr=+a < %s 2>&1 \ # RUN: | FileCheck -check-prefix=CHECK-RV32 %s # -# RUN: llvm-mc %s -triple=riscv64 -mattr=+zalrsc -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zalrsc -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zalrsc < %s \ # RUN: | llvm-objdump --mattr=+zalrsc -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/rv64zba-aliases-valid.s b/llvm/test/MC/RISCV/rv64zba-aliases-valid.s index bb8eeb41dde75..78ae18b0eaa00 100644 --- a/llvm/test/MC/RISCV/rv64zba-aliases-valid.s +++ b/llvm/test/MC/RISCV/rv64zba-aliases-valid.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv64 -mattr=+zba -riscv-no-aliases \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zba -M no-aliases \ # RUN: | FileCheck -check-prefixes=CHECK-S-OBJ-NOALIAS %s # RUN: llvm-mc %s -triple=riscv64 -mattr=+zba \ # RUN: | FileCheck -check-prefixes=CHECK-S-OBJ %s diff --git a/llvm/test/MC/RISCV/rv64zbb-aliases-valid.s b/llvm/test/MC/RISCV/rv64zbb-aliases-valid.s index 
662f36010a4eb..50d67160536b0 100644 --- a/llvm/test/MC/RISCV/rv64zbb-aliases-valid.s +++ b/llvm/test/MC/RISCV/rv64zbb-aliases-valid.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv64 -mattr=+zbb -riscv-no-aliases \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zbb -M no-aliases \ # RUN: | FileCheck -check-prefixes=CHECK-S-OBJ-NOALIAS %s # RUN: llvm-mc %s -triple=riscv64 -mattr=+zbb \ # RUN: | FileCheck -check-prefixes=CHECK-S-OBJ %s diff --git a/llvm/test/MC/RISCV/rv64zbb-valid.s b/llvm/test/MC/RISCV/rv64zbb-valid.s index 6c7327f65dd0a..5617e1108b113 100644 --- a/llvm/test/MC/RISCV/rv64zbb-valid.s +++ b/llvm/test/MC/RISCV/rv64zbb-valid.s @@ -1,5 +1,5 @@ # With Bitmanip base extension: -# RUN: llvm-mc %s -triple=riscv64 -mattr=+zbb -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zbb -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zbb < %s \ # RUN: | llvm-objdump --mattr=+zbb -M no-aliases --no-print-imm-hex -d -r - \ diff --git a/llvm/test/MC/RISCV/rv64zbs-aliases-valid.s b/llvm/test/MC/RISCV/rv64zbs-aliases-valid.s index 0bfd3ac39d68d..0379a06ad4c8b 100644 --- a/llvm/test/MC/RISCV/rv64zbs-aliases-valid.s +++ b/llvm/test/MC/RISCV/rv64zbs-aliases-valid.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv64 -mattr=+zbs -riscv-no-aliases \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zbs -M no-aliases \ # RUN: | FileCheck -check-prefixes=CHECK-S-OBJ-NOALIAS %s # RUN: llvm-mc %s -triple=riscv64 -mattr=+zbs \ # RUN: | FileCheck -check-prefixes=CHECK-S-OBJ %s diff --git a/llvm/test/MC/RISCV/rv64zcb-valid.s b/llvm/test/MC/RISCV/rv64zcb-valid.s index ab0550e3416ed..83e7fd6c094f7 100644 --- a/llvm/test/MC/RISCV/rv64zcb-valid.s +++ b/llvm/test/MC/RISCV/rv64zcb-valid.s @@ -1,14 +1,14 @@ -# RUN: llvm-mc %s -triple=riscv64 -mattr=+m,+zbb,+zba,+zcb -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+m,+zbb,+zba,+zcb -M no-aliases 
-show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+m,+zbb,+zba,+zcb < %s \ # RUN: | llvm-objdump --mattr=+m,+zbb,+zba,zcb -M no-aliases -d -r - \ # RUN: | FileCheck --check-prefixes=CHECK-ASM-AND-OBJ %s # # RUN: not llvm-mc -triple riscv64 \ -# RUN: -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-NO-EXT %s # RUN: not llvm-mc -triple riscv32 -mattr=+m,+zbb,+zba,+zcb \ -# RUN: -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-NO-RV64 %s # CHECK-ASM-AND-OBJ: c.zext.w s0 diff --git a/llvm/test/MC/RISCV/rv64zcmp-invalid.s b/llvm/test/MC/RISCV/rv64zcmp-invalid.s index 8f353e8a7bb47..7e10ab5c2f902 100644 --- a/llvm/test/MC/RISCV/rv64zcmp-invalid.s +++ b/llvm/test/MC/RISCV/rv64zcmp-invalid.s @@ -1,4 +1,4 @@ -# RUN: not llvm-mc -triple=riscv64 -mattr=zcmp -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: not llvm-mc -triple=riscv64 -mattr=zcmp -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-ERROR %s # CHECK-ERROR: error: invalid operand for instruction diff --git a/llvm/test/MC/RISCV/rv64zcmp-valid.s b/llvm/test/MC/RISCV/rv64zcmp-valid.s index 5973f6d380959..c70a9047002b7 100644 --- a/llvm/test/MC/RISCV/rv64zcmp-valid.s +++ b/llvm/test/MC/RISCV/rv64zcmp-valid.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv64 -mattr=zcmp -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=zcmp -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=zcmp < %s \ # RUN: | llvm-objdump --mattr=-c,zcmp -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/rv64zdinx-valid.s b/llvm/test/MC/RISCV/rv64zdinx-valid.s index fa603f36d9850..411f42430ac23 100644 --- 
a/llvm/test/MC/RISCV/rv64zdinx-valid.s +++ b/llvm/test/MC/RISCV/rv64zdinx-valid.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv64 -mattr=+zdinx -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zdinx -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zdinx %s \ # RUN: | llvm-objdump --mattr=+zdinx -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/rv64zfh-valid.s b/llvm/test/MC/RISCV/rv64zfh-valid.s index 5a15040293fc3..5fb8ba5e66379 100644 --- a/llvm/test/MC/RISCV/rv64zfh-valid.s +++ b/llvm/test/MC/RISCV/rv64zfh-valid.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfh -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfh -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zfh < %s \ # RUN: | llvm-objdump --mattr=+zfh -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/rv64zfinx-valid.s b/llvm/test/MC/RISCV/rv64zfinx-valid.s index d2de9a3307e33..63006e99a0f96 100644 --- a/llvm/test/MC/RISCV/rv64zfinx-valid.s +++ b/llvm/test/MC/RISCV/rv64zfinx-valid.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfinx -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfinx -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zfinx %s \ # RUN: | llvm-objdump --mattr=+zfinx -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/rv64zhinx-valid.s b/llvm/test/MC/RISCV/rv64zhinx-valid.s index cba252ff64a50..c6aa55946fc6c 100644 --- a/llvm/test/MC/RISCV/rv64zhinx-valid.s +++ b/llvm/test/MC/RISCV/rv64zhinx-valid.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv64 -mattr=+zhinx -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zhinx -M no-aliases 
-show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zhinx %s \ # RUN: | llvm-objdump --mattr=+zhinx -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/rv64zhinxmin-valid.s b/llvm/test/MC/RISCV/rv64zhinxmin-valid.s index 062844f555ec6..3489549f6eda1 100644 --- a/llvm/test/MC/RISCV/rv64zhinxmin-valid.s +++ b/llvm/test/MC/RISCV/rv64zhinxmin-valid.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv64 -mattr=+zhinx,+zdinx -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zhinx,+zdinx -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zhinx,+zdinx %s \ # RUN: | llvm-objdump --mattr=+zhinx,+zdinx -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/rv64zicfiss-invalid.s b/llvm/test/MC/RISCV/rv64zicfiss-invalid.s index 1296940455e85..fc69c68a477d6 100644 --- a/llvm/test/MC/RISCV/rv64zicfiss-invalid.s +++ b/llvm/test/MC/RISCV/rv64zicfiss-invalid.s @@ -1,4 +1,4 @@ -# RUN: not llvm-mc %s -triple=riscv64 -mattr=+experimental-zicfiss,+c -riscv-no-aliases -show-encoding \ +# RUN: not llvm-mc %s -triple=riscv64 -mattr=+experimental-zicfiss,+c -M no-aliases -show-encoding \ # RUN: 2>&1 | FileCheck -check-prefixes=CHECK-ERR %s # CHECK-ERR: error: invalid operand for instruction diff --git a/llvm/test/MC/RISCV/rv64zmmul-invalid.s b/llvm/test/MC/RISCV/rv64zmmul-invalid.s index 026b0a476ea7d..3d27ed90ae08b 100644 --- a/llvm/test/MC/RISCV/rv64zmmul-invalid.s +++ b/llvm/test/MC/RISCV/rv64zmmul-invalid.s @@ -1,4 +1,4 @@ -# RUN: not llvm-mc %s -triple=riscv64 -mattr=+zmmul -riscv-no-aliases 2>&1 \ +# RUN: not llvm-mc %s -triple=riscv64 -mattr=+zmmul -M no-aliases 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-ERROR %s # CHECK-ERROR: 5:1: error: instruction requires the following: 'M' (Integer Multiplication and Division){{$}} diff --git 
a/llvm/test/MC/RISCV/rv64zmmul-valid.s b/llvm/test/MC/RISCV/rv64zmmul-valid.s index 80d05ac2bff08..b287d8932bef7 100644 --- a/llvm/test/MC/RISCV/rv64zmmul-valid.s +++ b/llvm/test/MC/RISCV/rv64zmmul-valid.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv64 -mattr=+zmmul -riscv-no-aliases 2>&1 \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zmmul -M no-aliases 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-INST %s # CHECK-INST: mulw ra, sp, gp diff --git a/llvm/test/MC/RISCV/rva-aliases-valid.s b/llvm/test/MC/RISCV/rva-aliases-valid.s index 57d96b537de4b..7b33407a3b3af 100644 --- a/llvm/test/MC/RISCV/rva-aliases-valid.s +++ b/llvm/test/MC/RISCV/rva-aliases-valid.s @@ -1,8 +1,8 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+a -riscv-no-aliases \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+a -M no-aliases \ # RUN: | FileCheck -check-prefixes=CHECK-S-NOALIAS,CHECK-S-OBJ-NOALIAS %s # RUN: llvm-mc %s -triple=riscv32 -mattr=+a \ # RUN: | FileCheck -check-prefixes=CHECK-S,CHECK-S-OBJ %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+a -riscv-no-aliases\ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+a -M no-aliases\ # RUN: | FileCheck -check-prefixes=CHECK-S-NOALIAS,CHECK-S-OBJ-NOALIAS %s # RUN: llvm-mc %s -triple=riscv64 -mattr=+a \ # RUN: | FileCheck -check-prefixes=CHECK-S,CHECK-S-OBJ %s diff --git a/llvm/test/MC/RISCV/rvc-aliases-valid.s b/llvm/test/MC/RISCV/rvc-aliases-valid.s index fa73922e80ecf..23398f9721446 100644 --- a/llvm/test/MC/RISCV/rvc-aliases-valid.s +++ b/llvm/test/MC/RISCV/rvc-aliases-valid.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+c -riscv-no-aliases \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+c -M no-aliases \ # RUN: | FileCheck -check-prefixes=CHECK-EXPAND %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+c -riscv-no-aliases \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+c -M no-aliases \ # RUN: | FileCheck -check-prefixes=CHECK-EXPAND %s # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+c < %s \ # RUN: | llvm-objdump --no-print-imm-hex -M 
no-aliases -d - \ diff --git a/llvm/test/MC/RISCV/rvc-hints-valid.s b/llvm/test/MC/RISCV/rvc-hints-valid.s index 562fe6593f1b5..5dc86d17dcc63 100644 --- a/llvm/test/MC/RISCV/rvc-hints-valid.s +++ b/llvm/test/MC/RISCV/rvc-hints-valid.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+c -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+c -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple riscv64 -mattr=+c -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple riscv64 -mattr=+c -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+c < %s \ # RUN: | llvm-objdump -M no-aliases --no-print-imm-hex -d -r - \ diff --git a/llvm/test/MC/RISCV/rvc-valid.s b/llvm/test/MC/RISCV/rvc-valid.s index 9b0ca80a7adc2..798bff8630cc4 100644 --- a/llvm/test/MC/RISCV/rvc-valid.s +++ b/llvm/test/MC/RISCV/rvc-valid.s @@ -1,26 +1,26 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+c -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+c -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+c < %s \ # RUN: | llvm-objdump --mattr=+c --no-print-imm-hex -M no-aliases -d -r - \ # RUN: | FileCheck --check-prefixes=CHECK-OBJ,CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv32 -mattr=+zca -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zca -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zca < %s \ # RUN: | llvm-objdump --mattr=+zca --no-print-imm-hex -M no-aliases -d -r - \ # RUN: | FileCheck --check-prefixes=CHECK-OBJ,CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+c -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s 
-triple=riscv64 -mattr=+c -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+c < %s \ # RUN: | llvm-objdump --mattr=+c --no-print-imm-hex -M no-aliases -d -r - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+zca -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zca -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zca < %s \ # RUN: | llvm-objdump --mattr=+zca --no-print-imm-hex -M no-aliases -d -r - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s # RUN: not llvm-mc -triple riscv32 \ -# RUN: -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-NO-EXT %s # TODO: more exhaustive testing of immediate encoding. diff --git a/llvm/test/MC/RISCV/rvd-aliases-valid.s b/llvm/test/MC/RISCV/rvd-aliases-valid.s index 584781953c304..9832b734c3477 100644 --- a/llvm/test/MC/RISCV/rvd-aliases-valid.s +++ b/llvm/test/MC/RISCV/rvd-aliases-valid.s @@ -1,8 +1,8 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+d -riscv-no-aliases \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+d -M no-aliases \ # RUN: | FileCheck -check-prefix=CHECK-INST %s # RUN: llvm-mc %s -triple=riscv32 -mattr=+d \ # RUN: | FileCheck -check-prefix=CHECK-ALIAS %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+d -riscv-no-aliases \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+d -M no-aliases \ # RUN: | FileCheck -check-prefix=CHECK-INST %s # RUN: llvm-mc %s -triple=riscv64 -mattr=+d \ # RUN: | FileCheck -check-prefix=CHECK-ALIAS %s diff --git a/llvm/test/MC/RISCV/rvd-valid.s b/llvm/test/MC/RISCV/rvd-valid.s index f782900c26c33..5135562352a0f 100644 --- a/llvm/test/MC/RISCV/rvd-valid.s +++ b/llvm/test/MC/RISCV/rvd-valid.s @@ -1,9 +1,9 @@ -# RUN: llvm-mc %s 
-triple=riscv32 -mattr=+d -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+d -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+d < %s \ # RUN: | llvm-objdump --no-print-imm-hex --mattr=+d -M no-aliases -d -r - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+d -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+d -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+d < %s \ # RUN: | llvm-objdump --no-print-imm-hex --mattr=+d -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/rvdc-aliases-valid.s b/llvm/test/MC/RISCV/rvdc-aliases-valid.s index f74ee05575d53..083c4d2babedd 100644 --- a/llvm/test/MC/RISCV/rvdc-aliases-valid.s +++ b/llvm/test/MC/RISCV/rvdc-aliases-valid.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+c,+d -riscv-no-aliases \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+c,+d -M no-aliases \ # RUN: | FileCheck -check-prefixes=CHECK-EXPAND %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+c,+d -riscv-no-aliases \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+c,+d -M no-aliases \ # RUN: | FileCheck -check-prefixes=CHECK-EXPAND %s # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+c,+d < %s \ # RUN: | llvm-objdump --mattr=+c,+d --no-print-imm-hex -M no-aliases -d - \ diff --git a/llvm/test/MC/RISCV/rve-valid.s b/llvm/test/MC/RISCV/rve-valid.s index ccb47f1557c69..d151c07288fec 100644 --- a/llvm/test/MC/RISCV/rve-valid.s +++ b/llvm/test/MC/RISCV/rve-valid.s @@ -1,9 +1,9 @@ -# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -mattr=+e -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -mattr=+e -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+e < %s 
\ # RUN: | llvm-objdump --no-print-imm-hex -M no-aliases -d -r - \ # RUN: | FileCheck -check-prefixes=CHECK-OBJ,CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -mattr=+e -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -mattr=+e -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+e < %s \ # RUN: | llvm-objdump --no-print-imm-hex -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/rvf-aliases-valid.s b/llvm/test/MC/RISCV/rvf-aliases-valid.s index 0430e2af7c531..e0b63ec31e4ba 100644 --- a/llvm/test/MC/RISCV/rvf-aliases-valid.s +++ b/llvm/test/MC/RISCV/rvf-aliases-valid.s @@ -1,8 +1,8 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+f -riscv-no-aliases \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+f -M no-aliases \ # RUN: | FileCheck -check-prefix=CHECK-INST %s # RUN: llvm-mc %s -triple=riscv32 -mattr=+f \ # RUN: | FileCheck -check-prefix=CHECK-ALIAS %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+f -riscv-no-aliases \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+f -M no-aliases \ # RUN: | FileCheck -check-prefix=CHECK-INST %s # RUN: llvm-mc %s -triple=riscv64 -mattr=+f \ # RUN: | FileCheck -check-prefix=CHECK-ALIAS %s diff --git a/llvm/test/MC/RISCV/rvf-user-csr-names.s b/llvm/test/MC/RISCV/rvf-user-csr-names.s index 7b7569d4c6cd5..697c3334d62cf 100644 --- a/llvm/test/MC/RISCV/rvf-user-csr-names.s +++ b/llvm/test/MC/RISCV/rvf-user-csr-names.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -mattr=+f -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -mattr=+f -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+f < %s \ # RUN: | llvm-objdump -d --mattr=+f - \ @@ -7,7 +7,7 @@ # RUN: | llvm-objdump -d - \ # RUN: | FileCheck -check-prefix=CHECK-INST-ALIAS-NO-F %s # -# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -mattr=+f 
-show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -mattr=+f -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s # RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+f < %s \ # RUN: | llvm-objdump -d --mattr=+f - \ diff --git a/llvm/test/MC/RISCV/rvf-valid.s b/llvm/test/MC/RISCV/rvf-valid.s index 77b5df001a218..49b67a0e9db94 100644 --- a/llvm/test/MC/RISCV/rvf-valid.s +++ b/llvm/test/MC/RISCV/rvf-valid.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+f -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+f -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+f -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+f -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+f < %s \ # RUN: | llvm-objdump --mattr=+f --no-print-imm-hex -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/rvi-valid.s b/llvm/test/MC/RISCV/rvi-valid.s index 86b508ac8f5a9..25b72d479bc6c 100644 --- a/llvm/test/MC/RISCV/rvi-valid.s +++ b/llvm/test/MC/RISCV/rvi-valid.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple riscv64 -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple riscv64 -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 < %s \ # RUN: | llvm-objdump -M no-aliases --no-print-imm-hex -d -r - \ diff --git a/llvm/test/MC/RISCV/rvih-valid.s b/llvm/test/MC/RISCV/rvih-valid.s index 6f80a24a1f8a2..a3a9d29709db5 100644 --- a/llvm/test/MC/RISCV/rvih-valid.s +++ b/llvm/test/MC/RISCV/rvih-valid.s @@ -1,6 +1,6 @@ -# RUN: 
llvm-mc %s -triple=riscv32 -mattr=+h -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+h -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK,CHECK-INST %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+h -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+h -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK,CHECK-INST %s # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+h < %s \ # RUN: | llvm-objdump --mattr=+h -M no-aliases -d - \ diff --git a/llvm/test/MC/RISCV/rvk-user-csr-name.s b/llvm/test/MC/RISCV/rvk-user-csr-name.s index 0615da0ad118e..0a3cb2dbf1f28 100644 --- a/llvm/test/MC/RISCV/rvk-user-csr-name.s +++ b/llvm/test/MC/RISCV/rvk-user-csr-name.s @@ -1,10 +1,10 @@ -# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+zkr < %s \ # RUN: | llvm-objdump -d --mattr=+zkr - \ # RUN: | FileCheck -check-prefix=CHECK-INST-ALIAS %s # -# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s # RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+zkr < %s \ # RUN: | llvm-objdump -d --mattr=+zkr - \ diff --git a/llvm/test/MC/RISCV/rvm-valid.s b/llvm/test/MC/RISCV/rvm-valid.s index 8fb54bba12087..d0612c4ed3023 100644 --- a/llvm/test/MC/RISCV/rvm-valid.s +++ b/llvm/test/MC/RISCV/rvm-valid.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+m -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+m -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+m -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+m -M no-aliases 
-show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+m < %s \ # RUN: | llvm-objdump --mattr=+m -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/rvv-user-csr-names.s b/llvm/test/MC/RISCV/rvv-user-csr-names.s index 71d40334f6e0e..1f691bbfc7aaa 100644 --- a/llvm/test/MC/RISCV/rvv-user-csr-names.s +++ b/llvm/test/MC/RISCV/rvv-user-csr-names.s @@ -1,10 +1,10 @@ -# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -mattr=+f -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -mattr=+f -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+v < %s \ # RUN: | llvm-objdump -d --mattr=+v - \ # RUN: | FileCheck -check-prefix=CHECK-INST-ALIAS %s # -# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -mattr=+f -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -mattr=+f -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s # RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+v < %s \ # RUN: | llvm-objdump -d --mattr=+v - \ diff --git a/llvm/test/MC/RISCV/rvv/aliases.s b/llvm/test/MC/RISCV/rvv/aliases.s index 0dadeb1b1db19..c36bdb535b865 100644 --- a/llvm/test/MC/RISCV/rvv/aliases.s +++ b/llvm/test/MC/RISCV/rvv/aliases.s @@ -1,6 +1,6 @@ # RUN: llvm-mc --triple=riscv64 -mattr +v < %s --show-encoding 2>&1 \ # RUN: -mattr +d | FileCheck --check-prefix=ALIAS %s -# RUN: llvm-mc --triple=riscv64 -mattr=+v --riscv-no-aliases < %s \ +# RUN: llvm-mc --triple=riscv64 -mattr=+v --M no-aliases < %s \ # RUN: -mattr +d --show-encoding 2>&1 | FileCheck --check-prefix=NO-ALIAS %s # ALIAS: vwcvt.x.x.v v2, v1, v0.t # encoding: [0x57,0x61,0x10,0xc4] diff --git a/llvm/test/MC/RISCV/rvv/fothers.s b/llvm/test/MC/RISCV/rvv/fothers.s index 997115f96bd9d..0236d3119d4bc 100644 --- a/llvm/test/MC/RISCV/rvv/fothers.s +++ b/llvm/test/MC/RISCV/rvv/fothers.s @@ -1,5 +1,5 @@ # RUN: llvm-mc 
-triple=riscv64 -show-encoding --mattr=+v %s \ -# RUN: --mattr=+f --riscv-no-aliases \ +# RUN: --mattr=+f --M no-aliases \ # RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST # RUN: not llvm-mc -triple=riscv64 -show-encoding %s 2>&1 \ # RUN: | FileCheck %s --check-prefix=CHECK-ERROR diff --git a/llvm/test/MC/RISCV/rvv/freduction.s b/llvm/test/MC/RISCV/rvv/freduction.s index 12326942e6e88..190d60fab6e0b 100644 --- a/llvm/test/MC/RISCV/rvv/freduction.s +++ b/llvm/test/MC/RISCV/rvv/freduction.s @@ -1,5 +1,5 @@ # RUN: llvm-mc -triple=riscv64 -show-encoding --mattr=+v %s \ -# RUN: --mattr=+f --riscv-no-aliases \ +# RUN: --mattr=+f --M no-aliases \ # RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST # RUN: not llvm-mc -triple=riscv64 -show-encoding %s 2>&1 \ # RUN: | FileCheck %s --check-prefix=CHECK-ERROR diff --git a/llvm/test/MC/RISCV/rvv/load.s b/llvm/test/MC/RISCV/rvv/load.s index 3c251a3a8d75b..9cd0ab45115d4 100644 --- a/llvm/test/MC/RISCV/rvv/load.s +++ b/llvm/test/MC/RISCV/rvv/load.s @@ -1,5 +1,5 @@ # RUN: llvm-mc -triple=riscv64 -show-encoding --mattr=+v %s \ -# RUN: --riscv-no-aliases | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +# RUN: --M no-aliases | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST # RUN: not llvm-mc -triple=riscv64 -show-encoding %s 2>&1 \ # RUN: | FileCheck %s --check-prefix=CHECK-ERROR # RUN: llvm-mc -triple=riscv64 -filetype=obj --mattr=+v %s \ diff --git a/llvm/test/MC/RISCV/rvv/others.s b/llvm/test/MC/RISCV/rvv/others.s index cc16a8774b82d..041458502f62e 100644 --- a/llvm/test/MC/RISCV/rvv/others.s +++ b/llvm/test/MC/RISCV/rvv/others.s @@ -1,5 +1,5 @@ # RUN: llvm-mc -triple=riscv64 -show-encoding --mattr=+v %s \ -# RUN: --riscv-no-aliases | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +# RUN: --M no-aliases | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST # RUN: not llvm-mc -triple=riscv64 -show-encoding %s 2>&1 \ # RUN: | FileCheck %s --check-prefix=CHECK-ERROR # RUN: 
llvm-mc -triple=riscv64 -filetype=obj --mattr=+v %s \ diff --git a/llvm/test/MC/RISCV/rvv/store.s b/llvm/test/MC/RISCV/rvv/store.s index c6a34705fa4a6..ca9bb130fb87f 100644 --- a/llvm/test/MC/RISCV/rvv/store.s +++ b/llvm/test/MC/RISCV/rvv/store.s @@ -1,5 +1,5 @@ # RUN: llvm-mc -triple=riscv64 -show-encoding --mattr=+v %s \ -# RUN: --riscv-no-aliases | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +# RUN: --M no-aliases | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST # RUN: not llvm-mc -triple=riscv64 -show-encoding %s 2>&1 \ # RUN: | FileCheck %s --check-prefix=CHECK-ERROR # RUN: llvm-mc -triple=riscv64 -filetype=obj --mattr=+v %s \ diff --git a/llvm/test/MC/RISCV/rvv/zvlsseg.s b/llvm/test/MC/RISCV/rvv/zvlsseg.s index 65089e2261be2..479d2f9a9e800 100644 --- a/llvm/test/MC/RISCV/rvv/zvlsseg.s +++ b/llvm/test/MC/RISCV/rvv/zvlsseg.s @@ -1,5 +1,5 @@ # RUN: llvm-mc -triple=riscv64 -show-encoding --mattr=+v %s \ -# RUN: --riscv-no-aliases \ +# RUN: --M no-aliases \ # RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST # RUN: not llvm-mc -triple=riscv64 -show-encoding %s 2>&1 \ # RUN: | FileCheck %s --check-prefix=CHECK-ERROR diff --git a/llvm/test/MC/RISCV/rvzaamo-valid.s b/llvm/test/MC/RISCV/rvzaamo-valid.s index d9ba6ef0240b4..e4805aa039dc3 100644 --- a/llvm/test/MC/RISCV/rvzaamo-valid.s +++ b/llvm/test/MC/RISCV/rvzaamo-valid.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+a -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+a -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+a -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+a -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+a < %s \ # RUN: | llvm-objdump --mattr=+a -M no-aliases -d -r - \ @@ -8,9 +8,9 @@ # RUN: llvm-mc 
-filetype=obj -triple=riscv64 -mattr=+a < %s \ # RUN: | llvm-objdump --mattr=+a -M no-aliases -d -r - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv32 -mattr=+zaamo -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zaamo -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+zaamo -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zaamo -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zaamo < %s \ # RUN: | llvm-objdump --mattr=+zaamo -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/rvzabha-valid.s b/llvm/test/MC/RISCV/rvzabha-valid.s index a3c61dbb570cb..2b1b1e075589a 100644 --- a/llvm/test/MC/RISCV/rvzabha-valid.s +++ b/llvm/test/MC/RISCV/rvzabha-valid.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+a,+zabha -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+a,+zabha -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+a,+zabha -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+a,+zabha -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+a,+zabha < %s \ # RUN: | llvm-objdump --mattr=+a,+zabha -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/rvzabha-zacas-valid.s b/llvm/test/MC/RISCV/rvzabha-zacas-valid.s index 97afb9d6563e5..4e271e4a233d0 100644 --- a/llvm/test/MC/RISCV/rvzabha-zacas-valid.s +++ b/llvm/test/MC/RISCV/rvzabha-zacas-valid.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+a,+zabha,+zacas -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+a,+zabha,+zacas -M no-aliases 
-show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+a,+zabha,+zacas -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+a,+zabha,+zacas -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+a,+zabha,+zacas < %s \ # RUN: | llvm-objdump --mattr=+a,+zabha,+zacas -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/rvzacas-valid.s b/llvm/test/MC/RISCV/rvzacas-valid.s index 0e76f02399483..2524001740b7d 100644 --- a/llvm/test/MC/RISCV/rvzacas-valid.s +++ b/llvm/test/MC/RISCV/rvzacas-valid.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+a,+zacas -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+a,+zacas -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+a,+zacas -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+a,+zacas -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+a,+zacas < %s \ # RUN: | llvm-objdump --mattr=+a,+zacas -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/rvzalasr-valid.s b/llvm/test/MC/RISCV/rvzalasr-valid.s index 7b2668bb641f5..11487ee7597a8 100644 --- a/llvm/test/MC/RISCV/rvzalasr-valid.s +++ b/llvm/test/MC/RISCV/rvzalasr-valid.s @@ -1,19 +1,19 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-zalasr -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-zalasr -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+experimental-zalasr < %s \ # RUN: | llvm-objdump --mattr=+experimental-zalasr -M no-aliases -d -r - \ # RUN: | FileCheck 
--check-prefix=CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-zalasr -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-zalasr -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+experimental-zalasr < %s \ # RUN: | llvm-objdump --mattr=+experimental-zalasr -M no-aliases -d -r - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s # # RUN: not llvm-mc -triple riscv32 \ -# RUN: -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck --check-prefixes=CHECK-NO-EXT %s # RUN: not llvm-mc -triple riscv64 \ -# RUN: -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck --check-prefixes=CHECK-NO-EXT %s # CHECK-ASM-AND-OBJ: lb.aq t1, (a0) diff --git a/llvm/test/MC/RISCV/rvzalrsc-valid.s b/llvm/test/MC/RISCV/rvzalrsc-valid.s index f84c0fd62f690..01883dadbabaa 100644 --- a/llvm/test/MC/RISCV/rvzalrsc-valid.s +++ b/llvm/test/MC/RISCV/rvzalrsc-valid.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+a -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+a -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+a -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+a -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+a < %s \ # RUN: | llvm-objdump --mattr=+a -M no-aliases -d -r - \ @@ -8,9 +8,9 @@ # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+a < %s \ # RUN: | llvm-objdump --mattr=+a -M no-aliases -d -r - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv32 -mattr=+zalrsc -riscv-no-aliases -show-encoding \ +# RUN: 
llvm-mc %s -triple=riscv32 -mattr=+zalrsc -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+zalrsc -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zalrsc -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zalrsc < %s \ # RUN: | llvm-objdump --mattr=+zalrsc -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/rvzbb-valid.s b/llvm/test/MC/RISCV/rvzbb-valid.s index 1ed069ed0bdc8..1b060beff2121 100644 --- a/llvm/test/MC/RISCV/rvzbb-valid.s +++ b/llvm/test/MC/RISCV/rvzbb-valid.s @@ -1,7 +1,7 @@ # With Bitmanip base extension: -# RUN: llvm-mc %s -triple=riscv32 -mattr=+zbb -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zbb -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+zbb -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zbb -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zbb < %s \ # RUN: | llvm-objdump --mattr=+zbb --no-print-imm-hex -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/rvzcb-invalid.s b/llvm/test/MC/RISCV/rvzcb-invalid.s index 2f543b2dad67f..f53ab25735303 100644 --- a/llvm/test/MC/RISCV/rvzcb-invalid.s +++ b/llvm/test/MC/RISCV/rvzcb-invalid.s @@ -1,6 +1,6 @@ -# RUN: not llvm-mc -triple=riscv32 -mattr=zcb -riscv-no-aliases -show-encoding %s 2>&1 \ +# RUN: not llvm-mc -triple=riscv32 -mattr=zcb -M no-aliases -show-encoding %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-ERROR %s -# RUN: not llvm-mc -triple=riscv64 -mattr=zcb -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: not llvm-mc -triple=riscv64 -mattr=zcb -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck 
-check-prefixes=CHECK-ERROR %s # CHECK-ERROR: error: immediate must be an integer in the range [0, 3] diff --git a/llvm/test/MC/RISCV/rvzcb-valid.s b/llvm/test/MC/RISCV/rvzcb-valid.s index de25a389b995c..b78ecef3ea5a9 100644 --- a/llvm/test/MC/RISCV/rvzcb-valid.s +++ b/llvm/test/MC/RISCV/rvzcb-valid.s @@ -1,19 +1,19 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+m,+zbb,+zba,+zcb -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+m,+zbb,+zba,+zcb -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+m,+zbb,+zba,+zcb < %s \ # RUN: | llvm-objdump --mattr=+m,+zbb,+zba,+zcb --no-print-imm-hex -M no-aliases -d -r - \ # RUN: | FileCheck --check-prefixes=CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+m,+zbb,+zba,+zcb -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+m,+zbb,+zba,+zcb -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+m,+zbb,+zba,+zcb < %s \ # RUN: | llvm-objdump --mattr=+m,+zbb,+zba,zcb --no-print-imm-hex -M no-aliases -d -r - \ # RUN: | FileCheck --check-prefixes=CHECK-ASM-AND-OBJ %s # # RUN: not llvm-mc -triple riscv32 \ -# RUN: -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-NO-EXT %s # RUN: not llvm-mc -triple riscv64 \ -# RUN: -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-NO-EXT %s # CHECK-ASM-AND-OBJ: c.zext.b s0 diff --git a/llvm/test/MC/RISCV/rvzcmt-invalid.s b/llvm/test/MC/RISCV/rvzcmt-invalid.s index 5f964edd7d3f3..0cd9f0b8a3d1d 100644 --- a/llvm/test/MC/RISCV/rvzcmt-invalid.s +++ b/llvm/test/MC/RISCV/rvzcmt-invalid.s @@ -1,6 +1,6 @@ -# RUN: not llvm-mc -triple=riscv32 -mattr=+zcmt -riscv-no-aliases -show-encoding 
< %s 2>&1 \ +# RUN: not llvm-mc -triple=riscv32 -mattr=+zcmt -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-ERROR %s -# RUN: not llvm-mc -triple=riscv64 -mattr=+zcmt -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: not llvm-mc -triple=riscv64 -mattr=+zcmt -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-ERROR %s # CHECK-ERROR: error: immediate must be an integer in the range [0, 31] diff --git a/llvm/test/MC/RISCV/rvzcmt-user-csr-name.s b/llvm/test/MC/RISCV/rvzcmt-user-csr-name.s index 58fe43e2aa88d..c1ab6bc143363 100644 --- a/llvm/test/MC/RISCV/rvzcmt-user-csr-name.s +++ b/llvm/test/MC/RISCV/rvzcmt-user-csr-name.s @@ -1,10 +1,10 @@ -# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -mattr=+zcmt -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -mattr=+zcmt -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+zcmt < %s \ # RUN: | llvm-objdump -d --mattr=+zcmt - \ # RUN: | FileCheck -check-prefix=CHECK-INST-ALIAS %s # -# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -mattr=+zcmt -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -mattr=+zcmt -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s # RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+zcmt < %s \ # RUN: | llvm-objdump -d --mattr=+zcmt - \ diff --git a/llvm/test/MC/RISCV/rvzcmt-valid.s b/llvm/test/MC/RISCV/rvzcmt-valid.s index a3829fed829f0..a87e7902f6eed 100644 --- a/llvm/test/MC/RISCV/rvzcmt-valid.s +++ b/llvm/test/MC/RISCV/rvzcmt-valid.s @@ -1,5 +1,5 @@ # RUN: llvm-mc %s -triple=riscv32 -mattr=+zcmt\ -# RUN: -riscv-no-aliases -show-encoding \ +# RUN: -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zcmt\ # RUN: -mattr=m < %s \ @@ -7,7 +7,7 @@ # RUN: -M no-aliases -d -r - \ # RUN: | FileCheck 
--check-prefixes=CHECK-ASM-AND-OBJ %s # RUN: llvm-mc %s -triple=riscv64 -mattr=+zcmt\ -# RUN: -riscv-no-aliases -show-encoding \ +# RUN: -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zcmt\ # RUN: -mattr=m < %s \ @@ -16,10 +16,10 @@ # RUN: | FileCheck --check-prefixes=CHECK-ASM-AND-OBJ %s # # RUN: not llvm-mc -triple riscv32 \ -# RUN: -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-NO-EXT %s # RUN: not llvm-mc -triple riscv64 \ -# RUN: -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-NO-EXT %s # CHECK-ASM-AND-OBJ: cm.jt 1 diff --git a/llvm/test/MC/RISCV/rvzdinx-aliases-valid.s b/llvm/test/MC/RISCV/rvzdinx-aliases-valid.s index 96ec4a4e1041b..a24e36b82edbc 100644 --- a/llvm/test/MC/RISCV/rvzdinx-aliases-valid.s +++ b/llvm/test/MC/RISCV/rvzdinx-aliases-valid.s @@ -1,8 +1,8 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+zdinx -riscv-no-aliases \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zdinx -M no-aliases \ # RUN: | FileCheck -check-prefix=CHECK-INST %s # RUN: llvm-mc %s -triple=riscv32 -mattr=+zdinx \ # RUN: | FileCheck -check-prefix=CHECK-ALIAS %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+zdinx -riscv-no-aliases \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zdinx -M no-aliases \ # RUN: | FileCheck -check-prefix=CHECK-INST %s # RUN: llvm-mc %s -triple=riscv64 -mattr=+zdinx \ # RUN: | FileCheck -check-prefix=CHECK-ALIAS %s diff --git a/llvm/test/MC/RISCV/rvzdinx-valid.s b/llvm/test/MC/RISCV/rvzdinx-valid.s index bd1e23104c5ba..623e281b2cae8 100644 --- a/llvm/test/MC/RISCV/rvzdinx-valid.s +++ b/llvm/test/MC/RISCV/rvzdinx-valid.s @@ -1,9 +1,9 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+zdinx -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zdinx -M no-aliases 
-show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zdinx %s \ # RUN: | llvm-objdump --mattr=+zdinx -M no-aliases -d -r - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+zdinx -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zdinx -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zdinx %s \ # RUN: | llvm-objdump --mattr=+zdinx -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/rvzfbfmin-valid.s b/llvm/test/MC/RISCV/rvzfbfmin-valid.s index aa8f8ccc79f78..6bca69175b83c 100644 --- a/llvm/test/MC/RISCV/rvzfbfmin-valid.s +++ b/llvm/test/MC/RISCV/rvzfbfmin-valid.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfbfmin -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfbfmin -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfbfmin -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfbfmin -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zfbfmin,+f < %s \ # RUN: | llvm-objdump --mattr=+zfbfmin --no-print-imm-hex -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/rvzfh-aliases-valid.s b/llvm/test/MC/RISCV/rvzfh-aliases-valid.s index 4e33375f45a77..44b3cc12cfe96 100644 --- a/llvm/test/MC/RISCV/rvzfh-aliases-valid.s +++ b/llvm/test/MC/RISCV/rvzfh-aliases-valid.s @@ -1,8 +1,8 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfh -riscv-no-aliases \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfh -M no-aliases \ # RUN: | FileCheck -check-prefix=CHECK-INST %s # RUN: llvm-mc %s -triple=riscv32 -mattr=+zfh \ # RUN: | FileCheck -check-prefix=CHECK-ALIAS %s -# 
RUN: llvm-mc %s -triple=riscv64 -mattr=+zfh -riscv-no-aliases \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfh -M no-aliases \ # RUN: | FileCheck -check-prefix=CHECK-INST %s # RUN: llvm-mc %s -triple=riscv64 -mattr=+zfh \ # RUN: | FileCheck -check-prefix=CHECK-ALIAS %s diff --git a/llvm/test/MC/RISCV/rvzfh-valid.s b/llvm/test/MC/RISCV/rvzfh-valid.s index ec21f402d8738..ccc6d10188de2 100644 --- a/llvm/test/MC/RISCV/rvzfh-valid.s +++ b/llvm/test/MC/RISCV/rvzfh-valid.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfh -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfh -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfh -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfh -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zfh < %s \ # RUN: | llvm-objdump --mattr=+zfh --no-print-imm-hex -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/rvzfhmin-valid.s b/llvm/test/MC/RISCV/rvzfhmin-valid.s index 63e5e98cc15e9..fd7b36a58734f 100644 --- a/llvm/test/MC/RISCV/rvzfhmin-valid.s +++ b/llvm/test/MC/RISCV/rvzfhmin-valid.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfhmin,+d -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfhmin,+d -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfhmin,+d -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfhmin,+d -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zfhmin,+d < %s \ # RUN: | llvm-objdump --mattr=+zfhmin,+d --no-print-imm-hex -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/rvzfinx-aliases-valid.s 
b/llvm/test/MC/RISCV/rvzfinx-aliases-valid.s index f9225cf5dc2ef..83e166012f861 100644 --- a/llvm/test/MC/RISCV/rvzfinx-aliases-valid.s +++ b/llvm/test/MC/RISCV/rvzfinx-aliases-valid.s @@ -1,8 +1,8 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfinx -riscv-no-aliases \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfinx -M no-aliases \ # RUN: | FileCheck -check-prefix=CHECK-INST %s # RUN: llvm-mc %s -triple=riscv32 -mattr=+zfinx \ # RUN: | FileCheck -check-prefix=CHECK-ALIAS %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfinx -riscv-no-aliases \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfinx -M no-aliases \ # RUN: | FileCheck -check-prefix=CHECK-INST %s # RUN: llvm-mc %s -triple=riscv64 -mattr=+zfinx \ # RUN: | FileCheck -check-prefix=CHECK-ALIAS %s diff --git a/llvm/test/MC/RISCV/rvzfinx-valid.s b/llvm/test/MC/RISCV/rvzfinx-valid.s index 58f805c9fce71..09a5f9eb920a5 100644 --- a/llvm/test/MC/RISCV/rvzfinx-valid.s +++ b/llvm/test/MC/RISCV/rvzfinx-valid.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfinx -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfinx -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfinx -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfinx -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zfinx %s \ # RUN: | llvm-objdump --mattr=+zfinx -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/rvzhinx-aliases-valid.s b/llvm/test/MC/RISCV/rvzhinx-aliases-valid.s index dbefc5a91b275..8f69558d35aff 100644 --- a/llvm/test/MC/RISCV/rvzhinx-aliases-valid.s +++ b/llvm/test/MC/RISCV/rvzhinx-aliases-valid.s @@ -1,8 +1,8 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+zhinx -riscv-no-aliases \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zhinx -M no-aliases \ # RUN: | FileCheck 
-check-prefix=CHECK-INST %s # RUN: llvm-mc %s -triple=riscv32 -mattr=+zhinx \ # RUN: | FileCheck -check-prefix=CHECK-ALIAS %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+zhinx -riscv-no-aliases \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zhinx -M no-aliases \ # RUN: | FileCheck -check-prefix=CHECK-INST %s # RUN: llvm-mc %s -triple=riscv64 -mattr=+zhinx \ # RUN: | FileCheck -check-prefix=CHECK-ALIAS %s diff --git a/llvm/test/MC/RISCV/rvzhinx-valid.s b/llvm/test/MC/RISCV/rvzhinx-valid.s index 97ec9dd1a34f8..dc244b2cc9ef7 100644 --- a/llvm/test/MC/RISCV/rvzhinx-valid.s +++ b/llvm/test/MC/RISCV/rvzhinx-valid.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+zhinx -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zhinx -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+zhinx -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zhinx -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zhinx %s \ # RUN: | llvm-objdump --mattr=+zhinx -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/rvzhinxmin-valid.s b/llvm/test/MC/RISCV/rvzhinxmin-valid.s index fbdbce06bc0ff..1773b29250c8d 100644 --- a/llvm/test/MC/RISCV/rvzhinxmin-valid.s +++ b/llvm/test/MC/RISCV/rvzhinxmin-valid.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+zhinxmin -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zhinxmin -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+zhinxmin -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zhinxmin -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zhinxmin %s \ # RUN: | 
llvm-objdump --mattr=+zhinxmin -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/rvzihintntl-valid.s b/llvm/test/MC/RISCV/rvzihintntl-valid.s index f7601c3f9003c..415070a3eee29 100644 --- a/llvm/test/MC/RISCV/rvzihintntl-valid.s +++ b/llvm/test/MC/RISCV/rvzihintntl-valid.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+zihintntl -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zihintntl -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+zihintntl -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zihintntl -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zihintntl < %s \ # RUN: | llvm-objdump --mattr=+zihintntl -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/rvzihintpause-valid.s b/llvm/test/MC/RISCV/rvzihintpause-valid.s index 3ffc387dea3fb..44cebae29109c 100644 --- a/llvm/test/MC/RISCV/rvzihintpause-valid.s +++ b/llvm/test/MC/RISCV/rvzihintpause-valid.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+zihintpause -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zihintpause -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+zihintpause -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zihintpause -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zihintpause < %s \ # RUN: | llvm-objdump --mattr=+zihintpause -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/smctr-ssctr-valid.s b/llvm/test/MC/RISCV/smctr-ssctr-valid.s index 0b4fe47ae33f4..8bbd5a426b8ee 100644 --- a/llvm/test/MC/RISCV/smctr-ssctr-valid.s +++ 
b/llvm/test/MC/RISCV/smctr-ssctr-valid.s @@ -1,10 +1,10 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-smctr -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-smctr -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK,CHECK-INST %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-smctr -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-smctr -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK,CHECK-INST %s -# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-ssctr -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-ssctr -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK,CHECK-INST %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-ssctr -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-ssctr -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK,CHECK-INST %s # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-smctr < %s \ # RUN: | llvm-objdump --mattr=+experimental-smctr -M no-aliases -d - \ @@ -19,9 +19,9 @@ # RUN: | llvm-objdump --mattr=+experimental-ssctr -M no-aliases -d - \ # RUN: | FileCheck -check-prefix=CHECK-INST %s -# RUN: not llvm-mc -triple riscv32 -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: not llvm-mc -triple riscv32 -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-NO-EXT %s -# RUN: not llvm-mc -triple riscv64 -defsym=RV64=1 -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: not llvm-mc -triple riscv64 -defsym=RV64=1 -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-NO-EXT %s # CHECK-INST: sctrclr diff --git a/llvm/test/MC/RISCV/smrnmi-valid.s b/llvm/test/MC/RISCV/smrnmi-valid.s index d330ecefa6053..8c57a4d4c90d4 100644 --- a/llvm/test/MC/RISCV/smrnmi-valid.s +++ b/llvm/test/MC/RISCV/smrnmi-valid.s 
@@ -1,6 +1,6 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+smrnmi -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+smrnmi -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK,CHECK-INST %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+smrnmi -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+smrnmi -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK,CHECK-INST %s # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+smrnmi < %s \ # RUN: | llvm-objdump --mattr=+smrnmi -M no-aliases -d - \ diff --git a/llvm/test/MC/RISCV/supervisor-csr-names.s b/llvm/test/MC/RISCV/supervisor-csr-names.s index db0fcb381ef2a..712ec56bb1127 100644 --- a/llvm/test/MC/RISCV/supervisor-csr-names.s +++ b/llvm/test/MC/RISCV/supervisor-csr-names.s @@ -1,10 +1,10 @@ -# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s # RUN: llvm-mc -filetype=obj -triple riscv32 < %s \ # RUN: | llvm-objdump -d - \ # RUN: | FileCheck -check-prefix=CHECK-INST-ALIAS %s # -# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s # RUN: llvm-mc -filetype=obj -triple riscv64 < %s \ # RUN: | llvm-objdump -d - \ diff --git a/llvm/test/MC/RISCV/user-csr-names.s b/llvm/test/MC/RISCV/user-csr-names.s index f49eace659ac9..bc7363f0e67c8 100644 --- a/llvm/test/MC/RISCV/user-csr-names.s +++ b/llvm/test/MC/RISCV/user-csr-names.s @@ -1,10 +1,10 @@ -# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s # RUN: llvm-mc -filetype=obj -triple riscv32 < %s \ # RUN: | llvm-objdump -d - \ # RUN: | FileCheck 
-check-prefix=CHECK-INST-ALIAS %s # -# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s # RUN: llvm-mc -filetype=obj -triple riscv64 < %s \ # RUN: | llvm-objdump -d - \ diff --git a/llvm/test/MC/RISCV/xqcia-valid.s b/llvm/test/MC/RISCV/xqcia-valid.s index 6bd10492d4d6a..938285641ee79 100644 --- a/llvm/test/MC/RISCV/xqcia-valid.s +++ b/llvm/test/MC/RISCV/xqcia-valid.s @@ -1,5 +1,5 @@ # Xqcia - Qualcomm uC Arithmetic Extesnsion -# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcia -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcia -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcia < %s \ # RUN: | llvm-objdump --mattr=+experimental-xqcia -M no-aliases --no-print-imm-hex -d - \ diff --git a/llvm/test/MC/RISCV/xqcics-valid.s b/llvm/test/MC/RISCV/xqcics-valid.s index eb888a6222693..1438f67fd4b85 100644 --- a/llvm/test/MC/RISCV/xqcics-valid.s +++ b/llvm/test/MC/RISCV/xqcics-valid.s @@ -1,5 +1,5 @@ # Xqcics - Qualcomm uC Conditional Select Extension -# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcics -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcics -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcics < %s \ # RUN: | llvm-objdump --mattr=+experimental-xqcics -M no-aliases --no-print-imm-hex -d - \ diff --git a/llvm/test/MC/RISCV/xqcicsr-valid.s b/llvm/test/MC/RISCV/xqcicsr-valid.s index 1236dd622703d..ab26098fc7ee7 100644 --- a/llvm/test/MC/RISCV/xqcicsr-valid.s +++ b/llvm/test/MC/RISCV/xqcicsr-valid.s @@ -1,5 +1,5 @@ # Xqcicsr - Qualcomm uC CSR Extension -# RUN: llvm-mc %s -triple=riscv32 
-mattr=+experimental-xqcicsr -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcicsr -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcicsr < %s \ # RUN: | llvm-objdump --mattr=+experimental-xqcicsr -M no-aliases --no-print-imm-hex -d - \ diff --git a/llvm/test/MC/RISCV/xqcilsm-aliases-valid.s b/llvm/test/MC/RISCV/xqcilsm-aliases-valid.s index e9aec14c3c3aa..b65a831a5f4d3 100644 --- a/llvm/test/MC/RISCV/xqcilsm-aliases-valid.s +++ b/llvm/test/MC/RISCV/xqcilsm-aliases-valid.s @@ -1,5 +1,5 @@ # Xqcilsm - Qualcomm uC Load Store Multiple Extension -# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcilsm -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcilsm -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcilsm < %s \ # RUN: | llvm-objdump --mattr=+experimental-xqcilsm -M no-aliases --no-print-imm-hex -d - \ diff --git a/llvm/test/MC/RISCV/xqcilsm-valid.s b/llvm/test/MC/RISCV/xqcilsm-valid.s index 4893e074df327..cbe25a269d19b 100644 --- a/llvm/test/MC/RISCV/xqcilsm-valid.s +++ b/llvm/test/MC/RISCV/xqcilsm-valid.s @@ -1,5 +1,5 @@ # Xqcilsm - Qualcomm uC Load Store Multiple Extension -# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcilsm -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcilsm -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcilsm < %s \ # RUN: | llvm-objdump --mattr=+experimental-xqcilsm -M no-aliases --no-print-imm-hex -d - \ diff --git a/llvm/test/MC/RISCV/xqcisls-valid.s b/llvm/test/MC/RISCV/xqcisls-valid.s index 32f64a82985ce..d7e80b313c78f 100644 --- 
a/llvm/test/MC/RISCV/xqcisls-valid.s +++ b/llvm/test/MC/RISCV/xqcisls-valid.s @@ -1,5 +1,5 @@ # Xqcisls - Qualcomm uC Scaled Load Store Extension -# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcisls -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcisls -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcisls < %s \ # RUN: | llvm-objdump --mattr=+experimental-xqcisls -M no-aliases --no-print-imm-hex -d - \ diff --git a/llvm/test/MC/RISCV/xsifive-valid.s b/llvm/test/MC/RISCV/xsifive-valid.s index 8aa0ab1bd8ba3..bf5998164f304 100644 --- a/llvm/test/MC/RISCV/xsifive-valid.s +++ b/llvm/test/MC/RISCV/xsifive-valid.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+xsifivecdiscarddlone,+xsifivecflushdlone,+xsfcease -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+xsifivecdiscarddlone,+xsifivecflushdlone,+xsfcease -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+xsifivecdiscarddlone,+xsifivecflushdlone,+xsfcease -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+xsifivecdiscarddlone,+xsifivecflushdlone,+xsfcease -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+xsifivecdiscarddlone,+xsifivecflushdlone,+xsfcease < %s \ # RUN: | llvm-objdump --mattr=+xsifivecdiscarddlone,+xsifivecflushdlone,+xsfcease -M no-aliases -d - \ diff --git a/llvm/test/MC/RISCV/xwchc-compress.s b/llvm/test/MC/RISCV/xwchc-compress.s index 4bdce1c02cfff..7964497488dd3 100644 --- a/llvm/test/MC/RISCV/xwchc-compress.s +++ b/llvm/test/MC/RISCV/xwchc-compress.s @@ -1,7 +1,7 @@ # RUN: llvm-mc -triple riscv32 -mattr=+xwchc -show-encoding < %s \ # RUN: | FileCheck -check-prefixes=CHECK,CHECK-ALIAS %s # RUN: 
llvm-mc -triple riscv32 -mattr=+xwchc -show-encoding \ -# RUN: -riscv-no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s +# RUN: -M no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s # RUN: llvm-mc -triple riscv32 -mattr=+xwchc -filetype=obj < %s \ # RUN: | llvm-objdump --triple=riscv32 --mattr=+xwchc --no-print-imm-hex -d - \ # RUN: | FileCheck -check-prefixes=CHECK-ALIAS %s diff --git a/llvm/test/MC/RISCV/xwchc-valid.s b/llvm/test/MC/RISCV/xwchc-valid.s index 292a042805232..51767941a24b2 100644 --- a/llvm/test/MC/RISCV/xwchc-valid.s +++ b/llvm/test/MC/RISCV/xwchc-valid.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+xwchc -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+xwchc -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+xwchc < %s \ # RUN: | llvm-objdump --mattr=+xwchc --no-print-imm-hex -M no-aliases -d -r - \ diff --git a/llvm/test/MC/RISCV/zfa-double-invalid.s b/llvm/test/MC/RISCV/zfa-double-invalid.s index ec21b0c613375..f28bd5cd0b976 100644 --- a/llvm/test/MC/RISCV/zfa-double-invalid.s +++ b/llvm/test/MC/RISCV/zfa-double-invalid.s @@ -1,8 +1,8 @@ # RUN: not llvm-mc -triple riscv32 -mattr=+zfa,+zfh \ -# RUN: -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-NO-EXTD %s # RUN: not llvm-mc -triple riscv64 -mattr=+zfa,+zfh \ -# RUN: -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-NO-EXTD %s # CHECK-NO-EXTD: error: instruction requires the following: 'D' (Double-Precision Floating-Point){{$}} diff --git a/llvm/test/MC/RISCV/zfa-half-invalid.s b/llvm/test/MC/RISCV/zfa-half-invalid.s index a2c6f09043084..debaf717d0905 100644 --- a/llvm/test/MC/RISCV/zfa-half-invalid.s +++ b/llvm/test/MC/RISCV/zfa-half-invalid.s @@ -1,8 +1,8 @@ 
# RUN: not llvm-mc -triple riscv32 -mattr=+zfa,+d \ -# RUN: -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-NO-EXTZFH %s # RUN: not llvm-mc -triple riscv64 -mattr=+zfa,+d \ -# RUN: -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-NO-EXTZFH %s # CHECK-NO-EXTZFH: error: instruction requires the following: 'Zfh' (Half-Precision Floating-Point){{$}} diff --git a/llvm/test/MC/RISCV/zfa-valid.s b/llvm/test/MC/RISCV/zfa-valid.s index e951c9da2ba78..6e78a4c0f2584 100644 --- a/llvm/test/MC/RISCV/zfa-valid.s +++ b/llvm/test/MC/RISCV/zfa-valid.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfa,+d,+zfh -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfa,+d,+zfh -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfa,+d,+zfh -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfa,+d,+zfh -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zfa,+d,+zfh < %s \ # RUN: | llvm-objdump --mattr=+zfa,+d,+zfh -M no-aliases -d -r - \ @@ -10,10 +10,10 @@ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s # # RUN: not llvm-mc -triple riscv32 -mattr=+d,+zfh \ -# RUN: -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-NO-EXT %s # RUN: not llvm-mc -triple riscv64 -mattr=+d,+zfh \ -# RUN: -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-NO-EXT %s # CHECK-ASM-AND-OBJ: fli.s ft1, -1.0 diff --git a/llvm/test/MC/RISCV/zfa-zfhmin-zvfh-valid.s b/llvm/test/MC/RISCV/zfa-zfhmin-zvfh-valid.s index 
6b5dc9200f34c..a7a16d5f9f682 100644 --- a/llvm/test/MC/RISCV/zfa-zfhmin-zvfh-valid.s +++ b/llvm/test/MC/RISCV/zfa-zfhmin-zvfh-valid.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfa,+zfhmin,+zvfh -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfa,+zfhmin,+zvfh -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfa,+zfhmin,+zvfh -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfa,+zfhmin,+zvfh -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zfa,+zfhmin,+zvfh < %s \ # RUN: | llvm-objdump --mattr=+zfa,+zfhmin,+zvfh -M no-aliases -d -r - \ @@ -9,9 +9,9 @@ # RUN: | llvm-objdump --mattr=+zfa,+zfhmin,+zvfh -M no-aliases -d -r - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s # -# RUN: not llvm-mc -triple riscv32 -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: not llvm-mc -triple riscv32 -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-NO-EXT %s -# RUN: not llvm-mc -triple riscv64 -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: not llvm-mc -triple riscv64 -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-NO-EXT %s # This test makes sure fli.h is supported with Zvfh. 
diff --git a/llvm/test/MC/RISCV/zicfilp-invalid.s b/llvm/test/MC/RISCV/zicfilp-invalid.s index 5b22c0a7e21b9..bff989fa204a3 100644 --- a/llvm/test/MC/RISCV/zicfilp-invalid.s +++ b/llvm/test/MC/RISCV/zicfilp-invalid.s @@ -1,6 +1,6 @@ -# RUN: not llvm-mc -triple riscv32 -mattr=+experimental-zicfilp -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: not llvm-mc -triple riscv32 -mattr=+experimental-zicfilp -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-NO-EXT %s -# RUN: not llvm-mc -triple riscv64 -mattr=+experimental-zicfilp -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: not llvm-mc -triple riscv64 -mattr=+experimental-zicfilp -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-NO-EXT %s # CHECK-NO-EXT: immediate must be an integer in the range [0, 1048575] diff --git a/llvm/test/MC/RISCV/zicfilp-valid.s b/llvm/test/MC/RISCV/zicfilp-valid.s index 308e9b60bd7c3..f61cad8d85d53 100644 --- a/llvm/test/MC/RISCV/zicfilp-valid.s +++ b/llvm/test/MC/RISCV/zicfilp-valid.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-zicfilp -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-zicfilp -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-zicfilp -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-zicfilp -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+experimental-zicfilp < %s \ # RUN: | llvm-objdump --mattr=+experimental-zicfilp --no-print-imm-hex -d -r - \ @@ -9,9 +9,9 @@ # RUN: | llvm-objdump --mattr=+experimental-zicfilp --no-print-imm-hex -d -r - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s # -# RUN: not llvm-mc -triple riscv32 -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: not llvm-mc -triple riscv32 -M no-aliases 
-show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-NO-EXT %s -# RUN: not llvm-mc -triple riscv64 -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: not llvm-mc -triple riscv64 -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-NO-EXT %s # CHECK-ASM-AND-OBJ: lpad 22 diff --git a/llvm/test/MC/RISCV/zicfiss-valid.s b/llvm/test/MC/RISCV/zicfiss-valid.s index fd69d37d7cfa0..5b2ab8d326651 100644 --- a/llvm/test/MC/RISCV/zicfiss-valid.s +++ b/llvm/test/MC/RISCV/zicfiss-valid.s @@ -1,17 +1,17 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+a,+experimental-zicfiss -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+a,+experimental-zicfiss -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+a,+experimental-zicfiss < %s \ # RUN: | llvm-objdump --mattr=+a,+experimental-zicfiss -M no-aliases -d -r - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv64 -defsym=RV64=1 -mattr=+a,+experimental-zicfiss -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -defsym=RV64=1 -mattr=+a,+experimental-zicfiss -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM-RV64,CHECK-ASM,CHECK-ASM-AND-OBJ-RV64,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv64 -defsym=RV64=1 -mattr=+a,+experimental-zicfiss < %s \ # RUN: | llvm-objdump --mattr=+a,+experimental-zicfiss -M no-aliases -d -r - \ # RUN: | FileCheck --check-prefixes=CHECK-ASM-AND-OBJ-RV64,CHECK-ASM-AND-OBJ %s # -# RUN: not llvm-mc -triple riscv32 -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: not llvm-mc -triple riscv32 -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-NO-EXT %s -# RUN: not llvm-mc -triple riscv64 -defsym=RV64=1 -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: not llvm-mc -triple riscv64 -defsym=RV64=1 -M no-aliases 
-show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-NO-EXT-RV64 %s # CHECK-ASM-AND-OBJ: sspopchk ra From 24ff23fb3af95bb3a2e5af1e95f94a3e308a5a6a Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Tue, 24 Dec 2024 21:52:08 +0000 Subject: [PATCH 029/567] [llvm-exegesis][Docs] Add documentation on benchmark-process-cpu option This patch adds documentation on the benchmark-process-cpu option. I apparently did not add any documentation when originally implementing the feature. --- llvm/docs/CommandGuide/llvm-exegesis.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/llvm/docs/CommandGuide/llvm-exegesis.rst b/llvm/docs/CommandGuide/llvm-exegesis.rst index c2681535b566e..8266d891a5e6b 100644 --- a/llvm/docs/CommandGuide/llvm-exegesis.rst +++ b/llvm/docs/CommandGuide/llvm-exegesis.rst @@ -473,6 +473,14 @@ OPTIONS flag can be specified multiple times to measure multiple events. The maximum number of validation counters is platform dependent. +.. option:: --benchmark-process-cpu= + + This option specifies the number of the CPU that should be used to run the + benchmarking subprocess. When starting the subprocess, + :program:`llvm-exegesis` will set the affinity of the subprocess to only + include the specified CPU. This option only works in the subprocess execution + mode. + EXIT STATUS ----------- From 48a6e51445d61101fa42ffbf9997e5d54a02fc18 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Tue, 24 Dec 2024 21:53:35 +0000 Subject: [PATCH 030/567] [llvm-exegesis] Fix typo in f2334c5919ec077e6a8deeaf43a5b5188baf0251 This should have been in the original commit, but I somehow forgot to run git add && git commit --amend --no-edit between making the change in my editor, saving the file, and pushing the commit. 
--- llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp index a8226b810c242..a7771b99e97b1 100644 --- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp +++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp @@ -423,7 +423,7 @@ class SubProcessFunctionExecutorImpl assert(static_cast(CurrentCPU) == CPUToUse && "Expected current CPU to equal the CPU requested by the user"); #else - exit(ChildProcessExChildProcessExitCodeE::SetCPUAffinityFailed)); + exit(ChildProcessExitCodeE::SetCPUAffinityFailed); #endif // defined(__x86_64__) && defined(SYS_getcpu) } From cd66c9b6a04689659348c0a3ff4c1205b1133fe9 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Tue, 24 Dec 2024 13:57:03 -0800 Subject: [PATCH 031/567] [Ubsan][Driver] Remove UBSAN C++ runtime from other sanitizers (#121006) Linking this runtime requires C++ ABI, which breaks -nostdlib++ builds. However, UBSAN C++ runtime is only needed for CFI and VPTR checks. Unblocks #120370. 
--- clang/include/clang/Driver/SanitizerArgs.h | 1 + clang/lib/Driver/SanitizerArgs.cpp | 8 ++++++++ clang/lib/Driver/ToolChains/CommonArgs.cpp | 3 ++- clang/test/Driver/sanitizer-ld.c | 19 +++++++++++++++++++ compiler-rt/lib/asan/CMakeLists.txt | 2 -- compiler-rt/lib/asan/tests/CMakeLists.txt | 6 ++---- compiler-rt/lib/hwasan/CMakeLists.txt | 2 -- compiler-rt/lib/msan/CMakeLists.txt | 1 - compiler-rt/lib/tsan/rtl/CMakeLists.txt | 1 - .../Linux/interface_symbols_linux.cpp | 2 ++ 10 files changed, 34 insertions(+), 11 deletions(-) diff --git a/clang/include/clang/Driver/SanitizerArgs.h b/clang/include/clang/Driver/SanitizerArgs.h index 7410ad4303011..3b275092bbbe8 100644 --- a/clang/include/clang/Driver/SanitizerArgs.h +++ b/clang/include/clang/Driver/SanitizerArgs.h @@ -99,6 +99,7 @@ class SanitizerArgs { } bool needsFuzzerInterceptors() const; bool needsUbsanRt() const; + bool needsUbsanCXXRt() const; bool requiresMinimalRuntime() const { return MinimalRuntime; } bool needsDfsanRt() const { return Sanitizers.has(SanitizerKind::DataFlow); } bool needsSafeStackRt() const { return SafeStackRuntime; } diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp index 7726e464f2b45..98116e2c8336b 100644 --- a/clang/lib/Driver/SanitizerArgs.cpp +++ b/clang/lib/Driver/SanitizerArgs.cpp @@ -338,6 +338,14 @@ bool SanitizerArgs::needsUbsanRt() const { CoverageFeatures; } +bool SanitizerArgs::needsUbsanCXXRt() const { + // Link UBSAN C++ runtime very selectively, as it's needed in only very + // specific cases, but forces the program to depend on C++ ABI. UBSAN C++ + // runtime is not included with other sanitizers. 
+ return static_cast(Sanitizers.Mask & NeedsUbsanCxxRt & + ~TrapSanitizers.Mask); +} + bool SanitizerArgs::needsCfiRt() const { return !(Sanitizers.Mask & SanitizerKind::CFI & ~TrapSanitizers.Mask) && CfiCrossDso && !ImplicitCfiRuntime; diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index e33fa443b3a0e..f8f751cb6a66d 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -1454,6 +1454,7 @@ collectSanitizerRuntimes(const ToolChain &TC, const ArgList &Args, SmallVectorImpl &NonWholeStaticRuntimes, SmallVectorImpl &HelperStaticRuntimes, SmallVectorImpl &RequiredSymbols) { + assert(!TC.getTriple().isOSDarwin() && "it's not used by Darwin"); const SanitizerArgs &SanArgs = TC.getSanitizerArgs(Args); // Collect shared runtimes. if (SanArgs.needsSharedRt()) { @@ -1574,7 +1575,7 @@ collectSanitizerRuntimes(const ToolChain &TC, const ArgList &Args, StaticRuntimes.push_back("cfi_diag"); } if (SanArgs.linkCXXRuntimes() && !SanArgs.requiresMinimalRuntime() && - ((!SanArgs.needsSharedRt() && SanArgs.needsUbsanRt()) || + ((!SanArgs.needsSharedRt() && SanArgs.needsUbsanCXXRt()) || SanArgs.needsCfiDiagRt())) { StaticRuntimes.push_back("ubsan_standalone_cxx"); } diff --git a/clang/test/Driver/sanitizer-ld.c b/clang/test/Driver/sanitizer-ld.c index 8347f9c45935d..8f2f7a5997ab4 100644 --- a/clang/test/Driver/sanitizer-ld.c +++ b/clang/test/Driver/sanitizer-ld.c @@ -579,10 +579,25 @@ // CHECK-ASAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" // CHECK-ASAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.asan_cxx.a" "--no-whole-archive" // CHECK-ASAN-UBSAN-LINUX-CXX-NOT: libclang_rt.ubsan +// CHECK-ASAN-UBSAN-LINUX-CXX: libclang_rt.ubsan_standalone_cxx +// CHECK-ASAN-UBSAN-LINUX-CXX-NOT: libclang_rt.ubsan // CHECK-ASAN-UBSAN-LINUX-CXX: "-lstdc++" // CHECK-ASAN-UBSAN-LINUX-CXX: "-lpthread" // CHECK-ASAN-UBSAN-LINUX-CXX: "-lresolv" +// RUN: 
%clangxx -fsanitize=address,undefined -fno-sanitize=vptr -### %s 2>&1 \ +// RUN: --target=i386-unknown-linux -fuse-ld=ld -stdlib=platform \ +// RUN: -resource-dir=%S/Inputs/resource_dir \ +// RUN: --sysroot=%S/Inputs/basic_linux_tree \ +// RUN: | FileCheck --check-prefix=CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX %s +// CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX: "{{.*}}ld{{(.exe)?}}" +// CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" +// CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan_cxx.a" "--no-whole-archive" +// CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX-NOT: libclang_rt.ubsan +// CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX-SAME: "-lstdc++" +// CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX-SAME: "-lpthread" +// CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX-SAME: "-lresolv" + // RUN: %clangxx -fsanitize=memory,undefined -### %s 2>&1 \ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ @@ -591,6 +606,8 @@ // CHECK-MSAN-UBSAN-LINUX-CXX: "{{.*}}ld{{(.exe)?}}" // CHECK-MSAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.msan.a" "--no-whole-archive" // CHECK-MSAN-UBSAN-LINUX-CXX-NOT: libclang_rt.ubsan +// CHECK-MSAN-UBSAN-LINUX-CXX: libclang_rt.ubsan_standalone_cxx +// CHECK-MSAN-UBSAN-LINUX-CXX-NOT: libclang_rt.ubsan // RUN: %clangxx -fsanitize=thread,undefined -### %s 2>&1 \ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ @@ -600,6 +617,8 @@ // CHECK-TSAN-UBSAN-LINUX-CXX: "{{.*}}ld{{(.exe)?}}" // CHECK-TSAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.tsan.a" "--no-whole-archive" // CHECK-TSAN-UBSAN-LINUX-CXX-NOT: libclang_rt.ubsan +// CHECK-TSAN-UBSAN-LINUX-CXX: libclang_rt.ubsan_standalone_cxx +// CHECK-TSAN-UBSAN-LINUX-CXX-NOT: libclang_rt.ubsan // RUN: %clang -fsanitize=undefined -### %s 2>&1 \ // RUN: --target=i386-unknown-linux -fuse-ld=ld \ diff --git a/compiler-rt/lib/asan/CMakeLists.txt b/compiler-rt/lib/asan/CMakeLists.txt index 
5ec995ae159b7..a2c15806f81a2 100644 --- a/compiler-rt/lib/asan/CMakeLists.txt +++ b/compiler-rt/lib/asan/CMakeLists.txt @@ -260,7 +260,6 @@ else() STATIC ARCHS ${ASAN_SUPPORTED_ARCH} OBJECT_LIBS RTAsan_cxx - RTUbsan_cxx CFLAGS ${ASAN_CFLAGS} DEFS ${ASAN_COMMON_DEFINITIONS} PARENT_TARGET asan) @@ -319,7 +318,6 @@ else() # add_dependencies(clang_rt.asan-dynamic-${arch} clang_rt.asan-dynamic-${arch}-version-list) # generates an order-only dependency in ninja. RTAsan_dynamic_version_script_dummy - RTUbsan_cxx ${ASAN_DYNAMIC_WEAK_INTERCEPTION} CFLAGS ${ASAN_DYNAMIC_CFLAGS} LINK_FLAGS ${ASAN_DYNAMIC_LINK_FLAGS} diff --git a/compiler-rt/lib/asan/tests/CMakeLists.txt b/compiler-rt/lib/asan/tests/CMakeLists.txt index 998e0ff24efa4..d80a9f11e50ee 100644 --- a/compiler-rt/lib/asan/tests/CMakeLists.txt +++ b/compiler-rt/lib/asan/tests/CMakeLists.txt @@ -275,8 +275,7 @@ if(COMPILER_RT_CAN_EXECUTE_TESTS AND NOT ANDROID) $ $ $ - $ - $) + $) endif() add_library(${ASAN_TEST_RUNTIME} STATIC ${ASAN_TEST_RUNTIME_OBJECTS}) set_target_properties(${ASAN_TEST_RUNTIME} PROPERTIES @@ -302,8 +301,7 @@ if(ANDROID) $ $ $ - $ - $ + $> ${COMPILER_RT_GTEST_SOURCE} ${ASAN_NOINST_TEST_SOURCES}) set_target_compile_flags(AsanNoinstTest ${ASAN_UNITTEST_COMMON_CFLAGS}) diff --git a/compiler-rt/lib/hwasan/CMakeLists.txt b/compiler-rt/lib/hwasan/CMakeLists.txt index afafa0c4a9276..4372603b45a48 100644 --- a/compiler-rt/lib/hwasan/CMakeLists.txt +++ b/compiler-rt/lib/hwasan/CMakeLists.txt @@ -188,7 +188,6 @@ function(add_hwasan_runtimes arch use_aliases) STATIC ARCHS ${arch} OBJECT_LIBS RTHwasan_cxx - RTUbsan_cxx CFLAGS ${hwasan_rtl_flags} PARENT_TARGET hwasan) @@ -220,7 +219,6 @@ function(add_hwasan_runtimes arch use_aliases) RTSanitizerCommonSymbolizerInternal RTLSanCommon RTUbsan - RTUbsan_cxx # The only purpose of RTHWAsan_dynamic_version_script_dummy is to # carry a dependency of the shared runtime on the version script. 
# Replacing it with a straightforward diff --git a/compiler-rt/lib/msan/CMakeLists.txt b/compiler-rt/lib/msan/CMakeLists.txt index b9976b258deb2..a0b9c61584c98 100644 --- a/compiler-rt/lib/msan/CMakeLists.txt +++ b/compiler-rt/lib/msan/CMakeLists.txt @@ -66,7 +66,6 @@ foreach(arch ${MSAN_SUPPORTED_ARCH}) STATIC ARCHS ${arch} SOURCES ${MSAN_RTL_CXX_SOURCES} - $ ADDITIONAL_HEADERS ${MSAN_RTL_HEADERS} CFLAGS ${MSAN_RTL_CFLAGS} PARENT_TARGET msan) diff --git a/compiler-rt/lib/tsan/rtl/CMakeLists.txt b/compiler-rt/lib/tsan/rtl/CMakeLists.txt index f40e72dbde1f9..d7d84706bfd58 100644 --- a/compiler-rt/lib/tsan/rtl/CMakeLists.txt +++ b/compiler-rt/lib/tsan/rtl/CMakeLists.txt @@ -259,7 +259,6 @@ else() STATIC ARCHS ${arch} SOURCES ${TSAN_CXX_SOURCES} - $ ADDITIONAL_HEADERS ${TSAN_HEADERS} CFLAGS ${TSAN_RTL_CFLAGS} PARENT_TARGET tsan) diff --git a/compiler-rt/test/asan/TestCases/Linux/interface_symbols_linux.cpp b/compiler-rt/test/asan/TestCases/Linux/interface_symbols_linux.cpp index 2d729497548d9..60ef0e5b0de6f 100644 --- a/compiler-rt/test/asan/TestCases/Linux/interface_symbols_linux.cpp +++ b/compiler-rt/test/asan/TestCases/Linux/interface_symbols_linux.cpp @@ -23,6 +23,8 @@ // RUN: | grep -v "__sanitizer_weak_hook" \ // RUN: | grep -v "__sanitizer_override_function" \ // RUN: | grep -v "__sanitizer_override_function_by_addr" \ +// RUN: | grep -v "__ubsan_handle_dynamic_type_cache_miss" \ +// RUN: | grep -v "__ubsan_handle_dynamic_type_cache_miss_abort" \ // RUN: | grep -v "__sanitizer_register_weak_function" \ // RUN: | sed -e "s/.*(//" -e "s/).*//" > %t.imports // From 34f8573a514915222630cf21e8a0c901a25f4ca0 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Tue, 24 Dec 2024 18:05:28 -0600 Subject: [PATCH 032/567] [OpenMP] Use generic IR for the OpenMP DeviceRTL (#119091) Summary: We previously built this for every single architecture to deal with incompatibility. This patch updates it to use the 'generic' IR that `libc` and other projects use. 
Who knows if this will have any side-effects, probably worth testing more but it passes the tests I expect to pass on my side. --- clang/docs/ReleaseNotes.rst | 3 + clang/lib/Driver/ToolChains/CommonArgs.cpp | 3 +- clang/lib/Driver/ToolChains/Cuda.cpp | 1 - ...t-nvptx-sm_52.bc => libomptarget-nvptx.bc} | 0 ...t-nvptx-sm_52.bc => libomptarget-nvptx.bc} | 0 clang/test/Driver/openmp-offload-gpu.c | 4 +- offload/DeviceRTL/CMakeLists.txt | 76 ++++------------ offload/DeviceRTL/src/Misc.cpp | 10 +- offload/DeviceRTL/src/Reduction.cpp | 91 ++++++++++--------- openmp/docs/ReleaseNotes.rst | 6 ++ 10 files changed, 76 insertions(+), 118 deletions(-) rename clang/test/Driver/Inputs/libomptarget/{libomptarget-nvptx-sm_52.bc => libomptarget-nvptx.bc} (100%) rename clang/test/Driver/Inputs/libomptarget/subdir/{libomptarget-nvptx-sm_52.bc => libomptarget-nvptx.bc} (100%) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 8b984ecaefeca..ba5aafe25cc93 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -1250,6 +1250,9 @@ OpenMP Support - Added support for 'omp assume' directive. - Added support for 'omp scope' directive. - Added support for allocator-modifier in 'allocate' clause. +- Changed the OpenMP DeviceRTL to use 'generic' IR. The + ``LIBOMPTARGET_DEVICE_ARCHITECTURES`` CMake argument is now unused and will + always build support for AMDGPU and NVPTX targets. Improvements ^^^^^^^^^^^^ diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index f8f751cb6a66d..bc62e8c48238b 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -2838,8 +2838,7 @@ void tools::addOpenMPDeviceRTL(const Driver &D, : options::OPT_libomptarget_nvptx_bc_path_EQ; StringRef ArchPrefix = Triple.isAMDGCN() ? 
"amdgpu" : "nvptx"; - std::string LibOmpTargetName = - ("libomptarget-" + ArchPrefix + "-" + BitcodeSuffix + ".bc").str(); + std::string LibOmpTargetName = ("libomptarget-" + ArchPrefix + ".bc").str(); // First check whether user specifies bc library if (const Arg *A = DriverArgs.getLastArg(LibomptargetBCPathOpt)) { diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp index 102794829795d..214f1e5d83478 100644 --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -851,7 +851,6 @@ void CudaToolChain::addClangTargetOptions( HostTC.addClangTargetOptions(DriverArgs, CC1Args, DeviceOffloadingKind); StringRef GpuArch = DriverArgs.getLastArgValue(options::OPT_march_EQ); - assert(!GpuArch.empty() && "Must have an explicit GPU arch."); assert((DeviceOffloadingKind == Action::OFK_OpenMP || DeviceOffloadingKind == Action::OFK_Cuda) && "Only OpenMP or CUDA offloading kinds are supported for NVIDIA GPUs."); diff --git a/clang/test/Driver/Inputs/libomptarget/libomptarget-nvptx-sm_52.bc b/clang/test/Driver/Inputs/libomptarget/libomptarget-nvptx.bc similarity index 100% rename from clang/test/Driver/Inputs/libomptarget/libomptarget-nvptx-sm_52.bc rename to clang/test/Driver/Inputs/libomptarget/libomptarget-nvptx.bc diff --git a/clang/test/Driver/Inputs/libomptarget/subdir/libomptarget-nvptx-sm_52.bc b/clang/test/Driver/Inputs/libomptarget/subdir/libomptarget-nvptx.bc similarity index 100% rename from clang/test/Driver/Inputs/libomptarget/subdir/libomptarget-nvptx-sm_52.bc rename to clang/test/Driver/Inputs/libomptarget/subdir/libomptarget-nvptx.bc diff --git a/clang/test/Driver/openmp-offload-gpu.c b/clang/test/Driver/openmp-offload-gpu.c index f6e2245dcdbc0..74bd2a6aeee46 100644 --- a/clang/test/Driver/openmp-offload-gpu.c +++ b/clang/test/Driver/openmp-offload-gpu.c @@ -90,8 +90,8 @@ // RUN: %s 2>&1 | FileCheck -check-prefix=CHK-ENV-BCLIB %s // CHK-BCLIB: 
clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget-nvptx-test.bc -// CHK-BCLIB-DIR: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget{{/|\\\\}}libomptarget-nvptx-sm_52.bc -// CHK-ENV-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}subdir{{/|\\\\}}libomptarget-nvptx-sm_52.bc +// CHK-BCLIB-DIR: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget{{/|\\\\}}libomptarget-nvptx.bc +// CHK-ENV-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}subdir{{/|\\\\}}libomptarget-nvptx.bc // CHK-BCLIB-NOT: {{error:|warning:}} /// ########################################################################### diff --git a/offload/DeviceRTL/CMakeLists.txt b/offload/DeviceRTL/CMakeLists.txt index 11176a567a643..22940264f9b19 100644 --- a/offload/DeviceRTL/CMakeLists.txt +++ b/offload/DeviceRTL/CMakeLists.txt @@ -42,43 +42,6 @@ set(devicertl_base_directory ${CMAKE_CURRENT_SOURCE_DIR}) set(include_directory ${devicertl_base_directory}/include) set(source_directory ${devicertl_base_directory}/src) -set(all_amdgpu_architectures "gfx700;gfx701;gfx801;gfx803" - "gfx9-generic;gfx900;gfx902;gfx906;gfx908" - "gfx90a;gfx90c" - "gfx9-4-generic;gfx940;gfx941;gfx942;gfx950" - "gfx10-1-generic;gfx1010;gfx1012" - "gfx10-3-generic;gfx1030;gfx1031;gfx1032;gfx1033" - "gfx1034;gfx1035;gfx1036" - "gfx11-generic;gfx1100;gfx1101;gfx1102;gfx1103" - "gfx1150;gfx1151;gfx1152;gfx1153" - "gfx12-generic") -set(all_nvptx_architectures "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62" - "sm_70;sm_72;sm_75;sm_80;sm_86;sm_87;sm_89;sm_90") -set(all_gpu_architectures - "${all_amdgpu_architectures};${all_nvptx_architectures}") - -set(LIBOMPTARGET_DEVICE_ARCHITECTURES "all" CACHE STRING - "List of device architectures to be used to compile the OpenMP DeviceRTL.") - -if(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "all") - 
set(LIBOMPTARGET_DEVICE_ARCHITECTURES ${all_gpu_architectures}) -elseif(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "amdgpu") - set(LIBOMPTARGET_DEVICE_ARCHITECTURES ${all_amdgpu_architectures}) -elseif(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "nvptx") - set(LIBOMPTARGET_DEVICE_ARCHITECTURES ${all_nvptx_architectures}) -elseif(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "auto" OR - LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "native") - if(NOT LIBOMPTARGET_NVPTX_ARCH AND NOT LIBOMPTARGET_AMDGPU_ARCH) - message(FATAL_ERROR - "Could not find 'amdgpu-arch' and 'nvptx-arch' tools required for 'auto'") - elseif(NOT LIBOMPTARGET_FOUND_NVIDIA_GPU AND NOT LIBOMPTARGET_FOUND_AMDGPU_GPU) - message(FATAL_ERROR "No AMD or NVIDIA GPU found on the system when using 'auto'") - endif() - set(LIBOMPTARGET_DEVICE_ARCHITECTURES - "${LIBOMPTARGET_NVPTX_DETECTED_ARCH_LIST};${LIBOMPTARGET_AMDGPU_DETECTED_ARCH_LIST}") -endif() -list(REMOVE_DUPLICATES LIBOMPTARGET_DEVICE_ARCHITECTURES) - set(include_files ${include_directory}/Allocator.h ${include_directory}/Configuration.h @@ -146,20 +109,22 @@ set(bc_flags -c -foffload-lto -std=c++17 -fvisibility=hidden # first create an object target add_library(omptarget.devicertl.all_objs OBJECT IMPORTED) -function(compileDeviceRTLLibrary target_cpu target_name target_triple) +function(compileDeviceRTLLibrary target_name target_triple) set(target_bc_flags ${ARGN}) set(bc_files "") foreach(src ${src_files}) get_filename_component(infile ${src} ABSOLUTE) get_filename_component(outfile ${src} NAME) - set(outfile "${outfile}-${target_cpu}.bc") + set(outfile "${outfile}-${target_name}.bc") set(depfile "${outfile}.d") + # Passing an empty CPU to -march= suppressed target specific metadata. 
add_custom_command(OUTPUT ${outfile} COMMAND ${CLANG_TOOL} ${bc_flags} - --offload-arch=${target_cpu} + -fopenmp-targets=${target_triple} + -Xopenmp-target=${target_triple} -march= ${target_bc_flags} -MD -MF ${depfile} ${infile} -o ${outfile} @@ -182,7 +147,7 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple) list(APPEND bc_files ${outfile}) endforeach() - set(bclib_name "libomptarget-${target_name}-${target_cpu}.bc") + set(bclib_name "libomptarget-${target_name}.bc") # Link to a bitcode library. add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/linked_${bclib_name} @@ -222,7 +187,7 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple) APPEND) endif() - set(bclib_target_name "omptarget-${target_name}-${target_cpu}-bc") + set(bclib_target_name "omptarget-${target_name}-bc") add_custom_target(${bclib_target_name} DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}) # Copy library to destination. @@ -244,7 +209,7 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple) # Package the bitcode in the bitcode and embed it in an ELF for the static library add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/packaged_${bclib_name} COMMAND ${PACKAGER_TOOL} -o ${CMAKE_CURRENT_BINARY_DIR}/packaged_${bclib_name} - "--image=file=${CMAKE_CURRENT_BINARY_DIR}/${bclib_name},${target_feature},triple=${target_triple},arch=${target_cpu},kind=openmp" + "--image=file=${CMAKE_CURRENT_BINARY_DIR}/${bclib_name},${target_feature},triple=${target_triple},arch=generic,kind=openmp" DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} COMMENT "Packaging LLVM offloading binary ${bclib_name}.out" ) @@ -254,14 +219,14 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple) APPEND) endif() - set(output_name "${CMAKE_CURRENT_BINARY_DIR}/devicertl-${target_name}-${target_cpu}.o") + set(output_name "${CMAKE_CURRENT_BINARY_DIR}/devicertl-${target_name}.o") add_custom_command(OUTPUT ${output_name} COMMAND ${CLANG_TOOL} 
--std=c++17 -c -nostdlib -Xclang -fembed-offload-object=${CMAKE_CURRENT_BINARY_DIR}/packaged_${bclib_name} -o ${output_name} ${source_directory}/Stub.cpp DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/packaged_${bclib_name} ${source_directory}/Stub.cpp - COMMENT "Embedding LLVM offloading binary in devicertl-${target_name}-${target_cpu}.o" + COMMENT "Embedding LLVM offloading binary in devicertl-${target_name}.o" VERBATIM ) if(TARGET clang) @@ -274,11 +239,11 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple) set_property(TARGET omptarget.devicertl.all_objs APPEND PROPERTY IMPORTED_OBJECTS ${output_name}) if (CMAKE_EXPORT_COMPILE_COMMANDS) - set(ide_target_name omptarget-ide-${target_name}-${target_cpu}) + set(ide_target_name omptarget-ide-${target_name}) add_library(${ide_target_name} STATIC EXCLUDE_FROM_ALL ${src_files}) target_compile_options(${ide_target_name} PRIVATE - -fopenmp --offload-arch=${target_cpu} -fopenmp-cuda-mode - -mllvm -openmp-opt-disable + -fopenmp-targets=${target_triple} -Xopenmp-target=${target_triple} -march= + -fopenmp -fopenmp-cuda-mode -mllvm -openmp-opt-disable -foffload-lto -fvisibility=hidden --offload-device-only -nocudalib -nogpulib -nogpuinc -nostdlibinc -Wno-unknown-cuda-version ) @@ -293,18 +258,11 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple) endif() endfunction() -# Generate a Bitcode library for all the gpu architectures the user requested. 
-add_custom_target(omptarget.devicertl.nvptx) add_custom_target(omptarget.devicertl.amdgpu) -foreach(gpu_arch ${LIBOMPTARGET_DEVICE_ARCHITECTURES}) - if("${gpu_arch}" IN_LIST all_amdgpu_architectures) - compileDeviceRTLLibrary(${gpu_arch} amdgpu amdgcn-amd-amdhsa -Xclang -mcode-object-version=none) - elseif("${gpu_arch}" IN_LIST all_nvptx_architectures) - compileDeviceRTLLibrary(${gpu_arch} nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx63) - else() - message(FATAL_ERROR "Unknown GPU architecture '${gpu_arch}'") - endif() -endforeach() +compileDeviceRTLLibrary(amdgpu amdgcn-amd-amdhsa -Xclang -mcode-object-version=none) + +add_custom_target(omptarget.devicertl.nvptx) +compileDeviceRTLLibrary(nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx63) # Archive all the object files generated above into a static library add_library(omptarget.devicertl STATIC) diff --git a/offload/DeviceRTL/src/Misc.cpp b/offload/DeviceRTL/src/Misc.cpp index 00935cce05e47..ba6fbf5d5c7e3 100644 --- a/offload/DeviceRTL/src/Misc.cpp +++ b/offload/DeviceRTL/src/Misc.cpp @@ -39,15 +39,7 @@ double getWTick() { } double getWTime() { - uint64_t NumTicks = 0; - if constexpr (__has_builtin(__builtin_amdgcn_s_sendmsg_rtnl)) - NumTicks = __builtin_amdgcn_s_sendmsg_rtnl(0x83); - else if constexpr (__has_builtin(__builtin_amdgcn_s_memrealtime)) - NumTicks = __builtin_amdgcn_s_memrealtime(); - else if constexpr (__has_builtin(__builtin_amdgcn_s_memtime)) - NumTicks = __builtin_amdgcn_s_memtime(); - - return static_cast(NumTicks) * getWTick(); + return static_cast(__builtin_readsteadycounter()) * getWTick(); } #pragma omp end declare variant diff --git a/offload/DeviceRTL/src/Reduction.cpp b/offload/DeviceRTL/src/Reduction.cpp index 57df159d3f28e..d3b4528401953 100644 --- a/offload/DeviceRTL/src/Reduction.cpp +++ b/offload/DeviceRTL/src/Reduction.cpp @@ -44,7 +44,6 @@ void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct, } } -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 static 
uint32_t gpu_irregular_simd_reduce(void *reduce_data, ShuffleReductFnTy shflFct) { uint32_t size, remote_id, physical_lane_id; @@ -63,7 +62,6 @@ static uint32_t gpu_irregular_simd_reduce(void *reduce_data, } while (logical_lane_id % 2 == 0 && size > 1); return (logical_lane_id == 0); } -#endif static int32_t nvptx_parallel_reduce_nowait(void *reduce_data, ShuffleReductFnTy shflFct, @@ -74,49 +72,53 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data, uint32_t NumThreads = omp_get_num_threads(); if (NumThreads == 1) return 1; - /* - * This reduce function handles reduction within a team. It handles - * parallel regions in both L1 and L2 parallelism levels. It also - * supports Generic, SPMD, and NoOMP modes. - * - * 1. Reduce within a warp. - * 2. Warp master copies value to warp 0 via shared memory. - * 3. Warp 0 reduces to a single value. - * 4. The reduced value is available in the thread that returns 1. - */ - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - uint32_t WarpsNeeded = - (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize(); - uint32_t WarpId = mapping::getWarpIdInBlock(); - - // Volta execution model: - // For the Generic execution mode a parallel region either has 1 thread and - // beyond that, always a multiple of 32. For the SPMD execution mode we may - // have any number of threads. - if ((NumThreads % mapping::getWarpSize() == 0) || (WarpId < WarpsNeeded - 1)) - gpu_regular_warp_reduce(reduce_data, shflFct); - else if (NumThreads > 1) // Only SPMD execution mode comes thru this case. - gpu_irregular_warp_reduce(reduce_data, shflFct, - /*LaneCount=*/NumThreads % mapping::getWarpSize(), - /*LaneId=*/mapping::getThreadIdInBlock() % - mapping::getWarpSize()); - // When we have more than [mapping::getWarpSize()] number of threads - // a block reduction is performed here. - // - // Only L1 parallel region can enter this if condition. 
- if (NumThreads > mapping::getWarpSize()) { - // Gather all the reduced values from each warp - // to the first warp. - cpyFct(reduce_data, WarpsNeeded); + // + // This reduce function handles reduction within a team. It handles + // parallel regions in both L1 and L2 parallelism levels. It also + // supports Generic, SPMD, and NoOMP modes. + // + // 1. Reduce within a warp. + // 2. Warp master copies value to warp 0 via shared memory. + // 3. Warp 0 reduces to a single value. + // 4. The reduced value is available in the thread that returns 1. + // - if (WarpId == 0) - gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, - BlockThreadId); +#if __has_builtin(__nvvm_reflect) + if (__nvvm_reflect("__CUDA_ARCH") >= 700) { + uint32_t WarpsNeeded = + (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize(); + uint32_t WarpId = mapping::getWarpIdInBlock(); + + // Volta execution model: + // For the Generic execution mode a parallel region either has 1 thread and + // beyond that, always a multiple of 32. For the SPMD execution mode we may + // have any number of threads. + if ((NumThreads % mapping::getWarpSize() == 0) || + (WarpId < WarpsNeeded - 1)) + gpu_regular_warp_reduce(reduce_data, shflFct); + else if (NumThreads > 1) // Only SPMD execution mode comes thru this case. + gpu_irregular_warp_reduce( + reduce_data, shflFct, + /*LaneCount=*/NumThreads % mapping::getWarpSize(), + /*LaneId=*/mapping::getThreadIdInBlock() % mapping::getWarpSize()); + + // When we have more than [mapping::getWarpSize()] number of threads + // a block reduction is performed here. + // + // Only L1 parallel region can enter this if condition. + if (NumThreads > mapping::getWarpSize()) { + // Gather all the reduced values from each warp + // to the first warp. 
+ cpyFct(reduce_data, WarpsNeeded); + + if (WarpId == 0) + gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, + BlockThreadId); + } + return BlockThreadId == 0; } - return BlockThreadId == 0; -#else +#endif __kmpc_impl_lanemask_t Liveness = mapping::activemask(); if (Liveness == lanes::All) // Full warp gpu_regular_warp_reduce(reduce_data, shflFct); @@ -150,10 +152,9 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data, return BlockThreadId == 0; } - // Get the OMP thread Id. This is different from BlockThreadId in the case of - // an L2 parallel region. + // Get the OMP thread Id. This is different from BlockThreadId in the case + // of an L2 parallel region. return BlockThreadId == 0; -#endif // __CUDA_ARCH__ >= 700 } uint32_t roundToWarpsize(uint32_t s) { diff --git a/openmp/docs/ReleaseNotes.rst b/openmp/docs/ReleaseNotes.rst index d4a4b1a99f781..0089f1aa31d14 100644 --- a/openmp/docs/ReleaseNotes.rst +++ b/openmp/docs/ReleaseNotes.rst @@ -19,3 +19,9 @@ from the `LLVM releases web site `_. Non-comprehensive list of changes in this release ================================================= + +Device Runtime +-------------- +- Changed the OpenMP DeviceRTL to use 'generic' IR. The + ``LIBOMPTARGET_DEVICE_ARCHITECTURES`` CMake argument is now unused and will + always build support for AMDGPU and NVPTX targets. From 970f65a98a681831e308860e7004e066f5152791 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20Poussineau?= Date: Wed, 25 Dec 2024 02:26:31 +0100 Subject: [PATCH 033/567] [Clang][MIPS] Create specific targets for MIPS PE/COFF (#121040) Implement GNU and MSVC variants. When using them, _WIN32 and _M_MRX000/_MIPS_ macros are correctly defined. 
--- clang/lib/Basic/Targets.cpp | 8 +++ clang/lib/Basic/Targets/Mips.cpp | 59 +++++++++++++++++++ clang/lib/Basic/Targets/Mips.h | 37 ++++++++++++ .../test/Preprocessor/predefined-win-macros.c | 16 +++++ 4 files changed, 120 insertions(+) diff --git a/clang/lib/Basic/Targets.cpp b/clang/lib/Basic/Targets.cpp index 706a391023b3a..d0815ad33bc75 100644 --- a/clang/lib/Basic/Targets.cpp +++ b/clang/lib/Basic/Targets.cpp @@ -297,6 +297,14 @@ std::unique_ptr AllocateTarget(const llvm::Triple &Triple, case llvm::Triple::NaCl: return std::make_unique>(Triple, Opts); + case llvm::Triple::Win32: + switch (Triple.getEnvironment()) { + case llvm::Triple::GNU: + return std::make_unique(Triple, Opts); + case llvm::Triple::MSVC: + default: // Assume MSVC for unknown environments + return std::make_unique(Triple, Opts); + } default: return std::make_unique(Triple, Opts); } diff --git a/clang/lib/Basic/Targets/Mips.cpp b/clang/lib/Basic/Targets/Mips.cpp index 174bc9d2ab996..d56995e3ccc48 100644 --- a/clang/lib/Basic/Targets/Mips.cpp +++ b/clang/lib/Basic/Targets/Mips.cpp @@ -304,3 +304,62 @@ bool MipsTargetInfo::validateTarget(DiagnosticsEngine &Diags) const { return true; } + +WindowsMipsTargetInfo::WindowsMipsTargetInfo(const llvm::Triple &Triple, + const TargetOptions &Opts) + : WindowsTargetInfo(Triple, Opts), Triple(Triple) {} + +void WindowsMipsTargetInfo::getVisualStudioDefines( + const LangOptions &Opts, MacroBuilder &Builder) const { + Builder.defineMacro("_M_MRX000", "4000"); +} + +TargetInfo::BuiltinVaListKind +WindowsMipsTargetInfo::getBuiltinVaListKind() const { + return TargetInfo::CharPtrBuiltinVaList; +} + +TargetInfo::CallingConvCheckResult +WindowsMipsTargetInfo::checkCallingConvention(CallingConv CC) const { + switch (CC) { + case CC_X86StdCall: + case CC_X86ThisCall: + case CC_X86FastCall: + case CC_X86VectorCall: + return CCCR_Ignore; + case CC_C: + case CC_OpenCLKernel: + case CC_PreserveMost: + case CC_PreserveAll: + case CC_Swift: + case CC_SwiftAsync: + 
return CCCR_OK; + default: + return CCCR_Warning; + } +} + +// Windows MIPS, MS (C++) ABI +MicrosoftMipsTargetInfo::MicrosoftMipsTargetInfo(const llvm::Triple &Triple, + const TargetOptions &Opts) + : WindowsMipsTargetInfo(Triple, Opts) { + TheCXXABI.set(TargetCXXABI::Microsoft); +} + +void MicrosoftMipsTargetInfo::getTargetDefines(const LangOptions &Opts, + MacroBuilder &Builder) const { + WindowsMipsTargetInfo::getTargetDefines(Opts, Builder); + WindowsMipsTargetInfo::getVisualStudioDefines(Opts, Builder); +} + +MinGWMipsTargetInfo::MinGWMipsTargetInfo(const llvm::Triple &Triple, + const TargetOptions &Opts) + : WindowsMipsTargetInfo(Triple, Opts) { + TheCXXABI.set(TargetCXXABI::GenericMIPS); +} + +void MinGWMipsTargetInfo::getTargetDefines(const LangOptions &Opts, + MacroBuilder &Builder) const { + WindowsMipsTargetInfo::getTargetDefines(Opts, Builder); + Builder.defineMacro("_MIPS_"); +} diff --git a/clang/lib/Basic/Targets/Mips.h b/clang/lib/Basic/Targets/Mips.h index 8acaf56523b21..7ddcd57053cb2 100644 --- a/clang/lib/Basic/Targets/Mips.h +++ b/clang/lib/Basic/Targets/Mips.h @@ -13,6 +13,7 @@ #ifndef LLVM_CLANG_LIB_BASIC_TARGETS_MIPS_H #define LLVM_CLANG_LIB_BASIC_TARGETS_MIPS_H +#include "OSTargets.h" #include "clang/Basic/TargetInfo.h" #include "clang/Basic/TargetOptions.h" #include "llvm/Support/Compiler.h" @@ -450,6 +451,42 @@ class LLVM_LIBRARY_VISIBILITY MipsTargetInfo : public TargetInfo { return std::make_pair(32, 32); } }; + +class LLVM_LIBRARY_VISIBILITY WindowsMipsTargetInfo + : public WindowsTargetInfo { + const llvm::Triple Triple; + +public: + WindowsMipsTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts); + + void getVisualStudioDefines(const LangOptions &Opts, + MacroBuilder &Builder) const; + + BuiltinVaListKind getBuiltinVaListKind() const override; + + CallingConvCheckResult checkCallingConvention(CallingConv CC) const override; +}; + +// Windows MIPS, MS (C++) ABI +class LLVM_LIBRARY_VISIBILITY MicrosoftMipsTargetInfo + : 
public WindowsMipsTargetInfo { +public: + MicrosoftMipsTargetInfo(const llvm::Triple &Triple, + const TargetOptions &Opts); + + void getTargetDefines(const LangOptions &Opts, + MacroBuilder &Builder) const override; +}; + +// MIPS MinGW target +class LLVM_LIBRARY_VISIBILITY MinGWMipsTargetInfo + : public WindowsMipsTargetInfo { +public: + MinGWMipsTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts); + + void getTargetDefines(const LangOptions &Opts, + MacroBuilder &Builder) const override; +}; } // namespace targets } // namespace clang diff --git a/clang/test/Preprocessor/predefined-win-macros.c b/clang/test/Preprocessor/predefined-win-macros.c index 8e539a2a1faf8..86708e02e8dc0 100644 --- a/clang/test/Preprocessor/predefined-win-macros.c +++ b/clang/test/Preprocessor/predefined-win-macros.c @@ -113,6 +113,13 @@ // CHECK-ARM64EC-WIN: #define _WIN32 1 // CHECK-ARM64EC-WIN: #define _WIN64 1 +// RUN: %clang_cc1 -triple mipsel-windows %s -E -dM -o - \ +// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-MIPSEL-WIN + +// CHECK-MIPSEL-WIN: #define _M_MRX000 4000 +// CHECK-MIPSEL-WIN: #define _WIN32 1 +// CHECK-MIPSEL-WIN-NOT: #define _MIPS_ 1 + // RUN: %clang_cc1 -triple i686-windows-gnu %s -E -dM -o - \ // RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-X86-MINGW @@ -173,3 +180,12 @@ // CHECK-ARM64EC-MINGW: #define __arm64ec__ 1 // CHECK-ARM64EC-MINGW: #define __x86_64 1 // CHECK-ARM64EC-MINGW: #define __x86_64__ 1 + +// RUN: %clang_cc1 -triple mipsel-windows-gnu %s -E -dM -o - \ +// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-MIPSEL-MINGW + +// CHECK-MIPSEL-MINGW-NOT: #define _M_MRX000 4000 +// CHECK-MIPSEL-MINGW: #define _MIPS_ 1 +// CHECK-MIPSEL-MINGW: #define _WIN32 1 +// CHECK-MIPSEL-MINGW: #define __mips 32 +// CHECK-MIPSEL-MINGW: #define __mips__ 1 From 88d04be815bd289c691ab81061ac8a573ad15677 Mon Sep 17 00:00:00 2001 From: Kinoshita Kotaro Date: Wed, 25 Dec 2024 10:59:59 +0900 Subject: [PATCH 034/567] 
[AArch64][docs] Add release notes for FUJITSU-MONAKA support (#120684) Adds release notes for the FUJITSU-MONAKA support introduced in PR #118432. These notes were missing from the original PR. --- clang/docs/ReleaseNotes.rst | 6 ++++++ llvm/docs/ReleaseNotes.md | 2 ++ 2 files changed, 8 insertions(+) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index ba5aafe25cc93..d9b0cb815a15d 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -1005,6 +1005,12 @@ Arm and AArch64 Support in leaf functions after enabling ``-fno-omit-frame-pointer``, you can do so by adding the ``-momit-leaf-frame-pointer`` option. +- Support has been added for the following processors (-mcpu identifiers in parenthesis): + + For AArch64: + + * FUJITSU-MONAKA (fujitsu-monaka) + Android Support ^^^^^^^^^^^^^^^ diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index 4486218d4f883..5999f78f7e067 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -131,6 +131,8 @@ Changes to the AArch64 Backend * Assembler/disassembler support has been added for Armv9.6-A (2024) architecture extensions. +* Added support for the FUJITSU-MONAKA CPU. 
+ Changes to the AMDGPU Backend ----------------------------- From b5f0ec80d59d054617994f0de409c38fabc8c207 Mon Sep 17 00:00:00 2001 From: LiqinWeng Date: Wed, 25 Dec 2024 10:11:02 +0800 Subject: [PATCH 035/567] [VPlan] Remove redundant printing final in VPlan::execute (#121048) Multiple prints will cause problems when testing ir-bb --- llvm/lib/Transforms/Vectorize/VPlan.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 71f43abe534ec..9a082921d4f7f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1002,11 +1002,6 @@ void VPlan::execute(VPTransformState *State) { setName("Final VPlan"); LLVM_DEBUG(dump()); - LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << State->VF - << ", UF=" << getUF() << '\n'); - setName("Final VPlan"); - LLVM_DEBUG(dump()); - // Disconnect the middle block from its single successor (the scalar loop // header) in both the CFG and DT. The branch will be recreated during VPlan // execution. From ac1d560709d2ecfe83a98285d4a13afae6db4316 Mon Sep 17 00:00:00 2001 From: Konstantin Varlamov Date: Tue, 24 Dec 2024 18:22:18 -0800 Subject: [PATCH 036/567] [libc++][hardening] Add a bounds check for `valarray` and `bitset`. (#120685) Add a `valid-element-access` check to `valarray::operator[]` and `bitset::operator[]`. 
--- libcxx/docs/Hardening.rst | 2 +- libcxx/include/bitset | 8 +++- libcxx/include/valarray | 10 ++++- .../libcxx/numerics/numarray/assert.pass.cpp | 42 +++++++++++++++++++ .../utilities/template.bitset/assert.pass.cpp | 42 +++++++++++++++++++ .../bitset.members/op_and_eq.pass.cpp | 2 + 6 files changed, 102 insertions(+), 4 deletions(-) create mode 100644 libcxx/test/libcxx/numerics/numarray/assert.pass.cpp create mode 100644 libcxx/test/libcxx/utilities/template.bitset/assert.pass.cpp diff --git a/libcxx/docs/Hardening.rst b/libcxx/docs/Hardening.rst index 42aacfdcfb41a..4002f40e1dad3 100644 --- a/libcxx/docs/Hardening.rst +++ b/libcxx/docs/Hardening.rst @@ -458,7 +458,7 @@ Hardened containers status - Partial - N/A * - ``bitset`` - - ❌ + - ✅ - N/A Note: for ``vector`` and ``string``, the iterator does not check for diff --git a/libcxx/include/bitset b/libcxx/include/bitset index 8b36182480557..919d2a0f07e09 100644 --- a/libcxx/include/bitset +++ b/libcxx/include/bitset @@ -133,6 +133,7 @@ template struct hash>; # include <__algorithm/fill.h> # include <__algorithm/fill_n.h> # include <__algorithm/find.h> +# include <__assert> # include <__bit_reference> # include <__config> # include <__functional/hash.h> @@ -683,13 +684,18 @@ public: // element access: # ifdef _LIBCPP_ABI_BITSET_VECTOR_BOOL_CONST_SUBSCRIPT_RETURN_BOOL - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool operator[](size_t __p) const { return __base::__make_ref(__p); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool operator[](size_t __p) const { + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__p < _Size, "bitset::operator[] index out of bounds"); + return __base::__make_ref(__p); + } # else _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR const_reference operator[](size_t __p) const { + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__p < _Size, "bitset::operator[] index out of bounds"); return __base::__make_ref(__p); } # endif _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 reference operator[](size_t __p) { + 
_LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__p < _Size, "bitset::operator[] index out of bounds"); return __base::__make_ref(__p); } _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unsigned long to_ulong() const; diff --git a/libcxx/include/valarray b/libcxx/include/valarray index d0b76ee06e796..abc7d391ada07 100644 --- a/libcxx/include/valarray +++ b/libcxx/include/valarray @@ -821,9 +821,15 @@ public: _LIBCPP_HIDE_FROM_ABI valarray& operator=(const __val_expr<_ValExpr>& __v); // element access: - _LIBCPP_HIDE_FROM_ABI const value_type& operator[](size_t __i) const { return __begin_[__i]; } + _LIBCPP_HIDE_FROM_ABI const value_type& operator[](size_t __i) const { + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__i < size(), "valarray::operator[] index out of bounds"); + return __begin_[__i]; + } - _LIBCPP_HIDE_FROM_ABI value_type& operator[](size_t __i) { return __begin_[__i]; } + _LIBCPP_HIDE_FROM_ABI value_type& operator[](size_t __i) { + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__i < size(), "valarray::operator[] index out of bounds"); + return __begin_[__i]; + } // subset operations: _LIBCPP_HIDE_FROM_ABI __val_expr<__slice_expr > operator[](slice __s) const; diff --git a/libcxx/test/libcxx/numerics/numarray/assert.pass.cpp b/libcxx/test/libcxx/numerics/numarray/assert.pass.cpp new file mode 100644 index 0000000000000..2bdf52340abfc --- /dev/null +++ b/libcxx/test/libcxx/numerics/numarray/assert.pass.cpp @@ -0,0 +1,42 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// + +// Test hardening assertions for std::valarray. 
+ +// REQUIRES: has-unix-headers +// UNSUPPORTED: libcpp-hardening-mode=none +// UNSUPPORTED: c++03 +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing + +#include + +#include "check_assertion.h" + +int main(int, char**) { + { // Empty valarray + std::valarray c; + const auto& const_c = c; + TEST_LIBCPP_ASSERT_FAILURE(c[0], "valarray::operator[] index out of bounds"); + TEST_LIBCPP_ASSERT_FAILURE(const_c[0], "valarray::operator[] index out of bounds"); + TEST_LIBCPP_ASSERT_FAILURE(c[42], "valarray::operator[] index out of bounds"); + TEST_LIBCPP_ASSERT_FAILURE(const_c[42], "valarray::operator[] index out of bounds"); + } + + { // Non-empty valarray + std::valarray c(4); + const auto& const_c = c; + (void)c[3]; // Check that there's no assertion on valid access. + TEST_LIBCPP_ASSERT_FAILURE(c[4], "valarray::operator[] index out of bounds"); + (void)const_c[3]; // Check that there's no assertion on valid access. + TEST_LIBCPP_ASSERT_FAILURE(const_c[4], "valarray::operator[] index out of bounds"); + } + + return 0; +} diff --git a/libcxx/test/libcxx/utilities/template.bitset/assert.pass.cpp b/libcxx/test/libcxx/utilities/template.bitset/assert.pass.cpp new file mode 100644 index 0000000000000..4019bdf1318eb --- /dev/null +++ b/libcxx/test/libcxx/utilities/template.bitset/assert.pass.cpp @@ -0,0 +1,42 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// + +// Test hardening assertions for std::bitset. 
+ +// REQUIRES: has-unix-headers +// UNSUPPORTED: libcpp-hardening-mode=none +// UNSUPPORTED: c++03 +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing + +#include + +#include "check_assertion.h" + +int main(int, char**) { + { // Empty bitset + std::bitset<0> c; + const auto& const_c = c; + TEST_LIBCPP_ASSERT_FAILURE(c[0], "bitset::operator[] index out of bounds"); + TEST_LIBCPP_ASSERT_FAILURE(const_c[0], "bitset::operator[] index out of bounds"); + TEST_LIBCPP_ASSERT_FAILURE(c[42], "bitset::operator[] index out of bounds"); + TEST_LIBCPP_ASSERT_FAILURE(const_c[42], "bitset::operator[] index out of bounds"); + } + + { // Non-empty bitset + std::bitset<4> c(42); + const auto& const_c = c; + (void)c[3]; // Check that there's no assertion on valid access. + TEST_LIBCPP_ASSERT_FAILURE(c[4], "bitset::operator[] index out of bounds"); + (void)const_c[3]; // Check that there's no assertion on valid access. + TEST_LIBCPP_ASSERT_FAILURE(const_c[4], "bitset::operator[] index out of bounds"); + } + + return 0; +} diff --git a/libcxx/test/std/utilities/template.bitset/bitset.members/op_and_eq.pass.cpp b/libcxx/test/std/utilities/template.bitset/bitset.members/op_and_eq.pass.cpp index e8ab264f4bab6..d87fd91b0356c 100644 --- a/libcxx/test/std/utilities/template.bitset/bitset.members/op_and_eq.pass.cpp +++ b/libcxx/test/std/utilities/template.bitset/bitset.members/op_and_eq.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=2000000 + // bitset& operator&=(const bitset& rhs); // constexpr since C++23 #include From 56600c11add12aababdd313c43650facf2a0338f Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 24 Dec 2024 18:37:46 -0800 Subject: [PATCH 037/567] MCAsmInfo: replace HLASM-specific variables with IsHLASM HLASM is very different from the gas syntax. We don't expect other targets to customize the differences. 
Unify the numerous variables. --- llvm/include/llvm/MC/MCAsmInfo.h | 42 ++----------------- .../AsmPrinter/AsmPrinterInlineAsm.cpp | 2 +- llvm/lib/MC/MCParser/AsmLexer.cpp | 4 +- llvm/lib/MC/MCParser/AsmParser.cpp | 12 +++--- .../SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp | 7 +--- 5 files changed, 13 insertions(+), 54 deletions(-) diff --git a/llvm/include/llvm/MC/MCAsmInfo.h b/llvm/include/llvm/MC/MCAsmInfo.h index cf31c36cc4cef..fb49eb7645dfb 100644 --- a/llvm/include/llvm/MC/MCAsmInfo.h +++ b/llvm/include/llvm/MC/MCAsmInfo.h @@ -98,6 +98,9 @@ class MCAsmInfo { /// part of .global, .weak, .extern, and .comm. Default is false. bool HasVisibilityOnlyWithLinkage = false; + // True if using the HLASM dialect on z/OS. + bool IsHLASM = false; + /// This is the maximum possible length of an instruction, which is needed to /// compute the size of an inline asm. Defaults to 4. unsigned MaxInstLength = 4; @@ -110,14 +113,6 @@ class MCAsmInfo { /// the current PC. Defaults to false. bool DollarIsPC = false; - /// Allow '.' token, when not referencing an identifier or constant, to refer - /// to the current PC. Defaults to true. - bool DotIsPC = true; - - /// Whether the '*' token refers to the current PC. This is used for the - /// HLASM dialect. - bool StarIsPC = false; - /// This string, if specified, is used to separate instructions from each /// other when on the same line. Defaults to ';' const char *SeparatorString; @@ -126,10 +121,6 @@ class MCAsmInfo { /// "#" StringRef CommentString; - /// This indicates whether the comment string is only accepted as a comment - /// at the beginning of statements. Defaults to false. - bool RestrictCommentStringToStartOfStatement = false; - /// This indicates whether to allow additional "comment strings" to be lexed /// as a comment. Setting this attribute to true, will ensure that C-style /// line comments (// ..), C-style block comments (/* .. */), and "#" are @@ -138,16 +129,9 @@ class MCAsmInfo { /// Default is true. 
bool AllowAdditionalComments = true; - /// Should we emit the '\t' as the starting indentation marker for GNU inline - /// asm statements. Defaults to true. - bool EmitGNUAsmStartIndentationMarker = true; - /// This is appended to emitted labels. Defaults to ":" const char *LabelSuffix; - /// Emit labels in purely upper case. Defaults to false. - bool EmitLabelsInUpperCase = false; - // Print the EH begin symbol with an assignment. Defaults to false. bool UseAssignmentForEHBegin = false; @@ -209,13 +193,6 @@ class MCAsmInfo { /// still be lexed as a comment. bool AllowAtAtStartOfIdentifier = false; - /// This is true if the assembler allows the "#" character at the start of - /// a string to be lexed as an AsmToken::Identifier. - /// If the AsmLexer determines that the string can be lexed as a possible - /// comment, setting this option will have no effect, and the string will - /// still be lexed as a comment. - bool AllowHashAtStartOfIdentifier = false; - /// If this is true, symbol names with invalid characters will be printed in /// quotes. bool SupportsQuotedNames = true; @@ -590,6 +567,7 @@ class MCAsmInfo { // Accessors. 
+ bool isHLASM() const { return IsHLASM; } bool isMachO() const { return HasSubsectionsViaSymbols; } bool hasCOFFAssociativeComdats() const { return HasCOFFAssociativeComdats; } bool hasCOFFComdatConstants() const { return HasCOFFComdatConstants; } @@ -605,23 +583,14 @@ class MCAsmInfo { unsigned getMinInstAlignment() const { return MinInstAlignment; } bool getDollarIsPC() const { return DollarIsPC; } - bool getDotIsPC() const { return DotIsPC; } - bool getStarIsPC() const { return StarIsPC; } const char *getSeparatorString() const { return SeparatorString; } unsigned getCommentColumn() const { return CommentColumn; } void setCommentColumn(unsigned Col) { CommentColumn = Col; } StringRef getCommentString() const { return CommentString; } - bool getRestrictCommentStringToStartOfStatement() const { - return RestrictCommentStringToStartOfStatement; - } bool shouldAllowAdditionalComments() const { return AllowAdditionalComments; } - bool getEmitGNUAsmStartIndentationMarker() const { - return EmitGNUAsmStartIndentationMarker; - } const char *getLabelSuffix() const { return LabelSuffix; } - bool shouldEmitLabelsInUpperCase() const { return EmitLabelsInUpperCase; } bool useAssignmentForEHBegin() const { return UseAssignmentForEHBegin; } bool needsLocalForSize() const { return NeedsLocalForSize; } @@ -655,9 +624,6 @@ class MCAsmInfo { bool doesAllowDollarAtStartOfIdentifier() const { return AllowDollarAtStartOfIdentifier; } - bool doesAllowHashAtStartOfIdentifier() const { - return AllowHashAtStartOfIdentifier; - } bool supportsNameQuoting() const { return SupportsQuotedNames; } bool doesSupportDataRegionDirectives() const { diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp index ebae27e37b4fc..59fc4cfc23e10 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp @@ -153,7 +153,7 @@ static void EmitInlineAsmStr(const char *AsmStr, const 
MachineInstr *MI, AsmPrinterVariant = MMI->getTarget().unqualifiedInlineAsmVariant(); // FIXME: Should this happen for `asm inteldialect` as well? - if (!InputIsIntelDialect && MAI->getEmitGNUAsmStartIndentationMarker()) + if (!InputIsIntelDialect && !MAI->isHLASM()) OS << '\t'; while (*LastEmitted) { diff --git a/llvm/lib/MC/MCParser/AsmLexer.cpp b/llvm/lib/MC/MCParser/AsmLexer.cpp index 778ca340e1248..32b6e869cc636 100644 --- a/llvm/lib/MC/MCParser/AsmLexer.cpp +++ b/llvm/lib/MC/MCParser/AsmLexer.cpp @@ -707,7 +707,7 @@ size_t AsmLexer::peekTokens(MutableArrayRef Buf, } bool AsmLexer::isAtStartOfComment(const char *Ptr) { - if (MAI.getRestrictCommentStringToStartOfStatement() && !IsAtStartOfStatement) + if (MAI.isHLASM() && !IsAtStartOfStatement) return false; StringRef CommentString = MAI.getCommentString(); @@ -836,7 +836,7 @@ AsmToken AsmLexer::LexToken() { return LexIdentifier(); return AsmToken(AsmToken::At, StringRef(TokStart, 1)); case '#': - if (MAI.doesAllowHashAtStartOfIdentifier()) + if (MAI.isHLASM()) return LexIdentifier(); return AsmToken(AsmToken::Hash, StringRef(TokStart, 1)); case '?': diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index 153c1070a68c8..bf952df1b2418 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -1181,7 +1181,7 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc, if (getTok().is(AsmToken::Dollar) || getTok().is(AsmToken::Star)) { bool ShouldGenerateTempSymbol = false; if ((getTok().is(AsmToken::Dollar) && MAI.getDollarIsPC()) || - (getTok().is(AsmToken::Star) && MAI.getStarIsPC())) + (getTok().is(AsmToken::Star) && MAI.isHLASM())) ShouldGenerateTempSymbol = true; if (!ShouldGenerateTempSymbol) @@ -1248,8 +1248,8 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc, MCSymbol *Sym = getContext().getInlineAsmLabel(SymbolName); if (!Sym) - Sym = getContext().getOrCreateSymbol( - MAI.shouldEmitLabelsInUpperCase() ? 
SymbolName.upper() : SymbolName); + Sym = getContext().getOrCreateSymbol(MAI.isHLASM() ? SymbolName.upper() + : SymbolName); // If this is an absolute variable reference, substitute it now to preserve // semantics in the face of reassignment. @@ -1312,7 +1312,7 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc, return false; } case AsmToken::Dot: { - if (!MAI.getDotIsPC()) + if (MAI.isHLASM()) return TokError("cannot use . as current PC"); // This is a '.' reference, which references the current PC. Emit a @@ -6322,9 +6322,7 @@ bool HLASMAsmParser::parseAsHLASMLabel(ParseStatementInfo &Info, "Cannot have just a label for an HLASM inline asm statement"); MCSymbol *Sym = getContext().getOrCreateSymbol( - getContext().getAsmInfo()->shouldEmitLabelsInUpperCase() - ? LabelVal.upper() - : LabelVal); + getContext().getAsmInfo()->isHLASM() ? LabelVal.upper() : LabelVal); getTargetParser().doBeforeLabelEmit(Sym, LabelLoc); diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp index 06d7d422e647e..16e7d05b8fb73 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp @@ -29,19 +29,14 @@ SystemZMCAsmInfoGOFF::SystemZMCAsmInfoGOFF(const Triple &TT) { AllowAtInName = true; AllowAtAtStartOfIdentifier = true; AllowDollarAtStartOfIdentifier = true; - AllowHashAtStartOfIdentifier = true; AssemblerDialect = AD_HLASM; CalleeSaveStackSlotSize = 8; CodePointerSize = 8; CommentString = "*"; - DotIsPC = false; - EmitGNUAsmStartIndentationMarker = false; - EmitLabelsInUpperCase = true; ExceptionsType = ExceptionHandling::ZOS; + IsHLASM = true; IsLittleEndian = false; MaxInstLength = 6; - RestrictCommentStringToStartOfStatement = true; - StarIsPC = true; SupportsDebugInformation = true; } From 34f70007348d2d1a0e59fc0996e90a0b6fba1933 Mon Sep 17 00:00:00 2001 From: Congcong Cai Date: Wed, 25 Dec 2024 11:07:05 +0800 
Subject: [PATCH 038/567] [clang-tidy]link LLVMTargetParser (#121072) Fix build issue introduced in #120547 --- clang-tools-extra/clang-tidy/tool/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/clang-tools-extra/clang-tidy/tool/CMakeLists.txt b/clang-tools-extra/clang-tidy/tool/CMakeLists.txt index 81fba3bbf12fe..0d4501d1eac06 100644 --- a/clang-tools-extra/clang-tidy/tool/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/tool/CMakeLists.txt @@ -3,6 +3,7 @@ set(LLVM_LINK_COMPONENTS AllTargetsDescs AllTargetsInfos FrontendOpenMP + TargetParser support ) From d3846eca2061e6e9a8d654551153f7362c27b59a Mon Sep 17 00:00:00 2001 From: Kai Sasaki Date: Wed, 25 Dec 2024 12:19:52 +0900 Subject: [PATCH 039/567] [mlir] Guard sccp pass from crashing with different source type (#120656) Vector::BroadCastOp expects the identical element type in folding. It causes the crash if the different source type is given to the SCCP pass. We need to guard the pass from crashing if the nonidentical element type is given, but still compatible. (e.g. 
index vs integer type) https://github.com/llvm/llvm-project/issues/120193 --- mlir/lib/Dialect/Vector/IR/VectorOps.cpp | 12 ++++++++++-- mlir/test/Transforms/sccp.mlir | 9 +++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index 491b5f44b722b..ae1cf95732336 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -2523,8 +2523,16 @@ OpFoldResult BroadcastOp::fold(FoldAdaptor adaptor) { if (!adaptor.getSource()) return {}; auto vectorType = getResultVectorType(); - if (llvm::isa(adaptor.getSource())) - return DenseElementsAttr::get(vectorType, adaptor.getSource()); + if (auto attr = llvm::dyn_cast(adaptor.getSource())) { + if (vectorType.getElementType() != attr.getType()) + return {}; + return DenseElementsAttr::get(vectorType, attr); + } + if (auto attr = llvm::dyn_cast(adaptor.getSource())) { + if (vectorType.getElementType() != attr.getType()) + return {}; + return DenseElementsAttr::get(vectorType, attr); + } if (auto attr = llvm::dyn_cast(adaptor.getSource())) return DenseElementsAttr::get(vectorType, attr.getSplatValue()); return {}; diff --git a/mlir/test/Transforms/sccp.mlir b/mlir/test/Transforms/sccp.mlir index dcae052c29c24..c78c8594c0ba5 100644 --- a/mlir/test/Transforms/sccp.mlir +++ b/mlir/test/Transforms/sccp.mlir @@ -246,3 +246,12 @@ func.func @op_with_region() -> (i32) { ^b: return %1 : i32 } + +// CHECK-LABEL: no_crash_with_different_source_type +func.func @no_crash_with_different_source_type() { + // CHECK: llvm.mlir.constant(0 : index) : i64 + %0 = llvm.mlir.constant(0 : index) : i64 + // CHECK: vector.broadcast %[[CST:.*]] : i64 to vector<128xi64> + %1 = vector.broadcast %0 : i64 to vector<128xi64> + llvm.return +} From 1de228fa9bf07c118294b53e3da37ab2b5ff1fd4 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Tue, 24 Dec 2024 20:21:40 -0800 Subject: [PATCH 040/567] [ubsan] Remove 
UBSAN_CAN_USE_CXXABI (#121082) It's should be enough to provide weak implementation. Fixes solaris and android linking after #121006. --- compiler-rt/lib/ubsan/CMakeLists.txt | 3 --- compiler-rt/lib/ubsan/ubsan_handlers.cpp | 11 +---------- llvm/utils/gn/secondary/compiler-rt/lib/lsan/BUILD.gn | 2 -- .../utils/gn/secondary/compiler-rt/lib/ubsan/BUILD.gn | 3 --- 4 files changed, 1 insertion(+), 18 deletions(-) diff --git a/compiler-rt/lib/ubsan/CMakeLists.txt b/compiler-rt/lib/ubsan/CMakeLists.txt index 5d45a53d02dbd..a6c98c40ec772 100644 --- a/compiler-rt/lib/ubsan/CMakeLists.txt +++ b/compiler-rt/lib/ubsan/CMakeLists.txt @@ -43,18 +43,15 @@ include_directories(..) set(UBSAN_CFLAGS ${SANITIZER_COMMON_CFLAGS}) append_list_if(MSVC /Zl UBSAN_CFLAGS) append_rtti_flag(OFF UBSAN_CFLAGS) -append_list_if(SANITIZER_CAN_USE_CXXABI -DUBSAN_CAN_USE_CXXABI UBSAN_CFLAGS) # Too many existing bugs, needs cleanup. append_list_if(COMPILER_RT_HAS_WNO_FORMAT -Wno-format UBSAN_CFLAGS) set(UBSAN_STANDALONE_CFLAGS ${SANITIZER_COMMON_CFLAGS}) append_rtti_flag(OFF UBSAN_STANDALONE_CFLAGS) -append_list_if(SANITIZER_CAN_USE_CXXABI -DUBSAN_CAN_USE_CXXABI UBSAN_STANDALONE_CFLAGS) set(UBSAN_CXXFLAGS ${SANITIZER_COMMON_CFLAGS}) append_rtti_flag(ON UBSAN_CXXFLAGS) -append_list_if(SANITIZER_CAN_USE_CXXABI -DUBSAN_CAN_USE_CXXABI UBSAN_CXXFLAGS) # Silence warnings in system headers with MSVC. 
if(NOT CLANG_CL) diff --git a/compiler-rt/lib/ubsan/ubsan_handlers.cpp b/compiler-rt/lib/ubsan/ubsan_handlers.cpp index ac7001c74afb5..ee9e3ccd0b1f6 100644 --- a/compiler-rt/lib/ubsan/ubsan_handlers.cpp +++ b/compiler-rt/lib/ubsan/ubsan_handlers.cpp @@ -899,10 +899,7 @@ static void handleCFIBadIcall(CFICheckFailData *Data, ValueHandle Function, namespace __ubsan { -#ifdef UBSAN_CAN_USE_CXXABI - #ifdef _WIN32 - extern "C" void __ubsan_handle_cfi_bad_type_default(CFICheckFailData *Data, ValueHandle Vtable, bool ValidVtable, @@ -911,18 +908,12 @@ extern "C" void __ubsan_handle_cfi_bad_type_default(CFICheckFailData *Data, } WIN_WEAK_ALIAS(__ubsan_handle_cfi_bad_type, __ubsan_handle_cfi_bad_type_default) -#else -SANITIZER_WEAK_ATTRIBUTE #endif -void __ubsan_handle_cfi_bad_type(CFICheckFailData *Data, ValueHandle Vtable, - bool ValidVtable, ReportOptions Opts); - -#else +SANITIZER_WEAK_ATTRIBUTE void __ubsan_handle_cfi_bad_type(CFICheckFailData *Data, ValueHandle Vtable, bool ValidVtable, ReportOptions Opts) { Die(); } -#endif } // namespace __ubsan diff --git a/llvm/utils/gn/secondary/compiler-rt/lib/lsan/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/lib/lsan/BUILD.gn index 77b251030c671..2ac06c4e6c75f 100644 --- a/llvm/utils/gn/secondary/compiler-rt/lib/lsan/BUILD.gn +++ b/llvm/utils/gn/secondary/compiler-rt/lib/lsan/BUILD.gn @@ -1,7 +1,6 @@ source_set("common_sources") { configs -= [ "//llvm/utils/gn/build:llvm_code" ] configs += [ "//llvm/utils/gn/build:crt_code" ] - defines = [ "UBSAN_CAN_USE_CXXABI" ] deps = [ "//compiler-rt/lib/interception:sources", "//compiler-rt/lib/sanitizer_common:sources", @@ -18,7 +17,6 @@ source_set("common_sources") { source_set("sources") { configs -= [ "//llvm/utils/gn/build:llvm_code" ] configs += [ "//llvm/utils/gn/build:crt_code" ] - defines = [ "UBSAN_CAN_USE_CXXABI" ] deps = [ "//compiler-rt/lib/interception:sources", "//compiler-rt/lib/sanitizer_common:sources", diff --git a/llvm/utils/gn/secondary/compiler-rt/lib/ubsan/BUILD.gn 
b/llvm/utils/gn/secondary/compiler-rt/lib/ubsan/BUILD.gn index d3b4a406f8b50..c331193ca94c9 100644 --- a/llvm/utils/gn/secondary/compiler-rt/lib/ubsan/BUILD.gn +++ b/llvm/utils/gn/secondary/compiler-rt/lib/ubsan/BUILD.gn @@ -27,7 +27,6 @@ gen_version_script("version_script") { source_set("sources") { configs -= [ "//llvm/utils/gn/build:llvm_code" ] configs += [ "//llvm/utils/gn/build:crt_code" ] - defines = [ "UBSAN_CAN_USE_CXXABI" ] deps = [ "//compiler-rt/lib/interception:sources", "//compiler-rt/lib/sanitizer_common:sources", @@ -65,7 +64,6 @@ source_set("standalone_sources") { configs -= [ "//llvm/utils/gn/build:llvm_code" ] configs -= [ "//llvm/utils/gn/build:no_rtti" ] configs += [ "//llvm/utils/gn/build:crt_code" ] - defines = [ "UBSAN_CAN_USE_CXXABI" ] sources = [ "ubsan_diag_standalone.cpp", "ubsan_init_standalone.cpp", @@ -77,7 +75,6 @@ source_set("cxx_sources") { configs -= [ "//llvm/utils/gn/build:llvm_code" ] configs -= [ "//llvm/utils/gn/build:no_rtti" ] configs += [ "//llvm/utils/gn/build:crt_code" ] - defines = [ "UBSAN_CAN_USE_CXXABI" ] sources = [ "ubsan_handlers_cxx.cpp", "ubsan_handlers_cxx.h", From a1328c077c9bae5c3b3cb952fe817c4881569cbc Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Tue, 24 Dec 2024 20:26:24 -0800 Subject: [PATCH 041/567] [ubsan] Fix android build Broken by #121006. 
--- compiler-rt/lib/asan/tests/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/lib/asan/tests/CMakeLists.txt b/compiler-rt/lib/asan/tests/CMakeLists.txt index d80a9f11e50ee..00dcbf6534e28 100644 --- a/compiler-rt/lib/asan/tests/CMakeLists.txt +++ b/compiler-rt/lib/asan/tests/CMakeLists.txt @@ -301,7 +301,7 @@ if(ANDROID) $ $ $ - $> + $ ${COMPILER_RT_GTEST_SOURCE} ${ASAN_NOINST_TEST_SOURCES}) set_target_compile_flags(AsanNoinstTest ${ASAN_UNITTEST_COMMON_CFLAGS}) From 6a7687c455125f6597a9719227a0efcb7f71e572 Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Tue, 24 Dec 2024 20:35:58 -0800 Subject: [PATCH 042/567] [mlir][python] Support `CLANG_CL` (#121075) --- mlir/cmake/modules/AddMLIR.cmake | 2 +- mlir/cmake/modules/AddMLIRPython.cmake | 20 +++++++++++--------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/mlir/cmake/modules/AddMLIR.cmake b/mlir/cmake/modules/AddMLIR.cmake index e1e79593ec2cb..9c7b00b660ba7 100644 --- a/mlir/cmake/modules/AddMLIR.cmake +++ b/mlir/cmake/modules/AddMLIR.cmake @@ -584,7 +584,7 @@ function(add_mlir_aggregate name) # TODO: Should be transitive. set_target_properties(${name} PROPERTIES MLIR_AGGREGATE_EXCLUDE_LIBS "${_embed_libs}") - if(MSVC) + if(WIN32) set_property(TARGET ${name} PROPERTY WINDOWS_EXPORT_ALL_SYMBOLS ON) endif() diff --git a/mlir/cmake/modules/AddMLIRPython.cmake b/mlir/cmake/modules/AddMLIRPython.cmake index 53a70139fd5a6..9d4e06c7909c8 100644 --- a/mlir/cmake/modules/AddMLIRPython.cmake +++ b/mlir/cmake/modules/AddMLIRPython.cmake @@ -512,7 +512,7 @@ function(add_mlir_python_common_capi_library name) ) add_dependencies(${name} ${_header_sources_target}) - if(MSVC) + if(WIN32) set_property(TARGET ${name} PROPERTY WINDOWS_EXPORT_ALL_SYMBOLS ON) endif() set_target_properties(${name} PROPERTIES @@ -649,6 +649,15 @@ function(add_mlir_python_extension libname extname) message(FATAL_ERROR "Unhandled arguments to add_mlir_python_extension(${libname}, ... 
: ${ARG_UNPARSED_ARGUMENTS}") endif() + # The extension itself must be compiled with RTTI and exceptions enabled. + # Also, some warning classes triggered by pybind11 are disabled. + set(eh_rtti_enable) + if (MSVC) + set(eh_rtti_enable /EHsc /GR) + elseif(LLVM_COMPILER_IS_GCC_COMPATIBLE OR CLANG_CL) + set(eh_rtti_enable -frtti -fexceptions) + endif () + # The actual extension library produces a shared-object or DLL and has # sources that must be compiled in accordance with pybind11 needs (RTTI and # exceptions). @@ -671,18 +680,11 @@ function(add_mlir_python_extension libname extname) -Wno-nested-anon-types -Wno-c++98-compat-extra-semi -Wno-covered-switch-default + ${eh_rtti_enable} ) endif() endif() - # The extension itself must be compiled with RTTI and exceptions enabled. - # Also, some warning classes triggered by pybind11 are disabled. - set(eh_rtti_enable) - if (MSVC) - set(eh_rtti_enable /EHsc /GR) - elseif(LLVM_COMPILER_IS_GCC_COMPATIBLE) - set(eh_rtti_enable -frtti -fexceptions) - endif () target_compile_options(${libname} PRIVATE ${eh_rtti_enable}) # Configure the output to match python expectations. From 141c544c03702ac7c50522373ad781ede3685e0a Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Tue, 24 Dec 2024 21:47:53 -0800 Subject: [PATCH 043/567] [clang-format] Skip line splices when sorting C++ includes (#120680) Fixes #109864. 
--- clang/lib/Format/Format.cpp | 11 +++++++++-- clang/unittests/Format/SortIncludesTest.cpp | 12 ++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index dcaac4b0d42cc..95129a8fe9240 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -3246,8 +3246,15 @@ tooling::Replacements sortCppIncludes(const FormatStyle &Style, StringRef Code, SmallVector RawStringMatches; std::string RawStringTermination = ")\""; - for (;;) { - auto Pos = Code.find('\n', SearchFrom); + for (const auto Size = Code.size(); SearchFrom < Size;) { + size_t Pos = SearchFrom; + if (Code[SearchFrom] != '\n') { + do { // Search for the first newline while skipping line splices. + ++Pos; + Pos = Code.find('\n', Pos); + } while (Pos != StringRef::npos && Code[Pos - 1] == '\\'); + } + StringRef Line = Code.substr(Prev, (Pos != StringRef::npos ? Pos : Code.size()) - Prev); diff --git a/clang/unittests/Format/SortIncludesTest.cpp b/clang/unittests/Format/SortIncludesTest.cpp index 3175382564637..cb3f8c73a0487 100644 --- a/clang/unittests/Format/SortIncludesTest.cpp +++ b/clang/unittests/Format/SortIncludesTest.cpp @@ -984,6 +984,18 @@ TEST_F(SortIncludesTest, SortAndDeduplicateIncludes) { "#include \n" "#include ")); + verifyFormat("/* COPYRIGHT *\\\n" + "\\* (C) 2024 */\n" + "\n" + "#include \n" + "#include ", + sort("/* COPYRIGHT *\\\n" + "\\* (C) 2024 */\n" + "\n" + "#include \n" + "#include \n" + "#include ")); + Style.IncludeBlocks = tooling::IncludeStyle::IBS_Merge; verifyFormat("#include \n" "#include \n" From 5d81b1490022d04eb8862791fbcb25018a6860e3 Mon Sep 17 00:00:00 2001 From: Pranav Kant Date: Wed, 25 Dec 2024 06:14:04 +0000 Subject: [PATCH 044/567] [clang-tidy][bazel] Fix #120547 --- .../clang-tools-extra/clang-tidy/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-tidy/BUILD.bazel 
b/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-tidy/BUILD.bazel index db53df08fecc8..d8afbe37e8467 100644 --- a/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-tidy/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-tidy/BUILD.bazel @@ -368,6 +368,7 @@ cc_library( ":utils", "//clang:tooling", "//llvm:Support", + "//llvm:TargetParser", ], ) From 25bb6592c974aed8702767cad4a64de9b5c60aed Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 24 Dec 2024 22:46:12 -0800 Subject: [PATCH 045/567] MCAsmInfo: replace AIX-specific variables with IsAIX AIX assembly is very different from the gas syntax. We don't expect other targets to share these differences. Unify the numerous, essentially AIX-specific variables. --- llvm/include/llvm/MC/MCAsmInfo.h | 73 +--------------------- llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 27 +++----- llvm/lib/MC/MCAsmInfoXCOFF.cpp | 15 +---- llvm/lib/MC/MCAsmStreamer.cpp | 73 ++++++++++------------ llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp | 6 +- 5 files changed, 48 insertions(+), 146 deletions(-) diff --git a/llvm/include/llvm/MC/MCAsmInfo.h b/llvm/include/llvm/MC/MCAsmInfo.h index fb49eb7645dfb..ac2b8524315c3 100644 --- a/llvm/include/llvm/MC/MCAsmInfo.h +++ b/llvm/include/llvm/MC/MCAsmInfo.h @@ -94,9 +94,7 @@ class MCAsmInfo { /// constants into comdat sections. bool HasCOFFComdatConstants = false; - /// True if this is an XCOFF target that supports visibility attributes as - /// part of .global, .weak, .extern, and .comm. Default is false. - bool HasVisibilityOnlyWithLinkage = false; + bool IsAIX = false; // True if using the HLASM dialect on z/OS. bool IsHLASM = false; @@ -202,10 +200,6 @@ class MCAsmInfo { /// instead. bool UseDataRegionDirectives = false; - /// True if .align is to be used for alignment. Only power-of-two - /// alignment is supported. - bool UseDotAlignForAlignment = false; - /// True if the target supports LEB128 directives. 
bool HasLEB128Directives = true; @@ -220,11 +214,6 @@ class MCAsmInfo { /// "\t.zero\t" const char *ZeroDirective; - /// This should be set to true if the zero directive supports a value to emit - /// other than zero. If this is set to false, the Data*bitsDirective's will be - /// used to emit these bytes. Defaults to true. - bool ZeroDirectiveSupportsNonZeroValue = true; - /// This directive allows emission of an ascii string with the standard C /// escape characters embedded into it. If a target doesn't support this, it /// can be set to null. Defaults to "\t.ascii\t" @@ -235,16 +224,6 @@ class MCAsmInfo { /// doesn't support this, it can be set to null. Defaults to "\t.asciz\t" const char *AscizDirective; - /// This directive accepts a comma-separated list of bytes for emission as a - /// string of bytes. For targets that do not support this, it shall be set to - /// null. Defaults to null. - const char *ByteListDirective = nullptr; - - /// This directive allows emission of a zero-terminated ascii string without - /// the standard C escape characters embedded into it. If a target doesn't - /// support this, it can be set to null. Defaults to null. - const char *PlainStringDirective = nullptr; - /// Form used for character literals in the assembly syntax. Useful for /// producing strings as byte lists. If a target does not use or support /// this, it shall be set to ACLS_Unknown. Defaults to ACLS_Unknown. @@ -325,16 +304,6 @@ class MCAsmInfo { /// argument and how it is interpreted. Defaults to NoAlignment. LCOMM::LCOMMType LCOMMDirectiveAlignmentType = LCOMM::NoAlignment; - /// True if the target only has basename for .file directive. False if the - /// target also needs the directory along with the basename. Defaults to true. 
- bool HasBasenameOnlyForFileDirective = true; - - /// True if the target represents string constants as mostly raw characters in - /// paired double quotation with paired double quotation marks as the escape - /// mechanism to represent a double quotation mark within the string. Defaults - /// to false. - bool HasPairedDoubleQuoteStringConstants = false; - // True if the target allows .align directives on functions. This is true for // most targets, so defaults to true. bool HasFunctionAlignment = true; @@ -347,10 +316,6 @@ class MCAsmInfo { /// for ELF targets. Defaults to true. bool HasSingleParameterDotFile = true; - /// True if the target has a four strings .file directive, strings separated - /// by comma. Defaults to false. - bool HasFourStringsDotFile = false; - /// True if the target has a .ident directive, this is true for ELF targets. /// Defaults to false. bool HasIdentDirective = false; @@ -417,10 +382,6 @@ class MCAsmInfo { /// absolute difference. bool DwarfFDESymbolsUseAbsDiff = false; - /// True if the target supports generating the DWARF line table through using - /// the .loc/.file directives. Defaults to true. - bool UsesDwarfFileAndLocDirectives = true; - /// True if DWARF `.file directory' directive syntax is used by /// default. bool EnableDwarfFileDirectoryDefault = true; @@ -484,9 +445,6 @@ class MCAsmInfo { // If true, use Motorola-style integers in Assembly (ex. $0ac). bool UseMotorolaIntegers = false; - // If true, emit function descriptor symbol on AIX. - bool NeedsFunctionDescriptors = false; - public: explicit MCAsmInfo(); virtual ~MCAsmInfo(); @@ -567,13 +525,11 @@ class MCAsmInfo { // Accessors. 
+ bool isAIX() const { return IsAIX; } bool isHLASM() const { return IsHLASM; } bool isMachO() const { return HasSubsectionsViaSymbols; } bool hasCOFFAssociativeComdats() const { return HasCOFFAssociativeComdats; } bool hasCOFFComdatConstants() const { return HasCOFFComdatConstants; } - bool hasVisibilityOnlyWithLinkage() const { - return HasVisibilityOnlyWithLinkage; - } /// Returns the maximum possible encoded instruction size in bytes. If \p STI /// is null, this should be the maximum size for any subtarget. @@ -630,23 +586,14 @@ class MCAsmInfo { return UseDataRegionDirectives; } - bool useDotAlignForAlignment() const { - return UseDotAlignForAlignment; - } - bool hasLEB128Directives() const { return HasLEB128Directives; } bool useFullRegisterNames() const { return PPCUseFullRegisterNames; } void setFullRegisterNames(bool V) { PPCUseFullRegisterNames = V; } const char *getZeroDirective() const { return ZeroDirective; } - bool doesZeroDirectiveSupportNonZeroValue() const { - return ZeroDirectiveSupportsNonZeroValue; - } const char *getAsciiDirective() const { return AsciiDirective; } const char *getAscizDirective() const { return AscizDirective; } - const char *getByteListDirective() const { return ByteListDirective; } - const char *getPlainStringDirective() const { return PlainStringDirective; } AsmCharLiteralSyntax characterLiteralSyntax() const { return CharacterLiteralSyntax; } @@ -666,16 +613,9 @@ class MCAsmInfo { return LCOMMDirectiveAlignmentType; } - bool hasBasenameOnlyForFileDirective() const { - return HasBasenameOnlyForFileDirective; - } - bool hasPairedDoubleQuoteStringConstants() const { - return HasPairedDoubleQuoteStringConstants; - } bool hasFunctionAlignment() const { return HasFunctionAlignment; } bool hasDotTypeDotSizeDirective() const { return HasDotTypeDotSizeDirective; } bool hasSingleParameterDotFile() const { return HasSingleParameterDotFile; } - bool hasFourStringsDotFile() const { return HasFourStringsDotFile; } bool 
hasIdentDirective() const { return HasIdentDirective; } bool hasNoDeadStrip() const { return HasNoDeadStrip; } const char *getWeakDirective() const { return WeakDirective; } @@ -742,13 +682,7 @@ class MCAsmInfo { return SupportsExtendedDwarfLocDirective; } - bool usesDwarfFileAndLocDirectives() const { - return UsesDwarfFileAndLocDirectives; - } - - bool needsDwarfSectionSizeInHeader() const { - return DwarfSectionSizeRequired; - } + bool usesDwarfFileAndLocDirectives() const { return !IsAIX; } bool enableDwarfFileDirectoryDefault() const { return EnableDwarfFileDirectoryDefault; @@ -798,7 +732,6 @@ class MCAsmInfo { bool shouldUseLogicalShr() const { return UseLogicalShr; } bool hasMipsExpressions() const { return HasMipsExpressions; } - bool needsFunctionDescriptors() const { return NeedsFunctionDescriptors; } bool shouldUseMotorolaIntegers() const { return UseMotorolaIntegers; } }; diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index d34fe0e86c749..7bd3fb33b47d2 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -503,13 +503,7 @@ bool AsmPrinter::doInitialization(Module &M) { // don't, this at least helps the user find where a global came from. if (MAI->hasSingleParameterDotFile()) { // .file "foo.c" - - SmallString<128> FileName; - if (MAI->hasBasenameOnlyForFileDirective()) - FileName = llvm::sys::path::filename(M.getSourceFileName()); - else - FileName = M.getSourceFileName(); - if (MAI->hasFourStringsDotFile()) { + if (MAI->isAIX()) { const char VerStr[] = #ifdef PACKAGE_VENDOR PACKAGE_VENDOR " " @@ -520,9 +514,10 @@ bool AsmPrinter::doInitialization(Module &M) { #endif ; // TODO: Add timestamp and description. 
- OutStreamer->emitFileDirective(FileName, VerStr, "", ""); + OutStreamer->emitFileDirective(M.getSourceFileName(), VerStr, "", ""); } else { - OutStreamer->emitFileDirective(FileName); + OutStreamer->emitFileDirective( + llvm::sys::path::filename(M.getSourceFileName())); } } @@ -967,11 +962,10 @@ void AsmPrinter::emitFunctionHeader() { MF->setSection(getObjFileLowering().SectionForGlobal(&F, TM)); OutStreamer->switchSection(MF->getSection()); - if (!MAI->hasVisibilityOnlyWithLinkage()) - emitVisibility(CurrentFnSym, F.getVisibility()); - - if (MAI->needsFunctionDescriptors()) + if (MAI->isAIX()) emitLinkage(&F, CurrentFnDescSym); + else + emitVisibility(CurrentFnSym, F.getVisibility()); emitLinkage(&F, CurrentFnSym); if (MAI->hasFunctionAlignment()) @@ -1031,7 +1025,7 @@ void AsmPrinter::emitFunctionHeader() { // to emit their specific function descriptor. Right now it is only used by // the AIX target. The PowerPC 64-bit V1 ELF target also uses function // descriptors and should be converted to use this hook as well. - if (MAI->needsFunctionDescriptors()) + if (MAI->isAIX()) emitFunctionDescriptor(); // Emit the CurrentFnSym. This is a virtual function to allow targets to do @@ -2234,9 +2228,6 @@ void AsmPrinter::emitGlobalAlias(const Module &M, const GlobalAlias &GA) { // point, all the extra label is emitted, we just have to emit linkage for // those labels. if (TM.getTargetTriple().isOSBinFormatXCOFF()) { - assert(MAI->hasVisibilityOnlyWithLinkage() && - "Visibility should be handled with emitLinkage() on AIX."); - // Linkage for alias of global variable has been emitted. if (isa(GA.getAliaseeObject())) return; @@ -2730,7 +2721,7 @@ void AsmPrinter::SetupMachineFunction(MachineFunction &MF) { HasNoSplitStack = true; // Get the function symbol. 
- if (!MAI->needsFunctionDescriptors()) { + if (!MAI->isAIX()) { CurrentFnSym = getSymbol(&MF.getFunction()); } else { assert(TM.getTargetTriple().isOSAIX() && diff --git a/llvm/lib/MC/MCAsmInfoXCOFF.cpp b/llvm/lib/MC/MCAsmInfoXCOFF.cpp index b07e95e45d551..6ef11ba6e8d55 100644 --- a/llvm/lib/MC/MCAsmInfoXCOFF.cpp +++ b/llvm/lib/MC/MCAsmInfoXCOFF.cpp @@ -19,29 +19,17 @@ extern cl::opt UseLEB128Directives; void MCAsmInfoXCOFF::anchor() {} MCAsmInfoXCOFF::MCAsmInfoXCOFF() { + IsAIX = true; IsLittleEndian = false; - HasVisibilityOnlyWithLinkage = true; - HasBasenameOnlyForFileDirective = false; - HasFourStringsDotFile = true; - - // For XCOFF, string constant consists of any number of characters enclosed in - // "" (double quotation marks). - HasPairedDoubleQuoteStringConstants = true; PrivateGlobalPrefix = "L.."; PrivateLabelPrefix = "L.."; SupportsQuotedNames = false; - UseDotAlignForAlignment = true; - UsesDwarfFileAndLocDirectives = false; - DwarfSectionSizeRequired = false; if (UseLEB128Directives == cl::BOU_UNSET) HasLEB128Directives = false; ZeroDirective = "\t.space\t"; - ZeroDirectiveSupportsNonZeroValue = false; AsciiDirective = nullptr; // not supported AscizDirective = nullptr; // not supported - ByteListDirective = "\t.byte\t"; - PlainStringDirective = "\t.string\t"; CharacterLiteralSyntax = ACLS_SingleQuotePrefix; // Use .vbyte for data definition to avoid directives that apply an implicit @@ -53,7 +41,6 @@ MCAsmInfoXCOFF::MCAsmInfoXCOFF() { LCOMMDirectiveAlignmentType = LCOMM::Log2Alignment; HasDotTypeDotSizeDirective = false; ParseInlineAsmUsingAsmParser = true; - NeedsFunctionDescriptors = true; ExceptionsType = ExceptionHandling::AIX; } diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp index 78fed7792ad8a..32f1d63218749 100644 --- a/llvm/lib/MC/MCAsmStreamer.cpp +++ b/llvm/lib/MC/MCAsmStreamer.cpp @@ -1219,7 +1219,7 @@ static void PrintByteList(StringRef Data, raw_ostream &OS, void MCAsmStreamer::PrintQuotedString(StringRef 
Data, raw_ostream &OS) const { OS << '"'; - if (MAI->hasPairedDoubleQuoteStringConstants()) { + if (MAI->isAIX()) { for (unsigned char C : Data) { if (C == '"') OS << "\"\""; @@ -1273,6 +1273,25 @@ void MCAsmStreamer::emitBytes(StringRef Data) { if (Data.empty()) return; const auto emitAsString = [this](StringRef Data) { + if (MAI->isAIX()) { + if (isPrintableString(Data)) { + // For target with DoubleQuoteString constants, .string and .byte are + // used as replacement of .asciz and .ascii. + if (Data.back() == 0) { + OS << "\t.string\t"; + Data = Data.substr(0, Data.size() - 1); + } else { + OS << "\t.byte\t"; + } + PrintQuotedString(Data, OS); + } else { + OS << "\t.byte\t"; + PrintByteList(Data, OS, MAI->characterLiteralSyntax()); + } + EmitEOL(); + return true; + } + // If the data ends with 0 and the target supports .asciz, use it, otherwise // use .ascii or a byte-list directive if (MAI->getAscizDirective() && Data.back() == 0) { @@ -1280,27 +1299,6 @@ void MCAsmStreamer::emitBytes(StringRef Data) { Data = Data.substr(0, Data.size() - 1); } else if (LLVM_LIKELY(MAI->getAsciiDirective())) { OS << MAI->getAsciiDirective(); - } else if (MAI->hasPairedDoubleQuoteStringConstants() && - isPrintableString(Data)) { - // For target with DoubleQuoteString constants, .string and .byte are used - // as replacement of .asciz and .ascii. 
- assert(MAI->getPlainStringDirective() && - "hasPairedDoubleQuoteStringConstants target must support " - "PlainString Directive"); - assert(MAI->getByteListDirective() && - "hasPairedDoubleQuoteStringConstants target must support ByteList " - "Directive"); - if (Data.back() == 0) { - OS << MAI->getPlainStringDirective(); - Data = Data.substr(0, Data.size() - 1); - } else { - OS << MAI->getByteListDirective(); - } - } else if (MAI->getByteListDirective()) { - OS << MAI->getByteListDirective(); - PrintByteList(Data, OS, MAI->characterLiteralSyntax()); - EmitEOL(); - return true; } else { return false; } @@ -1483,7 +1481,7 @@ void MCAsmStreamer::emitFill(const MCExpr &NumBytes, uint64_t FillValue, return; if (const char *ZeroDirective = MAI->getZeroDirective()) { - if (MAI->doesZeroDirectiveSupportNonZeroValue() || FillValue == 0) { + if (!MAI->isAIX() || FillValue == 0) { // FIXME: Emit location directives OS << ZeroDirective; NumBytes.print(OS, MAI); @@ -1519,7 +1517,7 @@ void MCAsmStreamer::emitAlignmentDirective(uint64_t ByteAlignment, std::optional Value, unsigned ValueSize, unsigned MaxBytesToEmit) { - if (MAI->useDotAlignForAlignment()) { + if (MAI->isAIX()) { if (!isPowerOf2_64(ByteAlignment)) report_fatal_error("Only power-of-two alignments are supported " "with .align."); @@ -1623,7 +1621,7 @@ void MCAsmStreamer::emitFileDirective(StringRef Filename, StringRef CompilerVersion, StringRef TimeStamp, StringRef Description) { - assert(MAI->hasFourStringsDotFile()); + assert(MAI->isAIX()); OS << "\t.file\t"; PrintQuotedString(Filename, OS); bool useTimeStamp = !TimeStamp.empty(); @@ -1694,8 +1692,7 @@ Expected MCAsmStreamer::tryEmitDwarfFileDirective( // Return early if this file is already emitted before or if target doesn't // support .file directive. 
- if (NumFiles == Table.getMCDwarfFiles().size() || - !MAI->usesDwarfFileAndLocDirectives()) + if (NumFiles == Table.getMCDwarfFiles().size() || MAI->isAIX()) return FileNo; SmallString<128> Str; @@ -1724,7 +1721,7 @@ void MCAsmStreamer::emitDwarfFile0Directive( Source); // Target doesn't support .loc/.file directives, return early. - if (!MAI->usesDwarfFileAndLocDirectives()) + if (MAI->isAIX()) return; SmallString<128> Str; @@ -1744,7 +1741,7 @@ void MCAsmStreamer::emitDwarfLocDirective(unsigned FileNo, unsigned Line, StringRef FileName) { // If target doesn't support .loc/.file directive, we need to record the lines // same way like we do in object mode. - if (!MAI->usesDwarfFileAndLocDirectives()) { + if (MAI->isAIX()) { // In case we see two .loc directives in a row, make sure the // first one gets a line entry. MCDwarfLineEntry::make(this, getCurrentSectionOnly()); @@ -2444,7 +2441,7 @@ void MCAsmStreamer::AddEncodingComment(const MCInst &Inst, void MCAsmStreamer::emitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) { - if (!MAI->usesDwarfFileAndLocDirectives() && CurFrag) + if (MAI->isAIX() && CurFrag) // Now that a machine instruction has been assembled into this section, make // a line entry for any .loc directive that has been seen. MCDwarfLineEntry::make(this, getCurrentSectionOnly()); @@ -2547,7 +2544,7 @@ void MCAsmStreamer::finishImpl() { // Now it is time to emit debug line sections if target doesn't support .loc // and .line directives. - if (!MAI->usesDwarfFileAndLocDirectives()) { + if (MAI->isAIX()) { MCDwarfLineTable::emit(this, getAssembler().getDWARFLinetableParams()); return; } @@ -2572,7 +2569,7 @@ void MCAsmStreamer::emitDwarfUnitLength(uint64_t Length, const Twine &Comment) { // the debug section headers. In such cases, any label we placed occurs // after the implied length field. We need to adjust the reference here // to account for the offset introduced by the inserted length field. 
- if (!MAI->needsDwarfSectionSizeInHeader()) + if (MAI->isAIX()) return; MCStreamer::emitDwarfUnitLength(Length, Comment); } @@ -2585,7 +2582,7 @@ MCSymbol *MCAsmStreamer::emitDwarfUnitLength(const Twine &Prefix, // the debug section headers. In such cases, any label we placed occurs // after the implied length field. We need to adjust the reference here // to account for the offset introduced by the inserted length field. - if (!MAI->needsDwarfSectionSizeInHeader()) + if (MAI->isAIX()) return getContext().createTempSymbol(Prefix + "_end"); return MCStreamer::emitDwarfUnitLength(Prefix, Comment); } @@ -2598,7 +2595,7 @@ void MCAsmStreamer::emitDwarfLineStartLabel(MCSymbol *StartSym) { // after the implied length field. We need to adjust the reference here // to account for the offset introduced by the inserted length field. MCContext &Ctx = getContext(); - if (!MAI->needsDwarfSectionSizeInHeader()) { + if (MAI->isAIX()) { MCSymbol *DebugLineSymTmp = Ctx.createTempSymbol("debug_line_"); // Emit the symbol which does not contain the unit length field. emitLabel(DebugLineSymTmp); @@ -2625,7 +2622,7 @@ void MCAsmStreamer::emitDwarfLineEndEntry(MCSection *Section, // we currently use the .text end label as any section end. This will not // impact the debugability as we will jump to the caller of the last function // in the section before we come into the .text end address. - assert(!MAI->usesDwarfFileAndLocDirectives() && + assert(MAI->isAIX() && ".loc should not be generated together with raw data!"); MCContext &Ctx = getContext(); @@ -2648,7 +2645,7 @@ void MCAsmStreamer::emitDwarfAdvanceLineAddr(int64_t LineDelta, const MCSymbol *LastLabel, const MCSymbol *Label, unsigned PointerSize) { - assert(!MAI->usesDwarfFileAndLocDirectives() && + assert(MAI->isAIX() && ".loc/.file don't need raw data in debug line section!"); // Set to new address. 
@@ -2685,9 +2682,7 @@ void MCAsmStreamer::emitDwarfAdvanceLineAddr(int64_t LineDelta, void MCAsmStreamer::doFinalizationAtSectionEnd(MCSection *Section) { // Emit section end. This is used to tell the debug line section where the end // is for a text section if we don't use .loc to represent the debug line. - if (MAI->usesDwarfFileAndLocDirectives()) - return; - + assert(MAI->isAIX()); switchSectionNoPrint(Section); MCSymbol *Sym = getCurrentSectionOnly()->getEndSymbol(getContext()); diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 073857e4c2dc0..33e07915e735b 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -2230,10 +2230,6 @@ void PPCLinuxAsmPrinter::emitFunctionBodyEnd() { void PPCAIXAsmPrinter::emitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const { - - assert(MAI->hasVisibilityOnlyWithLinkage() && - "AIX's linkage directives take a visibility setting."); - MCSymbolAttr LinkageAttr = MCSA_Invalid; switch (GV->getLinkage()) { case GlobalValue::ExternalLinkage: @@ -3251,7 +3247,7 @@ void PPCAIXAsmPrinter::emitInstruction(const MachineInstr *MI) { bool PPCAIXAsmPrinter::doFinalization(Module &M) { // Do streamer related finalization for DWARF. 
- if (!MAI->usesDwarfFileAndLocDirectives() && hasDebugInfo()) + if (hasDebugInfo()) OutStreamer->doFinalizationAtSectionEnd( OutStreamer->getContext().getObjectFileInfo()->getTextSection()); From 32962f2b77c4298f9ef58182581edf56c76ac685 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Tue, 24 Dec 2024 23:03:47 -0800 Subject: [PATCH 046/567] [ubsan] Try to fix Windows --- compiler-rt/lib/ubsan/ubsan_handlers.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/compiler-rt/lib/ubsan/ubsan_handlers.cpp b/compiler-rt/lib/ubsan/ubsan_handlers.cpp index ee9e3ccd0b1f6..63319f46734a4 100644 --- a/compiler-rt/lib/ubsan/ubsan_handlers.cpp +++ b/compiler-rt/lib/ubsan/ubsan_handlers.cpp @@ -908,14 +908,17 @@ extern "C" void __ubsan_handle_cfi_bad_type_default(CFICheckFailData *Data, } WIN_WEAK_ALIAS(__ubsan_handle_cfi_bad_type, __ubsan_handle_cfi_bad_type_default) -#endif +void __ubsan_handle_cfi_bad_type(CFICheckFailData *Data, ValueHandle Vtable, + bool ValidVtable, ReportOptions Opts); +#else SANITIZER_WEAK_ATTRIBUTE void __ubsan_handle_cfi_bad_type(CFICheckFailData *Data, ValueHandle Vtable, bool ValidVtable, ReportOptions Opts) { Die(); } +#endif -} // namespace __ubsan +} // namespace __ubsan void __ubsan::__ubsan_handle_cfi_check_fail(CFICheckFailData *Data, ValueHandle Value, From e3846c0fecb5992110788be18cd04f70103bc778 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20G=C3=B3rny?= Date: Wed, 25 Dec 2024 07:12:23 +0000 Subject: [PATCH 047/567] [mlir] [test] Fix missing SHLIB definition in standalone builds (#120907) Define the `LLVM_SHLIB_OUTPUT_INTDIR` variable that is used by `configure_lit_site_cfg` to fill `SHLIBDIR`. This fixes tool tests that would otherwise be unable to find MLIR's runtime shared libraries (e.g. `libmlir_runner_utils.so`). The logic is copied verbatim from Clang. 
--- mlir/CMakeLists.txt | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt index 0608eef15c5a4..99ea1b70fbc4d 100644 --- a/mlir/CMakeLists.txt +++ b/mlir/CMakeLists.txt @@ -36,6 +36,15 @@ if(MLIR_STANDALONE_BUILD) "${CMAKE_CURRENT_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}") set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/bin") + # These definitions are needed to fill SHLIBDIR in tests. + set(LLVM_RUNTIME_OUTPUT_INTDIR ${CMAKE_BINARY_DIR}/${CMAKE_CFG_INTDIR}/bin) + set(LLVM_LIBRARY_OUTPUT_INTDIR ${CMAKE_BINARY_DIR}/${CMAKE_CFG_INTDIR}/lib${LLVM_LIBDIR_SUFFIX}) + if(WIN32 OR CYGWIN) + # DLL platform -- put DLLs into bin. + set(LLVM_SHLIB_OUTPUT_INTDIR ${LLVM_RUNTIME_OUTPUT_INTDIR}) + else() + set(LLVM_SHLIB_OUTPUT_INTDIR ${LLVM_LIBRARY_OUTPUT_INTDIR}) + endif() set(LLVM_LIT_ARGS "-sv" CACHE STRING "Default options for lit") endif() From 6b471b30d7dbce589af16b39f9eb960195a8cd02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20G=C3=B3rny?= Date: Wed, 25 Dec 2024 07:12:47 +0000 Subject: [PATCH 048/567] [mlir] [test] Fix unittests in standalone builds (#120910) Fix the logic used to run unit tests to account for `llvm_gtest` targets being installed, since 91b3ca39667b6341a8c1983a1467fae14b58318b. This involves removing a rule that would cause a duplicate `llvm_gtest` target being created, and updates the method for determining whether unittests can be run to checking whether the target is present, rather than the source directory (that is no longer actually necessary). 
--- mlir/CMakeLists.txt | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt index 99ea1b70fbc4d..5ea49c0dbfa7e 100644 --- a/mlir/CMakeLists.txt +++ b/mlir/CMakeLists.txt @@ -27,11 +27,6 @@ if(MLIR_STANDALONE_BUILD) include_directories(${LLVM_INCLUDE_DIRS}) - set(UNITTEST_DIR ${LLVM_THIRD_PARTY_DIR}/unittest) - if(EXISTS ${UNITTEST_DIR}/googletest/include/gtest/gtest.h) - add_subdirectory(${UNITTEST_DIR} third-party/unittest) - endif() - set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}") set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/bin") @@ -227,7 +222,7 @@ if (MLIR_INCLUDE_TESTS) add_definitions(-DMLIR_INCLUDE_TESTS) add_custom_target(MLIRUnitTests) set_target_properties(MLIRUnitTests PROPERTIES FOLDER "MLIR/Tests") - if (EXISTS ${LLVM_THIRD_PARTY_DIR}/unittest/googletest/include/gtest/gtest.h) + if (TARGET llvm_gtest) add_subdirectory(unittests) else() message(WARNING "gtest not found, unittests will not be available") From 9e38e87c8cd39403682f6d4f65fe7b1e6a04955b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20G=C3=B3rny?= Date: Wed, 25 Dec 2024 07:13:07 +0000 Subject: [PATCH 049/567] [mlir] [test] Do not add dependencies on llvm tools in standalone builds (#120911) Since LLVM tools are installed system-wide, adding dependencies on them is unnecessary. Furthermore, it is problematic for multilib builds, where the tools are only built once, for the native ABI, and therefore are not listed in CMake files for non-native ABIs. 
--- mlir/test/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt index f181a91328f3f..58d16a657297e 100644 --- a/mlir/test/CMakeLists.txt +++ b/mlir/test/CMakeLists.txt @@ -99,7 +99,6 @@ configure_lit_site_cfg( ) set(MLIR_TEST_DEPENDS - FileCheck count not split-file mlir-capi-ir-test mlir-capi-irdl-test mlir-capi-llvm-test @@ -121,6 +120,9 @@ set(MLIR_TEST_DEPENDS tblgen-lsp-server tblgen-to-irdl ) +if(NOT MLIR_STANDALONE_BUILD) + list(APPEND MLIR_TEST_DEPENDS FileCheck count not split-file) +endif() set(MLIR_TEST_DEPENDS ${MLIR_TEST_DEPENDS} mlir-capi-pdl-test From 319b89197348b7cad1215e235bdc7b5ec8f9b72c Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 24 Dec 2024 23:20:33 -0800 Subject: [PATCH 050/567] MCAsmInfo: remove unused DwarfSectionSizeRequired --- llvm/include/llvm/MC/MCAsmInfo.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/llvm/include/llvm/MC/MCAsmInfo.h b/llvm/include/llvm/MC/MCAsmInfo.h index ac2b8524315c3..f4de106860d35 100644 --- a/llvm/include/llvm/MC/MCAsmInfo.h +++ b/llvm/include/llvm/MC/MCAsmInfo.h @@ -386,10 +386,6 @@ class MCAsmInfo { /// default. bool EnableDwarfFileDirectoryDefault = true; - /// True if the target needs the DWARF section length in the header (if any) - /// of the DWARF section in the assembly file. Defaults to true. - bool DwarfSectionSizeRequired = true; - /// True if dwarf register numbers are printed instead of symbolic register /// names in .cfi_* directives. Defaults to false. bool DwarfRegNumForCFI = false; From 74496b03f7db43caae4c3bda79379623a4657a00 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Tue, 24 Dec 2024 23:36:29 -0800 Subject: [PATCH 051/567] [asan][hwasan] Link RTUbsan_cxx into shared runtime There is no shared version RTUbsan_cxx. Fix android after #121006. 
--- compiler-rt/lib/asan/CMakeLists.txt | 1 + compiler-rt/lib/hwasan/CMakeLists.txt | 1 + 2 files changed, 2 insertions(+) diff --git a/compiler-rt/lib/asan/CMakeLists.txt b/compiler-rt/lib/asan/CMakeLists.txt index a2c15806f81a2..e2f39f224df9c 100644 --- a/compiler-rt/lib/asan/CMakeLists.txt +++ b/compiler-rt/lib/asan/CMakeLists.txt @@ -318,6 +318,7 @@ else() # add_dependencies(clang_rt.asan-dynamic-${arch} clang_rt.asan-dynamic-${arch}-version-list) # generates an order-only dependency in ninja. RTAsan_dynamic_version_script_dummy + RTUbsan_cxx ${ASAN_DYNAMIC_WEAK_INTERCEPTION} CFLAGS ${ASAN_DYNAMIC_CFLAGS} LINK_FLAGS ${ASAN_DYNAMIC_LINK_FLAGS} diff --git a/compiler-rt/lib/hwasan/CMakeLists.txt b/compiler-rt/lib/hwasan/CMakeLists.txt index 4372603b45a48..207394130d035 100644 --- a/compiler-rt/lib/hwasan/CMakeLists.txt +++ b/compiler-rt/lib/hwasan/CMakeLists.txt @@ -219,6 +219,7 @@ function(add_hwasan_runtimes arch use_aliases) RTSanitizerCommonSymbolizerInternal RTLSanCommon RTUbsan + RTUbsan_cxx # The only purpose of RTHWAsan_dynamic_version_script_dummy is to # carry a dependency of the shared runtime on the version script. # Replacing it with a straightforward From 2d3d62d77e2d011c9dbdb12732aca3070efc83ca Mon Sep 17 00:00:00 2001 From: Ivan Aksamentov Date: Wed, 25 Dec 2024 08:47:30 +0100 Subject: [PATCH 052/567] [flang] fix: split ifndef for CHECK and CHECK_MSG (#114707) Resolves https://github.com/llvm/llvm-project/issues/114703 I think it's the best practice that each macro has it's own `ifndef` check and this way the build issue is resolved for me. I also find the names of these macro a bit too generic - an easy recipe for conflicts. In my case, the error was likely caused by something else defining `CHECK` but not `CHECK_MSG`, so likely these `CHECK` and `CHECK_MSG` weren't actually working at all because the result of `ifndef` is always false. As a definitive fix, perhaps it makes sense to rename them to something more specific, e.g. 
`FLANG_CHECK` and `FLANG_CHECK_MSG`. --- flang/include/flang/Common/idioms.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/flang/include/flang/Common/idioms.h b/flang/include/flang/Common/idioms.h index 99f383ec75b99..06631bcf2e445 100644 --- a/flang/include/flang/Common/idioms.h +++ b/flang/include/flang/Common/idioms.h @@ -87,7 +87,10 @@ template visitors(LAMBDAS... x) -> visitors; // To disable, compile with '-DCHECK=(void)' #ifndef CHECK #define CHECK(x) ((x) || (DIE("CHECK(" #x ") failed"), false)) +#endif + // Same as above, but with a custom error message. +#ifndef CHECK_MSG #define CHECK_MSG(x, y) ((x) || (DIE("CHECK(" #x ") failed: " #y), false)) #endif From c29536b0336586b2ed7bafedf82c9f4e254cfaa6 Mon Sep 17 00:00:00 2001 From: Sergei Barannikov Date: Wed, 25 Dec 2024 11:32:37 +0300 Subject: [PATCH 053/567] [test] Group GlobalISelEmitter tests under a subdirectory (#121093) Remove extra command line arguments while here. --- .../{ => GlobalISelEmitter}/ContextlessPredicates.td | 4 ++-- .../CustomPredicate.td} | 2 +- .../{ => GlobalISelEmitter}/DefaultOpsGlobalISel.td | 2 +- .../Flags.td} | 2 +- .../TableGen/{ => GlobalISelEmitter}/GlobalISelEmitter.td | 6 +++--- .../HwModes.td} | 2 +- .../MatchTableOptimizer.td} | 2 +- .../MatchTableOptimizerSameOperand-invalid.td} | 2 +- .../MatchTableOptimizerSameOperand.td} | 2 +- .../OverloadedPtr.td} | 2 +- .../PR39045.td} | 2 +- .../RegSequence.td} | 2 +- .../SDNodeXForm-timm.td} | 2 +- .../SkippedPatterns.td} | 4 ++-- .../Subreg.td} | 2 +- .../Variadic.td} | 2 +- .../atomic-store.td} | 2 +- .../frameindex.td} | 2 +- .../TableGen/{ => GlobalISelEmitter}/gisel-physreg-input.td | 2 +- .../immAllZeroOne.td} | 4 ++-- .../immarg-literal-pattern.td} | 2 +- .../TableGen/{ => GlobalISelEmitter}/immarg-predicated.td | 2 +- llvm/test/TableGen/{ => GlobalISelEmitter}/immarg.td | 2 +- .../implicit-defs.td} | 2 +- .../input-discard.td} | 2 +- .../multiple-output-discard.td} | 2 +- .../multiple-output.td} | 2 +- 
.../nested-subregs.td} | 2 +- .../notype-output-pattern.td} | 2 +- .../optional-def.td} | 2 +- .../output-discard.td} | 2 +- .../setcc.td} | 2 +- .../zero-instr.td} | 2 +- .../zero-reg.td} | 2 +- 34 files changed, 39 insertions(+), 39 deletions(-) rename llvm/test/TableGen/{ => GlobalISelEmitter}/ContextlessPredicates.td (94%) rename llvm/test/TableGen/{GlobalISelEmitterCustomPredicate.td => GlobalISelEmitter/CustomPredicate.td} (99%) rename llvm/test/TableGen/{ => GlobalISelEmitter}/DefaultOpsGlobalISel.td (99%) rename llvm/test/TableGen/{GlobalISelEmitterFlags.td => GlobalISelEmitter/Flags.td} (98%) rename llvm/test/TableGen/{ => GlobalISelEmitter}/GlobalISelEmitter.td (99%) rename llvm/test/TableGen/{GlobalISelEmitterHwModes.td => GlobalISelEmitter/HwModes.td} (98%) rename llvm/test/TableGen/{GlobalISelEmitterMatchTableOptimizer.td => GlobalISelEmitter/MatchTableOptimizer.td} (98%) rename llvm/test/TableGen/{GlobalISelEmitterMatchTableOptimizerSameOperand-invalid.td => GlobalISelEmitter/MatchTableOptimizerSameOperand-invalid.td} (99%) rename llvm/test/TableGen/{GlobalISelEmitterMatchTableOptimizerSameOperand.td => GlobalISelEmitter/MatchTableOptimizerSameOperand.td} (95%) rename llvm/test/TableGen/{GlobalISelEmitterOverloadedPtr.td => GlobalISelEmitter/OverloadedPtr.td} (94%) rename llvm/test/TableGen/{GlobalISelEmitter-PR39045.td => GlobalISelEmitter/PR39045.td} (92%) rename llvm/test/TableGen/{GlobalISelEmitterRegSequence.td => GlobalISelEmitter/RegSequence.td} (98%) rename llvm/test/TableGen/{GlobalISelEmitter-SDNodeXForm-timm.td => GlobalISelEmitter/SDNodeXForm-timm.td} (93%) rename llvm/test/TableGen/{GlobalISelEmitterSkippedPatterns.td => GlobalISelEmitter/SkippedPatterns.td} (86%) rename llvm/test/TableGen/{GlobalISelEmitterSubreg.td => GlobalISelEmitter/Subreg.td} (99%) rename llvm/test/TableGen/{GlobalISelEmitterVariadic.td => GlobalISelEmitter/Variadic.td} (97%) rename llvm/test/TableGen/{GlobalISelEmitter-atomic_store.td => 
GlobalISelEmitter/atomic-store.td} (93%) rename llvm/test/TableGen/{GlobalISelEmitter-frameindex.td => GlobalISelEmitter/frameindex.td} (98%) rename llvm/test/TableGen/{ => GlobalISelEmitter}/gisel-physreg-input.td (98%) rename llvm/test/TableGen/{GlobalISelEmitter-immAllZeroOne.td => GlobalISelEmitter/immAllZeroOne.td} (92%) rename llvm/test/TableGen/{GlobalISelEmitter-immarg-literal-pattern.td => GlobalISelEmitter/immarg-literal-pattern.td} (95%) rename llvm/test/TableGen/{ => GlobalISelEmitter}/immarg-predicated.td (92%) rename llvm/test/TableGen/{ => GlobalISelEmitter}/immarg.td (93%) rename llvm/test/TableGen/{GlobalISelEmitter-implicit-defs.td => GlobalISelEmitter/implicit-defs.td} (81%) rename llvm/test/TableGen/{GlobalISelEmitter-input-discard.td => GlobalISelEmitter/input-discard.td} (95%) rename llvm/test/TableGen/{GlobalISelEmitter-multiple-output-discard.td => GlobalISelEmitter/multiple-output-discard.td} (96%) rename llvm/test/TableGen/{GlobalISelEmitter-multiple-output.td => GlobalISelEmitter/multiple-output.td} (98%) rename llvm/test/TableGen/{GlobalISelEmitter-nested-subregs.td => GlobalISelEmitter/nested-subregs.td} (98%) rename llvm/test/TableGen/{GlobalISelEmitter-notype-output-pattern.td => GlobalISelEmitter/notype-output-pattern.td} (89%) rename llvm/test/TableGen/{GlobalISelEmitter-optional-def.td => GlobalISelEmitter/optional-def.td} (97%) rename llvm/test/TableGen/{GlobalISelEmitter-output-discard.td => GlobalISelEmitter/output-discard.td} (94%) rename llvm/test/TableGen/{GlobalISelEmitter-setcc.td => GlobalISelEmitter/setcc.td} (91%) rename llvm/test/TableGen/{GlobalISelEmitter-zero-instr.td => GlobalISelEmitter/zero-instr.td} (71%) rename llvm/test/TableGen/{GlobalISelEmitter-zero-reg.td => GlobalISelEmitter/zero-reg.td} (97%) diff --git a/llvm/test/TableGen/ContextlessPredicates.td b/llvm/test/TableGen/GlobalISelEmitter/ContextlessPredicates.td similarity index 94% rename from llvm/test/TableGen/ContextlessPredicates.td rename to 
llvm/test/TableGen/GlobalISelEmitter/ContextlessPredicates.td index eead9655111e6..fa3484e3f4a4e 100644 --- a/llvm/test/TableGen/ContextlessPredicates.td +++ b/llvm/test/TableGen/GlobalISelEmitter/ContextlessPredicates.td @@ -1,6 +1,6 @@ -// RUN: llvm-tblgen -gen-global-isel -I %p/../../include -I %p/Common -optimize-match-table=false %s -o %T/context-non-optimized.cpp +// RUN: llvm-tblgen -gen-global-isel -I %p/../../../include -I %p/../Common -optimize-match-table=false %s -o %T/context-non-optimized.cpp // RUN: FileCheck %s --check-prefixes=CHECK_NOPT -input-file=%T/context-non-optimized.cpp -// RUN: llvm-tblgen -gen-global-isel -I %p/../../include -I %p/Common -optimize-match-table=true %s -o %T/context-optimized.cpp +// RUN: llvm-tblgen -gen-global-isel -I %p/../../../include -I %p/../Common -optimize-match-table=true %s -o %T/context-optimized.cpp // RUN: FileCheck %s --check-prefixes=CHECK_OPT -input-file=%T/context-optimized.cpp diff --git a/llvm/test/TableGen/GlobalISelEmitterCustomPredicate.td b/llvm/test/TableGen/GlobalISelEmitter/CustomPredicate.td similarity index 99% rename from llvm/test/TableGen/GlobalISelEmitterCustomPredicate.td rename to llvm/test/TableGen/GlobalISelEmitter/CustomPredicate.td index 3ceadf32f0642..56eaa4bdc57de 100644 --- a/llvm/test/TableGen/GlobalISelEmitterCustomPredicate.td +++ b/llvm/test/TableGen/GlobalISelEmitter/CustomPredicate.td @@ -1,4 +1,4 @@ -// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=false -I %p/../../include -I %p/Common -o - | FileCheck %s +// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=false -I %p/../../../include -I %p/../Common | FileCheck %s // Verify that all MI predicates are enumerated. 
// diff --git a/llvm/test/TableGen/DefaultOpsGlobalISel.td b/llvm/test/TableGen/GlobalISelEmitter/DefaultOpsGlobalISel.td similarity index 99% rename from llvm/test/TableGen/DefaultOpsGlobalISel.td rename to llvm/test/TableGen/GlobalISelEmitter/DefaultOpsGlobalISel.td index 8f4176a2aa730..f88045ca9b00b 100644 --- a/llvm/test/TableGen/DefaultOpsGlobalISel.td +++ b/llvm/test/TableGen/GlobalISelEmitter/DefaultOpsGlobalISel.td @@ -1,4 +1,4 @@ -// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=false -I %p/../../include -I %p/Common -o - | FileCheck %s +// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=false -I %p/../../../include -I %p/../Common -o - | FileCheck %s include "llvm/Target/Target.td" include "GlobalISelEmitterCommon.td" diff --git a/llvm/test/TableGen/GlobalISelEmitterFlags.td b/llvm/test/TableGen/GlobalISelEmitter/Flags.td similarity index 98% rename from llvm/test/TableGen/GlobalISelEmitterFlags.td rename to llvm/test/TableGen/GlobalISelEmitter/Flags.td index fa8f2a79fbce8..0878955f8f31e 100644 --- a/llvm/test/TableGen/GlobalISelEmitterFlags.td +++ b/llvm/test/TableGen/GlobalISelEmitter/Flags.td @@ -1,4 +1,4 @@ -// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=false -I %p/../../include -I %p/Common -o - | FileCheck %s +// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=false -I %p/../../../include -I %p/../Common | FileCheck %s include "llvm/Target/Target.td" include "GlobalISelEmitterCommon.td" diff --git a/llvm/test/TableGen/GlobalISelEmitter.td b/llvm/test/TableGen/GlobalISelEmitter/GlobalISelEmitter.td similarity index 99% rename from llvm/test/TableGen/GlobalISelEmitter.td rename to llvm/test/TableGen/GlobalISelEmitter/GlobalISelEmitter.td index ffefaba284299..7c8181410d400 100644 --- a/llvm/test/TableGen/GlobalISelEmitter.td +++ b/llvm/test/TableGen/GlobalISelEmitter/GlobalISelEmitter.td @@ -1,6 +1,6 @@ -// RUN: llvm-tblgen -gen-global-isel -I %p/../../include -I %p/Common -optimize-match-table=false 
%s -o %T/non-optimized.cpp -// RUN: llvm-tblgen -gen-global-isel -I %p/../../include -I %p/Common -optimize-match-table=true %s -o %T/optimized.cpp -// RUN: llvm-tblgen -gen-global-isel -I %p/../../include -I %p/Common %s -o %T/default.cpp +// RUN: llvm-tblgen -gen-global-isel -I %p/../../../include -I %p/../Common -optimize-match-table=false %s -o %T/non-optimized.cpp +// RUN: llvm-tblgen -gen-global-isel -I %p/../../../include -I %p/../Common -optimize-match-table=true %s -o %T/optimized.cpp +// RUN: llvm-tblgen -gen-global-isel -I %p/../../../include -I %p/../Common %s -o %T/default.cpp // RUN: FileCheck %s --check-prefixes=CHECK,R19C,R19N -input-file=%T/non-optimized.cpp // RUN: FileCheck %s --check-prefixes=CHECK,R19C,R19O -input-file=%T/optimized.cpp diff --git a/llvm/test/TableGen/GlobalISelEmitterHwModes.td b/llvm/test/TableGen/GlobalISelEmitter/HwModes.td similarity index 98% rename from llvm/test/TableGen/GlobalISelEmitterHwModes.td rename to llvm/test/TableGen/GlobalISelEmitter/HwModes.td index 9d235f5f07a74..3588ba3979411 100644 --- a/llvm/test/TableGen/GlobalISelEmitterHwModes.td +++ b/llvm/test/TableGen/GlobalISelEmitter/HwModes.td @@ -1,4 +1,4 @@ -// RUN: llvm-tblgen -gen-global-isel -I %p/../../include -I %p/Common -optimize-match-table=false %s -o %T/hwmode-non-optimized.cpp +// RUN: llvm-tblgen -gen-global-isel -I %p/../../../include -I %p/../Common -optimize-match-table=false %s -o %T/hwmode-non-optimized.cpp // RUN: FileCheck %s --check-prefixes=CHECK -input-file=%T/hwmode-non-optimized.cpp include "llvm/Target/Target.td" diff --git a/llvm/test/TableGen/GlobalISelEmitterMatchTableOptimizer.td b/llvm/test/TableGen/GlobalISelEmitter/MatchTableOptimizer.td similarity index 98% rename from llvm/test/TableGen/GlobalISelEmitterMatchTableOptimizer.td rename to llvm/test/TableGen/GlobalISelEmitter/MatchTableOptimizer.td index 3db31bea8612e..c4307258aae9a 100644 --- a/llvm/test/TableGen/GlobalISelEmitterMatchTableOptimizer.td +++ 
b/llvm/test/TableGen/GlobalISelEmitter/MatchTableOptimizer.td @@ -1,4 +1,4 @@ -// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=true -I %p/../../include -I %p/Common -o - | FileCheck %s +// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=true -I %p/../../../include -I %p/../Common | FileCheck %s include "llvm/Target/Target.td" include "GlobalISelEmitterCommon.td" diff --git a/llvm/test/TableGen/GlobalISelEmitterMatchTableOptimizerSameOperand-invalid.td b/llvm/test/TableGen/GlobalISelEmitter/MatchTableOptimizerSameOperand-invalid.td similarity index 99% rename from llvm/test/TableGen/GlobalISelEmitterMatchTableOptimizerSameOperand-invalid.td rename to llvm/test/TableGen/GlobalISelEmitter/MatchTableOptimizerSameOperand-invalid.td index d93805b612a19..18ae76720518d 100644 --- a/llvm/test/TableGen/GlobalISelEmitterMatchTableOptimizerSameOperand-invalid.td +++ b/llvm/test/TableGen/GlobalISelEmitter/MatchTableOptimizerSameOperand-invalid.td @@ -1,4 +1,4 @@ -// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=true -I %p/../../include -I %p/Common -o - | FileCheck %s +// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=true -I %p/../../../include -I %p/../Common | FileCheck %s include "llvm/Target/Target.td" include "GlobalISelEmitterCommon.td" diff --git a/llvm/test/TableGen/GlobalISelEmitterMatchTableOptimizerSameOperand.td b/llvm/test/TableGen/GlobalISelEmitter/MatchTableOptimizerSameOperand.td similarity index 95% rename from llvm/test/TableGen/GlobalISelEmitterMatchTableOptimizerSameOperand.td rename to llvm/test/TableGen/GlobalISelEmitter/MatchTableOptimizerSameOperand.td index 1ac33990ab3b1..c6ca9b7fbed5d 100644 --- a/llvm/test/TableGen/GlobalISelEmitterMatchTableOptimizerSameOperand.td +++ b/llvm/test/TableGen/GlobalISelEmitter/MatchTableOptimizerSameOperand.td @@ -1,4 +1,4 @@ -// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=true -I %p/../../include -I %p/Common -o - | FileCheck %s +// RUN: llvm-tblgen %s 
-gen-global-isel -optimize-match-table=true -I %p/../../../include -I %p/../Common | FileCheck %s include "llvm/Target/Target.td" include "GlobalISelEmitterCommon.td" diff --git a/llvm/test/TableGen/GlobalISelEmitterOverloadedPtr.td b/llvm/test/TableGen/GlobalISelEmitter/OverloadedPtr.td similarity index 94% rename from llvm/test/TableGen/GlobalISelEmitterOverloadedPtr.td rename to llvm/test/TableGen/GlobalISelEmitter/OverloadedPtr.td index 422edbba0e7a0..c70211d665225 100644 --- a/llvm/test/TableGen/GlobalISelEmitterOverloadedPtr.td +++ b/llvm/test/TableGen/GlobalISelEmitter/OverloadedPtr.td @@ -1,4 +1,4 @@ -// RUN: llvm-tblgen -gen-global-isel -I %p/../../include -I %p/Common %s -o - | FileCheck %s +// RUN: llvm-tblgen -gen-global-isel -I %p/../../../include -I %p/../Common %s | FileCheck %s // Boilerplate code. include "llvm/Target/Target.td" diff --git a/llvm/test/TableGen/GlobalISelEmitter-PR39045.td b/llvm/test/TableGen/GlobalISelEmitter/PR39045.td similarity index 92% rename from llvm/test/TableGen/GlobalISelEmitter-PR39045.td rename to llvm/test/TableGen/GlobalISelEmitter/PR39045.td index 5407222121bb3..595fa92f6773f 100644 --- a/llvm/test/TableGen/GlobalISelEmitter-PR39045.td +++ b/llvm/test/TableGen/GlobalISelEmitter/PR39045.td @@ -1,4 +1,4 @@ -// RUN: llvm-tblgen -gen-global-isel -I %p/../../include -I %p/Common %s -o %t +// RUN: llvm-tblgen -gen-global-isel -I %p/../../../include -I %p/../Common %s -o %t // RUN: FileCheck %s < %t // Both predicates should be tested diff --git a/llvm/test/TableGen/GlobalISelEmitterRegSequence.td b/llvm/test/TableGen/GlobalISelEmitter/RegSequence.td similarity index 98% rename from llvm/test/TableGen/GlobalISelEmitterRegSequence.td rename to llvm/test/TableGen/GlobalISelEmitter/RegSequence.td index 69f82eac49c16..97790fb483933 100644 --- a/llvm/test/TableGen/GlobalISelEmitterRegSequence.td +++ b/llvm/test/TableGen/GlobalISelEmitter/RegSequence.td @@ -1,4 +1,4 @@ -// RUN: llvm-tblgen %s -gen-global-isel 
-optimize-match-table=false -I %p/../../include -I %p/Common -o - | FileCheck %s +// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=false -I %p/../../../include -I %p/../Common | FileCheck %s include "llvm/Target/Target.td" include "GlobalISelEmitterCommon.td" diff --git a/llvm/test/TableGen/GlobalISelEmitter-SDNodeXForm-timm.td b/llvm/test/TableGen/GlobalISelEmitter/SDNodeXForm-timm.td similarity index 93% rename from llvm/test/TableGen/GlobalISelEmitter-SDNodeXForm-timm.td rename to llvm/test/TableGen/GlobalISelEmitter/SDNodeXForm-timm.td index 8d6dedf2f920c..fab395dc2dfc2 100644 --- a/llvm/test/TableGen/GlobalISelEmitter-SDNodeXForm-timm.td +++ b/llvm/test/TableGen/GlobalISelEmitter/SDNodeXForm-timm.td @@ -1,4 +1,4 @@ -// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -optimize-match-table=false -I %p/../../include -I %p/Common %s -o - | FileCheck -check-prefix=GISEL %s +// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -optimize-match-table=false -I %p/../../../include -I %p/../Common %s | FileCheck -check-prefix=GISEL %s include "llvm/Target/Target.td" include "GlobalISelEmitterCommon.td" diff --git a/llvm/test/TableGen/GlobalISelEmitterSkippedPatterns.td b/llvm/test/TableGen/GlobalISelEmitter/SkippedPatterns.td similarity index 86% rename from llvm/test/TableGen/GlobalISelEmitterSkippedPatterns.td rename to llvm/test/TableGen/GlobalISelEmitter/SkippedPatterns.td index fc8abc6fbc547..9d9d3f936e3e4 100644 --- a/llvm/test/TableGen/GlobalISelEmitterSkippedPatterns.td +++ b/llvm/test/TableGen/GlobalISelEmitter/SkippedPatterns.td @@ -1,5 +1,5 @@ -// RUN: llvm-tblgen -warn-on-skipped-patterns -gen-global-isel -I %p/../../include %s -I %p/Common -o /dev/null 2>&1 | FileCheck %s -// RUN: llvm-tblgen -warn-on-skipped-patterns -gen-global-isel -I %p/../../include %s -I %p/Common -o /dev/null -DIGNORE 2>&1 | FileCheck --allow-empty --check-prefix=IGNORED %s +// RUN: llvm-tblgen -warn-on-skipped-patterns -gen-global-isel -I 
%p/../../../include %s -I %p/../Common -o /dev/null 2>&1 | FileCheck %s +// RUN: llvm-tblgen -warn-on-skipped-patterns -gen-global-isel -I %p/../../../include %s -I %p/../Common -o /dev/null -DIGNORE 2>&1 | FileCheck --allow-empty --check-prefix=IGNORED %s include "llvm/Target/Target.td" include "GlobalISelEmitterCommon.td" diff --git a/llvm/test/TableGen/GlobalISelEmitterSubreg.td b/llvm/test/TableGen/GlobalISelEmitter/Subreg.td similarity index 99% rename from llvm/test/TableGen/GlobalISelEmitterSubreg.td rename to llvm/test/TableGen/GlobalISelEmitter/Subreg.td index 08e690f3e894d..5203c2b4a6e4f 100644 --- a/llvm/test/TableGen/GlobalISelEmitterSubreg.td +++ b/llvm/test/TableGen/GlobalISelEmitter/Subreg.td @@ -1,4 +1,4 @@ -// RUN: llvm-tblgen %s -gen-global-isel -warn-on-skipped-patterns -I %p/../../include -I %p/Common -o - 2> %t.skipped | FileCheck %s +// RUN: llvm-tblgen %s -gen-global-isel -warn-on-skipped-patterns -I %p/../../../include -I %p/../Common 2> %t.skipped | FileCheck %s // RUN: cat %t.skipped | FileCheck %s --check-prefix=SKIPPED include "llvm/Target/Target.td" diff --git a/llvm/test/TableGen/GlobalISelEmitterVariadic.td b/llvm/test/TableGen/GlobalISelEmitter/Variadic.td similarity index 97% rename from llvm/test/TableGen/GlobalISelEmitterVariadic.td rename to llvm/test/TableGen/GlobalISelEmitter/Variadic.td index 992e1a4b907c3..b3c80526af81f 100644 --- a/llvm/test/TableGen/GlobalISelEmitterVariadic.td +++ b/llvm/test/TableGen/GlobalISelEmitter/Variadic.td @@ -1,4 +1,4 @@ -// RUN: llvm-tblgen -gen-global-isel -I %p/../../include -I %p/Common %s -o - | FileCheck %s +// RUN: llvm-tblgen -gen-global-isel -I %p/../../../include -I %p/../Common %s | FileCheck %s include "llvm/Target/Target.td" include "GlobalISelEmitterCommon.td" diff --git a/llvm/test/TableGen/GlobalISelEmitter-atomic_store.td b/llvm/test/TableGen/GlobalISelEmitter/atomic-store.td similarity index 93% rename from llvm/test/TableGen/GlobalISelEmitter-atomic_store.td rename to 
llvm/test/TableGen/GlobalISelEmitter/atomic-store.td index da2dfe8004289..53b8670f47e63 100644 --- a/llvm/test/TableGen/GlobalISelEmitter-atomic_store.td +++ b/llvm/test/TableGen/GlobalISelEmitter/atomic-store.td @@ -1,4 +1,4 @@ -// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../../include -I %p/Common %s -o - < %s | FileCheck -check-prefix=GISEL %s +// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../../../include -I %p/../Common %s | FileCheck -check-prefix=GISEL %s include "llvm/Target/Target.td" include "GlobalISelEmitterCommon.td" diff --git a/llvm/test/TableGen/GlobalISelEmitter-frameindex.td b/llvm/test/TableGen/GlobalISelEmitter/frameindex.td similarity index 98% rename from llvm/test/TableGen/GlobalISelEmitter-frameindex.td rename to llvm/test/TableGen/GlobalISelEmitter/frameindex.td index 715e53ddbad08..27784526a65ba 100644 --- a/llvm/test/TableGen/GlobalISelEmitter-frameindex.td +++ b/llvm/test/TableGen/GlobalISelEmitter/frameindex.td @@ -1,4 +1,4 @@ -// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../../include -I %p/Common %s | FileCheck %s +// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../../../include -I %p/../Common %s | FileCheck %s include "llvm/Target/Target.td" include "GlobalISelEmitterCommon.td" diff --git a/llvm/test/TableGen/gisel-physreg-input.td b/llvm/test/TableGen/GlobalISelEmitter/gisel-physreg-input.td similarity index 98% rename from llvm/test/TableGen/gisel-physreg-input.td rename to llvm/test/TableGen/GlobalISelEmitter/gisel-physreg-input.td index f19872a331fc8..a05f364eb3f05 100644 --- a/llvm/test/TableGen/gisel-physreg-input.td +++ b/llvm/test/TableGen/GlobalISelEmitter/gisel-physreg-input.td @@ -1,4 +1,4 @@ -// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../../include %s -o - < %s | FileCheck -check-prefix=GISEL %s +// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../../../include %s | 
FileCheck -check-prefix=GISEL %s include "llvm/Target/Target.td" diff --git a/llvm/test/TableGen/GlobalISelEmitter-immAllZeroOne.td b/llvm/test/TableGen/GlobalISelEmitter/immAllZeroOne.td similarity index 92% rename from llvm/test/TableGen/GlobalISelEmitter-immAllZeroOne.td rename to llvm/test/TableGen/GlobalISelEmitter/immAllZeroOne.td index 0125aa5c30fa6..68278f46627c6 100644 --- a/llvm/test/TableGen/GlobalISelEmitter-immAllZeroOne.td +++ b/llvm/test/TableGen/GlobalISelEmitter/immAllZeroOne.td @@ -1,5 +1,5 @@ -// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -optimize-match-table=false -I %p/../../include -I %p/Common %s -o - | FileCheck -check-prefixes=GISEL-NOOPT %s -// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -optimize-match-table=true -I %p/../../include -I %p/Common %s -o - | FileCheck -check-prefixes=GISEL-OPT %s +// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -optimize-match-table=false -I %p/../../../include -I %p/../Common %s | FileCheck -check-prefixes=GISEL-NOOPT %s +// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -optimize-match-table=true -I %p/../../../include -I %p/../Common %s | FileCheck -check-prefixes=GISEL-OPT %s include "llvm/Target/Target.td" include "GlobalISelEmitterCommon.td" diff --git a/llvm/test/TableGen/GlobalISelEmitter-immarg-literal-pattern.td b/llvm/test/TableGen/GlobalISelEmitter/immarg-literal-pattern.td similarity index 95% rename from llvm/test/TableGen/GlobalISelEmitter-immarg-literal-pattern.td rename to llvm/test/TableGen/GlobalISelEmitter/immarg-literal-pattern.td index 6b4012eb736cb..ff05ac12f32ce 100644 --- a/llvm/test/TableGen/GlobalISelEmitter-immarg-literal-pattern.td +++ b/llvm/test/TableGen/GlobalISelEmitter/immarg-literal-pattern.td @@ -1,4 +1,4 @@ -// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -optimize-match-table=false -I %p/../../include -I %p/Common %s -o - | FileCheck -check-prefix=GISEL %s +// RUN: llvm-tblgen -gen-global-isel 
-warn-on-skipped-patterns -optimize-match-table=false -I %p/../../../include -I %p/../Common %s | FileCheck -check-prefix=GISEL %s include "llvm/Target/Target.td" include "GlobalISelEmitterCommon.td" diff --git a/llvm/test/TableGen/immarg-predicated.td b/llvm/test/TableGen/GlobalISelEmitter/immarg-predicated.td similarity index 92% rename from llvm/test/TableGen/immarg-predicated.td rename to llvm/test/TableGen/GlobalISelEmitter/immarg-predicated.td index dcacb2f8f1de3..ab412fac48e12 100644 --- a/llvm/test/TableGen/immarg-predicated.td +++ b/llvm/test/TableGen/GlobalISelEmitter/immarg-predicated.td @@ -1,4 +1,4 @@ -// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/Common -I %p/../../include %s -o - < %s | FileCheck -check-prefix=GISEL %s +// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../Common -I %p/../../../include %s | FileCheck -check-prefix=GISEL %s include "llvm/Target/Target.td" include "GlobalISelEmitterCommon.td" diff --git a/llvm/test/TableGen/immarg.td b/llvm/test/TableGen/GlobalISelEmitter/immarg.td similarity index 93% rename from llvm/test/TableGen/immarg.td rename to llvm/test/TableGen/GlobalISelEmitter/immarg.td index e5fd06ce6c083..eae04094a689e 100644 --- a/llvm/test/TableGen/immarg.td +++ b/llvm/test/TableGen/GlobalISelEmitter/immarg.td @@ -1,4 +1,4 @@ -// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/Common -I %p/../../include %s -o - < %s | FileCheck -check-prefix=GISEL %s +// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../Common -I %p/../../../include %s | FileCheck -check-prefix=GISEL %s include "llvm/Target/Target.td" include "GlobalISelEmitterCommon.td" diff --git a/llvm/test/TableGen/GlobalISelEmitter-implicit-defs.td b/llvm/test/TableGen/GlobalISelEmitter/implicit-defs.td similarity index 81% rename from llvm/test/TableGen/GlobalISelEmitter-implicit-defs.td rename to llvm/test/TableGen/GlobalISelEmitter/implicit-defs.td index 
79af1a336f289..06e5e39a68175 100644 --- a/llvm/test/TableGen/GlobalISelEmitter-implicit-defs.td +++ b/llvm/test/TableGen/GlobalISelEmitter/implicit-defs.td @@ -1,4 +1,4 @@ -// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -I %p/../../include -I %p/Common %s -o /dev/null 2>&1 < %s | FileCheck %s --implicit-check-not="Skipped pattern" +// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -I %p/../../../include -I %p/../Common %s -o /dev/null 2>&1 | FileCheck %s --implicit-check-not="Skipped pattern" include "llvm/Target/Target.td" include "GlobalISelEmitterCommon.td" diff --git a/llvm/test/TableGen/GlobalISelEmitter-input-discard.td b/llvm/test/TableGen/GlobalISelEmitter/input-discard.td similarity index 95% rename from llvm/test/TableGen/GlobalISelEmitter-input-discard.td rename to llvm/test/TableGen/GlobalISelEmitter/input-discard.td index 202ff4a5758d7..65ebfa2c5b325 100644 --- a/llvm/test/TableGen/GlobalISelEmitter-input-discard.td +++ b/llvm/test/TableGen/GlobalISelEmitter/input-discard.td @@ -1,4 +1,4 @@ -// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -I %p/../../include -I %p/Common %s -o - < %s | FileCheck -check-prefix=GISEL %s +// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -I %p/../../../include -I %p/../Common %s | FileCheck -check-prefix=GISEL %s include "llvm/Target/Target.td" include "GlobalISelEmitterCommon.td" diff --git a/llvm/test/TableGen/GlobalISelEmitter-multiple-output-discard.td b/llvm/test/TableGen/GlobalISelEmitter/multiple-output-discard.td similarity index 96% rename from llvm/test/TableGen/GlobalISelEmitter-multiple-output-discard.td rename to llvm/test/TableGen/GlobalISelEmitter/multiple-output-discard.td index 2d968bebbc65e..a180431b94f6f 100644 --- a/llvm/test/TableGen/GlobalISelEmitter-multiple-output-discard.td +++ b/llvm/test/TableGen/GlobalISelEmitter/multiple-output-discard.td @@ -1,4 +1,4 @@ -// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false 
-warn-on-skipped-patterns -I %p/../../include -I %p/Common %s -o - < %s | FileCheck %s +// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -warn-on-skipped-patterns -I %p/../../../include -I %p/../Common %s | FileCheck %s include "llvm/Target/Target.td" include "GlobalISelEmitterCommon.td" diff --git a/llvm/test/TableGen/GlobalISelEmitter-multiple-output.td b/llvm/test/TableGen/GlobalISelEmitter/multiple-output.td similarity index 98% rename from llvm/test/TableGen/GlobalISelEmitter-multiple-output.td rename to llvm/test/TableGen/GlobalISelEmitter/multiple-output.td index dea3b54960717..baf767598f82c 100644 --- a/llvm/test/TableGen/GlobalISelEmitter-multiple-output.td +++ b/llvm/test/TableGen/GlobalISelEmitter/multiple-output.td @@ -1,4 +1,4 @@ -// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -warn-on-skipped-patterns -I %p/../../include -I %p/Common %s -o - < %s | FileCheck %s +// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -warn-on-skipped-patterns -I %p/../../../include -I %p/../Common %s | FileCheck %s include "llvm/Target/Target.td" include "GlobalISelEmitterCommon.td" diff --git a/llvm/test/TableGen/GlobalISelEmitter-nested-subregs.td b/llvm/test/TableGen/GlobalISelEmitter/nested-subregs.td similarity index 98% rename from llvm/test/TableGen/GlobalISelEmitter-nested-subregs.td rename to llvm/test/TableGen/GlobalISelEmitter/nested-subregs.td index 79e55ef2e8b8c..8688e4f04bab9 100644 --- a/llvm/test/TableGen/GlobalISelEmitter-nested-subregs.td +++ b/llvm/test/TableGen/GlobalISelEmitter/nested-subregs.td @@ -1,4 +1,4 @@ -// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=false -I %p/../../include -I %p/Common -o - | FileCheck %s +// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=false -I %p/../../../include -I %p/../Common | FileCheck %s include "llvm/Target/Target.td" include "GlobalISelEmitterCommon.td" diff --git a/llvm/test/TableGen/GlobalISelEmitter-notype-output-pattern.td 
b/llvm/test/TableGen/GlobalISelEmitter/notype-output-pattern.td similarity index 89% rename from llvm/test/TableGen/GlobalISelEmitter-notype-output-pattern.td rename to llvm/test/TableGen/GlobalISelEmitter/notype-output-pattern.td index 622d7fa1f7955..80b125649ed2d 100644 --- a/llvm/test/TableGen/GlobalISelEmitter-notype-output-pattern.td +++ b/llvm/test/TableGen/GlobalISelEmitter/notype-output-pattern.td @@ -1,4 +1,4 @@ -// RUN: llvm-tblgen -gen-global-isel -I %p/../../include -I %p/Common %s | FileCheck %s +// RUN: llvm-tblgen -gen-global-isel -I %p/../../../include -I %p/../Common %s | FileCheck %s include "llvm/Target/Target.td" include "GlobalISelEmitterCommon.td" diff --git a/llvm/test/TableGen/GlobalISelEmitter-optional-def.td b/llvm/test/TableGen/GlobalISelEmitter/optional-def.td similarity index 97% rename from llvm/test/TableGen/GlobalISelEmitter-optional-def.td rename to llvm/test/TableGen/GlobalISelEmitter/optional-def.td index def4a0447fe53..7792a97e1377e 100644 --- a/llvm/test/TableGen/GlobalISelEmitter-optional-def.td +++ b/llvm/test/TableGen/GlobalISelEmitter/optional-def.td @@ -1,5 +1,5 @@ // RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns \ -// RUN: -I %p/../../include -I %p/Common %s 2> %t | FileCheck %s +// RUN: -I %p/../../../include -I %p/../Common %s 2> %t | FileCheck %s // RUN: FileCheck -DFILE=%s -check-prefix=ERR %s < %t include "llvm/Target/Target.td" diff --git a/llvm/test/TableGen/GlobalISelEmitter-output-discard.td b/llvm/test/TableGen/GlobalISelEmitter/output-discard.td similarity index 94% rename from llvm/test/TableGen/GlobalISelEmitter-output-discard.td rename to llvm/test/TableGen/GlobalISelEmitter/output-discard.td index 7a0242d9a9924..c249dcbe214e0 100644 --- a/llvm/test/TableGen/GlobalISelEmitter-output-discard.td +++ b/llvm/test/TableGen/GlobalISelEmitter/output-discard.td @@ -1,4 +1,4 @@ -// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -I %p/../../include -I %p/Common %s -o - < %s | FileCheck 
-check-prefix=GISEL %s +// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -I %p/../../../include -I %p/../Common %s | FileCheck -check-prefix=GISEL %s include "llvm/Target/Target.td" include "GlobalISelEmitterCommon.td" diff --git a/llvm/test/TableGen/GlobalISelEmitter-setcc.td b/llvm/test/TableGen/GlobalISelEmitter/setcc.td similarity index 91% rename from llvm/test/TableGen/GlobalISelEmitter-setcc.td rename to llvm/test/TableGen/GlobalISelEmitter/setcc.td index 38add7627f503..02622d0358ee6 100644 --- a/llvm/test/TableGen/GlobalISelEmitter-setcc.td +++ b/llvm/test/TableGen/GlobalISelEmitter/setcc.td @@ -1,4 +1,4 @@ -// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -optimize-match-table=false -I %p/../../include -I %p/Common %s -o - 2> %t < %s | FileCheck -check-prefix=GISEL %s +// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -optimize-match-table=false -I %p/../../../include -I %p/../Common %s 2> %t | FileCheck -check-prefix=GISEL %s // RUN: FileCheck -DFILE=%s -check-prefix=ERR %s < %t include "llvm/Target/Target.td" diff --git a/llvm/test/TableGen/GlobalISelEmitter-zero-instr.td b/llvm/test/TableGen/GlobalISelEmitter/zero-instr.td similarity index 71% rename from llvm/test/TableGen/GlobalISelEmitter-zero-instr.td rename to llvm/test/TableGen/GlobalISelEmitter/zero-instr.td index c8a8cab2b6523..f9463ba0fefc4 100644 --- a/llvm/test/TableGen/GlobalISelEmitter-zero-instr.td +++ b/llvm/test/TableGen/GlobalISelEmitter/zero-instr.td @@ -1,4 +1,4 @@ -// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../../include -I %p/Common %s -o /dev/null --warn-on-skipped-patterns 2>&1 < %s 2>&1 | FileCheck %s +// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../../../include -I %p/../Common %s -o /dev/null --warn-on-skipped-patterns 2>&1 | FileCheck %s include "llvm/Target/Target.td" include "GlobalISelEmitterCommon.td" diff --git a/llvm/test/TableGen/GlobalISelEmitter-zero-reg.td 
b/llvm/test/TableGen/GlobalISelEmitter/zero-reg.td similarity index 97% rename from llvm/test/TableGen/GlobalISelEmitter-zero-reg.td rename to llvm/test/TableGen/GlobalISelEmitter/zero-reg.td index ddf02240ee1f8..87e5432093377 100644 --- a/llvm/test/TableGen/GlobalISelEmitter-zero-reg.td +++ b/llvm/test/TableGen/GlobalISelEmitter/zero-reg.td @@ -1,4 +1,4 @@ -// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../../include -I %p/Common %s -o - < %s | FileCheck %s +// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../../../include -I %p/../Common %s | FileCheck %s include "llvm/Target/Target.td" include "GlobalISelEmitterCommon.td" From c870632ef6162fbdccaad8cd09420728220ad344 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Wed, 25 Dec 2024 09:42:03 +0100 Subject: [PATCH 054/567] [flang] Fix some memory leaks (#121050) This commit fixes some but not all memory leaks in Flang. There are still 91 tests that fail with ASAN. - Use `mlir::OwningOpRef` instead of `std::unique_ptr`. The latter does not free allocations of nested blocks. - Pass `ModuleOp` as value instead of reference. - Add few missing deallocations in test cases and other places. 
--- .../include/flang/Frontend/FrontendActions.h | 3 +- flang/include/flang/Lower/AbstractConverter.h | 4 +-- flang/include/flang/Lower/Bridge.h | 6 ++-- flang/include/flang/Lower/OpenACC.h | 8 +++--- flang/include/flang/Tools/CrossToolHelpers.h | 4 +-- flang/lib/Frontend/FrontendActions.cpp | 22 +++++++++------ flang/lib/Lower/Bridge.cpp | 28 ++++++++----------- flang/lib/Lower/OpenACC.cpp | 14 ++++++---- flang/lib/Optimizer/CodeGen/CodeGen.cpp | 2 +- .../unittests/Frontend/CodeGenActionTest.cpp | 3 +- .../Optimizer/Builder/CharacterTest.cpp | 9 +++--- .../Optimizer/Builder/ComplexTest.cpp | 9 +++--- .../Optimizer/Builder/FIRBuilderTest.cpp | 9 +++--- .../Optimizer/Builder/HLFIRToolsTest.cpp | 9 +++--- .../Builder/Runtime/RuntimeCallTestBase.h | 9 +++--- .../Optimizer/FortranVariableTest.cpp | 7 +++-- flang/unittests/Runtime/ArrayConstructor.cpp | 3 ++ flang/unittests/Runtime/CharacterTest.cpp | 3 ++ 18 files changed, 85 insertions(+), 67 deletions(-) diff --git a/flang/include/flang/Frontend/FrontendActions.h b/flang/include/flang/Frontend/FrontendActions.h index 374fd76c8ae17..4e3d3cb2657db 100644 --- a/flang/include/flang/Frontend/FrontendActions.h +++ b/flang/include/flang/Frontend/FrontendActions.h @@ -19,6 +19,7 @@ #include "flang/Semantics/semantics.h" #include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/OwningOpRef.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/Module.h" #include @@ -215,8 +216,8 @@ class CodeGenAction : public FrontendAction { CodeGenAction(BackendActionTy act) : action{act} {}; /// @name MLIR /// { - std::unique_ptr mlirModule; std::unique_ptr mlirCtx; + mlir::OwningOpRef mlirModule; /// } /// @name LLVM IR diff --git a/flang/include/flang/Lower/AbstractConverter.h b/flang/include/flang/Lower/AbstractConverter.h index 8f026ac3280bf..607aff41f6459 100644 --- a/flang/include/flang/Lower/AbstractConverter.h +++ b/flang/include/flang/Lower/AbstractConverter.h @@ -62,7 +62,7 @@ struct SymbolBox; namespace pft { struct Variable; struct 
FunctionLikeUnit; -} +} // namespace pft using SomeExpr = Fortran::evaluate::Expr; using SymbolRef = Fortran::common::Reference; @@ -295,7 +295,7 @@ class AbstractConverter { /// Get the OpBuilder virtual fir::FirOpBuilder &getFirOpBuilder() = 0; /// Get the ModuleOp - virtual mlir::ModuleOp &getModuleOp() = 0; + virtual mlir::ModuleOp getModuleOp() = 0; /// Get the MLIRContext virtual mlir::MLIRContext &getMLIRContext() = 0; /// Unique a symbol (add a containing scope specific prefix) diff --git a/flang/include/flang/Lower/Bridge.h b/flang/include/flang/Lower/Bridge.h index 8ea5ed52e2821..6404a16f7785a 100644 --- a/flang/include/flang/Lower/Bridge.h +++ b/flang/include/flang/Lower/Bridge.h @@ -23,6 +23,7 @@ #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Dialect/Support/KindMapping.h" #include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/OwningOpRef.h" #include namespace llvm { @@ -83,7 +84,8 @@ class LoweringBridge { mlir::MLIRContext &getMLIRContext() { return context; } /// Get the ModuleOp. It can never be null, which is asserted in the ctor. 
- mlir::ModuleOp &getModule() { return *module.get(); } + mlir::ModuleOp getModule() { return *module; } + mlir::ModuleOp getModuleAndRelease() { return module.release(); } const Fortran::common::IntrinsicTypeDefaultKinds &getDefaultKinds() const { return defaultKinds; @@ -166,7 +168,7 @@ class LoweringBridge { const Fortran::evaluate::TargetCharacteristics &targetCharacteristics; const Fortran::parser::AllCookedSources *cooked; mlir::MLIRContext &context; - std::unique_ptr module; + mlir::OwningOpRef module; fir::KindMapping &kindMap; const Fortran::lower::LoweringOptions &loweringOptions; const std::vector &envDefaults; diff --git a/flang/include/flang/Lower/OpenACC.h b/flang/include/flang/Lower/OpenACC.h index fbf61e7184ae2..0d7038a7fd856 100644 --- a/flang/include/flang/Lower/OpenACC.h +++ b/flang/include/flang/Lower/OpenACC.h @@ -19,7 +19,7 @@ namespace llvm { template class SmallVector; class StringRef; -} +} // namespace llvm namespace mlir { class Location; @@ -44,7 +44,7 @@ struct OpenACCRoutineConstruct; namespace semantics { class SemanticsContext; class Symbol; -} +} // namespace semantics namespace lower { @@ -78,11 +78,11 @@ void genOpenACCDeclarativeConstruct(AbstractConverter &, AccRoutineInfoMappingList &); void genOpenACCRoutineConstruct(AbstractConverter &, Fortran::semantics::SemanticsContext &, - mlir::ModuleOp &, + mlir::ModuleOp, const parser::OpenACCRoutineConstruct &, AccRoutineInfoMappingList &); -void finalizeOpenACCRoutineAttachment(mlir::ModuleOp &, +void finalizeOpenACCRoutineAttachment(mlir::ModuleOp, AccRoutineInfoMappingList &); /// Get a acc.private.recipe op for the given type or create it if it does not diff --git a/flang/include/flang/Tools/CrossToolHelpers.h b/flang/include/flang/Tools/CrossToolHelpers.h index c0091e1c953b8..0286f2aa14519 100644 --- a/flang/include/flang/Tools/CrossToolHelpers.h +++ b/flang/include/flang/Tools/CrossToolHelpers.h @@ -174,7 +174,7 @@ struct OffloadModuleOpts { // Shares assinging of the OpenMP 
OffloadModuleInterface and its assorted // attributes accross Flang tools (bbc/flang) [[maybe_unused]] static void setOffloadModuleInterfaceAttributes( - mlir::ModuleOp &module, OffloadModuleOpts Opts) { + mlir::ModuleOp module, OffloadModuleOpts Opts) { // Should be registered by the OpenMPDialect if (auto offloadMod = llvm::dyn_cast( module.getOperation())) { @@ -198,7 +198,7 @@ struct OffloadModuleOpts { } [[maybe_unused]] static void setOpenMPVersionAttribute( - mlir::ModuleOp &module, int64_t version) { + mlir::ModuleOp module, int64_t version) { module.getOperation()->setAttr( mlir::StringAttr::get(module.getContext(), llvm::Twine{"omp.version"}), mlir::omp::VersionAttr::get(module.getContext(), version)); diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp index 77631f70dfd19..603cb039d20b1 100644 --- a/flang/lib/Frontend/FrontendActions.cpp +++ b/flang/lib/Frontend/FrontendActions.cpp @@ -149,7 +149,7 @@ bool PrescanAndSemaDebugAction::beginSourceFileAction() { (runSemanticChecks() || true) && (generateRtTypeTables() || true); } -static void addDependentLibs(mlir::ModuleOp &mlirModule, CompilerInstance &ci) { +static void addDependentLibs(mlir::ModuleOp mlirModule, CompilerInstance &ci) { const std::vector &libs = ci.getInvocation().getCodeGenOpts().DependentLibs; if (libs.empty()) { @@ -171,7 +171,7 @@ static void addDependentLibs(mlir::ModuleOp &mlirModule, CompilerInstance &ci) { // Add to MLIR code target specific items which are dependent on target // configuration specified by the user. 
// Clang equivalent function: AMDGPUTargetCodeGenInfo::emitTargetGlobals -static void addAMDGPUSpecificMLIRItems(mlir::ModuleOp &mlirModule, +static void addAMDGPUSpecificMLIRItems(mlir::ModuleOp mlirModule, CompilerInstance &ci) { const TargetOptions &targetOpts = ci.getInvocation().getTargetOpts(); const llvm::Triple triple(targetOpts.triple); @@ -269,7 +269,7 @@ bool CodeGenAction::beginSourceFileAction() { return false; } - mlirModule = std::make_unique(module.release()); + mlirModule = std::move(module); const llvm::DataLayout &dl = targetMachine.createDataLayout(); fir::support::setMLIRDataLayout(*mlirModule, dl); return true; @@ -303,14 +303,11 @@ bool CodeGenAction::beginSourceFileAction() { ci.getInvocation().getFrontendOpts().features, targetMachine, ci.getInvocation().getTargetOpts(), ci.getInvocation().getCodeGenOpts()); - // Fetch module from lb, so we can set - mlirModule = std::make_unique(lb.getModule()); - if (ci.getInvocation().getFrontendOpts().features.IsEnabled( Fortran::common::LanguageFeature::OpenMP)) { - setOffloadModuleInterfaceAttributes(*mlirModule, + setOffloadModuleInterfaceAttributes(lb.getModule(), ci.getInvocation().getLangOpts()); - setOpenMPVersionAttribute(*mlirModule, + setOpenMPVersionAttribute(lb.getModule(), ci.getInvocation().getLangOpts().OpenMPVersion); } @@ -318,6 +315,9 @@ bool CodeGenAction::beginSourceFileAction() { Fortran::parser::Program &parseTree{*ci.getParsing().parseTree()}; lb.lower(parseTree, ci.getSemanticsContext()); + // Fetch module from lb, so we can set + mlirModule = lb.getModuleAndRelease(); + // Add target specific items like dependent libraries, target specific // constants etc. 
addDependentLibs(*mlirModule, ci); @@ -961,6 +961,9 @@ static void generateMachineCodeOrAssemblyImpl(clang::DiagnosticsEngine &diags, // Run the passes codeGenPasses.run(llvmModule); + + // Cleanup + delete tlii; } void CodeGenAction::runOptimizationPipeline(llvm::raw_pwrite_stream &os) { @@ -1043,6 +1046,9 @@ void CodeGenAction::runOptimizationPipeline(llvm::raw_pwrite_stream &os) { // Run the passes. mpm.run(*llvmModule, mam); + + // Cleanup + delete tlii; } // This class handles optimization remark messages requested if diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 17b794d147c6f..c7e2635230e98 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -1028,7 +1028,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { fir::FirOpBuilder &getFirOpBuilder() override final { return *builder; } - mlir::ModuleOp &getModuleOp() override final { return bridge.getModule(); } + mlir::ModuleOp getModuleOp() override final { return bridge.getModule(); } mlir::MLIRContext &getMLIRContext() override final { return bridge.getMLIRContext(); @@ -6137,10 +6137,7 @@ void Fortran::lower::LoweringBridge::lower( } void Fortran::lower::LoweringBridge::parseSourceFile(llvm::SourceMgr &srcMgr) { - mlir::OwningOpRef owningRef = - mlir::parseSourceFile(srcMgr, &context); - module.reset(new mlir::ModuleOp(owningRef.get().getOperation())); - owningRef.release(); + module = mlir::parseSourceFile(srcMgr, &context); } Fortran::lower::LoweringBridge::LoweringBridge( @@ -6207,19 +6204,18 @@ Fortran::lower::LoweringBridge::LoweringBridge( }; // Create the module and attach the attributes. 
- module = std::make_unique( + module = mlir::OwningOpRef( mlir::ModuleOp::create(getPathLocation())); - assert(module.get() && "module was not created"); - fir::setTargetTriple(*module.get(), triple); - fir::setKindMapping(*module.get(), kindMap); - fir::setTargetCPU(*module.get(), targetMachine.getTargetCPU()); - fir::setTuneCPU(*module.get(), targetOpts.cpuToTuneFor); - fir::setTargetFeatures(*module.get(), targetMachine.getTargetFeatureString()); - fir::support::setMLIRDataLayout(*module.get(), - targetMachine.createDataLayout()); - fir::setIdent(*module.get(), Fortran::common::getFlangFullVersion()); + assert(*module && "module was not created"); + fir::setTargetTriple(*module, triple); + fir::setKindMapping(*module, kindMap); + fir::setTargetCPU(*module, targetMachine.getTargetCPU()); + fir::setTuneCPU(*module, targetOpts.cpuToTuneFor); + fir::setTargetFeatures(*module, targetMachine.getTargetFeatureString()); + fir::support::setMLIRDataLayout(*module, targetMachine.createDataLayout()); + fir::setIdent(*module, Fortran::common::getFlangFullVersion()); if (cgOpts.RecordCommandLine) - fir::setCommandline(*module.get(), *cgOpts.RecordCommandLine); + fir::setCommandline(*module, *cgOpts.RecordCommandLine); } void Fortran::lower::genCleanUpInRegionIfAny( diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp index ed18ad89c16ef..8155c36396b11 100644 --- a/flang/lib/Lower/OpenACC.cpp +++ b/flang/lib/Lower/OpenACC.cpp @@ -2670,11 +2670,13 @@ static void genACCDataOp(Fortran::lower::AbstractConverter &converter, asyncOnlyDeviceTypes); attachEntryOperands.append(dataClauseOperands.begin() + crtDataStart, dataClauseOperands.end()); - } else if(const auto *defaultClause = - std::get_if(&clause.u)) { + } else if (const auto *defaultClause = + std::get_if( + &clause.u)) { if ((defaultClause->v).v == llvm::acc::DefaultValue::ACC_Default_none) hasDefaultNone = true; - else if ((defaultClause->v).v == llvm::acc::DefaultValue::ACC_Default_present) + else if 
((defaultClause->v).v == + llvm::acc::DefaultValue::ACC_Default_present) hasDefaultPresent = true; } } @@ -3830,7 +3832,7 @@ genDeclareInFunction(Fortran::lower::AbstractConverter &converter, static void genDeclareInModule(Fortran::lower::AbstractConverter &converter, - mlir::ModuleOp &moduleOp, + mlir::ModuleOp moduleOp, const Fortran::parser::AccClauseList &accClauseList) { mlir::OpBuilder modBuilder(moduleOp.getBodyRegion()); for (const Fortran::parser::AccClause &clause : accClauseList.v) { @@ -3981,7 +3983,7 @@ static void attachRoutineInfo(mlir::func::FuncOp func, void Fortran::lower::genOpenACCRoutineConstruct( Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semanticsContext, mlir::ModuleOp &mod, + Fortran::semantics::SemanticsContext &semanticsContext, mlir::ModuleOp mod, const Fortran::parser::OpenACCRoutineConstruct &routineConstruct, Fortran::lower::AccRoutineInfoMappingList &accRoutineInfos) { fir::FirOpBuilder &builder = converter.getFirOpBuilder(); @@ -4139,7 +4141,7 @@ void Fortran::lower::genOpenACCRoutineConstruct( } void Fortran::lower::finalizeOpenACCRoutineAttachment( - mlir::ModuleOp &mod, + mlir::ModuleOp mod, Fortran::lower::AccRoutineInfoMappingList &accRoutineInfos) { for (auto &mapping : accRoutineInfos) { mlir::func::FuncOp funcOp = diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp index 926f83b9c9a64..4edea86b417c3 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp @@ -3087,7 +3087,7 @@ struct GlobalOpConversion : public fir::FIROpConversion { private: static void addComdat(mlir::LLVM::GlobalOp &global, mlir::ConversionPatternRewriter &rewriter, - mlir::ModuleOp &module) { + mlir::ModuleOp module) { const char *comdatName = "__llvm_comdat"; mlir::LLVM::ComdatOp comdatOp = module.lookupSymbol(comdatName); diff --git a/flang/unittests/Frontend/CodeGenActionTest.cpp 
b/flang/unittests/Frontend/CodeGenActionTest.cpp index 5d75de03d4e55..e9ff095973b97 100644 --- a/flang/unittests/Frontend/CodeGenActionTest.cpp +++ b/flang/unittests/Frontend/CodeGenActionTest.cpp @@ -72,8 +72,7 @@ class LLVMConversionFailureCodeGenAction : public CodeGenAction { mlirCtx->loadDialect(); mlir::Location loc(mlir::UnknownLoc::get(mlirCtx.get())); - mlirModule = - std::make_unique(mlir::ModuleOp::create(loc, "mod")); + mlirModule = mlir::ModuleOp::create(loc, "mod"); mlir::OpBuilder builder(mlirCtx.get()); builder.setInsertionPointToStart(&mlirModule->getRegion().front()); diff --git a/flang/unittests/Optimizer/Builder/CharacterTest.cpp b/flang/unittests/Optimizer/Builder/CharacterTest.cpp index c6defcd51095b..6d912b81d9541 100644 --- a/flang/unittests/Optimizer/Builder/CharacterTest.cpp +++ b/flang/unittests/Optimizer/Builder/CharacterTest.cpp @@ -26,19 +26,20 @@ struct CharacterTest : public testing::Test { // Set up a Module with a dummy function operation inside. // Set the insertion point in the function entry block. 
- mlir::ModuleOp mod = builder.create(loc); - mlir::func::FuncOp func = mlir::func::FuncOp::create( + moduleOp = builder.create(loc); + builder.setInsertionPointToStart(moduleOp->getBody()); + mlir::func::FuncOp func = builder.create( loc, "func1", builder.getFunctionType(std::nullopt, std::nullopt)); auto *entryBlock = func.addEntryBlock(); - mod.push_back(mod); builder.setInsertionPointToStart(entryBlock); - firBuilder = std::make_unique(mod, *kindMap); + firBuilder = std::make_unique(builder, *kindMap); } fir::FirOpBuilder &getBuilder() { return *firBuilder; } mlir::MLIRContext context; + mlir::OwningOpRef moduleOp; std::unique_ptr kindMap; std::unique_ptr firBuilder; }; diff --git a/flang/unittests/Optimizer/Builder/ComplexTest.cpp b/flang/unittests/Optimizer/Builder/ComplexTest.cpp index 6472a52f25ee5..eefab118e255a 100644 --- a/flang/unittests/Optimizer/Builder/ComplexTest.cpp +++ b/flang/unittests/Optimizer/Builder/ComplexTest.cpp @@ -22,15 +22,15 @@ struct ComplexTest : public testing::Test { // Set up a Module with a dummy function operation inside. // Set the insertion point in the function entry block. 
- mlir::ModuleOp mod = builder.create(loc); - mlir::func::FuncOp func = mlir::func::FuncOp::create( + moduleOp = builder.create(loc); + builder.setInsertionPointToStart(moduleOp->getBody()); + mlir::func::FuncOp func = builder.create( loc, "func1", builder.getFunctionType(std::nullopt, std::nullopt)); auto *entryBlock = func.addEntryBlock(); - mod.push_back(mod); builder.setInsertionPointToStart(entryBlock); kindMap = std::make_unique(&context); - firBuilder = std::make_unique(mod, *kindMap); + firBuilder = std::make_unique(builder, *kindMap); helper = std::make_unique(*firBuilder, loc); // Init commonly used types @@ -46,6 +46,7 @@ struct ComplexTest : public testing::Test { } mlir::MLIRContext context; + mlir::OwningOpRef moduleOp; std::unique_ptr kindMap; std::unique_ptr firBuilder; std::unique_ptr helper; diff --git a/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp b/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp index f63afe4137683..05407d96998a2 100644 --- a/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp +++ b/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp @@ -26,19 +26,20 @@ struct FIRBuilderTest : public testing::Test { // Set up a Module with a dummy function operation inside. // Set the insertion point in the function entry block. 
- mlir::ModuleOp mod = builder.create(loc); - mlir::func::FuncOp func = mlir::func::FuncOp::create( + moduleOp = builder.create(loc); + builder.setInsertionPointToStart(moduleOp->getBody()); + mlir::func::FuncOp func = builder.create( loc, "func1", builder.getFunctionType(std::nullopt, std::nullopt)); auto *entryBlock = func.addEntryBlock(); - mod.push_back(mod); builder.setInsertionPointToStart(entryBlock); - firBuilder = std::make_unique(mod, kindMap); + firBuilder = std::make_unique(builder, kindMap); } fir::FirOpBuilder &getBuilder() { return *firBuilder; } mlir::MLIRContext context; + mlir::OwningOpRef moduleOp; std::unique_ptr firBuilder; }; diff --git a/flang/unittests/Optimizer/Builder/HLFIRToolsTest.cpp b/flang/unittests/Optimizer/Builder/HLFIRToolsTest.cpp index 1858b276f1fc3..640b7ecc1e565 100644 --- a/flang/unittests/Optimizer/Builder/HLFIRToolsTest.cpp +++ b/flang/unittests/Optimizer/Builder/HLFIRToolsTest.cpp @@ -25,14 +25,14 @@ struct HLFIRToolsTest : public testing::Test { // Set up a Module with a dummy function operation inside. // Set the insertion point in the function entry block. 
- mlir::ModuleOp mod = builder.create(loc); - mlir::func::FuncOp func = mlir::func::FuncOp::create( + moduleOp = builder.create(loc); + builder.setInsertionPointToStart(moduleOp->getBody()); + mlir::func::FuncOp func = builder.create( loc, "func1", builder.getFunctionType(std::nullopt, std::nullopt)); auto *entryBlock = func.addEntryBlock(); - mod.push_back(mod); builder.setInsertionPointToStart(entryBlock); - firBuilder = std::make_unique(mod, kindMap); + firBuilder = std::make_unique(builder, kindMap); } mlir::Value createDeclare(fir::ExtendedValue exv) { @@ -52,6 +52,7 @@ struct HLFIRToolsTest : public testing::Test { int varCounter = 0; mlir::MLIRContext context; + mlir::OwningOpRef moduleOp; std::unique_ptr firBuilder; }; diff --git a/flang/unittests/Optimizer/Builder/Runtime/RuntimeCallTestBase.h b/flang/unittests/Optimizer/Builder/Runtime/RuntimeCallTestBase.h index d0ec97733e83d..40abf567400b3 100644 --- a/flang/unittests/Optimizer/Builder/Runtime/RuntimeCallTestBase.h +++ b/flang/unittests/Optimizer/Builder/Runtime/RuntimeCallTestBase.h @@ -24,16 +24,16 @@ struct RuntimeCallTest : public testing::Test { // Set up a Module with a dummy function operation inside. // Set the insertion point in the function entry block. 
- mlir::ModuleOp mod = builder.create(loc); + moduleOp = builder.create(loc); + builder.setInsertionPointToStart(moduleOp->getBody()); mlir::func::FuncOp func = - mlir::func::FuncOp::create(loc, "runtime_unit_tests_func", + builder.create(loc, "runtime_unit_tests_func", builder.getFunctionType(std::nullopt, std::nullopt)); auto *entryBlock = func.addEntryBlock(); - mod.push_back(mod); builder.setInsertionPointToStart(entryBlock); kindMap = std::make_unique(&context); - firBuilder = std::make_unique(mod, *kindMap); + firBuilder = std::make_unique(builder, *kindMap); i1Ty = firBuilder->getI1Type(); i8Ty = firBuilder->getI8Type(); @@ -66,6 +66,7 @@ struct RuntimeCallTest : public testing::Test { } mlir::MLIRContext context; + mlir::OwningOpRef moduleOp; std::unique_ptr kindMap; std::unique_ptr firBuilder; diff --git a/flang/unittests/Optimizer/FortranVariableTest.cpp b/flang/unittests/Optimizer/FortranVariableTest.cpp index 87efb624735cf..4ba9359a07e4d 100644 --- a/flang/unittests/Optimizer/FortranVariableTest.cpp +++ b/flang/unittests/Optimizer/FortranVariableTest.cpp @@ -19,12 +19,12 @@ struct FortranVariableTest : public testing::Test { // Set up a Module with a dummy function operation inside. // Set the insertion point in the function entry block. 
- mlir::ModuleOp mod = builder->create(loc); + moduleOp = builder->create(loc); + builder->setInsertionPointToStart(moduleOp->getBody()); mlir::func::FuncOp func = - mlir::func::FuncOp::create(loc, "fortran_variable_tests", + builder->create(loc, "fortran_variable_tests", builder->getFunctionType(std::nullopt, std::nullopt)); auto *entryBlock = func.addEntryBlock(); - mod.push_back(mod); builder->setInsertionPointToStart(entryBlock); } @@ -40,6 +40,7 @@ struct FortranVariableTest : public testing::Test { } mlir::MLIRContext context; std::unique_ptr builder; + mlir::OwningOpRef moduleOp; }; TEST_F(FortranVariableTest, SimpleScalar) { diff --git a/flang/unittests/Runtime/ArrayConstructor.cpp b/flang/unittests/Runtime/ArrayConstructor.cpp index 62e3b780a27e7..53774a0eea07d 100644 --- a/flang/unittests/Runtime/ArrayConstructor.cpp +++ b/flang/unittests/Runtime/ArrayConstructor.cpp @@ -127,6 +127,9 @@ TEST(ArrayConstructor, Character) { 0); result.Deallocate(); cookieAllocator.deallocate(acVector, 1); + x->Deallocate(); + y->Deallocate(); + c->Deallocate(); } TEST(ArrayConstructor, CharacterRuntimeCheck) { diff --git a/flang/unittests/Runtime/CharacterTest.cpp b/flang/unittests/Runtime/CharacterTest.cpp index e54fd8a5075f6..d462c9120fd8c 100644 --- a/flang/unittests/Runtime/CharacterTest.cpp +++ b/flang/unittests/Runtime/CharacterTest.cpp @@ -259,6 +259,9 @@ void RunExtremumTests(const char *which, t.expect[i], t.expect[i] + std::strlen(t.expect[i])}; EXPECT_EQ(expect, got) << "inputs: '" << t.x[i] << "','" << t.y[i] << "'"; } + + x->Deallocate(); + y->Deallocate(); } } From 2d6e7c2b359d4cafca8eaca4c9ed308a3a8fa6c1 Mon Sep 17 00:00:00 2001 From: Alexey Gerenkov Date: Wed, 25 Dec 2024 11:56:15 +0300 Subject: [PATCH 055/567] [Clang][Xtensa] Add Xtensa target. (#118008) This PR implements support for generic Xtensa target in CLang. 
Co-authored-by: Andrei Safronov --- clang/include/clang/Basic/TargetInfo.h | 9 +- clang/lib/AST/ASTContext.cpp | 39 ++++ clang/lib/Basic/CMakeLists.txt | 1 + clang/lib/Basic/Targets.cpp | 4 + clang/lib/Basic/Targets/Xtensa.cpp | 35 +++ clang/lib/Basic/Targets/Xtensa.h | 111 +++++++++ clang/lib/Driver/ToolChains/CommonArgs.cpp | 5 + clang/test/Preprocessor/init.c | 249 +++++++++++++++++++++ clang/test/Preprocessor/stdint.c | 107 +++++++++ 9 files changed, 559 insertions(+), 1 deletion(-) create mode 100644 clang/lib/Basic/Targets/Xtensa.cpp create mode 100644 clang/lib/Basic/Targets/Xtensa.h diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h index 82bd537b242c1..f2905f30a7c34 100644 --- a/clang/include/clang/Basic/TargetInfo.h +++ b/clang/include/clang/Basic/TargetInfo.h @@ -358,7 +358,14 @@ class TargetInfo : public TransferrableTargetInfo, // void *__saved_reg_area_end_pointer; // void *__overflow_area_pointer; //} va_list; - HexagonBuiltinVaList + HexagonBuiltinVaList, + + // typedef struct __va_list_tag { + // int* __va_stk; + // int* __va_reg; + // int __va_ndx; + //} va_list; + XtensaABIBuiltinVaList }; protected: diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 6ec927e13a755..8b4ae58e8427a 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -9751,6 +9751,43 @@ static TypedefDecl *CreateHexagonBuiltinVaListDecl(const ASTContext *Context) { return Context->buildImplicitTypedef(VaListTagArrayType, "__builtin_va_list"); } +static TypedefDecl * +CreateXtensaABIBuiltinVaListDecl(const ASTContext *Context) { + // typedef struct __va_list_tag { + RecordDecl *VaListTagDecl = Context->buildImplicitRecord("__va_list_tag"); + + VaListTagDecl->startDefinition(); + + // int* __va_stk; + // int* __va_reg; + // int __va_ndx; + constexpr size_t NumFields = 3; + QualType FieldTypes[NumFields] = {Context->getPointerType(Context->IntTy), + Context->getPointerType(Context->IntTy), 
+ Context->IntTy}; + const char *FieldNames[NumFields] = {"__va_stk", "__va_reg", "__va_ndx"}; + + // Create fields + for (unsigned i = 0; i < NumFields; ++i) { + FieldDecl *Field = FieldDecl::Create( + *Context, VaListTagDecl, SourceLocation(), SourceLocation(), + &Context->Idents.get(FieldNames[i]), FieldTypes[i], /*TInfo=*/nullptr, + /*BitWidth=*/nullptr, + /*Mutable=*/false, ICIS_NoInit); + Field->setAccess(AS_public); + VaListTagDecl->addDecl(Field); + } + VaListTagDecl->completeDefinition(); + Context->VaListTagDecl = VaListTagDecl; + QualType VaListTagType = Context->getRecordType(VaListTagDecl); + + // } __va_list_tag; + TypedefDecl *VaListTagTypedefDecl = + Context->buildImplicitTypedef(VaListTagType, "__builtin_va_list"); + + return VaListTagTypedefDecl; +} + static TypedefDecl *CreateVaListDecl(const ASTContext *Context, TargetInfo::BuiltinVaListKind Kind) { switch (Kind) { @@ -9772,6 +9809,8 @@ static TypedefDecl *CreateVaListDecl(const ASTContext *Context, return CreateSystemZBuiltinVaListDecl(Context); case TargetInfo::HexagonBuiltinVaList: return CreateHexagonBuiltinVaListDecl(Context); + case TargetInfo::XtensaABIBuiltinVaList: + return CreateXtensaABIBuiltinVaListDecl(Context); } llvm_unreachable("Unhandled __builtin_va_list type kind"); diff --git a/clang/lib/Basic/CMakeLists.txt b/clang/lib/Basic/CMakeLists.txt index e11e1ac4a6fa6..331dfbb3f4b67 100644 --- a/clang/lib/Basic/CMakeLists.txt +++ b/clang/lib/Basic/CMakeLists.txt @@ -120,6 +120,7 @@ add_clang_library(clangBasic Targets/WebAssembly.cpp Targets/X86.cpp Targets/XCore.cpp + Targets/Xtensa.cpp TokenKinds.cpp TypeTraits.cpp Version.cpp diff --git a/clang/lib/Basic/Targets.cpp b/clang/lib/Basic/Targets.cpp index d0815ad33bc75..be5dedbe8044e 100644 --- a/clang/lib/Basic/Targets.cpp +++ b/clang/lib/Basic/Targets.cpp @@ -40,6 +40,7 @@ #include "Targets/WebAssembly.h" #include "Targets/X86.h" #include "Targets/XCore.h" +#include "Targets/Xtensa.h" #include "clang/Basic/Diagnostic.h" #include 
"clang/Basic/DiagnosticFrontend.h" #include "llvm/ADT/StringExtras.h" @@ -751,6 +752,9 @@ std::unique_ptr AllocateTarget(const llvm::Triple &Triple, default: return std::make_unique(Triple, Opts); } + + case llvm::Triple::xtensa: + return std::make_unique(Triple, Opts); } } } // namespace targets diff --git a/clang/lib/Basic/Targets/Xtensa.cpp b/clang/lib/Basic/Targets/Xtensa.cpp new file mode 100644 index 0000000000000..f3216f4ba4e80 --- /dev/null +++ b/clang/lib/Basic/Targets/Xtensa.cpp @@ -0,0 +1,35 @@ +//===--- Xtensa.cpp - Implement Xtensa target feature support -------------===// +// +// The LLVM Compiler Infrastructure +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements Xtensa TargetInfo objects. +// +//===----------------------------------------------------------------------===// + +#include "Xtensa.h" +#include "clang/Basic/Builtins.h" +#include "clang/Basic/MacroBuilder.h" +#include "clang/Basic/TargetBuiltins.h" + +using namespace clang; +using namespace clang::targets; + +void XtensaTargetInfo::getTargetDefines(const LangOptions &Opts, + MacroBuilder &Builder) const { + Builder.defineMacro("__xtensa__"); + Builder.defineMacro("__XTENSA__"); + if (BigEndian) + Builder.defineMacro("__XTENSA_EB__"); + else + Builder.defineMacro("__XTENSA_EL__"); + Builder.defineMacro("__XCHAL_HAVE_BE", BigEndian ? 
"1" : "0"); + Builder.defineMacro("__XCHAL_HAVE_ABS"); // core arch + Builder.defineMacro("__XCHAL_HAVE_ADDX"); // core arch + Builder.defineMacro("__XCHAL_HAVE_L32R"); // core arch +} diff --git a/clang/lib/Basic/Targets/Xtensa.h b/clang/lib/Basic/Targets/Xtensa.h new file mode 100644 index 0000000000000..a440ba8aa3c6d --- /dev/null +++ b/clang/lib/Basic/Targets/Xtensa.h @@ -0,0 +1,111 @@ +//===--- Xtensa.h - Declare Xtensa target feature support -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares Xtensa TargetInfo objects. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_LIB_BASIC_TARGETS_XTENSA_H +#define LLVM_CLANG_LIB_BASIC_TARGETS_XTENSA_H + +#include "clang/Basic/TargetInfo.h" +#include "clang/Basic/TargetOptions.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/Support/Compiler.h" +#include "llvm/TargetParser/Triple.h" + +#include "clang/Basic/Builtins.h" +#include "clang/Basic/MacroBuilder.h" +#include "clang/Basic/TargetBuiltins.h" + +namespace clang { +namespace targets { + +class LLVM_LIBRARY_VISIBILITY XtensaTargetInfo : public TargetInfo { + static const Builtin::Info BuiltinInfo[]; + +protected: + std::string CPU; + +public: + XtensaTargetInfo(const llvm::Triple &Triple, const TargetOptions &) + : TargetInfo(Triple) { + // no big-endianess support yet + BigEndian = false; + NoAsmVariants = true; + LongLongAlign = 64; + SuitableAlign = 32; + DoubleAlign = LongDoubleAlign = 64; + SizeType = UnsignedInt; + PtrDiffType = SignedInt; + IntPtrType = SignedInt; + WCharType = SignedInt; + WIntType = UnsignedInt; + UseZeroLengthBitfieldAlignment = true; + 
MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 32; + resetDataLayout("e-m:e-p:32:32-i8:8:32-i16:16:32-i64:64-n32"); + } + + void getTargetDefines(const LangOptions &Opts, + MacroBuilder &Builder) const override; + + ArrayRef getTargetBuiltins() const override { + return std::nullopt; + } + + BuiltinVaListKind getBuiltinVaListKind() const override { + return TargetInfo::XtensaABIBuiltinVaList; + } + + std::string_view getClobbers() const override { return ""; } + + ArrayRef getGCCRegNames() const override { + static const char *const GCCRegNames[] = { + // General register name + "a0", "sp", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9", "a10", + "a11", "a12", "a13", "a14", "a15", + // Special register name + "sar"}; + return llvm::ArrayRef(GCCRegNames); + } + + ArrayRef getGCCRegAliases() const override { + return std::nullopt; + } + + bool validateAsmConstraint(const char *&Name, + TargetInfo::ConstraintInfo &Info) const override { + switch (*Name) { + default: + return false; + case 'a': + Info.setAllowsRegister(); + return true; + } + return false; + } + + int getEHDataRegisterNumber(unsigned RegNo) const override { + return (RegNo < 2) ? 
RegNo : -1; + } + + bool isValidCPUName(StringRef Name) const override { + return llvm::StringSwitch(Name).Case("generic", true).Default(false); + } + + bool setCPU(const std::string &Name) override { + CPU = Name; + return isValidCPUName(Name); + } +}; + +} // namespace targets +} // namespace clang +#endif // LLVM_CLANG_LIB_BASIC_TARGETS_XTENSA_H diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index bc62e8c48238b..8b9639061d543 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -708,6 +708,11 @@ std::string tools::getCPUName(const Driver &D, const ArgList &Args, case llvm::Triple::loongarch32: case llvm::Triple::loongarch64: return loongarch::getLoongArchTargetCPU(Args, T); + + case llvm::Triple::xtensa: + if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) + return A->getValue(); + return ""; } } diff --git a/clang/test/Preprocessor/init.c b/clang/test/Preprocessor/init.c index 3b99204acd7a4..5999b9c1d1bc3 100644 --- a/clang/test/Preprocessor/init.c +++ b/clang/test/Preprocessor/init.c @@ -2746,3 +2746,252 @@ // RUN: %clang_cc1 -dM -triple=x86_64-uefi -E /dev/null | FileCheck -match-full-lines -check-prefix UEFI %s // UEFI: #define __UEFI__ 1 + +// RUN: %clang_cc1 -E -dM -ffreestanding -fgnuc-version=4.2.1 -triple=xtensa < /dev/null \ +// RUN: | FileCheck -match-full-lines -check-prefix=XTENSA %s +// XTENSA: #define _ILP32 1 +// XTENSA: #define __ATOMIC_ACQUIRE 2 +// XTENSA: #define __ATOMIC_ACQ_REL 4 +// XTENSA: #define __ATOMIC_CONSUME 1 +// XTENSA: #define __ATOMIC_RELAXED 0 +// XTENSA: #define __ATOMIC_RELEASE 3 +// XTENSA: #define __ATOMIC_SEQ_CST 5 +// XTENSA: #define __BIGGEST_ALIGNMENT__ 4 +// XTENSA: #define __BITINT_MAXWIDTH__ 128 +// XTENSA: #define __BOOL_WIDTH__ 1 +// XTENSA: #define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__ +// XTENSA: #define __CHAR16_TYPE__ unsigned short +// XTENSA: #define __CHAR32_TYPE__ unsigned int +// XTENSA: 
#define __CHAR_BIT__ 8 +// XTENSA: #define __CLANG_ATOMIC_BOOL_LOCK_FREE 2 +// XTENSA: #define __CLANG_ATOMIC_CHAR16_T_LOCK_FREE 2 +// XTENSA: #define __CLANG_ATOMIC_CHAR32_T_LOCK_FREE 2 +// XTENSA: #define __CLANG_ATOMIC_CHAR_LOCK_FREE 2 +// XTENSA: #define __CLANG_ATOMIC_INT_LOCK_FREE 2 +// XTENSA: #define __CLANG_ATOMIC_LLONG_LOCK_FREE 1 +// XTENSA: #define __CLANG_ATOMIC_LONG_LOCK_FREE 2 +// XTENSA: #define __CLANG_ATOMIC_POINTER_LOCK_FREE 2 +// XTENSA: #define __CLANG_ATOMIC_SHORT_LOCK_FREE 2 +// XTENSA: #define __CLANG_ATOMIC_WCHAR_T_LOCK_FREE 2 +// XTENSA: #define __CONSTANT_CFSTRINGS__ 1 +// XTENSA: #define __DBL_DECIMAL_DIG__ 17 +// XTENSA: #define __DBL_DENORM_MIN__ 4.9406564584124654e-324 +// XTENSA: #define __DBL_DIG__ 15 +// XTENSA: #define __DBL_EPSILON__ 2.2204460492503131e-16 +// XTENSA: #define __DBL_HAS_DENORM__ 1 +// XTENSA: #define __DBL_HAS_INFINITY__ 1 +// XTENSA: #define __DBL_HAS_QUIET_NAN__ 1 +// XTENSA: #define __DBL_MANT_DIG__ 53 +// XTENSA: #define __DBL_MAX_10_EXP__ 308 +// XTENSA: #define __DBL_MAX_EXP__ 1024 +// XTENSA: #define __DBL_MAX__ 1.7976931348623157e+308 +// XTENSA: #define __DBL_MIN_10_EXP__ (-307) +// XTENSA: #define __DBL_MIN_EXP__ (-1021) +// XTENSA: #define __DBL_MIN__ 2.2250738585072014e-308 +// XTENSA: #define __DBL_NORM_MAX__ 1.7976931348623157e+308 +// XTENSA: #define __DECIMAL_DIG__ __LDBL_DECIMAL_DIG__ +// XTENSA: #define __ELF__ 1 +// XTENSA: #define __FINITE_MATH_ONLY__ 0 +// XTENSA: #define __FLT_DECIMAL_DIG__ 9 +// XTENSA: #define __FLT_DENORM_MIN__ 1.40129846e-45F +// XTENSA: #define __FLT_DIG__ 6 +// XTENSA: #define __FLT_EPSILON__ 1.19209290e-7F +// XTENSA: #define __FLT_HAS_DENORM__ 1 +// XTENSA: #define __FLT_HAS_INFINITY__ 1 +// XTENSA: #define __FLT_HAS_QUIET_NAN__ 1 +// XTENSA: #define __FLT_MANT_DIG__ 24 +// XTENSA: #define __FLT_MAX_10_EXP__ 38 +// XTENSA: #define __FLT_MAX_EXP__ 128 +// XTENSA: #define __FLT_MAX__ 3.40282347e+38F +// XTENSA: #define __FLT_MIN_10_EXP__ (-37) +// XTENSA: #define 
__FLT_MIN_EXP__ (-125) +// XTENSA: #define __FLT_MIN__ 1.17549435e-38F +// XTENSA: #define __FLT_NORM_MAX__ 3.40282347e+38F +// XTENSA: #define __FLT_RADIX__ 2 +// XTENSA: #define __FPCLASS_NEGINF 0x0004 +// XTENSA: #define __FPCLASS_NEGNORMAL 0x0008 +// XTENSA: #define __FPCLASS_NEGSUBNORMAL 0x0010 +// XTENSA: #define __FPCLASS_NEGZERO 0x0020 +// XTENSA: #define __FPCLASS_POSINF 0x0200 +// XTENSA: #define __FPCLASS_POSNORMAL 0x0100 +// XTENSA: #define __FPCLASS_POSSUBNORMAL 0x0080 +// XTENSA: #define __FPCLASS_POSZERO 0x0040 +// XTENSA: #define __FPCLASS_QNAN 0x0002 +// XTENSA: #define __FPCLASS_SNAN 0x0001 +// XTENSA: #define __GCC_ATOMIC_BOOL_LOCK_FREE 2 +// XTENSA: #define __GCC_ATOMIC_CHAR16_T_LOCK_FREE 2 +// XTENSA: #define __GCC_ATOMIC_CHAR32_T_LOCK_FREE 2 +// XTENSA: #define __GCC_ATOMIC_CHAR_LOCK_FREE 2 +// XTENSA: #define __GCC_ATOMIC_INT_LOCK_FREE 2 +// XTENSA: #define __GCC_ATOMIC_LLONG_LOCK_FREE 1 +// XTENSA: #define __GCC_ATOMIC_LONG_LOCK_FREE 2 +// XTENSA: #define __GCC_ATOMIC_POINTER_LOCK_FREE 2 +// XTENSA: #define __GCC_ATOMIC_SHORT_LOCK_FREE 2 +// XTENSA: #define __GCC_ATOMIC_TEST_AND_SET_TRUEVAL 1 +// XTENSA: #define __GCC_ATOMIC_WCHAR_T_LOCK_FREE 2 +// XTENSA: #define __GCC_CONSTRUCTIVE_SIZE 64 +// XTENSA: #define __GCC_DESTRUCTIVE_SIZE 64 +// XTENSA: #define __GNUC_MINOR__ {{.*}} +// XTENSA: #define __GNUC_PATCHLEVEL__ {{.*}} +// XTENSA: #define __GNUC_STDC_INLINE__ 1 +// XTENSA: #define __GNUC__ {{.*}} +// XTENSA: #define __GXX_ABI_VERSION {{.*}} +// XTENSA: #define __ILP32__ 1 +// XTENSA: #define __INT16_C_SUFFIX__ +// XTENSA: #define __INT16_MAX__ 32767 +// XTENSA: #define __INT16_TYPE__ short +// XTENSA: #define __INT32_C_SUFFIX__ +// XTENSA: #define __INT32_MAX__ 2147483647 +// XTENSA: #define __INT32_TYPE__ int +// XTENSA: #define __INT64_C_SUFFIX__ LL +// XTENSA: #define __INT64_MAX__ 9223372036854775807LL +// XTENSA: #define __INT64_TYPE__ long long int +// XTENSA: #define __INT8_C_SUFFIX__ +// XTENSA: #define __INT8_MAX__ 127 +// 
XTENSA: #define __INT8_TYPE__ signed char +// XTENSA: #define __INTMAX_C_SUFFIX__ LL +// XTENSA: #define __INTMAX_MAX__ 9223372036854775807LL +// XTENSA: #define __INTMAX_TYPE__ long long int +// XTENSA: #define __INTMAX_WIDTH__ 64 +// XTENSA: #define __INTPTR_MAX__ 2147483647 +// XTENSA: #define __INTPTR_TYPE__ int +// XTENSA: #define __INTPTR_WIDTH__ 32 +// TODO: Xtensa GCC defines INT_FAST16 as int +// XTENSA: #define __INT_FAST16_MAX__ 32767 +// XTENSA: #define __INT_FAST16_TYPE__ short +// XTENSA: #define __INT_FAST16_WIDTH__ 16 +// XTENSA: #define __INT_FAST32_MAX__ 2147483647 +// XTENSA: #define __INT_FAST32_TYPE__ int +// XTENSA: #define __INT_FAST32_WIDTH__ 32 +// XTENSA: #define __INT_FAST64_MAX__ 9223372036854775807LL +// XTENSA: #define __INT_FAST64_TYPE__ long long int +// XTENSA: #define __INT_FAST64_WIDTH__ 64 +// TODO: Xtensa GCC defines INT_FAST8 as int +// XTENSA: #define __INT_FAST8_MAX__ 127 +// XTENSA: #define __INT_FAST8_TYPE__ signed char +// XTENSA: #define __INT_FAST8_WIDTH__ 8 +// XTENSA: #define __INT_LEAST16_MAX__ 32767 +// XTENSA: #define __INT_LEAST16_TYPE__ short +// XTENSA: #define __INT_LEAST16_WIDTH__ 16 +// XTENSA: #define __INT_LEAST32_MAX__ 2147483647 +// XTENSA: #define __INT_LEAST32_TYPE__ int +// XTENSA: #define __INT_LEAST32_WIDTH__ 32 +// XTENSA: #define __INT_LEAST64_MAX__ 9223372036854775807LL +// XTENSA: #define __INT_LEAST64_TYPE__ long long int +// XTENSA: #define __INT_LEAST64_WIDTH__ 64 +// XTENSA: #define __INT_LEAST8_MAX__ 127 +// XTENSA: #define __INT_LEAST8_TYPE__ signed char +// XTENSA: #define __INT_LEAST8_WIDTH__ 8 +// XTENSA: #define __INT_MAX__ 2147483647 +// XTENSA: #define __INT_WIDTH__ 32 +// XTENSA: #define __LDBL_DECIMAL_DIG__ 17 +// XTENSA: #define __LDBL_DENORM_MIN__ 4.9406564584124654e-324L +// XTENSA: #define __LDBL_DIG__ 15 +// XTENSA: #define __LDBL_EPSILON__ 2.2204460492503131e-16L +// XTENSA: #define __LDBL_HAS_DENORM__ 1 +// XTENSA: #define __LDBL_HAS_INFINITY__ 1 +// XTENSA: #define 
__LDBL_HAS_QUIET_NAN__ 1 +// XTENSA: #define __LDBL_MANT_DIG__ 53 +// XTENSA: #define __LDBL_MAX_10_EXP__ 308 +// XTENSA: #define __LDBL_MAX_EXP__ 1024 +// XTENSA: #define __LDBL_MAX__ 1.7976931348623157e+308L +// XTENSA: #define __LDBL_MIN_10_EXP__ (-307) +// XTENSA: #define __LDBL_MIN_EXP__ (-1021) +// XTENSA: #define __LDBL_MIN__ 2.2250738585072014e-308L +// XTENSA: #define __LDBL_NORM_MAX__ 1.7976931348623157e+308L +// XTENSA: #define __LITTLE_ENDIAN__ 1 +// XTENSA: #define __LLONG_WIDTH__ 64 +// XTENSA: #define __LONG_LONG_MAX__ 9223372036854775807LL +// XTENSA: #define __LONG_MAX__ 2147483647L +// XTENSA: #define __LONG_WIDTH__ 32 +// XTENSA: #define __MEMORY_SCOPE_DEVICE 1 +// XTENSA: #define __MEMORY_SCOPE_SINGLE 4 +// XTENSA: #define __MEMORY_SCOPE_SYSTEM 0 +// XTENSA: #define __MEMORY_SCOPE_WRKGRP 2 +// XTENSA: #define __MEMORY_SCOPE_WVFRNT 3 +// XTENSA: #define __NO_INLINE__ 1 +// XTENSA: #define __NO_MATH_ERRNO__ 1 +// XTENSA: #define __OBJC_BOOL_IS_BOOL 0 +// XTENSA: #define __POINTER_WIDTH__ 32 +// XTENSA: #define __PRAGMA_REDEFINE_EXTNAME 1 +// XTENSA: #define __PTRDIFF_MAX__ 2147483647 +// XTENSA: #define __PTRDIFF_TYPE__ int +// XTENSA: #define __PTRDIFF_WIDTH__ 32 +// XTENSA: #define __SCHAR_MAX__ 127 +// XTENSA: #define __SHRT_MAX__ 32767 +// XTENSA: #define __SHRT_WIDTH__ 16 +// XTENSA: #define __SIG_ATOMIC_MAX__ 2147483647 +// XTENSA: #define __SIG_ATOMIC_WIDTH__ 32 +// XTENSA: #define __SIZEOF_DOUBLE__ 8 +// XTENSA: #define __SIZEOF_FLOAT__ 4 +// XTENSA: #define __SIZEOF_INT__ 4 +// XTENSA: #define __SIZEOF_LONG_DOUBLE__ 8 +// XTENSA: #define __SIZEOF_LONG_LONG__ 8 +// XTENSA: #define __SIZEOF_LONG__ 4 +// XTENSA: #define __SIZEOF_POINTER__ 4 +// XTENSA: #define __SIZEOF_PTRDIFF_T__ 4 +// XTENSA: #define __SIZEOF_SHORT__ 2 +// XTENSA: #define __SIZEOF_SIZE_T__ 4 +// XTENSA: #define __SIZEOF_WCHAR_T__ 4 +// XTENSA: #define __SIZEOF_WINT_T__ 4 +// XTENSA: #define __SIZE_MAX__ 4294967295U +// XTENSA: #define __SIZE_TYPE__ unsigned int +// XTENSA: 
#define __SIZE_WIDTH__ 32 +// XTENSA: #define __STDC_EMBED_EMPTY__ 2 +// XTENSA: #define __STDC_EMBED_FOUND__ 1 +// XTENSA: #define __STDC_EMBED_NOT_FOUND__ 0 +// XTENSA: #define __STDC_HOSTED__ 0 +// XTENSA: #define __STDC_UTF_16__ 1 +// XTENSA: #define __STDC_UTF_32__ 1 +// XTENSA: #define __STDC_VERSION__ 201710L +// XTENSA: #define __STDC__ 1 +// XTENSA: #define __UINT16_C_SUFFIX__ +// XTENSA: #define __UINT16_MAX__ 65535 +// XTENSA: #define __UINT16_TYPE__ unsigned short +// XTENSA: #define __UINT32_C_SUFFIX__ U +// XTENSA: #define __UINT32_MAX__ 4294967295U +// XTENSA: #define __UINT32_TYPE__ unsigned int +// XTENSA: #define __UINT64_C_SUFFIX__ ULL +// XTENSA: #define __UINT64_MAX__ 18446744073709551615ULL +// XTENSA: #define __UINT64_TYPE__ long long unsigned int +// XTENSA: #define __UINT8_C_SUFFIX__ +// XTENSA: #define __UINT8_MAX__ 255 +// XTENSA: #define __UINT8_TYPE__ unsigned char +// XTENSA: #define __UINTMAX_C_SUFFIX__ ULL +// XTENSA: #define __UINTMAX_MAX__ 18446744073709551615ULL +// XTENSA: #define __UINTMAX_TYPE__ long long unsigned int +// XTENSA: #define __UINTMAX_WIDTH__ 64 +// XTENSA: #define __UINTPTR_MAX__ 4294967295U +// XTENSA: #define __UINTPTR_TYPE__ unsigned int +// XTENSA: #define __UINTPTR_WIDTH__ 32 +// XTENSA: #define __UINT_FAST16_MAX__ 65535 +// XTENSA: #define __UINT_FAST16_TYPE__ unsigned short +// XTENSA: #define __UINT_FAST32_MAX__ 4294967295U +// XTENSA: #define __UINT_FAST32_TYPE__ unsigned int +// XTENSA: #define __UINT_FAST64_MAX__ 18446744073709551615ULL +// XTENSA: #define __UINT_FAST64_TYPE__ long long unsigned int +// XTENSA: #define __UINT_FAST8_MAX__ 255 +// XTENSA: #define __UINT_FAST8_TYPE__ unsigned char +// XTENSA: #define __UINT_LEAST16_MAX__ 65535 +// XTENSA: #define __UINT_LEAST16_TYPE__ unsigned short +// XTENSA: #define __UINT_LEAST32_MAX__ 4294967295U +// XTENSA: #define __UINT_LEAST32_TYPE__ unsigned int +// XTENSA: #define __UINT_LEAST64_MAX__ 18446744073709551615ULL +// XTENSA: #define 
__UINT_LEAST64_TYPE__ long long unsigned int +// XTENSA: #define __UINT_LEAST8_MAX__ 255 +// XTENSA: #define __UINT_LEAST8_TYPE__ unsigned char +// XTENSA: #define __USER_LABEL_PREFIX__ +// XTENSA: #define __WCHAR_MAX__ 2147483647 +// XTENSA: #define __WCHAR_TYPE__ int +// XTENSA: #define __WCHAR_WIDTH__ 32 +// XTENSA: #define __WINT_MAX__ 4294967295U +// XTENSA: #define __WINT_TYPE__ unsigned int +// XTENSA: #define __WINT_UNSIGNED__ 1 +// XTENSA: #define __WINT_WIDTH__ 32 +// XTENSA: #define __XCHAL_HAVE_ABS 1 +// XTENSA: #define __XCHAL_HAVE_ADDX 1 +// XTENSA: #define __XCHAL_HAVE_BE 0 +// XTENSA: #define __XCHAL_HAVE_L32R 1 +// XTENSA: #define __XTENSA_EL__ 1 +// XTENSA: #define __XTENSA__ 1 +// XTENSA: #define __xtensa__ 1 diff --git a/clang/test/Preprocessor/stdint.c b/clang/test/Preprocessor/stdint.c index 7cb33ed54739a..af1d6983fd995 100644 --- a/clang/test/Preprocessor/stdint.c +++ b/clang/test/Preprocessor/stdint.c @@ -1498,6 +1498,113 @@ // XCORE:INTMAX_C_(0) 0LL // XCORE:UINTMAX_C_(0) 0ULL // +// RUN: %clang_cc1 -E -ffreestanding -triple=xtensa %s | FileCheck -check-prefix XTENSA %s +// +// XTENSA:typedef long long int int64_t; +// XTENSA:typedef long long unsigned int uint64_t; +// XTENSA:typedef int64_t int_least64_t; +// XTENSA:typedef uint64_t uint_least64_t; +// XTENSA:typedef int64_t int_fast64_t; +// XTENSA:typedef uint64_t uint_fast64_t; +// +// XTENSA:typedef int int32_t; +// XTENSA:typedef unsigned int uint32_t; +// XTENSA:typedef int32_t int_least32_t; +// XTENSA:typedef uint32_t uint_least32_t; +// XTENSA:typedef int32_t int_fast32_t; +// XTENSA:typedef uint32_t uint_fast32_t; +// +// XTENSA:typedef short int16_t; +// XTENSA:typedef unsigned short uint16_t; +// XTENSA:typedef int16_t int_least16_t; +// XTENSA:typedef uint16_t uint_least16_t; +// XTENSA:typedef int16_t int_fast16_t; +// XTENSA:typedef uint16_t uint_fast16_t; +// +// XTENSA:typedef signed char int8_t; +// XTENSA:typedef unsigned char uint8_t; +// XTENSA:typedef int8_t 
int_least8_t; +// XTENSA:typedef uint8_t uint_least8_t; +// XTENSA:typedef int8_t int_fast8_t; +// XTENSA:typedef uint8_t uint_fast8_t; +// +// XTENSA:typedef int intptr_t; +// XTENSA:typedef unsigned int uintptr_t; +// +// XTENSA:typedef long long int intmax_t; +// XTENSA:typedef long long unsigned int uintmax_t; +// +// XTENSA:INT8_MAX_ 127 +// XTENSA:INT8_MIN_ (-127 -1) +// XTENSA:UINT8_MAX_ 255 +// XTENSA:INT_LEAST8_MIN_ (-127 -1) +// XTENSA:INT_LEAST8_MAX_ 127 +// XTENSA:UINT_LEAST8_MAX_ 255 +// XTENSA:INT_FAST8_MIN_ (-127 -1) +// XTENSA:INT_FAST8_MAX_ 127 +// XTENSA:UINT_FAST8_MAX_ 255 +// +// XTENSA:INT16_MAX_ 32767 +// XTENSA:INT16_MIN_ (-32767 -1) +// XTENSA:UINT16_MAX_ 65535 +// XTENSA:INT_LEAST16_MIN_ (-32767 -1) +// XTENSA:INT_LEAST16_MAX_ 32767 +// XTENSA:UINT_LEAST16_MAX_ 65535 +// XTENSA:INT_FAST16_MIN_ (-32767 -1) +// XTENSA:INT_FAST16_MAX_ 32767 +// XTENSA:UINT_FAST16_MAX_ 65535 +// +// XTENSA:INT32_MAX_ 2147483647 +// XTENSA:INT32_MIN_ (-2147483647 -1) +// XTENSA:UINT32_MAX_ 4294967295U +// XTENSA:INT_LEAST32_MIN_ (-2147483647 -1) +// XTENSA:INT_LEAST32_MAX_ 2147483647 +// XTENSA:UINT_LEAST32_MAX_ 4294967295U +// XTENSA:INT_FAST32_MIN_ (-2147483647 -1) +// XTENSA:INT_FAST32_MAX_ 2147483647 +// XTENSA:UINT_FAST32_MAX_ 4294967295U +// +// XTENSA:INT64_MAX_ 9223372036854775807LL +// XTENSA:INT64_MIN_ (-9223372036854775807LL -1) +// XTENSA:UINT64_MAX_ 18446744073709551615ULL +// XTENSA:INT_LEAST64_MIN_ (-9223372036854775807LL -1) +// XTENSA:INT_LEAST64_MAX_ 9223372036854775807LL +// XTENSA:UINT_LEAST64_MAX_ 18446744073709551615ULL +// XTENSA:INT_FAST64_MIN_ (-9223372036854775807LL -1) +// XTENSA:INT_FAST64_MAX_ 9223372036854775807LL +// XTENSA:UINT_FAST64_MAX_ 18446744073709551615ULL +// +// XTENSA:INTPTR_MIN_ (-2147483647 -1) +// XTENSA:INTPTR_MAX_ 2147483647 +// XTENSA:UINTPTR_MAX_ 4294967295U +// XTENSA:PTRDIFF_MIN_ (-2147483647 -1) +// XTENSA:PTRDIFF_MAX_ 2147483647 +// XTENSA:SIZE_MAX_ 4294967295U +// +// XTENSA:INTMAX_MIN_ 
(-9223372036854775807LL -1) +// XTENSA:INTMAX_MAX_ 9223372036854775807LL +// XTENSA:UINTMAX_MAX_ 18446744073709551615ULL +// +// XTENSA:SIG_ATOMIC_MIN_ (-2147483647 -1) +// XTENSA:SIG_ATOMIC_MAX_ 2147483647 +// XTENSA:WINT_MIN_ 0U +// XTENSA:WINT_MAX_ 4294967295U +// +// XTENSA:WCHAR_MAX_ 2147483647 +// XTENSA:WCHAR_MIN_ (-2147483647 -1) +// +// XTENSA:INT8_C_(0) 0 +// XTENSA:UINT8_C_(0) 0U +// XTENSA:INT16_C_(0) 0 +// XTENSA:UINT16_C_(0) 0U +// XTENSA:INT32_C_(0) 0 +// XTENSA:UINT32_C_(0) 0U +// XTENSA:INT64_C_(0) 0LL +// XTENSA:UINT64_C_(0) 0ULL +// +// XTENSA:INTMAX_C_(0) 0LL +// XTENSA:UINTMAX_C_(0) 0ULL +// // // stdint.h forms several macro definitions by pasting together identifiers // to form names (eg. int32_t is formed from int ## 32 ## _t). The following From ae435adabba2f137fe50749c0581157a89019b5e Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Wed, 25 Dec 2024 08:56:33 +0000 Subject: [PATCH 056/567] [gn build] Port 2d6e7c2b359d --- llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn index 31b4ba6304a23..d18a10d50310f 100644 --- a/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn @@ -128,6 +128,7 @@ static_library("Basic") { "Targets/WebAssembly.cpp", "Targets/X86.cpp", "Targets/XCore.cpp", + "Targets/Xtensa.cpp", "TokenKinds.cpp", "TypeTraits.cpp", "Version.cpp", From 5fb57131b744c52f74919f9487f4a9fa69f455fb Mon Sep 17 00:00:00 2001 From: Usman Nadeem Date: Wed, 25 Dec 2024 01:29:01 -0800 Subject: [PATCH 057/567] [DFAJumpThreading] Don't bail early after encountering unpredictable values (#119774) After #96127 landed, mshockwave reported that the pass was no longer threading SPEC2006/perlbench. 
After 96127 we started bailing out in `getStateDefMap` and rejecting the transformation because one of the unpredictable values was coming from inside the loop. There was no fundamental change in that function except that we started calling `Loop->contains(IncomingBB)` instead of `LoopBBs.count(IncomingBB)`. After some analysis I came to the conclusion that even before 96127 we would reject the transformation if we provided large enough limits on the path traversal (large enough so that LoopBBs contained blocks corresponding to that unpredictable value). In this patch I changed `getStateDefMap` to not terminate early on finding an unpredictable value, this is because `getPathsFromStateDefMap`, later, actually has checks to ensure that the final list of paths only have predictable values. As a result we can now partially thread functions like `negative6` in the tests that have some predictable paths. This patch does not really have any compile-time impact on the test suite without `-dfa-early-exit-heuristic=false` (early exit is enabled by default). 
Change-Id: Ie1633b370ed4a0eda8dea52650b40f6f66ef49a3 --- .../Transforms/Scalar/DFAJumpThreading.cpp | 29 ++++++------- .../DFAJumpThreading/dfa-unfold-select.ll | 42 ++++++++++++++++--- .../Transforms/DFAJumpThreading/negative.ll | 40 +++++++++++++++++- 3 files changed, 87 insertions(+), 24 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp index 3c4a40fab3e03..8a5c506eed694 100644 --- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp @@ -109,7 +109,7 @@ static cl::opt MaxNumVisitiedPaths( "dfa-max-num-visited-paths", cl::desc( "Max number of blocks visited while enumerating paths around a switch"), - cl::Hidden, cl::init(2000)); + cl::Hidden, cl::init(2500)); static cl::opt MaxNumPaths("dfa-max-num-paths", @@ -754,17 +754,15 @@ struct AllSwitchPaths { return Res; } - /// Walk the use-def chain and collect all the state-defining instructions. - /// - /// Return an empty map if unpredictable values encountered inside the basic - /// blocks of \p LoopPaths. + /// Walk the use-def chain and collect all the state-defining blocks and the + /// PHI nodes in those blocks that define the state. 
StateDefMap getStateDefMap() const { StateDefMap Res; - Value *FirstDef = Switch->getOperand(0); - assert(isa(FirstDef) && "The first definition must be a phi."); + PHINode *FirstDef = dyn_cast(Switch->getOperand(0)); + assert(FirstDef && "The first definition must be a phi."); SmallVector Stack; - Stack.push_back(dyn_cast(FirstDef)); + Stack.push_back(FirstDef); SmallSet SeenValues; while (!Stack.empty()) { @@ -774,18 +772,15 @@ struct AllSwitchPaths { SeenValues.insert(CurPhi); for (BasicBlock *IncomingBB : CurPhi->blocks()) { - Value *Incoming = CurPhi->getIncomingValueForBlock(IncomingBB); + PHINode *IncomingPhi = + dyn_cast(CurPhi->getIncomingValueForBlock(IncomingBB)); + if (!IncomingPhi) + continue; bool IsOutsideLoops = !SwitchOuterLoop->contains(IncomingBB); - if (Incoming == FirstDef || isa(Incoming) || - SeenValues.contains(Incoming) || IsOutsideLoops) { + if (SeenValues.contains(IncomingPhi) || IsOutsideLoops) continue; - } - - // Any unpredictable value inside the loops means we must bail out. 
- if (!isa(Incoming)) - return StateDefMap(); - Stack.push_back(cast(Incoming)); + Stack.push_back(IncomingPhi); } } diff --git a/llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll b/llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll index 366446a1cc9e4..93872c3938768 100644 --- a/llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll +++ b/llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll @@ -381,26 +381,58 @@ define void @pr65222(i32 %flags, i1 %cmp, i1 %tobool.not) { ; CHECK: then: ; CHECK-NEXT: br i1 [[TOBOOL_NOT:%.*]], label [[COND1_SI_UNFOLD_TRUE:%.*]], label [[COND_SI_UNFOLD_TRUE:%.*]] ; CHECK: cond.si.unfold.true: +; CHECK-NEXT: br i1 [[CMP]], label [[TOUNFOLD_SI_UNFOLD_FALSE1:%.*]], label [[COND_SI_UNFOLD_FALSE_JT0:%.*]] +; CHECK: cond.si.unfold.true.jt2: ; CHECK-NEXT: [[DOTSI_UNFOLD_PHI:%.*]] = phi i32 [ 2, [[THEN]] ] ; CHECK-NEXT: br i1 [[CMP]], label [[TOUNFOLD_SI_UNFOLD_FALSE:%.*]], label [[COND_SI_UNFOLD_FALSE:%.*]] ; CHECK: cond.si.unfold.false: ; CHECK-NEXT: [[DOTSI_UNFOLD_PHI1:%.*]] = phi i32 [ 0, [[COND_SI_UNFOLD_TRUE]] ] -; CHECK-NEXT: br label [[TOUNFOLD_SI_UNFOLD_FALSE]] +; CHECK-NEXT: br label [[TOUNFOLD_SI_UNFOLD_FALSE1]] +; CHECK: cond.si.unfold.false.jt0: +; CHECK-NEXT: [[DOTSI_UNFOLD_PHI1_JT0:%.*]] = phi i32 [ 0, [[COND_SI_UNFOLD_TRUE1:%.*]] ] +; CHECK-NEXT: br label [[TOUNFOLD_SI_UNFOLD_FALSE_JT0:%.*]] ; CHECK: tounfold.si.unfold.false: -; CHECK-NEXT: [[COND_SI_UNFOLD_PHI:%.*]] = phi i32 [ [[DOTSI_UNFOLD_PHI]], [[COND_SI_UNFOLD_TRUE]] ], [ [[DOTSI_UNFOLD_PHI1]], [[COND_SI_UNFOLD_FALSE]] ] +; CHECK-NEXT: [[COND_SI_UNFOLD_PHI:%.*]] = phi i32 [ poison, [[COND_SI_UNFOLD_TRUE1]] ], [ [[DOTSI_UNFOLD_PHI1]], [[COND_SI_UNFOLD_FALSE]] ] ; CHECK-NEXT: br label [[IF_END]] +; CHECK: tounfold.si.unfold.false.jt0: +; CHECK-NEXT: [[COND_SI_UNFOLD_PHI_JT0:%.*]] = phi i32 [ [[DOTSI_UNFOLD_PHI1_JT0]], [[COND_SI_UNFOLD_FALSE_JT0]] ] +; CHECK-NEXT: br label [[IF_END_JT0:%.*]] +; CHECK: tounfold.si.unfold.false.jt2: +; CHECK-NEXT: 
[[COND_SI_UNFOLD_PHI_JT2:%.*]] = phi i32 [ [[DOTSI_UNFOLD_PHI]], [[COND_SI_UNFOLD_TRUE]] ] +; CHECK-NEXT: br label [[IF_END_JT2:%.*]] ; CHECK: cond1.si.unfold.true: +; CHECK-NEXT: br i1 [[CMP]], label [[IF_END]], label [[COND1_SI_UNFOLD_FALSE_JT1:%.*]] +; CHECK: cond1.si.unfold.true.jt3: ; CHECK-NEXT: [[DOTSI_UNFOLD_PHI2:%.*]] = phi i32 [ 3, [[THEN]] ] -; CHECK-NEXT: br i1 [[CMP]], label [[IF_END]], label [[COND1_SI_UNFOLD_FALSE:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[IF_END_JT3:%.*]], label [[COND1_SI_UNFOLD_FALSE:%.*]] ; CHECK: cond1.si.unfold.false: ; CHECK-NEXT: [[DOTSI_UNFOLD_PHI3:%.*]] = phi i32 [ 1, [[COND1_SI_UNFOLD_TRUE]] ] ; CHECK-NEXT: br label [[IF_END]] +; CHECK: cond1.si.unfold.false.jt1: +; CHECK-NEXT: [[DOTSI_UNFOLD_PHI3_JT1:%.*]] = phi i32 [ 1, [[COND1_SI_UNFOLD_TRUE1:%.*]] ] +; CHECK-NEXT: br label [[IF_END_JT1:%.*]] ; CHECK: if.end: -; CHECK-NEXT: [[UNFOLDED:%.*]] = phi i32 [ [[FLAGS:%.*]], [[WHILE_COND]] ], [ [[COND_SI_UNFOLD_PHI]], [[TOUNFOLD_SI_UNFOLD_FALSE]] ], [ [[DOTSI_UNFOLD_PHI2]], [[COND1_SI_UNFOLD_TRUE]] ], [ [[DOTSI_UNFOLD_PHI3]], [[COND1_SI_UNFOLD_FALSE]] ] -; CHECK-NEXT: [[OTHER:%.*]] = phi i32 [ [[FLAGS]], [[WHILE_COND]] ], [ 0, [[TOUNFOLD_SI_UNFOLD_FALSE]] ], [ 0, [[COND1_SI_UNFOLD_TRUE]] ], [ 0, [[COND1_SI_UNFOLD_FALSE]] ] +; CHECK-NEXT: [[UNFOLDED:%.*]] = phi i32 [ [[FLAGS:%.*]], [[WHILE_COND]] ], [ [[COND_SI_UNFOLD_PHI]], [[TOUNFOLD_SI_UNFOLD_FALSE1]] ], [ poison, [[COND1_SI_UNFOLD_TRUE1]] ], [ [[DOTSI_UNFOLD_PHI3]], [[COND1_SI_UNFOLD_FALSE]] ] +; CHECK-NEXT: [[OTHER:%.*]] = phi i32 [ [[FLAGS]], [[WHILE_COND]] ], [ 0, [[TOUNFOLD_SI_UNFOLD_FALSE1]] ], [ 0, [[COND1_SI_UNFOLD_TRUE1]] ], [ 0, [[COND1_SI_UNFOLD_FALSE]] ] ; CHECK-NEXT: switch i32 [[UNFOLDED]], label [[UNREACHABLE:%.*]] [ ; CHECK-NEXT: i32 0, label [[SW_BB:%.*]] ; CHECK-NEXT: ] +; CHECK: if.end.jt1: +; CHECK-NEXT: [[UNFOLDED_JT1:%.*]] = phi i32 [ [[DOTSI_UNFOLD_PHI3_JT1]], [[COND1_SI_UNFOLD_FALSE_JT1]] ] +; CHECK-NEXT: [[OTHER_JT1:%.*]] = phi i32 [ 0, 
[[COND1_SI_UNFOLD_FALSE_JT1]] ] +; CHECK-NEXT: br label [[UNREACHABLE]] +; CHECK: if.end.jt3: +; CHECK-NEXT: [[UNFOLDED_JT3:%.*]] = phi i32 [ [[DOTSI_UNFOLD_PHI2]], [[COND1_SI_UNFOLD_TRUE]] ] +; CHECK-NEXT: [[OTHER_JT3:%.*]] = phi i32 [ 0, [[COND1_SI_UNFOLD_TRUE]] ] +; CHECK-NEXT: br label [[UNREACHABLE]] +; CHECK: if.end.jt0: +; CHECK-NEXT: [[UNFOLDED_JT0:%.*]] = phi i32 [ [[COND_SI_UNFOLD_PHI_JT0]], [[TOUNFOLD_SI_UNFOLD_FALSE_JT0]] ] +; CHECK-NEXT: [[OTHER_JT0:%.*]] = phi i32 [ 0, [[TOUNFOLD_SI_UNFOLD_FALSE_JT0]] ] +; CHECK-NEXT: br label [[SW_BB]] +; CHECK: if.end.jt2: +; CHECK-NEXT: [[UNFOLDED_JT2:%.*]] = phi i32 [ [[COND_SI_UNFOLD_PHI_JT2]], [[TOUNFOLD_SI_UNFOLD_FALSE]] ] +; CHECK-NEXT: [[OTHER_JT2:%.*]] = phi i32 [ 0, [[TOUNFOLD_SI_UNFOLD_FALSE]] ] +; CHECK-NEXT: br label [[UNREACHABLE]] ; CHECK: unreachable: ; CHECK-NEXT: unreachable ; CHECK: sw.bb: diff --git a/llvm/test/Transforms/DFAJumpThreading/negative.ll b/llvm/test/Transforms/DFAJumpThreading/negative.ll index a964281427699..3eab1e14417fb 100644 --- a/llvm/test/Transforms/DFAJumpThreading/negative.ll +++ b/llvm/test/Transforms/DFAJumpThreading/negative.ll @@ -218,9 +218,45 @@ for.end: declare i32 @arbitrary_function() ; Don't confuse %state.2 for the initial switch value. +; [ 3, %case2 ] can still be threaded. 
define i32 @negative6(i32 %init) { -; REMARK: SwitchNotPredictable -; REMARK-NEXT: negative6 +; CHECK-LABEL: define i32 @negative6( +; CHECK-SAME: i32 [[INIT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[INIT]], 0 +; CHECK-NEXT: br label %[[LOOP_2:.*]] +; CHECK: [[LOOP_2]]: +; CHECK-NEXT: [[STATE_2:%.*]] = call i32 @arbitrary_function() +; CHECK-NEXT: br label %[[LOOP_3:.*]] +; CHECK: [[LOOP_3]]: +; CHECK-NEXT: [[STATE:%.*]] = phi i32 [ [[STATE_2]], %[[LOOP_2]] ] +; CHECK-NEXT: switch i32 [[STATE]], label %[[INFLOOP_I:.*]] [ +; CHECK-NEXT: i32 2, label %[[CASE2:.*]] +; CHECK-NEXT: i32 3, label %[[CASE3:.*]] +; CHECK-NEXT: i32 4, label %[[CASE4:.*]] +; CHECK-NEXT: i32 0, label %[[CASE0:.*]] +; CHECK-NEXT: i32 1, label %[[CASE1:.*]] +; CHECK-NEXT: ] +; CHECK: [[LOOP_3_JT3:.*]]: +; CHECK-NEXT: [[STATE_JT3:%.*]] = phi i32 [ 3, %[[CASE2]] ] +; CHECK-NEXT: br label %[[CASE3]] +; CHECK: [[CASE2]]: +; CHECK-NEXT: br label %[[LOOP_3_JT3]] +; CHECK: [[CASE3]]: +; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_2_BACKEDGE:.*]], label %[[CASE4]] +; CHECK: [[CASE4]]: +; CHECK-NEXT: br label %[[LOOP_2_BACKEDGE]] +; CHECK: [[LOOP_2_BACKEDGE]]: +; CHECK-NEXT: br label %[[LOOP_2]] +; CHECK: [[CASE0]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[CASE1]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[INFLOOP_I]]: +; CHECK-NEXT: br label %[[INFLOOP_I]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret i32 0 +; entry: %cmp = icmp eq i32 %init, 0 br label %loop.2 From 676b48d1ccd8223bb0bd889cce13e6faecd20c6d Mon Sep 17 00:00:00 2001 From: Chuanqi Xu Date: Wed, 25 Dec 2024 17:42:14 +0800 Subject: [PATCH 058/567] [C++20] [Modules] Diagnose if import statement lakcs a semicolon Close https://github.com/llvm/llvm-project/issues/121066 Now we will diagnose that the import statement lacks a semicolon as expected. Note that the original "not found" diagnose message remains. 
I meant to remove that, but the test shows it might be more complex process (other unexpected diagnose shows up). Given the importance of the issue, I chose to not dig deeper. --- clang/lib/Parse/Parser.cpp | 7 +++---- clang/test/CXX/basic/basic.link/p3.cpp | 3 ++- clang/test/Modules/pr121066.cpp | 4 ++++ 3 files changed, 9 insertions(+), 5 deletions(-) create mode 100644 clang/test/Modules/pr121066.cpp diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp index 8ba6a5dce8a99..0710542f5e938 100644 --- a/clang/lib/Parse/Parser.cpp +++ b/clang/lib/Parse/Parser.cpp @@ -2654,10 +2654,10 @@ Decl *Parser::ParseModuleImport(SourceLocation AtLoc, SeenError = false; break; } - if (SeenError) { - ExpectAndConsumeSemi(diag::err_module_expected_semi); + ExpectAndConsumeSemi(diag::err_module_expected_semi); + + if (SeenError) return nullptr; - } DeclResult Import; if (HeaderUnit) @@ -2666,7 +2666,6 @@ Decl *Parser::ParseModuleImport(SourceLocation AtLoc, else if (!Path.empty()) Import = Actions.ActOnModuleImport(StartLoc, ExportLoc, ImportLoc, Path, IsPartition); - ExpectAndConsumeSemi(diag::err_module_expected_semi); if (Import.isInvalid()) return nullptr; diff --git a/clang/test/CXX/basic/basic.link/p3.cpp b/clang/test/CXX/basic/basic.link/p3.cpp index 23f39d11b655a..01202264d2591 100644 --- a/clang/test/CXX/basic/basic.link/p3.cpp +++ b/clang/test/CXX/basic/basic.link/p3.cpp @@ -15,7 +15,8 @@ export module m; // #1 // Import errors are fatal, so we test them in isolation. 
#if IMPORT_ERROR == 1 -import x = {}; // expected-error {{module 'x' not found}} +import x = {}; // expected-error {{expected ';' after module name}} + // expected-error@-1 {{module 'x' not found}} #elif IMPORT_ERROR == 2 struct X; diff --git a/clang/test/Modules/pr121066.cpp b/clang/test/Modules/pr121066.cpp new file mode 100644 index 0000000000000..e92a81c53d683 --- /dev/null +++ b/clang/test/Modules/pr121066.cpp @@ -0,0 +1,4 @@ +// RUN: %clang_cc1 -std=c++20 -fsyntax-only %s -verify + +import mod // expected-error {{expected ';' after module name}} + // expected-error@-1 {{module 'mod' not found}} From 7226b39926b5df6452d13e83f61e35a71dbe448d Mon Sep 17 00:00:00 2001 From: xilinbai-intel Date: Wed, 25 Dec 2024 04:54:13 -0500 Subject: [PATCH 059/567] [X86] Support vectorized llvm.fmaximum/fminimum.vXf16 lowering (#120988) Support the lowering of vectorized FMINIMUM and FMAXIMUM to vminph and vmaxph on types v8f16, v16f16 when AVX512FP, AVX512VL features are present, and on type v32f16 when AVX512FP is present. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 9 ++ .../X86/avx512fp16-fminimum-fmaximum.ll | 82 +++++++++++++------ 2 files changed, 66 insertions(+), 25 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 3b260a89911c4..e7f6032ee7d74 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2333,6 +2333,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal); setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal); + + setOperationAction(ISD::FMINIMUM, MVT::v32f16, Custom); + setOperationAction(ISD::FMAXIMUM, MVT::v32f16, Custom); } if (Subtarget.hasVLX()) { @@ -2377,6 +2380,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // Need to custom widen these to prevent scalarization. 
setOperationAction(ISD::LOAD, MVT::v4f16, Custom); setOperationAction(ISD::STORE, MVT::v4f16, Custom); + + setOperationAction(ISD::FMINIMUM, MVT::v8f16, Custom); + setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Custom); + + setOperationAction(ISD::FMINIMUM, MVT::v16f16, Custom); + setOperationAction(ISD::FMAXIMUM, MVT::v16f16, Custom); } } diff --git a/llvm/test/CodeGen/X86/avx512fp16-fminimum-fmaximum.ll b/llvm/test/CodeGen/X86/avx512fp16-fminimum-fmaximum.ll index 55b86cadfe30e..9db57fe68bb42 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-fminimum-fmaximum.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-fminimum-fmaximum.ll @@ -5,6 +5,10 @@ declare half @llvm.minimum.f16(half, half) declare half @llvm.maximum.f16(half, half) declare <8 x half> @llvm.minimum.v8f16(<8 x half>, <8 x half>) declare <8 x half> @llvm.maximum.v8f16(<8 x half>, <8 x half>) +declare <16 x half> @llvm.minimum.v16f16(<16 x half>, <16 x half>) +declare <16 x half> @llvm.maximum.v16f16(<16 x half>, <16 x half>) +declare <32 x half> @llvm.minimum.v32f16(<32 x half>, <32 x half>) +declare <32 x half> @llvm.maximum.v32f16(<32 x half>, <32 x half>) define half @test_fminimum(half %x, half %y) { ; CHECK-LABEL: test_fminimum: @@ -25,20 +29,10 @@ define half @test_fminimum(half %x, half %y) { ret half %z } -define <8 x half> @test_fminimum_scalarize(<8 x half> %x, <8 x half> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" { -; CHECK-LABEL: test_fminimum_scalarize: +define <8 x half> @test_fminimum_v8f16(<8 x half> %x, <8 x half> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" { +; CHECK-LABEL: test_fminimum_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vcmpltph %xmm1, %xmm0, %k1 -; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm2 {%k1} -; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm3 = [32768,32768,32768,32768,32768,32768,32768,32768] -; CHECK-NEXT: vpcmpeqw %xmm3, %xmm0, %k1 -; CHECK-NEXT: vpblendmw %xmm0, %xmm2, %xmm0 {%k1} -; CHECK-NEXT: vpcmpeqw %xmm3, %xmm1, %k1 -; CHECK-NEXT: vmovdqu16 %xmm1, 
%xmm0 {%k1} -; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqph %xmm1, %xmm2, %k1 -; CHECK-NEXT: vmovdqu16 %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 +; CHECK-NEXT: vminph %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %r = call <8 x half> @llvm.minimum.v8f16(<8 x half> %x, <8 x half> %y) ret <8 x half> %r @@ -113,19 +107,10 @@ define half @test_fmaximum(half %x, half %y) { ret half %r } -define <8 x half> @test_fmaximum_scalarize(<8 x half> %x, <8 x half> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" { -; CHECK-LABEL: test_fmaximum_scalarize: +define <8 x half> @test_fmaximum_v8f16(<8 x half> %x, <8 x half> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" { +; CHECK-LABEL: test_fmaximum_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vcmpltph %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm2 {%k1} -; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 -; CHECK-NEXT: vpblendmw %xmm0, %xmm2, %xmm0 {%k1} -; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} -; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqph %xmm1, %xmm2, %k1 -; CHECK-NEXT: vmovdqu16 %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 +; CHECK-NEXT: vmaxph %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %r = call <8 x half> @llvm.maximum.v8f16(<8 x half> %x, <8 x half> %y) ret <8 x half> %r @@ -186,3 +171,50 @@ define half @test_fmaximum_combine_cmps(half %x, half %y) { %2 = tail call half @llvm.maximum.f16(half %x, half %1) ret half %2 } + +define <16 x half> @test_fminimum_v16f16(<16 x half> %x, <16 x half> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" { +; CHECK-LABEL: test_fminimum_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vminph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %r = call <16 x half> @llvm.minimum.v16f16(<16 x half> %x, <16 x half> %y) + ret <16 x half> %r +} + +define <16 x half> @test_fmaximum_v16f16_nans(<16 x half> %x, <16 x half> %y) "no-signed-zeros-fp-math"="true" { +; CHECK-LABEL: 
test_fmaximum_v16f16_nans: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxph %ymm1, %ymm0, %ymm1 +; CHECK-NEXT: vcmpunordph %ymm0, %ymm0, %k1 +; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: retq + %r = call <16 x half> @llvm.maximum.v16f16(<16 x half> %x, <16 x half> %y) + ret <16 x half> %r +} + +define <32 x half> @test_fminimum_v32f16_szero(<32 x half> %x, <32 x half> %y) "no-nans-fp-math"="true" { +; CHECK-LABEL: test_fminimum_v32f16_szero: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovw2m %zmm0, %k1 +; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm2 {%k1} +; CHECK-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vminph %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: retq + %r = call <32 x half> @llvm.minimum.v32f16(<32 x half> %x, <32 x half> %y) + ret <32 x half> %r +} + +define <32 x half> @test_fmaximum_v32f16_nans_szero(<32 x half> %x, <32 x half> %y) { +; CHECK-LABEL: test_fmaximum_v32f16_nans_szero: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovw2m %zmm0, %k1 +; CHECK-NEXT: vpblendmw %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmaxph %zmm2, %zmm1, %zmm0 +; CHECK-NEXT: vcmpunordph %zmm1, %zmm1, %k1 +; CHECK-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %r = call <32 x half> @llvm.maximum.v32f16(<32 x half> %x, <32 x half> %y) + ret <32 x half> %r +} From 4884b1b08a13af430620e7104aa58710a70f618c Mon Sep 17 00:00:00 2001 From: Sergei Barannikov Date: Wed, 25 Dec 2024 13:32:02 +0300 Subject: [PATCH 060/567] [TableGen][GISel] Simplify checks for BasicBlockSDNode (NFC) (#121098) --- llvm/utils/TableGen/GlobalISelEmitter.cpp | 40 ++++++++++------------- 1 file changed, 17 insertions(+), 23 deletions(-) diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp index 4250b57581f63..5038be7b24fbc 100644 --- a/llvm/utils/TableGen/GlobalISelEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp @@ -992,27 +992,24 @@ Error GlobalISelEmitter::importChildMatcher( // 
Check MBB's before the type check since they are not a known type. if (!SrcChild.isLeaf()) { - if (SrcChild.getOperator()->isSubClassOf("SDNode")) { - auto &ChildSDNI = CGP.getSDNodeInfo(SrcChild.getOperator()); - if (ChildSDNI.getSDClassName() == "BasicBlockSDNode") { - OM.addPredicate(); - return Error::success(); - } - if (SrcChild.getOperator()->getName() == "timm") { - OM.addPredicate(); + if (SrcChild.getOperator()->getName() == "bb") { + OM.addPredicate(); + return Error::success(); + } + if (SrcChild.getOperator()->getName() == "timm") { + OM.addPredicate(); - // Add predicates, if any - for (const TreePredicateCall &Call : SrcChild.getPredicateCalls()) { - const TreePredicateFn &Predicate = Call.Fn; + // Add predicates, if any + for (const TreePredicateCall &Call : SrcChild.getPredicateCalls()) { + const TreePredicateFn &Predicate = Call.Fn; - // Only handle immediate patterns for now - if (Predicate.isImmediatePattern()) { - OM.addPredicate(Predicate); - } + // Only handle immediate patterns for now + if (Predicate.isImmediatePattern()) { + OM.addPredicate(Predicate); } - - return Error::success(); } + + return Error::success(); } } else if (auto *ChildDefInit = dyn_cast(SrcChild.getLeafValue())) { auto *ChildRec = ChildDefInit->getDef(); @@ -1228,12 +1225,9 @@ Expected GlobalISelEmitter::importExplicitUseRenderer( // We accept 'bb' here. It's an operator because BasicBlockSDNode isn't // inline, but in MI it's just another operand. 
- if (Dst.getOperator()->isSubClassOf("SDNode")) { - auto &ChildSDNI = CGP.getSDNodeInfo(Dst.getOperator()); - if (ChildSDNI.getSDClassName() == "BasicBlockSDNode") { - DstMIBuilder.addRenderer(Dst.getName()); - return InsertPt; - } + if (Dst.getOperator()->getName() == "bb") { + DstMIBuilder.addRenderer(Dst.getName()); + return InsertPt; } // Similarly, imm is an operator in TreePatternNode's view but must be From 3469996d0d057d99a33ec34ee3c80e5d4fa3afcb Mon Sep 17 00:00:00 2001 From: Igor Kirillov Date: Wed, 25 Dec 2024 12:58:21 +0000 Subject: [PATCH 061/567] [SelectOpt] Optimise big select groups in the latch of a non-inner loop to branches (#119728) Loop latches often have a loop-carried dependency, and if they have several SelectLike instructions in one select group, it is usually profitable to convert it to branches rather than keep selects. --- llvm/lib/CodeGen/SelectOptimize.cpp | 12 +++ llvm/test/CodeGen/AArch64/selectopt.ll | 121 +++++++++++++++++++++++++ 2 files changed, 133 insertions(+) diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp index 7b927e6ec9b81..bfc49dd354aa6 100644 --- a/llvm/lib/CodeGen/SelectOptimize.cpp +++ b/llvm/lib/CodeGen/SelectOptimize.cpp @@ -1044,6 +1044,18 @@ bool SelectOptimizeImpl::isConvertToBranchProfitableBase( return true; } + // If latch has a select group with several elements, it is usually profitable + // to convert it to branches. We let `optimizeSelectsInnerLoops` decide if + // conversion is profitable for innermost loops. 
+ auto *BB = SI.getI()->getParent(); + auto *L = LI->getLoopFor(BB); + if (L && !L->isInnermost() && L->getLoopLatch() == BB && + ASI.Selects.size() >= 3) { + OR << "Converted to branch because select group in the latch block is big."; + EmitAndPrintRemark(ORE, OR); + return true; + } + ORmiss << "Not profitable to convert to branch (base heuristic)."; EmitAndPrintRemark(ORE, ORmiss); return false; diff --git a/llvm/test/CodeGen/AArch64/selectopt.ll b/llvm/test/CodeGen/AArch64/selectopt.ll index 54309dca3b834..d72a956e08d0c 100644 --- a/llvm/test/CodeGen/AArch64/selectopt.ll +++ b/llvm/test/CodeGen/AArch64/selectopt.ll @@ -875,3 +875,124 @@ if.end: %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count br i1 %exitcond.not, label %for.cond.cleanup, label %for.body } + +declare i64 @payload(i64, ptr, ptr, i64) + +define void @outer_latch_heuristic(ptr %dst, ptr %src, i64 %p, i64 %dim) { +; CHECKOO-LABEL: @outer_latch_heuristic( +; CHECKOO-NEXT: entry: +; CHECKOO-NEXT: br label [[OUTER_LOOP:%.*]] +; CHECKOO: outer.loop: +; CHECKOO-NEXT: [[K_020_US:%.*]] = phi i64 [ [[INC7_US:%.*]], [[SELECT_END:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECKOO-NEXT: [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], [[SELECT_END]] ], [ 0, [[ENTRY]] ] +; CHECKOO-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[SELECT_END]] ], [ 0, [[ENTRY]] ] +; CHECKOO-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds ptr, ptr [[SRC:%.*]], i64 [[I]] +; CHECKOO-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX_US]], align 8 +; CHECKOO-NEXT: [[ARRAYIDX1_US:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[J]] +; CHECKOO-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARRAYIDX1_US]], align 8 +; CHECKOO-NEXT: br label [[INNER_LOOP:%.*]] +; CHECKOO: inner.loop: +; CHECKOO-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[DIM:%.*]], [[OUTER_LOOP]] ], [ [[LSR_IV_NEXT:%.*]], [[INNER_LOOP]] ] +; CHECKOO-NEXT: [[DIFF_04_I_US:%.*]] = phi i64 [ [[CALL_I_US:%.*]], [[INNER_LOOP]] ], [ 0, [[OUTER_LOOP]] ] +; CHECKOO-NEXT: [[CALL_I_US]] = tail call i64 
@payload(i64 [[DIFF_04_I_US]], ptr [[TMP0]], ptr [[TMP1]], i64 [[P:%.*]]) +; CHECKOO-NEXT: [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], -1 +; CHECKOO-NEXT: [[EXITCOND_NOT_I_US:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0 +; CHECKOO-NEXT: br i1 [[EXITCOND_NOT_I_US]], label [[LATCH:%.*]], label [[INNER_LOOP]] +; CHECKOO: latch: +; CHECKOO-NEXT: [[CMP2_US:%.*]] = icmp sgt i64 [[CALL_I_US]], -1 +; CHECKOO-NEXT: [[DIFF_0_LCSSA_I_LOBIT_US:%.*]] = lshr i64 [[CALL_I_US]], 63 +; CHECKOO-NEXT: [[CMP2_US_FROZEN:%.*]] = freeze i1 [[CMP2_US]] +; CHECKOO-NEXT: br i1 [[CMP2_US_FROZEN]], label [[SELECT_TRUE_SINK:%.*]], label [[SELECT_FALSE_SINK:%.*]] +; CHECKOO: select.true.sink: +; CHECKOO-NEXT: [[TMP2:%.*]] = add nsw i64 [[J]], 1 +; CHECKOO-NEXT: br label [[SELECT_END]] +; CHECKOO: select.false.sink: +; CHECKOO-NEXT: [[TMP3:%.*]] = add nsw i64 1, [[I]] +; CHECKOO-NEXT: br label [[SELECT_END]] +; CHECKOO: select.end: +; CHECKOO-NEXT: [[I_NEXT]] = phi i64 [ [[I]], [[SELECT_TRUE_SINK]] ], [ [[TMP3]], [[SELECT_FALSE_SINK]] ] +; CHECKOO-NEXT: [[J_NEXT]] = phi i64 [ [[TMP2]], [[SELECT_TRUE_SINK]] ], [ [[J]], [[SELECT_FALSE_SINK]] ] +; CHECKOO-NEXT: [[COND_IN_US:%.*]] = phi ptr [ [[ARRAYIDX1_US]], [[SELECT_TRUE_SINK]] ], [ [[ARRAYIDX_US]], [[SELECT_FALSE_SINK]] ] +; CHECKOO-NEXT: [[INC4_US:%.*]] = zext i1 [[CMP2_US]] to i64 +; CHECKOO-NEXT: [[COND_US:%.*]] = load ptr, ptr [[COND_IN_US]], align 8 +; CHECKOO-NEXT: [[ARRAYIDX6_US:%.*]] = getelementptr inbounds ptr, ptr [[DST:%.*]], i64 [[K_020_US]] +; CHECKOO-NEXT: store ptr [[COND_US]], ptr [[ARRAYIDX6_US]], align 8 +; CHECKOO-NEXT: [[INC7_US]] = add i64 [[K_020_US]], 1 +; CHECKOO-NEXT: [[EXITCOND23_NOT:%.*]] = icmp eq i64 [[K_020_US]], 1000 +; CHECKOO-NEXT: br i1 [[EXITCOND23_NOT]], label [[EXIT:%.*]], label [[OUTER_LOOP]] +; CHECKOO: exit: +; CHECKOO-NEXT: ret void +; +; CHECKII-LABEL: @outer_latch_heuristic( +; CHECKII-NEXT: entry: +; CHECKII-NEXT: br label [[OUTER_LOOP:%.*]] +; CHECKII: outer.loop: +; CHECKII-NEXT: [[K_020_US:%.*]] = phi i64 [ 
[[INC7_US:%.*]], [[LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECKII-NEXT: [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], [[LATCH]] ], [ 0, [[ENTRY]] ] +; CHECKII-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH]] ], [ 0, [[ENTRY]] ] +; CHECKII-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds ptr, ptr [[SRC:%.*]], i64 [[I]] +; CHECKII-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX_US]], align 8 +; CHECKII-NEXT: [[ARRAYIDX1_US:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[J]] +; CHECKII-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARRAYIDX1_US]], align 8 +; CHECKII-NEXT: br label [[INNER_LOOP:%.*]] +; CHECKII: inner.loop: +; CHECKII-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[DIM:%.*]], [[OUTER_LOOP]] ], [ [[LSR_IV_NEXT:%.*]], [[INNER_LOOP]] ] +; CHECKII-NEXT: [[DIFF_04_I_US:%.*]] = phi i64 [ [[CALL_I_US:%.*]], [[INNER_LOOP]] ], [ 0, [[OUTER_LOOP]] ] +; CHECKII-NEXT: [[CALL_I_US]] = tail call i64 @payload(i64 [[DIFF_04_I_US]], ptr [[TMP0]], ptr [[TMP1]], i64 [[P:%.*]]) +; CHECKII-NEXT: [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], -1 +; CHECKII-NEXT: [[EXITCOND_NOT_I_US:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0 +; CHECKII-NEXT: br i1 [[EXITCOND_NOT_I_US]], label [[LATCH]], label [[INNER_LOOP]] +; CHECKII: latch: +; CHECKII-NEXT: [[CMP2_US:%.*]] = icmp sgt i64 [[CALL_I_US]], -1 +; CHECKII-NEXT: [[DIFF_0_LCSSA_I_LOBIT_US:%.*]] = lshr i64 [[CALL_I_US]], 63 +; CHECKII-NEXT: [[I_NEXT]] = add nsw i64 [[DIFF_0_LCSSA_I_LOBIT_US]], [[I]] +; CHECKII-NEXT: [[INC4_US:%.*]] = zext i1 [[CMP2_US]] to i64 +; CHECKII-NEXT: [[J_NEXT]] = add nsw i64 [[J]], [[INC4_US]] +; CHECKII-NEXT: [[COND_IN_US:%.*]] = select i1 [[CMP2_US]], ptr [[ARRAYIDX1_US]], ptr [[ARRAYIDX_US]] +; CHECKII-NEXT: [[COND_US:%.*]] = load ptr, ptr [[COND_IN_US]], align 8 +; CHECKII-NEXT: [[ARRAYIDX6_US:%.*]] = getelementptr inbounds ptr, ptr [[DST:%.*]], i64 [[K_020_US]] +; CHECKII-NEXT: store ptr [[COND_US]], ptr [[ARRAYIDX6_US]], align 8 +; CHECKII-NEXT: [[INC7_US]] = add i64 [[K_020_US]], 1 +; CHECKII-NEXT: [[EXITCOND23_NOT:%.*]] = icmp 
eq i64 [[K_020_US]], 1000 +; CHECKII-NEXT: br i1 [[EXITCOND23_NOT]], label [[EXIT:%.*]], label [[OUTER_LOOP]] +; CHECKII: exit: +; CHECKII-NEXT: ret void +; +entry: + br label %outer.loop + +outer.loop: + %k.020.us = phi i64 [ %inc7.us, %latch ], [ 0, %entry ] + %j = phi i64 [ %j.next, %latch ], [ 0, %entry ] + %i = phi i64 [ %i.next, %latch ], [ 0, %entry ] + %arrayidx.us = getelementptr inbounds ptr, ptr %src, i64 %i + %4 = load ptr, ptr %arrayidx.us, align 8 + %arrayidx1.us = getelementptr inbounds ptr, ptr %src, i64 %j + %5 = load ptr, ptr %arrayidx1.us, align 8 + br label %inner.loop + +inner.loop: + %lsr.iv = phi i64 [ %dim, %outer.loop ], [ %lsr.iv.next, %inner.loop ] + %diff.04.i.us = phi i64 [ %call.i.us, %inner.loop ], [ 0, %outer.loop ] + %call.i.us = tail call i64 @payload(i64 %diff.04.i.us, ptr %4, ptr %5, i64 %p) + %lsr.iv.next = add i64 %lsr.iv, -1 + %exitcond.not.i.us = icmp eq i64 %lsr.iv.next, 0 + br i1 %exitcond.not.i.us, label %latch, label %inner.loop + +latch: + %cmp2.us = icmp sgt i64 %call.i.us, -1 + %diff.0.lcssa.i.lobit.us = lshr i64 %call.i.us, 63 + %i.next = add nsw i64 %diff.0.lcssa.i.lobit.us, %i + %inc4.us = zext i1 %cmp2.us to i64 + %j.next = add nsw i64 %j, %inc4.us + %cond.in.us = select i1 %cmp2.us, ptr %arrayidx1.us, ptr %arrayidx.us + %cond.us = load ptr, ptr %cond.in.us, align 8 + %arrayidx6.us = getelementptr inbounds ptr, ptr %dst, i64 %k.020.us + store ptr %cond.us, ptr %arrayidx6.us, align 8 + %inc7.us = add i64 %k.020.us, 1 + %exitcond23.not = icmp eq i64 %k.020.us, 1000 + br i1 %exitcond23.not, label %exit, label %outer.loop + +exit: + ret void +} From 6d7cf5206f1238139b7a967dea555514a62f7d83 Mon Sep 17 00:00:00 2001 From: adam-bzowski Date: Wed, 25 Dec 2024 15:39:56 +0100 Subject: [PATCH 062/567] [ValueTracking] Improve KnownBits for signed min-max clamping (#120576) A signed min-max clamp is the sequence of smin and smax intrinsics, which constrain a signed value into the range: smin <= value <= smax. 
The patch improves the calculation of KnownBits for a value subjected to the signed clamping. --- llvm/lib/Analysis/ValueTracking.cpp | 108 +++---- .../knownbits-trunc-with-min-max-clamp.ll | 266 ++++++++++++++++++ 2 files changed, 325 insertions(+), 49 deletions(-) create mode 100644 llvm/test/Analysis/ValueTracking/knownbits-trunc-with-min-max-clamp.ll diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 14d7c2da8a9f8..78fec25a6e502 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -1065,6 +1065,63 @@ void llvm::adjustKnownBitsForSelectArm(KnownBits &Known, Value *Cond, Known = CondRes; } +// Match a signed min+max clamp pattern like smax(smin(In, CHigh), CLow). +// Returns the input and lower/upper bounds. +static bool isSignedMinMaxClamp(const Value *Select, const Value *&In, + const APInt *&CLow, const APInt *&CHigh) { + assert(isa(Select) && + cast(Select)->getOpcode() == Instruction::Select && + "Input should be a Select!"); + + const Value *LHS = nullptr, *RHS = nullptr; + SelectPatternFlavor SPF = matchSelectPattern(Select, LHS, RHS).Flavor; + if (SPF != SPF_SMAX && SPF != SPF_SMIN) + return false; + + if (!match(RHS, m_APInt(CLow))) + return false; + + const Value *LHS2 = nullptr, *RHS2 = nullptr; + SelectPatternFlavor SPF2 = matchSelectPattern(LHS, LHS2, RHS2).Flavor; + if (getInverseMinMaxFlavor(SPF) != SPF2) + return false; + + if (!match(RHS2, m_APInt(CHigh))) + return false; + + if (SPF == SPF_SMIN) + std::swap(CLow, CHigh); + + In = LHS2; + return CLow->sle(*CHigh); +} + +static bool isSignedMinMaxIntrinsicClamp(const IntrinsicInst *II, + const APInt *&CLow, + const APInt *&CHigh) { + assert((II->getIntrinsicID() == Intrinsic::smin || + II->getIntrinsicID() == Intrinsic::smax) && + "Must be smin/smax"); + + Intrinsic::ID InverseID = getInverseMinMaxIntrinsic(II->getIntrinsicID()); + auto *InnerII = dyn_cast(II->getArgOperand(0)); + if (!InnerII || 
InnerII->getIntrinsicID() != InverseID || + !match(II->getArgOperand(1), m_APInt(CLow)) || + !match(InnerII->getArgOperand(1), m_APInt(CHigh))) + return false; + + if (II->getIntrinsicID() == Intrinsic::smin) + std::swap(CLow, CHigh); + return CLow->sle(*CHigh); +} + +static void unionWithMinMaxIntrinsicClamp(const IntrinsicInst *II, + KnownBits &Known) { + const APInt *CLow, *CHigh; + if (isSignedMinMaxIntrinsicClamp(II, CLow, CHigh)) + Known = Known.unionWith(ConstantRange(*CLow, *CHigh + 1).toKnownBits()); +} + static void computeKnownBitsFromOperator(const Operator *I, const APInt &DemandedElts, KnownBits &Known, unsigned Depth, @@ -1804,11 +1861,13 @@ static void computeKnownBitsFromOperator(const Operator *I, computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q); computeKnownBits(I->getOperand(1), DemandedElts, Known2, Depth + 1, Q); Known = KnownBits::smin(Known, Known2); + unionWithMinMaxIntrinsicClamp(II, Known); break; case Intrinsic::smax: computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q); computeKnownBits(I->getOperand(1), DemandedElts, Known2, Depth + 1, Q); Known = KnownBits::smax(Known, Known2); + unionWithMinMaxIntrinsicClamp(II, Known); break; case Intrinsic::ptrmask: { computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q); @@ -3751,55 +3810,6 @@ static bool isKnownNonEqual(const Value *V1, const Value *V2, return false; } -// Match a signed min+max clamp pattern like smax(smin(In, CHigh), CLow). -// Returns the input and lower/upper bounds. 
-static bool isSignedMinMaxClamp(const Value *Select, const Value *&In, - const APInt *&CLow, const APInt *&CHigh) { - assert(isa(Select) && - cast(Select)->getOpcode() == Instruction::Select && - "Input should be a Select!"); - - const Value *LHS = nullptr, *RHS = nullptr; - SelectPatternFlavor SPF = matchSelectPattern(Select, LHS, RHS).Flavor; - if (SPF != SPF_SMAX && SPF != SPF_SMIN) - return false; - - if (!match(RHS, m_APInt(CLow))) - return false; - - const Value *LHS2 = nullptr, *RHS2 = nullptr; - SelectPatternFlavor SPF2 = matchSelectPattern(LHS, LHS2, RHS2).Flavor; - if (getInverseMinMaxFlavor(SPF) != SPF2) - return false; - - if (!match(RHS2, m_APInt(CHigh))) - return false; - - if (SPF == SPF_SMIN) - std::swap(CLow, CHigh); - - In = LHS2; - return CLow->sle(*CHigh); -} - -static bool isSignedMinMaxIntrinsicClamp(const IntrinsicInst *II, - const APInt *&CLow, - const APInt *&CHigh) { - assert((II->getIntrinsicID() == Intrinsic::smin || - II->getIntrinsicID() == Intrinsic::smax) && "Must be smin/smax"); - - Intrinsic::ID InverseID = getInverseMinMaxIntrinsic(II->getIntrinsicID()); - auto *InnerII = dyn_cast(II->getArgOperand(0)); - if (!InnerII || InnerII->getIntrinsicID() != InverseID || - !match(II->getArgOperand(1), m_APInt(CLow)) || - !match(InnerII->getArgOperand(1), m_APInt(CHigh))) - return false; - - if (II->getIntrinsicID() == Intrinsic::smin) - std::swap(CLow, CHigh); - return CLow->sle(*CHigh); -} - /// For vector constants, loop over the elements and find the constant with the /// minimum number of sign bits. 
Return 0 if the value is not a vector constant /// or if any element was not analyzed; otherwise, return the count for the diff --git a/llvm/test/Analysis/ValueTracking/knownbits-trunc-with-min-max-clamp.ll b/llvm/test/Analysis/ValueTracking/knownbits-trunc-with-min-max-clamp.ll new file mode 100644 index 0000000000000..1ff8a41b3459b --- /dev/null +++ b/llvm/test/Analysis/ValueTracking/knownbits-trunc-with-min-max-clamp.ll @@ -0,0 +1,266 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=aggressive-instcombine -S | FileCheck %s + +; The LIT tests rely on i32, i16 and i8 being valid machine types. +target datalayout = "n8:16:32" + +; This LIT test checks if TruncInstCombine pass correctly recognizes the +; constraints from a signed min-max clamp. The clamp is a sequence of smin and +; smax instructions limiting a variable into a range, smin <= x <= smax. +; +; Each LIT test (except the last ones) has two versions depending on the order +; of smin and smax: +; a) y = smax(smin(x, upper_limit), lower_limit) +; b) y = smin(smax(x, lower_limit), upper_limit) + +define i8 @test_0a(i16 %x) { +; CHECK-LABEL: define i8 @test_0a( +; CHECK-SAME: i16 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[X]], i16 31) +; CHECK-NEXT: [[TMP2:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP1]], i16 0) +; CHECK-NEXT: [[A:%.*]] = trunc i16 [[TMP2]] to i8 +; CHECK-NEXT: [[B:%.*]] = lshr i8 [[A]], 2 +; CHECK-NEXT: ret i8 [[B]] +; + %1 = tail call i16 @llvm.smin.i16(i16 %x, i16 31) + %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 0) + %a = sext i16 %2 to i32 + %b = lshr i32 %a, 2 + %b.trunc = trunc i32 %b to i8 + ret i8 %b.trunc +} + +define i8 @test_0b(i16 %x) { +; CHECK-LABEL: define i8 @test_0b( +; CHECK-SAME: i16 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = tail call i16 @llvm.smax.i16(i16 [[X]], i16 0) +; CHECK-NEXT: [[TMP2:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP1]], i16 31) +; 
CHECK-NEXT: [[A:%.*]] = trunc i16 [[TMP2]] to i8 +; CHECK-NEXT: [[B:%.*]] = lshr i8 [[A]], 2 +; CHECK-NEXT: ret i8 [[B]] +; + %1 = tail call i16 @llvm.smax.i16(i16 %x, i16 0) + %2 = tail call i16 @llvm.smin.i16(i16 %1, i16 31) + %a = sext i16 %2 to i32 + %b = lshr i32 %a, 2 + %b.trunc = trunc i32 %b to i8 + ret i8 %b.trunc +} + +define i8 @test_1a(i16 %x) { +; CHECK-LABEL: define i8 @test_1a( +; CHECK-SAME: i16 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[X]], i16 31) +; CHECK-NEXT: [[TMP2:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP1]], i16 0) +; CHECK-NEXT: [[A:%.*]] = trunc i16 [[TMP2]] to i8 +; CHECK-NEXT: [[B:%.*]] = add i8 [[A]], 2 +; CHECK-NEXT: ret i8 [[B]] +; + %1 = tail call i16 @llvm.smin.i16(i16 %x, i16 31) + %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 0) + %a = sext i16 %2 to i32 + %b = add i32 %a, 2 + %b.trunc = trunc i32 %b to i8 + ret i8 %b.trunc +} + +define i8 @test_1b(i16 %x) { +; CHECK-LABEL: define i8 @test_1b( +; CHECK-SAME: i16 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = tail call i16 @llvm.smax.i16(i16 [[X]], i16 0) +; CHECK-NEXT: [[TMP2:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP1]], i16 31) +; CHECK-NEXT: [[A:%.*]] = trunc i16 [[TMP2]] to i8 +; CHECK-NEXT: [[B:%.*]] = add i8 [[A]], 2 +; CHECK-NEXT: ret i8 [[B]] +; + %1 = tail call i16 @llvm.smax.i16(i16 %x, i16 0) + %2 = tail call i16 @llvm.smin.i16(i16 %1, i16 31) + %a = sext i16 %2 to i32 + %b = add i32 %a, 2 + %b.trunc = trunc i32 %b to i8 + ret i8 %b.trunc +} + +define i8 @test_2a(i16 %x) { +; CHECK-LABEL: define i8 @test_2a( +; CHECK-SAME: i16 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[X]], i16 -1) +; CHECK-NEXT: [[TMP2:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP1]], i16 -31) +; CHECK-NEXT: [[A:%.*]] = trunc i16 [[TMP2]] to i8 +; CHECK-NEXT: [[B:%.*]] = add i8 [[A]], 2 +; CHECK-NEXT: ret i8 [[B]] +; + %1 = tail call i16 @llvm.smin.i16(i16 %x, i16 -1) + %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 -31) + %a = 
sext i16 %2 to i32 + %b = add i32 %a, 2 + %b.trunc = trunc i32 %b to i8 + ret i8 %b.trunc +} + +define i8 @test_2b(i16 %x) { +; CHECK-LABEL: define i8 @test_2b( +; CHECK-SAME: i16 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = tail call i16 @llvm.smax.i16(i16 [[X]], i16 -31) +; CHECK-NEXT: [[TMP2:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP1]], i16 -1) +; CHECK-NEXT: [[A:%.*]] = trunc i16 [[TMP2]] to i8 +; CHECK-NEXT: [[B:%.*]] = add i8 [[A]], 2 +; CHECK-NEXT: ret i8 [[B]] +; + %1 = tail call i16 @llvm.smax.i16(i16 %x, i16 -31) + %2 = tail call i16 @llvm.smin.i16(i16 %1, i16 -1) + %a = sext i16 %2 to i32 + %b = add i32 %a, 2 + %b.trunc = trunc i32 %b to i8 + ret i8 %b.trunc +} + +define i8 @test_3a(i16 %x) { +; CHECK-LABEL: define i8 @test_3a( +; CHECK-SAME: i16 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[X]], i16 31) +; CHECK-NEXT: [[TMP2:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP1]], i16 -31) +; CHECK-NEXT: [[A:%.*]] = trunc i16 [[TMP2]] to i8 +; CHECK-NEXT: [[B:%.*]] = add i8 [[A]], 2 +; CHECK-NEXT: ret i8 [[B]] +; + %1 = tail call i16 @llvm.smin.i16(i16 %x, i16 31) + %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 -31) + %a = sext i16 %2 to i32 + %b = add i32 %a, 2 + %b.trunc = trunc i32 %b to i8 + ret i8 %b.trunc +} + +define i8 @test_3b(i16 %x) { +; CHECK-LABEL: define i8 @test_3b( +; CHECK-SAME: i16 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = tail call i16 @llvm.smax.i16(i16 [[X]], i16 -31) +; CHECK-NEXT: [[TMP2:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP1]], i16 31) +; CHECK-NEXT: [[A:%.*]] = trunc i16 [[TMP2]] to i8 +; CHECK-NEXT: [[B:%.*]] = add i8 [[A]], 2 +; CHECK-NEXT: ret i8 [[B]] +; + %1 = tail call i16 @llvm.smax.i16(i16 %x, i16 -31) + %2 = tail call i16 @llvm.smin.i16(i16 %1, i16 31) + %a = sext i16 %2 to i32 + %b = add i32 %a, 2 + %b.trunc = trunc i32 %b to i8 + ret i8 %b.trunc +} + +define <16 x i8> @test_vec_1a(<16 x i16> %x) { +; CHECK-LABEL: define <16 x i8> @test_vec_1a( +; CHECK-SAME: <16 x i16> [[X:%.*]]) 
{ +; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.smin.v16i16(<16 x i16> [[X]], <16 x i16> splat (i16 127)) +; CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> [[TMP1]], <16 x i16> zeroinitializer) +; CHECK-NEXT: [[A:%.*]] = trunc <16 x i16> [[TMP2]] to <16 x i8> +; CHECK-NEXT: [[B:%.*]] = add <16 x i8> [[A]], splat (i8 2) +; CHECK-NEXT: ret <16 x i8> [[B]] +; + %1 = tail call <16 x i16> @llvm.smin.v16i16(<16 x i16> %x, <16 x i16> splat (i16 127)) + %2 = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> %1, <16 x i16> zeroinitializer) + %a = sext <16 x i16> %2 to <16 x i32> + %b = add <16 x i32> %a, splat (i32 2) + %b.trunc = trunc <16 x i32> %b to <16 x i8> + ret <16 x i8> %b.trunc +} + +define <16 x i8> @test_vec_1b(<16 x i16> %x) { +; CHECK-LABEL: define <16 x i8> @test_vec_1b( +; CHECK-SAME: <16 x i16> [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> [[X]], <16 x i16> zeroinitializer) +; CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.smin.v16i16(<16 x i16> [[TMP1]], <16 x i16> splat (i16 127)) +; CHECK-NEXT: [[A:%.*]] = trunc <16 x i16> [[TMP2]] to <16 x i8> +; CHECK-NEXT: [[B:%.*]] = add <16 x i8> [[A]], splat (i8 2) +; CHECK-NEXT: ret <16 x i8> [[B]] +; + %1 = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> %x, <16 x i16> zeroinitializer) + %2 = tail call <16 x i16> @llvm.smin.v16i16(<16 x i16> %1, <16 x i16> splat (i16 127)) + %a = sext <16 x i16> %2 to <16 x i32> + %b = add <16 x i32> %a, splat (i32 2) + %b.trunc = trunc <16 x i32> %b to <16 x i8> + ret <16 x i8> %b.trunc +} + +; A longer test that was the original motivation for the smin-smax clamping. 
+define i8 @test_final(i16 %x, i16 %y) { +; CHECK-LABEL: define i8 @test_final( +; CHECK-SAME: i16 [[X:%.*]], i16 [[Y:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[X]], i16 127) +; CHECK-NEXT: [[TMP2:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP1]], i16 0) +; CHECK-NEXT: [[TMP3:%.*]] = tail call i16 @llvm.smax.i16(i16 [[Y]], i16 0) +; CHECK-NEXT: [[TMP4:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP3]], i16 127) +; CHECK-NEXT: [[MUL:%.*]] = mul i16 [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[SHR:%.*]] = lshr i16 [[MUL]], 7 +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i16 [[SHR]] to i8 +; CHECK-NEXT: ret i8 [[TRUNC]] +; + %1 = tail call i16 @llvm.smin.i16(i16 %x, i16 127) + %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 0) + %x.clamp = zext nneg i16 %2 to i32 + %3 = tail call i16 @llvm.smax.i16(i16 %y, i16 0) + %4 = tail call i16 @llvm.smin.i16(i16 %3, i16 127) + %y.clamp = zext nneg i16 %4 to i32 + %mul = mul nuw nsw i32 %x.clamp, %y.clamp + %shr = lshr i32 %mul, 7 + %trunc= trunc nuw nsw i32 %shr to i8 + ret i8 %trunc +} + +; Range tests below check if the bounds are dealt with correctly. + +; This gets optimized. +define i8 @test_bounds_1(i16 %x) { +; CHECK-LABEL: define i8 @test_bounds_1( +; CHECK-SAME: i16 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[X]], i16 127) +; CHECK-NEXT: [[TMP2:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP1]], i16 0) +; CHECK-NEXT: [[A:%.*]] = trunc i16 [[TMP2]] to i8 +; CHECK-NEXT: [[SHR:%.*]] = ashr i8 [[A]], 7 +; CHECK-NEXT: ret i8 [[SHR]] +; + %1 = tail call i16 @llvm.smin.i16(i16 %x, i16 127) + %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 0) + %a = sext i16 %2 to i32 + %shr = ashr i32 %a, 7 + %b.trunc = trunc i32 %shr to i8 + ret i8 %b.trunc +} + +; While this does not. 
+define i8 @test_bounds_2(i16 %x) { +; CHECK-LABEL: define i8 @test_bounds_2( +; CHECK-SAME: i16 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[X]], i16 128) +; CHECK-NEXT: [[TMP2:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP1]], i16 0) +; CHECK-NEXT: [[SHR:%.*]] = ashr i16 [[TMP2]], 7 +; CHECK-NEXT: [[B_TRUNC:%.*]] = trunc i16 [[SHR]] to i8 +; CHECK-NEXT: ret i8 [[B_TRUNC]] +; + %1 = tail call i16 @llvm.smin.i16(i16 %x, i16 128) + %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 0) + %a = sext i16 %2 to i32 + %shr = ashr i32 %a, 7 + %b.trunc = trunc i32 %shr to i8 + ret i8 %b.trunc +} + +; This should get optimized. We test here if the optimization works correctly +; if the upper limit is signed max int. +define i8 @test_bounds_3(i16 %x) { +; CHECK-LABEL: define i8 @test_bounds_3( +; CHECK-SAME: i16 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[X]], i16 32767) +; CHECK-NEXT: [[TMP2:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP1]], i16 32752) +; CHECK-NEXT: [[A:%.*]] = trunc i16 [[TMP2]] to i8 +; CHECK-NEXT: [[AND:%.*]] = and i8 [[A]], -1 +; CHECK-NEXT: ret i8 [[AND]] +; + %1 = tail call i16 @llvm.smin.i16(i16 %x, i16 32767) + %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 32752) + %a = sext i16 %2 to i32 + %and = and i32 %a, 255 + %b.trunc = trunc i32 %and to i8 + ret i8 %b.trunc +} From 8e7f1bee84ff9421f7a4b57abd1feff6b5680e1a Mon Sep 17 00:00:00 2001 From: Brandon Wu Date: Wed, 25 Dec 2024 22:48:54 +0800 Subject: [PATCH 063/567] [clang][RISCV] Remove unneeded RISCV tuple code (#121024) These code are no longer needed because we've modeled tuple type using target extension type rather than structure of scalable vectors. 
--- clang/lib/CodeGen/CGCall.cpp | 31 ----------------------------- clang/lib/CodeGen/Targets/RISCV.cpp | 8 +------- 2 files changed, 1 insertion(+), 38 deletions(-) diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 50b9dfbbab083..f139c30f3dfd4 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -3235,22 +3235,6 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI, llvm::StructType *STy = dyn_cast(ArgI.getCoerceToType()); - if (ArgI.isDirect() && !ArgI.getCanBeFlattened() && STy && - STy->getNumElements() > 1) { - [[maybe_unused]] llvm::TypeSize StructSize = - CGM.getDataLayout().getTypeAllocSize(STy); - [[maybe_unused]] llvm::TypeSize PtrElementSize = - CGM.getDataLayout().getTypeAllocSize(ConvertTypeForMem(Ty)); - if (STy->containsHomogeneousScalableVectorTypes()) { - assert(StructSize == PtrElementSize && - "Only allow non-fractional movement of structure with" - "homogeneous scalable vector type"); - - ArgVals.push_back(ParamValue::forDirect(AI)); - break; - } - } - Address Alloca = CreateMemTemp(Ty, getContext().getDeclAlign(Arg), Arg->getName()); @@ -5414,21 +5398,6 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, llvm::StructType *STy = dyn_cast(ArgInfo.getCoerceToType()); - if (STy && ArgInfo.isDirect() && !ArgInfo.getCanBeFlattened()) { - llvm::Type *SrcTy = ConvertTypeForMem(I->Ty); - [[maybe_unused]] llvm::TypeSize SrcTypeSize = - CGM.getDataLayout().getTypeAllocSize(SrcTy); - [[maybe_unused]] llvm::TypeSize DstTypeSize = - CGM.getDataLayout().getTypeAllocSize(STy); - if (STy->containsHomogeneousScalableVectorTypes()) { - assert(SrcTypeSize == DstTypeSize && - "Only allow non-fractional movement of structure with " - "homogeneous scalable vector type"); - - IRCallArgs[FirstIRArg] = I->getKnownRValue().getScalarVal(); - break; - } - } // FIXME: Avoid the conversion through memory if possible. 
Address Src = Address::invalid(); diff --git a/clang/lib/CodeGen/Targets/RISCV.cpp b/clang/lib/CodeGen/Targets/RISCV.cpp index b04e436c665f5..873e696e1328f 100644 --- a/clang/lib/CodeGen/Targets/RISCV.cpp +++ b/clang/lib/CodeGen/Targets/RISCV.cpp @@ -495,13 +495,7 @@ ABIArgInfo RISCVABIInfo::classifyArgumentType(QualType Ty, bool IsFixed, return getNaturalAlignIndirect(Ty, /*ByVal=*/false); } - ABIArgInfo Info = ABIArgInfo::getDirect(); - - // If it is tuple type, it can't be flattened. - if (llvm::StructType *STy = dyn_cast(CGT.ConvertType(Ty))) - Info.setCanBeFlattened(!STy->containsHomogeneousScalableVectorTypes()); - - return Info; + return ABIArgInfo::getDirect(); } if (const VectorType *VT = Ty->getAs()) From dd9c5c118230fc9adde668f2c96323b73a677400 Mon Sep 17 00:00:00 2001 From: Shourya Goel Date: Wed, 25 Dec 2024 20:45:32 +0530 Subject: [PATCH 064/567] [libc][complex] enable CFP128 entrypoints on X86_64 (#121111) --- libc/config/linux/x86_64/entrypoints.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 08d8559d8c81a..94f58b85a6e02 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -739,10 +739,10 @@ endif() if(LIBC_TYPES_HAS_FLOAT128) list(APPEND TARGET_LIBM_ENTRYPOINTS # complex.h C23 _Complex _Float128 entrypoints - # libc.src.complex.crealf128 - # libc.src.complex.cimagf128 - # libc.src.complex.conjf128 - # libc.src.complex.cprojf128 + libc.src.complex.crealf128 + libc.src.complex.cimagf128 + libc.src.complex.conjf128 + libc.src.complex.cprojf128 # math.h C23 _Float128 entrypoints libc.src.math.canonicalizef128 From ff97daadcd05ad3373f8398948d2962e88a0f61b Mon Sep 17 00:00:00 2001 From: Shourya Goel Date: Wed, 25 Dec 2024 20:56:13 +0530 Subject: [PATCH 065/567] Revert "[libc][complex] enable CFP128 entrypoints on X86_64 (#121111)" (#121113) This reverts commit 
dd9c5c118230fc9adde668f2c96323b73a677400. --- libc/config/linux/x86_64/entrypoints.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 94f58b85a6e02..08d8559d8c81a 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -739,10 +739,10 @@ endif() if(LIBC_TYPES_HAS_FLOAT128) list(APPEND TARGET_LIBM_ENTRYPOINTS # complex.h C23 _Complex _Float128 entrypoints - libc.src.complex.crealf128 - libc.src.complex.cimagf128 - libc.src.complex.conjf128 - libc.src.complex.cprojf128 + # libc.src.complex.crealf128 + # libc.src.complex.cimagf128 + # libc.src.complex.conjf128 + # libc.src.complex.cprojf128 # math.h C23 _Float128 entrypoints libc.src.math.canonicalizef128 From 70965ef259a161a6e9ccfb8bd841dd2246c56c37 Mon Sep 17 00:00:00 2001 From: TilakChad <49703944+TilakChad@users.noreply.github.com> Date: Wed, 25 Dec 2024 23:44:33 +0545 Subject: [PATCH 066/567] [Clang] Prevent assignment to captured structured bindings inside immutable lambda (#120849) For structured bindings, a call to getCapturedDeclRefType(...) was missing. This PR fixes that behavior and adds the related diagnostics too. This fixes https://github.com/llvm/llvm-project/issues/95081. --- clang/docs/ReleaseNotes.rst | 1 + clang/lib/Sema/SemaExpr.cpp | 43 +++++++++++++--------- clang/test/SemaCXX/cxx20-decomposition.cpp | 23 ++++++++++++ 3 files changed, 50 insertions(+), 17 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index d9b0cb815a15d..4410b9f99e802 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -885,6 +885,7 @@ Bug Fixes to C++ Support - Fixed recognition of ``std::initializer_list`` when it's surrounded with ``extern "C++"`` and exported out of a module (which is the case e.g. in MSVC's implementation of ``std`` module). 
(#GH118218) - Fixed a pack expansion issue in checking unexpanded parameter sizes. (#GH17042) +- Fixed a bug where captured structured bindings were modifiable inside non-mutable lambda (#GH95081) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 24f7d27c69115..562c98c6babe0 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -3352,6 +3352,7 @@ ExprResult Sema::BuildDeclarationNameExpr( case Decl::VarTemplateSpecialization: case Decl::VarTemplatePartialSpecialization: case Decl::Decomposition: + case Decl::Binding: case Decl::OMPCapturedExpr: // In C, "extern void blah;" is valid and is an r-value. if (!getLangOpts().CPlusPlus && !type.hasQualifiers() && @@ -3371,20 +3372,13 @@ ExprResult Sema::BuildDeclarationNameExpr( // potentially-evaluated contexts? Since the variable isn't actually // captured in an unevaluated context, it seems that the answer is no. if (!isUnevaluatedContext()) { - QualType CapturedType = getCapturedDeclRefType(cast(VD), Loc); + QualType CapturedType = getCapturedDeclRefType(cast(VD), Loc); if (!CapturedType.isNull()) type = CapturedType; } - break; } - case Decl::Binding: - // These are always lvalues. - valueKind = VK_LValue; - type = type.getNonReferenceType(); - break; - case Decl::Function: { if (unsigned BID = cast(VD)->getBuiltinID()) { if (!Context.BuiltinInfo.isDirectlyAddressable(BID)) { @@ -13297,11 +13291,24 @@ static NonConstCaptureKind isReferenceToNonConstCapture(Sema &S, Expr *E) { if (!DRE) return NCCK_None; if (!DRE->refersToEnclosingVariableOrCapture()) return NCCK_None; - // The declaration must be a variable which is not declared 'const'. 
- VarDecl *var = dyn_cast(DRE->getDecl()); - if (!var) return NCCK_None; - if (var->getType().isConstQualified()) return NCCK_None; - assert(var->hasLocalStorage() && "capture added 'const' to non-local?"); + ValueDecl *Value = dyn_cast(DRE->getDecl()); + + // The declaration must be a value which is not declared 'const'. + if (!Value || Value->getType().isConstQualified()) + return NCCK_None; + + BindingDecl *Binding = dyn_cast(Value); + if (Binding) { + assert(S.getLangOpts().CPlusPlus && "BindingDecl outside of C++?"); + assert(!isa(Binding->getDeclContext())); + return NCCK_Lambda; + } + + VarDecl *Var = dyn_cast(Value); + if (!Var) + return NCCK_None; + + assert(Var->hasLocalStorage() && "capture added 'const' to non-local?"); // Decide whether the first capture was for a block or a lambda. DeclContext *DC = S.CurContext, *Prev = nullptr; @@ -13310,16 +13317,16 @@ static NonConstCaptureKind isReferenceToNonConstCapture(Sema &S, Expr *E) { // For init-capture, it is possible that the variable belongs to the // template pattern of the current context. if (auto *FD = dyn_cast(DC)) - if (var->isInitCapture() && - FD->getTemplateInstantiationPattern() == var->getDeclContext()) + if (Var->isInitCapture() && + FD->getTemplateInstantiationPattern() == Var->getDeclContext()) break; - if (DC == var->getDeclContext()) + if (DC == Var->getDeclContext()) break; Prev = DC; DC = DC->getParent(); } // Unless we have an init-capture, we've gone one step too far. - if (!var->isInitCapture()) + if (!Var->isInitCapture()) DC = Prev; return (isa(DC) ? 
NCCK_Block : NCCK_Lambda); } @@ -19247,6 +19254,8 @@ bool Sema::NeedToCaptureVariable(ValueDecl *Var, SourceLocation Loc) { } QualType Sema::getCapturedDeclRefType(ValueDecl *Var, SourceLocation Loc) { + assert(Var && "Null value cannot be captured"); + QualType CaptureType; QualType DeclRefType; diff --git a/clang/test/SemaCXX/cxx20-decomposition.cpp b/clang/test/SemaCXX/cxx20-decomposition.cpp index 430a158ff458e..ccc1af5898059 100644 --- a/clang/test/SemaCXX/cxx20-decomposition.cpp +++ b/clang/test/SemaCXX/cxx20-decomposition.cpp @@ -183,3 +183,26 @@ namespace ODRUseTests { }(0); }(0); // expected-note 2{{in instantiation}} } } + + +namespace GH95081 { + void prevent_assignment_check() { + int arr[] = {1,2}; + auto [e1, e2] = arr; + + auto lambda = [e1] { + e1 = 42; // expected-error {{cannot assign to a variable captured by copy in a non-mutable lambda}} + }; + } + + void f(int&) = delete; + void f(const int&); + + int arr[1]; + void foo() { + auto [x] = arr; + [x]() { + f(x); // deleted f(int&) used to be picked up erroneously + } (); + } +} From d6e435362d5984052531a425a1e4c7e594a8503e Mon Sep 17 00:00:00 2001 From: Andrei Safronov Date: Thu, 26 Dec 2024 01:02:53 +0300 Subject: [PATCH 067/567] [Xtensa] Fix Code Density test. (#121073) Fix dissambler test for Code Density feature. 
--- .../MC/Disassembler/Xtensa/code_density.txt | 58 ++++++++----------- 1 file changed, 24 insertions(+), 34 deletions(-) diff --git a/llvm/test/MC/Disassembler/Xtensa/code_density.txt b/llvm/test/MC/Disassembler/Xtensa/code_density.txt index eac236a4f3081..833dd52e584b2 100644 --- a/llvm/test/MC/Disassembler/Xtensa/code_density.txt +++ b/llvm/test/MC/Disassembler/Xtensa/code_density.txt @@ -1,4 +1,5 @@ -# RUN: llvm-mc -triple=xtensa -mattr=+density -disassemble %s | FileCheck --check-prefix=CHECK-DENSITY %s +# RUN: llvm-mc -triple=xtensa -mattr=+density -disassemble %s | FileCheck -check-prefixes=CHECK-DENSITY %s +# RUN: not llvm-mc -triple=xtensa -disassemble %s 2>&1 | FileCheck --implicit-check-not=warning: -check-prefixes=CHECK-CORE %s #------------------------------------------------------------------------------ # Verify that binary code is correctly disassembled with @@ -6,57 +7,46 @@ # density option generates warnings. #------------------------------------------------------------------------------ -0x4a 0x23 +[0x4a, 0x23] # CHECK-DENSITY: add.n a2, a3, a4 -# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding -# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding +# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding -0x3b 0x23 +[0x3b, 0x23] # CHECK-DENSITY: addi.n a2, a3, 3 -# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding -# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding +# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding -0x9c 0x03 +[0x9c, 0x03] # CHECK-DENSITY: beqz.n a3, . +20 -# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding -# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding +# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding -0xcc 0xe3 +[0xcc, 0xe3] # CHECK-DENSITY: bnez.n a3, . 
+18 -# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding -# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding +# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding -0x6d 0xf0 +[0x6d, 0xf0] # CHECK-DENSITY: ill.n -# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding -# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding +# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding -0x28 0x33 +[0x28, 0x33] # CHECK-DENSITY: l32i.n a2, a3, 12 -# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding -# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding +# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding -0x2d 0x03 +[0x2d, 0x03] # CHECK-DENSITY: mov.n a2, a3 -# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding -# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding +# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding -0x0d 0xf0 +[0x0d, 0xf0] # CHECK-DENSITY: ret.n -# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding -# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding +# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding -0x29 0x33 +[0x29, 0x33] # CHECK-DENSITY: s32i.n a2, a3, 12 -# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding -# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding +# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding -0x6c 0x02 +[0x6c, 0x02] # CHECK-DENSITY: movi.n a2, -32 -# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding -# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding +# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding -0x3d 0xf0 +[0x3d, 0xf0] # CHECK-DENSITY: nop.n -# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding -# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding +# CHECK-CORE: [[#@LINE-2]]:2: 
warning: invalid instruction encoding From cea738bc9a9e4835f9a076145953ed1369dcad7d Mon Sep 17 00:00:00 2001 From: DaPorkchop_ Date: Thu, 26 Dec 2024 00:47:26 +0100 Subject: [PATCH 068/567] [SimplifyCFG] Replace unreachable switch lookup table holes with poison (#94990) As discussed in #94468, this causes switch lookup table entries which are unreachable to be poison instead of filling them with a value from one of the reachable cases. --------- Co-authored-by: DianQK --- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 29 ++- .../SimplifyCFG/X86/switch_to_lookup_table.ll | 232 +++++++++++++++++- .../X86/switch_to_lookup_table_big.ll | 6 +- 3 files changed, 254 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 17f4b396f753b..febc5682c2129 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -6531,8 +6531,8 @@ SwitchLookupTable::SwitchLookupTable( uint64_t Idx = (CaseVal->getValue() - Offset->getValue()).getLimitedValue(); TableContents[Idx] = CaseRes; - if (CaseRes != SingleValue) - SingleValue = nullptr; + if (SingleValue && !isa(CaseRes) && CaseRes != SingleValue) + SingleValue = isa(SingleValue) ? CaseRes : nullptr; } // Fill in any holes in the table with the default result. @@ -6545,7 +6545,10 @@ SwitchLookupTable::SwitchLookupTable( TableContents[I] = DefaultValue; } - if (DefaultValue != SingleValue) + // If the default value is poison, all the holes are poison. + bool DefaultValueIsPoison = isa(DefaultValue); + + if (DefaultValue != SingleValue && !DefaultValueIsPoison) SingleValue = nullptr; } @@ -6569,6 +6572,16 @@ SwitchLookupTable::SwitchLookupTable( // Check if there is the same distance between two consecutive values. for (uint64_t I = 0; I < TableSize; ++I) { ConstantInt *ConstVal = dyn_cast(TableContents[I]); + + if (!ConstVal && isa(TableContents[I])) { + // This is an poison, so it's (probably) a lookup table hole. 
+ // To prevent any regressions from before we switched to using poison as + // the default value, holes will fall back to using the first value. + // This can be removed once we add proper handling for poisons in lookup + // tables. + ConstVal = dyn_cast(Values[0].second); + } + if (!ConstVal) { // This is an undef. We could deal with it, but undefs in lookup tables // are very seldom. It's probably not worth the additional complexity. @@ -7003,8 +7016,8 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, // If the table has holes but the default destination doesn't produce any // constant results, the lookup table entries corresponding to the holes will - // contain undefined values. - bool AllHolesAreUndefined = TableHasHoles && !HasDefaultResults; + // contain poison. + bool AllHolesArePoison = TableHasHoles && !HasDefaultResults; // If the default destination doesn't produce a constant result but is still // reachable, and the lookup table has holes, we need to use a mask to @@ -7012,7 +7025,7 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, // to the default case. // The mask is unnecessary if the table has holes but the default destination // is unreachable, as in that case the holes must also be unreachable. - bool NeedMask = AllHolesAreUndefined && DefaultIsReachable; + bool NeedMask = AllHolesArePoison && DefaultIsReachable; if (NeedMask) { // As an extra penalty for the validity test we require more cases. if (SI->getNumCases() < 4) // FIXME: Find best threshold value (benchmark). @@ -7157,9 +7170,11 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, for (PHINode *PHI : PHIs) { const ResultListTy &ResultList = ResultLists[PHI]; + Type *ResultType = ResultList.begin()->second->getType(); + // Use any value to fill the lookup table holes. Constant *DV = - AllHolesAreUndefined ? ResultLists[PHI][0].second : DefaultResults[PHI]; + AllHolesArePoison ? 
PoisonValue::get(ResultType) : DefaultResults[PHI]; StringRef FuncName = Fn->getName(); SwitchLookupTable Table(Mod, TableSize, TableIndexOffset, ResultList, DV, DL, FuncName); diff --git a/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll b/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll index 7f484e2ec29d7..ffbacc1a89031 100644 --- a/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll +++ b/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll @@ -34,11 +34,14 @@ target triple = "x86_64-unknown-linux-gnu" ; CHECK: @switch.table.unreachable_case = private unnamed_addr constant [9 x i32] [i32 0, i32 0, i32 0, i32 2, i32 -1, i32 1, i32 1, i32 1, i32 1], align 4 ; CHECK: @switch.table.unreachable_default = private unnamed_addr constant [4 x i32] [i32 42, i32 52, i32 1, i32 2], align 4 ; CHECK: @switch.table.nodefaultnoholes = private unnamed_addr constant [4 x i32] [i32 55, i32 123, i32 0, i32 -1], align 4 -; CHECK: @switch.table.nodefaultwithholes = private unnamed_addr constant [6 x i32] [i32 55, i32 123, i32 0, i32 -1, i32 55, i32 -1], align 4 +; CHECK: @switch.table.nodefaultwithholes = private unnamed_addr constant [6 x i32] [i32 55, i32 123, i32 0, i32 -1, i32 poison, i32 -1], align 4 ; CHECK: @switch.table.threecases = private unnamed_addr constant [3 x i32] [i32 10, i32 7, i32 5], align 4 -; CHECK: @switch.table.covered_switch_with_bit_tests = private unnamed_addr constant [8 x i32] [i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 1, i32 1], align 4 +; CHECK: @switch.table.covered_switch_with_bit_tests = private unnamed_addr constant [8 x i32] [i32 2, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 1, i32 1], align 4 ; CHECK: @switch.table.signed_overflow1 = private unnamed_addr constant [4 x i32] [i32 3333, i32 4444, i32 1111, i32 2222], align 4 -; CHECK: @switch.table.signed_overflow2 = private unnamed_addr constant [4 x i32] [i32 3333, i32 4444, i32 2222, i32 2222], align 4 +; CHECK: 
@switch.table.signed_overflow2 = private unnamed_addr constant [4 x i32] [i32 3333, i32 4444, i32 poison, i32 2222], align 4 +; CHECK: @switch.table.constant_hole_unreachable_default_firstundef = private unnamed_addr constant [5 x i32] [i32 undef, i32 poison, i32 1, i32 1, i32 1], align 4 +; CHECK: @switch.table.constant_hole_unreachable_default_lastundef = private unnamed_addr constant [5 x i32] [i32 1, i32 poison, i32 1, i32 1, i32 undef], align 4 +; CHECK: @switch.table.linearmap_hole_unreachable_default = private unnamed_addr constant [5 x i32] [i32 1, i32 poison, i32 5, i32 7, i32 9], align 4 ;. define i32 @f(i32 %c) { ; CHECK-LABEL: @f( @@ -2184,3 +2187,226 @@ return: ; preds = %sw.default, %entry, %retval.0 = phi { i8, i8 } [ undef, %entry ], [ undef, %entry ], [ undef, %entry ], [ %1, %sw.default ] ret { i8, i8 } %retval.0 } + +; The switch has a hole which falls through to an unreachable default case, but it can still be optimized into a constant load because +; the poison value used for the hole is ignored. +define i32 @constant_hole_unreachable_default(i32 %x) { +; CHECK-LABEL: @constant_hole_unreachable_default( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret i32 1 +; +entry: + switch i32 %x, label %sw.default [ + i32 0, label %bb0 + i32 2, label %bb0 + i32 3, label %bb0 + i32 4, label %bb0 + ] + +sw.default: unreachable +bb0: br label %return + +return: + %res = phi i32 [ 1, %bb0 ] + ret i32 %res +} + +; The switch has a hole which falls through to an unreachable default case and the first case explicitly returns undef, yet it cannot be optimized into a simple +; constant because we actually treat undef as a unique value rather than ignoring it. 
+define i32 @constant_hole_unreachable_default_firstundef(i32 %x) { +; CHECK-LABEL: @constant_hole_unreachable_default_firstundef( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SWITCH_GEP:%.*]] = getelementptr inbounds [5 x i32], ptr @switch.table.constant_hole_unreachable_default_firstundef, i32 0, i32 [[X:%.*]] +; CHECK-NEXT: [[SWITCH_LOAD:%.*]] = load i32, ptr [[SWITCH_GEP]], align 4 +; CHECK-NEXT: ret i32 [[SWITCH_LOAD]] +; +entry: + switch i32 %x, label %sw.default [ + i32 0, label %bb.undef + i32 2, label %bb0 + i32 3, label %bb0 + i32 4, label %bb0 + ] + +sw.default: unreachable +bb.undef: br label %return +bb0: br label %return + +return: + %res = phi i32 [ undef, %bb.undef ], [ 1, %bb0 ] + ret i32 %res +} + +; The switch has a hole which falls through to an unreachable default case and the last case explicitly returns undef, yet it cannot be optimized into a simple +; constant because we actually treat undef as a unique value rather than ignoring it. +define i32 @constant_hole_unreachable_default_lastundef(i32 %x) { +; CHECK-LABEL: @constant_hole_unreachable_default_lastundef( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SWITCH_GEP:%.*]] = getelementptr inbounds [5 x i32], ptr @switch.table.constant_hole_unreachable_default_lastundef, i32 0, i32 [[X:%.*]] +; CHECK-NEXT: [[SWITCH_LOAD:%.*]] = load i32, ptr [[SWITCH_GEP]], align 4 +; CHECK-NEXT: ret i32 [[SWITCH_LOAD]] +; +entry: + switch i32 %x, label %sw.default [ + i32 0, label %bb0 + i32 2, label %bb0 + i32 3, label %bb0 + i32 4, label %bb.undef + ] + +sw.default: unreachable +bb.undef: br label %return +bb0: br label %return + +return: + %res = phi i32 [ undef, %bb.undef ], [ 1, %bb0 ] + ret i32 %res +} + +; The switch has a hole which falls through to an unreachable default case and the first case explicitly returns poison, but it can still +; be optimized into a constant load because the poison values are ignored. 
+define i32 @constant_hole_unreachable_default_firstpoison(i32 %x) { +; CHECK-LABEL: @constant_hole_unreachable_default_firstpoison( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret i32 1 +; +entry: + switch i32 %x, label %sw.default [ + i32 0, label %bb.poison + i32 2, label %bb0 + i32 3, label %bb0 + i32 4, label %bb0 + ] + +sw.default: unreachable +bb.poison: br label %return +bb0: br label %return + +return: + %res = phi i32 [ poison, %bb.poison ], [ 1, %bb0 ] + ret i32 %res +} + +; The switch has a hole which falls through to an unreachable default case and the first case explicitly returns poison, but it can still +; be optimized into a constant load because the poison values are ignored. +define i32 @constant_hole_unreachable_default_lastpoison(i32 %x) { +; CHECK-LABEL: @constant_hole_unreachable_default_lastpoison( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret i32 1 +; +entry: + switch i32 %x, label %sw.default [ + i32 0, label %bb0 + i32 2, label %bb0 + i32 3, label %bb0 + i32 4, label %bb.poison + ] + +sw.default: unreachable +bb.poison: br label %return +bb0: br label %return + +return: + %res = phi i32 [ poison, %bb.poison ], [ 1, %bb0 ] + ret i32 %res +} + +define i32 @constant_hole_unreachable_default_undef_poison(i32 %x) { +; CHECK-LABEL: @constant_hole_unreachable_default_undef_poison( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret i32 undef +; +entry: + switch i32 %x, label %sw.default [ + i32 0, label %bb.undef + i32 2, label %bb.poison + i32 3, label %bb.poison + i32 4, label %bb.poison + ] + +sw.default: unreachable +bb.undef: br label %return +bb.poison: br label %return + +return: + %res = phi i32 [ undef, %bb.undef ], [ poison, %bb.poison ] + ret i32 %res +} + +define i32 @constant_hole_unreachable_default_poison_undef(i32 %x) { +; CHECK-LABEL: @constant_hole_unreachable_default_poison_undef( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret i32 undef +; +entry: + switch i32 %x, label %sw.default [ + i32 0, label %bb.poison + i32 2, label %bb.poison + i32 3, label 
%bb.poison + i32 4, label %bb.undef + ] + +sw.default: unreachable +bb.undef: br label %return +bb.poison: br label %return + +return: + %res = phi i32 [ undef, %bb.undef ], [ poison, %bb.poison ] + ret i32 %res +} + +; The switch has a hole which falls through to an unreachable default case, which prevents it from being optimized into a linear mapping 2*x+1. +; TODO: We should add support for this, at least in certain cases. +define i32 @linearmap_hole_unreachable_default(i32 %x) { +; CHECK-LABEL: @linearmap_hole_unreachable_default( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SWITCH_GEP:%.*]] = getelementptr inbounds [5 x i32], ptr @switch.table.linearmap_hole_unreachable_default, i32 0, i32 [[X:%.*]] +; CHECK-NEXT: [[SWITCH_LOAD:%.*]] = load i32, ptr [[SWITCH_GEP]], align 4 +; CHECK-NEXT: ret i32 [[SWITCH_LOAD]] +; +entry: + switch i32 %x, label %sw.default [ + i32 0, label %bb0 + i32 2, label %bb2 + i32 3, label %bb3 + i32 4, label %bb4 + ] + +sw.default: unreachable +bb0: br label %return +bb2: br label %return +bb3: br label %return +bb4: br label %return + +return: + %res = phi i32 [ 1, %bb0 ], [ 5, %bb2 ], [ 7, %bb3 ], [ 9, %bb4 ] + ret i32 %res +} + +; The switch has a hole which falls through to an unreachable default case, but it can still be optimized into a bitmask extraction because +; the poison value used for the hole is simply replaced with zero. 
+define i1 @bitset_hole_unreachable_default(i32 %x) { +; CHECK-LABEL: @bitset_hole_unreachable_default( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SWITCH_CAST:%.*]] = trunc i32 [[X:%.*]] to i5 +; CHECK-NEXT: [[SWITCH_SHIFTAMT:%.*]] = mul nuw nsw i5 [[SWITCH_CAST]], 1 +; CHECK-NEXT: [[SWITCH_DOWNSHIFT:%.*]] = lshr i5 8, [[SWITCH_SHIFTAMT]] +; CHECK-NEXT: [[SWITCH_MASKED:%.*]] = trunc i5 [[SWITCH_DOWNSHIFT]] to i1 +; CHECK-NEXT: ret i1 [[SWITCH_MASKED]] +; +entry: + switch i32 %x, label %sw.default [ + i32 0, label %bb0 + i32 2, label %bb0 + i32 3, label %bb1 + i32 4, label %bb0 + ] + +sw.default: unreachable +bb0: br label %return +bb1: br label %return + +return: + %res = phi i1 [ 0, %bb0 ], [ 1, %bb1 ] + ret i1 %res +} diff --git a/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table_big.ll b/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table_big.ll index 7988e3057a2c2..4ebf09ae3b127 100644 --- a/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table_big.ll +++ b/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table_big.ll @@ -7,11 +7,11 @@ target triple = "i386-pc-linux-gnu" ;. 
; CHECK: @switch.table.reachable_default_dense_0to31 = private unnamed_addr constant [32 x i32] [i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1], align 4 ; CHECK: @switch.table.unreachable_default_dense_0to31 = private unnamed_addr constant [32 x i32] [i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1], align 4 -; CHECK: @switch.table.reachable_default_holes_0to31 = private unnamed_addr constant [32 x i32] [i32 0, i32 7, i32 6, i32 0, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 0, i32 2, i32 1, i32 0, i32 7, i32 0, i32 5, i32 4, i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 5, i32 0, i32 3, i32 2, i32 1], align 4 -; CHECK: @switch.table.unreachable_default_holes_0to31 = private unnamed_addr constant [32 x i32] [i32 0, i32 7, i32 6, i32 0, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 0, i32 2, i32 1, i32 0, i32 7, i32 0, i32 5, i32 4, i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 5, i32 0, i32 3, i32 2, i32 1], align 4 +; CHECK: @switch.table.reachable_default_holes_0to31 = private unnamed_addr constant [32 x i32] [i32 0, i32 7, i32 6, i32 poison, i32 4, i32 3, i32 2, i32 1, i32 poison, i32 7, i32 6, i32 5, i32 4, i32 poison, i32 2, i32 1, i32 0, i32 7, i32 poison, i32 5, i32 4, i32 3, i32 2, i32 poison, i32 0, i32 7, i32 6, i32 5, i32 poison, i32 3, i32 2, i32 1], align 4 +; CHECK: @switch.table.unreachable_default_holes_0to31 = private unnamed_addr constant [32 x i32] [i32 0, i32 7, i32 6, i32 poison, i32 4, i32 3, i32 2, i32 1, i32 poison, i32 7, i32 6, i32 5, i32 4, i32 poison, i32 2, i32 1, i32 0, i32 7, i32 poison, i32 5, i32 4, i32 3, i32 2, i32 poison, i32 0, i32 7, i32 
6, i32 5, i32 poison, i32 3, i32 2, i32 1], align 4 ; CHECK: @switch.table.reachable_default_dense_0to32 = private unnamed_addr constant [33 x i32] [i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0], align 4 ; CHECK: @switch.table.unreachable_default_dense_0to32 = private unnamed_addr constant [33 x i32] [i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0], align 4 -; CHECK: @switch.table.unreachable_default_holes_0to32 = private unnamed_addr constant [33 x i32] [i32 0, i32 7, i32 6, i32 0, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 0, i32 2, i32 1, i32 0, i32 7, i32 0, i32 5, i32 4, i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 5, i32 0, i32 3, i32 2, i32 1, i32 0], align 4 +; CHECK: @switch.table.unreachable_default_holes_0to32 = private unnamed_addr constant [33 x i32] [i32 0, i32 7, i32 6, i32 poison, i32 4, i32 3, i32 2, i32 1, i32 poison, i32 7, i32 6, i32 5, i32 4, i32 poison, i32 2, i32 1, i32 0, i32 7, i32 poison, i32 5, i32 4, i32 3, i32 2, i32 poison, i32 0, i32 7, i32 6, i32 5, i32 poison, i32 3, i32 2, i32 1, i32 0], align 4 ;. define i32 @reachable_default_dense_0to31(i32 %x, i32 %y) { ; CHECK-LABEL: @reachable_default_dense_0to31( From 6f72d28dd94c84b28275dbe61f5b2021df752e55 Mon Sep 17 00:00:00 2001 From: Sergei Barannikov Date: Thu, 26 Dec 2024 08:36:35 +0300 Subject: [PATCH 069/567] [TableGen][GISel] Don't copy dead def from a sub-instruction to the root (#121094) Sub-instruction can have a def with the same name as a def in a top-level instruction. Previously this could result in both defs copied to the instruction being built. 
--- .../TableGen/GlobalISelEmitter/dead-def.td | 27 +++++++++++++++++++ llvm/utils/TableGen/GlobalISelEmitter.cpp | 20 +++++++------- 2 files changed, 38 insertions(+), 9 deletions(-) create mode 100644 llvm/test/TableGen/GlobalISelEmitter/dead-def.td diff --git a/llvm/test/TableGen/GlobalISelEmitter/dead-def.td b/llvm/test/TableGen/GlobalISelEmitter/dead-def.td new file mode 100644 index 0000000000000..a8597f1d84064 --- /dev/null +++ b/llvm/test/TableGen/GlobalISelEmitter/dead-def.td @@ -0,0 +1,27 @@ +// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false \ +// RUN: -I %p/../../../include -I %p/../Common %s | FileCheck %s + +include "llvm/Target/Target.td" +include "GlobalISelEmitterCommon.td" + +// Check that $same_name from I2 isn't copied to the root instruction. + +def I1 : I<(outs GPR32:$same_name), (ins GPR32:$rs), []>; +def I2 : I<(outs GPR32:$other_name, GPR32:$same_name), (ins GPR32:$rs), []>; + +def : Pat<(abs i32:$x), (I1 (I2 $x))>; + +// CHECK-LABEL: // (abs:{ *:[i32] } i32:{ *:[i32] }:$x) => (I1:{ *:[i32] } (I2:{ *:[i32] }:{ *:[i32] } ?:{ *:[i32] }:$x)) +// CHECK-NEXT: GIR_MakeTempReg, /*TempRegID*/1, /*TypeID*/GILLT_s32, +// CHECK-NEXT: GIR_MakeTempReg, /*TempRegID*/0, /*TypeID*/GILLT_s32, +// CHECK-NEXT: GIR_BuildMI, /*InsnID*/1, /*Opcode*/GIMT_Encode2(MyTarget::I2), +// CHECK-NEXT: GIR_AddTempRegister, /*InsnID*/1, /*TempRegID*/0, /*TempRegFlags*/GIMT_Encode2(RegState::Define), +// CHECK-NEXT: GIR_AddTempRegister, /*InsnID*/1, /*TempRegID*/1, /*TempRegFlags*/GIMT_Encode2(RegState::Define|RegState::Dead), +// CHECK-NEXT: GIR_Copy, /*NewInsnID*/1, /*OldInsnID*/0, /*OpIdx*/1, // x +// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/1, +// CHECK-NEXT: GIR_BuildRootMI, /*Opcode*/GIMT_Encode2(MyTarget::I1), +// CHECK-NEXT: GIR_RootToRootCopy, /*OpIdx*/0, // DstI[same_name] +// CHECK-NEXT: GIR_AddSimpleTempRegister, /*InsnID*/0, /*TempRegID*/0, +// CHECK-NEXT: GIR_RootConstrainSelectedInstOperands, +// CHECK-NEXT: // GIR_Coverage, 0, 
+// CHECK-NEXT: GIR_EraseRootFromParent_Done, diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp index 5038be7b24fbc..9c945edfafe36 100644 --- a/llvm/utils/TableGen/GlobalISelEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp @@ -404,9 +404,10 @@ class GlobalISelEmitter final : public GlobalISelMatchTableExecutorEmitter { createInstructionRenderer(action_iterator InsertPt, RuleMatcher &M, const TreePatternNode &Dst) const; - Expected importExplicitDefRenderers( - action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder, - const TreePatternNode &Dst, unsigned Start = 0) const; + Expected + importExplicitDefRenderers(action_iterator InsertPt, RuleMatcher &M, + BuildMIAction &DstMIBuilder, + const TreePatternNode &Dst, bool IsRoot) const; Expected importExplicitUseRenderers(action_iterator InsertPt, RuleMatcher &M, @@ -1369,7 +1370,8 @@ Expected GlobalISelEmitter::createAndImportInstructionRenderer( CopyToPhysRegMIBuilder.addRenderer(PhysInput.first); } - if (auto Error = importExplicitDefRenderers(InsertPt, M, DstMIBuilder, Dst) + if (auto Error = importExplicitDefRenderers(InsertPt, M, DstMIBuilder, Dst, + /*IsRoot=*/true) .takeError()) return std::move(Error); @@ -1398,8 +1400,8 @@ GlobalISelEmitter::createAndImportSubInstructionRenderer( DstMIBuilder.addRenderer(TempRegID, true); // Handle additional (ignored) results. 
- InsertPtOrError = importExplicitDefRenderers(std::prev(*InsertPtOrError), M, - DstMIBuilder, Dst, /*Start=*/1); + InsertPtOrError = importExplicitDefRenderers( + std::prev(*InsertPtOrError), M, DstMIBuilder, Dst, /*IsRoot=*/false); if (auto Error = InsertPtOrError.takeError()) return std::move(Error); @@ -1440,16 +1442,16 @@ GlobalISelEmitter::createInstructionRenderer(action_iterator InsertPt, Expected GlobalISelEmitter::importExplicitDefRenderers( action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder, - const TreePatternNode &Dst, unsigned Start) const { + const TreePatternNode &Dst, bool IsRoot) const { const CodeGenInstruction *DstI = DstMIBuilder.getCGI(); // Process explicit defs. The caller may have already handled the first def. - for (unsigned I = Start, E = DstI->Operands.NumDefs; I != E; ++I) { + for (unsigned I = IsRoot ? 0 : 1, E = DstI->Operands.NumDefs; I != E; ++I) { const CGIOperandList::OperandInfo &OpInfo = DstI->Operands[I]; std::string OpName = getMangledRootDefName(OpInfo.Name); // If the def is used in the source DAG, forward it. - if (M.hasOperand(OpName)) { + if (IsRoot && M.hasOperand(OpName)) { // CopyRenderer saves a StringRef, so cannot pass OpName itself - // let's use a string with an appropriate lifetime. StringRef PermanentRef = M.getOperandMatcher(OpName).getSymbolicName(); From a0e1fcc093e0bb1265d8a6977f96dff51d9f5318 Mon Sep 17 00:00:00 2001 From: Sergei Barannikov Date: Thu, 26 Dec 2024 08:40:47 +0300 Subject: [PATCH 070/567] [TableGen][GISel] Refactor node renderers emission (#121071) Split importExplicitUseRenderer into several smaller functions and add a bunch of TODOs and FIXMEs. This is an NFCI change to simplify review of future functional changes. 
Pull Request: https://github.com/llvm/llvm-project/pull/121071 --- llvm/utils/TableGen/GlobalISelEmitter.cpp | 331 +++++++++++++--------- 1 file changed, 195 insertions(+), 136 deletions(-) diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp index 9c945edfafe36..0b910096b0528 100644 --- a/llvm/utils/TableGen/GlobalISelEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp @@ -413,10 +413,24 @@ class GlobalISelEmitter final : public GlobalISelMatchTableExecutorEmitter { importExplicitUseRenderers(action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder, const TreePatternNode &Dst) const; - Expected - importExplicitUseRenderer(action_iterator InsertPt, RuleMatcher &Rule, - BuildMIAction &DstMIBuilder, - const TreePatternNode &Dst) const; + + Error importNamedNodeRenderer(RuleMatcher &M, BuildMIAction &MIBuilder, + const TreePatternNode &N) const; + + Error importLeafNodeRenderer(RuleMatcher &M, BuildMIAction &MIBuilder, + const TreePatternNode &N) const; + + Error importXFormNodeRenderer(RuleMatcher &M, BuildMIAction &MIBuilder, + const TreePatternNode &N) const; + + Error importInstructionNodeRenderer(RuleMatcher &M, BuildMIAction &MIBuilder, + const TreePatternNode &N, + action_iterator &InsertPt) const; + + Error importNodeRenderer(RuleMatcher &M, BuildMIAction &MIBuilder, + const TreePatternNode &N, + action_iterator &InsertPt) const; + Error importDefaultOperandRenderers(action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder, const DAGDefaultOperand &DefaultOp) const; @@ -1190,159 +1204,207 @@ Error GlobalISelEmitter::importChildMatcher( return failedImport("Src pattern child is an unsupported kind"); } -Expected GlobalISelEmitter::importExplicitUseRenderer( - action_iterator InsertPt, RuleMatcher &Rule, BuildMIAction &DstMIBuilder, - const TreePatternNode &Dst) const { +// Equivalent of MatcherGen::EmitResultOfNamedOperand. 
+Error GlobalISelEmitter::importNamedNodeRenderer( + RuleMatcher &M, BuildMIAction &MIBuilder, const TreePatternNode &N) const { + StringRef NodeName = N.getName(); - const auto &SubOperand = Rule.getComplexSubOperand(Dst.getName()); - if (SubOperand) { - DstMIBuilder.addRenderer( - *std::get<0>(*SubOperand), Dst.getName(), std::get<1>(*SubOperand), - std::get<2>(*SubOperand)); - return InsertPt; + if (auto SubOperand = M.getComplexSubOperand(NodeName)) { + auto [ComplexPatternRec, RendererID, SubOperandIdx] = *SubOperand; + MIBuilder.addRenderer( + *ComplexPatternRec, NodeName, RendererID, SubOperandIdx); + return Error::success(); } - if (!Dst.isLeaf()) { - if (Dst.getOperator()->isSubClassOf("SDNodeXForm")) { - auto &Child = Dst.getChild(0); - auto I = SDNodeXFormEquivs.find(Dst.getOperator()); - if (I != SDNodeXFormEquivs.end()) { - const Record *XFormOpc = Dst.getOperator()->getValueAsDef("Opcode"); - if (XFormOpc->getName() == "timm") { - // If this is a TargetConstant, there won't be a corresponding - // instruction to transform. Instead, this will refer directly to an - // operand in an instruction's operand list. - DstMIBuilder.addRenderer(*I->second, - Child.getName()); - } else { - DstMIBuilder.addRenderer(*I->second, Child.getName()); - } - - return InsertPt; - } - return failedImport("SDNodeXForm " + Child.getName() + - " has no custom renderer"); - } + if (!N.isLeaf()) { + StringRef OperatorName = N.getOperator()->getName(); - // We accept 'bb' here. It's an operator because BasicBlockSDNode isn't - // inline, but in MI it's just another operand. - if (Dst.getOperator()->getName() == "bb") { - DstMIBuilder.addRenderer(Dst.getName()); - return InsertPt; + if (OperatorName == "imm") { + MIBuilder.addRenderer(NodeName); + return Error::success(); } - // Similarly, imm is an operator in TreePatternNode's view but must be - // rendered as operands. - // FIXME: The target should be able to choose sign-extended when appropriate - // (e.g. on Mips). 
- if (Dst.getOperator()->getName() == "timm") { - DstMIBuilder.addRenderer(Dst.getName()); - return InsertPt; - } - if (Dst.getOperator()->getName() == "tframeindex") { - DstMIBuilder.addRenderer(Dst.getName()); - return InsertPt; - } - if (Dst.getOperator()->getName() == "imm") { - DstMIBuilder.addRenderer(Dst.getName()); - return InsertPt; - } - if (Dst.getOperator()->getName() == "fpimm") { - DstMIBuilder.addRenderer(Dst.getName()); - return InsertPt; + if (OperatorName == "fpimm") { + MIBuilder.addRenderer(NodeName); + return Error::success(); } - if (Dst.getOperator()->isSubClassOf("Instruction")) { - auto OpTy = getInstResultType(Dst, Target); - if (!OpTy) - return OpTy.takeError(); - - unsigned TempRegID = Rule.allocateTempRegID(); - InsertPt = - Rule.insertAction(InsertPt, *OpTy, TempRegID); - DstMIBuilder.addRenderer(TempRegID); - - auto InsertPtOrError = createAndImportSubInstructionRenderer( - ++InsertPt, Rule, Dst, TempRegID); - if (auto Error = InsertPtOrError.takeError()) - return std::move(Error); - return InsertPtOrError.get(); + // TODO: 'imm' and 'fpimm' are the only nodes that need special treatment. + // Remove this check and add CopyRenderer unconditionally for other nodes. + if (OperatorName == "bb" || OperatorName == "timm" || + OperatorName == "tframeindex") { + MIBuilder.addRenderer(NodeName); + return Error::success(); } - return failedImport("Dst pattern child isn't a leaf node or an MBB" + - llvm::to_string(Dst)); - } - - // It could be a specific immediate in which case we should just check for - // that immediate. - if (const IntInit *ChildIntInit = dyn_cast(Dst.getLeafValue())) { - DstMIBuilder.addRenderer(ChildIntInit->getValue()); - return InsertPt; + return failedImport("node has unsupported operator " + to_string(N)); } - // Otherwise, we're looking for a bog-standard RegisterClass operand. 
- if (auto *ChildDefInit = dyn_cast(Dst.getLeafValue())) { - auto *ChildRec = ChildDefInit->getDef(); + if (const auto *DI = dyn_cast(N.getLeafValue())) { + const Record *R = DI->getDef(); - ArrayRef ChildTypes = Dst.getExtTypes(); - if (ChildTypes.size() != 1) - return failedImport("Dst pattern child has multiple results"); + if (N.getNumResults() != 1) + return failedImport("node does not have one result " + to_string(N)); std::optional OpTyOrNone; + ArrayRef ChildTypes = N.getExtTypes(); if (ChildTypes.front().isMachineValueType()) OpTyOrNone = MVTToLLT(ChildTypes.front().getMachineValueType().SimpleTy); + + // TODO: Remove this check. Types in the destination DAG should not matter. if (!OpTyOrNone) - return failedImport("Dst operand has an unsupported type"); + return failedImport("node has unsupported type " + to_string(N)); - if (ChildRec->isSubClassOf("Register")) { - DstMIBuilder.addRenderer(Target, ChildRec); - return InsertPt; - } + if (R->isSubClassOf("ComplexPattern")) { + auto I = ComplexPatternEquivs.find(R); + if (I == ComplexPatternEquivs.end()) + return failedImport("ComplexPattern " + R->getName() + + " does not have GISel equivalent"); - if (ChildRec->isSubClassOf("RegisterClass") || - ChildRec->isSubClassOf("RegisterOperand") || - ChildRec->isSubClassOf("ValueType")) { - if (ChildRec->isSubClassOf("RegisterOperand") && - !ChildRec->isValueUnset("GIZeroRegister")) { - DstMIBuilder.addRenderer( - Dst.getName(), ChildRec->getValueAsDef("GIZeroRegister")); - return InsertPt; - } + const OperandMatcher &OM = M.getOperandMatcher(NodeName); + MIBuilder.addRenderer( + *I->second, NodeName, OM.getAllocatedTemporariesBaseID()); + return Error::success(); + } - DstMIBuilder.addRenderer(Dst.getName()); - return InsertPt; + if (R->isSubClassOf("RegisterOperand") && + !R->isValueUnset("GIZeroRegister")) { + MIBuilder.addRenderer( + NodeName, R->getValueAsDef("GIZeroRegister")); + return Error::success(); } - if (ChildRec->isSubClassOf("SubRegIndex")) { - 
CodeGenSubRegIndex *SubIdx = CGRegs.getSubRegIdx(ChildRec); - DstMIBuilder.addRenderer(SubIdx->EnumValue); - return InsertPt; + // TODO: All special cases are handled above. Remove this check and add + // CopyRenderer unconditionally. + if (R->isSubClassOf("RegisterClass") || + R->isSubClassOf("RegisterOperand") || R->isSubClassOf("ValueType")) { + MIBuilder.addRenderer(NodeName); + return Error::success(); } + } - if (ChildRec->isSubClassOf("ComplexPattern")) { - const auto &ComplexPattern = ComplexPatternEquivs.find(ChildRec); - if (ComplexPattern == ComplexPatternEquivs.end()) - return failedImport( - "SelectionDAG ComplexPattern not mapped to GlobalISel"); + // TODO: Change this to assert and move to the beginning of the function. + if (!M.hasOperand(NodeName)) + return failedImport("could not find node $" + NodeName + + " in the source DAG"); - const OperandMatcher &OM = Rule.getOperandMatcher(Dst.getName()); - DstMIBuilder.addRenderer( - *ComplexPattern->second, Dst.getName(), - OM.getAllocatedTemporariesBaseID()); - return InsertPt; + // TODO: Remove this check and add CopyRenderer unconditionally. + // TODO: Handle nodes with multiple results (provided they can reach here). + if (isa(N.getLeafValue())) { + MIBuilder.addRenderer(NodeName); + return Error::success(); + } + + return failedImport("unsupported node " + to_string(N)); +} + +// Equivalent of MatcherGen::EmitResultLeafAsOperand. 
+Error GlobalISelEmitter::importLeafNodeRenderer( + RuleMatcher &M, BuildMIAction &MIBuilder, const TreePatternNode &N) const { + if (const auto *II = dyn_cast(N.getLeafValue())) { + MIBuilder.addRenderer(II->getValue()); + return Error::success(); + } + + if (const auto *DI = dyn_cast(N.getLeafValue())) { + const Record *R = DI->getDef(); + + if (R->isSubClassOf("Register")) { + MIBuilder.addRenderer(Target, R); + return Error::success(); } - return failedImport( - "Dst pattern child def is an unsupported tablegen class"); + if (R->isSubClassOf("SubRegIndex")) { + const CodeGenSubRegIndex *SubRegIndex = CGRegs.getSubRegIdx(R); + MIBuilder.addRenderer(SubRegIndex->EnumValue); + return Error::success(); + } + + // There are also RegisterClass / RegisterOperand operands of REG_SEQUENCE / + // COPY_TO_REGCLASS, but these instructions are currently handled elsewhere. } - // Handle the case where the MVT/register class is omitted in the dest pattern - // but MVT exists in the source pattern. - if (isa(Dst.getLeafValue()) && Rule.hasOperand(Dst.getName())) { - DstMIBuilder.addRenderer(Dst.getName()); - return InsertPt; + return failedImport("unrecognized node " + to_string(N)); +} + +// Equivalent of MatcherGen::EmitResultSDNodeXFormAsOperand. +Error GlobalISelEmitter::importXFormNodeRenderer( + RuleMatcher &M, BuildMIAction &MIBuilder, const TreePatternNode &N) const { + const Record *XFormRec = N.getOperator(); + auto I = SDNodeXFormEquivs.find(XFormRec); + if (I == SDNodeXFormEquivs.end()) + return failedImport("SDNodeXForm " + XFormRec->getName() + + " does not have GISel equivalent"); + + // TODO: Fail to import if GISDNodeXForm does not have RendererFn. + // This currently results in a fatal error in emitRenderOpcodes. + const Record *XFormEquivRec = I->second; + + // The node to apply the transformation function to. + // FIXME: The node may not have a name and may be a leaf. It should be + // rendered first, like any other nodes. 
This may or may not require + // introducing a temporary register, and we can't tell that without + // inspecting the node (possibly recursively). This is a general drawback + // of appending renderers directly to BuildMIAction. + const TreePatternNode &Node = N.getChild(0); + StringRef NodeName = Node.getName(); + + const Record *XFormOpc = CGP.getSDNodeTransform(XFormRec).first; + if (XFormOpc->getName() == "timm") { + // If this is a TargetConstant, there won't be a corresponding + // instruction to transform. Instead, this will refer directly to an + // operand in an instruction's operand list. + MIBuilder.addRenderer(*XFormEquivRec, NodeName); + } else { + MIBuilder.addRenderer(*XFormEquivRec, NodeName); } - return failedImport("Dst pattern child is an unsupported kind"); + + return Error::success(); +} + +// Equivalent of MatcherGen::EmitResultInstructionAsOperand. +Error GlobalISelEmitter::importInstructionNodeRenderer( + RuleMatcher &M, BuildMIAction &MIBuilder, const TreePatternNode &N, + action_iterator &InsertPt) const { + Expected OpTy = getInstResultType(N, Target); + if (!OpTy) + return OpTy.takeError(); + + // TODO: See the comment in importXFormNodeRenderer. We rely on the node + // requiring a temporary register, which prevents us from using this + // function on the root of the destination DAG. + unsigned TempRegID = M.allocateTempRegID(); + InsertPt = M.insertAction(InsertPt, *OpTy, TempRegID); + MIBuilder.addRenderer(TempRegID); + + auto InsertPtOrError = + createAndImportSubInstructionRenderer(++InsertPt, M, N, TempRegID); + if (!InsertPtOrError) + return InsertPtOrError.takeError(); + + InsertPt = *InsertPtOrError; + return Error::success(); +} + +// Equivalent of MatcherGen::EmitResultOperand. 
+Error GlobalISelEmitter::importNodeRenderer(RuleMatcher &M, + BuildMIAction &MIBuilder, + const TreePatternNode &N, + action_iterator &InsertPt) const { + if (N.hasName()) + return importNamedNodeRenderer(M, MIBuilder, N); + + if (N.isLeaf()) + return importLeafNodeRenderer(M, MIBuilder, N); + + if (N.getOperator()->isSubClassOf("SDNodeXForm")) + return importXFormNodeRenderer(M, MIBuilder, N); + + if (N.getOperator()->isSubClassOf("Instruction")) + return importInstructionNodeRenderer(M, MIBuilder, N, InsertPt); + + // Should not reach here. + return failedImport("unrecognized node " + llvm::to_string(N)); } /// Generates code that builds the resulting instruction(s) from the destination @@ -1597,11 +1659,9 @@ Expected GlobalISelEmitter::importExplicitUseRenderers( dyn_cast(SubRegChild.getLeafValue())) { CodeGenSubRegIndex *SubIdx = CGRegs.getSubRegIdx(SubRegInit->getDef()); - auto InsertPtOrError = - importExplicitUseRenderer(InsertPt, M, DstMIBuilder, ValChild); - if (auto Error = InsertPtOrError.takeError()) - return std::move(Error); - InsertPt = InsertPtOrError.get(); + if (Error Err = importNodeRenderer(M, DstMIBuilder, ValChild, InsertPt)) + return Err; + DstMIBuilder.addRenderer(SubIdx); } } @@ -1666,11 +1726,10 @@ Expected GlobalISelEmitter::importExplicitUseRenderers( continue; } - auto InsertPtOrError = importExplicitUseRenderer(InsertPt, M, DstMIBuilder, - Dst.getChild(Child)); - if (auto Error = InsertPtOrError.takeError()) - return std::move(Error); - InsertPt = InsertPtOrError.get(); + if (Error Err = + importNodeRenderer(M, DstMIBuilder, Dst.getChild(Child), InsertPt)) + return Err; + ++Child; } From a72bfc5a1e5381012213df36389524f74ef7c8a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20Poussineau?= Date: Thu, 26 Dec 2024 06:56:02 +0100 Subject: [PATCH 071/567] [llvm-dlltool] Handle MIPS R4000 architecture (#114621) --- llvm/lib/Object/COFFImportFile.cpp | 2 ++ llvm/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp | 6 +++++- 
llvm/test/tools/llvm-dlltool/machine-opt.def | 3 +++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Object/COFFImportFile.cpp b/llvm/lib/Object/COFFImportFile.cpp index ff3dcf9e13ffa..595533ff94725 100644 --- a/llvm/lib/Object/COFFImportFile.cpp +++ b/llvm/lib/Object/COFFImportFile.cpp @@ -133,6 +133,8 @@ static uint16_t getImgRelRelocation(MachineTypes Machine) { return IMAGE_REL_ARM64_ADDR32NB; case IMAGE_FILE_MACHINE_I386: return IMAGE_REL_I386_DIR32NB; + case IMAGE_FILE_MACHINE_R4000: + return IMAGE_REL_MIPS_REFWORDNB; } } diff --git a/llvm/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp b/llvm/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp index 58ff720516f38..1782e24287860 100644 --- a/llvm/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp +++ b/llvm/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp @@ -79,6 +79,7 @@ MachineTypes getEmulation(StringRef S) { .Case("arm", IMAGE_FILE_MACHINE_ARMNT) .Case("arm64", IMAGE_FILE_MACHINE_ARM64) .Case("arm64ec", IMAGE_FILE_MACHINE_ARM64EC) + .Case("r4000", IMAGE_FILE_MACHINE_R4000) .Default(IMAGE_FILE_MACHINE_UNKNOWN); } @@ -93,6 +94,8 @@ MachineTypes getMachine(Triple T) { case Triple::aarch64: return T.isWindowsArm64EC() ? 
COFF::IMAGE_FILE_MACHINE_ARM64EC : COFF::IMAGE_FILE_MACHINE_ARM64; + case Triple::mipsel: + return COFF::IMAGE_FILE_MACHINE_R4000; default: return COFF::IMAGE_FILE_MACHINE_UNKNOWN; } @@ -173,7 +176,8 @@ int llvm::dlltoolDriverMain(llvm::ArrayRef ArgsArr) { (!Args.hasArgNoClaim(OPT_d) && !Args.hasArgNoClaim(OPT_l))) { Table.printHelp(outs(), "llvm-dlltool [options] file...", "llvm-dlltool", false); - llvm::outs() << "\nTARGETS: i386, i386:x86-64, arm, arm64, arm64ec\n"; + llvm::outs() + << "\nTARGETS: i386, i386:x86-64, arm, arm64, arm64ec, r4000\n"; return 1; } diff --git a/llvm/test/tools/llvm-dlltool/machine-opt.def b/llvm/test/tools/llvm-dlltool/machine-opt.def index 6dce8255a43db..fcb85299d7bee 100644 --- a/llvm/test/tools/llvm-dlltool/machine-opt.def +++ b/llvm/test/tools/llvm-dlltool/machine-opt.def @@ -6,6 +6,8 @@ ; RUN: llvm-readobj %t.a | FileCheck --check-prefix=ARM %s ; RUN: llvm-dlltool -m arm64 -d %s -l %t.a ; RUN: llvm-readobj %t.a | FileCheck --check-prefix=ARM64 %s +; RUN: llvm-dlltool -m r4000 -d %s -l %t.a +; RUN: llvm-readobj %t.a | FileCheck --check-prefix=MIPS %s LIBRARY test.dll EXPORTS @@ -15,3 +17,4 @@ TestFunction ; X86_64: Format: COFF-x86-64 ; ARM: Format: COFF-ARM{{$}} ; ARM64: Format: COFF-ARM64 +; MIPS: Format: COFF-MIPS From 03093b62d4c9ff43b0f9422db1ff97e5050d7664 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20G=C3=B3rny?= Date: Thu, 26 Dec 2024 07:09:18 +0000 Subject: [PATCH 072/567] [Polly] Fix gtest logic for standalone builds (#121114) Fix the gtest logic to account for llvm_gtest being installed as part of LLVM, as of 91b3ca39667b6341a8c1983a1467fae14b58318b. --- polly/CMakeLists.txt | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/polly/CMakeLists.txt b/polly/CMakeLists.txt index b4cfc77d0f213..955c171b3967f 100644 --- a/polly/CMakeLists.txt +++ b/polly/CMakeLists.txt @@ -29,11 +29,7 @@ if(POLLY_STANDALONE_BUILD) # Enable unit tests if available. 
set(POLLY_GTEST_AVAIL 0) - set(UNITTEST_DIR ${LLVM_THIRD_PARTY_DIR}/unittest) - if(EXISTS ${UNITTEST_DIR}/googletest/include/gtest/gtest.h) - if (NOT TARGET gtest) - add_subdirectory(${UNITTEST_DIR} third-party/unittest) - endif() + if(TARGET llvm_gtest) set(POLLY_GTEST_AVAIL 1) endif() From abd91023447d146f36357326fc97c458b49e40af Mon Sep 17 00:00:00 2001 From: Shourya Goel Date: Thu, 26 Dec 2024 12:50:07 +0530 Subject: [PATCH 073/567] [libc][complex] add cfloat16 and cfloat128 compiler flags (#121140) Proper fix for the temporary fix in #114696 --- .../cmake/modules/CheckCompilerFeatures.cmake | 6 +++++ .../compiler_features/check_cfloat128.cpp | 5 +++++ .../compiler_features/check_cfloat16.cpp | 5 +++++ libc/config/linux/aarch64/entrypoints.txt | 21 ++++++++++++------ libc/config/linux/riscv/entrypoints.txt | 7 ++++-- libc/config/linux/x86_64/entrypoints.txt | 22 +++++++++++++------ libc/src/complex/cimagf128.h | 9 ++------ libc/src/complex/cimagf16.h | 9 ++------ libc/src/complex/conjf128.h | 7 +----- libc/src/complex/conjf16.h | 7 +----- libc/src/complex/cprojf128.h | 7 +----- libc/src/complex/cprojf16.h | 7 +----- libc/src/complex/crealf128.h | 9 ++------ libc/src/complex/crealf16.h | 9 ++------ libc/src/complex/generic/cimagf128.cpp | 4 ---- libc/src/complex/generic/cimagf16.cpp | 4 ---- libc/src/complex/generic/conjf128.cpp | 4 ---- libc/src/complex/generic/conjf16.cpp | 4 ---- libc/src/complex/generic/cprojf128.cpp | 4 ---- libc/src/complex/generic/cprojf16.cpp | 4 ---- libc/src/complex/generic/crealf128.cpp | 4 ---- libc/src/complex/generic/crealf16.cpp | 4 ---- libc/test/src/complex/cimagf128_test.cpp | 4 ---- libc/test/src/complex/cimagf16_test.cpp | 4 ---- libc/test/src/complex/conjf128_test.cpp | 4 ---- libc/test/src/complex/conjf16_test.cpp | 4 ---- libc/test/src/complex/cprojf128_test.cpp | 4 ---- libc/test/src/complex/cprojf16_test.cpp | 4 ---- libc/test/src/complex/crealf128_test.cpp | 4 ---- libc/test/src/complex/crealf16_test.cpp | 4 ---- 30 
files changed, 62 insertions(+), 132 deletions(-) create mode 100644 libc/cmake/modules/compiler_features/check_cfloat128.cpp create mode 100644 libc/cmake/modules/compiler_features/check_cfloat16.cpp diff --git a/libc/cmake/modules/CheckCompilerFeatures.cmake b/libc/cmake/modules/CheckCompilerFeatures.cmake index 862c7ecbd7fdf..a5ea66a5935b7 100644 --- a/libc/cmake/modules/CheckCompilerFeatures.cmake +++ b/libc/cmake/modules/CheckCompilerFeatures.cmake @@ -13,6 +13,8 @@ set( "float16_conversion" "float128" "fixed_point" + "cfloat16" + "cfloat128" ) # Making sure ALL_COMPILER_FEATURES is sorted. @@ -110,6 +112,10 @@ foreach(feature IN LISTS ALL_COMPILER_FEATURES) set(LIBC_TYPES_HAS_FLOAT128 TRUE) elseif(${feature} STREQUAL "fixed_point") set(LIBC_COMPILER_HAS_FIXED_POINT TRUE) + elseif(${feature} STREQUAL "cfloat16") + set(LIBC_TYPES_HAS_CFLOAT16 TRUE) + elseif(${feature} STREQUAL "cfloat128") + set(LIBC_TYPES_HAS_CFLOAT128 TRUE) elseif(${feature} STREQUAL "builtin_ceil_floor_rint_trunc") set(LIBC_COMPILER_HAS_BUILTIN_CEIL_FLOOR_RINT_TRUNC TRUE) elseif(${feature} STREQUAL "builtin_fmax_fmin") diff --git a/libc/cmake/modules/compiler_features/check_cfloat128.cpp b/libc/cmake/modules/compiler_features/check_cfloat128.cpp new file mode 100644 index 0000000000000..a798ccb989689 --- /dev/null +++ b/libc/cmake/modules/compiler_features/check_cfloat128.cpp @@ -0,0 +1,5 @@ +#include "src/__support/macros/properties/complex_types.h" + +#ifndef LIBC_TYPES_HAS_CFLOAT128 +#error unsupported +#endif diff --git a/libc/cmake/modules/compiler_features/check_cfloat16.cpp b/libc/cmake/modules/compiler_features/check_cfloat16.cpp new file mode 100644 index 0000000000000..31416ff7c6aea --- /dev/null +++ b/libc/cmake/modules/compiler_features/check_cfloat16.cpp @@ -0,0 +1,5 @@ +#include "src/__support/macros/properties/complex_types.h" + +#ifndef LIBC_TYPES_HAS_CFLOAT16 +#error unsupported +#endif diff --git a/libc/config/linux/aarch64/entrypoints.txt 
b/libc/config/linux/aarch64/entrypoints.txt index b949e4b4f67ba..b096b95b9472e 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -619,14 +619,17 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.ufromfpxl ) -if(LIBC_TYPES_HAS_FLOAT16) +if(LIBC_TYPES_HAS_CFLOAT16) list(APPEND TARGET_LIBM_ENTRYPOINTS # complex.h C23 _Complex _Float16 entrypoints - # libc.src.complex.crealf16 - # libc.src.complex.cimagf16 - # libc.src.complex.conjf16 - # libc.src.complex.cprojf16 - + libc.src.complex.crealf16 + libc.src.complex.cimagf16 + libc.src.complex.conjf16 + libc.src.complex.cprojf16 + ) +endif() + +if(LIBC_TYPES_HAS_FLOAT16) # math.h C23 _Float16 entrypoints libc.src.math.canonicalizef16 libc.src.math.ceilf16 @@ -726,14 +729,18 @@ if(LIBC_TYPES_HAS_FLOAT16) # endif() endif() -if(LIBC_TYPES_HAS_FLOAT128) +if(LIBC_TYPES_HAS_CFLOAT128) list(APPEND TARGET_LIBM_ENTRYPOINTS # complex.h C23 _Complex _Float128 entrypoints libc.src.complex.crealf128 libc.src.complex.cimagf128 libc.src.complex.conjf128 libc.src.complex.cprojf128 + ) +endif() +if(LIBC_TYPES_HAS_FLOAT128) + list(APPEND TARGET_LIBM_ENTRYPOINTS # math.h C23 _Float128 entrypoints libc.src.math.canonicalizef128 libc.src.math.ceilf128 diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index 19980f79e7be8..643e20ddb34eb 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -620,14 +620,17 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.ufromfpxl ) -if(LIBC_TYPES_HAS_FLOAT128) +if(LIBC_TYPES_HAS_CFLOAT128) list(APPEND TARGET_LIBM_ENTRYPOINTS # complex.h C23 _Complex _Float128 entrypoints libc.src.complex.crealf128 libc.src.complex.cimagf128 libc.src.complex.conjf128 libc.src.complex.cprojf128 - + ) +endif() + +if(LIBC_TYPES_HAS_FLOAT128) # math.h C23 _Float128 entrypoints libc.src.math.canonicalizef128 libc.src.math.ceilf128 diff --git a/libc/config/linux/x86_64/entrypoints.txt 
b/libc/config/linux/x86_64/entrypoints.txt index 08d8559d8c81a..7e549607716c0 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -624,14 +624,18 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.ufromfpxl ) -if(LIBC_TYPES_HAS_FLOAT16) +if(LIBC_TYPES_HAS_CFLOAT16) list(APPEND TARGET_LIBM_ENTRYPOINTS # complex.h C23 _Complex _Float16 entrypoints libc.src.complex.crealf16 libc.src.complex.cimagf16 libc.src.complex.conjf16 libc.src.complex.cprojf16 + ) +endif() +if(LIBC_TYPES_HAS_FLOAT16) + list(APPEND TARGET_LIBM_ENTRYPOINTS # math.h C23 _Float16 entrypoints libc.src.math.canonicalizef16 libc.src.math.ceilf16 @@ -736,14 +740,18 @@ if(LIBC_TYPES_HAS_FLOAT16) endif() endif() -if(LIBC_TYPES_HAS_FLOAT128) +if(LIBC_TYPES_HAS_CFLOAT128) list(APPEND TARGET_LIBM_ENTRYPOINTS # complex.h C23 _Complex _Float128 entrypoints - # libc.src.complex.crealf128 - # libc.src.complex.cimagf128 - # libc.src.complex.conjf128 - # libc.src.complex.cprojf128 - + libc.src.complex.crealf128 + libc.src.complex.cimagf128 + libc.src.complex.conjf128 + libc.src.complex.cprojf128 + ) +endif() + +if(LIBC_TYPES_HAS_FLOAT128) + list(APPEND TARGET_LIBM_ENTRYPOINTS # math.h C23 _Float128 entrypoints libc.src.math.canonicalizef128 libc.src.math.ceilf128 diff --git a/libc/src/complex/cimagf128.h b/libc/src/complex/cimagf128.h index ab8f9ac7da58c..aaf52cfc54eff 100644 --- a/libc/src/complex/cimagf128.h +++ b/libc/src/complex/cimagf128.h @@ -6,15 +6,12 @@ // //===----------------------------------------------------------------------===// -#include "src/__support/macros/properties/complex_types.h" -#include "src/__support/macros/properties/types.h" - -#if defined(LIBC_TYPES_HAS_CFLOAT128) - #ifndef LLVM_LIBC_SRC_COMPLEX_CIMAGF128_H #define LLVM_LIBC_SRC_COMPLEX_CIMAGF128_H #include "src/__support/macros/config.h" +#include "src/__support/macros/properties/complex_types.h" +#include "src/__support/macros/properties/types.h" namespace LIBC_NAMESPACE_DECL { @@ -23,5 
+20,3 @@ float128 cimagf128(cfloat128 x); } // namespace LIBC_NAMESPACE_DECL #endif // LLVM_LIBC_SRC_COMPLEX_CIMAGF128_H - -#endif // LIBC_TYPES_HAS_CFLOAT128 diff --git a/libc/src/complex/cimagf16.h b/libc/src/complex/cimagf16.h index 5c5de2eb1bcf2..81ed4d2ce567e 100644 --- a/libc/src/complex/cimagf16.h +++ b/libc/src/complex/cimagf16.h @@ -6,15 +6,12 @@ // //===----------------------------------------------------------------------===// -#include "src/__support/macros/properties/complex_types.h" -#include "src/__support/macros/properties/types.h" - -#if defined(LIBC_TYPES_HAS_CFLOAT16) - #ifndef LLVM_LIBC_SRC_COMPLEX_CIMAGF16_H #define LLVM_LIBC_SRC_COMPLEX_CIMAGF16_H #include "src/__support/macros/config.h" +#include "src/__support/macros/properties/complex_types.h" +#include "src/__support/macros/properties/types.h" namespace LIBC_NAMESPACE_DECL { @@ -23,5 +20,3 @@ float16 cimagf16(cfloat16 x); } // namespace LIBC_NAMESPACE_DECL #endif // LLVM_LIBC_SRC_COMPLEX_CIMAGF16_H - -#endif // LIBC_TYPES_HAS_CFLOAT16 diff --git a/libc/src/complex/conjf128.h b/libc/src/complex/conjf128.h index c1ae0b03d067a..cae01d3f00694 100644 --- a/libc/src/complex/conjf128.h +++ b/libc/src/complex/conjf128.h @@ -6,14 +6,11 @@ // //===----------------------------------------------------------------------===// -#include "src/__support/macros/properties/complex_types.h" - -#if defined(LIBC_TYPES_HAS_CFLOAT128) - #ifndef LLVM_LIBC_SRC_COMPLEX_CONJF128_H #define LLVM_LIBC_SRC_COMPLEX_CONJF128_H #include "src/__support/macros/config.h" +#include "src/__support/macros/properties/complex_types.h" namespace LIBC_NAMESPACE_DECL { @@ -22,5 +19,3 @@ cfloat128 conjf128(cfloat128 x); } // namespace LIBC_NAMESPACE_DECL #endif // LLVM_LIBC_SRC_COMPLEX_CONJF128_H - -#endif // LIBC_TYPES_HAS_CFLOAT128 diff --git a/libc/src/complex/conjf16.h b/libc/src/complex/conjf16.h index 685ac8ac5c858..dde1221473e40 100644 --- a/libc/src/complex/conjf16.h +++ b/libc/src/complex/conjf16.h @@ -6,14 +6,11 @@ // 
//===----------------------------------------------------------------------===// -#include "src/__support/macros/properties/complex_types.h" - -#if defined(LIBC_TYPES_HAS_CFLOAT16) - #ifndef LLVM_LIBC_SRC_COMPLEX_CONJF16_H #define LLVM_LIBC_SRC_COMPLEX_CONJF16_H #include "src/__support/macros/config.h" +#include "src/__support/macros/properties/complex_types.h" namespace LIBC_NAMESPACE_DECL { @@ -22,5 +19,3 @@ cfloat16 conjf16(cfloat16 x); } // namespace LIBC_NAMESPACE_DECL #endif // LLVM_LIBC_SRC_COMPLEX_CONJF16_H - -#endif // LIBC_TYPES_HAS_CFLOAT16 diff --git a/libc/src/complex/cprojf128.h b/libc/src/complex/cprojf128.h index 5f7fe992ef30b..71c1bbec2218a 100644 --- a/libc/src/complex/cprojf128.h +++ b/libc/src/complex/cprojf128.h @@ -6,14 +6,11 @@ // //===----------------------------------------------------------------------===// -#include "src/__support/macros/properties/complex_types.h" - -#if defined(LIBC_TYPES_HAS_CFLOAT128) - #ifndef LLVM_LIBC_SRC_COMPLEX_CPROJF128_H #define LLVM_LIBC_SRC_COMPLEX_CPROJF128_H #include "src/__support/macros/config.h" +#include "src/__support/macros/properties/complex_types.h" namespace LIBC_NAMESPACE_DECL { @@ -22,5 +19,3 @@ cfloat128 cprojf128(cfloat128 x); } // namespace LIBC_NAMESPACE_DECL #endif // LLVM_LIBC_SRC_COMPLEX_CPROJF128_H - -#endif // LIBC_TYPES_HAS_CFLOAT128 diff --git a/libc/src/complex/cprojf16.h b/libc/src/complex/cprojf16.h index 8cce5f0bcef2b..f12a46df9e175 100644 --- a/libc/src/complex/cprojf16.h +++ b/libc/src/complex/cprojf16.h @@ -6,14 +6,11 @@ // //===----------------------------------------------------------------------===// -#include "src/__support/macros/properties/complex_types.h" - -#if defined(LIBC_TYPES_HAS_CFLOAT16) - #ifndef LLVM_LIBC_SRC_COMPLEX_CPROJF16_H #define LLVM_LIBC_SRC_COMPLEX_CPROJF16_H #include "src/__support/macros/config.h" +#include "src/__support/macros/properties/complex_types.h" namespace LIBC_NAMESPACE_DECL { @@ -22,5 +19,3 @@ cfloat16 cprojf16(cfloat16 x); } // namespace 
LIBC_NAMESPACE_DECL #endif // LLVM_LIBC_SRC_COMPLEX_CPROJF16_H - -#endif // LIBC_TYPES_HAS_CFLOAT16 diff --git a/libc/src/complex/crealf128.h b/libc/src/complex/crealf128.h index 4922ae78cb238..b90c3e7c8548e 100644 --- a/libc/src/complex/crealf128.h +++ b/libc/src/complex/crealf128.h @@ -6,15 +6,12 @@ // //===----------------------------------------------------------------------===// -#include "src/__support/macros/properties/complex_types.h" -#include "src/__support/macros/properties/types.h" - -#if defined(LIBC_TYPES_HAS_CFLOAT128) - #ifndef LLVM_LIBC_SRC_COMPLEX_CREALF128_H #define LLVM_LIBC_SRC_COMPLEX_CREALF128_H #include "src/__support/macros/config.h" +#include "src/__support/macros/properties/complex_types.h" +#include "src/__support/macros/properties/types.h" namespace LIBC_NAMESPACE_DECL { @@ -23,5 +20,3 @@ float128 crealf128(cfloat128 x); } // namespace LIBC_NAMESPACE_DECL #endif // LLVM_LIBC_SRC_COMPLEX_CREALF128_H - -#endif // LIBC_TYPES_HAS_CFLOAT128 diff --git a/libc/src/complex/crealf16.h b/libc/src/complex/crealf16.h index e6098a218d092..09d66649fa272 100644 --- a/libc/src/complex/crealf16.h +++ b/libc/src/complex/crealf16.h @@ -6,15 +6,12 @@ // //===----------------------------------------------------------------------===// -#include "src/__support/macros/properties/complex_types.h" -#include "src/__support/macros/properties/types.h" - -#if defined(LIBC_TYPES_HAS_CFLOAT16) - #ifndef LLVM_LIBC_SRC_COMPLEX_CREALF16_H #define LLVM_LIBC_SRC_COMPLEX_CREALF16_H #include "src/__support/macros/config.h" +#include "src/__support/macros/properties/complex_types.h" +#include "src/__support/macros/properties/types.h" namespace LIBC_NAMESPACE_DECL { @@ -23,5 +20,3 @@ float16 crealf16(cfloat16 x); } // namespace LIBC_NAMESPACE_DECL #endif // LLVM_LIBC_SRC_COMPLEX_CREALF16_H - -#endif // LIBC_TYPES_HAS_CFLOAT16 diff --git a/libc/src/complex/generic/cimagf128.cpp b/libc/src/complex/generic/cimagf128.cpp index c21bd7f4602cc..78dbb8eddd3eb 100644 --- 
a/libc/src/complex/generic/cimagf128.cpp +++ b/libc/src/complex/generic/cimagf128.cpp @@ -7,8 +7,6 @@ //===----------------------------------------------------------------------===// #include "src/complex/cimagf128.h" -#if defined(LIBC_TYPES_HAS_CFLOAT128) - #include "src/__support/CPP/bit.h" #include "src/__support/common.h" #include "src/__support/complex_type.h" @@ -21,5 +19,3 @@ LLVM_LIBC_FUNCTION(float128, cimagf128, (cfloat128 x)) { } } // namespace LIBC_NAMESPACE_DECL - -#endif // LIBC_TYPES_HAS_CFLOAT128 diff --git a/libc/src/complex/generic/cimagf16.cpp b/libc/src/complex/generic/cimagf16.cpp index 361687984067b..25d9b3ddf3b6b 100644 --- a/libc/src/complex/generic/cimagf16.cpp +++ b/libc/src/complex/generic/cimagf16.cpp @@ -7,8 +7,6 @@ //===----------------------------------------------------------------------===// #include "src/complex/cimagf16.h" -#if defined(LIBC_TYPES_HAS_CFLOAT16) - #include "src/__support/CPP/bit.h" #include "src/__support/common.h" #include "src/__support/complex_type.h" @@ -21,5 +19,3 @@ LLVM_LIBC_FUNCTION(float16, cimagf16, (cfloat16 x)) { } } // namespace LIBC_NAMESPACE_DECL - -#endif // LIBC_TYPES_HAS_CFLOAT16 diff --git a/libc/src/complex/generic/conjf128.cpp b/libc/src/complex/generic/conjf128.cpp index c65b54849f52e..a63809a66e25a 100644 --- a/libc/src/complex/generic/conjf128.cpp +++ b/libc/src/complex/generic/conjf128.cpp @@ -7,8 +7,6 @@ //===----------------------------------------------------------------------===// #include "src/complex/conjf128.h" -#if defined(LIBC_TYPES_HAS_CFLOAT128) - #include "src/__support/common.h" #include "src/__support/complex_type.h" @@ -19,5 +17,3 @@ LLVM_LIBC_FUNCTION(cfloat128, conjf128, (cfloat128 x)) { } } // namespace LIBC_NAMESPACE_DECL - -#endif // LIBC_TYPES_HAS_CFLOAT128 diff --git a/libc/src/complex/generic/conjf16.cpp b/libc/src/complex/generic/conjf16.cpp index dac11e27b30a2..cd1ab67ed1cd9 100644 --- a/libc/src/complex/generic/conjf16.cpp +++ b/libc/src/complex/generic/conjf16.cpp 
@@ -7,8 +7,6 @@ //===----------------------------------------------------------------------===// #include "src/complex/conjf16.h" -#if defined(LIBC_TYPES_HAS_CFLOAT16) - #include "src/__support/common.h" #include "src/__support/complex_type.h" @@ -19,5 +17,3 @@ LLVM_LIBC_FUNCTION(cfloat16, conjf16, (cfloat16 x)) { } } // namespace LIBC_NAMESPACE_DECL - -#endif // LIBC_TYPES_HAS_CFLOAT16 diff --git a/libc/src/complex/generic/cprojf128.cpp b/libc/src/complex/generic/cprojf128.cpp index 97134b5523a56..eb2cd08dfc117 100644 --- a/libc/src/complex/generic/cprojf128.cpp +++ b/libc/src/complex/generic/cprojf128.cpp @@ -7,8 +7,6 @@ //===----------------------------------------------------------------------===// #include "src/complex/cprojf128.h" -#if defined(LIBC_TYPES_HAS_CFLOAT128) - #include "src/__support/common.h" #include "src/__support/complex_type.h" @@ -19,5 +17,3 @@ LLVM_LIBC_FUNCTION(cfloat128, cprojf128, (cfloat128 x)) { } } // namespace LIBC_NAMESPACE_DECL - -#endif // LIBC_TYPES_HAS_CFLOAT128 diff --git a/libc/src/complex/generic/cprojf16.cpp b/libc/src/complex/generic/cprojf16.cpp index bd0425ffb5fe5..8d2d64a439e02 100644 --- a/libc/src/complex/generic/cprojf16.cpp +++ b/libc/src/complex/generic/cprojf16.cpp @@ -7,8 +7,6 @@ //===----------------------------------------------------------------------===// #include "src/complex/cprojf16.h" -#if defined(LIBC_TYPES_HAS_CFLOAT16) - #include "src/__support/common.h" #include "src/__support/complex_type.h" @@ -19,5 +17,3 @@ LLVM_LIBC_FUNCTION(cfloat16, cprojf16, (cfloat16 x)) { } } // namespace LIBC_NAMESPACE_DECL - -#endif // LIBC_TYPES_HAS_CFLOAT16 diff --git a/libc/src/complex/generic/crealf128.cpp b/libc/src/complex/generic/crealf128.cpp index e72a778216010..e7554989e14aa 100644 --- a/libc/src/complex/generic/crealf128.cpp +++ b/libc/src/complex/generic/crealf128.cpp @@ -7,8 +7,6 @@ //===----------------------------------------------------------------------===// #include "src/complex/crealf128.h" -#if 
defined(LIBC_TYPES_HAS_CFLOAT128) - #include "src/__support/CPP/bit.h" #include "src/__support/common.h" #include "src/__support/complex_type.h" @@ -21,5 +19,3 @@ LLVM_LIBC_FUNCTION(float128, crealf128, (cfloat128 x)) { } } // namespace LIBC_NAMESPACE_DECL - -#endif // LIBC_TYPES_HAS_CFLOAT128 diff --git a/libc/src/complex/generic/crealf16.cpp b/libc/src/complex/generic/crealf16.cpp index 35142071f0536..c9e8626bfda9d 100644 --- a/libc/src/complex/generic/crealf16.cpp +++ b/libc/src/complex/generic/crealf16.cpp @@ -7,8 +7,6 @@ //===----------------------------------------------------------------------===// #include "src/complex/crealf16.h" -#if defined(LIBC_TYPES_HAS_CFLOAT16) - #include "src/__support/CPP/bit.h" #include "src/__support/common.h" #include "src/__support/complex_type.h" @@ -21,5 +19,3 @@ LLVM_LIBC_FUNCTION(float16, crealf16, (cfloat16 x)) { } } // namespace LIBC_NAMESPACE_DECL - -#endif // LIBC_TYPES_HAS_CFLOAT16 diff --git a/libc/test/src/complex/cimagf128_test.cpp b/libc/test/src/complex/cimagf128_test.cpp index 50ddc0ab06166..70ad0de3d38fb 100644 --- a/libc/test/src/complex/cimagf128_test.cpp +++ b/libc/test/src/complex/cimagf128_test.cpp @@ -10,8 +10,4 @@ #include "src/complex/cimagf128.h" -#if defined(LIBC_TYPES_HAS_CFLOAT128) - LIST_CIMAG_TESTS(cfloat128, float128, LIBC_NAMESPACE::cimagf128) - -#endif // LIBC_TYPES_HAS_CFLOAT128 diff --git a/libc/test/src/complex/cimagf16_test.cpp b/libc/test/src/complex/cimagf16_test.cpp index 65a69787ddbd6..3842381351abe 100644 --- a/libc/test/src/complex/cimagf16_test.cpp +++ b/libc/test/src/complex/cimagf16_test.cpp @@ -10,8 +10,4 @@ #include "src/complex/cimagf16.h" -#if defined(LIBC_TYPES_HAS_CFLOAT16) - LIST_CIMAG_TESTS(cfloat16, float16, LIBC_NAMESPACE::cimagf16) - -#endif // LIBC_TYPES_HAS_CFLOAT16 diff --git a/libc/test/src/complex/conjf128_test.cpp b/libc/test/src/complex/conjf128_test.cpp index a1feb9ff31fdc..4c2a72c6d39d6 100644 --- a/libc/test/src/complex/conjf128_test.cpp +++ 
b/libc/test/src/complex/conjf128_test.cpp @@ -10,8 +10,4 @@ #include "src/complex/conjf128.h" -#if defined(LIBC_TYPES_HAS_CFLOAT128) - LIST_CONJ_TESTS(cfloat128, float128, LIBC_NAMESPACE::conjf128) - -#endif // LIBC_TYPES_HAS_CFLOAT128 diff --git a/libc/test/src/complex/conjf16_test.cpp b/libc/test/src/complex/conjf16_test.cpp index 0de9f448e8681..374f9ec3e6243 100644 --- a/libc/test/src/complex/conjf16_test.cpp +++ b/libc/test/src/complex/conjf16_test.cpp @@ -10,8 +10,4 @@ #include "src/complex/conjf16.h" -#if defined(LIBC_TYPES_HAS_CFLOAT16) - LIST_CONJ_TESTS(cfloat16, float16, LIBC_NAMESPACE::conjf16) - -#endif // LIBC_TYPES_HAS_CFLOAT16 diff --git a/libc/test/src/complex/cprojf128_test.cpp b/libc/test/src/complex/cprojf128_test.cpp index 75708122260d6..7b41eb5cf5f93 100644 --- a/libc/test/src/complex/cprojf128_test.cpp +++ b/libc/test/src/complex/cprojf128_test.cpp @@ -10,8 +10,4 @@ #include "src/complex/cprojf128.h" -#if defined(LIBC_TYPES_HAS_CFLOAT128) - LIST_CPROJ_TESTS(cfloat128, float128, LIBC_NAMESPACE::cprojf128) - -#endif // LIBC_TYPES_HAS_CFLOAT128 diff --git a/libc/test/src/complex/cprojf16_test.cpp b/libc/test/src/complex/cprojf16_test.cpp index 628cec0dc5d96..db9b7b9316bca 100644 --- a/libc/test/src/complex/cprojf16_test.cpp +++ b/libc/test/src/complex/cprojf16_test.cpp @@ -10,8 +10,4 @@ #include "src/complex/cprojf16.h" -#if defined(LIBC_TYPES_HAS_CFLOAT16) - LIST_CPROJ_TESTS(cfloat16, float16, LIBC_NAMESPACE::cprojf16) - -#endif // LIBC_TYPES_HAS_CFLOAT16 diff --git a/libc/test/src/complex/crealf128_test.cpp b/libc/test/src/complex/crealf128_test.cpp index 7626eeebca278..0d1c26df77371 100644 --- a/libc/test/src/complex/crealf128_test.cpp +++ b/libc/test/src/complex/crealf128_test.cpp @@ -10,8 +10,4 @@ #include "src/complex/crealf128.h" -#if defined(LIBC_TYPES_HAS_CFLOAT128) - LIST_CREAL_TESTS(cfloat128, float128, LIBC_NAMESPACE::crealf128) - -#endif // LIBC_TYPES_HAS_CFLOAT128 diff --git a/libc/test/src/complex/crealf16_test.cpp 
b/libc/test/src/complex/crealf16_test.cpp index 97346aad615f7..b8560d74d35b5 100644 --- a/libc/test/src/complex/crealf16_test.cpp +++ b/libc/test/src/complex/crealf16_test.cpp @@ -10,8 +10,4 @@ #include "src/complex/crealf16.h" -#if defined(LIBC_TYPES_HAS_CFLOAT16) - LIST_CREAL_TESTS(cfloat16, float16, LIBC_NAMESPACE::crealf16) - -#endif // LIBC_TYPES_HAS_CFLOAT16 From 9f75b6664f1eaec1517f6cb620b34100b7b54857 Mon Sep 17 00:00:00 2001 From: Shourya Goel Date: Thu, 26 Dec 2024 12:57:48 +0530 Subject: [PATCH 074/567] [libc][complex] fix buildbot errors (#121141) Fix buildbot errors due to #121140 --- libc/config/linux/aarch64/entrypoints.txt | 3 ++- libc/config/linux/riscv/entrypoints.txt | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index b096b95b9472e..00f0c6a8bfb8e 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -629,7 +629,8 @@ if(LIBC_TYPES_HAS_CFLOAT16) ) endif() -if(LIBC_TYPES_HAS_FLOAT16) +if(LIBC_TYPES_HAS_FLOAT16) + list(APPEND TARGET_LIBM_ENTRYPOINTS # math.h C23 _Float16 entrypoints libc.src.math.canonicalizef16 libc.src.math.ceilf16 diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index 643e20ddb34eb..49a8d61b93802 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -631,6 +631,7 @@ if(LIBC_TYPES_HAS_CFLOAT128) endif() if(LIBC_TYPES_HAS_FLOAT128) + list(APPEND TARGET_LIBM_ENTRYPOINTS # math.h C23 _Float128 entrypoints libc.src.math.canonicalizef128 libc.src.math.ceilf128 From cbe583b0bd8d46b4e5edda463e19e6a24c0817bc Mon Sep 17 00:00:00 2001 From: Thirumalai Shaktivel <74826228+Thirumalai-Shaktivel@users.noreply.github.com> Date: Thu, 26 Dec 2024 15:02:09 +0530 Subject: [PATCH 075/567] [Flang] Add translation support for MutexInOutSet and InOutSet (#120715) Implementatoin details: Both Mutexinoutset 
and Inoutset is recognized as flag=0x4 and 0x8 respectively, the flags is set to `kmp_depend_info` and passed as argument to `__kmpc_omp_task_with_deps` runtime call --- flang/lib/Lower/OpenMP/ClauseProcessor.cpp | 7 +++- .../OpenMP/Todo/depend-clause-depobj.f90 | 2 +- .../OpenMP/Todo/depend-clause-inoutset.f90 | 11 ----- .../Todo/depend-clause-mutexinoutset.f90 | 11 ----- flang/test/Lower/OpenMP/task.f90 | 12 ++++++ .../mlir/Dialect/OpenMP/OpenMPEnums.td | 17 ++++---- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 6 +++ mlir/test/Target/LLVMIR/openmp-llvm.mlir | 40 +++++++++++++++++++ 8 files changed, 73 insertions(+), 33 deletions(-) delete mode 100644 flang/test/Lower/OpenMP/Todo/depend-clause-inoutset.f90 delete mode 100644 flang/test/Lower/OpenMP/Todo/depend-clause-mutexinoutset.f90 diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp index 3c9831120351e..c4ab5e0033d04 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ -153,10 +153,13 @@ genDependKindAttr(lower::AbstractConverter &converter, pbKind = mlir::omp::ClauseTaskDepend::taskdependinout; break; case omp::clause::DependenceType::Mutexinoutset: + pbKind = mlir::omp::ClauseTaskDepend::taskdependmutexinoutset; + break; case omp::clause::DependenceType::Inoutset: + pbKind = mlir::omp::ClauseTaskDepend::taskdependinoutset; + break; case omp::clause::DependenceType::Depobj: - TODO(currentLocation, - "INOUTSET, MUTEXINOUTSET and DEPOBJ dependence-types"); + TODO(currentLocation, "DEPOBJ dependence-type"); break; case omp::clause::DependenceType::Sink: case omp::clause::DependenceType::Source: diff --git a/flang/test/Lower/OpenMP/Todo/depend-clause-depobj.f90 b/flang/test/Lower/OpenMP/Todo/depend-clause-depobj.f90 index 3bc730f849192..4e98d77d0bb3e 100644 --- a/flang/test/Lower/OpenMP/Todo/depend-clause-depobj.f90 +++ b/flang/test/Lower/OpenMP/Todo/depend-clause-depobj.f90 @@ -1,7 +1,7 @@ !RUN: %not_todo_cmd bbc 
-emit-hlfir -fopenmp -fopenmp-version=52 -o - %s 2>&1 | FileCheck %s !RUN: %not_todo_cmd %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=52 -o - %s 2>&1 | FileCheck %s -!CHECK: not yet implemented: INOUTSET, MUTEXINOUTSET and DEPOBJ dependence-types +!CHECK: not yet implemented: DEPOBJ dependence-type subroutine f00(x) integer :: x diff --git a/flang/test/Lower/OpenMP/Todo/depend-clause-inoutset.f90 b/flang/test/Lower/OpenMP/Todo/depend-clause-inoutset.f90 deleted file mode 100644 index 160893fccdc5f..0000000000000 --- a/flang/test/Lower/OpenMP/Todo/depend-clause-inoutset.f90 +++ /dev/null @@ -1,11 +0,0 @@ -!RUN: %not_todo_cmd bbc -emit-hlfir -fopenmp -fopenmp-version=52 -o - %s 2>&1 | FileCheck %s -!RUN: %not_todo_cmd %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=52 -o - %s 2>&1 | FileCheck %s - -!CHECK: not yet implemented: INOUTSET, MUTEXINOUTSET and DEPOBJ dependence-types -subroutine f00(x) - integer :: x - !$omp task depend(inoutset: x) - x = x + 1 - !$omp end task -end - diff --git a/flang/test/Lower/OpenMP/Todo/depend-clause-mutexinoutset.f90 b/flang/test/Lower/OpenMP/Todo/depend-clause-mutexinoutset.f90 deleted file mode 100644 index 17cc3894c548f..0000000000000 --- a/flang/test/Lower/OpenMP/Todo/depend-clause-mutexinoutset.f90 +++ /dev/null @@ -1,11 +0,0 @@ -!RUN: %not_todo_cmd bbc -emit-hlfir -fopenmp -fopenmp-version=52 -o - %s 2>&1 | FileCheck %s -!RUN: %not_todo_cmd %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=52 -o - %s 2>&1 | FileCheck %s - -!CHECK: not yet implemented: INOUTSET, MUTEXINOUTSET and DEPOBJ dependence-types -subroutine f00(x) - integer :: x - !$omp task depend(mutexinoutset: x) - x = x + 1 - !$omp end task -end - diff --git a/flang/test/Lower/OpenMP/task.f90 b/flang/test/Lower/OpenMP/task.f90 index 6e525a044b011..f5591bd9d8609 100644 --- a/flang/test/Lower/OpenMP/task.f90 +++ b/flang/test/Lower/OpenMP/task.f90 @@ -144,6 +144,18 @@ subroutine task_depend_multi_task() x = x + 12 !CHECK: omp.terminator !$omp end task + !CHECK: 
omp.task depend(taskdependmutexinoutset -> %{{.+}} : !fir.ref) + !$omp task depend(mutexinoutset : x) + !CHECK: arith.subi + x = x - 12 + !CHECK: omp.terminator + !$omp end task + !CHECK: omp.task depend(taskdependinoutset -> %{{.+}} : !fir.ref) + !$omp task depend(inoutset : x) + !CHECK: arith.subi + x = x - 12 + !CHECK: omp.terminator + !$omp end task end subroutine task_depend_multi_task !=============================================================================== diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td index b1a9e3330522b..2091c0c76dff7 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td @@ -108,14 +108,15 @@ def ClauseRequiresAttr : OpenMP_EnumAttr; def ClauseTaskDependIn : I32EnumAttrCase<"taskdependin", 0>; def ClauseTaskDependOut : I32EnumAttrCase<"taskdependout", 1>; def ClauseTaskDependInOut : I32EnumAttrCase<"taskdependinout", 2>; - -def ClauseTaskDepend : OpenMP_I32EnumAttr< - "ClauseTaskDepend", - "depend clause in a target or task construct", [ - ClauseTaskDependIn, - ClauseTaskDependOut, - ClauseTaskDependInOut - ]>; +def ClauseTaskDependMutexInOutSet + : I32EnumAttrCase<"taskdependmutexinoutset", 3>; +def ClauseTaskDependInOutSet : I32EnumAttrCase<"taskdependinoutset", 4>; + +def ClauseTaskDepend + : OpenMP_I32EnumAttr< + "ClauseTaskDepend", "depend clause in a target or task construct", + [ClauseTaskDependIn, ClauseTaskDependOut, ClauseTaskDependInOut, + ClauseTaskDependMutexInOutSet, ClauseTaskDependInOutSet]>; def ClauseTaskDependAttr : OpenMP_EnumAttr { diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 060113c412324..9a30266103b15 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -1701,6 +1701,12 @@ 
buildDependData(std::optional dependKinds, OperandRange dependVars, case mlir::omp::ClauseTaskDepend::taskdependinout: type = llvm::omp::RTLDependenceKindTy::DepInOut; break; + case mlir::omp::ClauseTaskDepend::taskdependmutexinoutset: + type = llvm::omp::RTLDependenceKindTy::DepMutexInOutSet; + break; + case mlir::omp::ClauseTaskDepend::taskdependinoutset: + type = llvm::omp::RTLDependenceKindTy::DepInOutSet; + break; }; llvm::Value *depVal = moduleTranslation.lookupValue(std::get<0>(dep)); llvm::OpenMPIRBuilder::DependData dd(type, depVal->getType(), depVal); diff --git a/mlir/test/Target/LLVMIR/openmp-llvm.mlir b/mlir/test/Target/LLVMIR/openmp-llvm.mlir index 5f8bdf8afdf78..44e32c3f35f9b 100644 --- a/mlir/test/Target/LLVMIR/openmp-llvm.mlir +++ b/mlir/test/Target/LLVMIR/openmp-llvm.mlir @@ -2590,6 +2590,34 @@ llvm.func @omp_task_attrs() -> () attributes { // CHECK: store i64 8, ptr %[[dep_arr_addr_0_size]], align 4 // CHECK: %[[dep_arr_addr_0_kind:.+]] = getelementptr inbounds nuw %struct.kmp_dep_info, ptr %[[dep_arr_addr_0]], i32 0, i32 2 // CHECK: store i8 1, ptr %[[dep_arr_addr_0_kind]], align 1 +// ----- +// dependence_type: Out +// CHECK: %[[DEP_ARR_ADDR1:.+]] = alloca [1 x %struct.kmp_dep_info], align 8 +// CHECK: %[[DEP_ARR_ADDR_1:.+]] = getelementptr inbounds [1 x %struct.kmp_dep_info], ptr %[[DEP_ARR_ADDR1]], i64 0, i64 0 +// [...] +// CHECK: %[[DEP_TYPE_1:.+]] = getelementptr inbounds nuw %struct.kmp_dep_info, ptr %[[DEP_ARR_ADDR_1]], i32 0, i32 2 +// CHECK: store i8 3, ptr %[[DEP_TYPE_1]], align 1 +// ----- +// dependence_type: Inout +// CHECK: %[[DEP_ARR_ADDR2:.+]] = alloca [1 x %struct.kmp_dep_info], align 8 +// CHECK: %[[DEP_ARR_ADDR_2:.+]] = getelementptr inbounds [1 x %struct.kmp_dep_info], ptr %[[DEP_ARR_ADDR2]], i64 0, i64 0 +// [...] 
+// CHECK: %[[DEP_TYPE_2:.+]] = getelementptr inbounds nuw %struct.kmp_dep_info, ptr %[[DEP_ARR_ADDR_2]], i32 0, i32 2 +// CHECK: store i8 3, ptr %[[DEP_TYPE_2]], align 1 +// ----- +// dependence_type: Mutexinoutset +// CHECK: %[[DEP_ARR_ADDR3:.+]] = alloca [1 x %struct.kmp_dep_info], align 8 +// CHECK: %[[DEP_ARR_ADDR_3:.+]] = getelementptr inbounds [1 x %struct.kmp_dep_info], ptr %[[DEP_ARR_ADDR3]], i64 0, i64 0 +// [...] +// CHECK: %[[DEP_TYPE_3:.+]] = getelementptr inbounds nuw %struct.kmp_dep_info, ptr %[[DEP_ARR_ADDR_3]], i32 0, i32 2 +// CHECK: store i8 4, ptr %[[DEP_TYPE_3]], align 1 +// ----- +// dependence_type: Inoutset +// CHECK: %[[DEP_ARR_ADDR4:.+]] = alloca [1 x %struct.kmp_dep_info], align 8 +// CHECK: %[[DEP_ARR_ADDR_4:.+]] = getelementptr inbounds [1 x %struct.kmp_dep_info], ptr %[[DEP_ARR_ADDR4]], i64 0, i64 0 +// [...] +// CHECK: %[[DEP_TYPE_4:.+]] = getelementptr inbounds nuw %struct.kmp_dep_info, ptr %[[DEP_ARR_ADDR_4]], i32 0, i32 2 +// CHECK: store i8 8, ptr %[[DEP_TYPE_4]], align 1 llvm.func @omp_task_with_deps(%zaddr: !llvm.ptr) { // CHECK: %[[omp_global_thread_num:.+]] = call i32 @__kmpc_global_thread_num({{.+}}) // CHECK: %[[task_data:.+]] = call ptr @__kmpc_omp_task_alloc @@ -2604,6 +2632,18 @@ llvm.func @omp_task_with_deps(%zaddr: !llvm.ptr) { llvm.store %double, %valaddr : i32, !llvm.ptr omp.terminator } + omp.task depend(taskdependout -> %zaddr : !llvm.ptr) { + omp.terminator + } + omp.task depend(taskdependinout -> %zaddr : !llvm.ptr) { + omp.terminator + } + omp.task depend(taskdependmutexinoutset -> %zaddr : !llvm.ptr) { + omp.terminator + } + omp.task depend(taskdependinoutset -> %zaddr : !llvm.ptr) { + omp.terminator + } llvm.return } From 698bb5f239f50e8217cbec1d19bf8e0bba8c5d11 Mon Sep 17 00:00:00 2001 From: Hongren Zheng Date: Thu, 26 Dec 2024 19:58:11 +0800 Subject: [PATCH 076/567] [mlir][docs] Add C example for C-compatible wrapper for LLVM IR (#120955) `TargetLLVMIR` documentation introduced the C-compatible wrapper 
function for a MLIR function and ways to generate it, but did not demonstrate the corresponding C function signature for them. The C function signature is not obvious, in that * `MemrefDescriptor` should be passed as _pointer_. + For example, MLIR function could return a new Descriptor, so pointer is a must. + Surprisingly, directly pass the struct, by C convention, is also a pointer so some function will work, but that is implicit and error-prone. * for `@foo() -> memref<>`, the return type becomes the first argument in `_mlir_ciface_foo(%arg0: !llvm.ptr)`. + This is described in https://github.com/llvm/llvm-project/blob/f70ab7d909d6861c7eec5ab40679bde16ab826c6/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp#L110-L167 Especially by code `size_t argOffset = resultStructType ? 1 : 0;` saying the actual argument starts at 1 when result is a struct (memref) Users using the wrong signature will get incorrect results. LLVM discourse has some example of it * https://discourse.llvm.org/t/how-to-compile-and-link-with-other-c-c-programs/4835/10 * https://discourse.llvm.org/t/segmentation-fault-on-memref-store/80286/3 * https://discourse.llvm.org/t/memref-store-storing-a-memref-load/80307 Cc @ftynse for relevent commit history. Cc @charitha22 and @Wheest from discourse post. --- mlir/docs/TargetLLVMIR.md | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/mlir/docs/TargetLLVMIR.md b/mlir/docs/TargetLLVMIR.md index 96a4589eb80e7..3a2f44f46f782 100644 --- a/mlir/docs/TargetLLVMIR.md +++ b/mlir/docs/TargetLLVMIR.md @@ -646,7 +646,7 @@ Examples: ```mlir -func.func @qux(%arg0: memref) +func.func @qux(%arg0: memref) attributes {llvm.emit_c_interface} // Gets converted into the following // (using type alias for brevity): @@ -683,8 +683,18 @@ llvm.func @qux(%arg0: !llvm.ptr, %arg1: !llvm.ptr, llvm.func @_mlir_ciface_qux(!llvm.ptr) ``` + +```cpp +// The C function implementation for the interface function. 
+extern "C" { +void _mlir_ciface_qux(MemRefDescriptor *input) { + // detailed impl +} +} +``` + ```mlir -func.func @foo(%arg0: memref) { +func.func @foo(%arg0: memref) attributes {llvm.emit_c_interface} { return } @@ -719,8 +729,15 @@ llvm.func @_mlir_ciface_foo(%arg0: !llvm.ptr) { } ``` +```cpp +// The C function signature for the interface function. +extern "C" { +void _mlir_ciface_foo(MemRefDescriptor *input); +} +``` + ```mlir -func.func @foo(%arg0: memref) -> memref { +func.func @foo(%arg0: memref) -> memref attributes {llvm.emit_c_interface} { return %arg0 : memref } @@ -744,6 +761,7 @@ llvm.func @foo(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, } // Interface function callable from C. +// NOTE: the returned memref becomes the first argument llvm.func @_mlir_ciface_foo(%arg0: !llvm.ptr, %arg1: !llvm.ptr) { %0 = llvm.load %arg1 : !llvm.ptr %1 = llvm.extractvalue %0[0] : !llvm.memref_2d @@ -760,6 +778,14 @@ llvm.func @_mlir_ciface_foo(%arg0: !llvm.ptr, %arg1: !llvm.ptr) { } ``` +```cpp +// The C function signature for the interface function. 
+extern "C" { +void _mlir_ciface_foo(MemRefDescriptor *output, + MemRefDescriptor *input); +} +``` + Rationale: Introducing auxiliary functions for C-compatible interfaces is preferred to modifying the calling convention since it will minimize the effect of C compatibility on intra-module calls or calls between MLIR-generated From 1b476ecdcf4b544af1436341fc923c0b73793cbe Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Thu, 26 Dec 2024 08:04:44 -0500 Subject: [PATCH 077/567] [lldb] A few more pieces towards OpenBSD support (#121051) --- lldb/cmake/modules/LLDBConfig.cmake | 2 +- lldb/source/Initialization/CMakeLists.txt | 2 +- lldb/source/Plugins/ABI/X86/ABISysV_x86_64.cpp | 1 + lldb/source/Plugins/Process/CMakeLists.txt | 2 ++ 4 files changed, 5 insertions(+), 2 deletions(-) diff --git a/lldb/cmake/modules/LLDBConfig.cmake b/lldb/cmake/modules/LLDBConfig.cmake index ee4c2630d32e2..9bb37f5967d4f 100644 --- a/lldb/cmake/modules/LLDBConfig.cmake +++ b/lldb/cmake/modules/LLDBConfig.cmake @@ -292,7 +292,7 @@ endif() # Figure out if lldb could use lldb-server. If so, then we'll # ensure we build lldb-server when an lldb target is being built. 
-if (CMAKE_SYSTEM_NAME MATCHES "Android|Darwin|FreeBSD|Linux|NetBSD|Windows") +if (CMAKE_SYSTEM_NAME MATCHES "Android|Darwin|FreeBSD|Linux|NetBSD|OpenBSD|Windows") set(LLDB_CAN_USE_LLDB_SERVER ON) else() set(LLDB_CAN_USE_LLDB_SERVER OFF) diff --git a/lldb/source/Initialization/CMakeLists.txt b/lldb/source/Initialization/CMakeLists.txt index c1a167826f76f..b6282e162aa10 100644 --- a/lldb/source/Initialization/CMakeLists.txt +++ b/lldb/source/Initialization/CMakeLists.txt @@ -1,4 +1,4 @@ -if ( CMAKE_SYSTEM_NAME MATCHES "Linux|Android|FreeBSD|NetBSD" ) +if ( CMAKE_SYSTEM_NAME MATCHES "Linux|Android|FreeBSD|NetBSD|OpenBSD" ) list(APPEND EXTRA_PLUGINS lldbPluginProcessPOSIX) endif() diff --git a/lldb/source/Plugins/ABI/X86/ABISysV_x86_64.cpp b/lldb/source/Plugins/ABI/X86/ABISysV_x86_64.cpp index ad13a4406cfda..54028b1b3261a 100644 --- a/lldb/source/Plugins/ABI/X86/ABISysV_x86_64.cpp +++ b/lldb/source/Plugins/ABI/X86/ABISysV_x86_64.cpp @@ -95,6 +95,7 @@ ABISysV_x86_64::CreateInstance(lldb::ProcessSP process_sp, const ArchSpec &arch) case llvm::Triple::OSType::Linux: case llvm::Triple::OSType::MacOSX: case llvm::Triple::OSType::NetBSD: + case llvm::Triple::OSType::OpenBSD: case llvm::Triple::OSType::Solaris: case llvm::Triple::OSType::UnknownOS: return ABISP( diff --git a/lldb/source/Plugins/Process/CMakeLists.txt b/lldb/source/Plugins/Process/CMakeLists.txt index a51d0f7afd175..7f4f6fee7a9ea 100644 --- a/lldb/source/Plugins/Process/CMakeLists.txt +++ b/lldb/source/Plugins/Process/CMakeLists.txt @@ -7,6 +7,8 @@ elseif (CMAKE_SYSTEM_NAME MATCHES "FreeBSD") elseif (CMAKE_SYSTEM_NAME MATCHES "NetBSD") add_subdirectory(NetBSD) add_subdirectory(POSIX) +elseif (CMAKE_SYSTEM_NAME MATCHES "OpenBSD") + add_subdirectory(POSIX) elseif (CMAKE_SYSTEM_NAME MATCHES "Windows") add_subdirectory(Windows/Common) elseif (CMAKE_SYSTEM_NAME MATCHES "Darwin") From 889215a30ed60474e573f9632d1fa362dfa1b04e Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Thu, 26 Dec 2024 05:09:51 -0800 Subject: 
[PATCH 078/567] [SLP]Followup fix for the poisonous logical op in reductions If the VectorizedTree still may generate poisonous value, but it is not the original operand of the reduction op, need to check if Res still the operand, to generate correct code. Fixes #114905 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 24 +++++++++---------- .../SLPVectorizer/X86/reduction-logical.ll | 4 ++-- .../logical-ops-poisonous-repeated.ll | 2 +- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 57f3016fbe1e0..e9fc89fa242a7 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -19821,21 +19821,21 @@ class HorizontalReduction { Builder.SetCurrentDebugLocation( cast(ReductionOps.front().front())->getDebugLoc()); if (AnyBoolLogicOp) { - - if (auto It = ReducedValsToOps.find(VectorizedTree); - It == ReducedValsToOps.end() || + auto It = ReducedValsToOps.find(VectorizedTree); + auto It1 = ReducedValsToOps.find(Res); + if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) || isGuaranteedNotToBePoison(VectorizedTree, AC) || - any_of(It->getSecond(), [&](Instruction *I) { - return isBoolLogicOp(I) && - getRdxOperand(I, 0) == VectorizedTree; - })) { + (It != ReducedValsToOps.end() && + any_of(It->getSecond(), [&](Instruction *I) { + return isBoolLogicOp(I) && + getRdxOperand(I, 0) == VectorizedTree; + }))) { ; - } else if (auto It = ReducedValsToOps.find(Res); - It == ReducedValsToOps.end() || - isGuaranteedNotToBePoison(Res, AC) || - any_of(It->getSecond(), [&](Instruction *I) { + } else if (isGuaranteedNotToBePoison(Res, AC) || + (It1 != ReducedValsToOps.end() && + any_of(It1->getSecond(), [&](Instruction *I) { return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res; - })) { + }))) { std::swap(VectorizedTree, Res); } else { VectorizedTree = Builder.CreateFreeze(VectorizedTree); diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll index 0771fabef3e02..e0b3ff714162f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll @@ -428,7 +428,7 @@ define i1 @logical_and_icmp_extra_op(<4 x i32> %x, <4 x i32> %y, i1 %c) { ; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP2]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP3]], i1 [[C:%.*]], i1 false +; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[C:%.*]], i1 [[TMP3]], i1 false ; CHECK-NEXT: ret i1 [[OP_RDX]] ; %x0 = extractelement <4 x i32> %x, i32 0 @@ -456,7 +456,7 @@ define i1 @logical_or_icmp_extra_op(<4 x i32> %x, <4 x i32> %y, i1 %c) { ; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP3]], i1 true, i1 [[C:%.*]] +; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[C:%.*]], i1 true, i1 [[TMP3]] ; CHECK-NEXT: ret i1 [[OP_RDX]] ; %x0 = extractelement <4 x i32> %x, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/logical-ops-poisonous-repeated.ll b/llvm/test/Transforms/SLPVectorizer/logical-ops-poisonous-repeated.ll index 101f66f331304..f0cfd99a892a1 100644 --- a/llvm/test/Transforms/SLPVectorizer/logical-ops-poisonous-repeated.ll +++ b/llvm/test/Transforms/SLPVectorizer/logical-ops-poisonous-repeated.ll @@ -14,7 +14,7 @@ define i1 @test(<4 x i32> %x) { ; CHECK-NEXT: [[C3:%.*]] = icmp slt i32 [[X3]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = freeze i1 [[C3]] ; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP2]], i1 [[C1]], i1 false -; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i1 [[TMP1]], i1 false +; 
CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[TMP1]], i1 [[OP_RDX]], i1 false ; CHECK-NEXT: ret i1 [[OP_RDX1]] ; %x0 = extractelement <4 x i32> %x, i32 0 From c21a3776c94eef46ea43192c6d0b8df6258b6489 Mon Sep 17 00:00:00 2001 From: Vikash Gupta Date: Thu, 26 Dec 2024 18:57:19 +0530 Subject: [PATCH 079/567] [GlobalIsel] [Utility] [NFC] Added isConstantOrConstantSplatVectorFP to handle float constants. (#120935) Needed for #120104 --- llvm/include/llvm/CodeGen/GlobalISel/Utils.h | 7 ++ llvm/lib/CodeGen/GlobalISel/Utils.cpp | 12 +++ .../CodeGen/GlobalISel/GISelUtilsTest.cpp | 95 +++++++++++++++++++ 3 files changed, 114 insertions(+) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h index 37653631cc238..cb5a4c14b364c 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h @@ -522,6 +522,13 @@ std::optional isConstantOrConstantSplatVector(MachineInstr &MI, const MachineRegisterInfo &MRI); +/// Determines if \p MI defines a float constant integer or a splat vector of +/// float constant integers. +/// \returns the float constant or std::nullopt. +std::optional +isConstantOrConstantSplatVectorFP(MachineInstr &MI, + const MachineRegisterInfo &MRI); + /// Attempt to match a unary predicate against a scalar/splat constant or every /// element of a constant G_BUILD_VECTOR. If \p ConstVal is null, the source /// value was undef. 
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index 8c1e41ea106ec..79382933a1f42 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -1517,6 +1517,18 @@ llvm::isConstantOrConstantSplatVector(MachineInstr &MI, return APInt(ScalarSize, *MaybeCst, true); } +std::optional +llvm::isConstantOrConstantSplatVectorFP(MachineInstr &MI, + const MachineRegisterInfo &MRI) { + Register Def = MI.getOperand(0).getReg(); + if (auto FpConst = getFConstantVRegValWithLookThrough(Def, MRI)) + return FpConst->Value; + auto MaybeCstFP = getFConstantSplat(Def, MRI, /*allowUndef=*/false); + if (!MaybeCstFP) + return std::nullopt; + return MaybeCstFP->Value; +} + bool llvm::isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs) { switch (MI.getOpcode()) { diff --git a/llvm/unittests/CodeGen/GlobalISel/GISelUtilsTest.cpp b/llvm/unittests/CodeGen/GlobalISel/GISelUtilsTest.cpp index 1ff7fd956d015..9163663c2b776 100644 --- a/llvm/unittests/CodeGen/GlobalISel/GISelUtilsTest.cpp +++ b/llvm/unittests/CodeGen/GlobalISel/GISelUtilsTest.cpp @@ -77,6 +77,15 @@ static const LLT NXV3P0 = LLT::scalable_vector(3, P0); static const LLT NXV4P0 = LLT::scalable_vector(4, P0); static const LLT NXV12P0 = LLT::scalable_vector(12, P0); +static void collectNonCopyMI(SmallVectorImpl &MIList, + MachineFunction *MF) { + for (auto &MBB : *MF) + for (MachineInstr &MI : MBB) { + if (MI.getOpcode() != TargetOpcode::COPY) + MIList.push_back(&MI); + } +} + TEST(GISelUtilsTest, getGCDType) { EXPECT_EQ(S1, getGCDType(S1, S1)); EXPECT_EQ(S32, getGCDType(S32, S32)); @@ -408,4 +417,90 @@ TEST_F(AArch64GISelMITest, ConstFalseTest) { } } } + +TEST_F(AMDGPUGISelMITest, isConstantOrConstantSplatVectorFP) { + StringRef MIRString = + " %cst0:_(s32) = G_FCONSTANT float 2.000000e+00\n" + " %cst1:_(s32) = G_FCONSTANT float 0.0\n" + " %cst2:_(s64) = G_FCONSTANT double 3.000000e-02\n" + " %cst3:_(s32) = 
G_CONSTANT i32 2\n" + " %cst4:_(<2 x s32>) = G_BUILD_VECTOR %cst0(s32), %cst0(s32)\n" + " %cst5:_(<2 x s32>) = G_BUILD_VECTOR %cst1(s32), %cst0(s32)\n" + " %cst6:_(<2 x s64>) = G_BUILD_VECTOR %cst2(s64), %cst2(s64)\n" + " %cst7:_(<2 x s32>) = G_BUILD_VECTOR %cst3(s32), %cst3:_(s32)\n" + " %cst8:_(<4 x s32>) = G_CONCAT_VECTORS %cst4:_(<2 x s32>), %cst4:_(<2 " + "x s32>)\n" + " %cst9:_(<4 x s64>) = G_CONCAT_VECTORS %cst6:_(<2 x s64>), %cst6:_(<2 " + "x s64>)\n" + " %cst10:_(<4 x s32>) = G_CONCAT_VECTORS %cst4:_(<2 x s32>), %cst5:_(<2 " + "x s32>)\n" + " %cst11:_(<4 x s32>) = G_CONCAT_VECTORS %cst7:_(<2 x s32>), %cst7:_(<2 " + "x s32>)\n" + " %cst12:_(s32) = G_IMPLICIT_DEF \n" + " %cst13:_(<2 x s32>) = G_BUILD_VECTOR %cst12(s32), %cst12(s32)\n" + " %cst14:_(<2 x s32>) = G_BUILD_VECTOR %cst0(s32), %cst12(s32)\n" + " %cst15:_(<4 x s32>) = G_CONCAT_VECTORS %cst4:_(<2 x s32>), " + "%cst14:_(<2 " + "x s32>)\n"; + + SmallVector MIList; + + setUp(MIRString); + if (!TM) + GTEST_SKIP(); + + collectNonCopyMI(MIList, MF); + + EXPECT_TRUE(isConstantOrConstantSplatVectorFP(*MIList[0], *MRI).has_value()); + auto val = isConstantOrConstantSplatVectorFP(*MIList[0], *MRI).value(); + EXPECT_EQ(2.0, val.convertToFloat()); + + EXPECT_TRUE(isConstantOrConstantSplatVectorFP(*MIList[1], *MRI).has_value()); + val = isConstantOrConstantSplatVectorFP(*MIList[1], *MRI).value(); + EXPECT_EQ(0.0, val.convertToFloat()); + + EXPECT_TRUE(isConstantOrConstantSplatVectorFP(*MIList[2], *MRI).has_value()); + val = isConstantOrConstantSplatVectorFP(*MIList[2], *MRI).value(); + EXPECT_EQ(0.03, val.convertToDouble()); + + EXPECT_FALSE(isConstantOrConstantSplatVectorFP(*MIList[3], *MRI).has_value()); + + EXPECT_TRUE(isConstantOrConstantSplatVectorFP(*MIList[4], *MRI).has_value()); + val = isConstantOrConstantSplatVectorFP(*MIList[4], *MRI).value(); + EXPECT_EQ(2.0, val.convertToFloat()); + + EXPECT_FALSE(isConstantOrConstantSplatVectorFP(*MIList[5], *MRI).has_value()); + + 
EXPECT_TRUE(isConstantOrConstantSplatVectorFP(*MIList[6], *MRI).has_value()); + val = isConstantOrConstantSplatVectorFP(*MIList[6], *MRI).value(); + EXPECT_EQ(0.03, val.convertToDouble()); + + EXPECT_FALSE(isConstantOrConstantSplatVectorFP(*MIList[7], *MRI).has_value()); + + EXPECT_TRUE(isConstantOrConstantSplatVectorFP(*MIList[8], *MRI).has_value()); + val = isConstantOrConstantSplatVectorFP(*MIList[8], *MRI).value(); + EXPECT_EQ(2.0, val.convertToFloat()); + + EXPECT_TRUE(isConstantOrConstantSplatVectorFP(*MIList[9], *MRI).has_value()); + val = isConstantOrConstantSplatVectorFP(*MIList[9], *MRI).value(); + EXPECT_EQ(0.03, val.convertToDouble()); + + EXPECT_FALSE( + isConstantOrConstantSplatVectorFP(*MIList[10], *MRI).has_value()); + + EXPECT_FALSE( + isConstantOrConstantSplatVectorFP(*MIList[11], *MRI).has_value()); + + EXPECT_FALSE( + isConstantOrConstantSplatVectorFP(*MIList[12], *MRI).has_value()); + + EXPECT_FALSE( + isConstantOrConstantSplatVectorFP(*MIList[13], *MRI).has_value()); + + EXPECT_FALSE( + isConstantOrConstantSplatVectorFP(*MIList[14], *MRI).has_value()); + + EXPECT_FALSE( + isConstantOrConstantSplatVectorFP(*MIList[15], *MRI).has_value()); +} } From 4a92c27f9d29d065156647f9bcc44a8418c98efa Mon Sep 17 00:00:00 2001 From: Sergei Barannikov Date: Thu, 26 Dec 2024 17:45:29 +0300 Subject: [PATCH 080/567] [TableGen][GISel] Remove check for LLT when emitting renderers (#121144) Types used in the destination DAG of a pattern should not matter for GlobalISel. All necessary checks are emitted in the form of matchers when traversing the source DAG. In particular, the check prevented importing patterns containing iPTR in the middle of the destination DAG. This reduces the number of skipped patterns on Mips and RISCV: ``` Mips 1270 -> 1212 (-58) RISCV 42165 -> 42088 (-77) ``` Most of these patterns are for atomic operations. 
--- .../GlobalISelEmitter/OverloadedPtr.td | 26 ++++++++++++++++--- llvm/utils/TableGen/GlobalISelEmitter.cpp | 9 ------- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/llvm/test/TableGen/GlobalISelEmitter/OverloadedPtr.td b/llvm/test/TableGen/GlobalISelEmitter/OverloadedPtr.td index c70211d665225..31accba8b1847 100644 --- a/llvm/test/TableGen/GlobalISelEmitter/OverloadedPtr.td +++ b/llvm/test/TableGen/GlobalISelEmitter/OverloadedPtr.td @@ -4,10 +4,32 @@ include "llvm/Target/Target.td" include "GlobalISelEmitterCommon.td" +def GPR : RegisterClass<"MyTarget", [i32, i64], 32, (add R0)>; + let TargetPrefix = "mytarget" in { def int_mytarget_anyptr : Intrinsic<[llvm_i32_ty], [llvm_anyptr_ty]>; } +// Check that iPTR in the destination DAG doesn't prevent the pattern from being imported. + +// CHECK: GIM_RootCheckType, /*Op*/0, /*Type*/GILLT_s32, +// CHECK-NEXT: GIM_CheckMemorySizeEqualToLLT, /*MI*/0, /*MMO*/0, /*OpIdx*/0, +// CHECK-NEXT: GIM_CheckAtomicOrdering, /*MI*/0, /*Order*/(uint8_t)AtomicOrdering::NotAtomic, +// CHECK-NEXT: GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID), +// CHECK-NEXT: // MIs[0] src1 +// CHECK-NEXT: GIM_CheckPointerToAny, /*MI*/0, /*Op*/1, /*SizeInBits*/0, +// CHECK-NEXT: GIM_RootCheckRegBankForClass, /*Op*/1, /*RC*/GIMT_Encode2(MyTarget::GPRRegClassID), +// CHECK-NEXT: // (ld:{ *:[i32] } GPR:{ *:[iPTR] }:$src1)<><> => (ANYLOAD:{ *:[i32] } GPR:{ *:[iPTR] }:$src1) +// CHECK-NEXT: GIR_MutateOpcode, /*InsnID*/0, /*RecycleInsnID*/0, /*Opcode*/GIMT_Encode2(MyTarget::ANYLOAD), +// CHECK-NEXT: GIR_RootConstrainSelectedInstOperands, +// CHECK-NEXT: // GIR_Coverage, 0, +// CHECK-NEXT: GIR_Done, + +let hasSideEffects = 1 in { + def ANYLOAD : I<(outs GPR32:$dst), (ins GPR:$src1), + [(set GPR32:$dst, (load GPR:$src1))]>; +} + // Ensure that llvm_anyptr_ty on an intrinsic results in a // GIM_CheckPointerToAny rather than a GIM_CheckType. 
// @@ -20,10 +42,6 @@ let TargetPrefix = "mytarget" in { // CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GIMT_Encode2(GICXXPred_MI_Predicate_frag_anyptr), // CHECK-NEXT: // (intrinsic_w_chain:{ *:[i32] } {{[0-9]+}}:{ *:[iPTR] }, GPR32:{ *:[i32] }:$src)<> => (ANYLOAD:{ *:[i32] } GPR32:{ *:[i32] }:$src) // CHECK-NEXT: GIR_BuildRootMI, /*Opcode*/GIMT_Encode2(MyTarget::ANYLOAD), -let hasSideEffects = 1 in { - def ANYLOAD : I<(outs GPR32:$dst), (ins GPR32:$src1), - [(set GPR32:$dst, (load GPR32:$src1))]>; -} def frag_anyptr : PatFrag<(ops node:$src), (int_mytarget_anyptr node:$src), diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp index 0b910096b0528..f0fb11625883e 100644 --- a/llvm/utils/TableGen/GlobalISelEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp @@ -1246,15 +1246,6 @@ Error GlobalISelEmitter::importNamedNodeRenderer( if (N.getNumResults() != 1) return failedImport("node does not have one result " + to_string(N)); - std::optional OpTyOrNone; - ArrayRef ChildTypes = N.getExtTypes(); - if (ChildTypes.front().isMachineValueType()) - OpTyOrNone = MVTToLLT(ChildTypes.front().getMachineValueType().SimpleTy); - - // TODO: Remove this check. Types in the destination DAG should not matter. - if (!OpTyOrNone) - return failedImport("node has unsupported type " + to_string(N)); - if (R->isSubClassOf("ComplexPattern")) { auto I = ComplexPatternEquivs.find(R); if (I == ComplexPatternEquivs.end()) From d21f300f06d56a46e96d1e594522ab0ac362f074 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Thu, 26 Dec 2024 09:09:02 -0800 Subject: [PATCH 081/567] [MIPatternMatch] Fix incorrect argument type of m_Type (#121074) m_Type is supposed to extract the underlying value type (equality type comparison is covered by m_SpecificType), therefore it should take a LLT reference as its argument rather than passing by value. 
This was originated from de256478e61d6488db751689af82d280ba114a6f, which refactored out a good chunk of LLT reference usages. And it's just so happen that (for some reasons) no one is using m_Type and no test was covering it. --- llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h | 4 ++-- llvm/unittests/CodeGen/GlobalISel/PatternMatchTest.cpp | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h index ea6ed322e9b19..80d1fef7533c9 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h @@ -338,7 +338,7 @@ template <> struct bind_helper { }; template <> struct bind_helper { - static bool bind(const MachineRegisterInfo &MRI, LLT Ty, Register Reg) { + static bool bind(const MachineRegisterInfo &MRI, LLT &Ty, Register Reg) { Ty = MRI.getType(Reg); if (Ty.isValid()) return true; @@ -368,7 +368,7 @@ template struct bind_ty { inline bind_ty m_Reg(Register &R) { return R; } inline bind_ty m_MInstr(MachineInstr *&MI) { return MI; } -inline bind_ty m_Type(LLT Ty) { return Ty; } +inline bind_ty m_Type(LLT &Ty) { return Ty; } inline bind_ty m_Pred(CmpInst::Predicate &P) { return P; } inline operand_type_match m_Pred() { return operand_type_match(); } diff --git a/llvm/unittests/CodeGen/GlobalISel/PatternMatchTest.cpp b/llvm/unittests/CodeGen/GlobalISel/PatternMatchTest.cpp index 59a86fa5646f3..bcaa321e49c10 100644 --- a/llvm/unittests/CodeGen/GlobalISel/PatternMatchTest.cpp +++ b/llvm/unittests/CodeGen/GlobalISel/PatternMatchTest.cpp @@ -576,6 +576,11 @@ TEST_F(AArch64GISelMITest, MatchMiscellaneous) { auto MIBAdd = B.buildAdd(s64, Copies[0], Copies[1]); Register Reg = MIBAdd.getReg(0); + // Extract the type. + LLT Ty; + EXPECT_TRUE(mi_match(Reg, *MRI, m_GAdd(m_Type(Ty), m_Reg()))); + EXPECT_EQ(Ty, s64); + // Only one use of Reg. 
B.buildCast(LLT::pointer(0, 32), MIBAdd); EXPECT_TRUE(mi_match(Reg, *MRI, m_OneUse(m_GAdd(m_Reg(), m_Reg())))); From 831e1ac12e766ae8c94d8d735d8f32c8d319e576 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Thu, 26 Dec 2024 09:28:17 -0800 Subject: [PATCH 082/567] [MIPatternMatch] Add m_GUMin and m_GUMax (#121068) And make all unsigned and signed versions of min/max matchers commutative, since we already made a precedent of m_GAdd that is commutative by default. --- .../llvm/CodeGen/GlobalISel/MIPatternMatch.h | 20 +++++++++++--- .../CodeGen/GlobalISel/PatternMatchTest.cpp | 26 +++++++++++++++++++ 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h index 80d1fef7533c9..47417f53b6e40 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h @@ -538,15 +538,27 @@ m_GAShr(const LHS &L, const RHS &R) { } template -inline BinaryOp_match +inline BinaryOp_match m_GSMax(const LHS &L, const RHS &R) { - return BinaryOp_match(L, R); + return BinaryOp_match(L, R); } template -inline BinaryOp_match +inline BinaryOp_match m_GSMin(const LHS &L, const RHS &R) { - return BinaryOp_match(L, R); + return BinaryOp_match(L, R); +} + +template +inline BinaryOp_match +m_GUMax(const LHS &L, const RHS &R) { + return BinaryOp_match(L, R); +} + +template +inline BinaryOp_match +m_GUMin(const LHS &L, const RHS &R) { + return BinaryOp_match(L, R); } // Helper for unary instructions (G_[ZSA]EXT/G_TRUNC) etc diff --git a/llvm/unittests/CodeGen/GlobalISel/PatternMatchTest.cpp b/llvm/unittests/CodeGen/GlobalISel/PatternMatchTest.cpp index bcaa321e49c10..fc76d4055722e 100644 --- a/llvm/unittests/CodeGen/GlobalISel/PatternMatchTest.cpp +++ b/llvm/unittests/CodeGen/GlobalISel/PatternMatchTest.cpp @@ -224,6 +224,32 @@ TEST_F(AArch64GISelMITest, MatchBinaryOp) { auto MIBAddCst = B.buildAdd(s64, MIBCst, Copies[0]); 
auto MIBUnmerge = B.buildUnmerge({s32, s32}, B.buildConstant(s64, 42)); + // Match min/max, and make sure they're commutative. + auto SMin = B.buildSMin(s64, Copies[2], MIBAdd); + EXPECT_TRUE(mi_match(SMin.getReg(0), *MRI, + m_GSMin(m_GAdd(m_Reg(Src1), m_Reg(Src2)), m_Reg(Src0)))); + EXPECT_EQ(Src0, Copies[2]); + EXPECT_EQ(Src1, Copies[0]); + EXPECT_EQ(Src2, Copies[1]); + auto SMax = B.buildSMax(s64, Copies[2], MIBAdd); + EXPECT_TRUE(mi_match(SMax.getReg(0), *MRI, + m_GSMax(m_GAdd(m_Reg(Src1), m_Reg(Src2)), m_Reg(Src0)))); + EXPECT_EQ(Src0, Copies[2]); + EXPECT_EQ(Src1, Copies[0]); + EXPECT_EQ(Src2, Copies[1]); + auto UMin = B.buildUMin(s64, Copies[2], MIBAdd); + EXPECT_TRUE(mi_match(UMin.getReg(0), *MRI, + m_GUMin(m_GAdd(m_Reg(Src1), m_Reg(Src2)), m_Reg(Src0)))); + EXPECT_EQ(Src0, Copies[2]); + EXPECT_EQ(Src1, Copies[0]); + EXPECT_EQ(Src2, Copies[1]); + auto UMax = B.buildUMax(s64, Copies[2], MIBAdd); + EXPECT_TRUE(mi_match(UMax.getReg(0), *MRI, + m_GUMax(m_GAdd(m_Reg(Src1), m_Reg(Src2)), m_Reg(Src0)))); + EXPECT_EQ(Src0, Copies[2]); + EXPECT_EQ(Src1, Copies[0]); + EXPECT_EQ(Src2, Copies[1]); + // m_BinOp with opcode. // Match binary instruction, opcode and its non-commutative operands. match = mi_match(MIBAddCst, *MRI, From 8906b7be918be653d3c5f2ef3dbd923561603969 Mon Sep 17 00:00:00 2001 From: srcarroll <50210727+srcarroll@users.noreply.github.com> Date: Thu, 26 Dec 2024 11:32:51 -0600 Subject: [PATCH 083/567] Enable custom alloc-like ops in `promoteBufferResultsToOutParams` (#120288) In `buffer-results-to-out-params`, when `hoist-static-allocs` option is enabled the pass was looking for `memref.alloc`s in order to attempt to avoid copies when it can. Which makes it not extensible to external ops that have allocation like properties. This patch simply changes `memref::AllocOp` to `AllocationOpInterface` in the check to enable for any allocation op. 
Moreover, for function call updates, we enable setting an allocation function callback in `BufferResultsToOutParamsOpts` to allow users to emit their own alloc-like op. --- .../Dialect/Bufferization/Transforms/Passes.h | 25 ++++++++++++-- .../Transforms/BufferResultsToOutParams.cpp | 34 ++++++++++--------- 2 files changed, 40 insertions(+), 19 deletions(-) diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h index fe43a05c81fdc..c8e456a1d7e38 100644 --- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h @@ -2,10 +2,12 @@ #define MLIR_DIALECT_BUFFERIZATION_TRANSFORMS_PASSES_H #include "mlir/Dialect/Bufferization/IR/BufferDeallocationOpInterface.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Pass/Pass.h" namespace mlir { class FunctionOpInterface; +class MemRefType; class ModuleOp; class RewritePatternSet; class OpBuilder; @@ -38,7 +40,7 @@ std::unique_ptr createOwnershipBasedBufferDeallocationPass( DeallocationOptions options = DeallocationOptions()); /// Creates a pass that finds all temporary allocations -/// and attempts to move the deallocation after the last user/dependency +/// and attempts to move the deallocation after the last user/dependency /// of the allocation, thereby optimizing allocation liveness. std::unique_ptr createOptimizeAllocationLivenessPass(); @@ -157,6 +159,12 @@ std::unique_ptr createBufferLoopHoistingPass(); // Options struct for BufferResultsToOutParams pass. // Note: defined only here, not in tablegen. struct BufferResultsToOutParamsOpts { + /// Allocator function: Generate a memref allocation with the given type. + /// Since `promoteBufferResultsToOutParams` doesn't allow dynamically shaped + /// results, we don't allow passing a range of values for dynamic dims. 
+ using AllocationFn = + std::function(OpBuilder &, Location, MemRefType)>; + /// Memcpy function: Generate a memcpy between two memrefs. using MemCpyFn = std::function; @@ -167,9 +175,20 @@ struct BufferResultsToOutParamsOpts { return true; }; + /// Allocation function; used to allocate a memref. + /// Default memref.alloc is used + AllocationFn allocationFn = [](OpBuilder &builder, Location loc, + MemRefType type) { + return builder.create(loc, type).getResult(); + }; + /// Memcpy function; used to create a copy between two memrefs. - /// If this is empty, memref.copy is used. - std::optional memCpyFn; + /// Default memref.copy is used. + MemCpyFn memCpyFn = [](OpBuilder &builder, Location loc, Value from, + Value to) { + builder.create(loc, from, to); + return success(); + }; /// If true, the pass adds a "bufferize.result" attribute to each output /// parameter. diff --git a/mlir/lib/Dialect/Bufferization/Transforms/BufferResultsToOutParams.cpp b/mlir/lib/Dialect/Bufferization/Transforms/BufferResultsToOutParams.cpp index b7755b2be8483..2502744cb3f58 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/BufferResultsToOutParams.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/BufferResultsToOutParams.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "mlir/Dialect/Bufferization/IR/AllocationOpInterface.h" #include "mlir/Dialect/Bufferization/Transforms/Passes.h" #include "mlir/Dialect/Func/IR/FuncOps.h" @@ -21,6 +22,7 @@ namespace bufferization { } // namespace mlir using namespace mlir; +using AllocationFn = bufferization::BufferResultsToOutParamsOpts::AllocationFn; using MemCpyFn = bufferization::BufferResultsToOutParamsOpts::MemCpyFn; /// Return `true` if the given MemRef type has a fully dynamic layout. 
@@ -105,10 +107,9 @@ updateFuncOp(func::FuncOp func, // Updates all ReturnOps in the scope of the given func::FuncOp by either // keeping them as return values or copying the associated buffer contents into // the given out-params. -static LogicalResult updateReturnOps(func::FuncOp func, - ArrayRef appendedEntryArgs, - MemCpyFn memCpyFn, - bool hoistStaticAllocs) { +static LogicalResult +updateReturnOps(func::FuncOp func, ArrayRef appendedEntryArgs, + const bufferization::BufferResultsToOutParamsOpts &options) { auto res = func.walk([&](func::ReturnOp op) { SmallVector copyIntoOutParams; SmallVector keepAsReturnOperands; @@ -120,13 +121,14 @@ static LogicalResult updateReturnOps(func::FuncOp func, } OpBuilder builder(op); for (auto [orig, arg] : llvm::zip(copyIntoOutParams, appendedEntryArgs)) { - if (hoistStaticAllocs && - isa_and_nonnull(orig.getDefiningOp()) && + if (options.hoistStaticAllocs && + isa_and_nonnull( + orig.getDefiningOp()) && mlir::cast(orig.getType()).hasStaticShape()) { orig.replaceAllUsesWith(arg); orig.getDefiningOp()->erase(); } else { - if (failed(memCpyFn(builder, op.getLoc(), orig, arg))) + if (failed(options.memCpyFn(builder, op.getLoc(), orig, arg))) return WalkResult::interrupt(); } } @@ -175,7 +177,14 @@ updateCalls(ModuleOp module, auto allocType = MemRefType::get(memrefType.getShape(), memrefType.getElementType(), AffineMap(), memrefType.getMemorySpace()); - Value outParam = builder.create(op.getLoc(), allocType); + auto maybeOutParam = + options.allocationFn(builder, op.getLoc(), allocType); + if (failed(maybeOutParam)) { + op.emitError() << "failed to create allocation op"; + didFail = true; + return; + } + Value outParam = maybeOutParam.value(); if (!hasStaticIdentityLayout(memrefType)) { // Layout maps are already checked in `updateFuncOp`. 
assert(hasFullyDynamicLayoutMap(memrefType) && @@ -213,14 +222,7 @@ LogicalResult mlir::bufferization::promoteBufferResultsToOutParams( return failure(); if (func.isExternal()) continue; - auto defaultMemCpyFn = [](OpBuilder &builder, Location loc, Value from, - Value to) { - builder.create(loc, from, to); - return success(); - }; - if (failed(updateReturnOps(func, appendedEntryArgs, - options.memCpyFn.value_or(defaultMemCpyFn), - options.hoistStaticAllocs))) { + if (failed(updateReturnOps(func, appendedEntryArgs, options))) { return failure(); } } From 6e8a1a45a783c13e4cd19bfd20b7a56cab6f7d81 Mon Sep 17 00:00:00 2001 From: Franklin Date: Fri, 27 Dec 2024 01:54:23 +0800 Subject: [PATCH 084/567] [BOLT] Detect Linux kernel version if the binary is a Linux kernel (#119088) This makes it easier to handle differences (e.g. of exception table entry size) between versions of Linux kernel --- bolt/include/bolt/Core/BinaryData.h | 5 ++ bolt/lib/Core/BinaryContext.cpp | 1 + bolt/lib/Rewrite/LinuxKernelRewriter.cpp | 59 ++++++++++++++++++++++++ bolt/test/X86/linux-alt-instruction.s | 9 ++++ bolt/test/X86/linux-bug-table.s | 9 ++++ bolt/test/X86/linux-exceptions.s | 9 ++++ bolt/test/X86/linux-orc.s | 9 ++++ bolt/test/X86/linux-parainstructions.s | 9 ++++ bolt/test/X86/linux-pci-fixup.s | 9 ++++ bolt/test/X86/linux-smp-locks.s | 9 ++++ bolt/test/X86/linux-static-calls.s | 9 ++++ bolt/test/X86/linux-static-keys.s | 9 ++++ bolt/test/X86/linux-version.S | 53 +++++++++++++++++++++ 13 files changed, 199 insertions(+) create mode 100644 bolt/test/X86/linux-version.S diff --git a/bolt/include/bolt/Core/BinaryData.h b/bolt/include/bolt/Core/BinaryData.h index 6a773c4cb7067..4ab628030ff0d 100644 --- a/bolt/include/bolt/Core/BinaryData.h +++ b/bolt/include/bolt/Core/BinaryData.h @@ -169,6 +169,11 @@ class BinaryData { return Parent && (Parent == BD || Parent->isAncestorOf(BD)); } + void updateSize(uint64_t N) { + if (N > Size) + Size = N; + } + void setIsMoveable(bool Flag) { IsMoveable 
= Flag; } void setSection(BinarySection &NewSection); void setOutputSection(BinarySection &NewSection) { diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp index f88e34b8e8962..f5e11358daaa3 100644 --- a/bolt/lib/Core/BinaryContext.cpp +++ b/bolt/lib/Core/BinaryContext.cpp @@ -1076,6 +1076,7 @@ MCSymbol *BinaryContext::registerNameAtAddress(StringRef Name, uint64_t Address, BD = GAI->second; if (!BD->hasName(Name)) { GlobalSymbols[Name] = BD; + BD->updateSize(Size); BD->Symbols.push_back(Symbol); } } diff --git a/bolt/lib/Rewrite/LinuxKernelRewriter.cpp b/bolt/lib/Rewrite/LinuxKernelRewriter.cpp index 0532468c237e0..5a5e044184d0b 100644 --- a/bolt/lib/Rewrite/LinuxKernelRewriter.cpp +++ b/bolt/lib/Rewrite/LinuxKernelRewriter.cpp @@ -21,6 +21,8 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Errc.h" +#include "llvm/Support/ErrorOr.h" +#include #define DEBUG_TYPE "bolt-linux" @@ -89,6 +91,34 @@ static cl::opt } // namespace opts +/// Linux kernel version +struct LKVersion { + LKVersion() {} + LKVersion(unsigned Major, unsigned Minor, unsigned Rev) + : Major(Major), Minor(Minor), Rev(Rev) {} + + bool operator<(const LKVersion &Other) const { + return std::make_tuple(Major, Minor, Rev) < + std::make_tuple(Other.Major, Other.Minor, Other.Rev); + } + + bool operator>(const LKVersion &Other) const { return Other < *this; } + + bool operator<=(const LKVersion &Other) const { return !(*this > Other); } + + bool operator>=(const LKVersion &Other) const { return !(*this < Other); } + + bool operator==(const LKVersion &Other) const { + return Major == Other.Major && Minor == Other.Minor && Rev == Other.Rev; + } + + bool operator!=(const LKVersion &Other) const { return !(*this == Other); } + + unsigned Major{0}; + unsigned Minor{0}; + unsigned Rev{0}; +}; + /// Linux Kernel supports stack unwinding using ORC (oops rewind capability). 
/// ORC state at every IP can be described by the following data structure. struct ORCState { @@ -148,6 +178,8 @@ class AddressExtractor : public DataExtractor { }; class LinuxKernelRewriter final : public MetadataRewriter { + LKVersion LinuxKernelVersion; + /// Information required for updating metadata referencing an instruction. struct InstructionFixup { BinarySection &Section; // Section referencing the instruction. @@ -249,6 +281,8 @@ class LinuxKernelRewriter final : public MetadataRewriter { ErrorOr PCIFixupSection = std::errc::bad_address; static constexpr size_t PCI_FIXUP_ENTRY_SIZE = 16; + Error detectLinuxKernelVersion(); + /// Process linux kernel special sections and their relocations. void processLKSections(); @@ -314,6 +348,9 @@ class LinuxKernelRewriter final : public MetadataRewriter { : MetadataRewriter("linux-kernel-rewriter", BC) {} Error preCFGInitializer() override { + if (Error E = detectLinuxKernelVersion()) + return E; + processLKSections(); if (Error E = processSMPLocks()) @@ -394,6 +431,28 @@ class LinuxKernelRewriter final : public MetadataRewriter { } }; +Error LinuxKernelRewriter::detectLinuxKernelVersion() { + if (BinaryData *BD = BC.getBinaryDataByName("linux_banner")) { + const BinarySection &Section = BD->getSection(); + const std::string S = + Section.getContents().substr(BD->getOffset(), BD->getSize()).str(); + + const std::regex Re(R"---(Linux version ((\d+)\.(\d+)(\.(\d+))?))---"); + std::smatch Match; + if (std::regex_search(S, Match, Re)) { + const unsigned Major = std::stoi(Match[2].str()); + const unsigned Minor = std::stoi(Match[3].str()); + const unsigned Rev = Match[5].matched ? 
std::stoi(Match[5].str()) : 0; + LinuxKernelVersion = LKVersion(Major, Minor, Rev); + BC.outs() << "BOLT-INFO: Linux kernel version is " << Match[1].str() + << "\n"; + return Error::success(); + } + } + return createStringError(errc::executable_format_error, + "Linux kernel version is unknown"); +} + void LinuxKernelRewriter::processLKSections() { processLKKSymtab(); processLKKSymtab(true); diff --git a/bolt/test/X86/linux-alt-instruction.s b/bolt/test/X86/linux-alt-instruction.s index fe3abbfc2b4c9..83d2cd0634d08 100644 --- a/bolt/test/X86/linux-alt-instruction.s +++ b/bolt/test/X86/linux-alt-instruction.s @@ -142,6 +142,15 @@ _start: .section .orc_unwind_ip .long .L0 + 2 - . +## Linux kernel version + .rodata + .align 16 + .globl linux_banner + .type linux_banner, @object +linux_banner: + .string "Linux version 6.6.61\n" + .size linux_banner, . - linux_banner + ## Fake Linux Kernel sections. .section __ksymtab,"a",@progbits .section __ksymtab_gpl,"a",@progbits diff --git a/bolt/test/X86/linux-bug-table.s b/bolt/test/X86/linux-bug-table.s index 07a4729ade737..2965daab2b265 100644 --- a/bolt/test/X86/linux-bug-table.s +++ b/bolt/test/X86/linux-bug-table.s @@ -56,6 +56,15 @@ _start: .long .L1 - . # instruction .org 2b + 12 +## Linux kernel version + .rodata + .align 16 + .globl linux_banner + .type linux_banner, @object +linux_banner: + .string "Linux version 6.6.61\n" + .size linux_banner, . - linux_banner + ## Fake Linux Kernel sections. .section __ksymtab,"a",@progbits .section __ksymtab_gpl,"a",@progbits diff --git a/bolt/test/X86/linux-exceptions.s b/bolt/test/X86/linux-exceptions.s index 20b8c965f853a..b0e7641af1cd9 100644 --- a/bolt/test/X86/linux-exceptions.s +++ b/bolt/test/X86/linux-exceptions.s @@ -59,6 +59,15 @@ foo: .long .LF0 - . # fixup .long 0 # data +## Linux kernel version + .rodata + .align 16 + .globl linux_banner + .type linux_banner, @object +linux_banner: + .string "Linux version 6.6.61\n" + .size linux_banner, . 
- linux_banner + ## Fake Linux Kernel sections. .section __ksymtab,"a",@progbits .section __ksymtab_gpl,"a",@progbits diff --git a/bolt/test/X86/linux-orc.s b/bolt/test/X86/linux-orc.s index 1b0e681b1dbf9..133b0df690e62 100644 --- a/bolt/test/X86/linux-orc.s +++ b/bolt/test/X86/linux-orc.s @@ -157,6 +157,15 @@ bar: .section .orc_unwind_ip .long .L4 - . +## Linux kernel version + .rodata + .align 16 + .globl linux_banner + .type linux_banner, @object +linux_banner: + .string "Linux version 6.6.61\n" + .size linux_banner, . - linux_banner + ## Fake Linux Kernel sections. .section __ksymtab,"a",@progbits .section __ksymtab_gpl,"a",@progbits diff --git a/bolt/test/X86/linux-parainstructions.s b/bolt/test/X86/linux-parainstructions.s index 07fca6bbedafa..facfcb168b166 100644 --- a/bolt/test/X86/linux-parainstructions.s +++ b/bolt/test/X86/linux-parainstructions.s @@ -49,6 +49,15 @@ _start: .byte 1 # type .byte 7 # length +## Linux kernel version + .rodata + .align 16 + .globl linux_banner + .type linux_banner, @object +linux_banner: + .string "Linux version 6.6.61\n" + .size linux_banner, . - linux_banner + ## Fake Linux Kernel sections. .section __ksymtab,"a",@progbits .section __ksymtab_gpl,"a",@progbits diff --git a/bolt/test/X86/linux-pci-fixup.s b/bolt/test/X86/linux-pci-fixup.s index 42504c108d339..d8df91a4e9bcd 100644 --- a/bolt/test/X86/linux-pci-fixup.s +++ b/bolt/test/X86/linux-pci-fixup.s @@ -36,6 +36,15 @@ _start: .long 0x0 # class shift .long .L0 - . # fixup +## Linux kernel version + .rodata + .align 16 + .globl linux_banner + .type linux_banner, @object +linux_banner: + .string "Linux version 6.6.61\n" + .size linux_banner, . - linux_banner + ## Fake Linux Kernel sections. 
.section __ksymtab,"a",@progbits .section __ksymtab_gpl,"a",@progbits diff --git a/bolt/test/X86/linux-smp-locks.s b/bolt/test/X86/linux-smp-locks.s index 50d9e632b1172..2fc136fd78cda 100644 --- a/bolt/test/X86/linux-smp-locks.s +++ b/bolt/test/X86/linux-smp-locks.s @@ -35,6 +35,15 @@ _start: .long .L0 - . .long .L1 - . +## Linux kernel version + .rodata + .align 16 + .globl linux_banner + .type linux_banner, @object +linux_banner: + .string "Linux version 6.6.61\n" + .size linux_banner, . - linux_banner + ## Fake Linux Kernel sections. .section __ksymtab,"a",@progbits .section __ksymtab_gpl,"a",@progbits diff --git a/bolt/test/X86/linux-static-calls.s b/bolt/test/X86/linux-static-calls.s index ce90f4bb79c09..758e1395d8846 100644 --- a/bolt/test/X86/linux-static-calls.s +++ b/bolt/test/X86/linux-static-calls.s @@ -54,6 +54,15 @@ __start_static_call_sites: .type __stop_static_call_sites, %object __stop_static_call_sites: +## Linux kernel version + .rodata + .align 16 + .globl linux_banner + .type linux_banner, @object +linux_banner: + .string "Linux version 6.6.61\n" + .size linux_banner, . - linux_banner + ## Fake Linux Kernel sections. .section __ksymtab,"a",@progbits .section __ksymtab_gpl,"a",@progbits diff --git a/bolt/test/X86/linux-static-keys.s b/bolt/test/X86/linux-static-keys.s index d34dd640ef879..2e4457e4df9fb 100644 --- a/bolt/test/X86/linux-static-keys.s +++ b/bolt/test/X86/linux-static-keys.s @@ -85,6 +85,15 @@ __stop___jump_table: fake_static_key: .quad 0 +## Linux kernel version + .rodata + .align 16 + .globl linux_banner + .type linux_banner, @object +linux_banner: + .string "Linux version 6.6.61\n" + .size linux_banner, . - linux_banner + ## Fake Linux Kernel sections. 
.section __ksymtab,"a",@progbits .section __ksymtab_gpl,"a",@progbits diff --git a/bolt/test/X86/linux-version.S b/bolt/test/X86/linux-version.S new file mode 100644 index 0000000000000..e680d0d64a21f --- /dev/null +++ b/bolt/test/X86/linux-version.S @@ -0,0 +1,53 @@ +# REQUIRES: system-linux + +## Check that BOLT correctly detects the Linux kernel version + +# RUN: %clang -DA -target x86_64-unknown-unknown \ +# RUN: %cflags -nostdlib %s -o %t.exe \ +# RUN: -Wl,--image-base=0xffffffff80000000,--no-dynamic-linker,--no-eh-frame-hdr +# RUN: llvm-bolt %t.exe -o %t.out 2>&1 | FileCheck --check-prefix=CHECK-A %s + +# RUN: %clang -DB -target x86_64-unknown-unknown \ +# RUN: %cflags -nostdlib %s -o %t.exe \ +# RUN: -Wl,--image-base=0xffffffff80000000,--no-dynamic-linker,--no-eh-frame-hdr +# RUN: llvm-bolt %t.exe -o %t.out 2>&1 | FileCheck --check-prefix=CHECK-B %s + +# RUN: %clang -DC -target x86_64-unknown-unknown \ +# RUN: %cflags -nostdlib %s -o %t.exe \ +# RUN: -Wl,--image-base=0xffffffff80000000,--no-dynamic-linker,--no-eh-frame-hdr +# RUN: llvm-bolt %t.exe -o %t.out 2>&1 | FileCheck --check-prefix=CHECK-C %s + + .text + .globl foo + .type foo, %function +foo: + ret + .size foo, .-foo + +## Linux kernel version + .rodata + .align 16 + .globl linux_banner + .type linux_banner, @object +linux_banner: + +#ifdef A + .string "Linux version 6.6.61\n" +#endif +# CHECK-A: BOLT-INFO: Linux kernel version is 6.6.61 + +#ifdef B + .string "Linux version 6.6.50-rc4\n" +#endif +# CHECK-B: BOLT-INFO: Linux kernel version is 6.6.50 + +#ifdef C + .string "Linux version 6.6\n" +#endif +# CHECK-C: BOLT-INFO: Linux kernel version is 6.6 + + .size linux_banner, . - linux_banner + +## Fake Linux Kernel sections. 
+ .section __ksymtab,"a",@progbits + .section __ksymtab_gpl,"a",@progbits From 62c39d773422fd7193758c325085c864a67a55cc Mon Sep 17 00:00:00 2001 From: Davide Italiano Date: Thu, 26 Dec 2024 11:18:12 -0800 Subject: [PATCH 085/567] [BOLT/docs] The support for macro-op fusion was removed. (#121158) Update the documentation accordingly. --- bolt/docs/CommandLineArgumentReference.md | 9 --------- 1 file changed, 9 deletions(-) diff --git a/bolt/docs/CommandLineArgumentReference.md b/bolt/docs/CommandLineArgumentReference.md index 91918d614a90f..f3881c9a640a9 100644 --- a/bolt/docs/CommandLineArgumentReference.md +++ b/bolt/docs/CommandLineArgumentReference.md @@ -931,15 +931,6 @@ Remove redundant Address-Size override prefix -### BOLT options in relocation mode: - -- `--align-macro-fusion=` - - Fix instruction alignment for macro-fusion (x86 relocation mode) - - `none`: do not insert alignment no-ops for macro-fusion - - `hot`: only insert alignment no-ops on hot execution paths (default) - - `all`: always align instructions to allow macro-fusion - ### BOLT instrumentation options: `llvm-bolt -instrument [-o outputfile] ` From 776ac21c7f95e092759ba39e5533aad90d63c86e Mon Sep 17 00:00:00 2001 From: "Oleksandr \"Alex\" Zinenko" Date: Thu, 26 Dec 2024 11:18:35 -0800 Subject: [PATCH 086/567] [mlir] minor documentation fix in GPUTransformOps.td (#121157) - do not refer to handles as `PDLOperation`, this is an outdated and incorrect vision of what they are based on the type used in the early days; - use backticks around inline code. 
--- .../mlir/Dialect/GPU/TransformOps/GPUTransformOps.td | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td index 80b4547c32c10..61d4ccec5f0bd 100644 --- a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td +++ b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td @@ -168,13 +168,13 @@ def MapNestedForallToThreads : #### Return modes: - This operation ignores non-gpu_launch ops and drops them in the return. + This operation ignores non-`gpu_launch` ops and drops them in the return. If any scf.forall with tensors is found, the transform definitely fails. - If all the scf.forall operations with gpu.thread mapping contained - within the LaunchOp referred to by the `target` PDLOperation lower to GPU + If all the `scf.forall` operations with gpu.thread mapping contained + within the `LaunchOp` referred to by the `target` handle lower to GPU properly, the transform succeeds. Otherwise the transform definitely fails. @@ -277,8 +277,8 @@ def MapForallToBlocks : If any scf.forall with tensors is found, the transform definitely fails. - If all the scf.forall operations contained within the LaunchOp - referred to by the `target` PDLOperation lower to GPU properly, the + If all the `scf.forall` operations contained within the LaunchOp + referred to by the `target` handle lower to GPU properly, the transform succeeds. Otherwise the transform definitely fails. The returned handle points to the same LaunchOp operand, consuming it and From b59a0a6f404a82b71802eceaf3addb94ce728ce9 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 26 Dec 2024 12:25:16 -0800 Subject: [PATCH 087/567] [NFC][Driver] Define %{filecheck} as suggested on #121081 (#121159) We will introduce `--implicit-check-not=libclang_rt` in #121081, this let us to avoid repeating the flag. 
--- clang/test/Driver/sanitizer-ld.c | 232 ++++++++++++++++--------------- 1 file changed, 117 insertions(+), 115 deletions(-) diff --git a/clang/test/Driver/sanitizer-ld.c b/clang/test/Driver/sanitizer-ld.c index 8f2f7a5997ab4..0faa582c081c5 100644 --- a/clang/test/Driver/sanitizer-ld.c +++ b/clang/test/Driver/sanitizer-ld.c @@ -1,10 +1,12 @@ // Test sanitizers ld flags. +// DEFINE: %{filecheck} = FileCheck %s + // RUN: %clang -### %s 2>&1 \ // RUN: --target=i386-unknown-linux -fuse-ld=ld -fsanitize=address \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-ASAN-LINUX %s +// RUN: | %{filecheck} --check-prefix=CHECK-ASAN-LINUX // // CHECK-ASAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-ASAN-LINUX-NOT: "-lc" @@ -21,7 +23,7 @@ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-ASAN-NO-LINK-RUNTIME-LINUX %s +// RUN: | %{filecheck} --check-prefix=CHECK-ASAN-NO-LINK-RUNTIME-LINUX // // CHECK-ASAN-NO-LINK-RUNTIME-LINUX-NOT: libclang_rt.asan_static-x86_64 // CHECK-ASAN-NO-LINK-RUNTIME-LINUX-NOT: libclang_rt.asan-x86_64 @@ -30,7 +32,7 @@ // RUN: --target=arm64e-apple-macosx -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-ASAN-NO-LINK-RUNTIME-DARWIN %s +// RUN: | %{filecheck} --check-prefix=CHECK-ASAN-NO-LINK-RUNTIME-DARWIN // // CHECK-ASAN-NO-LINK-RUNTIME-DARWIN-NOT: libclang_rt.asan_static // CHECK-ASAN-NO-LINK-RUNTIME-DARWIN-NOT: libclang_rt.asan @@ -39,7 +41,7 @@ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-ASAN-EXECUTABLE-LINUX %s +// RUN: | %{filecheck} --check-prefix=CHECK-ASAN-EXECUTABLE-LINUX 
// // CHECK-ASAN-EXECUTABLE-LINUX: libclang_rt.asan_static // CHECK-ASAN-EXECUTABLE-LINUX: libclang_rt.asan @@ -48,7 +50,7 @@ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-ASAN-SHARED-LINUX %s +// RUN: | %{filecheck} --check-prefix=CHECK-ASAN-SHARED-LINUX // // CHECK-ASAN-SHARED-LINUX: libclang_rt.asan_static // CHECK-ASAN-SHARED-LINUX-NOT: libclang_rt.asan @@ -57,20 +59,20 @@ // RUN: --target=i386-unknown-linux -fuse-ld=ld -fsanitize=address -shared-libsan \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-SHARED-ASAN-LINUX %s +// RUN: | %{filecheck} --check-prefix=CHECK-SHARED-ASAN-LINUX // RUN: %clang -### %s 2>&1 \ // RUN: --target=i386-unknown-linux -fuse-ld=ld -fsanitize=address -shared-libasan \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-SHARED-ASAN-LINUX %s +// RUN: | %{filecheck} --check-prefix=CHECK-SHARED-ASAN-LINUX // RUN: %clang -### %s 2>&1 \ // RUN: --target=i386-unknown-linux -fuse-ld=ld -fsanitize=address \ // RUN: -shared-libsan -static-libsan -shared-libasan \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-SHARED-ASAN-LINUX %s +// RUN: | %{filecheck} --check-prefix=CHECK-SHARED-ASAN-LINUX // // CHECK-SHARED-ASAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-SHARED-ASAN-LINUX-NOT: "-lc" @@ -88,7 +90,7 @@ // RUN: --target=i386-unknown-linux -fuse-ld=ld -fsanitize=address -shared-libsan \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-DSO-SHARED-ASAN-LINUX %s +// RUN: | %{filecheck} --check-prefix=CHECK-DSO-SHARED-ASAN-LINUX // // 
CHECK-DSO-SHARED-ASAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-DSO-SHARED-ASAN-LINUX-NOT: "-lc" @@ -106,7 +108,7 @@ // RUN: --target=i386-unknown-freebsd -fuse-ld=ld -fsanitize=address \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_freebsd_tree \ -// RUN: | FileCheck --check-prefix=CHECK-ASAN-FREEBSD %s +// RUN: | %{filecheck} --check-prefix=CHECK-ASAN-FREEBSD // // CHECK-ASAN-FREEBSD: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-ASAN-FREEBSD-NOT: "-lc" @@ -122,7 +124,7 @@ // RUN: --target=i386-unknown-freebsd -fuse-ld=ld -fsanitize=address \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_freebsd_tree \ -// RUN: | FileCheck --check-prefix=CHECK-ASAN-FREEBSD-LDL %s +// RUN: | %{filecheck} --check-prefix=CHECK-ASAN-FREEBSD-LDL // // CHECK-ASAN-FREEBSD-LDL: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-ASAN-FREEBSD-LDL-NOT: "-ldl" @@ -131,14 +133,14 @@ // RUN: --target=i386-unknown-linux -fuse-ld=ld -stdlib=platform -fsanitize=address \ // RUN: -resource-dir=%S/Inputs/empty_resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-ASAN-LINUX-CXX %s +// RUN: | %{filecheck} --check-prefix=CHECK-ASAN-LINUX-CXX // RUN: %clangxx -### %s 2>&1 \ // RUN: --target=i386-unknown-linux -fuse-ld=ld -stdlib=platform -fsanitize=address \ // RUN: -resource-dir=%S/Inputs/empty_resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: -fsanitize-link-c++-runtime \ -// RUN: | FileCheck --check-prefix=CHECK-ASAN-LINUX-CXX %s +// RUN: | %{filecheck} --check-prefix=CHECK-ASAN-LINUX-CXX // CHECK-ASAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-ASAN-LINUX-CXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" @@ -157,7 +159,7 @@ // RUN: -resource-dir=%S/Inputs/empty_resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: -fno-sanitize-link-c++-runtime \ -// RUN: | FileCheck 
--check-prefix=CHECK-ASAN-LINUX-CNOCXX %s +// RUN: | %{filecheck} --check-prefix=CHECK-ASAN-LINUX-CNOCXX // CHECK-ASAN-LINUX-CNOCXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-ASAN-LINUX-CNOCXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" @@ -175,7 +177,7 @@ // RUN: -resource-dir=%S/Inputs/empty_resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: -fno-sanitize-link-c++-runtime \ -// RUN: | FileCheck --check-prefix=CHECK-ASAN-LINUX-NOCXX %s +// RUN: | %{filecheck} --check-prefix=CHECK-ASAN-LINUX-NOCXX // CHECK-ASAN-LINUX-NOCXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-ASAN-LINUX-NOCXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" @@ -193,7 +195,7 @@ // RUN: -resource-dir=%S/Inputs/empty_resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: -nostdlib++ \ -// RUN: | FileCheck --check-prefix=CHECK-ASAN-LINUX-NOSTDCXX %s +// RUN: | %{filecheck} --check-prefix=CHECK-ASAN-LINUX-NOSTDCXX // CHECK-ASAN-LINUX-NOSTDCXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-ASAN-LINUX-NOSTDCXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" @@ -209,7 +211,7 @@ // RUN: --target=i386-unknown-linux -fuse-ld=ld -stdlib=platform \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree -lstdc++ -static 2>&1 \ -// RUN: | FileCheck --check-prefix=CHECK-ASAN-LINUX-CXX-STATIC %s +// RUN: | %{filecheck} --check-prefix=CHECK-ASAN-LINUX-CXX-STATIC // // CHECK-ASAN-LINUX-CXX-STATIC: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-ASAN-LINUX-CXX-STATIC-NOT: stdc++ @@ -219,7 +221,7 @@ // RUN: %clang -### %s 2>&1 \ // RUN: --target=arm-linux-gnueabi -fuse-ld=ld -fsanitize=address \ // RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ -// RUN: | FileCheck --check-prefix=CHECK-ASAN-ARM %s +// RUN: | %{filecheck} --check-prefix=CHECK-ASAN-ARM // // CHECK-ASAN-ARM: "{{(.*[^.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-ASAN-ARM-NOT: "-lc" @@ 
-228,7 +230,7 @@ // RUN: %clang -### %s 2>&1 \ // RUN: --target=armv7l-linux-gnueabi -fuse-ld=ld -fsanitize=address \ // RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ -// RUN: | FileCheck --check-prefix=CHECK-ASAN-ARMv7 %s +// RUN: | %{filecheck} --check-prefix=CHECK-ASAN-ARMv7 // // CHECK-ASAN-ARMv7: "{{(.*[^.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-ASAN-ARMv7-NOT: "-lc" @@ -238,7 +240,7 @@ // RUN: --target=arm-linux-androideabi -fuse-ld=ld -fsanitize=address \ // RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ // RUN: -resource-dir=%S/Inputs/resource_dir \ -// RUN: | FileCheck --check-prefix=CHECK-ASAN-ANDROID %s +// RUN: | %{filecheck} --check-prefix=CHECK-ASAN-ANDROID // // CHECK-ASAN-ANDROID: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" // CHECK-ASAN-ANDROID: "-pie" @@ -254,14 +256,14 @@ // RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: -static-libsan \ -// RUN: | FileCheck --check-prefix=CHECK-ASAN-ANDROID-STATICLIBASAN %s +// RUN: | %{filecheck} --check-prefix=CHECK-ASAN-ANDROID-STATICLIBASAN // // RUN: %clang -### %s 2>&1 \ // RUN: --target=arm-linux-androideabi -fuse-ld=ld -fsanitize=address \ // RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: -static-libasan \ -// RUN: | FileCheck --check-prefix=CHECK-ASAN-ANDROID-STATICLIBASAN %s +// RUN: | %{filecheck} --check-prefix=CHECK-ASAN-ANDROID-STATICLIBASAN // // CHECK-ASAN-ANDROID-STATICLIBASAN: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" // CHECK-ASAN-ANDROID-STATICLIBASAN: libclang_rt.asan.a" @@ -273,7 +275,7 @@ // RUN: --target=arm-linux-androideabi -fuse-ld=ld -fsanitize=undefined \ // RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ // RUN: -resource-dir=%S/Inputs/resource_dir \ -// RUN: | FileCheck --check-prefix=CHECK-UBSAN-ANDROID %s +// RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-ANDROID // // CHECK-UBSAN-ANDROID: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" // 
CHECK-UBSAN-ANDROID: "-pie" @@ -289,7 +291,7 @@ // RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: -static-libsan \ -// RUN: | FileCheck --check-prefix=CHECK-UBSAN-ANDROID-STATICLIBASAN %s +// RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-ANDROID-STATICLIBASAN // // CHECK-UBSAN-ANDROID-STATICLIBASAN: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" // CHECK-UBSAN-ANDROID-STATICLIBASAN: libclang_rt.ubsan_standalone.a" @@ -302,7 +304,7 @@ // RUN: --target=i686-linux-android -fuse-ld=ld -fsanitize=address \ // RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ // RUN: -resource-dir=%S/Inputs/resource_dir \ -// RUN: | FileCheck --check-prefix=CHECK-ASAN-ANDROID-X86 %s +// RUN: | %{filecheck} --check-prefix=CHECK-ASAN-ANDROID-X86 // // CHECK-ASAN-ANDROID-X86: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" // CHECK-ASAN-ANDROID-X86: "-pie" @@ -317,7 +319,7 @@ // RUN: --target=arm-linux-androideabi -fsanitize=address \ // RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ // RUN: -shared-libsan \ -// RUN: | FileCheck --check-prefix=CHECK-ASAN-ANDROID-SHARED-LIBASAN %s +// RUN: | %{filecheck} --check-prefix=CHECK-ASAN-ANDROID-SHARED-LIBASAN // // CHECK-ASAN-ANDROID-SHARED-LIBASAN-NOT: argument unused during compilation: '-shared-libsan' // @@ -326,7 +328,7 @@ // RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: -shared \ -// RUN: | FileCheck --check-prefix=CHECK-ASAN-ANDROID-SHARED %s +// RUN: | %{filecheck} --check-prefix=CHECK-ASAN-ANDROID-SHARED // // CHECK-ASAN-ANDROID-SHARED: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" // CHECK-ASAN-ANDROID-SHARED-NOT: "-lc" @@ -340,7 +342,7 @@ // RUN: -fsanitize=type \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-TYSAN-LINUX-CXX %s +// RUN: | %{filecheck} --check-prefix=CHECK-TYSAN-LINUX-CXX // // CHECK-TYSAN-LINUX-CXX: 
"{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-TYSAN-LINUX-CXX-NOT: stdc++ @@ -352,7 +354,7 @@ // RUN: --target=x86_64-apple-darwin13.4.0 -fuse-ld=ld -stdlib=platform \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-TYSAN-DARWIN-CXX %s +// RUN: | %{filecheck} --check-prefix=CHECK-TYSAN-DARWIN-CXX // CHECK-TYSAN-DARWIN-CXX: "{{.*}}ld{{(.exe)?}}" // CHECK-TYSAN-DARWIN-CXX: libclang_rt.tysan_osx_dynamic.dylib // CHECK-TYSAN-DARWIN-CXX-NOT: -lc++abi @@ -362,7 +364,7 @@ // RUN: -fsanitize=thread \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-TSAN-LINUX-CXX %s +// RUN: | %{filecheck} --check-prefix=CHECK-TSAN-LINUX-CXX // // CHECK-TSAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-TSAN-LINUX-CXX-NOT: stdc++ @@ -381,7 +383,7 @@ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-TSAN-NO-LINK-RUNTIME-LINUX %s +// RUN: | %{filecheck} --check-prefix=CHECK-TSAN-NO-LINK-RUNTIME-LINUX // // CHECK-TSAN-NO-LINK-RUNTIME-LINUX-NOT: libclang_rt.tsan @@ -389,7 +391,7 @@ // RUN: --target=arm64e-apple-ios -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-TSAN-NO-LINK-RUNTIME-DARWIN %s +// RUN: | %{filecheck} --check-prefix=CHECK-TSAN-NO-LINK-RUNTIME-DARWIN // // CHECK-TSAN-NO-LINK-RUNTIME-DARWIN-NOT: libclang_rt.tsan @@ -398,7 +400,7 @@ // RUN: -fsanitize=memory \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-MSAN-LINUX-CXX %s +// RUN: | %{filecheck} --check-prefix=CHECK-MSAN-LINUX-CXX // // CHECK-MSAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // 
CHECK-MSAN-LINUX-CXX-NOT: stdc++ @@ -417,7 +419,7 @@ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-MSAN-NO-LINK-RUNTIME-LINUX %s +// RUN: | %{filecheck} --check-prefix=CHECK-MSAN-NO-LINK-RUNTIME-LINUX // // CHECK-MSAN-NO-LINK-RUNTIME-LINUX-NOT: libclang_rt.msan @@ -425,20 +427,20 @@ // RUN: --target=x86_64-unknown-linux-gnux32 -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/multilib_64bit_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-UBSAN-LINUX %s +// RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-LINUX // RUN: %clang -fsanitize=float-divide-by-zero -### %s 2>&1 \ // RUN: --target=x86_64-unknown-linux-gnux32 -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/multilib_64bit_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-UBSAN-LINUX %s +// RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-LINUX // RUN: %clang -fsanitize=undefined -### %s 2>&1 \ // RUN: --target=x86_64-unknown-linux-gnux32 -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/multilib_64bit_linux_tree \ // RUN: -static-libsan \ -// RUN: | FileCheck --check-prefix=CHECK-UBSAN-LINUX %s +// RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-LINUX // CHECK-UBSAN-LINUX: "{{.*}}ld{{(.exe)?}}" // CHECK-UBSAN-LINUX-NOT: libclang_rt.asan @@ -454,7 +456,7 @@ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-UBSAN-NO-LINK-RUNTIME-LINUX %s +// RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-NO-LINK-RUNTIME-LINUX // // CHECK-UBSAN-NO-LINK-RUNTIME-LINUX-NOT: libclang_rt.undefined @@ -462,7 +464,7 @@ // RUN: --target=x86_64-apple-darwin -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: 
--sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-UBSAN-NO-LINK-RUNTIME-DARWIN %s +// RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-NO-LINK-RUNTIME-DARWIN // // CHECK-UBSAN-NO-LINK-RUNTIME-DARWIN-NOT: libclang_rt.ubsan @@ -470,7 +472,7 @@ // RUN: --target=arm64e-apple-watchos -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-FUZZER-NO-LINK-RUNTIME-DARWIN %s +// RUN: | %{filecheck} --check-prefix=CHECK-FUZZER-NO-LINK-RUNTIME-DARWIN // // CHECK-FUZZER-NO-LINK-RUNTIME-DARWIN-NOT: libclang_rt.fuzzer @@ -479,21 +481,21 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: -shared-libsan \ -// RUN: | FileCheck --check-prefix=CHECK-UBSAN-LINUX-SHAREDLIBASAN %s +// RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-LINUX-SHAREDLIBASAN // RUN: %clang -fsanitize=undefined -### %s 2>&1 \ // RUN: --target=i386-unknown-linux -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: -static-libsan -shared-libsan \ -// RUN: | FileCheck --check-prefix=CHECK-UBSAN-LINUX-SHAREDLIBASAN %s +// RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-LINUX-SHAREDLIBASAN // RUN: %clang -fsanitize=undefined -### %s 2>&1 \ // RUN: --target=i386-unknown-linux -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: -shared -shared-libsan \ -// RUN: | FileCheck --check-prefix=CHECK-UBSAN-LINUX-SHAREDLIBASAN %s +// RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-LINUX-SHAREDLIBASAN // CHECK-UBSAN-LINUX-SHAREDLIBASAN: "{{.*}}ld{{(.exe)?}}" // CHECK-UBSAN-LINUX-SHAREDLIBASAN: "{{.*}}libclang_rt.ubsan_standalone.so{{.*}}" @@ -502,7 +504,7 @@ // RUN: --target=i386-unknown-linux \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck 
--check-prefix=CHECK-UBSAN-LINUX-LINK-CXX %s +// RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-LINUX-LINK-CXX // CHECK-UBSAN-LINUX-LINK-CXX-NOT: "-lstdc++" // CHECK-UBSAN-LINUX-LINK-CXX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone_cxx.a" "--no-whole-archive" // CHECK-UBSAN-LINUX-LINK-CXX-NOT: "-lstdc++" @@ -511,7 +513,7 @@ // RUN: --target=i386-unknown-linux -fuse-ld=ld -stdlib=platform \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-UBSAN-LINUX-CXX %s +// RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-LINUX-CXX // CHECK-UBSAN-LINUX-CXX: "{{.*}}ld{{(.exe)?}}" // CHECK-UBSAN-LINUX-CXX-NOT: libclang_rt.asan // CHECK-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive" @@ -527,7 +529,7 @@ // RUN: --target=i386-unknown-linux -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-UBSAN-MINIMAL-LINUX %s +// RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-MINIMAL-LINUX // CHECK-UBSAN-MINIMAL-LINUX: "{{.*}}ld{{(.exe)?}}" // CHECK-UBSAN-MINIMAL-LINUX: "--whole-archive" "{{.*}}libclang_rt.ubsan_minimal.a" "--no-whole-archive" // CHECK-UBSAN-MINIMAL-LINUX: "-lpthread" @@ -536,33 +538,33 @@ // RUN: %clang -fsanitize=undefined -fsanitize-minimal-runtime -### %s 2>&1 \ // RUN: --target=x86_64-apple-darwin -fuse-ld=ld \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-UBSAN-MINIMAL-DARWIN %s +// RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-MINIMAL-DARWIN // CHECK-UBSAN-MINIMAL-DARWIN: "{{.*}}ld{{(.exe)?}}" // CHECK-UBSAN-MINIMAL-DARWIN: "{{.*}}libclang_rt.ubsan_minimal_osx_dynamic.dylib" // RUN: not %clang -fsanitize=undefined -### %s 2>&1 \ // RUN: --target=x86_64-apple-darwin -fuse-ld=ld -static-libsan \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck 
--check-prefix=CHECK-UBSAN-STATIC-DARWIN %s +// RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-STATIC-DARWIN // CHECK-UBSAN-STATIC-DARWIN: {{.*}}error: static UndefinedBehaviorSanitizer runtime is not supported on darwin // RUN: not %clang -fsanitize=address -### %s 2>&1 \ // RUN: --target=x86_64-apple-darwin -fuse-ld=ld -static-libsan \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-ASAN-STATIC-DARWIN %s +// RUN: | %{filecheck} --check-prefix=CHECK-ASAN-STATIC-DARWIN // CHECK-ASAN-STATIC-DARWIN: {{.*}}error: static AddressSanitizer runtime is not supported on darwin // RUN: not %clang -fsanitize=thread -### %s 2>&1 \ // RUN: --target=x86_64-apple-darwin -fuse-ld=ld -static-libsan \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-TSAN-STATIC-DARWIN %s +// RUN: | %{filecheck} --check-prefix=CHECK-TSAN-STATIC-DARWIN // CHECK-TSAN-STATIC-DARWIN: {{.*}}error: static ThreadSanitizer runtime is not supported on darwin // RUN: %clang -fsanitize=address,undefined -### %s 2>&1 \ // RUN: --target=i386-unknown-linux -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-ASAN-UBSAN-LINUX %s +// RUN: | %{filecheck} --check-prefix=CHECK-ASAN-UBSAN-LINUX // CHECK-ASAN-UBSAN-LINUX: "{{.*}}ld{{(.exe)?}}" // CHECK-ASAN-UBSAN-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" // CHECK-ASAN-UBSAN-LINUX-NOT: libclang_rt.ubsan @@ -574,7 +576,7 @@ // RUN: --target=i386-unknown-linux -fuse-ld=ld -stdlib=platform \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-ASAN-UBSAN-LINUX-CXX %s +// RUN: | %{filecheck} --check-prefix=CHECK-ASAN-UBSAN-LINUX-CXX // CHECK-ASAN-UBSAN-LINUX-CXX: "{{.*}}ld{{(.exe)?}}" // CHECK-ASAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" // 
CHECK-ASAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.asan_cxx.a" "--no-whole-archive" @@ -589,7 +591,7 @@ // RUN: --target=i386-unknown-linux -fuse-ld=ld -stdlib=platform \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX %s +// RUN: | %{filecheck} --check-prefix=CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX // CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX: "{{.*}}ld{{(.exe)?}}" // CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" // CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan_cxx.a" "--no-whole-archive" @@ -602,7 +604,7 @@ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-MSAN-UBSAN-LINUX-CXX %s +// RUN: | %{filecheck} --check-prefix=CHECK-MSAN-UBSAN-LINUX-CXX // CHECK-MSAN-UBSAN-LINUX-CXX: "{{.*}}ld{{(.exe)?}}" // CHECK-MSAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.msan.a" "--no-whole-archive" // CHECK-MSAN-UBSAN-LINUX-CXX-NOT: libclang_rt.ubsan @@ -613,7 +615,7 @@ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-TSAN-UBSAN-LINUX-CXX %s +// RUN: | %{filecheck} --check-prefix=CHECK-TSAN-UBSAN-LINUX-CXX // CHECK-TSAN-UBSAN-LINUX-CXX: "{{.*}}ld{{(.exe)?}}" // CHECK-TSAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.tsan.a" "--no-whole-archive" // CHECK-TSAN-UBSAN-LINUX-CXX-NOT: libclang_rt.ubsan @@ -625,7 +627,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: -shared \ -// RUN: | FileCheck --check-prefix=CHECK-UBSAN-LINUX-SHARED %s +// RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-LINUX-SHARED // CHECK-UBSAN-LINUX-SHARED: 
"{{.*}}ld{{(.exe)?}}" // CHECK-UBSAN-LINUX-SHARED-NOT: --export-dynamic // CHECK-UBSAN-LINUX-SHARED-NOT: --dynamic-list @@ -635,7 +637,7 @@ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld -fsanitize=leak \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-LSAN-LINUX %s +// RUN: | %{filecheck} --check-prefix=CHECK-LSAN-LINUX // // CHECK-LSAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-LSAN-LINUX-NOT: "-lc" @@ -649,7 +651,7 @@ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-LSAN-NO-LINK-RUNTIME-LINUX %s +// RUN: | %{filecheck} --check-prefix=CHECK-LSAN-NO-LINK-RUNTIME-LINUX // // CHECK-LSAN-NO-LINK-RUNTIME-LINUX-NOT: libclang_rt.lsan @@ -657,7 +659,7 @@ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld -fsanitize=leak -fsanitize-coverage=func \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-LSAN-COV-LINUX %s +// RUN: | %{filecheck} --check-prefix=CHECK-LSAN-COV-LINUX // // CHECK-LSAN-COV-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-LSAN-COV-LINUX-NOT: "-lc" @@ -672,7 +674,7 @@ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-LSAN-ASAN-LINUX %s +// RUN: | %{filecheck} --check-prefix=CHECK-LSAN-ASAN-LINUX // CHECK-LSAN-ASAN-LINUX: "{{.*}}ld{{(.exe)?}}" // CHECK-LSAN-ASAN-LINUX-NOT: libclang_rt.lsan // CHECK-LSAN-ASAN-LINUX: libclang_rt.asan @@ -682,7 +684,7 @@ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-ASAN-COV-LINUX %s +// RUN: | %{filecheck} 
--check-prefix=CHECK-ASAN-COV-LINUX // CHECK-ASAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}" // CHECK-ASAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" // CHECK-ASAN-COV-LINUX-NOT: libclang_rt.ubsan @@ -694,7 +696,7 @@ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-MSAN-COV-LINUX %s +// RUN: | %{filecheck} --check-prefix=CHECK-MSAN-COV-LINUX // CHECK-MSAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}" // CHECK-MSAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.msan.a" "--no-whole-archive" // CHECK-MSAN-COV-LINUX-NOT: libclang_rt.ubsan @@ -706,7 +708,7 @@ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-DFSAN-COV-LINUX %s +// RUN: | %{filecheck} --check-prefix=CHECK-DFSAN-COV-LINUX // CHECK-DFSAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}" // CHECK-DFSAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.dfsan.a" "--no-whole-archive" // CHECK-DFSAN-COV-LINUX-NOT: libclang_rt.ubsan @@ -718,7 +720,7 @@ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-UBSAN-COV-LINUX %s +// RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-COV-LINUX // CHECK-UBSAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}" // CHECK-UBSAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive" // CHECK-UBSAN-COV-LINUX-NOT: "-lstdc++" @@ -729,7 +731,7 @@ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-COV-LINUX %s +// RUN: | %{filecheck} --check-prefix=CHECK-COV-LINUX // CHECK-COV-LINUX: "{{.*}}ld{{(.exe)?}}" // CHECK-COV-LINUX: 
"--whole-archive" "{{.*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive" // CHECK-COV-LINUX-NOT: "-lstdc++" @@ -740,7 +742,7 @@ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld -fsanitize=numerical \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-NSAN-LINUX %s +// RUN: | %{filecheck} --check-prefix=CHECK-NSAN-LINUX // // CHECK-NSAN-LINUX: "{{.*}}ld{{(.exe)?}}" // CHECK-NSAN-LINUX-NOT: "-lc" @@ -751,7 +753,7 @@ // RUN: %clang -### %s 2>&1 --target=x86_64-unknown-linux -fuse-ld=ld -fsanitize=numerical -shared-libsan \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-NSAN-SHARED-LINUX %s +// RUN: | %{filecheck} --check-prefix=CHECK-NSAN-SHARED-LINUX // CHECK-NSAN-SHARED-LINUX: libclang_rt.nsan.so" // CHECK-NSAN-SHARED-LINUX-NOT: "-lpthread" @@ -761,7 +763,7 @@ // RUN: %clang -### %s 2>&1 --target=x86_64-unknown-linux -fsanitize=numerical,undefined \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-NSAN-UBSAN %s +// RUN: | %{filecheck} --check-prefix=CHECK-NSAN-UBSAN // CHECK-NSAN-UBSAN: "--whole-archive" "{{[^"]*}}libclang_rt.nsan.a" "--no-whole-archive" // CHECK-NSAN-UBSAN-NOT: libclang_rt.ubsan @@ -771,7 +773,7 @@ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld -rtlib=platform \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-CFI-LINUX %s +// RUN: | %{filecheck} --check-prefix=CHECK-CFI-LINUX // CHECK-CFI-LINUX: "{{.*}}ld{{(.exe)?}}" // CHECK-CFI-LINUX-NOT: libclang_rt. 
@@ -781,7 +783,7 @@ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-CFI-DIAG-LINUX %s +// RUN: | %{filecheck} --check-prefix=CHECK-CFI-DIAG-LINUX // CHECK-CFI-DIAG-LINUX: "{{.*}}ld{{(.exe)?}}" // CHECK-CFI-DIAG-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive" @@ -790,7 +792,7 @@ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-CFI-CROSS-DSO-LINUX %s +// RUN: | %{filecheck} --check-prefix=CHECK-CFI-CROSS-DSO-LINUX // CHECK-CFI-CROSS-DSO-LINUX: "{{.*}}ld{{(.exe)?}}" // CHECK-CFI-CROSS-DSO-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.cfi.a" "--no-whole-archive" // CHECK-CFI-CROSS-DSO-LINUX: -export-dynamic @@ -801,7 +803,7 @@ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-CFI-CROSS-DSO-DIAG-LINUX %s +// RUN: | %{filecheck} --check-prefix=CHECK-CFI-CROSS-DSO-DIAG-LINUX // CHECK-CFI-CROSS-DSO-DIAG-LINUX: "{{.*}}ld{{(.exe)?}}" // CHECK-CFI-CROSS-DSO-DIAG-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.cfi_diag.a" "--no-whole-archive" // CHECK-CFI-CROSS-DSO-DIAG-LINUX: -export-dynamic @@ -811,7 +813,7 @@ // RUN: --target=aarch64-linux-android -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_android_tree \ -// RUN: | FileCheck --check-prefix=CHECK-CFI-CROSS-DSO-ANDROID %s +// RUN: | %{filecheck} --check-prefix=CHECK-CFI-CROSS-DSO-ANDROID // CHECK-CFI-CROSS-DSO-ANDROID: "{{.*}}ld{{(.exe)?}}" // CHECK-CFI-CROSS-DSO-ANDROID-NOT: libclang_rt.cfi @@ -821,7 +823,7 @@ // RUN: --target=aarch64-linux-android -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: 
--sysroot=%S/Inputs/basic_android_tree \ -// RUN: | FileCheck --check-prefix=CHECK-CFI-CROSS-DSO-DIAG-ANDROID %s +// RUN: | %{filecheck} --check-prefix=CHECK-CFI-CROSS-DSO-DIAG-ANDROID // CHECK-CFI-CROSS-DSO-DIAG-ANDROID: "{{.*}}ld{{(.exe)?}}" // CHECK-CFI-CROSS-DSO-DIAG-ANDROID: "{{[^"]*}}libclang_rt.ubsan_standalone.so" // CHECK-CFI-CROSS-DSO-DIAG-ANDROID: "--export-dynamic-symbol=__cfi_check" @@ -831,7 +833,7 @@ // RUN: --target=x86_64-apple-darwin13.4.0 -fuse-ld=ld -stdlib=platform \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-ASAN-DARWIN106-CXX %s +// RUN: | %{filecheck} --check-prefix=CHECK-ASAN-DARWIN106-CXX // CHECK-ASAN-DARWIN106-CXX: "{{.*}}ld{{(.exe)?}}" // CHECK-ASAN-DARWIN106-CXX: libclang_rt.asan_osx_dynamic.dylib // CHECK-ASAN-DARWIN106-CXX-NOT: -lc++abi @@ -841,7 +843,7 @@ // RUN: --target=x86_64-apple-darwin13.4.0 -fuse-ld=ld -stdlib=platform \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-LSAN-DARWIN106-CXX %s +// RUN: | %{filecheck} --check-prefix=CHECK-LSAN-DARWIN106-CXX // CHECK-LSAN-DARWIN106-CXX: "{{.*}}ld{{(.exe)?}}" // CHECK-LSAN-DARWIN106-CXX: libclang_rt.lsan_osx_dynamic.dylib // CHECK-LSAN-DARWIN106-CXX-NOT: -lc++abi @@ -850,7 +852,7 @@ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld -fsanitize=safe-stack \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-SAFESTACK-LINUX %s +// RUN: | %{filecheck} --check-prefix=CHECK-SAFESTACK-LINUX // // CHECK-SAFESTACK-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-SAFESTACK-LINUX-NOT: "-lc" @@ -863,59 +865,59 @@ // RUN: %clang -fsanitize=shadow-call-stack -### %s 2>&1 \ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ -// RUN: | FileCheck --check-prefix=CHECK-SHADOWCALLSTACK-LINUX-X86-64 %s +// RUN: | %{filecheck} 
--check-prefix=CHECK-SHADOWCALLSTACK-LINUX-X86-64 // CHECK-SHADOWCALLSTACK-LINUX-X86-64-NOT: error: // RUN: not %clang -fsanitize=shadow-call-stack -### %s 2>&1 \ // RUN: --target=aarch64-unknown-linux -fuse-ld=ld \ -// RUN: | FileCheck --check-prefix=CHECK-SHADOWCALLSTACK-LINUX-AARCH64 %s +// RUN: | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-LINUX-AARCH64 // CHECK-SHADOWCALLSTACK-LINUX-AARCH64: '-fsanitize=shadow-call-stack' only allowed with '-ffixed-x18' // RUN: %clang -fsanitize=shadow-call-stack -### %s 2>&1 \ // RUN: --target=riscv32-unknown-elf -fuse-ld=ld \ -// RUN: | FileCheck --check-prefix=CHECK-SHADOWCALLSTACK-ELF-RISCV32 %s +// RUN: | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-ELF-RISCV32 // CHECK-SHADOWCALLSTACK-ELF-RISCV32-NOT: error: // RUN: %clang -fsanitize=shadow-call-stack -### %s 2>&1 \ // RUN: --target=riscv64-unknown-linux -fuse-ld=ld \ -// RUN: | FileCheck --check-prefix=CHECK-SHADOWCALLSTACK-LINUX-RISCV64 %s +// RUN: | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-LINUX-RISCV64 // CHECK-SHADOWCALLSTACK-LINUX-RISCV64-NOT: error: // RUN: %clang -target riscv64-linux-android -fsanitize=shadow-call-stack %s -### 2>&1 \ -// RUN: | FileCheck %s --check-prefix=CHECK-SHADOWCALLSTACK-ANDROID-RISCV64 +// RUN: | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-ANDROID-RISCV64 // CHECK-SHADOWCALLSTACK-ANDROID-RISCV64-NOT: error: // RUN: %clang -fsanitize=shadow-call-stack -### %s 2>&1 \ // RUN: --target=riscv64-unknown-fuchsia -fuse-ld=ld \ -// RUN: | FileCheck --check-prefix=CHECK-SHADOWCALLSTACK-FUCHSIA-RISCV64 %s +// RUN: | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-FUCHSIA-RISCV64 // CHECK-SHADOWCALLSTACK-FUCHSIA-RISCV64-NOT: error: // RUN: %clang -fsanitize=shadow-call-stack -### %s 2>&1 \ // RUN: --target=aarch64-unknown-linux -fuse-ld=ld -ffixed-x18 \ -// RUN: | FileCheck --check-prefix=CHECK-SHADOWCALLSTACK-LINUX-AARCH64-X18 %s +// RUN: | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-LINUX-AARCH64-X18 // RUN: 
%clang -fsanitize=shadow-call-stack -### %s 2>&1 \ // RUN: --target=arm64-unknown-ios -fuse-ld=ld \ -// RUN: | FileCheck --check-prefix=CHECK-SHADOWCALLSTACK-LINUX-AARCH64-X18 %s +// RUN: | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-LINUX-AARCH64-X18 // RUN: %clang -fsanitize=shadow-call-stack -### %s 2>&1 \ // RUN: --target=aarch64-unknown-linux-android -fuse-ld=ld \ -// RUN: | FileCheck --check-prefix=CHECK-SHADOWCALLSTACK-LINUX-AARCH64-X18 %s +// RUN: | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-LINUX-AARCH64-X18 // CHECK-SHADOWCALLSTACK-LINUX-AARCH64-X18-NOT: error: // RUN: not %clang -fsanitize=shadow-call-stack -### %s 2>&1 \ // RUN: --target=x86-unknown-linux -fuse-ld=ld \ -// RUN: | FileCheck --check-prefix=CHECK-SHADOWCALLSTACK-LINUX-X86 %s +// RUN: | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-LINUX-X86 // CHECK-SHADOWCALLSTACK-LINUX-X86: error: unsupported option '-fsanitize=shadow-call-stack' for target 'x86-unknown-linux' // RUN: %clang -fsanitize=shadow-call-stack -### %s 2>&1 \ // RUN: -fsanitize=safe-stack --target=x86_64-unknown-linux -fuse-ld=ld \ -// RUN: | FileCheck --check-prefix=CHECK-SHADOWCALLSTACK-SAFESTACK %s +// RUN: | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-SAFESTACK // CHECK-SHADOWCALLSTACK-SAFESTACK-NOT: error: // RUN: not %clang -fsanitize=cfi -fsanitize-stats -### %s 2>&1 \ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-CFI-STATS-LINUX %s +// RUN: | %{filecheck} --check-prefix=CHECK-CFI-STATS-LINUX // CHECK-CFI-STATS-LINUX: "{{.*}}ld{{(.exe)?}}" // CHECK-CFI-STATS-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.stats_client.a" "--no-whole-archive" // CHECK-CFI-STATS-LINUX-NOT: "--whole-archive" @@ -924,7 +926,7 @@ // RUN: not %clang -fsanitize=cfi -fsanitize-stats -### %s 2>&1 \ // RUN: --target=x86_64-apple-darwin -fuse-ld=ld \ // RUN: 
--sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-CFI-STATS-DARWIN %s +// RUN: | %{filecheck} --check-prefix=CHECK-CFI-STATS-DARWIN // CHECK-CFI-STATS-DARWIN: "{{.*}}ld{{(.exe)?}}" // CHECK-CFI-STATS-DARWIN: "{{[^"]*}}libclang_rt.stats_client_osx.a" // CHECK-CFI-STATS-DARWIN: "{{[^"]*}}libclang_rt.stats_osx_dynamic.dylib" @@ -932,7 +934,7 @@ // RUN: not %clang -fsanitize=cfi -fsanitize-stats -### %s 2>&1 \ // RUN: --target=x86_64-pc-windows \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-CFI-STATS-WIN64 %s +// RUN: | %{filecheck} --check-prefix=CHECK-CFI-STATS-WIN64 // CHECK-CFI-STATS-WIN64: "--dependent-lib=clang_rt.stats_client{{(-x86_64)?}}.lib" // CHECK-CFI-STATS-WIN64: "--dependent-lib=clang_rt.stats{{(-x86_64)?}}.lib" // CHECK-CFI-STATS-WIN64: "--linker-option=/include:__sanitizer_stats_register" @@ -940,13 +942,13 @@ // RUN: not %clang -fsanitize=cfi -fsanitize-stats -### %s 2>&1 \ // RUN: --target=i686-pc-windows \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-CFI-STATS-WIN32 %s +// RUN: | %{filecheck} --check-prefix=CHECK-CFI-STATS-WIN32 // RUN: not %clang -fsanitize=cfi -fsanitize-stats -### %s 2>&1 \ // RUN: --target=i686-pc-windows \ // RUN: -fno-rtlib-defaultlib \ // RUN: -frtlib-defaultlib \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-CFI-STATS-WIN32 %s +// RUN: | %{filecheck} --check-prefix=CHECK-CFI-STATS-WIN32 // CHECK-CFI-STATS-WIN32: "--dependent-lib=clang_rt.stats_client{{(-i386)?}}.lib" // CHECK-CFI-STATS-WIN32: "--dependent-lib=clang_rt.stats{{(-i386)?}}.lib" // CHECK-CFI-STATS-WIN32: "--linker-option=/include:___sanitizer_stats_register" @@ -955,14 +957,14 @@ // RUN: --target=i686-pc-windows \ // RUN: -fno-rtlib-defaultlib \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-CFI-STATS-WIN32-NODEF %s +// RUN: | %{filecheck} 
--check-prefix=CHECK-CFI-STATS-WIN32-NODEF // CHECK-CFI-STATS-WIN32-NODEF-NOT: "--dependent-lib=clang_rt.stats_client{{(-i386)?}}.lib" // CHECK-CFI-STATS-WIN32-NODEF-NOT: "--dependent-lib=clang_rt.stats{{(-i386)?}}.lib" // RUN: %clang -### %s 2>&1 \ // RUN: --target=arm-linux-androideabi -fuse-ld=ld -fsanitize=safe-stack \ // RUN: --sysroot=%S/Inputs/basic_android_tree \ -// RUN: | FileCheck --check-prefix=CHECK-SAFESTACK-ANDROID-ARM %s +// RUN: | %{filecheck} --check-prefix=CHECK-SAFESTACK-ANDROID-ARM // // CHECK-SAFESTACK-ANDROID-ARM: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" // CHECK-SAFESTACK-ANDROID-ARM-NOT: libclang_rt.safestack @@ -970,7 +972,7 @@ // RUN: %clang -### %s -shared 2>&1 \ // RUN: --target=arm-linux-androideabi -fuse-ld=ld -fsanitize=safe-stack \ // RUN: --sysroot=%S/Inputs/basic_android_tree \ -// RUN: | FileCheck --check-prefix=CHECK-SAFESTACK-SHARED-ANDROID-ARM %s +// RUN: | %{filecheck} --check-prefix=CHECK-SAFESTACK-SHARED-ANDROID-ARM // // CHECK-SAFESTACK-SHARED-ANDROID-ARM: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" // CHECK-SAFESTACK-SHARED-ANDROID-ARM-NOT: libclang_rt.safestack @@ -978,7 +980,7 @@ // RUN: %clang -### %s 2>&1 \ // RUN: --target=aarch64-linux-android -fuse-ld=ld -fsanitize=safe-stack \ // RUN: --sysroot=%S/Inputs/basic_android_tree \ -// RUN: | FileCheck --check-prefix=CHECK-SAFESTACK-ANDROID-AARCH64 %s +// RUN: | %{filecheck} --check-prefix=CHECK-SAFESTACK-ANDROID-AARCH64 // // CHECK-SAFESTACK-ANDROID-AARCH64: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" // CHECK-SAFESTACK-ANDROID-AARCH64-NOT: libclang_rt.safestack @@ -986,7 +988,7 @@ // RUN: not %clang -fsanitize=undefined -### %s 2>&1 \ // RUN: --target=x86_64-scei-ps4 -fuse-ld=ld \ // RUN: -shared \ -// RUN: | FileCheck --check-prefix=CHECK-UBSAN-PS4 %s +// RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-PS4 // CHECK-UBSAN-PS4: --dependent-lib=libSceDbgUBSanitizer_stub_weak.a // CHECK-UBSAN-PS4: "{{.*}}ld{{(.gold)?(.exe)?}}" // CHECK-UBSAN-PS4: 
-lSceDbgUBSanitizer_stub_weak @@ -994,7 +996,7 @@ // RUN: not %clang -fsanitize=undefined -### %s 2>&1 \ // RUN: --target=x86_64-sie-ps5 -fuse-ld=ld \ // RUN: -shared \ -// RUN: | FileCheck --check-prefix=CHECK-UBSAN-PS5 %s +// RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-PS5 // CHECK-UBSAN-PS5: --dependent-lib=libSceUBSanitizer_nosubmission_stub_weak.a // CHECK-UBSAN-PS5: "{{.*}}ld{{(.gold)?(.exe)?}}" // CHECK-UBSAN-PS5: -lSceUBSanitizer_nosubmission_stub_weak @@ -1002,7 +1004,7 @@ // RUN: not %clang -fsanitize=address -### %s 2>&1 \ // RUN: --target=x86_64-scei-ps4 -fuse-ld=ld \ // RUN: -shared \ -// RUN: | FileCheck --check-prefix=CHECK-ASAN-PS4 %s +// RUN: | %{filecheck} --check-prefix=CHECK-ASAN-PS4 // CHECK-ASAN-PS4: --dependent-lib=libSceDbgAddressSanitizer_stub_weak.a // CHECK-ASAN-PS4: "{{.*}}ld{{(.gold)?(.exe)?}}" // CHECK-ASAN-PS4: -lSceDbgAddressSanitizer_stub_weak @@ -1010,7 +1012,7 @@ // RUN: not %clang -fsanitize=address -### %s 2>&1 \ // RUN: --target=x86_64-sie-ps5 -fuse-ld=ld \ // RUN: -shared \ -// RUN: | FileCheck --check-prefix=CHECK-ASAN-PS5 %s +// RUN: | %{filecheck} --check-prefix=CHECK-ASAN-PS5 // CHECK-ASAN-PS5: --dependent-lib=libSceAddressSanitizer_nosubmission_stub_weak.a // CHECK-ASAN-PS5: "{{.*}}ld{{(.gold)?(.exe)?}}" // CHECK-ASAN-PS5: -lSceAddressSanitizer_nosubmission_stub_weak @@ -1018,7 +1020,7 @@ // RUN: not %clang -fsanitize=address,undefined -### %s 2>&1 \ // RUN: --target=x86_64-scei-ps4 -fuse-ld=ld \ // RUN: -shared \ -// RUN: | FileCheck --check-prefix=CHECK-AUBSAN-PS4 %s +// RUN: | %{filecheck} --check-prefix=CHECK-AUBSAN-PS4 // CHECK-AUBSAN-PS4-NOT: --dependent-lib=libSceDbgUBSanitizer_stub_weak.a // CHECK-AUBSAN-PS4: --dependent-lib=libSceDbgAddressSanitizer_stub_weak.a // CHECK-AUBSAN-PS4-NOT: --dependent-lib=libSceDbgUBSanitizer_stub_weak.a @@ -1028,7 +1030,7 @@ // RUN: not %clang -fsanitize=address,undefined -### %s 2>&1 \ // RUN: --target=x86_64-sie-ps5 -fuse-ld=ld \ // RUN: -shared \ -// RUN: | FileCheck 
--check-prefix=CHECK-AUBSAN-PS5 %s +// RUN: | %{filecheck} --check-prefix=CHECK-AUBSAN-PS5 // CHECK-AUBSAN-PS5-NOT: --dependent-lib=libSceUBSanitizer_nosubmission_stub_weak.a // CHECK-AUBSAN-PS5: --dependent-lib=libSceAddressSanitizer_nosubmission_stub_weak.a // CHECK-AUBSAN-PS5-NOT: --dependent-lib=libSceUBSanitizer_nosubmission_stub_weak.a @@ -1039,21 +1041,21 @@ // RUN: --target=x86_64-scei-ps4 -fuse-ld=ld \ // RUN: -shared \ // RUN: -nostdlib \ -// RUN: | FileCheck --check-prefix=CHECK-NOLIB-PS4 %s +// RUN: | %{filecheck} --check-prefix=CHECK-NOLIB-PS4 // CHECK-NOLIB-PS4-NOT: SceDbgAddressSanitizer_stub_weak // RUN: not %clang -fsanitize=address,undefined -### %s 2>&1 \ // RUN: --target=x86_64-sie-ps5 -fuse-ld=ld \ // RUN: -shared \ // RUN: -nostdlib \ -// RUN: | FileCheck --check-prefix=CHECK-NOLIB-PS5 %s +// RUN: | %{filecheck} --check-prefix=CHECK-NOLIB-PS5 // CHECK-NOLIB-PS5-NOT: SceAddressSanitizer_nosubmission_stub_weak // RUN: %clang -fsanitize=scudo -### %s 2>&1 \ // RUN: --target=i386-unknown-linux -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-SCUDO-LINUX %s +// RUN: | %{filecheck} --check-prefix=CHECK-SCUDO-LINUX // CHECK-SCUDO-LINUX: "{{.*}}ld{{(.exe)?}}" // CHECK-SCUDO-LINUX: "--whole-archive" "{{.*}}libclang_rt.scudo_standalone.a" "--no-whole-archive" // CHECK-SCUDO-LINUX-NOT: "-lstdc++" @@ -1065,7 +1067,7 @@ // RUN: --target=i386-unknown-linux -fuse-ld=ld -fsanitize=scudo -shared-libsan \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-SCUDO-SHARED-LINUX %s +// RUN: | %{filecheck} --check-prefix=CHECK-SCUDO-SHARED-LINUX // // CHECK-SCUDO-SHARED-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-SCUDO-SHARED-LINUX-NOT: "-lc" @@ -1081,7 +1083,7 @@ // RUN: %clang -### %s 2>&1 \ // RUN: --target=arm-linux-androideabi -fuse-ld=ld -fsanitize=scudo \ // RUN: 
--sysroot=%S/Inputs/basic_android_tree/sysroot \ -// RUN: | FileCheck --check-prefix=CHECK-SCUDO-ANDROID %s +// RUN: | %{filecheck} --check-prefix=CHECK-SCUDO-ANDROID // // CHECK-SCUDO-ANDROID: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" // CHECK-SCUDO-ANDROID-NOT: "-lc" @@ -1096,7 +1098,7 @@ // RUN: --target=arm-linux-androideabi -fuse-ld=ld -fsanitize=scudo \ // RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ // RUN: -static-libsan \ -// RUN: | FileCheck --check-prefix=CHECK-SCUDO-ANDROID-STATIC %s +// RUN: | %{filecheck} --check-prefix=CHECK-SCUDO-ANDROID-STATIC // CHECK-SCUDO-ANDROID-STATIC: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" // CHECK-SCUDO-ANDROID-STATIC: "-pie" // CHECK-SCUDO-ANDROID-STATIC: "--whole-archive" "{{.*}}libclang_rt.scudo_standalone.a" "--no-whole-archive" @@ -1109,7 +1111,7 @@ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld -fsanitize=hwaddress \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-HWASAN-X86-64-LINUX %s +// RUN: | %{filecheck} --check-prefix=CHECK-HWASAN-X86-64-LINUX // // CHECK-HWASAN-X86-64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-HWASAN-X86-64-LINUX-NOT: "-lc" @@ -1126,7 +1128,7 @@ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld -fsanitize=hwaddress \ // RUN: -shared-libsan -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-SHARED-HWASAN-X86-64-LINUX %s +// RUN: | %{filecheck} --check-prefix=CHECK-SHARED-HWASAN-X86-64-LINUX // // CHECK-SHARED-HWASAN-X86-64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-SHARED-HWASAN-X86-64-LINUX-NOT: "-lc" @@ -1142,7 +1144,7 @@ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld -fsanitize=hwaddress \ // RUN: -shared-libsan -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-DSO-SHARED-HWASAN-X86-64-LINUX %s +// RUN: | 
%{filecheck} --check-prefix=CHECK-DSO-SHARED-HWASAN-X86-64-LINUX // // CHECK-DSO-SHARED-HWASAN-X86-64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-DSO-SHARED-HWASAN-X86-64-LINUX-NOT: "-lc" @@ -1158,7 +1160,7 @@ // RUN: --target=aarch64-unknown-linux -fuse-ld=ld -fsanitize=hwaddress \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-HWASAN-AARCH64-LINUX %s +// RUN: | %{filecheck} --check-prefix=CHECK-HWASAN-AARCH64-LINUX // // CHECK-HWASAN-AARCH64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-HWASAN-AARCH64-LINUX-NOT: "-lc" @@ -1176,7 +1178,7 @@ // RUN: -shared-libsan \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-SHARED-HWASAN-AARCH64-LINUX %s +// RUN: | %{filecheck} --check-prefix=CHECK-SHARED-HWASAN-AARCH64-LINUX // // CHECK-SHARED-HWASAN-AARCH64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-SHARED-HWASAN-AARCH64-LINUX-NOT: "-lc" @@ -1192,7 +1194,7 @@ // RUN: --target=aarch64-unknown-linux -fuse-ld=ld -fsanitize=hwaddress \ // RUN: -shared-libsan -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX %s +// RUN: | %{filecheck} --check-prefix=CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX // // CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX-NOT: "-lc" From 377755c87e9d5494237f0e2e88f70886b5107342 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 26 Dec 2024 12:46:11 -0800 Subject: [PATCH 088/567] [nfc][Driver] Remove {{(.exe)?}} from sanitizer test (#121160) These are not Windows tests --- clang/test/Driver/sanitizer-ld.c | 132 +++++++++++++++---------------- 1 file changed, 66 insertions(+), 66 deletions(-) diff --git a/clang/test/Driver/sanitizer-ld.c b/clang/test/Driver/sanitizer-ld.c 
index 0faa582c081c5..763230568251c 100644 --- a/clang/test/Driver/sanitizer-ld.c +++ b/clang/test/Driver/sanitizer-ld.c @@ -8,7 +8,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-LINUX // -// CHECK-ASAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-ASAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-ASAN-LINUX-NOT: "-lc" // CHECK-ASAN-LINUX: libclang_rt.asan.a" // CHECK-ASAN-LINUX-NOT: "--export-dynamic" @@ -74,7 +74,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-SHARED-ASAN-LINUX // -// CHECK-SHARED-ASAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-SHARED-ASAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-SHARED-ASAN-LINUX-NOT: "-lc" // CHECK-SHARED-ASAN-LINUX-NOT: libclang_rt.asan.a" // CHECK-SHARED-ASAN-LINUX: libclang_rt.asan.so" @@ -92,7 +92,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-DSO-SHARED-ASAN-LINUX // -// CHECK-DSO-SHARED-ASAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-DSO-SHARED-ASAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-DSO-SHARED-ASAN-LINUX-NOT: "-lc" // CHECK-DSO-SHARED-ASAN-LINUX-NOT: libclang_rt.asan.a" // CHECK-DSO-SHARED-ASAN-LINUX-NOT: "libclang_rt.asan-preinit.a" @@ -110,7 +110,7 @@ // RUN: --sysroot=%S/Inputs/basic_freebsd_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-FREEBSD // -// CHECK-ASAN-FREEBSD: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-ASAN-FREEBSD: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-ASAN-FREEBSD-NOT: "-lc" // CHECK-ASAN-FREEBSD-NOT: libclang_rt.asan_cxx // CHECK-ASAN-FREEBSD: freebsd{{/|\\+}}libclang_rt.asan.a" @@ -126,7 +126,7 @@ // RUN: --sysroot=%S/Inputs/basic_freebsd_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-FREEBSD-LDL // -// CHECK-ASAN-FREEBSD-LDL: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-ASAN-FREEBSD-LDL: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-ASAN-FREEBSD-LDL-NOT: "-ldl" // RUN: %clangxx 
-### %s 2>&1 \ @@ -142,7 +142,7 @@ // RUN: -fsanitize-link-c++-runtime \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-LINUX-CXX -// CHECK-ASAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-ASAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-ASAN-LINUX-CXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" // CHECK-ASAN-LINUX-CXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan_cxx.a" "--no-whole-archive" // CHECK-ASAN-LINUX-CXX-NOT: "--dynamic-list" @@ -161,7 +161,7 @@ // RUN: -fno-sanitize-link-c++-runtime \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-LINUX-CNOCXX -// CHECK-ASAN-LINUX-CNOCXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-ASAN-LINUX-CNOCXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-ASAN-LINUX-CNOCXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" // CHECK-ASAN-LINUX-CNOCXX-NOT: libclang_rt.asan_cxx // CHECK-ASAN-LINUX-CNOCXX-SAME: "--export-dynamic" @@ -179,7 +179,7 @@ // RUN: -fno-sanitize-link-c++-runtime \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-LINUX-NOCXX -// CHECK-ASAN-LINUX-NOCXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-ASAN-LINUX-NOCXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-ASAN-LINUX-NOCXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" // CHECK-ASAN-LINUX-NOCXX-NOT: libclang_rt.asan_cxx // CHECK-ASAN-LINUX-NOCXX-SAME: "--export-dynamic" @@ -197,7 +197,7 @@ // RUN: -nostdlib++ \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-LINUX-NOSTDCXX -// CHECK-ASAN-LINUX-NOSTDCXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-ASAN-LINUX-NOSTDCXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-ASAN-LINUX-NOSTDCXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" // CHECK-ASAN-LINUX-NOSTDCXX-SAME: libclang_rt.asan_cxx // CHECK-ASAN-LINUX-NOSTDCXX-SAME: "--export-dynamic" @@ -213,7 +213,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree -lstdc++ -static 2>&1 \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-LINUX-CXX-STATIC 
// -// CHECK-ASAN-LINUX-CXX-STATIC: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-ASAN-LINUX-CXX-STATIC: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-ASAN-LINUX-CXX-STATIC-NOT: stdc++ // CHECK-ASAN-LINUX-CXX-STATIC: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" // CHECK-ASAN-LINUX-CXX-STATIC: stdc++ @@ -223,7 +223,7 @@ // RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-ARM // -// CHECK-ASAN-ARM: "{{(.*[^.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-ASAN-ARM: "{{(.*[^.0-9A-Z_a-z])?}}ld" // CHECK-ASAN-ARM-NOT: "-lc" // CHECK-ASAN-ARM: libclang_rt.asan.a" // @@ -232,7 +232,7 @@ // RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-ARMv7 // -// CHECK-ASAN-ARMv7: "{{(.*[^.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-ASAN-ARMv7: "{{(.*[^.0-9A-Z_a-z])?}}ld" // CHECK-ASAN-ARMv7-NOT: "-lc" // CHECK-ASAN-ARMv7: libclang_rt.asan.a" @@ -242,7 +242,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-ANDROID // -// CHECK-ASAN-ANDROID: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" +// CHECK-ASAN-ANDROID: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld" // CHECK-ASAN-ANDROID: "-pie" // CHECK-ASAN-ANDROID-NOT: "-lc" // CHECK-ASAN-ANDROID-NOT: "-lpthread" @@ -265,7 +265,7 @@ // RUN: -static-libasan \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-ANDROID-STATICLIBASAN // -// CHECK-ASAN-ANDROID-STATICLIBASAN: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" +// CHECK-ASAN-ANDROID-STATICLIBASAN: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld" // CHECK-ASAN-ANDROID-STATICLIBASAN: libclang_rt.asan.a" // CHECK-ASAN-ANDROID-STATICLIBASAN-NOT: "-lpthread" // CHECK-ASAN-ANDROID-STATICLIBASAN-NOT: "-lrt" @@ -277,7 +277,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-ANDROID // -// CHECK-UBSAN-ANDROID: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" +// CHECK-UBSAN-ANDROID: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld" // 
CHECK-UBSAN-ANDROID: "-pie" // CHECK-UBSAN-ANDROID-NOT: "-lc" // CHECK-UBSAN-ANDROID-NOT: "-lpthread" @@ -293,7 +293,7 @@ // RUN: -static-libsan \ // RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-ANDROID-STATICLIBASAN // -// CHECK-UBSAN-ANDROID-STATICLIBASAN: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" +// CHECK-UBSAN-ANDROID-STATICLIBASAN: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld" // CHECK-UBSAN-ANDROID-STATICLIBASAN: libclang_rt.ubsan_standalone.a" // CHECK-UBSAN-ANDROID-STATICLIBASAN-NOT: "-lpthread" // CHECK-UBSAN-ANDROID-STATICLIBASAN-NOT: "-lrt" @@ -306,7 +306,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-ANDROID-X86 // -// CHECK-ASAN-ANDROID-X86: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" +// CHECK-ASAN-ANDROID-X86: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld" // CHECK-ASAN-ANDROID-X86: "-pie" // CHECK-ASAN-ANDROID-X86-NOT: "-lc" // CHECK-ASAN-ANDROID-X86-NOT: "-lpthread" @@ -330,7 +330,7 @@ // RUN: -shared \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-ANDROID-SHARED // -// CHECK-ASAN-ANDROID-SHARED: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" +// CHECK-ASAN-ANDROID-SHARED: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld" // CHECK-ASAN-ANDROID-SHARED-NOT: "-lc" // CHECK-ASAN-ANDROID-SHARED: libclang_rt.asan.so" // CHECK-ASAN-ANDROID-SHARED-NOT: "-lpthread" @@ -344,7 +344,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-TYSAN-LINUX-CXX // -// CHECK-TYSAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-TYSAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-TYSAN-LINUX-CXX-NOT: stdc++ // CHECK-TYSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.tysan{{[^.]*}}.a" "--no-whole-archive" // CHECK-TYSAN-LINUX-CXX: stdc++ @@ -355,7 +355,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-TYSAN-DARWIN-CXX -// CHECK-TYSAN-DARWIN-CXX: "{{.*}}ld{{(.exe)?}}" +// CHECK-TYSAN-DARWIN-CXX: "{{.*}}ld" // 
CHECK-TYSAN-DARWIN-CXX: libclang_rt.tysan_osx_dynamic.dylib // CHECK-TYSAN-DARWIN-CXX-NOT: -lc++abi @@ -366,7 +366,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-TSAN-LINUX-CXX // -// CHECK-TSAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-TSAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-TSAN-LINUX-CXX-NOT: stdc++ // CHECK-TSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.tsan.a" "--no-whole-archive" // CHECK-TSAN-LINUX-CXX: "--dynamic-list={{.*}}libclang_rt.tsan.a.syms" @@ -402,7 +402,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-MSAN-LINUX-CXX // -// CHECK-MSAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-MSAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-MSAN-LINUX-CXX-NOT: stdc++ // CHECK-MSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.msan.a" "--no-whole-archive" // CHECK-MSAN-LINUX-CXX: "--dynamic-list={{.*}}libclang_rt.msan.a.syms" @@ -442,7 +442,7 @@ // RUN: -static-libsan \ // RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-LINUX -// CHECK-UBSAN-LINUX: "{{.*}}ld{{(.exe)?}}" +// CHECK-UBSAN-LINUX: "{{.*}}ld" // CHECK-UBSAN-LINUX-NOT: libclang_rt.asan // CHECK-UBSAN-LINUX-NOT: libclang_rt.ubsan_standalone_cxx // CHECK-UBSAN-LINUX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive" @@ -497,7 +497,7 @@ // RUN: -shared -shared-libsan \ // RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-LINUX-SHAREDLIBASAN -// CHECK-UBSAN-LINUX-SHAREDLIBASAN: "{{.*}}ld{{(.exe)?}}" +// CHECK-UBSAN-LINUX-SHAREDLIBASAN: "{{.*}}ld" // CHECK-UBSAN-LINUX-SHAREDLIBASAN: "{{.*}}libclang_rt.ubsan_standalone.so{{.*}}" // RUN: %clang -fsanitize=undefined -fsanitize-link-c++-runtime -### %s 2>&1 \ @@ -514,7 +514,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-LINUX-CXX -// CHECK-UBSAN-LINUX-CXX: "{{.*}}ld{{(.exe)?}}" +// 
CHECK-UBSAN-LINUX-CXX: "{{.*}}ld" // CHECK-UBSAN-LINUX-CXX-NOT: libclang_rt.asan // CHECK-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive" // CHECK-UBSAN-LINUX-CXX-NOT: libclang_rt.asan @@ -530,7 +530,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-MINIMAL-LINUX -// CHECK-UBSAN-MINIMAL-LINUX: "{{.*}}ld{{(.exe)?}}" +// CHECK-UBSAN-MINIMAL-LINUX: "{{.*}}ld" // CHECK-UBSAN-MINIMAL-LINUX: "--whole-archive" "{{.*}}libclang_rt.ubsan_minimal.a" "--no-whole-archive" // CHECK-UBSAN-MINIMAL-LINUX: "-lpthread" // CHECK-UBSAN-MINIMAL-LINUX: "-lresolv" @@ -539,7 +539,7 @@ // RUN: --target=x86_64-apple-darwin -fuse-ld=ld \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-MINIMAL-DARWIN -// CHECK-UBSAN-MINIMAL-DARWIN: "{{.*}}ld{{(.exe)?}}" +// CHECK-UBSAN-MINIMAL-DARWIN: "{{.*}}ld" // CHECK-UBSAN-MINIMAL-DARWIN: "{{.*}}libclang_rt.ubsan_minimal_osx_dynamic.dylib" // RUN: not %clang -fsanitize=undefined -### %s 2>&1 \ @@ -565,7 +565,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-UBSAN-LINUX -// CHECK-ASAN-UBSAN-LINUX: "{{.*}}ld{{(.exe)?}}" +// CHECK-ASAN-UBSAN-LINUX: "{{.*}}ld" // CHECK-ASAN-UBSAN-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" // CHECK-ASAN-UBSAN-LINUX-NOT: libclang_rt.ubsan // CHECK-ASAN-UBSAN-LINUX-NOT: "-lstdc++" @@ -577,7 +577,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-UBSAN-LINUX-CXX -// CHECK-ASAN-UBSAN-LINUX-CXX: "{{.*}}ld{{(.exe)?}}" +// CHECK-ASAN-UBSAN-LINUX-CXX: "{{.*}}ld" // CHECK-ASAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" // CHECK-ASAN-UBSAN-LINUX-CXX: "--whole-archive" 
"{{.*}}libclang_rt.asan_cxx.a" "--no-whole-archive" // CHECK-ASAN-UBSAN-LINUX-CXX-NOT: libclang_rt.ubsan @@ -592,7 +592,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX -// CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX: "{{.*}}ld{{(.exe)?}}" +// CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX: "{{.*}}ld" // CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" // CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan_cxx.a" "--no-whole-archive" // CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX-NOT: libclang_rt.ubsan @@ -605,7 +605,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-MSAN-UBSAN-LINUX-CXX -// CHECK-MSAN-UBSAN-LINUX-CXX: "{{.*}}ld{{(.exe)?}}" +// CHECK-MSAN-UBSAN-LINUX-CXX: "{{.*}}ld" // CHECK-MSAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.msan.a" "--no-whole-archive" // CHECK-MSAN-UBSAN-LINUX-CXX-NOT: libclang_rt.ubsan // CHECK-MSAN-UBSAN-LINUX-CXX: libclang_rt.ubsan_standalone_cxx @@ -616,7 +616,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-TSAN-UBSAN-LINUX-CXX -// CHECK-TSAN-UBSAN-LINUX-CXX: "{{.*}}ld{{(.exe)?}}" +// CHECK-TSAN-UBSAN-LINUX-CXX: "{{.*}}ld" // CHECK-TSAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.tsan.a" "--no-whole-archive" // CHECK-TSAN-UBSAN-LINUX-CXX-NOT: libclang_rt.ubsan // CHECK-TSAN-UBSAN-LINUX-CXX: libclang_rt.ubsan_standalone_cxx @@ -628,7 +628,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: -shared \ // RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-LINUX-SHARED -// CHECK-UBSAN-LINUX-SHARED: "{{.*}}ld{{(.exe)?}}" +// CHECK-UBSAN-LINUX-SHARED: "{{.*}}ld" // CHECK-UBSAN-LINUX-SHARED-NOT: --export-dynamic // CHECK-UBSAN-LINUX-SHARED-NOT: 
--dynamic-list // CHECK-UBSAN-LINUX-SHARED-NOT: libclang_rt.ubsan @@ -639,7 +639,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-LSAN-LINUX // -// CHECK-LSAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-LSAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-LSAN-LINUX-NOT: "-lc" // CHECK-LSAN-LINUX-NOT: libclang_rt.ubsan // CHECK-LSAN-LINUX: libclang_rt.lsan.a" @@ -661,7 +661,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-LSAN-COV-LINUX // -// CHECK-LSAN-COV-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-LSAN-COV-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-LSAN-COV-LINUX-NOT: "-lc" // CHECK-LSAN-COV-LINUX-NOT: libclang_rt.ubsan // CHECK-LSAV-COV-LINUX: libclang_rt.lsan-x86_64.a" @@ -675,7 +675,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-LSAN-ASAN-LINUX -// CHECK-LSAN-ASAN-LINUX: "{{.*}}ld{{(.exe)?}}" +// CHECK-LSAN-ASAN-LINUX: "{{.*}}ld" // CHECK-LSAN-ASAN-LINUX-NOT: libclang_rt.lsan // CHECK-LSAN-ASAN-LINUX: libclang_rt.asan // CHECK-LSAN-ASAN-LINUX-NOT: libclang_rt.lsan @@ -685,7 +685,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-COV-LINUX -// CHECK-ASAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}" +// CHECK-ASAN-COV-LINUX: "{{.*}}ld" // CHECK-ASAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" // CHECK-ASAN-COV-LINUX-NOT: libclang_rt.ubsan // CHECK-ASAN-COV-LINUX-NOT: "-lstdc++" @@ -697,7 +697,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-MSAN-COV-LINUX -// CHECK-MSAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}" +// CHECK-MSAN-COV-LINUX: "{{.*}}ld" // CHECK-MSAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.msan.a" "--no-whole-archive" // 
CHECK-MSAN-COV-LINUX-NOT: libclang_rt.ubsan // CHECK-MSAN-COV-LINUX-NOT: "-lstdc++" @@ -709,7 +709,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-DFSAN-COV-LINUX -// CHECK-DFSAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}" +// CHECK-DFSAN-COV-LINUX: "{{.*}}ld" // CHECK-DFSAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.dfsan.a" "--no-whole-archive" // CHECK-DFSAN-COV-LINUX-NOT: libclang_rt.ubsan // CHECK-DFSAN-COV-LINUX-NOT: "-lstdc++" @@ -721,7 +721,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-COV-LINUX -// CHECK-UBSAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}" +// CHECK-UBSAN-COV-LINUX: "{{.*}}ld" // CHECK-UBSAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive" // CHECK-UBSAN-COV-LINUX-NOT: "-lstdc++" // CHECK-UBSAN-COV-LINUX: "-lpthread" @@ -732,7 +732,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-COV-LINUX -// CHECK-COV-LINUX: "{{.*}}ld{{(.exe)?}}" +// CHECK-COV-LINUX: "{{.*}}ld" // CHECK-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive" // CHECK-COV-LINUX-NOT: "-lstdc++" // CHECK-COV-LINUX: "-lpthread" @@ -744,7 +744,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-NSAN-LINUX // -// CHECK-NSAN-LINUX: "{{.*}}ld{{(.exe)?}}" +// CHECK-NSAN-LINUX: "{{.*}}ld" // CHECK-NSAN-LINUX-NOT: "-lc" // CHECK-NSAN-LINUX-NOT: libclang_rt.ubsan // CHECK-NSAN-LINUX: libclang_rt.nsan.a" @@ -774,7 +774,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-CFI-LINUX -// CHECK-CFI-LINUX: "{{.*}}ld{{(.exe)?}}" +// CHECK-CFI-LINUX: "{{.*}}ld" // CHECK-CFI-LINUX-NOT: libclang_rt. 
// CFI with diagnostics links the UBSan runtime. @@ -784,7 +784,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-CFI-DIAG-LINUX -// CHECK-CFI-DIAG-LINUX: "{{.*}}ld{{(.exe)?}}" +// CHECK-CFI-DIAG-LINUX: "{{.*}}ld" // CHECK-CFI-DIAG-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive" // Cross-DSO CFI links the CFI runtime. @@ -793,7 +793,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-CFI-CROSS-DSO-LINUX -// CHECK-CFI-CROSS-DSO-LINUX: "{{.*}}ld{{(.exe)?}}" +// CHECK-CFI-CROSS-DSO-LINUX: "{{.*}}ld" // CHECK-CFI-CROSS-DSO-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.cfi.a" "--no-whole-archive" // CHECK-CFI-CROSS-DSO-LINUX: -export-dynamic @@ -804,7 +804,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-CFI-CROSS-DSO-DIAG-LINUX -// CHECK-CFI-CROSS-DSO-DIAG-LINUX: "{{.*}}ld{{(.exe)?}}" +// CHECK-CFI-CROSS-DSO-DIAG-LINUX: "{{.*}}ld" // CHECK-CFI-CROSS-DSO-DIAG-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.cfi_diag.a" "--no-whole-archive" // CHECK-CFI-CROSS-DSO-DIAG-LINUX: -export-dynamic @@ -814,7 +814,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_android_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-CFI-CROSS-DSO-ANDROID -// CHECK-CFI-CROSS-DSO-ANDROID: "{{.*}}ld{{(.exe)?}}" +// CHECK-CFI-CROSS-DSO-ANDROID: "{{.*}}ld" // CHECK-CFI-CROSS-DSO-ANDROID-NOT: libclang_rt.cfi // Cross-DSO CFI with diagnostics on Android links just the UBSAN runtime. 
@@ -824,7 +824,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_android_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-CFI-CROSS-DSO-DIAG-ANDROID -// CHECK-CFI-CROSS-DSO-DIAG-ANDROID: "{{.*}}ld{{(.exe)?}}" +// CHECK-CFI-CROSS-DSO-DIAG-ANDROID: "{{.*}}ld" // CHECK-CFI-CROSS-DSO-DIAG-ANDROID: "{{[^"]*}}libclang_rt.ubsan_standalone.so" // CHECK-CFI-CROSS-DSO-DIAG-ANDROID: "--export-dynamic-symbol=__cfi_check" @@ -834,7 +834,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-DARWIN106-CXX -// CHECK-ASAN-DARWIN106-CXX: "{{.*}}ld{{(.exe)?}}" +// CHECK-ASAN-DARWIN106-CXX: "{{.*}}ld" // CHECK-ASAN-DARWIN106-CXX: libclang_rt.asan_osx_dynamic.dylib // CHECK-ASAN-DARWIN106-CXX-NOT: -lc++abi @@ -844,7 +844,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-LSAN-DARWIN106-CXX -// CHECK-LSAN-DARWIN106-CXX: "{{.*}}ld{{(.exe)?}}" +// CHECK-LSAN-DARWIN106-CXX: "{{.*}}ld" // CHECK-LSAN-DARWIN106-CXX: libclang_rt.lsan_osx_dynamic.dylib // CHECK-LSAN-DARWIN106-CXX-NOT: -lc++abi @@ -854,7 +854,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-SAFESTACK-LINUX // -// CHECK-SAFESTACK-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-SAFESTACK-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-SAFESTACK-LINUX-NOT: "-lc" // CHECK-SAFESTACK-LINUX-NOT: whole-archive // CHECK-SAFESTACK-LINUX: "-u" "__safestack_init" @@ -918,7 +918,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-CFI-STATS-LINUX -// CHECK-CFI-STATS-LINUX: "{{.*}}ld{{(.exe)?}}" +// CHECK-CFI-STATS-LINUX: "{{.*}}ld" // CHECK-CFI-STATS-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.stats_client.a" "--no-whole-archive" // CHECK-CFI-STATS-LINUX-NOT: "--whole-archive" 
// CHECK-CFI-STATS-LINUX: "{{[^"]*}}libclang_rt.stats.a" @@ -927,7 +927,7 @@ // RUN: --target=x86_64-apple-darwin -fuse-ld=ld \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-CFI-STATS-DARWIN -// CHECK-CFI-STATS-DARWIN: "{{.*}}ld{{(.exe)?}}" +// CHECK-CFI-STATS-DARWIN: "{{.*}}ld" // CHECK-CFI-STATS-DARWIN: "{{[^"]*}}libclang_rt.stats_client_osx.a" // CHECK-CFI-STATS-DARWIN: "{{[^"]*}}libclang_rt.stats_osx_dynamic.dylib" @@ -966,7 +966,7 @@ // RUN: --sysroot=%S/Inputs/basic_android_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-SAFESTACK-ANDROID-ARM // -// CHECK-SAFESTACK-ANDROID-ARM: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" +// CHECK-SAFESTACK-ANDROID-ARM: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld" // CHECK-SAFESTACK-ANDROID-ARM-NOT: libclang_rt.safestack // RUN: %clang -### %s -shared 2>&1 \ @@ -974,7 +974,7 @@ // RUN: --sysroot=%S/Inputs/basic_android_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-SAFESTACK-SHARED-ANDROID-ARM // -// CHECK-SAFESTACK-SHARED-ANDROID-ARM: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" +// CHECK-SAFESTACK-SHARED-ANDROID-ARM: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld" // CHECK-SAFESTACK-SHARED-ANDROID-ARM-NOT: libclang_rt.safestack // RUN: %clang -### %s 2>&1 \ @@ -982,7 +982,7 @@ // RUN: --sysroot=%S/Inputs/basic_android_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-SAFESTACK-ANDROID-AARCH64 // -// CHECK-SAFESTACK-ANDROID-AARCH64: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" +// CHECK-SAFESTACK-ANDROID-AARCH64: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld" // CHECK-SAFESTACK-ANDROID-AARCH64-NOT: libclang_rt.safestack // RUN: not %clang -fsanitize=undefined -### %s 2>&1 \ @@ -1056,7 +1056,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-SCUDO-LINUX -// CHECK-SCUDO-LINUX: "{{.*}}ld{{(.exe)?}}" +// CHECK-SCUDO-LINUX: "{{.*}}ld" // CHECK-SCUDO-LINUX: "--whole-archive" "{{.*}}libclang_rt.scudo_standalone.a" 
"--no-whole-archive" // CHECK-SCUDO-LINUX-NOT: "-lstdc++" // CHECK-SCUDO-LINUX: "-lpthread" @@ -1069,7 +1069,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-SCUDO-SHARED-LINUX // -// CHECK-SCUDO-SHARED-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-SCUDO-SHARED-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-SCUDO-SHARED-LINUX-NOT: "-lc" // CHECK-SCUDO-SHARED-LINUX-NOT: libclang_rt.scudo_standalone.a" // CHECK-SCUDO-SHARED-LINUX: libclang_rt.scudo_standalone.so" @@ -1085,7 +1085,7 @@ // RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ // RUN: | %{filecheck} --check-prefix=CHECK-SCUDO-ANDROID // -// CHECK-SCUDO-ANDROID: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" +// CHECK-SCUDO-ANDROID: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld" // CHECK-SCUDO-ANDROID-NOT: "-lc" // CHECK-SCUDO-ANDROID: "-pie" // CHECK-SCUDO-ANDROID-NOT: "-lpthread" @@ -1099,7 +1099,7 @@ // RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ // RUN: -static-libsan \ // RUN: | %{filecheck} --check-prefix=CHECK-SCUDO-ANDROID-STATIC -// CHECK-SCUDO-ANDROID-STATIC: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" +// CHECK-SCUDO-ANDROID-STATIC: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld" // CHECK-SCUDO-ANDROID-STATIC: "-pie" // CHECK-SCUDO-ANDROID-STATIC: "--whole-archive" "{{.*}}libclang_rt.scudo_standalone.a" "--no-whole-archive" // CHECK-SCUDO-ANDROID-STATIC-NOT: "-lstdc++" @@ -1113,7 +1113,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-HWASAN-X86-64-LINUX // -// CHECK-HWASAN-X86-64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-HWASAN-X86-64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-HWASAN-X86-64-LINUX-NOT: "-lc" // CHECK-HWASAN-X86-64-LINUX: libclang_rt.hwasan.a" // CHECK-HWASAN-X86-64-LINUX-NOT: "--export-dynamic" @@ -1130,7 +1130,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-SHARED-HWASAN-X86-64-LINUX // -// CHECK-SHARED-HWASAN-X86-64-LINUX: 
"{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-SHARED-HWASAN-X86-64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-SHARED-HWASAN-X86-64-LINUX-NOT: "-lc" // CHECK-SHARED-HWASAN-X86-64-LINUX: libclang_rt.hwasan.so" // CHECK-SHARED-HWASAN-X86-64-LINUX-NOT: "-lpthread" @@ -1146,7 +1146,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-DSO-SHARED-HWASAN-X86-64-LINUX // -// CHECK-DSO-SHARED-HWASAN-X86-64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-DSO-SHARED-HWASAN-X86-64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-DSO-SHARED-HWASAN-X86-64-LINUX-NOT: "-lc" // CHECK-DSO-SHARED-HWASAN-X86-64-LINUX: libclang_rt.hwasan.so" // CHECK-DSO-SHARED-HWASAN-X86-64-LINUX-NOT: "-lpthread" @@ -1162,7 +1162,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-HWASAN-AARCH64-LINUX // -// CHECK-HWASAN-AARCH64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-HWASAN-AARCH64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-HWASAN-AARCH64-LINUX-NOT: "-lc" // CHECK-HWASAN-AARCH64-LINUX: libclang_rt.hwasan.a" // CHECK-HWASAN-AARCH64-LINUX-NOT: "--export-dynamic" @@ -1180,7 +1180,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-SHARED-HWASAN-AARCH64-LINUX // -// CHECK-SHARED-HWASAN-AARCH64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-SHARED-HWASAN-AARCH64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-SHARED-HWASAN-AARCH64-LINUX-NOT: "-lc" // CHECK-SHARED-HWASAN-AARCH64-LINUX: libclang_rt.hwasan.so" // CHECK-SHARED-HWASAN-AARCH64-LINUX-NOT: "-lpthread" @@ -1196,7 +1196,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX // -// CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX-NOT: "-lc" // 
CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX: libclang_rt.hwasan.so" // CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX-NOT: "-lpthread" From 2dfe1b404213c6676b9ac55cb89c0a709a712208 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 26 Dec 2024 21:26:17 +0000 Subject: [PATCH 089/567] [VPlan] Remove stray space when printing reverse vector pointer. printFlags() takes care of printing the required space, remove the extra printed space between flags and operands. --- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 36a5d3be113ba..86262e6b9f94e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2061,7 +2061,6 @@ void VPReverseVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent, printAsOperand(O, SlotTracker); O << " = reverse-vector-pointer"; printFlags(O); - O << " "; printOperands(O, SlotTracker); } #endif From 7ecbeace0192963482beb6520706ef98ae4d8c0d Mon Sep 17 00:00:00 2001 From: hill Date: Thu, 26 Dec 2024 21:44:14 +0000 Subject: [PATCH 090/567] [clang-tidy] fix incorrect argument names in documentation for ExtraArgs and ExtraArgsBefore (#120963) --- clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp | 6 +++--- clang-tools-extra/docs/clang-tidy/index.rst | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp index 3451e1f624257..fa8887e4639b4 100644 --- a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp +++ b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp @@ -61,12 +61,12 @@ Configuration files: globs can be specified as a list instead of a string. ExcludeHeaderFilterRegex - Same as '--exclude-header-filter'. - ExtraArgs - Same as '--extra-args'. - ExtraArgsBefore - Same as '--extra-args-before'. 
+ ExtraArgs - Same as '--extra-arg'. + ExtraArgsBefore - Same as '--extra-arg-before'. FormatStyle - Same as '--format-style'. HeaderFileExtensions - File extensions to consider to determine if a given diagnostic is located in a header file. - HeaderFilterRegex - Same as '--header-filter-regex'. + HeaderFilterRegex - Same as '--header-filter'. ImplementationFileExtensions - File extensions to consider to determine if a given diagnostic is located in an implementation file. diff --git a/clang-tools-extra/docs/clang-tidy/index.rst b/clang-tools-extra/docs/clang-tidy/index.rst index 8c79b4dc19393..b7a366e874130 100644 --- a/clang-tools-extra/docs/clang-tidy/index.rst +++ b/clang-tools-extra/docs/clang-tidy/index.rst @@ -293,8 +293,8 @@ An overview of all the command-line options: globs can be specified as a list instead of a string. ExcludeHeaderFilterRegex - Same as '--exclude-header-filter'. - ExtraArgs - Same as '--extra-args'. - ExtraArgsBefore - Same as '--extra-args-before'. + ExtraArgs - Same as '--extra-arg'. + ExtraArgsBefore - Same as '--extra-arg-before'. FormatStyle - Same as '--format-style'. HeaderFileExtensions - File extensions to consider to determine if a given diagnostic is located in a header file. From ca28fcc6fce516129d117f5f5a14ba7f54a045d8 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 26 Dec 2024 14:58:38 -0800 Subject: [PATCH 091/567] Revert "[nfc][Driver] Remove {{(.exe)?}} from sanitizer test (#121160)" Revert #121160. It fails Android targets on Windows. This reverts commit 377755c87e9d5494237f0e2e88f70886b5107342. 
--- clang/test/Driver/sanitizer-ld.c | 132 +++++++++++++++---------------- 1 file changed, 66 insertions(+), 66 deletions(-) diff --git a/clang/test/Driver/sanitizer-ld.c b/clang/test/Driver/sanitizer-ld.c index 763230568251c..0faa582c081c5 100644 --- a/clang/test/Driver/sanitizer-ld.c +++ b/clang/test/Driver/sanitizer-ld.c @@ -8,7 +8,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-LINUX // -// CHECK-ASAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" +// CHECK-ASAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-ASAN-LINUX-NOT: "-lc" // CHECK-ASAN-LINUX: libclang_rt.asan.a" // CHECK-ASAN-LINUX-NOT: "--export-dynamic" @@ -74,7 +74,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-SHARED-ASAN-LINUX // -// CHECK-SHARED-ASAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" +// CHECK-SHARED-ASAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-SHARED-ASAN-LINUX-NOT: "-lc" // CHECK-SHARED-ASAN-LINUX-NOT: libclang_rt.asan.a" // CHECK-SHARED-ASAN-LINUX: libclang_rt.asan.so" @@ -92,7 +92,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-DSO-SHARED-ASAN-LINUX // -// CHECK-DSO-SHARED-ASAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" +// CHECK-DSO-SHARED-ASAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-DSO-SHARED-ASAN-LINUX-NOT: "-lc" // CHECK-DSO-SHARED-ASAN-LINUX-NOT: libclang_rt.asan.a" // CHECK-DSO-SHARED-ASAN-LINUX-NOT: "libclang_rt.asan-preinit.a" @@ -110,7 +110,7 @@ // RUN: --sysroot=%S/Inputs/basic_freebsd_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-FREEBSD // -// CHECK-ASAN-FREEBSD: "{{(.*[^-.0-9A-Z_a-z])?}}ld" +// CHECK-ASAN-FREEBSD: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-ASAN-FREEBSD-NOT: "-lc" // CHECK-ASAN-FREEBSD-NOT: libclang_rt.asan_cxx // CHECK-ASAN-FREEBSD: freebsd{{/|\\+}}libclang_rt.asan.a" @@ -126,7 +126,7 @@ // RUN: --sysroot=%S/Inputs/basic_freebsd_tree \ // RUN: | %{filecheck} 
--check-prefix=CHECK-ASAN-FREEBSD-LDL // -// CHECK-ASAN-FREEBSD-LDL: "{{(.*[^-.0-9A-Z_a-z])?}}ld" +// CHECK-ASAN-FREEBSD-LDL: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-ASAN-FREEBSD-LDL-NOT: "-ldl" // RUN: %clangxx -### %s 2>&1 \ @@ -142,7 +142,7 @@ // RUN: -fsanitize-link-c++-runtime \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-LINUX-CXX -// CHECK-ASAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" +// CHECK-ASAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-ASAN-LINUX-CXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" // CHECK-ASAN-LINUX-CXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan_cxx.a" "--no-whole-archive" // CHECK-ASAN-LINUX-CXX-NOT: "--dynamic-list" @@ -161,7 +161,7 @@ // RUN: -fno-sanitize-link-c++-runtime \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-LINUX-CNOCXX -// CHECK-ASAN-LINUX-CNOCXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" +// CHECK-ASAN-LINUX-CNOCXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-ASAN-LINUX-CNOCXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" // CHECK-ASAN-LINUX-CNOCXX-NOT: libclang_rt.asan_cxx // CHECK-ASAN-LINUX-CNOCXX-SAME: "--export-dynamic" @@ -179,7 +179,7 @@ // RUN: -fno-sanitize-link-c++-runtime \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-LINUX-NOCXX -// CHECK-ASAN-LINUX-NOCXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" +// CHECK-ASAN-LINUX-NOCXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-ASAN-LINUX-NOCXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" // CHECK-ASAN-LINUX-NOCXX-NOT: libclang_rt.asan_cxx // CHECK-ASAN-LINUX-NOCXX-SAME: "--export-dynamic" @@ -197,7 +197,7 @@ // RUN: -nostdlib++ \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-LINUX-NOSTDCXX -// CHECK-ASAN-LINUX-NOSTDCXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" +// CHECK-ASAN-LINUX-NOSTDCXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-ASAN-LINUX-NOSTDCXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" // CHECK-ASAN-LINUX-NOSTDCXX-SAME: 
libclang_rt.asan_cxx // CHECK-ASAN-LINUX-NOSTDCXX-SAME: "--export-dynamic" @@ -213,7 +213,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree -lstdc++ -static 2>&1 \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-LINUX-CXX-STATIC // -// CHECK-ASAN-LINUX-CXX-STATIC: "{{(.*[^-.0-9A-Z_a-z])?}}ld" +// CHECK-ASAN-LINUX-CXX-STATIC: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-ASAN-LINUX-CXX-STATIC-NOT: stdc++ // CHECK-ASAN-LINUX-CXX-STATIC: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" // CHECK-ASAN-LINUX-CXX-STATIC: stdc++ @@ -223,7 +223,7 @@ // RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-ARM // -// CHECK-ASAN-ARM: "{{(.*[^.0-9A-Z_a-z])?}}ld" +// CHECK-ASAN-ARM: "{{(.*[^.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-ASAN-ARM-NOT: "-lc" // CHECK-ASAN-ARM: libclang_rt.asan.a" // @@ -232,7 +232,7 @@ // RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-ARMv7 // -// CHECK-ASAN-ARMv7: "{{(.*[^.0-9A-Z_a-z])?}}ld" +// CHECK-ASAN-ARMv7: "{{(.*[^.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-ASAN-ARMv7-NOT: "-lc" // CHECK-ASAN-ARMv7: libclang_rt.asan.a" @@ -242,7 +242,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-ANDROID // -// CHECK-ASAN-ANDROID: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld" +// CHECK-ASAN-ANDROID: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" // CHECK-ASAN-ANDROID: "-pie" // CHECK-ASAN-ANDROID-NOT: "-lc" // CHECK-ASAN-ANDROID-NOT: "-lpthread" @@ -265,7 +265,7 @@ // RUN: -static-libasan \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-ANDROID-STATICLIBASAN // -// CHECK-ASAN-ANDROID-STATICLIBASAN: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld" +// CHECK-ASAN-ANDROID-STATICLIBASAN: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" // CHECK-ASAN-ANDROID-STATICLIBASAN: libclang_rt.asan.a" // CHECK-ASAN-ANDROID-STATICLIBASAN-NOT: "-lpthread" // CHECK-ASAN-ANDROID-STATICLIBASAN-NOT: "-lrt" @@ -277,7 +277,7 @@ // RUN: 
-resource-dir=%S/Inputs/resource_dir \ // RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-ANDROID // -// CHECK-UBSAN-ANDROID: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld" +// CHECK-UBSAN-ANDROID: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" // CHECK-UBSAN-ANDROID: "-pie" // CHECK-UBSAN-ANDROID-NOT: "-lc" // CHECK-UBSAN-ANDROID-NOT: "-lpthread" @@ -293,7 +293,7 @@ // RUN: -static-libsan \ // RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-ANDROID-STATICLIBASAN // -// CHECK-UBSAN-ANDROID-STATICLIBASAN: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld" +// CHECK-UBSAN-ANDROID-STATICLIBASAN: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" // CHECK-UBSAN-ANDROID-STATICLIBASAN: libclang_rt.ubsan_standalone.a" // CHECK-UBSAN-ANDROID-STATICLIBASAN-NOT: "-lpthread" // CHECK-UBSAN-ANDROID-STATICLIBASAN-NOT: "-lrt" @@ -306,7 +306,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-ANDROID-X86 // -// CHECK-ASAN-ANDROID-X86: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld" +// CHECK-ASAN-ANDROID-X86: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" // CHECK-ASAN-ANDROID-X86: "-pie" // CHECK-ASAN-ANDROID-X86-NOT: "-lc" // CHECK-ASAN-ANDROID-X86-NOT: "-lpthread" @@ -330,7 +330,7 @@ // RUN: -shared \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-ANDROID-SHARED // -// CHECK-ASAN-ANDROID-SHARED: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld" +// CHECK-ASAN-ANDROID-SHARED: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" // CHECK-ASAN-ANDROID-SHARED-NOT: "-lc" // CHECK-ASAN-ANDROID-SHARED: libclang_rt.asan.so" // CHECK-ASAN-ANDROID-SHARED-NOT: "-lpthread" @@ -344,7 +344,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-TYSAN-LINUX-CXX // -// CHECK-TYSAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" +// CHECK-TYSAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-TYSAN-LINUX-CXX-NOT: stdc++ // CHECK-TYSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.tysan{{[^.]*}}.a" "--no-whole-archive" // CHECK-TYSAN-LINUX-CXX: stdc++ @@ -355,7 +355,7 @@ // RUN: 
-resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-TYSAN-DARWIN-CXX -// CHECK-TYSAN-DARWIN-CXX: "{{.*}}ld" +// CHECK-TYSAN-DARWIN-CXX: "{{.*}}ld{{(.exe)?}}" // CHECK-TYSAN-DARWIN-CXX: libclang_rt.tysan_osx_dynamic.dylib // CHECK-TYSAN-DARWIN-CXX-NOT: -lc++abi @@ -366,7 +366,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-TSAN-LINUX-CXX // -// CHECK-TSAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" +// CHECK-TSAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-TSAN-LINUX-CXX-NOT: stdc++ // CHECK-TSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.tsan.a" "--no-whole-archive" // CHECK-TSAN-LINUX-CXX: "--dynamic-list={{.*}}libclang_rt.tsan.a.syms" @@ -402,7 +402,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-MSAN-LINUX-CXX // -// CHECK-MSAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" +// CHECK-MSAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-MSAN-LINUX-CXX-NOT: stdc++ // CHECK-MSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.msan.a" "--no-whole-archive" // CHECK-MSAN-LINUX-CXX: "--dynamic-list={{.*}}libclang_rt.msan.a.syms" @@ -442,7 +442,7 @@ // RUN: -static-libsan \ // RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-LINUX -// CHECK-UBSAN-LINUX: "{{.*}}ld" +// CHECK-UBSAN-LINUX: "{{.*}}ld{{(.exe)?}}" // CHECK-UBSAN-LINUX-NOT: libclang_rt.asan // CHECK-UBSAN-LINUX-NOT: libclang_rt.ubsan_standalone_cxx // CHECK-UBSAN-LINUX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive" @@ -497,7 +497,7 @@ // RUN: -shared -shared-libsan \ // RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-LINUX-SHAREDLIBASAN -// CHECK-UBSAN-LINUX-SHAREDLIBASAN: "{{.*}}ld" +// CHECK-UBSAN-LINUX-SHAREDLIBASAN: "{{.*}}ld{{(.exe)?}}" // CHECK-UBSAN-LINUX-SHAREDLIBASAN: "{{.*}}libclang_rt.ubsan_standalone.so{{.*}}" // RUN: %clang -fsanitize=undefined -fsanitize-link-c++-runtime -### 
%s 2>&1 \ @@ -514,7 +514,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-LINUX-CXX -// CHECK-UBSAN-LINUX-CXX: "{{.*}}ld" +// CHECK-UBSAN-LINUX-CXX: "{{.*}}ld{{(.exe)?}}" // CHECK-UBSAN-LINUX-CXX-NOT: libclang_rt.asan // CHECK-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive" // CHECK-UBSAN-LINUX-CXX-NOT: libclang_rt.asan @@ -530,7 +530,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-MINIMAL-LINUX -// CHECK-UBSAN-MINIMAL-LINUX: "{{.*}}ld" +// CHECK-UBSAN-MINIMAL-LINUX: "{{.*}}ld{{(.exe)?}}" // CHECK-UBSAN-MINIMAL-LINUX: "--whole-archive" "{{.*}}libclang_rt.ubsan_minimal.a" "--no-whole-archive" // CHECK-UBSAN-MINIMAL-LINUX: "-lpthread" // CHECK-UBSAN-MINIMAL-LINUX: "-lresolv" @@ -539,7 +539,7 @@ // RUN: --target=x86_64-apple-darwin -fuse-ld=ld \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-MINIMAL-DARWIN -// CHECK-UBSAN-MINIMAL-DARWIN: "{{.*}}ld" +// CHECK-UBSAN-MINIMAL-DARWIN: "{{.*}}ld{{(.exe)?}}" // CHECK-UBSAN-MINIMAL-DARWIN: "{{.*}}libclang_rt.ubsan_minimal_osx_dynamic.dylib" // RUN: not %clang -fsanitize=undefined -### %s 2>&1 \ @@ -565,7 +565,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-UBSAN-LINUX -// CHECK-ASAN-UBSAN-LINUX: "{{.*}}ld" +// CHECK-ASAN-UBSAN-LINUX: "{{.*}}ld{{(.exe)?}}" // CHECK-ASAN-UBSAN-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" // CHECK-ASAN-UBSAN-LINUX-NOT: libclang_rt.ubsan // CHECK-ASAN-UBSAN-LINUX-NOT: "-lstdc++" @@ -577,7 +577,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-UBSAN-LINUX-CXX -// 
CHECK-ASAN-UBSAN-LINUX-CXX: "{{.*}}ld" +// CHECK-ASAN-UBSAN-LINUX-CXX: "{{.*}}ld{{(.exe)?}}" // CHECK-ASAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" // CHECK-ASAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.asan_cxx.a" "--no-whole-archive" // CHECK-ASAN-UBSAN-LINUX-CXX-NOT: libclang_rt.ubsan @@ -592,7 +592,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX -// CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX: "{{.*}}ld" +// CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX: "{{.*}}ld{{(.exe)?}}" // CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" // CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan_cxx.a" "--no-whole-archive" // CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX-NOT: libclang_rt.ubsan @@ -605,7 +605,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-MSAN-UBSAN-LINUX-CXX -// CHECK-MSAN-UBSAN-LINUX-CXX: "{{.*}}ld" +// CHECK-MSAN-UBSAN-LINUX-CXX: "{{.*}}ld{{(.exe)?}}" // CHECK-MSAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.msan.a" "--no-whole-archive" // CHECK-MSAN-UBSAN-LINUX-CXX-NOT: libclang_rt.ubsan // CHECK-MSAN-UBSAN-LINUX-CXX: libclang_rt.ubsan_standalone_cxx @@ -616,7 +616,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-TSAN-UBSAN-LINUX-CXX -// CHECK-TSAN-UBSAN-LINUX-CXX: "{{.*}}ld" +// CHECK-TSAN-UBSAN-LINUX-CXX: "{{.*}}ld{{(.exe)?}}" // CHECK-TSAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.tsan.a" "--no-whole-archive" // CHECK-TSAN-UBSAN-LINUX-CXX-NOT: libclang_rt.ubsan // CHECK-TSAN-UBSAN-LINUX-CXX: libclang_rt.ubsan_standalone_cxx @@ -628,7 +628,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: -shared \ // RUN: | 
%{filecheck} --check-prefix=CHECK-UBSAN-LINUX-SHARED -// CHECK-UBSAN-LINUX-SHARED: "{{.*}}ld" +// CHECK-UBSAN-LINUX-SHARED: "{{.*}}ld{{(.exe)?}}" // CHECK-UBSAN-LINUX-SHARED-NOT: --export-dynamic // CHECK-UBSAN-LINUX-SHARED-NOT: --dynamic-list // CHECK-UBSAN-LINUX-SHARED-NOT: libclang_rt.ubsan @@ -639,7 +639,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-LSAN-LINUX // -// CHECK-LSAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" +// CHECK-LSAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-LSAN-LINUX-NOT: "-lc" // CHECK-LSAN-LINUX-NOT: libclang_rt.ubsan // CHECK-LSAN-LINUX: libclang_rt.lsan.a" @@ -661,7 +661,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-LSAN-COV-LINUX // -// CHECK-LSAN-COV-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" +// CHECK-LSAN-COV-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-LSAN-COV-LINUX-NOT: "-lc" // CHECK-LSAN-COV-LINUX-NOT: libclang_rt.ubsan // CHECK-LSAV-COV-LINUX: libclang_rt.lsan-x86_64.a" @@ -675,7 +675,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-LSAN-ASAN-LINUX -// CHECK-LSAN-ASAN-LINUX: "{{.*}}ld" +// CHECK-LSAN-ASAN-LINUX: "{{.*}}ld{{(.exe)?}}" // CHECK-LSAN-ASAN-LINUX-NOT: libclang_rt.lsan // CHECK-LSAN-ASAN-LINUX: libclang_rt.asan // CHECK-LSAN-ASAN-LINUX-NOT: libclang_rt.lsan @@ -685,7 +685,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-COV-LINUX -// CHECK-ASAN-COV-LINUX: "{{.*}}ld" +// CHECK-ASAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}" // CHECK-ASAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" // CHECK-ASAN-COV-LINUX-NOT: libclang_rt.ubsan // CHECK-ASAN-COV-LINUX-NOT: "-lstdc++" @@ -697,7 +697,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | 
%{filecheck} --check-prefix=CHECK-MSAN-COV-LINUX -// CHECK-MSAN-COV-LINUX: "{{.*}}ld" +// CHECK-MSAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}" // CHECK-MSAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.msan.a" "--no-whole-archive" // CHECK-MSAN-COV-LINUX-NOT: libclang_rt.ubsan // CHECK-MSAN-COV-LINUX-NOT: "-lstdc++" @@ -709,7 +709,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-DFSAN-COV-LINUX -// CHECK-DFSAN-COV-LINUX: "{{.*}}ld" +// CHECK-DFSAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}" // CHECK-DFSAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.dfsan.a" "--no-whole-archive" // CHECK-DFSAN-COV-LINUX-NOT: libclang_rt.ubsan // CHECK-DFSAN-COV-LINUX-NOT: "-lstdc++" @@ -721,7 +721,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-COV-LINUX -// CHECK-UBSAN-COV-LINUX: "{{.*}}ld" +// CHECK-UBSAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}" // CHECK-UBSAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive" // CHECK-UBSAN-COV-LINUX-NOT: "-lstdc++" // CHECK-UBSAN-COV-LINUX: "-lpthread" @@ -732,7 +732,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-COV-LINUX -// CHECK-COV-LINUX: "{{.*}}ld" +// CHECK-COV-LINUX: "{{.*}}ld{{(.exe)?}}" // CHECK-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive" // CHECK-COV-LINUX-NOT: "-lstdc++" // CHECK-COV-LINUX: "-lpthread" @@ -744,7 +744,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-NSAN-LINUX // -// CHECK-NSAN-LINUX: "{{.*}}ld" +// CHECK-NSAN-LINUX: "{{.*}}ld{{(.exe)?}}" // CHECK-NSAN-LINUX-NOT: "-lc" // CHECK-NSAN-LINUX-NOT: libclang_rt.ubsan // CHECK-NSAN-LINUX: libclang_rt.nsan.a" @@ -774,7 +774,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: 
--sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-CFI-LINUX -// CHECK-CFI-LINUX: "{{.*}}ld" +// CHECK-CFI-LINUX: "{{.*}}ld{{(.exe)?}}" // CHECK-CFI-LINUX-NOT: libclang_rt. // CFI with diagnostics links the UBSan runtime. @@ -784,7 +784,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-CFI-DIAG-LINUX -// CHECK-CFI-DIAG-LINUX: "{{.*}}ld" +// CHECK-CFI-DIAG-LINUX: "{{.*}}ld{{(.exe)?}}" // CHECK-CFI-DIAG-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive" // Cross-DSO CFI links the CFI runtime. @@ -793,7 +793,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-CFI-CROSS-DSO-LINUX -// CHECK-CFI-CROSS-DSO-LINUX: "{{.*}}ld" +// CHECK-CFI-CROSS-DSO-LINUX: "{{.*}}ld{{(.exe)?}}" // CHECK-CFI-CROSS-DSO-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.cfi.a" "--no-whole-archive" // CHECK-CFI-CROSS-DSO-LINUX: -export-dynamic @@ -804,7 +804,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-CFI-CROSS-DSO-DIAG-LINUX -// CHECK-CFI-CROSS-DSO-DIAG-LINUX: "{{.*}}ld" +// CHECK-CFI-CROSS-DSO-DIAG-LINUX: "{{.*}}ld{{(.exe)?}}" // CHECK-CFI-CROSS-DSO-DIAG-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.cfi_diag.a" "--no-whole-archive" // CHECK-CFI-CROSS-DSO-DIAG-LINUX: -export-dynamic @@ -814,7 +814,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_android_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-CFI-CROSS-DSO-ANDROID -// CHECK-CFI-CROSS-DSO-ANDROID: "{{.*}}ld" +// CHECK-CFI-CROSS-DSO-ANDROID: "{{.*}}ld{{(.exe)?}}" // CHECK-CFI-CROSS-DSO-ANDROID-NOT: libclang_rt.cfi // Cross-DSO CFI with diagnostics on Android links just the UBSAN runtime. 
@@ -824,7 +824,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_android_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-CFI-CROSS-DSO-DIAG-ANDROID -// CHECK-CFI-CROSS-DSO-DIAG-ANDROID: "{{.*}}ld" +// CHECK-CFI-CROSS-DSO-DIAG-ANDROID: "{{.*}}ld{{(.exe)?}}" // CHECK-CFI-CROSS-DSO-DIAG-ANDROID: "{{[^"]*}}libclang_rt.ubsan_standalone.so" // CHECK-CFI-CROSS-DSO-DIAG-ANDROID: "--export-dynamic-symbol=__cfi_check" @@ -834,7 +834,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-DARWIN106-CXX -// CHECK-ASAN-DARWIN106-CXX: "{{.*}}ld" +// CHECK-ASAN-DARWIN106-CXX: "{{.*}}ld{{(.exe)?}}" // CHECK-ASAN-DARWIN106-CXX: libclang_rt.asan_osx_dynamic.dylib // CHECK-ASAN-DARWIN106-CXX-NOT: -lc++abi @@ -844,7 +844,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-LSAN-DARWIN106-CXX -// CHECK-LSAN-DARWIN106-CXX: "{{.*}}ld" +// CHECK-LSAN-DARWIN106-CXX: "{{.*}}ld{{(.exe)?}}" // CHECK-LSAN-DARWIN106-CXX: libclang_rt.lsan_osx_dynamic.dylib // CHECK-LSAN-DARWIN106-CXX-NOT: -lc++abi @@ -854,7 +854,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-SAFESTACK-LINUX // -// CHECK-SAFESTACK-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" +// CHECK-SAFESTACK-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-SAFESTACK-LINUX-NOT: "-lc" // CHECK-SAFESTACK-LINUX-NOT: whole-archive // CHECK-SAFESTACK-LINUX: "-u" "__safestack_init" @@ -918,7 +918,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-CFI-STATS-LINUX -// CHECK-CFI-STATS-LINUX: "{{.*}}ld" +// CHECK-CFI-STATS-LINUX: "{{.*}}ld{{(.exe)?}}" // CHECK-CFI-STATS-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.stats_client.a" "--no-whole-archive" // CHECK-CFI-STATS-LINUX-NOT: "--whole-archive" 
// CHECK-CFI-STATS-LINUX: "{{[^"]*}}libclang_rt.stats.a" @@ -927,7 +927,7 @@ // RUN: --target=x86_64-apple-darwin -fuse-ld=ld \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-CFI-STATS-DARWIN -// CHECK-CFI-STATS-DARWIN: "{{.*}}ld" +// CHECK-CFI-STATS-DARWIN: "{{.*}}ld{{(.exe)?}}" // CHECK-CFI-STATS-DARWIN: "{{[^"]*}}libclang_rt.stats_client_osx.a" // CHECK-CFI-STATS-DARWIN: "{{[^"]*}}libclang_rt.stats_osx_dynamic.dylib" @@ -966,7 +966,7 @@ // RUN: --sysroot=%S/Inputs/basic_android_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-SAFESTACK-ANDROID-ARM // -// CHECK-SAFESTACK-ANDROID-ARM: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld" +// CHECK-SAFESTACK-ANDROID-ARM: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" // CHECK-SAFESTACK-ANDROID-ARM-NOT: libclang_rt.safestack // RUN: %clang -### %s -shared 2>&1 \ @@ -974,7 +974,7 @@ // RUN: --sysroot=%S/Inputs/basic_android_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-SAFESTACK-SHARED-ANDROID-ARM // -// CHECK-SAFESTACK-SHARED-ANDROID-ARM: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld" +// CHECK-SAFESTACK-SHARED-ANDROID-ARM: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" // CHECK-SAFESTACK-SHARED-ANDROID-ARM-NOT: libclang_rt.safestack // RUN: %clang -### %s 2>&1 \ @@ -982,7 +982,7 @@ // RUN: --sysroot=%S/Inputs/basic_android_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-SAFESTACK-ANDROID-AARCH64 // -// CHECK-SAFESTACK-ANDROID-AARCH64: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld" +// CHECK-SAFESTACK-ANDROID-AARCH64: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" // CHECK-SAFESTACK-ANDROID-AARCH64-NOT: libclang_rt.safestack // RUN: not %clang -fsanitize=undefined -### %s 2>&1 \ @@ -1056,7 +1056,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-SCUDO-LINUX -// CHECK-SCUDO-LINUX: "{{.*}}ld" +// CHECK-SCUDO-LINUX: "{{.*}}ld{{(.exe)?}}" // CHECK-SCUDO-LINUX: "--whole-archive" "{{.*}}libclang_rt.scudo_standalone.a" 
"--no-whole-archive" // CHECK-SCUDO-LINUX-NOT: "-lstdc++" // CHECK-SCUDO-LINUX: "-lpthread" @@ -1069,7 +1069,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-SCUDO-SHARED-LINUX // -// CHECK-SCUDO-SHARED-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" +// CHECK-SCUDO-SHARED-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-SCUDO-SHARED-LINUX-NOT: "-lc" // CHECK-SCUDO-SHARED-LINUX-NOT: libclang_rt.scudo_standalone.a" // CHECK-SCUDO-SHARED-LINUX: libclang_rt.scudo_standalone.so" @@ -1085,7 +1085,7 @@ // RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ // RUN: | %{filecheck} --check-prefix=CHECK-SCUDO-ANDROID // -// CHECK-SCUDO-ANDROID: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld" +// CHECK-SCUDO-ANDROID: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" // CHECK-SCUDO-ANDROID-NOT: "-lc" // CHECK-SCUDO-ANDROID: "-pie" // CHECK-SCUDO-ANDROID-NOT: "-lpthread" @@ -1099,7 +1099,7 @@ // RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ // RUN: -static-libsan \ // RUN: | %{filecheck} --check-prefix=CHECK-SCUDO-ANDROID-STATIC -// CHECK-SCUDO-ANDROID-STATIC: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld" +// CHECK-SCUDO-ANDROID-STATIC: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" // CHECK-SCUDO-ANDROID-STATIC: "-pie" // CHECK-SCUDO-ANDROID-STATIC: "--whole-archive" "{{.*}}libclang_rt.scudo_standalone.a" "--no-whole-archive" // CHECK-SCUDO-ANDROID-STATIC-NOT: "-lstdc++" @@ -1113,7 +1113,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-HWASAN-X86-64-LINUX // -// CHECK-HWASAN-X86-64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" +// CHECK-HWASAN-X86-64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-HWASAN-X86-64-LINUX-NOT: "-lc" // CHECK-HWASAN-X86-64-LINUX: libclang_rt.hwasan.a" // CHECK-HWASAN-X86-64-LINUX-NOT: "--export-dynamic" @@ -1130,7 +1130,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-SHARED-HWASAN-X86-64-LINUX // -// CHECK-SHARED-HWASAN-X86-64-LINUX: 
"{{(.*[^-.0-9A-Z_a-z])?}}ld" +// CHECK-SHARED-HWASAN-X86-64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-SHARED-HWASAN-X86-64-LINUX-NOT: "-lc" // CHECK-SHARED-HWASAN-X86-64-LINUX: libclang_rt.hwasan.so" // CHECK-SHARED-HWASAN-X86-64-LINUX-NOT: "-lpthread" @@ -1146,7 +1146,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-DSO-SHARED-HWASAN-X86-64-LINUX // -// CHECK-DSO-SHARED-HWASAN-X86-64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" +// CHECK-DSO-SHARED-HWASAN-X86-64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-DSO-SHARED-HWASAN-X86-64-LINUX-NOT: "-lc" // CHECK-DSO-SHARED-HWASAN-X86-64-LINUX: libclang_rt.hwasan.so" // CHECK-DSO-SHARED-HWASAN-X86-64-LINUX-NOT: "-lpthread" @@ -1162,7 +1162,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-HWASAN-AARCH64-LINUX // -// CHECK-HWASAN-AARCH64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" +// CHECK-HWASAN-AARCH64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-HWASAN-AARCH64-LINUX-NOT: "-lc" // CHECK-HWASAN-AARCH64-LINUX: libclang_rt.hwasan.a" // CHECK-HWASAN-AARCH64-LINUX-NOT: "--export-dynamic" @@ -1180,7 +1180,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-SHARED-HWASAN-AARCH64-LINUX // -// CHECK-SHARED-HWASAN-AARCH64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" +// CHECK-SHARED-HWASAN-AARCH64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-SHARED-HWASAN-AARCH64-LINUX-NOT: "-lc" // CHECK-SHARED-HWASAN-AARCH64-LINUX: libclang_rt.hwasan.so" // CHECK-SHARED-HWASAN-AARCH64-LINUX-NOT: "-lpthread" @@ -1196,7 +1196,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX // -// CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" +// CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX-NOT: "-lc" // 
CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX: libclang_rt.hwasan.so" // CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX-NOT: "-lpthread" From 8e9fda1c1140e067c5344c61df56c34167296f17 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 26 Dec 2024 15:32:50 -0800 Subject: [PATCH 092/567] Reapply "[nfc][Driver] Remove {{(.exe)?}} from sanitizer test (#121160)" (#121162) This reverts commit ca28fcc6fce516129d117f5f5a14ba7f54a045d8. Android targets pass on Windows buildbots, but fails on buildkite https://buildkite.com/llvm-project/github-pull-requests/builds/132244 Re-apply #121160 --- clang/test/Driver/sanitizer-ld.c | 106 +++++++++++++++---------------- 1 file changed, 53 insertions(+), 53 deletions(-) diff --git a/clang/test/Driver/sanitizer-ld.c b/clang/test/Driver/sanitizer-ld.c index 0faa582c081c5..9ae1a46de3f89 100644 --- a/clang/test/Driver/sanitizer-ld.c +++ b/clang/test/Driver/sanitizer-ld.c @@ -8,7 +8,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-LINUX // -// CHECK-ASAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-ASAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-ASAN-LINUX-NOT: "-lc" // CHECK-ASAN-LINUX: libclang_rt.asan.a" // CHECK-ASAN-LINUX-NOT: "--export-dynamic" @@ -74,7 +74,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-SHARED-ASAN-LINUX // -// CHECK-SHARED-ASAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-SHARED-ASAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-SHARED-ASAN-LINUX-NOT: "-lc" // CHECK-SHARED-ASAN-LINUX-NOT: libclang_rt.asan.a" // CHECK-SHARED-ASAN-LINUX: libclang_rt.asan.so" @@ -92,7 +92,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-DSO-SHARED-ASAN-LINUX // -// CHECK-DSO-SHARED-ASAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-DSO-SHARED-ASAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-DSO-SHARED-ASAN-LINUX-NOT: "-lc" // CHECK-DSO-SHARED-ASAN-LINUX-NOT: 
libclang_rt.asan.a" // CHECK-DSO-SHARED-ASAN-LINUX-NOT: "libclang_rt.asan-preinit.a" @@ -110,7 +110,7 @@ // RUN: --sysroot=%S/Inputs/basic_freebsd_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-FREEBSD // -// CHECK-ASAN-FREEBSD: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-ASAN-FREEBSD: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-ASAN-FREEBSD-NOT: "-lc" // CHECK-ASAN-FREEBSD-NOT: libclang_rt.asan_cxx // CHECK-ASAN-FREEBSD: freebsd{{/|\\+}}libclang_rt.asan.a" @@ -126,7 +126,7 @@ // RUN: --sysroot=%S/Inputs/basic_freebsd_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-FREEBSD-LDL // -// CHECK-ASAN-FREEBSD-LDL: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-ASAN-FREEBSD-LDL: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-ASAN-FREEBSD-LDL-NOT: "-ldl" // RUN: %clangxx -### %s 2>&1 \ @@ -142,7 +142,7 @@ // RUN: -fsanitize-link-c++-runtime \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-LINUX-CXX -// CHECK-ASAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-ASAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-ASAN-LINUX-CXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" // CHECK-ASAN-LINUX-CXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan_cxx.a" "--no-whole-archive" // CHECK-ASAN-LINUX-CXX-NOT: "--dynamic-list" @@ -161,7 +161,7 @@ // RUN: -fno-sanitize-link-c++-runtime \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-LINUX-CNOCXX -// CHECK-ASAN-LINUX-CNOCXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-ASAN-LINUX-CNOCXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-ASAN-LINUX-CNOCXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" // CHECK-ASAN-LINUX-CNOCXX-NOT: libclang_rt.asan_cxx // CHECK-ASAN-LINUX-CNOCXX-SAME: "--export-dynamic" @@ -179,7 +179,7 @@ // RUN: -fno-sanitize-link-c++-runtime \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-LINUX-NOCXX -// CHECK-ASAN-LINUX-NOCXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-ASAN-LINUX-NOCXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // 
CHECK-ASAN-LINUX-NOCXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" // CHECK-ASAN-LINUX-NOCXX-NOT: libclang_rt.asan_cxx // CHECK-ASAN-LINUX-NOCXX-SAME: "--export-dynamic" @@ -197,7 +197,7 @@ // RUN: -nostdlib++ \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-LINUX-NOSTDCXX -// CHECK-ASAN-LINUX-NOSTDCXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-ASAN-LINUX-NOSTDCXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-ASAN-LINUX-NOSTDCXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" // CHECK-ASAN-LINUX-NOSTDCXX-SAME: libclang_rt.asan_cxx // CHECK-ASAN-LINUX-NOSTDCXX-SAME: "--export-dynamic" @@ -213,7 +213,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree -lstdc++ -static 2>&1 \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-LINUX-CXX-STATIC // -// CHECK-ASAN-LINUX-CXX-STATIC: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-ASAN-LINUX-CXX-STATIC: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-ASAN-LINUX-CXX-STATIC-NOT: stdc++ // CHECK-ASAN-LINUX-CXX-STATIC: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" // CHECK-ASAN-LINUX-CXX-STATIC: stdc++ @@ -223,7 +223,7 @@ // RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-ARM // -// CHECK-ASAN-ARM: "{{(.*[^.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-ASAN-ARM: "{{(.*[^.0-9A-Z_a-z])?}}ld" // CHECK-ASAN-ARM-NOT: "-lc" // CHECK-ASAN-ARM: libclang_rt.asan.a" // @@ -232,7 +232,7 @@ // RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-ARMv7 // -// CHECK-ASAN-ARMv7: "{{(.*[^.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-ASAN-ARMv7: "{{(.*[^.0-9A-Z_a-z])?}}ld" // CHECK-ASAN-ARMv7-NOT: "-lc" // CHECK-ASAN-ARMv7: libclang_rt.asan.a" @@ -344,7 +344,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-TYSAN-LINUX-CXX // -// CHECK-TYSAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-TYSAN-LINUX-CXX: 
"{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-TYSAN-LINUX-CXX-NOT: stdc++ // CHECK-TYSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.tysan{{[^.]*}}.a" "--no-whole-archive" // CHECK-TYSAN-LINUX-CXX: stdc++ @@ -355,7 +355,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-TYSAN-DARWIN-CXX -// CHECK-TYSAN-DARWIN-CXX: "{{.*}}ld{{(.exe)?}}" +// CHECK-TYSAN-DARWIN-CXX: "{{.*}}ld" // CHECK-TYSAN-DARWIN-CXX: libclang_rt.tysan_osx_dynamic.dylib // CHECK-TYSAN-DARWIN-CXX-NOT: -lc++abi @@ -366,7 +366,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-TSAN-LINUX-CXX // -// CHECK-TSAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-TSAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-TSAN-LINUX-CXX-NOT: stdc++ // CHECK-TSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.tsan.a" "--no-whole-archive" // CHECK-TSAN-LINUX-CXX: "--dynamic-list={{.*}}libclang_rt.tsan.a.syms" @@ -402,7 +402,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-MSAN-LINUX-CXX // -// CHECK-MSAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-MSAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-MSAN-LINUX-CXX-NOT: stdc++ // CHECK-MSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.msan.a" "--no-whole-archive" // CHECK-MSAN-LINUX-CXX: "--dynamic-list={{.*}}libclang_rt.msan.a.syms" @@ -442,7 +442,7 @@ // RUN: -static-libsan \ // RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-LINUX -// CHECK-UBSAN-LINUX: "{{.*}}ld{{(.exe)?}}" +// CHECK-UBSAN-LINUX: "{{.*}}ld" // CHECK-UBSAN-LINUX-NOT: libclang_rt.asan // CHECK-UBSAN-LINUX-NOT: libclang_rt.ubsan_standalone_cxx // CHECK-UBSAN-LINUX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive" @@ -497,7 +497,7 @@ // RUN: -shared -shared-libsan \ // RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-LINUX-SHAREDLIBASAN -// 
CHECK-UBSAN-LINUX-SHAREDLIBASAN: "{{.*}}ld{{(.exe)?}}" +// CHECK-UBSAN-LINUX-SHAREDLIBASAN: "{{.*}}ld" // CHECK-UBSAN-LINUX-SHAREDLIBASAN: "{{.*}}libclang_rt.ubsan_standalone.so{{.*}}" // RUN: %clang -fsanitize=undefined -fsanitize-link-c++-runtime -### %s 2>&1 \ @@ -514,7 +514,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-LINUX-CXX -// CHECK-UBSAN-LINUX-CXX: "{{.*}}ld{{(.exe)?}}" +// CHECK-UBSAN-LINUX-CXX: "{{.*}}ld" // CHECK-UBSAN-LINUX-CXX-NOT: libclang_rt.asan // CHECK-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive" // CHECK-UBSAN-LINUX-CXX-NOT: libclang_rt.asan @@ -530,7 +530,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-MINIMAL-LINUX -// CHECK-UBSAN-MINIMAL-LINUX: "{{.*}}ld{{(.exe)?}}" +// CHECK-UBSAN-MINIMAL-LINUX: "{{.*}}ld" // CHECK-UBSAN-MINIMAL-LINUX: "--whole-archive" "{{.*}}libclang_rt.ubsan_minimal.a" "--no-whole-archive" // CHECK-UBSAN-MINIMAL-LINUX: "-lpthread" // CHECK-UBSAN-MINIMAL-LINUX: "-lresolv" @@ -539,7 +539,7 @@ // RUN: --target=x86_64-apple-darwin -fuse-ld=ld \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-MINIMAL-DARWIN -// CHECK-UBSAN-MINIMAL-DARWIN: "{{.*}}ld{{(.exe)?}}" +// CHECK-UBSAN-MINIMAL-DARWIN: "{{.*}}ld" // CHECK-UBSAN-MINIMAL-DARWIN: "{{.*}}libclang_rt.ubsan_minimal_osx_dynamic.dylib" // RUN: not %clang -fsanitize=undefined -### %s 2>&1 \ @@ -565,7 +565,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-UBSAN-LINUX -// CHECK-ASAN-UBSAN-LINUX: "{{.*}}ld{{(.exe)?}}" +// CHECK-ASAN-UBSAN-LINUX: "{{.*}}ld" // CHECK-ASAN-UBSAN-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" // CHECK-ASAN-UBSAN-LINUX-NOT: 
libclang_rt.ubsan // CHECK-ASAN-UBSAN-LINUX-NOT: "-lstdc++" @@ -577,7 +577,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-UBSAN-LINUX-CXX -// CHECK-ASAN-UBSAN-LINUX-CXX: "{{.*}}ld{{(.exe)?}}" +// CHECK-ASAN-UBSAN-LINUX-CXX: "{{.*}}ld" // CHECK-ASAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" // CHECK-ASAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.asan_cxx.a" "--no-whole-archive" // CHECK-ASAN-UBSAN-LINUX-CXX-NOT: libclang_rt.ubsan @@ -592,7 +592,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX -// CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX: "{{.*}}ld{{(.exe)?}}" +// CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX: "{{.*}}ld" // CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" // CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan_cxx.a" "--no-whole-archive" // CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX-NOT: libclang_rt.ubsan @@ -605,7 +605,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-MSAN-UBSAN-LINUX-CXX -// CHECK-MSAN-UBSAN-LINUX-CXX: "{{.*}}ld{{(.exe)?}}" +// CHECK-MSAN-UBSAN-LINUX-CXX: "{{.*}}ld" // CHECK-MSAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.msan.a" "--no-whole-archive" // CHECK-MSAN-UBSAN-LINUX-CXX-NOT: libclang_rt.ubsan // CHECK-MSAN-UBSAN-LINUX-CXX: libclang_rt.ubsan_standalone_cxx @@ -616,7 +616,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-TSAN-UBSAN-LINUX-CXX -// CHECK-TSAN-UBSAN-LINUX-CXX: "{{.*}}ld{{(.exe)?}}" +// CHECK-TSAN-UBSAN-LINUX-CXX: "{{.*}}ld" // CHECK-TSAN-UBSAN-LINUX-CXX: "--whole-archive" 
"{{.*}}libclang_rt.tsan.a" "--no-whole-archive" // CHECK-TSAN-UBSAN-LINUX-CXX-NOT: libclang_rt.ubsan // CHECK-TSAN-UBSAN-LINUX-CXX: libclang_rt.ubsan_standalone_cxx @@ -628,7 +628,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: -shared \ // RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-LINUX-SHARED -// CHECK-UBSAN-LINUX-SHARED: "{{.*}}ld{{(.exe)?}}" +// CHECK-UBSAN-LINUX-SHARED: "{{.*}}ld" // CHECK-UBSAN-LINUX-SHARED-NOT: --export-dynamic // CHECK-UBSAN-LINUX-SHARED-NOT: --dynamic-list // CHECK-UBSAN-LINUX-SHARED-NOT: libclang_rt.ubsan @@ -639,7 +639,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-LSAN-LINUX // -// CHECK-LSAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-LSAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-LSAN-LINUX-NOT: "-lc" // CHECK-LSAN-LINUX-NOT: libclang_rt.ubsan // CHECK-LSAN-LINUX: libclang_rt.lsan.a" @@ -661,7 +661,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-LSAN-COV-LINUX // -// CHECK-LSAN-COV-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-LSAN-COV-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-LSAN-COV-LINUX-NOT: "-lc" // CHECK-LSAN-COV-LINUX-NOT: libclang_rt.ubsan // CHECK-LSAV-COV-LINUX: libclang_rt.lsan-x86_64.a" @@ -675,7 +675,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-LSAN-ASAN-LINUX -// CHECK-LSAN-ASAN-LINUX: "{{.*}}ld{{(.exe)?}}" +// CHECK-LSAN-ASAN-LINUX: "{{.*}}ld" // CHECK-LSAN-ASAN-LINUX-NOT: libclang_rt.lsan // CHECK-LSAN-ASAN-LINUX: libclang_rt.asan // CHECK-LSAN-ASAN-LINUX-NOT: libclang_rt.lsan @@ -685,7 +685,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-COV-LINUX -// CHECK-ASAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}" +// CHECK-ASAN-COV-LINUX: "{{.*}}ld" // CHECK-ASAN-COV-LINUX: "--whole-archive" 
"{{.*}}libclang_rt.asan.a" "--no-whole-archive" // CHECK-ASAN-COV-LINUX-NOT: libclang_rt.ubsan // CHECK-ASAN-COV-LINUX-NOT: "-lstdc++" @@ -697,7 +697,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-MSAN-COV-LINUX -// CHECK-MSAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}" +// CHECK-MSAN-COV-LINUX: "{{.*}}ld" // CHECK-MSAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.msan.a" "--no-whole-archive" // CHECK-MSAN-COV-LINUX-NOT: libclang_rt.ubsan // CHECK-MSAN-COV-LINUX-NOT: "-lstdc++" @@ -709,7 +709,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-DFSAN-COV-LINUX -// CHECK-DFSAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}" +// CHECK-DFSAN-COV-LINUX: "{{.*}}ld" // CHECK-DFSAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.dfsan.a" "--no-whole-archive" // CHECK-DFSAN-COV-LINUX-NOT: libclang_rt.ubsan // CHECK-DFSAN-COV-LINUX-NOT: "-lstdc++" @@ -721,7 +721,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-COV-LINUX -// CHECK-UBSAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}" +// CHECK-UBSAN-COV-LINUX: "{{.*}}ld" // CHECK-UBSAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive" // CHECK-UBSAN-COV-LINUX-NOT: "-lstdc++" // CHECK-UBSAN-COV-LINUX: "-lpthread" @@ -732,7 +732,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-COV-LINUX -// CHECK-COV-LINUX: "{{.*}}ld{{(.exe)?}}" +// CHECK-COV-LINUX: "{{.*}}ld" // CHECK-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive" // CHECK-COV-LINUX-NOT: "-lstdc++" // CHECK-COV-LINUX: "-lpthread" @@ -744,7 +744,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-NSAN-LINUX // -// 
CHECK-NSAN-LINUX: "{{.*}}ld{{(.exe)?}}" +// CHECK-NSAN-LINUX: "{{.*}}ld" // CHECK-NSAN-LINUX-NOT: "-lc" // CHECK-NSAN-LINUX-NOT: libclang_rt.ubsan // CHECK-NSAN-LINUX: libclang_rt.nsan.a" @@ -774,7 +774,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-CFI-LINUX -// CHECK-CFI-LINUX: "{{.*}}ld{{(.exe)?}}" +// CHECK-CFI-LINUX: "{{.*}}ld" // CHECK-CFI-LINUX-NOT: libclang_rt. // CFI with diagnostics links the UBSan runtime. @@ -784,7 +784,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-CFI-DIAG-LINUX -// CHECK-CFI-DIAG-LINUX: "{{.*}}ld{{(.exe)?}}" +// CHECK-CFI-DIAG-LINUX: "{{.*}}ld" // CHECK-CFI-DIAG-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive" // Cross-DSO CFI links the CFI runtime. @@ -793,7 +793,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-CFI-CROSS-DSO-LINUX -// CHECK-CFI-CROSS-DSO-LINUX: "{{.*}}ld{{(.exe)?}}" +// CHECK-CFI-CROSS-DSO-LINUX: "{{.*}}ld" // CHECK-CFI-CROSS-DSO-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.cfi.a" "--no-whole-archive" // CHECK-CFI-CROSS-DSO-LINUX: -export-dynamic @@ -804,7 +804,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-CFI-CROSS-DSO-DIAG-LINUX -// CHECK-CFI-CROSS-DSO-DIAG-LINUX: "{{.*}}ld{{(.exe)?}}" +// CHECK-CFI-CROSS-DSO-DIAG-LINUX: "{{.*}}ld" // CHECK-CFI-CROSS-DSO-DIAG-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.cfi_diag.a" "--no-whole-archive" // CHECK-CFI-CROSS-DSO-DIAG-LINUX: -export-dynamic @@ -834,7 +834,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-DARWIN106-CXX -// CHECK-ASAN-DARWIN106-CXX: 
"{{.*}}ld{{(.exe)?}}" +// CHECK-ASAN-DARWIN106-CXX: "{{.*}}ld" // CHECK-ASAN-DARWIN106-CXX: libclang_rt.asan_osx_dynamic.dylib // CHECK-ASAN-DARWIN106-CXX-NOT: -lc++abi @@ -844,7 +844,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-LSAN-DARWIN106-CXX -// CHECK-LSAN-DARWIN106-CXX: "{{.*}}ld{{(.exe)?}}" +// CHECK-LSAN-DARWIN106-CXX: "{{.*}}ld" // CHECK-LSAN-DARWIN106-CXX: libclang_rt.lsan_osx_dynamic.dylib // CHECK-LSAN-DARWIN106-CXX-NOT: -lc++abi @@ -854,7 +854,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-SAFESTACK-LINUX // -// CHECK-SAFESTACK-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-SAFESTACK-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-SAFESTACK-LINUX-NOT: "-lc" // CHECK-SAFESTACK-LINUX-NOT: whole-archive // CHECK-SAFESTACK-LINUX: "-u" "__safestack_init" @@ -918,7 +918,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-CFI-STATS-LINUX -// CHECK-CFI-STATS-LINUX: "{{.*}}ld{{(.exe)?}}" +// CHECK-CFI-STATS-LINUX: "{{.*}}ld" // CHECK-CFI-STATS-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.stats_client.a" "--no-whole-archive" // CHECK-CFI-STATS-LINUX-NOT: "--whole-archive" // CHECK-CFI-STATS-LINUX: "{{[^"]*}}libclang_rt.stats.a" @@ -927,7 +927,7 @@ // RUN: --target=x86_64-apple-darwin -fuse-ld=ld \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-CFI-STATS-DARWIN -// CHECK-CFI-STATS-DARWIN: "{{.*}}ld{{(.exe)?}}" +// CHECK-CFI-STATS-DARWIN: "{{.*}}ld" // CHECK-CFI-STATS-DARWIN: "{{[^"]*}}libclang_rt.stats_client_osx.a" // CHECK-CFI-STATS-DARWIN: "{{[^"]*}}libclang_rt.stats_osx_dynamic.dylib" @@ -1056,7 +1056,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-SCUDO-LINUX -// 
CHECK-SCUDO-LINUX: "{{.*}}ld{{(.exe)?}}" +// CHECK-SCUDO-LINUX: "{{.*}}ld" // CHECK-SCUDO-LINUX: "--whole-archive" "{{.*}}libclang_rt.scudo_standalone.a" "--no-whole-archive" // CHECK-SCUDO-LINUX-NOT: "-lstdc++" // CHECK-SCUDO-LINUX: "-lpthread" @@ -1069,7 +1069,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-SCUDO-SHARED-LINUX // -// CHECK-SCUDO-SHARED-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-SCUDO-SHARED-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-SCUDO-SHARED-LINUX-NOT: "-lc" // CHECK-SCUDO-SHARED-LINUX-NOT: libclang_rt.scudo_standalone.a" // CHECK-SCUDO-SHARED-LINUX: libclang_rt.scudo_standalone.so" @@ -1113,7 +1113,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-HWASAN-X86-64-LINUX // -// CHECK-HWASAN-X86-64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-HWASAN-X86-64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-HWASAN-X86-64-LINUX-NOT: "-lc" // CHECK-HWASAN-X86-64-LINUX: libclang_rt.hwasan.a" // CHECK-HWASAN-X86-64-LINUX-NOT: "--export-dynamic" @@ -1130,7 +1130,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-SHARED-HWASAN-X86-64-LINUX // -// CHECK-SHARED-HWASAN-X86-64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-SHARED-HWASAN-X86-64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-SHARED-HWASAN-X86-64-LINUX-NOT: "-lc" // CHECK-SHARED-HWASAN-X86-64-LINUX: libclang_rt.hwasan.so" // CHECK-SHARED-HWASAN-X86-64-LINUX-NOT: "-lpthread" @@ -1146,7 +1146,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-DSO-SHARED-HWASAN-X86-64-LINUX // -// CHECK-DSO-SHARED-HWASAN-X86-64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-DSO-SHARED-HWASAN-X86-64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-DSO-SHARED-HWASAN-X86-64-LINUX-NOT: "-lc" // CHECK-DSO-SHARED-HWASAN-X86-64-LINUX: libclang_rt.hwasan.so" // CHECK-DSO-SHARED-HWASAN-X86-64-LINUX-NOT: 
"-lpthread" @@ -1162,7 +1162,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-HWASAN-AARCH64-LINUX // -// CHECK-HWASAN-AARCH64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-HWASAN-AARCH64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-HWASAN-AARCH64-LINUX-NOT: "-lc" // CHECK-HWASAN-AARCH64-LINUX: libclang_rt.hwasan.a" // CHECK-HWASAN-AARCH64-LINUX-NOT: "--export-dynamic" @@ -1180,7 +1180,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-SHARED-HWASAN-AARCH64-LINUX // -// CHECK-SHARED-HWASAN-AARCH64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-SHARED-HWASAN-AARCH64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-SHARED-HWASAN-AARCH64-LINUX-NOT: "-lc" // CHECK-SHARED-HWASAN-AARCH64-LINUX: libclang_rt.hwasan.so" // CHECK-SHARED-HWASAN-AARCH64-LINUX-NOT: "-lpthread" @@ -1196,7 +1196,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX // -// CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" +// CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX-NOT: "-lc" // CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX: libclang_rt.hwasan.so" // CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX-NOT: "-lpthread" From 7deaed975eecf78797649b731506478f3d6ccd59 Mon Sep 17 00:00:00 2001 From: Prabhuk Date: Thu, 26 Dec 2024 15:55:39 -0800 Subject: [PATCH 093/567] [libc] Reduce binary size for baremetal targets (#121164) For `math` functions we must choose size optimized implementations. Removing framepointers will also help with binary size savings. 
--- libc/config/baremetal/config.json | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/libc/config/baremetal/config.json b/libc/config/baremetal/config.json index dc4b0517d938d..85e80879d498e 100644 --- a/libc/config/baremetal/config.json +++ b/libc/config/baremetal/config.json @@ -25,5 +25,15 @@ "LIBC_CONF_QSORT_IMPL": { "value": "LIBC_QSORT_HEAP_SORT" } + }, + "math": { + "LIBC_CONF_MATH_OPTIMIZATIONS": { + "value": "(LIBC_MATH_SKIP_ACCURATE_PASS | LIBC_MATH_SMALL_TABLES)" + } + }, + "codegen": { + "LIBC_CONF_KEEP_FRAME_POINTER": { + "value": false + } } } From 5d529c32cc2d5342a0d183881b6c3023435ed5d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20Poussineau?= Date: Fri, 27 Dec 2024 01:43:19 +0100 Subject: [PATCH 094/567] [llvm-lib] Handle MIPS architecture (#121007) - add a test to check values for /machine argument - add a test to check if machine is correctly inferred from inputs --- llvm/lib/Object/WindowsMachineFlag.cpp | 2 ++ llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp | 3 +++ llvm/test/tools/llvm-lib/Inputs/mips.ll | 7 +++++++ llvm/test/tools/llvm-lib/infer-machine.test | 19 +++++++++++++++++++ llvm/test/tools/llvm-lib/machine-opt.test | 13 +++++++++++++ 5 files changed, 44 insertions(+) create mode 100644 llvm/test/tools/llvm-lib/Inputs/mips.ll create mode 100644 llvm/test/tools/llvm-lib/infer-machine.test create mode 100644 llvm/test/tools/llvm-lib/machine-opt.test diff --git a/llvm/lib/Object/WindowsMachineFlag.cpp b/llvm/lib/Object/WindowsMachineFlag.cpp index b9f818775768a..caf357e8c136f 100644 --- a/llvm/lib/Object/WindowsMachineFlag.cpp +++ b/llvm/lib/Object/WindowsMachineFlag.cpp @@ -21,6 +21,7 @@ using namespace llvm; // Returns /machine's value. COFF::MachineTypes llvm::getMachineType(StringRef S) { + // Flags must be a superset of Microsoft lib.exe /machine flags. 
return StringSwitch(S.lower()) .Cases("x64", "amd64", COFF::IMAGE_FILE_MACHINE_AMD64) .Cases("x86", "i386", COFF::IMAGE_FILE_MACHINE_I386) @@ -28,6 +29,7 @@ COFF::MachineTypes llvm::getMachineType(StringRef S) { .Case("arm64", COFF::IMAGE_FILE_MACHINE_ARM64) .Case("arm64ec", COFF::IMAGE_FILE_MACHINE_ARM64EC) .Case("arm64x", COFF::IMAGE_FILE_MACHINE_ARM64X) + .Case("mips", COFF::IMAGE_FILE_MACHINE_R4000) .Default(COFF::IMAGE_FILE_MACHINE_UNKNOWN); } diff --git a/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp b/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp index 138d9fc7f1d7f..6ce06b434b2c0 100644 --- a/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp +++ b/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp @@ -171,6 +171,7 @@ static Expected getCOFFFileMachine(MemoryBufferRef MB) { uint16_t Machine = (*Obj)->getMachine(); if (Machine != COFF::IMAGE_FILE_MACHINE_I386 && Machine != COFF::IMAGE_FILE_MACHINE_AMD64 && + Machine != COFF::IMAGE_FILE_MACHINE_R4000 && Machine != COFF::IMAGE_FILE_MACHINE_ARMNT && !COFF::isAnyArm64(Machine)) { return createStringError(inconvertibleErrorCode(), "unknown machine: " + std::to_string(Machine)); @@ -195,6 +196,8 @@ static Expected getBitcodeFileMachine(MemoryBufferRef MB) { case Triple::aarch64: return T.isWindowsArm64EC() ? 
COFF::IMAGE_FILE_MACHINE_ARM64EC : COFF::IMAGE_FILE_MACHINE_ARM64; + case Triple::mipsel: + return COFF::IMAGE_FILE_MACHINE_R4000; default: return createStringError(inconvertibleErrorCode(), "unknown arch in target triple: " + *TripleStr); diff --git a/llvm/test/tools/llvm-lib/Inputs/mips.ll b/llvm/test/tools/llvm-lib/Inputs/mips.ll new file mode 100644 index 0000000000000..dd0f8338cfa97 --- /dev/null +++ b/llvm/test/tools/llvm-lib/Inputs/mips.ll @@ -0,0 +1,7 @@ +target triple = "mipsel-windows-coff" + +; Function Attrs: noinline nounwind optnone +define dso_local void @"?f@@YAXXZ"() #0 { +entry: + ret void +} diff --git a/llvm/test/tools/llvm-lib/infer-machine.test b/llvm/test/tools/llvm-lib/infer-machine.test new file mode 100644 index 0000000000000..c1399c617af40 --- /dev/null +++ b/llvm/test/tools/llvm-lib/infer-machine.test @@ -0,0 +1,19 @@ +RUN: rm -rf %t && mkdir -p %t + +RUN: llc -mtriple=i386-windows-coff -filetype=obj -o %t/i386.obj %S/Inputs/i386.ll +RUN: llvm-as %S/Inputs/i386.ll -o %t/i386.bc +RUN: llvm-lib %t/i386.obj %t/i386.bc /out:%t/i386.lib +RUN: llvm-objdump -h %t/i386.lib | FileCheck %s --check-prefix=I386 +I386: file format coff-i386 + +RUN: llc -mtriple=x86_64-windows-coff -filetype=obj -o %t/x86_64.obj %S/Inputs/x86_64.ll +RUN: llvm-as %S/Inputs/x86_64.ll -o %t/x86_64.bc +RUN: llvm-lib %t/x86_64.obj %t/x86_64.bc /out:%t/x86_64.lib +RUN: llvm-objdump -h %t/x86_64.lib | FileCheck %s --check-prefix=X86_64 +X86_64: file format coff-x86-64 + +RUN: llc -mtriple=mipsel-windows-coff -filetype=obj -o %t/mips.obj %S/Inputs/mips.ll +RUN: llvm-as %S/Inputs/mips.ll -o %t/mips.bc +RUN: llvm-lib %t/mips.obj %t/mips.bc /out:%t/mips.lib +RUN: llvm-objdump -h %t/mips.lib | FileCheck %s --check-prefix=MIPS +MIPS: file format coff-mips diff --git a/llvm/test/tools/llvm-lib/machine-opt.test b/llvm/test/tools/llvm-lib/machine-opt.test new file mode 100644 index 0000000000000..e5ade82c2f0a6 --- /dev/null +++ b/llvm/test/tools/llvm-lib/machine-opt.test @@ -0,0 
+1,13 @@ +RUN: rm -f %t.lib + +RUN: llvm-lib /out:%t.lib /machine:i386 2>&1 | FileCheck --check-prefix=EMPTYWARN %s +RUN: llvm-lib /out:%t.lib /machine:amd64 2>&1 | FileCheck --check-prefix=EMPTYWARN %s + +RUN: llvm-lib /out:%t.lib /machine:mips 2>&1 | FileCheck --check-prefix=EMPTYWARN %s + +RUN: llvm-lib /out:%t.lib /machine:arm 2>&1 | FileCheck --check-prefix=EMPTYWARN %s +RUN: llvm-lib /out:%t.lib /machine:arm64 2>&1 | FileCheck --check-prefix=EMPTYWARN %s +RUN: llvm-lib /out:%t.lib /machine:arm64x 2>&1 | FileCheck --check-prefix=EMPTYWARN %s + +EMPTYWARN: warning: no input files, not writing output file + From 6b53a9546c56c805eaf86ac564083bf846570312 Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Thu, 26 Dec 2024 17:38:18 -0800 Subject: [PATCH 095/567] [mlir][arith] DCE `getPredicateByName` (#121165) --- mlir/include/mlir/Dialect/Arith/IR/ArithOps.td | 8 -------- 1 file changed, 8 deletions(-) diff --git a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td index 2f71caaa593a6..0722ff68d890d 100644 --- a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td +++ b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td @@ -1499,10 +1499,6 @@ def Arith_CmpIOp SignlessIntegerLikeOfAnyRank:$lhs, SignlessIntegerLikeOfAnyRank:$rhs); - let extraClassDeclaration = [{ - static arith::CmpIPredicate getPredicateByName(StringRef name); - }]; - let hasFolder = 1; let hasCanonicalizer = 1; } @@ -1546,10 +1542,6 @@ def Arith_CmpFOp : Arith_CompareOp<"cmpf", DefaultValuedAttr< Arith_FastMathAttr, "::mlir::arith::FastMathFlags::none">:$fastmath); - let extraClassDeclaration = [{ - static arith::CmpFPredicate getPredicateByName(StringRef name); - }]; - let hasFolder = 1; let hasCanonicalizer = 1; let assemblyFormat = [{ $predicate `,` $lhs `,` $rhs (`fastmath` `` $fastmath^)? 
From 9d3f9f47e6e630b8308562297757e0911be03a18 Mon Sep 17 00:00:00 2001 From: YunQiang Su Date: Fri, 27 Dec 2024 09:57:42 +0800 Subject: [PATCH 096/567] Revert "[llvm-lib] Handle MIPS architecture (#121007)" This reverts commit 5d529c32cc2d5342a0d183881b6c3023435ed5d3. --- llvm/lib/Object/WindowsMachineFlag.cpp | 2 -- llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp | 3 --- llvm/test/tools/llvm-lib/Inputs/mips.ll | 7 ------- llvm/test/tools/llvm-lib/infer-machine.test | 19 ------------------- llvm/test/tools/llvm-lib/machine-opt.test | 13 ------------- 5 files changed, 44 deletions(-) delete mode 100644 llvm/test/tools/llvm-lib/Inputs/mips.ll delete mode 100644 llvm/test/tools/llvm-lib/infer-machine.test delete mode 100644 llvm/test/tools/llvm-lib/machine-opt.test diff --git a/llvm/lib/Object/WindowsMachineFlag.cpp b/llvm/lib/Object/WindowsMachineFlag.cpp index caf357e8c136f..b9f818775768a 100644 --- a/llvm/lib/Object/WindowsMachineFlag.cpp +++ b/llvm/lib/Object/WindowsMachineFlag.cpp @@ -21,7 +21,6 @@ using namespace llvm; // Returns /machine's value. COFF::MachineTypes llvm::getMachineType(StringRef S) { - // Flags must be a superset of Microsoft lib.exe /machine flags. 
return StringSwitch(S.lower()) .Cases("x64", "amd64", COFF::IMAGE_FILE_MACHINE_AMD64) .Cases("x86", "i386", COFF::IMAGE_FILE_MACHINE_I386) @@ -29,7 +28,6 @@ COFF::MachineTypes llvm::getMachineType(StringRef S) { .Case("arm64", COFF::IMAGE_FILE_MACHINE_ARM64) .Case("arm64ec", COFF::IMAGE_FILE_MACHINE_ARM64EC) .Case("arm64x", COFF::IMAGE_FILE_MACHINE_ARM64X) - .Case("mips", COFF::IMAGE_FILE_MACHINE_R4000) .Default(COFF::IMAGE_FILE_MACHINE_UNKNOWN); } diff --git a/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp b/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp index 6ce06b434b2c0..138d9fc7f1d7f 100644 --- a/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp +++ b/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp @@ -171,7 +171,6 @@ static Expected getCOFFFileMachine(MemoryBufferRef MB) { uint16_t Machine = (*Obj)->getMachine(); if (Machine != COFF::IMAGE_FILE_MACHINE_I386 && Machine != COFF::IMAGE_FILE_MACHINE_AMD64 && - Machine != COFF::IMAGE_FILE_MACHINE_R4000 && Machine != COFF::IMAGE_FILE_MACHINE_ARMNT && !COFF::isAnyArm64(Machine)) { return createStringError(inconvertibleErrorCode(), "unknown machine: " + std::to_string(Machine)); @@ -196,8 +195,6 @@ static Expected getBitcodeFileMachine(MemoryBufferRef MB) { case Triple::aarch64: return T.isWindowsArm64EC() ? 
COFF::IMAGE_FILE_MACHINE_ARM64EC : COFF::IMAGE_FILE_MACHINE_ARM64; - case Triple::mipsel: - return COFF::IMAGE_FILE_MACHINE_R4000; default: return createStringError(inconvertibleErrorCode(), "unknown arch in target triple: " + *TripleStr); diff --git a/llvm/test/tools/llvm-lib/Inputs/mips.ll b/llvm/test/tools/llvm-lib/Inputs/mips.ll deleted file mode 100644 index dd0f8338cfa97..0000000000000 --- a/llvm/test/tools/llvm-lib/Inputs/mips.ll +++ /dev/null @@ -1,7 +0,0 @@ -target triple = "mipsel-windows-coff" - -; Function Attrs: noinline nounwind optnone -define dso_local void @"?f@@YAXXZ"() #0 { -entry: - ret void -} diff --git a/llvm/test/tools/llvm-lib/infer-machine.test b/llvm/test/tools/llvm-lib/infer-machine.test deleted file mode 100644 index c1399c617af40..0000000000000 --- a/llvm/test/tools/llvm-lib/infer-machine.test +++ /dev/null @@ -1,19 +0,0 @@ -RUN: rm -rf %t && mkdir -p %t - -RUN: llc -mtriple=i386-windows-coff -filetype=obj -o %t/i386.obj %S/Inputs/i386.ll -RUN: llvm-as %S/Inputs/i386.ll -o %t/i386.bc -RUN: llvm-lib %t/i386.obj %t/i386.bc /out:%t/i386.lib -RUN: llvm-objdump -h %t/i386.lib | FileCheck %s --check-prefix=I386 -I386: file format coff-i386 - -RUN: llc -mtriple=x86_64-windows-coff -filetype=obj -o %t/x86_64.obj %S/Inputs/x86_64.ll -RUN: llvm-as %S/Inputs/x86_64.ll -o %t/x86_64.bc -RUN: llvm-lib %t/x86_64.obj %t/x86_64.bc /out:%t/x86_64.lib -RUN: llvm-objdump -h %t/x86_64.lib | FileCheck %s --check-prefix=X86_64 -X86_64: file format coff-x86-64 - -RUN: llc -mtriple=mipsel-windows-coff -filetype=obj -o %t/mips.obj %S/Inputs/mips.ll -RUN: llvm-as %S/Inputs/mips.ll -o %t/mips.bc -RUN: llvm-lib %t/mips.obj %t/mips.bc /out:%t/mips.lib -RUN: llvm-objdump -h %t/mips.lib | FileCheck %s --check-prefix=MIPS -MIPS: file format coff-mips diff --git a/llvm/test/tools/llvm-lib/machine-opt.test b/llvm/test/tools/llvm-lib/machine-opt.test deleted file mode 100644 index e5ade82c2f0a6..0000000000000 --- a/llvm/test/tools/llvm-lib/machine-opt.test +++ /dev/null 
@@ -1,13 +0,0 @@ -RUN: rm -f %t.lib - -RUN: llvm-lib /out:%t.lib /machine:i386 2>&1 | FileCheck --check-prefix=EMPTYWARN %s -RUN: llvm-lib /out:%t.lib /machine:amd64 2>&1 | FileCheck --check-prefix=EMPTYWARN %s - -RUN: llvm-lib /out:%t.lib /machine:mips 2>&1 | FileCheck --check-prefix=EMPTYWARN %s - -RUN: llvm-lib /out:%t.lib /machine:arm 2>&1 | FileCheck --check-prefix=EMPTYWARN %s -RUN: llvm-lib /out:%t.lib /machine:arm64 2>&1 | FileCheck --check-prefix=EMPTYWARN %s -RUN: llvm-lib /out:%t.lib /machine:arm64x 2>&1 | FileCheck --check-prefix=EMPTYWARN %s - -EMPTYWARN: warning: no input files, not writing output file - From 47e1c87a613d7453b6d5addc2e23e26bea10c0ce Mon Sep 17 00:00:00 2001 From: Elvis Wang Date: Fri, 27 Dec 2024 10:37:21 +0800 Subject: [PATCH 097/567] [VPlan] Set debug location for VPReduction/VPWidenIntrinsicRecipe. (#120054) This patch add missing debug location for VPReduction/VPWidenIntrinsicRecipe. --- .../Transforms/Vectorize/LoopVectorize.cpp | 6 +-- llvm/lib/Transforms/Vectorize/VPlan.h | 16 ++++---- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 1 + .../LoopVectorize/RISCV/preserve-dbg-loc.ll | 39 +++++++++++++++++++ .../preserve-dbg-loc-and-loop-metadata.ll | 26 +++++++++++++ .../preserve-dbg-loc-reduction-inloop.ll | 34 ++++++++++++++++ 6 files changed, 112 insertions(+), 10 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/preserve-dbg-loc.ll create mode 100644 llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-reduction-inloop.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index cb828b738d310..1b00e15ea28b7 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9739,9 +9739,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( if (CM.blockNeedsPredicationForAnyReason(BB)) CondOp = RecipeBuilder.getBlockInMask(BB); - VPReductionRecipe *RedRecipe = - new 
VPReductionRecipe(RdxDesc, CurrentLinkI, PreviousLink, VecOp, - CondOp, CM.useOrderedReductions(RdxDesc)); + auto *RedRecipe = new VPReductionRecipe( + RdxDesc, CurrentLinkI, PreviousLink, VecOp, CondOp, + CM.useOrderedReductions(RdxDesc), CurrentLinkI->getDebugLoc()); // Append the recipe to the end of the VPBasicBlock because we need to // ensure that it comes after all of it's inputs, including CondOp. // Note that this transformation may leave over dead recipes (including diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 606780fa7dd5c..e2c0ff7954675 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1654,7 +1654,7 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags { VPWidenIntrinsicRecipe(Intrinsic::ID VectorIntrinsicID, ArrayRef CallArguments, Type *Ty, DebugLoc DL = {}) - : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments), + : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, DL), VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty) { LLVMContext &Ctx = Ty->getContext(); AttributeList Attrs = Intrinsic::getAttributes(Ctx, VectorIntrinsicID); @@ -2648,8 +2648,9 @@ class VPReductionRecipe : public VPSingleDefRecipe { protected: VPReductionRecipe(const unsigned char SC, const RecurrenceDescriptor &R, Instruction *I, ArrayRef Operands, - VPValue *CondOp, bool IsOrdered) - : VPSingleDefRecipe(SC, Operands, I), RdxDesc(R), IsOrdered(IsOrdered) { + VPValue *CondOp, bool IsOrdered, DebugLoc DL) + : VPSingleDefRecipe(SC, Operands, I, DL), RdxDesc(R), + IsOrdered(IsOrdered) { if (CondOp) { IsConditional = true; addOperand(CondOp); @@ -2659,16 +2660,17 @@ class VPReductionRecipe : public VPSingleDefRecipe { public: VPReductionRecipe(const RecurrenceDescriptor &R, Instruction *I, VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp, - bool IsOrdered) + bool IsOrdered, DebugLoc DL = {}) : VPReductionRecipe(VPDef::VPReductionSC, R, I, 
ArrayRef({ChainOp, VecOp}), CondOp, - IsOrdered) {} + IsOrdered, DL) {} ~VPReductionRecipe() override = default; VPReductionRecipe *clone() override { return new VPReductionRecipe(RdxDesc, getUnderlyingInstr(), getChainOp(), - getVecOp(), getCondOp(), IsOrdered); + getVecOp(), getCondOp(), IsOrdered, + getDebugLoc()); } static inline bool classof(const VPRecipeBase *R) { @@ -2723,7 +2725,7 @@ class VPReductionEVLRecipe : public VPReductionRecipe { VPDef::VPReductionEVLSC, R.getRecurrenceDescriptor(), cast_or_null(R.getUnderlyingValue()), ArrayRef({R.getChainOp(), R.getVecOp(), &EVL}), CondOp, - R.isOrdered()) {} + R.isOrdered(), R.getDebugLoc()) {} ~VPReductionEVLRecipe() override = default; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 86262e6b9f94e..7fa5481fb3c95 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2175,6 +2175,7 @@ void VPReductionRecipe::execute(VPTransformState &State) { // Propagate the fast-math flags carried by the underlying instruction. 
IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); + State.setDebugLocFrom(getDebugLoc()); Value *NewVecOp = State.get(getVecOp()); if (VPValue *Cond = getCondOp()) { Value *NewCond = State.get(Cond, State.VF.isScalar()); diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/preserve-dbg-loc.ll b/llvm/test/Transforms/LoopVectorize/RISCV/preserve-dbg-loc.ll new file mode 100644 index 0000000000000..93bd44f5c6220 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/preserve-dbg-loc.ll @@ -0,0 +1,39 @@ +; RUN: opt -passes=debugify,loop-vectorize \ +; RUN: -force-tail-folding-style=data-with-evl \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -S < %s 2>&1 | FileCheck --check-prefix=DEBUGLOC %s + +; Testing the debug locations of the generated vector intrinsic is same as +; its scalar counterpart. + +define void @vp_select(ptr %a, ptr %b, ptr %c, i64 %N) { +; DEBUGLOC-LABEL: define void @vp_select( +; DEBUGLOC: vector.body: +; DEBUGLOC: = call @llvm.vp.select.nxv4i32( %{{.+}}, %{{.+}}, %{{.+}}, i32 %{{.+}}), !dbg ![[SELLOC:[0-9]+]] +; DEBUGLOC: loop: +; DEBUGLOC: = select i1 %{{.+}}, i32 %{{.+}}, i32 %{{.+}}, !dbg ![[SELLOC]] +; + entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv + %load.b = load i32, ptr %gep.b, align 4 + %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv + %load.c = load i32, ptr %gep.c, align 4 + %cmp = icmp sgt i32 %load.b, %load.c + %neg.c = sub i32 0, %load.c + %sel = select i1 %cmp, i32 %load.c, i32 %neg.c + %add = add i32 %sel, %load.b + %gep.a = getelementptr inbounds i32, ptr %a, i64 %iv + store i32 %add, ptr %gep.a, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, %N + br i1 %exitcond, label %exit, label %loop + + exit: + ret void + } + + ; DEBUGLOC: [[SELLOC]] = 
!DILocation(line: 9 diff --git a/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll b/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll index 5052ba8117751..bb8e19e3175f1 100644 --- a/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll +++ b/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll @@ -109,6 +109,31 @@ exit: ret void } +define void @widen_intrinsic_dbg(i64 %n, ptr %y, ptr %x) { +; DEBUGLOC-LABEL: define void @widen_intrinsic_dbg( +; DEBUGLOC: vector.body: +; DEBUGLOC: = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{.+}}), !dbg ![[INTRINSIC_LOC:[0-9]+]] +; DEBUGLOC: loop: +; DEBUGLOC: = call float @llvm.sqrt.f32(float %{{.+}}), !dbg ![[INTRINSIC_LOC]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep.y = getelementptr inbounds float, ptr %y, i64 %iv + %load = load float, ptr %gep.y, align 4 + %call = call float @llvm.sqrt.f32(float %load) + %gep.x = getelementptr inbounds float, ptr %x, i64 %iv + store float %call, ptr %gep.x, align 4 + %iv.next = add i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, %n + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} + !0 = !{!0, !1} !1 = !{!"llvm.loop.vectorize.width", i32 4} ; CHECK-NOT: !{metadata !"llvm.loop.vectorize.width", i32 4} @@ -116,3 +141,4 @@ exit: ; DEBUGLOC: ![[RESUMELOC]] = !DILocation(line: 2 ; DEBUGLOC: ![[PTRIVLOC]] = !DILocation(line: 12 +; DEBUGLOC: ![[INTRINSIC_LOC]] = !DILocation(line: 44 diff --git a/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-reduction-inloop.ll new file mode 100644 index 0000000000000..57f0dc205dba1 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-reduction-inloop.ll @@ -0,0 +1,34 @@ +; RUN: opt < %s -passes=debugify,loop-vectorize -force-vector-width=4 -prefer-inloop-reductions -S | FileCheck %s -check-prefix DEBUGLOC + +; 
Testing the debug locations of the generated vector intstructions are same as +; their scalar counterpart. + +define i32 @reduction_sum(ptr %A, ptr %B) { +; DEBUGLOC-LABEL: define i32 @reduction_sum( +; DEBUGLOC: vector.body: +; DEBUGLOC: = load <4 x i32>, ptr %{{.+}}, align 4, !dbg ![[LOADLOC:[0-9]+]] +; DEBUGLOC: = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %{{.+}}), !dbg ![[REDLOC:[0-9]+]] +; DEBUGLOC: loop: +; DEBUGLOC: %[[LOAD:.+]] = load i32, ptr %{{.+}}, align 4, !dbg ![[LOADLOC]] +; DEBUGLOC: = add i32 %{{.+}}, %[[LOAD]], !dbg ![[REDLOC]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %red = phi i32 [ 0, %entry ], [ %red.next, %loop ] + %gep = getelementptr inbounds i32, ptr %A, i64 %iv + %load = load i32, ptr %gep, align 4 + %red.next = add i32 %red, %load + %iv.next = add i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 256 + br i1 %exitcond, label %exit, label %loop + +exit: + %red.lcssa = phi i32 [ %red.next, %loop ] + ret i32 %red.lcssa +} + +; DEBUGLOC: ![[LOADLOC]] = !DILocation(line: 5 +; DEBUGLOC: ![[REDLOC]] = !DILocation(line: 6 From 179344d9a85934ff83bed1f657c91d4c1ba12460 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 26 Dec 2024 18:53:42 -0800 Subject: [PATCH 098/567] [MC] Move AIX specific function to PPCAsmPrinter https://reviews.llvm.org/D95518 used switchSectionNoPrint, which seems buggy as .ll -> .s -> .o will be different from .ll -> .o, but this change intends to be a NFC. 
--- llvm/include/llvm/MC/MCStreamer.h | 3 --- llvm/lib/MC/MCAsmStreamer.cpp | 14 -------------- llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp | 12 +++++++++--- 3 files changed, 9 insertions(+), 20 deletions(-) diff --git a/llvm/include/llvm/MC/MCStreamer.h b/llvm/include/llvm/MC/MCStreamer.h index 9115dcd2cb716..21da4dac4872b 100644 --- a/llvm/include/llvm/MC/MCStreamer.h +++ b/llvm/include/llvm/MC/MCStreamer.h @@ -1137,9 +1137,6 @@ class MCStreamer { const MCSymbol *LastLabel, const MCSymbol *Label, unsigned PointerSize) {} - - /// Do finalization for the streamer at the end of a section. - virtual void doFinalizationAtSectionEnd(MCSection *Section) {} }; /// Create a dummy machine code streamer, which does nothing. This is useful for diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp index 32f1d63218749..01fe11ed20501 100644 --- a/llvm/lib/MC/MCAsmStreamer.cpp +++ b/llvm/lib/MC/MCAsmStreamer.cpp @@ -442,8 +442,6 @@ class MCAsmStreamer final : public MCStreamer { void emitDwarfAdvanceLineAddr(int64_t LineDelta, const MCSymbol *LastLabel, const MCSymbol *Label, unsigned PointerSize) override; - - void doFinalizationAtSectionEnd(MCSection *Section) override; }; } // end anonymous namespace. @@ -2679,18 +2677,6 @@ void MCAsmStreamer::emitDwarfAdvanceLineAddr(int64_t LineDelta, emitIntValue(dwarf::DW_LNS_copy, 1); } -void MCAsmStreamer::doFinalizationAtSectionEnd(MCSection *Section) { - // Emit section end. This is used to tell the debug line section where the end - // is for a text section if we don't use .loc to represent the debug line. 
- assert(MAI->isAIX()); - switchSectionNoPrint(Section); - - MCSymbol *Sym = getCurrentSectionOnly()->getEndSymbol(getContext()); - - if (!Sym->isInSection()) - emitLabel(Sym); -} - MCStreamer *llvm::createAsmStreamer(MCContext &Context, std::unique_ptr OS, MCInstPrinter *IP, diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 33e07915e735b..162d11058266f 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -3247,9 +3247,15 @@ void PPCAIXAsmPrinter::emitInstruction(const MachineInstr *MI) { bool PPCAIXAsmPrinter::doFinalization(Module &M) { // Do streamer related finalization for DWARF. - if (hasDebugInfo()) - OutStreamer->doFinalizationAtSectionEnd( - OutStreamer->getContext().getObjectFileInfo()->getTextSection()); + if (hasDebugInfo()) { + // Emit section end. This is used to tell the debug line section where the + // end is for a text section if we don't use .loc to represent the debug + // line. + auto *Sec = OutContext.getObjectFileInfo()->getTextSection(); + OutStreamer->switchSectionNoPrint(Sec); + MCSymbol *Sym = Sec->getEndSymbol(OutContext); + OutStreamer->emitLabel(Sym); + } for (MCSymbol *Sym : ExtSymSDNodeSymbols) OutStreamer->emitSymbolAttribute(Sym, MCSA_Extern); From c6ea7fb2f85346eb786f2690355db830c455bfc0 Mon Sep 17 00:00:00 2001 From: Patryk Wychowaniec Date: Fri, 27 Dec 2024 04:44:55 +0100 Subject: [PATCH 099/567] [AVR] Wrap out-of-bounds relative jumps (#118015) This commit improves the relative jumps, so that we are able to emit `rjmp` that wraps around the memory boundary on devices with 8KB flash. 
--- llvm/lib/Target/AVR/AVRDevices.td | 50 +- .../Target/AVR/MCTargetDesc/AVRAsmBackend.cpp | 69 +- .../AVR/branch-relaxation-long-backward.ll | 2081 +++++++++ .../AVR/branch-relaxation-long-forward.ll | 2081 +++++++++ .../CodeGen/AVR/branch-relaxation-long.ll | 4162 ----------------- 5 files changed, 4228 insertions(+), 4215 deletions(-) create mode 100644 llvm/test/CodeGen/AVR/branch-relaxation-long-backward.ll create mode 100644 llvm/test/CodeGen/AVR/branch-relaxation-long-forward.ll delete mode 100644 llvm/test/CodeGen/AVR/branch-relaxation-long.ll diff --git a/llvm/lib/Target/AVR/AVRDevices.td b/llvm/lib/Target/AVR/AVRDevices.td index 5eca92ab4b6c5..56147bb473bc4 100644 --- a/llvm/lib/Target/AVR/AVRDevices.td +++ b/llvm/lib/Target/AVR/AVRDevices.td @@ -60,6 +60,18 @@ def FeatureSmallStack "The device has an 8-bit " "stack pointer">; +// The device potentially requires emitting rjmp that wraps across the flash +// boundary. +// +// We enable this for devices that have exactly 8 kB of flash memory and don't +// support the `jmp` instruction - with this feature enabled, we try to convert +// out-of-bounds relative jumps into in-bounds by wrapping the offset, e.g. +// `rjmp +5000` becomes `rjmp -3192`. +def FeatureWrappingRjmp + : SubtargetFeature<"wrappingrjmp", "HasWrappingRjmp", "true", + "The device potentially requires emitting rjmp that " + "wraps across the flash boundary">; + // The device supports the 16-bit GPR pair MOVW instruction. 
def FeatureMOVW : SubtargetFeature<"movw", "HasMOVW", "true", "The device supports the 16-bit MOVW " @@ -274,11 +286,11 @@ def : Device<"at86rf401", FamilyAVR2, ELFArchAVR25, [FeatureMOVW, FeatureLPMX]>; def : Device<"at90s4414", FamilyAVR2, ELFArchAVR2, [FeatureSmallStack]>; def : Device<"at90s4433", FamilyAVR2, ELFArchAVR2, [FeatureSmallStack]>; def : Device<"at90s4434", FamilyAVR2, ELFArchAVR2, [FeatureSmallStack]>; -def : Device<"at90s8515", FamilyAVR2, ELFArchAVR2>; -def : Device<"at90c8534", FamilyAVR2, ELFArchAVR2>; -def : Device<"at90s8535", FamilyAVR2, ELFArchAVR2>; -def : Device<"ata5272", FamilyAVR25, ELFArchAVR25>; -def : Device<"ata6616c", FamilyAVR25, ELFArchAVR25>; +def : Device<"at90s8515", FamilyAVR2, ELFArchAVR2, [FeatureWrappingRjmp]>; +def : Device<"at90c8534", FamilyAVR2, ELFArchAVR2, [FeatureWrappingRjmp]>; +def : Device<"at90s8535", FamilyAVR2, ELFArchAVR2, [FeatureWrappingRjmp]>; +def : Device<"ata5272", FamilyAVR25, ELFArchAVR25, [FeatureWrappingRjmp]>; +def : Device<"ata6616c", FamilyAVR25, ELFArchAVR25, [FeatureWrappingRjmp]>; def : Device<"attiny13", FamilyAVR25, ELFArchAVR25, [FeatureSmallStack]>; def : Device<"attiny13a", FamilyAVR25, ELFArchAVR25, [FeatureSmallStack]>; def : Device<"attiny2313", FamilyAVR25, ELFArchAVR25, [FeatureSmallStack]>; @@ -288,24 +300,24 @@ def : Device<"attiny24a", FamilyAVR25, ELFArchAVR25, [FeatureSmallStack]>; def : Device<"attiny4313", FamilyAVR25, ELFArchAVR25>; def : Device<"attiny44", FamilyAVR25, ELFArchAVR25>; def : Device<"attiny44a", FamilyAVR25, ELFArchAVR25>; -def : Device<"attiny84", FamilyAVR25, ELFArchAVR25>; -def : Device<"attiny84a", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny84", FamilyAVR25, ELFArchAVR25, [FeatureWrappingRjmp]>; +def : Device<"attiny84a", FamilyAVR25, ELFArchAVR25, [FeatureWrappingRjmp]>; def : Device<"attiny25", FamilyAVR25, ELFArchAVR25, [FeatureSmallStack]>; def : Device<"attiny45", FamilyAVR25, ELFArchAVR25>; -def : Device<"attiny85", FamilyAVR25, ELFArchAVR25>; 
+def : Device<"attiny85", FamilyAVR25, ELFArchAVR25, [FeatureWrappingRjmp]>; def : Device<"attiny261", FamilyAVR25, ELFArchAVR25, [FeatureSmallStack]>; def : Device<"attiny261a", FamilyAVR25, ELFArchAVR25, [FeatureSmallStack]>; def : Device<"attiny441", FamilyAVR25, ELFArchAVR25>; def : Device<"attiny461", FamilyAVR25, ELFArchAVR25>; def : Device<"attiny461a", FamilyAVR25, ELFArchAVR25>; -def : Device<"attiny841", FamilyAVR25, ELFArchAVR25>; -def : Device<"attiny861", FamilyAVR25, ELFArchAVR25>; -def : Device<"attiny861a", FamilyAVR25, ELFArchAVR25>; -def : Device<"attiny87", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny841", FamilyAVR25, ELFArchAVR25, [FeatureWrappingRjmp]>; +def : Device<"attiny861", FamilyAVR25, ELFArchAVR25, [FeatureWrappingRjmp]>; +def : Device<"attiny861a", FamilyAVR25, ELFArchAVR25, [FeatureWrappingRjmp]>; +def : Device<"attiny87", FamilyAVR25, ELFArchAVR25, [FeatureWrappingRjmp]>; def : Device<"attiny43u", FamilyAVR25, ELFArchAVR25>; def : Device<"attiny48", FamilyAVR25, ELFArchAVR25>; -def : Device<"attiny88", FamilyAVR25, ELFArchAVR25>; -def : Device<"attiny828", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny88", FamilyAVR25, ELFArchAVR25, [FeatureWrappingRjmp]>; +def : Device<"attiny828", FamilyAVR25, ELFArchAVR25, [FeatureWrappingRjmp]>; def : Device<"at43usb355", FamilyAVR3, ELFArchAVR3>; def : Device<"at76c711", FamilyAVR3, ELFArchAVR3>; def : Device<"atmega103", FamilyAVR31, ELFArchAVR31>; @@ -321,11 +333,11 @@ def : Device<"atmega16u2", FamilyAVR35, ELFArchAVR35>; def : Device<"atmega32u2", FamilyAVR35, ELFArchAVR35>; def : Device<"attiny1634", FamilyAVR35, ELFArchAVR35>; def : Device<"atmega8", FamilyAVR2, ELFArchAVR4, - [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>; + [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM, FeatureWrappingRjmp]>; def : Device<"ata6289", FamilyAVR4, ELFArchAVR4>; def : Device<"atmega8a", FamilyAVR2, ELFArchAVR4, - [FeatureMultiplication, FeatureMOVW, FeatureLPMX, 
FeatureSPM]>; -def : Device<"ata6285", FamilyAVR4, ELFArchAVR4>; + [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM, FeatureWrappingRjmp]>; +def : Device<"ata6285", FamilyAVR4, ELFArchAVR4, [FeatureWrappingRjmp]>; def : Device<"ata6286", FamilyAVR4, ELFArchAVR4>; def : Device<"ata6612c", FamilyAVR4, ELFArchAVR4>; def : Device<"atmega48", FamilyAVR4, ELFArchAVR4>; @@ -339,9 +351,9 @@ def : Device<"atmega88p", FamilyAVR4, ELFArchAVR4>; def : Device<"atmega88pa", FamilyAVR4, ELFArchAVR4>; def : Device<"atmega88pb", FamilyAVR4, ELFArchAVR4>; def : Device<"atmega8515", FamilyAVR2, ELFArchAVR4, - [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>; + [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM, FeatureWrappingRjmp]>; def : Device<"atmega8535", FamilyAVR2, ELFArchAVR4, - [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>; + [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM, FeatureWrappingRjmp]>; def : Device<"atmega8hva", FamilyAVR4, ELFArchAVR4>; def : Device<"at90pwm1", FamilyAVR4, ELFArchAVR4>; def : Device<"at90pwm2", FamilyAVR4, ELFArchAVR4>; diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp index 17c48c2fc35ff..fd35f8fcb8e7b 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp @@ -27,16 +27,13 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -// FIXME: we should be doing checks to make sure asm operands -// are not out of bounds. 
- namespace adjust { using namespace llvm; static void signed_width(unsigned Width, uint64_t Value, std::string Description, const MCFixup &Fixup, - MCContext *Ctx = nullptr) { + MCContext *Ctx) { if (!isIntN(Width, Value)) { std::string Diagnostic = "out of range " + Description; @@ -46,17 +43,13 @@ static void signed_width(unsigned Width, uint64_t Value, Diagnostic += " (expected an integer in the range " + std::to_string(Min) + " to " + std::to_string(Max) + ")"; - if (Ctx) { - Ctx->reportError(Fixup.getLoc(), Diagnostic); - } else { - llvm_unreachable(Diagnostic.c_str()); - } + Ctx->reportError(Fixup.getLoc(), Diagnostic); } } static void unsigned_width(unsigned Width, uint64_t Value, std::string Description, const MCFixup &Fixup, - MCContext *Ctx = nullptr) { + MCContext *Ctx) { if (!isUIntN(Width, Value)) { std::string Diagnostic = "out of range " + Description; @@ -65,17 +58,13 @@ static void unsigned_width(unsigned Width, uint64_t Value, Diagnostic += " (expected an integer in the range 0 to " + std::to_string(Max) + ")"; - if (Ctx) { - Ctx->reportError(Fixup.getLoc(), Diagnostic); - } else { - llvm_unreachable(Diagnostic.c_str()); - } + Ctx->reportError(Fixup.getLoc(), Diagnostic); } } /// Adjusts the value of a branch target before fixup application. static void adjustBranch(unsigned Size, const MCFixup &Fixup, uint64_t &Value, - MCContext *Ctx = nullptr) { + MCContext *Ctx) { // We have one extra bit of precision because the value is rightshifted by // one. unsigned_width(Size + 1, Value, std::string("branch target"), Fixup, Ctx); @@ -86,13 +75,28 @@ static void adjustBranch(unsigned Size, const MCFixup &Fixup, uint64_t &Value, /// Adjusts the value of a relative branch target before fixup application. static void adjustRelativeBranch(unsigned Size, const MCFixup &Fixup, - uint64_t &Value, MCContext *Ctx = nullptr) { + uint64_t &Value, MCContext *Ctx) { // Jumps are relative to the current instruction. 
Value -= 2; // We have one extra bit of precision because the value is rightshifted by // one. - signed_width(Size + 1, Value, std::string("branch target"), Fixup, Ctx); + Size += 1; + + if (!isIntN(Size, Value) && + Ctx->getSubtargetInfo()->hasFeature(AVR::FeatureWrappingRjmp)) { + const int32_t FlashSize = 0x2000; + int32_t SignedValue = Value; + + uint64_t WrappedValue = SignedValue > 0 ? (uint64_t)(Value - FlashSize) + : (uint64_t)(FlashSize + Value); + + if (isIntN(Size, WrappedValue)) { + Value = WrappedValue; + } + } + + signed_width(Size, Value, std::string("branch target"), Fixup, Ctx); // Rightshifts the value by one. AVR::fixups::adjustBranchTarget(Value); @@ -105,7 +109,7 @@ static void adjustRelativeBranch(unsigned Size, const MCFixup &Fixup, /// /// Offset of 0 (so the result is left shifted by 3 bits before application). static void fixup_call(unsigned Size, const MCFixup &Fixup, uint64_t &Value, - MCContext *Ctx = nullptr) { + MCContext *Ctx) { adjustBranch(Size, Fixup, Value, Ctx); auto top = Value & (0xf00000 << 6); // the top four bits @@ -121,7 +125,7 @@ static void fixup_call(unsigned Size, const MCFixup &Fixup, uint64_t &Value, /// 0000 00kk kkkk k000 /// Offset of 0 (so the result is left shifted by 3 bits before application). static void fixup_7_pcrel(unsigned Size, const MCFixup &Fixup, uint64_t &Value, - MCContext *Ctx = nullptr) { + MCContext *Ctx) { adjustRelativeBranch(Size, Fixup, Value, Ctx); // Because the value may be negative, we must mask out the sign bits @@ -135,7 +139,7 @@ static void fixup_7_pcrel(unsigned Size, const MCFixup &Fixup, uint64_t &Value, /// 0000 kkkk kkkk kkkk /// Offset of 0 (so the result isn't left-shifted before application). 
static void fixup_13_pcrel(unsigned Size, const MCFixup &Fixup, uint64_t &Value, - MCContext *Ctx = nullptr) { + MCContext *Ctx) { adjustRelativeBranch(Size, Fixup, Value, Ctx); // Because the value may be negative, we must mask out the sign bits @@ -147,8 +151,7 @@ static void fixup_13_pcrel(unsigned Size, const MCFixup &Fixup, uint64_t &Value, /// /// Resolves to: /// 10q0 qq10 0000 1qqq -static void fixup_6(const MCFixup &Fixup, uint64_t &Value, - MCContext *Ctx = nullptr) { +static void fixup_6(const MCFixup &Fixup, uint64_t &Value, MCContext *Ctx) { unsigned_width(6, Value, std::string("immediate"), Fixup, Ctx); Value = ((Value & 0x20) << 8) | ((Value & 0x18) << 7) | (Value & 0x07); @@ -160,7 +163,7 @@ static void fixup_6(const MCFixup &Fixup, uint64_t &Value, /// Resolves to: /// 0000 0000 kk00 kkkk static void fixup_6_adiw(const MCFixup &Fixup, uint64_t &Value, - MCContext *Ctx = nullptr) { + MCContext *Ctx) { unsigned_width(6, Value, std::string("immediate"), Fixup, Ctx); Value = ((Value & 0x30) << 2) | (Value & 0x0f); @@ -170,8 +173,7 @@ static void fixup_6_adiw(const MCFixup &Fixup, uint64_t &Value, /// /// Resolves to: /// 0000 0000 AAAA A000 -static void fixup_port5(const MCFixup &Fixup, uint64_t &Value, - MCContext *Ctx = nullptr) { +static void fixup_port5(const MCFixup &Fixup, uint64_t &Value, MCContext *Ctx) { unsigned_width(5, Value, std::string("port number"), Fixup, Ctx); Value &= 0x1f; @@ -183,8 +185,7 @@ static void fixup_port5(const MCFixup &Fixup, uint64_t &Value, /// /// Resolves to: /// 1011 0AAd dddd AAAA -static void fixup_port6(const MCFixup &Fixup, uint64_t &Value, - MCContext *Ctx = nullptr) { +static void fixup_port6(const MCFixup &Fixup, uint64_t &Value, MCContext *Ctx) { unsigned_width(6, Value, std::string("port number"), Fixup, Ctx); Value = ((Value & 0x30) << 5) | (Value & 0x0f); @@ -195,7 +196,7 @@ static void fixup_port6(const MCFixup &Fixup, uint64_t &Value, /// Resolves to: /// 1010 ikkk dddd kkkk static void 
fixup_lds_sts_16(const MCFixup &Fixup, uint64_t &Value, - MCContext *Ctx = nullptr) { + MCContext *Ctx) { unsigned_width(7, Value, std::string("immediate"), Fixup, Ctx); Value = ((Value & 0x70) << 8) | (Value & 0x0f); } @@ -213,7 +214,7 @@ namespace ldi { /// 0000 KKKK 0000 KKKK /// Offset of 0 (so the result isn't left-shifted before application). static void fixup(unsigned Size, const MCFixup &Fixup, uint64_t &Value, - MCContext *Ctx = nullptr) { + MCContext *Ctx) { uint64_t upper = Value & 0xf0; uint64_t lower = Value & 0x0f; @@ -223,25 +224,25 @@ static void fixup(unsigned Size, const MCFixup &Fixup, uint64_t &Value, static void neg(uint64_t &Value) { Value *= -1; } static void lo8(unsigned Size, const MCFixup &Fixup, uint64_t &Value, - MCContext *Ctx = nullptr) { + MCContext *Ctx) { Value &= 0xff; ldi::fixup(Size, Fixup, Value, Ctx); } static void hi8(unsigned Size, const MCFixup &Fixup, uint64_t &Value, - MCContext *Ctx = nullptr) { + MCContext *Ctx) { Value = (Value & 0xff00) >> 8; ldi::fixup(Size, Fixup, Value, Ctx); } static void hh8(unsigned Size, const MCFixup &Fixup, uint64_t &Value, - MCContext *Ctx = nullptr) { + MCContext *Ctx) { Value = (Value & 0xff0000) >> 16; ldi::fixup(Size, Fixup, Value, Ctx); } static void ms8(unsigned Size, const MCFixup &Fixup, uint64_t &Value, - MCContext *Ctx = nullptr) { + MCContext *Ctx) { Value = (Value & 0xff000000) >> 24; ldi::fixup(Size, Fixup, Value, Ctx); } diff --git a/llvm/test/CodeGen/AVR/branch-relaxation-long-backward.ll b/llvm/test/CodeGen/AVR/branch-relaxation-long-backward.ll new file mode 100644 index 0000000000000..7c915e1dc3ef6 --- /dev/null +++ b/llvm/test/CodeGen/AVR/branch-relaxation-long-backward.ll @@ -0,0 +1,2081 @@ +; RUN: llc < %s -mtriple=avr -mcpu=attiny85 -filetype=obj -o - | llvm-objdump --mcpu=attiny85 -dr --no-show-raw-insn --no-leading-addr - | FileCheck --check-prefix=ATTINY85 %s +; RUN: not llc < %s -mtriple=avr -mcpu=avr25 -filetype=obj -o - 2>&1 | FileCheck --check-prefix=AVR25 %s +; 
RUN: llc < %s -mtriple=avr -mcpu=avr3 -filetype=obj -o - | llvm-objdump --mcpu=avr3 -dr --no-show-raw-insn --no-leading-addr - | FileCheck --check-prefix=AVR3 %s + +; ATTINY85:
: +; ATTINY85-NEXT: andi r24, 0x1 +; ATTINY85: cpi r24, 0x0 +; ATTINY85-NEXT: breq .+2 +; ATTINY85-NEXT: rjmp .+4086 +; ATTINY85: ldi r24, 0x3 +; ATTINY85-NEXT: ret + +; AVR25: error: out of range branch target (expected an integer in the range -4096 to 4095) + +; AVR3:
: +; AVR3-NEXT: andi r24, 0x1 +; AVR3: cpi r24, 0x0 +; AVR3-NEXT: breq .+4 +; AVR3-NEXT: jmp 0x0 +; AVR3-NEXT: R_AVR_CALL .text+0x2 +; AVR3: ldi r24, 0x3 +; AVR3-NEXT: ret + +define i8 @main(i1 %a) { +entry-block: + br label %hello +hello: + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + 
call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void 
asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm 
sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect 
"nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", 
""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + 
call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void 
asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm 
sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect 
"nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", 
""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + 
call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void 
asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm 
sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect 
"nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", 
""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + 
call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void 
asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm 
sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect 
"nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", 
""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + 
call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void 
asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm 
sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect 
"nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", 
""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + 
call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void 
asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm 
sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect 
"nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", 
""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + 
call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void 
asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm 
sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect 
"nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", 
""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + 
call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void 
asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm 
sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect 
"nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", 
""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + br i1 %a, label %hello, label %finished +finished: + ret i8 3 +} diff --git a/llvm/test/CodeGen/AVR/branch-relaxation-long-forward.ll b/llvm/test/CodeGen/AVR/branch-relaxation-long-forward.ll new file mode 100644 index 0000000000000..24ddb36c68839 --- /dev/null +++ b/llvm/test/CodeGen/AVR/branch-relaxation-long-forward.ll @@ -0,0 +1,2081 @@ +; RUN: llc < %s -mtriple=avr -mcpu=attiny85 -filetype=obj -o - | llvm-objdump --mcpu=attiny85 -dr --no-show-raw-insn --no-leading-addr - | FileCheck --check-prefix=ATTINY85 %s +; RUN: not llc < %s -mtriple=avr -mcpu=avr25 -filetype=obj -o - 2>&1 | FileCheck --check-prefix=AVR25 %s +; RUN: llc < %s -mtriple=avr -mcpu=avr3 -filetype=obj -o - | llvm-objdump --mcpu=avr3 -dr --no-show-raw-insn --no-leading-addr - | FileCheck --check-prefix=AVR3 %s + +; ATTINY85:
: +; ATTINY85-NEXT: andi r24, 0x1 +; ATTINY85-NEXT: cpi r24, 0x0 +; ATTINY85-NEXT: brne .+2 +; ATTINY85-NEXT: rjmp .-4092 +; ATTINY85: ldi r24, 0x3 +; ATTINY85-NEXT: ret + +; AVR25: error: out of range branch target (expected an integer in the range -4096 to 4095) + +; AVR3:
: +; AVR3-NEXT: andi r24, 0x1 +; AVR3-NEXT: cpi r24, 0x0 +; AVR3-NEXT: brne .+4 +; AVR3-NEXT: jmp 0x0 +; AVR3-NEXT: R_AVR_CALL .text+0x100e +; AVR3: ldi r24, 0x3 +; AVR3-NEXT: ret + +define i8 @main(i1 %a) { +entry-block: + br i1 %a, label %hello, label %finished +hello: + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void 
asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm 
sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect 
"nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", 
""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + 
call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void 
asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm 
sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect 
"nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", 
""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + 
call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void 
asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm 
sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect 
"nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", 
""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + 
call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void 
asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm 
sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect 
"nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", 
""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + 
call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void 
asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm 
sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect 
"nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", 
""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + 
call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void 
asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm 
sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect 
"nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", 
""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + 
call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void 
asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm 
sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect 
"nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", 
""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + 
call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void 
asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm 
sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect 
"nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", 
""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + 
call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + call void asm sideeffect "nop", ""() + br label %finished +finished: + ret i8 3 +} diff --git a/llvm/test/CodeGen/AVR/branch-relaxation-long.ll b/llvm/test/CodeGen/AVR/branch-relaxation-long.ll deleted file mode 100644 index cd7a8046152e9..0000000000000 --- a/llvm/test/CodeGen/AVR/branch-relaxation-long.ll +++ /dev/null @@ -1,4162 +0,0 @@ -; RUN: llc < %s -mtriple=avr -mattr=avr3 | FileCheck %s -; RUN: llc < %s -mtriple=avr -mattr=avr2 | FileCheck --check-prefix=AVR2 %s - -; CHECK-LABEL: relax_to_jmp: -; CHECK: cpi r{{[0-9]+}}, 0 -; CHECK: brne [[BB1:.LBB[0-9]+_[0-9]+]] -; CHECK: jmp [[BB2:.LBB[0-9]+_[0-9]+]] -; CHECK: [[BB1]]: -; CHECK: nop -; CHECK: [[BB2]]: - -;; A `RJMP` is generated instead of expected `JMP` for AVR2, -;; and it is up to the linker to report 'out of range' or -;; 'exceed flash maximum size'. -; AVR2-LABEL: relax_to_jmp: -; AVR2: cpi r{{[0-9]+}}, 0 -; AVR2: brne [[BB1:.LBB[0-9]+_[0-9]+]] -; AVR2: rjmp [[BB2:.LBB[0-9]+_[0-9]+]] -; AVR2: [[BB1]]: -; AVR2: nop -; AVR2: [[BB2]]: - -define i8 @relax_to_jmp(i1 %a) { -entry-block: - br i1 %a, label %hello, label %finished -hello: - ; with >4 kB of instructions (2050 NOPs), this requires a long jump (jmp), - ; versus a relative one (rjmp). 
- call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call 
void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm 
sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect 
"nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", 
""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - 
call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void 
asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm 
sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect 
"nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", 
""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - 
call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void 
asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm 
sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect 
"nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", 
""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - 
call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void 
asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm 
sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect 
"nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", 
""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - 
call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void 
asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm 
sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect 
"nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", 
""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - 
call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void 
asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm 
sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect 
"nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", 
""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - 
call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void 
asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm 
sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect 
"nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", 
""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - 
call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void 
asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm 
sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect 
"nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", 
""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - 
call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - br label %finished -finished: - ret i8 3 -} - -; CHECK-LABEL: relax_to_jmp_backwards: -; CHECK: [[BB1:.LBB[0-9]+_[0-9]+]] -; CHECK: nop -; CHECK: cpi r{{[0-9]+}}, 0 -; CHECK: breq [[BB2:.LBB[0-9]+_[0-9]+]] -; CHECK: jmp [[BB1]] -; CHECK: [[BB2]]: - -;; A `RJMP` is generated instead of expected `JMP` for AVR2, -;; and it is up to the linker to report 'out of range' or -;; 'exceed flash maximum size'. -; AVR2-LABEL: relax_to_jmp_backwards: -; AVR2: [[BB1:.LBB[0-9]+_[0-9]+]] -; AVR2: nop -; AVR2: cpi r{{[0-9]+}}, 0 -; AVR2: breq [[BB2:.LBB[0-9]+_[0-9]+]] -; AVR2: rjmp [[BB1]] -; AVR2: [[BB2]]: - -define i8 @relax_to_jmp_backwards(i1 %a) { -entry-block: - br label %hello -hello: - ; with >4 kB of instructions (2050 NOPs), this requires a long jump (jmp), - ; versus a relative one (rjmp). - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", 
""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - 
call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void 
asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm 
sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect 
"nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", 
""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - 
call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void 
asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm 
sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect 
"nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", 
""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - 
call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void 
asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm 
sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect 
"nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", 
""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - 
call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void 
asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm 
sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect 
"nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", 
""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - 
call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void 
asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm 
sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect 
"nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", 
""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - 
call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void 
asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm 
sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect 
"nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", 
""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - 
call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void 
asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm 
sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect 
"nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", 
""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - 
call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void 
asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm 
sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect 
"nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - call void asm sideeffect "nop", ""() - br i1 %a, label %hello, label %finished -finished: - ret i8 3 -} From f51db95e064c97860910d1ca17a8c29eb23d8623 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 26 Dec 2024 19:49:45 -0800 Subject: [PATCH 100/567] [NFC][Driver] Use global --implicit-check-not=libclang_rt (#121081) To simplify and improve precision of the test. --- clang/test/Driver/sanitizer-ld.c | 169 +++++++++++++++++++------------ 1 file changed, 106 insertions(+), 63 deletions(-) diff --git a/clang/test/Driver/sanitizer-ld.c b/clang/test/Driver/sanitizer-ld.c index 9ae1a46de3f89..6b57fb144f421 100644 --- a/clang/test/Driver/sanitizer-ld.c +++ b/clang/test/Driver/sanitizer-ld.c @@ -1,6 +1,6 @@ // Test sanitizers ld flags. 
-// DEFINE: %{filecheck} = FileCheck %s +// DEFINE: %{filecheck} = FileCheck %s --implicit-check-not="libclang_rt" // RUN: %clang -### %s 2>&1 \ // RUN: --target=i386-unknown-linux -fuse-ld=ld -fsanitize=address \ @@ -10,7 +10,8 @@ // // CHECK-ASAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-ASAN-LINUX-NOT: "-lc" -// CHECK-ASAN-LINUX: libclang_rt.asan.a" +// CHECK-ASAN-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan_static.a" "--no-whole-archive" +// CHECK-ASAN-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" // CHECK-ASAN-LINUX-NOT: "--export-dynamic" // CHECK-ASAN-LINUX: "--dynamic-list={{.*}}libclang_rt.asan.a.syms" // CHECK-ASAN-LINUX-NOT: "--export-dynamic" @@ -25,8 +26,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-NO-LINK-RUNTIME-LINUX // -// CHECK-ASAN-NO-LINK-RUNTIME-LINUX-NOT: libclang_rt.asan_static-x86_64 -// CHECK-ASAN-NO-LINK-RUNTIME-LINUX-NOT: libclang_rt.asan-x86_64 +// CHECK-ASAN-NO-LINK-RUNTIME-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // RUN: %clang -fsanitize=address -fno-sanitize-link-runtime -### %s 2>&1 \ // RUN: --target=arm64e-apple-macosx -fuse-ld=ld \ @@ -34,8 +34,8 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-NO-LINK-RUNTIME-DARWIN // -// CHECK-ASAN-NO-LINK-RUNTIME-DARWIN-NOT: libclang_rt.asan_static -// CHECK-ASAN-NO-LINK-RUNTIME-DARWIN-NOT: libclang_rt.asan +// CHECK-ASAN-NO-LINK-RUNTIME-DARWIN: "{{.*}}ld" +// CHECK-ASAN-NO-LINK-RUNTIME-DARWIN: libclang_rt.osx.a" // RUN: %clang -fsanitize=address -### %s 2>&1 \ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ @@ -43,8 +43,9 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-EXECUTABLE-LINUX // -// CHECK-ASAN-EXECUTABLE-LINUX: libclang_rt.asan_static -// CHECK-ASAN-EXECUTABLE-LINUX: libclang_rt.asan +// CHECK-ASAN-EXECUTABLE-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan_static.a" "--no-whole-archive" +// 
CHECK-ASAN-EXECUTABLE-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" +// CHECK-ASAN-EXECUTABLE-LINUX: "--dynamic-list={{.*}}libclang_rt.asan.a.syms" // RUN: %clang -fsanitize=address -shared -### %s 2>&1 \ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ @@ -53,7 +54,6 @@ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-SHARED-LINUX // // CHECK-ASAN-SHARED-LINUX: libclang_rt.asan_static -// CHECK-ASAN-SHARED-LINUX-NOT: libclang_rt.asan // RUN: %clang -### %s 2>&1 \ // RUN: --target=i386-unknown-linux -fuse-ld=ld -fsanitize=address -shared-libsan \ @@ -76,9 +76,9 @@ // // CHECK-SHARED-ASAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-SHARED-ASAN-LINUX-NOT: "-lc" -// CHECK-SHARED-ASAN-LINUX-NOT: libclang_rt.asan.a" // CHECK-SHARED-ASAN-LINUX: libclang_rt.asan.so" // CHECK-SHARED-ASAN-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan-preinit.a" "--no-whole-archive" +// CHECK-SHARED-ASAN-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan_static.a" "--no-whole-archive" // CHECK-SHARED-ASAN-LINUX-NOT: "-lpthread" // CHECK-SHARED-ASAN-LINUX-NOT: "-lrt" // CHECK-SHARED-ASAN-LINUX-NOT: "-ldl" @@ -94,9 +94,8 @@ // // CHECK-DSO-SHARED-ASAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-DSO-SHARED-ASAN-LINUX-NOT: "-lc" -// CHECK-DSO-SHARED-ASAN-LINUX-NOT: libclang_rt.asan.a" -// CHECK-DSO-SHARED-ASAN-LINUX-NOT: "libclang_rt.asan-preinit.a" // CHECK-DSO-SHARED-ASAN-LINUX: libclang_rt.asan.so" +// CHECK-DSO-SHARED-ASAN-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan_static.a" "--no-whole-archive" // CHECK-DSO-SHARED-ASAN-LINUX-NOT: "-lpthread" // CHECK-DSO-SHARED-ASAN-LINUX-NOT: "-lrt" // CHECK-DSO-SHARED-ASAN-LINUX-NOT: "-ldl" @@ -112,9 +111,8 @@ // // CHECK-ASAN-FREEBSD: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-ASAN-FREEBSD-NOT: "-lc" -// CHECK-ASAN-FREEBSD-NOT: libclang_rt.asan_cxx +// CHECK-ASAN-FREEBSD: freebsd{{/|\\+}}libclang_rt.asan_static.a" // CHECK-ASAN-FREEBSD: freebsd{{/|\\+}}libclang_rt.asan.a" -// CHECK-ASAN-FREEBSD-NOT: libclang_rt.asan_cxx 
// CHECK-ASAN-FREEBSD-NOT: "--dynamic-list" // CHECK-ASAN-FREEBSD: "--export-dynamic" // CHECK-ASAN-FREEBSD: "-lpthread" @@ -128,6 +126,8 @@ // // CHECK-ASAN-FREEBSD-LDL: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-ASAN-FREEBSD-LDL-NOT: "-ldl" +// CHECK-ASAN-FREEBSD-LDL: "--whole-archive" "{{.*}}libclang_rt.asan_static.a" "--no-whole-archive" +// CHECK-ASAN-FREEBSD-LDL: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" // RUN: %clangxx -### %s 2>&1 \ // RUN: --target=i386-unknown-linux -fuse-ld=ld -stdlib=platform -fsanitize=address \ @@ -163,7 +163,6 @@ // CHECK-ASAN-LINUX-CNOCXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-ASAN-LINUX-CNOCXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" -// CHECK-ASAN-LINUX-CNOCXX-NOT: libclang_rt.asan_cxx // CHECK-ASAN-LINUX-CNOCXX-SAME: "--export-dynamic" // CHECK-ASAN-LINUX-CNOCXX-NOT: stdc++ // CHECK-ASAN-LINUX-CNOCXX-SAME: "-lpthread" @@ -181,7 +180,6 @@ // CHECK-ASAN-LINUX-NOCXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-ASAN-LINUX-NOCXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" -// CHECK-ASAN-LINUX-NOCXX-NOT: libclang_rt.asan_cxx // CHECK-ASAN-LINUX-NOCXX-SAME: "--export-dynamic" // CHECK-ASAN-LINUX-NOCXX-SAME: "-lstdc++" // CHECK-ASAN-LINUX-NOCXX-SAME: "-lpthread" @@ -199,7 +197,7 @@ // CHECK-ASAN-LINUX-NOSTDCXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-ASAN-LINUX-NOSTDCXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" -// CHECK-ASAN-LINUX-NOSTDCXX-SAME: libclang_rt.asan_cxx +// CHECK-ASAN-LINUX-NOSTDCXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan_cxx.a" "--no-whole-archive" // CHECK-ASAN-LINUX-NOSTDCXX-SAME: "--export-dynamic" // CHECK-ASAN-LINUX-NOSTDCXX-SAME: "-lpthread" // CHECK-ASAN-LINUX-NOSTDCXX-SAME: "-lrt" @@ -216,6 +214,7 @@ // CHECK-ASAN-LINUX-CXX-STATIC: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-ASAN-LINUX-CXX-STATIC-NOT: stdc++ // CHECK-ASAN-LINUX-CXX-STATIC: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" +// 
CHECK-ASAN-LINUX-CXX-STATIC: "--dynamic-list={{.*}}libclang_rt.asan.a.syms" // CHECK-ASAN-LINUX-CXX-STATIC: stdc++ // RUN: %clang -### %s 2>&1 \ @@ -225,6 +224,7 @@ // // CHECK-ASAN-ARM: "{{(.*[^.0-9A-Z_a-z])?}}ld" // CHECK-ASAN-ARM-NOT: "-lc" +// CHECK-ASAN-ARM: libclang_rt.asan_static.a" // CHECK-ASAN-ARM: libclang_rt.asan.a" // // RUN: %clang -### %s 2>&1 \ @@ -234,6 +234,7 @@ // // CHECK-ASAN-ARMv7: "{{(.*[^.0-9A-Z_a-z])?}}ld" // CHECK-ASAN-ARMv7-NOT: "-lc" +// CHECK-ASAN-ARMv7: libclang_rt.asan_static.a" // CHECK-ASAN-ARMv7: libclang_rt.asan.a" // RUN: %clang -### %s 2>&1 \ @@ -248,6 +249,9 @@ // CHECK-ASAN-ANDROID-NOT: "-lpthread" // CHECK-ASAN-ANDROID-NOT: "-lresolv" // CHECK-ASAN-ANDROID: libclang_rt.asan.so" +// CHECK-ASAN-ANDROID: libclang_rt.asan_static.a" +// CHECK-ASAN-ANDROID: libclang_rt.builtins.a +// CHECK-ASAN-ANDROID: libclang_rt.builtins.a // CHECK-ASAN-ANDROID-NOT: "-lpthread" // CHECK-ASAN-ANDROID-NOT: "-lresolv" @@ -266,7 +270,10 @@ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-ANDROID-STATICLIBASAN // // CHECK-ASAN-ANDROID-STATICLIBASAN: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" +// CHECK-ASAN-ANDROID-STATICLIBASAN: libclang_rt.asan_static.a" // CHECK-ASAN-ANDROID-STATICLIBASAN: libclang_rt.asan.a" +// CHECK-ASAN-ANDROID-STATICLIBASAN: libclang_rt.builtins.a" +// CHECK-ASAN-ANDROID-STATICLIBASAN: libclang_rt.builtins.a" // CHECK-ASAN-ANDROID-STATICLIBASAN-NOT: "-lpthread" // CHECK-ASAN-ANDROID-STATICLIBASAN-NOT: "-lrt" // CHECK-ASAN-ANDROID-STATICLIBASAN-NOT: "-lresolv" @@ -283,6 +290,8 @@ // CHECK-UBSAN-ANDROID-NOT: "-lpthread" // CHECK-UBSAN-ANDROID-NOT: "-lresolv" // CHECK-UBSAN-ANDROID: libclang_rt.ubsan_standalone.so" +// CHECK-UBSAN-ANDROID: libclang_rt.builtins.a" +// CHECK-UBSAN-ANDROID: libclang_rt.builtins.a" // CHECK-UBSAN-ANDROID-NOT: "-lpthread" // CHECK-UBSAN-ANDROID-NOT: "-lresolv" @@ -295,6 +304,8 @@ // // CHECK-UBSAN-ANDROID-STATICLIBASAN: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" // CHECK-UBSAN-ANDROID-STATICLIBASAN: 
libclang_rt.ubsan_standalone.a" +// CHECK-UBSAN-ANDROID-STATICLIBASAN: libclang_rt.builtins.a" +// CHECK-UBSAN-ANDROID-STATICLIBASAN: libclang_rt.builtins.a" // CHECK-UBSAN-ANDROID-STATICLIBASAN-NOT: "-lpthread" // CHECK-UBSAN-ANDROID-STATICLIBASAN-NOT: "-lrt" // CHECK-UBSAN-ANDROID-STATICLIBASAN-NOT: "-lresolv" @@ -312,6 +323,9 @@ // CHECK-ASAN-ANDROID-X86-NOT: "-lpthread" // CHECK-ASAN-ANDROID-X86-NOT: "-lresolv" // CHECK-ASAN-ANDROID-X86: libclang_rt.asan.so" +// CHECK-ASAN-ANDROID-X86: libclang_rt.asan_static.a" +// CHECK-ASAN-ANDROID-X86: libclang_rt.builtins.a" +// CHECK-ASAN-ANDROID-X86: libclang_rt.builtins.a" // CHECK-ASAN-ANDROID-X86-NOT: "-lpthread" // CHECK-ASAN-ANDROID-X86-NOT: "-lresolv" // @@ -322,6 +336,10 @@ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-ANDROID-SHARED-LIBASAN // // CHECK-ASAN-ANDROID-SHARED-LIBASAN-NOT: argument unused during compilation: '-shared-libsan' +// CHECK-ASAN-ANDROID-SHARED-LIBASAN: libclang_rt.asan.so" +// CHECK-ASAN-ANDROID-SHARED-LIBASAN: libclang_rt.asan_static.a" +// CHECK-ASAN-ANDROID-SHARED-LIBASAN: libclang_rt.builtins.a" +// CHECK-ASAN-ANDROID-SHARED-LIBASAN: libclang_rt.builtins.a" // // RUN: %clang -### %s 2>&1 \ // RUN: --target=arm-linux-androideabi -fuse-ld=ld -fsanitize=address \ @@ -333,6 +351,9 @@ // CHECK-ASAN-ANDROID-SHARED: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" // CHECK-ASAN-ANDROID-SHARED-NOT: "-lc" // CHECK-ASAN-ANDROID-SHARED: libclang_rt.asan.so" +// CHECK-ASAN-ANDROID-SHARED: libclang_rt.asan_static.a" +// CHECK-ASAN-ANDROID-SHARED: libclang_rt.builtins.a" +// CHECK-ASAN-ANDROID-SHARED: libclang_rt.builtins.a" // CHECK-ASAN-ANDROID-SHARED-NOT: "-lpthread" // CHECK-ASAN-ANDROID-SHARED-NOT: "-lresolv" @@ -357,6 +378,7 @@ // RUN: | %{filecheck} --check-prefix=CHECK-TYSAN-DARWIN-CXX // CHECK-TYSAN-DARWIN-CXX: "{{.*}}ld" // CHECK-TYSAN-DARWIN-CXX: libclang_rt.tysan_osx_dynamic.dylib +// CHECK-TYSAN-DARWIN-CXX: libclang_rt.osx.a // CHECK-TYSAN-DARWIN-CXX-NOT: -lc++abi // RUN: %clangxx -### %s 
2>&1 \ @@ -385,7 +407,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-TSAN-NO-LINK-RUNTIME-LINUX // -// CHECK-TSAN-NO-LINK-RUNTIME-LINUX-NOT: libclang_rt.tsan +// CHECK-TSAN-NO-LINK-RUNTIME-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // RUN: not %clang -fsanitize=thread -fno-sanitize-link-runtime -### %s 2>&1 \ // RUN: --target=arm64e-apple-ios -fuse-ld=ld \ @@ -393,7 +415,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-TSAN-NO-LINK-RUNTIME-DARWIN // -// CHECK-TSAN-NO-LINK-RUNTIME-DARWIN-NOT: libclang_rt.tsan +// CHECK-TSAN-NO-LINK-RUNTIME-DARWIN: libclang_rt.ios.a // RUN: %clangxx -### %s 2>&1 \ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld -stdlib=platform -lstdc++ \ @@ -421,7 +443,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-MSAN-NO-LINK-RUNTIME-LINUX // -// CHECK-MSAN-NO-LINK-RUNTIME-LINUX-NOT: libclang_rt.msan +// CHECK-MSAN-NO-LINK-RUNTIME-LINUX: "{{.*}}ld" // RUN: %clang -fsanitize=undefined -### %s 2>&1 \ // RUN: --target=x86_64-unknown-linux-gnux32 -fuse-ld=ld \ @@ -443,11 +465,7 @@ // RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-LINUX // CHECK-UBSAN-LINUX: "{{.*}}ld" -// CHECK-UBSAN-LINUX-NOT: libclang_rt.asan -// CHECK-UBSAN-LINUX-NOT: libclang_rt.ubsan_standalone_cxx // CHECK-UBSAN-LINUX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive" -// CHECK-UBSAN-LINUX-NOT: libclang_rt.asan -// CHECK-UBSAN-LINUX-NOT: libclang_rt.ubsan_standalone_cxx // CHECK-UBSAN-LINUX-NOT: "-lstdc++" // CHECK-UBSAN-LINUX: "-lpthread" // CHECK-UBSAN-LINUX: "-lresolv" @@ -458,7 +476,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-NO-LINK-RUNTIME-LINUX // -// CHECK-UBSAN-NO-LINK-RUNTIME-LINUX-NOT: libclang_rt.undefined +// CHECK-UBSAN-NO-LINK-RUNTIME-LINUX: "{{.*}}ld" // RUN: %clang -fsanitize=undefined -fno-sanitize-link-runtime -### %s 2>&1 \ // RUN: 
--target=x86_64-apple-darwin -fuse-ld=ld \ @@ -466,7 +484,8 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-NO-LINK-RUNTIME-DARWIN // -// CHECK-UBSAN-NO-LINK-RUNTIME-DARWIN-NOT: libclang_rt.ubsan +// CHECK-UBSAN-NO-LINK-RUNTIME-DARWIN: "{{.*}}ld" +// CHECK-UBSAN-NO-LINK-RUNTIME-DARWIN: libclang_rt.osx.a // RUN: %clang -fsanitize=fuzzer -fno-sanitize-link-runtime -### %s 2>&1 \ // RUN: --target=arm64e-apple-watchos -fuse-ld=ld \ @@ -474,7 +493,8 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-FUZZER-NO-LINK-RUNTIME-DARWIN // -// CHECK-FUZZER-NO-LINK-RUNTIME-DARWIN-NOT: libclang_rt.fuzzer +// CHECK-FUZZER-NO-LINK-RUNTIME-DARWIN: "{{.*}}ld" +// CHECK-FUZZER-NO-LINK-RUNTIME-DARWIN: libclang_rt.watchos.a // RUN: %clang -fsanitize=undefined -### %s 2>&1 \ // RUN: --target=i386-unknown-linux -fuse-ld=ld \ @@ -515,13 +535,9 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-LINUX-CXX // CHECK-UBSAN-LINUX-CXX: "{{.*}}ld" -// CHECK-UBSAN-LINUX-CXX-NOT: libclang_rt.asan // CHECK-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive" -// CHECK-UBSAN-LINUX-CXX-NOT: libclang_rt.asan // CHECK-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone_cxx.a" "--no-whole-archive" -// CHECK-UBSAN-LINUX-CXX-NOT: libclang_rt.asan // CHECK-UBSAN-LINUX-CXX: "-lstdc++" -// CHECK-UBSAN-LINUX-CXX-NOT: libclang_rt.asan // CHECK-UBSAN-LINUX-CXX: "-lpthread" // CHECK-UBSAN-LINUX-CXX: "-lresolv" @@ -566,8 +582,9 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-UBSAN-LINUX // CHECK-ASAN-UBSAN-LINUX: "{{.*}}ld" +// CHECK-ASAN-UBSAN-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan_static.a" "--no-whole-archive" // CHECK-ASAN-UBSAN-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" -// CHECK-ASAN-UBSAN-LINUX-NOT: libclang_rt.ubsan 
+// CHECK-ASAN-UBSAN-LINUX: "--dynamic-list={{.*}}libclang_rt.asan.a.syms" // CHECK-ASAN-UBSAN-LINUX-NOT: "-lstdc++" // CHECK-ASAN-UBSAN-LINUX: "-lpthread" // CHECK-ASAN-UBSAN-LINUX: "-lresolv" @@ -578,11 +595,11 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-UBSAN-LINUX-CXX // CHECK-ASAN-UBSAN-LINUX-CXX: "{{.*}}ld" +// CHECK-ASAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.asan_static.a" "--no-whole-archive" // CHECK-ASAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" +// CHECK-ASAN-UBSAN-LINUX-CXX: "--dynamic-list={{.*}}libclang_rt.asan.a.syms" // CHECK-ASAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.asan_cxx.a" "--no-whole-archive" -// CHECK-ASAN-UBSAN-LINUX-CXX-NOT: libclang_rt.ubsan -// CHECK-ASAN-UBSAN-LINUX-CXX: libclang_rt.ubsan_standalone_cxx -// CHECK-ASAN-UBSAN-LINUX-CXX-NOT: libclang_rt.ubsan +// CHECK-ASAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone_cxx.a" "--no-whole-archive" // CHECK-ASAN-UBSAN-LINUX-CXX: "-lstdc++" // CHECK-ASAN-UBSAN-LINUX-CXX: "-lpthread" // CHECK-ASAN-UBSAN-LINUX-CXX: "-lresolv" @@ -593,9 +610,10 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX // CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX: "{{.*}}ld" +// CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan_static.a" "--no-whole-archive" // CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" +// CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX-SAME: "--dynamic-list={{.*}}libclang_rt.asan.a.syms" // CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan_cxx.a" "--no-whole-archive" -// CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX-NOT: libclang_rt.ubsan // CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX-SAME: "-lstdc++" // CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX-SAME: "-lpthread" // CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX-SAME: 
"-lresolv" @@ -607,9 +625,10 @@ // RUN: | %{filecheck} --check-prefix=CHECK-MSAN-UBSAN-LINUX-CXX // CHECK-MSAN-UBSAN-LINUX-CXX: "{{.*}}ld" // CHECK-MSAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.msan.a" "--no-whole-archive" -// CHECK-MSAN-UBSAN-LINUX-CXX-NOT: libclang_rt.ubsan -// CHECK-MSAN-UBSAN-LINUX-CXX: libclang_rt.ubsan_standalone_cxx -// CHECK-MSAN-UBSAN-LINUX-CXX-NOT: libclang_rt.ubsan +// CHECK-MSAN-UBSAN-LINUX-CXX: "--dynamic-list={{.*}}libclang_rt.msan.a.syms" +// CHECK-MSAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.msan_cxx.a" "--no-whole-archive" +// CHECK-MSAN-UBSAN-LINUX-CXX: "--dynamic-list={{.*}}libclang_rt.msan_cxx.a.syms" +// CHECK-MSAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone_cxx.a" "--no-whole-archive" // RUN: %clangxx -fsanitize=thread,undefined -### %s 2>&1 \ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ @@ -618,9 +637,10 @@ // RUN: | %{filecheck} --check-prefix=CHECK-TSAN-UBSAN-LINUX-CXX // CHECK-TSAN-UBSAN-LINUX-CXX: "{{.*}}ld" // CHECK-TSAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.tsan.a" "--no-whole-archive" -// CHECK-TSAN-UBSAN-LINUX-CXX-NOT: libclang_rt.ubsan -// CHECK-TSAN-UBSAN-LINUX-CXX: libclang_rt.ubsan_standalone_cxx -// CHECK-TSAN-UBSAN-LINUX-CXX-NOT: libclang_rt.ubsan +// CHECK-TSAN-UBSAN-LINUX-CXX: "--dynamic-list={{.*}}libclang_rt.tsan.a.syms" +// CHECK-TSAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.tsan_cxx.a" "--no-whole-archive" +// CHECK-TSAN-UBSAN-LINUX-CXX: "--dynamic-list={{.*}}libclang_rt.tsan_cxx.a.syms" +// CHECK-TSAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone_cxx.a" "--no-whole-archive" // RUN: %clang -fsanitize=undefined -### %s 2>&1 \ // RUN: --target=i386-unknown-linux -fuse-ld=ld \ @@ -631,7 +651,6 @@ // CHECK-UBSAN-LINUX-SHARED: "{{.*}}ld" // CHECK-UBSAN-LINUX-SHARED-NOT: --export-dynamic // CHECK-UBSAN-LINUX-SHARED-NOT: --dynamic-list -// CHECK-UBSAN-LINUX-SHARED-NOT: libclang_rt.ubsan // RUN: 
%clang -### %s 2>&1 \ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld -fsanitize=leak \ @@ -641,7 +660,6 @@ // // CHECK-LSAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-LSAN-LINUX-NOT: "-lc" -// CHECK-LSAN-LINUX-NOT: libclang_rt.ubsan // CHECK-LSAN-LINUX: libclang_rt.lsan.a" // CHECK-LSAN-LINUX: "-lpthread" // CHECK-LSAN-LINUX: "-ldl" @@ -653,7 +671,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-LSAN-NO-LINK-RUNTIME-LINUX // -// CHECK-LSAN-NO-LINK-RUNTIME-LINUX-NOT: libclang_rt.lsan +// CHECK-LSAN-NO-LINK-RUNTIME-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // RUN: %clang -### %s 2>&1 \ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld -fsanitize=leak -fsanitize-coverage=func \ @@ -663,9 +681,8 @@ // // CHECK-LSAN-COV-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-LSAN-COV-LINUX-NOT: "-lc" -// CHECK-LSAN-COV-LINUX-NOT: libclang_rt.ubsan +// CHECK-LSAN-COV-LINUX: libclang_rt.lsan.a // CHECK-LSAV-COV-LINUX: libclang_rt.lsan-x86_64.a" -// CHECK-LSAN-COV-LINUX-NOT: libclang_rt.ubsan // CHECK-LSAN-COV-LINUX: "-lpthread" // CHECK-LSAN-COV-LINUX: "-ldl" // CHECK-LSAN-COV-LINUX: "-lresolv" @@ -676,9 +693,9 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-LSAN-ASAN-LINUX // CHECK-LSAN-ASAN-LINUX: "{{.*}}ld" -// CHECK-LSAN-ASAN-LINUX-NOT: libclang_rt.lsan +// CHECK-LSAN-ASAN-LINUX: libclang_rt.asan_static // CHECK-LSAN-ASAN-LINUX: libclang_rt.asan -// CHECK-LSAN-ASAN-LINUX-NOT: libclang_rt.lsan +// CHECK-LSAN-ASAN-LINUX: "--dynamic-list={{.*}}libclang_rt.asan.a.syms" // RUN: %clang -fsanitize=address -fsanitize-coverage=func -### %s 2>&1 \ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ @@ -686,8 +703,9 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-COV-LINUX // CHECK-ASAN-COV-LINUX: "{{.*}}ld" -// CHECK-ASAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" -// CHECK-ASAN-COV-LINUX-NOT: libclang_rt.ubsan +// 
CHECK-ASAN-COV-LINUX: libclang_rt.asan_static +// CHECK-ASAN-COV-LINUX: libclang_rt.asan +// CHECK-ASAN-COV-LINUX: "--dynamic-list={{.*}}libclang_rt.asan.a.syms" // CHECK-ASAN-COV-LINUX-NOT: "-lstdc++" // CHECK-ASAN-COV-LINUX: "-lpthread" // CHECK-ASAN-COV-LINUX: "-lresolv" @@ -699,7 +717,7 @@ // RUN: | %{filecheck} --check-prefix=CHECK-MSAN-COV-LINUX // CHECK-MSAN-COV-LINUX: "{{.*}}ld" // CHECK-MSAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.msan.a" "--no-whole-archive" -// CHECK-MSAN-COV-LINUX-NOT: libclang_rt.ubsan +// CHECK-MSAN-COV-LINUX: "--dynamic-list={{.*}}libclang_rt.msan.a.syms" // CHECK-MSAN-COV-LINUX-NOT: "-lstdc++" // CHECK-MSAN-COV-LINUX: "-lpthread" // CHECK-MSAN-COV-LINUX: "-lresolv" @@ -711,7 +729,6 @@ // RUN: | %{filecheck} --check-prefix=CHECK-DFSAN-COV-LINUX // CHECK-DFSAN-COV-LINUX: "{{.*}}ld" // CHECK-DFSAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.dfsan.a" "--no-whole-archive" -// CHECK-DFSAN-COV-LINUX-NOT: libclang_rt.ubsan // CHECK-DFSAN-COV-LINUX-NOT: "-lstdc++" // CHECK-DFSAN-COV-LINUX: "-lpthread" // CHECK-DFSAN-COV-LINUX: "-lresolv" @@ -746,7 +763,6 @@ // // CHECK-NSAN-LINUX: "{{.*}}ld" // CHECK-NSAN-LINUX-NOT: "-lc" -// CHECK-NSAN-LINUX-NOT: libclang_rt.ubsan // CHECK-NSAN-LINUX: libclang_rt.nsan.a" // CHECK-NSAN-LINUX: "-lpthread" "-lrt" "-lm" "-ldl" "-lresolv" @@ -766,7 +782,6 @@ // RUN: | %{filecheck} --check-prefix=CHECK-NSAN-UBSAN // CHECK-NSAN-UBSAN: "--whole-archive" "{{[^"]*}}libclang_rt.nsan.a" "--no-whole-archive" -// CHECK-NSAN-UBSAN-NOT: libclang_rt.ubsan // CFI by itself does not link runtime libraries. // RUN: not %clang -fsanitize=cfi -### %s 2>&1 \ @@ -775,7 +790,6 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-CFI-LINUX // CHECK-CFI-LINUX: "{{.*}}ld" -// CHECK-CFI-LINUX-NOT: libclang_rt. // CFI with diagnostics links the UBSan runtime. 
// RUN: not %clang -fsanitize=cfi -fno-sanitize-trap=cfi -fsanitize-recover=cfi \ @@ -815,7 +829,8 @@ // RUN: --sysroot=%S/Inputs/basic_android_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-CFI-CROSS-DSO-ANDROID // CHECK-CFI-CROSS-DSO-ANDROID: "{{.*}}ld{{(.exe)?}}" -// CHECK-CFI-CROSS-DSO-ANDROID-NOT: libclang_rt.cfi +// CHECK-CFI-CROSS-DSO-ANDROID: libclang_rt.builtins.a +// CHECK-CFI-CROSS-DSO-ANDROID: libclang_rt.builtins.a // Cross-DSO CFI with diagnostics on Android links just the UBSAN runtime. // RUN: not %clang -fsanitize=cfi -fsanitize-cfi-cross-dso -### %s 2>&1 \ @@ -827,6 +842,8 @@ // CHECK-CFI-CROSS-DSO-DIAG-ANDROID: "{{.*}}ld{{(.exe)?}}" // CHECK-CFI-CROSS-DSO-DIAG-ANDROID: "{{[^"]*}}libclang_rt.ubsan_standalone.so" // CHECK-CFI-CROSS-DSO-DIAG-ANDROID: "--export-dynamic-symbol=__cfi_check" +// CHECK-CFI-CROSS-DSO-DIAG-ANDROID: libclang_rt.builtins.a +// CHECK-CFI-CROSS-DSO-DIAG-ANDROID: libclang_rt.builtins.a // RUN: %clangxx -fsanitize=address -### %s 2>&1 \ // RUN: -mmacos-version-min=10.6 \ @@ -837,6 +854,7 @@ // CHECK-ASAN-DARWIN106-CXX: "{{.*}}ld" // CHECK-ASAN-DARWIN106-CXX: libclang_rt.asan_osx_dynamic.dylib // CHECK-ASAN-DARWIN106-CXX-NOT: -lc++abi +// CHECK-ASAN-DARWIN106-CXX: libclang_rt.osx.a // RUN: %clangxx -fsanitize=leak -### %s 2>&1 \ // RUN: -mmacos-version-min=10.6 \ @@ -847,6 +865,7 @@ // CHECK-LSAN-DARWIN106-CXX: "{{.*}}ld" // CHECK-LSAN-DARWIN106-CXX: libclang_rt.lsan_osx_dynamic.dylib // CHECK-LSAN-DARWIN106-CXX-NOT: -lc++abi +// CHECK-LSAN-DARWIN106-CXX: libclang_rt.osx.a // RUN: %clang -### %s 2>&1 \ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld -fsanitize=safe-stack \ @@ -867,6 +886,7 @@ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ // RUN: | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-LINUX-X86-64 // CHECK-SHADOWCALLSTACK-LINUX-X86-64-NOT: error: +// CHECK-SHADOWCALLSTACK-LINUX-X86-64: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // RUN: not %clang -fsanitize=shadow-call-stack -### %s 2>&1 \ // RUN: 
--target=aarch64-unknown-linux -fuse-ld=ld \ @@ -877,20 +897,28 @@ // RUN: --target=riscv32-unknown-elf -fuse-ld=ld \ // RUN: | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-ELF-RISCV32 // CHECK-SHADOWCALLSTACK-ELF-RISCV32-NOT: error: +// CHECK-SHADOWCALLSTACK-ELF-RISCV32: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" +// CHECK-SHADOWCALLSTACK-ELF-RISCV32: libclang_rt.builtins.a // RUN: %clang -fsanitize=shadow-call-stack -### %s 2>&1 \ // RUN: --target=riscv64-unknown-linux -fuse-ld=ld \ // RUN: | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-LINUX-RISCV64 // CHECK-SHADOWCALLSTACK-LINUX-RISCV64-NOT: error: +// CHECK-SHADOWCALLSTACK-LINUX-RISCV64: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // RUN: %clang -target riscv64-linux-android -fsanitize=shadow-call-stack %s -### 2>&1 \ // RUN: | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-ANDROID-RISCV64 // CHECK-SHADOWCALLSTACK-ANDROID-RISCV64-NOT: error: +// CHECK-SHADOWCALLSTACK-ANDROID-RISCV64: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" +// CHECK-SHADOWCALLSTACK-ANDROID-RISCV64: libclang_rt.builtins.a +// CHECK-SHADOWCALLSTACK-ANDROID-RISCV64: libclang_rt.builtins.a // RUN: %clang -fsanitize=shadow-call-stack -### %s 2>&1 \ // RUN: --target=riscv64-unknown-fuchsia -fuse-ld=ld \ // RUN: | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-FUCHSIA-RISCV64 // CHECK-SHADOWCALLSTACK-FUCHSIA-RISCV64-NOT: error: +// CHECK-SHADOWCALLSTACK-FUCHSIA-RISCV64: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" +// CHECK-SHADOWCALLSTACK-FUCHSIA-RISCV64: libclang_rt.builtins.a // RUN: %clang -fsanitize=shadow-call-stack -### %s 2>&1 \ // RUN: --target=aarch64-unknown-linux -fuse-ld=ld -ffixed-x18 \ @@ -898,10 +926,16 @@ // RUN: %clang -fsanitize=shadow-call-stack -### %s 2>&1 \ // RUN: --target=arm64-unknown-ios -fuse-ld=ld \ // RUN: | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-LINUX-AARCH64-X18 +// CHECK-SHADOWCALLSTACK-LINUX-AARCH64-X18-NOT: error: +// CHECK-SHADOWCALLSTACK-LINUX-AARCH64-X18: "{{(.*[^-.0-9A-Z_a-z])?}}ld" + // RUN: 
%clang -fsanitize=shadow-call-stack -### %s 2>&1 \ // RUN: --target=aarch64-unknown-linux-android -fuse-ld=ld \ -// RUN: | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-LINUX-AARCH64-X18 -// CHECK-SHADOWCALLSTACK-LINUX-AARCH64-X18-NOT: error: +// RUN: | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-LINUX-AARCH64-X18-ANDROID +// CHECK-SHADOWCALLSTACK-LINUX-AARCH64-X18-ANDROID-NOT: error: +// CHECK-SHADOWCALLSTACK-LINUX-AARCH64-X18-ANDROID: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" +// CHECK-SHADOWCALLSTACK-LINUX-AARCH64-X18-ANDROID: libclang_rt.builtins.a +// CHECK-SHADOWCALLSTACK-LINUX-AARCH64-X18-ANDROID: libclang_rt.builtins.a // RUN: not %clang -fsanitize=shadow-call-stack -### %s 2>&1 \ // RUN: --target=x86-unknown-linux -fuse-ld=ld \ @@ -912,6 +946,8 @@ // RUN: -fsanitize=safe-stack --target=x86_64-unknown-linux -fuse-ld=ld \ // RUN: | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-SAFESTACK // CHECK-SHADOWCALLSTACK-SAFESTACK-NOT: error: +// CHECK-SHADOWCALLSTACK-SAFESTACK: "{{(.*[^-.0-9A-Z_a-z])?}}ld" +// CHECK-SHADOWCALLSTACK-SAFESTACK: libclang_rt.safestack.a // RUN: not %clang -fsanitize=cfi -fsanitize-stats -### %s 2>&1 \ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ @@ -967,7 +1003,8 @@ // RUN: | %{filecheck} --check-prefix=CHECK-SAFESTACK-ANDROID-ARM // // CHECK-SAFESTACK-ANDROID-ARM: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" -// CHECK-SAFESTACK-ANDROID-ARM-NOT: libclang_rt.safestack +// CHECK-SAFESTACK-ANDROID-ARM: libclang_rt.builtins.a +// CHECK-SAFESTACK-ANDROID-ARM: libclang_rt.builtins.a // RUN: %clang -### %s -shared 2>&1 \ // RUN: --target=arm-linux-androideabi -fuse-ld=ld -fsanitize=safe-stack \ @@ -975,7 +1012,8 @@ // RUN: | %{filecheck} --check-prefix=CHECK-SAFESTACK-SHARED-ANDROID-ARM // // CHECK-SAFESTACK-SHARED-ANDROID-ARM: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" -// CHECK-SAFESTACK-SHARED-ANDROID-ARM-NOT: libclang_rt.safestack +// CHECK-SAFESTACK-SHARED-ANDROID-ARM: libclang_rt.builtins.a +// 
CHECK-SAFESTACK-SHARED-ANDROID-ARM: libclang_rt.builtins.a // RUN: %clang -### %s 2>&1 \ // RUN: --target=aarch64-linux-android -fuse-ld=ld -fsanitize=safe-stack \ @@ -983,7 +1021,8 @@ // RUN: | %{filecheck} --check-prefix=CHECK-SAFESTACK-ANDROID-AARCH64 // // CHECK-SAFESTACK-ANDROID-AARCH64: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" -// CHECK-SAFESTACK-ANDROID-AARCH64-NOT: libclang_rt.safestack +// CHECK-SAFESTACK-ANDROID-AARCH64: libclang_rt.builtins.a +// CHECK-SAFESTACK-ANDROID-AARCH64: libclang_rt.builtins.a // RUN: not %clang -fsanitize=undefined -### %s 2>&1 \ // RUN: --target=x86_64-scei-ps4 -fuse-ld=ld \ @@ -1071,7 +1110,6 @@ // // CHECK-SCUDO-SHARED-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-SCUDO-SHARED-LINUX-NOT: "-lc" -// CHECK-SCUDO-SHARED-LINUX-NOT: libclang_rt.scudo_standalone.a" // CHECK-SCUDO-SHARED-LINUX: libclang_rt.scudo_standalone.so" // CHECK-SCUDO-SHARED-LINUX-NOT: "-lpthread" // CHECK-SCUDO-SHARED-LINUX-NOT: "-lrt" @@ -1093,6 +1131,8 @@ // CHECK-SCUDO-ANDROID: libclang_rt.scudo_standalone.so" // CHECK-SCUDO-ANDROID-NOT: "-lpthread" // CHECK-SCUDO-ANDROID-NOT: "-lresolv" +// CHECK-SCUDO-ANDROID: libclang_rt.builtins.a" +// CHECK-SCUDO-ANDROID: libclang_rt.builtins.a" // RUN: %clang -### %s 2>&1 \ // RUN: --target=arm-linux-androideabi -fuse-ld=ld -fsanitize=scudo \ @@ -1106,6 +1146,7 @@ // CHECK-SCUDO-ANDROID-STATIC-NOT: "-lpthread" // CHECK-SCUDO-ANDROID-STATIC-NOT: "-lrt" // CHECK-SCUDO-ANDROID-STATIC-NOT: "-lresolv" +// CHECK-SCUDO-ANDROID-STATIC: "{{.*}}libclang_rt.builtins.a" // RUN: %clang -### %s 2>&1 \ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld -fsanitize=hwaddress \ @@ -1133,6 +1174,7 @@ // CHECK-SHARED-HWASAN-X86-64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-SHARED-HWASAN-X86-64-LINUX-NOT: "-lc" // CHECK-SHARED-HWASAN-X86-64-LINUX: libclang_rt.hwasan.so" +// CHECK-SHARED-HWASAN-X86-64-LINUX: libclang_rt.hwasan-preinit.a" // CHECK-SHARED-HWASAN-X86-64-LINUX-NOT: "-lpthread" // CHECK-SHARED-HWASAN-X86-64-LINUX-NOT: 
"-lrt" // CHECK-SHARED-HWASAN-X86-64-LINUX-NOT: "-ldl" @@ -1183,6 +1225,7 @@ // CHECK-SHARED-HWASAN-AARCH64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // CHECK-SHARED-HWASAN-AARCH64-LINUX-NOT: "-lc" // CHECK-SHARED-HWASAN-AARCH64-LINUX: libclang_rt.hwasan.so" +// CHECK-SHARED-HWASAN-AARCH64-LINUX: libclang_rt.hwasan-preinit.a" // CHECK-SHARED-HWASAN-AARCH64-LINUX-NOT: "-lpthread" // CHECK-SHARED-HWASAN-AARCH64-LINUX-NOT: "-lrt" // CHECK-SHARED-HWASAN-AARCH64-LINUX-NOT: "-ldl" From 8230b8a60e8763a90b85a83cfd4ceeac1174ac84 Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Fri, 27 Dec 2024 14:01:03 +0900 Subject: [PATCH 101/567] test/llvm-cov/branch-noShowBranch.test: Align `CHECK-NOT`s to branch-c-general.test's in #113114 --- llvm/test/tools/llvm-cov/branch-noShowBranch.test | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/llvm/test/tools/llvm-cov/branch-noShowBranch.test b/llvm/test/tools/llvm-cov/branch-noShowBranch.test index cabeeb01bfe3e..9f3cfd55f029b 100644 --- a/llvm/test/tools/llvm-cov/branch-noShowBranch.test +++ b/llvm/test/tools/llvm-cov/branch-noShowBranch.test @@ -12,7 +12,7 @@ // REPORT-NOT: conditionals 24 0 100.00% 15 0 100.00% 16 2 87.50% // REPORT-NOT: early_exits 20 4 80.00% 25 2 92.00% 16 6 62.50% // REPORT-NOT: jumps 39 12 69.23% 48 2 95.83% 26 9 65.38% -// REPORT-NOT: switches 28 5 82.14% 38 4 89.47% 30 9 70.00% +// REPORT-NOT: switches 28 5 82.14% 38 4 89.47% 28 7 75.00% // REPORT-NOT: big_switch 25 1 96.00% 32 0 100.00% 30 6 80.00% // REPORT-NOT: boolean_operators 16 0 100.00% 13 0 100.00% 22 2 90.91% // REPORT-NOT: boolop_loops 19 0 100.00% 14 0 100.00% 16 2 87.50% @@ -21,5 +21,4 @@ // REPORT-NOT: main 1 0 100.00% 16 0 100.00% 0 0 0.00% // REPORT-NOT: c-general.c:static_func 4 0 100.00% 4 0 100.00% 2 0 100.00% // REPORT: TOTAL 197 24 87.82% 234 8 96.58% -// REPORT-NOT: TOTAL 197 24 87.82% 234 13 94.44% 174 38 78.16% - +// REPORT-NOT: TOTAL 197 24 87.82% 234 8 96.58% 172 36 79.07% From 
3d9f9684a56ff049b5d5454bdb73f406c5af3959 Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Fri, 27 Dec 2024 14:08:25 +0900 Subject: [PATCH 102/567] llvm-cov: Split out `sumRegions()` from `FunctionCoverageSummary::get()`. NFC. --- llvm/tools/llvm-cov/CoverageSummaryInfo.cpp | 22 +++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/llvm/tools/llvm-cov/CoverageSummaryInfo.cpp b/llvm/tools/llvm-cov/CoverageSummaryInfo.cpp index 58e7918d39270..ad7561d3dc62c 100644 --- a/llvm/tools/llvm-cov/CoverageSummaryInfo.cpp +++ b/llvm/tools/llvm-cov/CoverageSummaryInfo.cpp @@ -59,12 +59,11 @@ sumMCDCPairs(const ArrayRef &Records) { return {NumPairs, CoveredPairs}; } -FunctionCoverageSummary -FunctionCoverageSummary::get(const CoverageMapping &CM, - const coverage::FunctionRecord &Function) { +static std::pair +sumRegions(ArrayRef CodeRegions, const CoverageData &CD) { // Compute the region coverage. size_t NumCodeRegions = 0, CoveredRegions = 0; - for (auto &CR : Function.CountedRegions) { + for (auto &CR : CodeRegions) { if (CR.Kind != CounterMappingRegion::CodeRegion) continue; ++NumCodeRegions; @@ -74,7 +73,6 @@ FunctionCoverageSummary::get(const CoverageMapping &CM, // Compute the line coverage size_t NumLines = 0, CoveredLines = 0; - CoverageData CD = CM.getCoverageForFunction(Function); for (const auto &LCS : getLineCoverageStats(CD)) { if (!LCS.isMapped()) continue; @@ -83,6 +81,16 @@ FunctionCoverageSummary::get(const CoverageMapping &CM, ++CoveredLines; } + return {RegionCoverageInfo(CoveredRegions, NumCodeRegions), + LineCoverageInfo(CoveredLines, NumLines)}; +} + +FunctionCoverageSummary +FunctionCoverageSummary::get(const CoverageMapping &CM, + const coverage::FunctionRecord &Function) { + CoverageData CD = CM.getCoverageForFunction(Function); + auto [RegionCoverage, LineCoverage] = sumRegions(Function.CountedRegions, CD); + // Compute the branch coverage, including branches from expansions. 
size_t NumBranches = 0, CoveredBranches = 0; sumBranches(NumBranches, CoveredBranches, CD.getBranches()); @@ -92,9 +100,7 @@ FunctionCoverageSummary::get(const CoverageMapping &CM, std::tie(NumPairs, CoveredPairs) = sumMCDCPairs(CD.getMCDCRecords()); return FunctionCoverageSummary( - Function.Name, Function.ExecutionCount, - RegionCoverageInfo(CoveredRegions, NumCodeRegions), - LineCoverageInfo(CoveredLines, NumLines), + Function.Name, Function.ExecutionCount, RegionCoverage, LineCoverage, BranchCoverageInfo(CoveredBranches, NumBranches), MCDCCoverageInfo(CoveredPairs, NumPairs)); } From cd3c1658ee3ff882ff9c51488662a4c3f21e6d9c Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 26 Dec 2024 20:45:29 -0800 Subject: [PATCH 103/567] [RISCV] Add more tests to rv*xtheadba.ll. NFC XTheadba has similarities with Zba and shares some of the same codegen code and has similar isel patterns. This patch makes the testing more similar. --- llvm/test/CodeGen/RISCV/rv32xtheadba.ll | 552 +++++++- llvm/test/CodeGen/RISCV/rv64xtheadba.ll | 1684 +++++++++++++++++++++-- 2 files changed, 2131 insertions(+), 105 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rv32xtheadba.ll b/llvm/test/CodeGen/RISCV/rv32xtheadba.ll index 332e49771bedf..effbcc0e08f3e 100644 --- a/llvm/test/CodeGen/RISCV/rv32xtheadba.ll +++ b/llvm/test/CodeGen/RISCV/rv32xtheadba.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub ; RUN: llc -mtriple=riscv32 -mattr=+m -verify-machineinstrs < %s \ -; RUN: | FileCheck %s -check-prefixes=RV32I +; RUN: | FileCheck %s -check-prefixes=CHECK,RV32I ; RUN: llc -mtriple=riscv32 -mattr=+m,+xtheadba -verify-machineinstrs < %s \ -; RUN: | FileCheck %s -check-prefixes=RV32XTHEADBA +; RUN: | FileCheck %s -check-prefixes=CHECK,RV32XTHEADBA define signext i16 @th_addsl_1(i64 %0, ptr %1) { ; RV32I-LABEL: th_addsl_1: @@ -324,3 +324,551 @@ define i32 @mul288(i32 %a) { %c = mul i32 %a, 288 ret i32 %c } + +define i32 
@mul258(i32 %a) { +; RV32I-LABEL: mul258: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, 258 +; RV32I-NEXT: mul a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32XTHEADBA-LABEL: mul258: +; RV32XTHEADBA: # %bb.0: +; RV32XTHEADBA-NEXT: slli a1, a0, 8 +; RV32XTHEADBA-NEXT: th.addsl a0, a1, a0, 1 +; RV32XTHEADBA-NEXT: ret + %c = mul i32 %a, 258 + ret i32 %c +} + +define i32 @mul260(i32 %a) { +; RV32I-LABEL: mul260: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, 260 +; RV32I-NEXT: mul a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32XTHEADBA-LABEL: mul260: +; RV32XTHEADBA: # %bb.0: +; RV32XTHEADBA-NEXT: slli a1, a0, 8 +; RV32XTHEADBA-NEXT: th.addsl a0, a1, a0, 2 +; RV32XTHEADBA-NEXT: ret + %c = mul i32 %a, 260 + ret i32 %c +} + +define i32 @mul264(i32 %a) { +; RV32I-LABEL: mul264: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, 264 +; RV32I-NEXT: mul a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32XTHEADBA-LABEL: mul264: +; RV32XTHEADBA: # %bb.0: +; RV32XTHEADBA-NEXT: slli a1, a0, 8 +; RV32XTHEADBA-NEXT: th.addsl a0, a1, a0, 3 +; RV32XTHEADBA-NEXT: ret + %c = mul i32 %a, 264 + ret i32 %c +} + +define i32 @mul11(i32 %a) { +; RV32I-LABEL: mul11: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, 11 +; RV32I-NEXT: mul a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32XTHEADBA-LABEL: mul11: +; RV32XTHEADBA: # %bb.0: +; RV32XTHEADBA-NEXT: th.addsl a1, a0, a0, 2 +; RV32XTHEADBA-NEXT: th.addsl a0, a0, a1, 1 +; RV32XTHEADBA-NEXT: ret + %c = mul i32 %a, 11 + ret i32 %c +} + +define i32 @mul19(i32 %a) { +; RV32I-LABEL: mul19: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, 19 +; RV32I-NEXT: mul a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32XTHEADBA-LABEL: mul19: +; RV32XTHEADBA: # %bb.0: +; RV32XTHEADBA-NEXT: th.addsl a1, a0, a0, 3 +; RV32XTHEADBA-NEXT: th.addsl a0, a0, a1, 1 +; RV32XTHEADBA-NEXT: ret + %c = mul i32 %a, 19 + ret i32 %c +} + +define i32 @mul13(i32 %a) { +; RV32I-LABEL: mul13: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, 13 +; RV32I-NEXT: mul a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32XTHEADBA-LABEL: mul13: +; RV32XTHEADBA: # %bb.0: +; 
RV32XTHEADBA-NEXT: th.addsl a1, a0, a0, 1 +; RV32XTHEADBA-NEXT: th.addsl a0, a0, a1, 2 +; RV32XTHEADBA-NEXT: ret + %c = mul i32 %a, 13 + ret i32 %c +} + +define i32 @mul21(i32 %a) { +; RV32I-LABEL: mul21: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, 21 +; RV32I-NEXT: mul a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32XTHEADBA-LABEL: mul21: +; RV32XTHEADBA: # %bb.0: +; RV32XTHEADBA-NEXT: th.addsl a1, a0, a0, 2 +; RV32XTHEADBA-NEXT: th.addsl a0, a0, a1, 2 +; RV32XTHEADBA-NEXT: ret + %c = mul i32 %a, 21 + ret i32 %c +} + +define i32 @mul37(i32 %a) { +; RV32I-LABEL: mul37: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, 37 +; RV32I-NEXT: mul a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32XTHEADBA-LABEL: mul37: +; RV32XTHEADBA: # %bb.0: +; RV32XTHEADBA-NEXT: th.addsl a1, a0, a0, 3 +; RV32XTHEADBA-NEXT: th.addsl a0, a0, a1, 2 +; RV32XTHEADBA-NEXT: ret + %c = mul i32 %a, 37 + ret i32 %c +} + +define i32 @mul25(i32 %a) { +; RV32I-LABEL: mul25: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, 25 +; RV32I-NEXT: mul a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32XTHEADBA-LABEL: mul25: +; RV32XTHEADBA: # %bb.0: +; RV32XTHEADBA-NEXT: th.addsl a0, a0, a0, 2 +; RV32XTHEADBA-NEXT: th.addsl a0, a0, a0, 2 +; RV32XTHEADBA-NEXT: ret + %c = mul i32 %a, 25 + ret i32 %c +} + +define i32 @mul41(i32 %a) { +; RV32I-LABEL: mul41: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, 41 +; RV32I-NEXT: mul a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32XTHEADBA-LABEL: mul41: +; RV32XTHEADBA: # %bb.0: +; RV32XTHEADBA-NEXT: th.addsl a1, a0, a0, 2 +; RV32XTHEADBA-NEXT: th.addsl a0, a0, a1, 3 +; RV32XTHEADBA-NEXT: ret + %c = mul i32 %a, 41 + ret i32 %c +} + +define i32 @mul73(i32 %a) { +; RV32I-LABEL: mul73: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, 73 +; RV32I-NEXT: mul a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32XTHEADBA-LABEL: mul73: +; RV32XTHEADBA: # %bb.0: +; RV32XTHEADBA-NEXT: th.addsl a1, a0, a0, 3 +; RV32XTHEADBA-NEXT: th.addsl a0, a0, a1, 3 +; RV32XTHEADBA-NEXT: ret + %c = mul i32 %a, 73 + ret i32 %c +} + +define i32 @mul27(i32 %a) { +; RV32I-LABEL: 
mul27: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, 27 +; RV32I-NEXT: mul a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32XTHEADBA-LABEL: mul27: +; RV32XTHEADBA: # %bb.0: +; RV32XTHEADBA-NEXT: th.addsl a0, a0, a0, 1 +; RV32XTHEADBA-NEXT: th.addsl a0, a0, a0, 3 +; RV32XTHEADBA-NEXT: ret + %c = mul i32 %a, 27 + ret i32 %c +} + +define i32 @mul45(i32 %a) { +; RV32I-LABEL: mul45: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, 45 +; RV32I-NEXT: mul a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32XTHEADBA-LABEL: mul45: +; RV32XTHEADBA: # %bb.0: +; RV32XTHEADBA-NEXT: th.addsl a0, a0, a0, 2 +; RV32XTHEADBA-NEXT: th.addsl a0, a0, a0, 3 +; RV32XTHEADBA-NEXT: ret + %c = mul i32 %a, 45 + ret i32 %c +} + +define i32 @mul81(i32 %a) { +; RV32I-LABEL: mul81: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, 81 +; RV32I-NEXT: mul a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32XTHEADBA-LABEL: mul81: +; RV32XTHEADBA: # %bb.0: +; RV32XTHEADBA-NEXT: th.addsl a0, a0, a0, 3 +; RV32XTHEADBA-NEXT: th.addsl a0, a0, a0, 3 +; RV32XTHEADBA-NEXT: ret + %c = mul i32 %a, 81 + ret i32 %c +} + +define i32 @mul4098(i32 %a) { +; RV32I-LABEL: mul4098: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a0, 1 +; RV32I-NEXT: slli a0, a0, 12 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32XTHEADBA-LABEL: mul4098: +; RV32XTHEADBA: # %bb.0: +; RV32XTHEADBA-NEXT: slli a1, a0, 12 +; RV32XTHEADBA-NEXT: th.addsl a0, a1, a0, 1 +; RV32XTHEADBA-NEXT: ret + %c = mul i32 %a, 4098 + ret i32 %c +} + +define i32 @mul4100(i32 %a) { +; RV32I-LABEL: mul4100: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a0, 2 +; RV32I-NEXT: slli a0, a0, 12 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32XTHEADBA-LABEL: mul4100: +; RV32XTHEADBA: # %bb.0: +; RV32XTHEADBA-NEXT: slli a1, a0, 12 +; RV32XTHEADBA-NEXT: th.addsl a0, a1, a0, 2 +; RV32XTHEADBA-NEXT: ret + %c = mul i32 %a, 4100 + ret i32 %c +} + +define i32 @mul4104(i32 %a) { +; RV32I-LABEL: mul4104: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a0, 3 +; RV32I-NEXT: slli a0, a0, 12 +; RV32I-NEXT: add a0, a0, a1 +; 
RV32I-NEXT: ret +; +; RV32XTHEADBA-LABEL: mul4104: +; RV32XTHEADBA: # %bb.0: +; RV32XTHEADBA-NEXT: slli a1, a0, 12 +; RV32XTHEADBA-NEXT: th.addsl a0, a1, a0, 3 +; RV32XTHEADBA-NEXT: ret + %c = mul i32 %a, 4104 + ret i32 %c +} + +define i32 @add4104(i32 %a) { +; CHECK-LABEL: add4104: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a1, 1 +; CHECK-NEXT: addi a1, a1, 8 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: ret + %c = add i32 %a, 4104 + ret i32 %c +} + +define i32 @add8208(i32 %a) { +; CHECK-LABEL: add8208: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a1, 2 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: ret + %c = add i32 %a, 8208 + ret i32 %c +} + +define i32 @add8192(i32 %a) { +; CHECK-LABEL: add8192: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a1, 2 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: ret + %c = add i32 %a, 8192 + ret i32 %c +} + +define i32 @addshl_5_6(i32 %a, i32 %b) { +; CHECK-LABEL: addshl_5_6: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: slli a1, a1, 6 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: ret + %c = shl i32 %a, 5 + %d = shl i32 %b, 6 + %e = add i32 %c, %d + ret i32 %e +} + +define i32 @addshl_5_7(i32 %a, i32 %b) { +; CHECK-LABEL: addshl_5_7: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: slli a1, a1, 7 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: ret + %c = shl i32 %a, 5 + %d = shl i32 %b, 7 + %e = add i32 %c, %d + ret i32 %e +} + +define i32 @addshl_5_8(i32 %a, i32 %b) { +; CHECK-LABEL: addshl_5_8: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: slli a1, a1, 8 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: ret + %c = shl i32 %a, 5 + %d = shl i32 %b, 8 + %e = add i32 %c, %d + ret i32 %e +} + +define i32 @srli_1_sh2add(ptr %0, i32 %1) { +; RV32I-LABEL: srli_1_sh2add: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a1, 1 +; RV32I-NEXT: andi a1, a1, -4 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: ret +; +; RV32XTHEADBA-LABEL: srli_1_sh2add: +; RV32XTHEADBA: # 
%bb.0: +; RV32XTHEADBA-NEXT: srli a1, a1, 1 +; RV32XTHEADBA-NEXT: th.addsl a0, a0, a1, 2 +; RV32XTHEADBA-NEXT: lw a0, 0(a0) +; RV32XTHEADBA-NEXT: ret + %3 = lshr i32 %1, 1 + %4 = getelementptr inbounds i32, ptr %0, i32 %3 + %5 = load i32, ptr %4, align 4 + ret i32 %5 +} + +define i64 @srli_2_sh3add(ptr %0, i32 %1) { +; RV32I-LABEL: srli_2_sh3add: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a1, 1 +; RV32I-NEXT: andi a1, a1, -8 +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: lw a0, 0(a1) +; RV32I-NEXT: lw a1, 4(a1) +; RV32I-NEXT: ret +; +; RV32XTHEADBA-LABEL: srli_2_sh3add: +; RV32XTHEADBA: # %bb.0: +; RV32XTHEADBA-NEXT: srli a1, a1, 2 +; RV32XTHEADBA-NEXT: th.addsl a1, a0, a1, 3 +; RV32XTHEADBA-NEXT: lw a0, 0(a1) +; RV32XTHEADBA-NEXT: lw a1, 4(a1) +; RV32XTHEADBA-NEXT: ret + %3 = lshr i32 %1, 2 + %4 = getelementptr inbounds i64, ptr %0, i32 %3 + %5 = load i64, ptr %4, align 8 + ret i64 %5 +} + +define signext i16 @srli_2_sh1add(ptr %0, i32 %1) { +; RV32I-LABEL: srli_2_sh1add: +; RV32I: # %bb.0: +; RV32I-NEXT: srli a1, a1, 1 +; RV32I-NEXT: andi a1, a1, -2 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: lh a0, 0(a0) +; RV32I-NEXT: ret +; +; RV32XTHEADBA-LABEL: srli_2_sh1add: +; RV32XTHEADBA: # %bb.0: +; RV32XTHEADBA-NEXT: srli a1, a1, 2 +; RV32XTHEADBA-NEXT: th.addsl a0, a0, a1, 1 +; RV32XTHEADBA-NEXT: lh a0, 0(a0) +; RV32XTHEADBA-NEXT: ret + %3 = lshr i32 %1, 2 + %4 = getelementptr inbounds i16, ptr %0, i32 %3 + %5 = load i16, ptr %4, align 2 + ret i16 %5 +} + +define i32 @srli_3_sh2add(ptr %0, i32 %1) { +; RV32I-LABEL: srli_3_sh2add: +; RV32I: # %bb.0: +; RV32I-NEXT: srli a1, a1, 1 +; RV32I-NEXT: andi a1, a1, -4 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: ret +; +; RV32XTHEADBA-LABEL: srli_3_sh2add: +; RV32XTHEADBA: # %bb.0: +; RV32XTHEADBA-NEXT: srli a1, a1, 3 +; RV32XTHEADBA-NEXT: th.addsl a0, a0, a1, 2 +; RV32XTHEADBA-NEXT: lw a0, 0(a0) +; RV32XTHEADBA-NEXT: ret + %3 = lshr i32 %1, 3 + %4 = getelementptr inbounds i32, ptr %0, i32 %3 + %5 = 
load i32, ptr %4, align 4 + ret i32 %5 +} + +define i64 @srli_4_sh3add(ptr %0, i32 %1) { +; RV32I-LABEL: srli_4_sh3add: +; RV32I: # %bb.0: +; RV32I-NEXT: srli a1, a1, 1 +; RV32I-NEXT: andi a1, a1, -8 +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: lw a0, 0(a1) +; RV32I-NEXT: lw a1, 4(a1) +; RV32I-NEXT: ret +; +; RV32XTHEADBA-LABEL: srli_4_sh3add: +; RV32XTHEADBA: # %bb.0: +; RV32XTHEADBA-NEXT: srli a1, a1, 4 +; RV32XTHEADBA-NEXT: th.addsl a1, a0, a1, 3 +; RV32XTHEADBA-NEXT: lw a0, 0(a1) +; RV32XTHEADBA-NEXT: lw a1, 4(a1) +; RV32XTHEADBA-NEXT: ret + %3 = lshr i32 %1, 4 + %4 = getelementptr inbounds i64, ptr %0, i32 %3 + %5 = load i64, ptr %4, align 8 + ret i64 %5 +} + +define i32 @mul_neg1(i32 %a) { +; CHECK-LABEL: mul_neg1: +; CHECK: # %bb.0: +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %c = mul i32 %a, -1 + ret i32 %c +} + +define i32 @mul_neg2(i32 %a) { +; CHECK-LABEL: mul_neg2: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %c = mul i32 %a, -2 + ret i32 %c +} + +define i32 @mul_neg3(i32 %a) { +; RV32I-LABEL: mul_neg3: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a0, 1 +; RV32I-NEXT: neg a0, a0 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32XTHEADBA-LABEL: mul_neg3: +; RV32XTHEADBA: # %bb.0: +; RV32XTHEADBA-NEXT: th.addsl a0, a0, a0, 1 +; RV32XTHEADBA-NEXT: neg a0, a0 +; RV32XTHEADBA-NEXT: ret + %c = mul i32 %a, -3 + ret i32 %c +} + +define i32 @mul_neg4(i32 %a) { +; CHECK-LABEL: mul_neg4: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %c = mul i32 %a, -4 + ret i32 %c +} + +define i32 @mul_neg5(i32 %a) { +; RV32I-LABEL: mul_neg5: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a0, 2 +; RV32I-NEXT: neg a0, a0 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32XTHEADBA-LABEL: mul_neg5: +; RV32XTHEADBA: # %bb.0: +; RV32XTHEADBA-NEXT: th.addsl a0, a0, a0, 2 +; RV32XTHEADBA-NEXT: neg a0, a0 +; RV32XTHEADBA-NEXT: ret + %c = mul i32 %a, -5 + ret i32 %c +} + 
+define i32 @mul_neg6(i32 %a) { +; CHECK-LABEL: mul_neg6: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -6 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i32 %a, -6 + ret i32 %c +} + +define i32 @mul_neg7(i32 %a) { +; CHECK-LABEL: mul_neg7: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a1, a0, 3 +; CHECK-NEXT: sub a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i32 %a, -7 + ret i32 %c +} + +define i32 @mul_neg8(i32 %a) { +; CHECK-LABEL: mul_neg8: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %c = mul i32 %a, -8 + ret i32 %c +} diff --git a/llvm/test/CodeGen/RISCV/rv64xtheadba.ll b/llvm/test/CodeGen/RISCV/rv64xtheadba.ll index 2d44ffbf63749..08449de913b98 100644 --- a/llvm/test/CodeGen/RISCV/rv64xtheadba.ll +++ b/llvm/test/CodeGen/RISCV/rv64xtheadba.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub ; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs < %s \ -; RUN: | FileCheck %s -check-prefixes=RV64I +; RUN: | FileCheck %s -check-prefixes=CHECK,RV64I ; RUN: llc -mtriple=riscv64 -mattr=+m,+xtheadba -verify-machineinstrs < %s \ -; RUN: | FileCheck %s -check-prefixes=RV64XTHEADBA +; RUN: | FileCheck %s -check-prefixes=CHECK,RV64XTHEADBA define signext i16 @th_addsl_1(i64 %0, ptr %1) { ; RV64I-LABEL: th_addsl_1: @@ -182,6 +182,18 @@ define i64 @addmul20(i64 %a, i64 %b) { ret i64 %d } +define i64 @addmul22(i64 %a, i64 %b) { +; CHECK-LABEL: addmul22: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 22 +; CHECK-NEXT: mul a0, a0, a2 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i64 %a, 22 + %d = add i64 %c, %b + ret i64 %d +} + define i64 @addmul24(i64 %a, i64 %b) { ; RV64I-LABEL: addmul24: ; RV64I: # %bb.0: @@ -255,182 +267,229 @@ define i64 @addmul72(i64 %a, i64 %b) { ret i64 %d } -define i64 @mul11(i64 %a) { -; RV64I-LABEL: mul11: +define i64 @mul50(i64 %a) { +; RV64I-LABEL: mul50: ; RV64I: # %bb.0: -; RV64I-NEXT: li a1, 11 +; RV64I-NEXT: li 
a1, 50 ; RV64I-NEXT: mul a0, a0, a1 ; RV64I-NEXT: ret ; -; RV64XTHEADBA-LABEL: mul11: +; RV64XTHEADBA-LABEL: mul50: ; RV64XTHEADBA: # %bb.0: -; RV64XTHEADBA-NEXT: th.addsl a1, a0, a0, 2 -; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 1 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 2 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 2 +; RV64XTHEADBA-NEXT: slli a0, a0, 1 ; RV64XTHEADBA-NEXT: ret - %c = mul i64 %a, 11 + %c = mul i64 %a, 50 ret i64 %c } -define i64 @mul19(i64 %a) { -; RV64I-LABEL: mul19: +define i64 @addmul50(i64 %a, i64 %b) { +; RV64I-LABEL: addmul50: ; RV64I: # %bb.0: -; RV64I-NEXT: li a1, 19 -; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: li a2, 50 +; RV64I-NEXT: mul a0, a0, a2 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: ret ; -; RV64XTHEADBA-LABEL: mul19: +; RV64XTHEADBA-LABEL: addmul50: ; RV64XTHEADBA: # %bb.0: -; RV64XTHEADBA-NEXT: th.addsl a1, a0, a0, 3 -; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 1 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 2 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 2 +; RV64XTHEADBA-NEXT: th.addsl a0, a1, a0, 1 ; RV64XTHEADBA-NEXT: ret - %c = mul i64 %a, 19 - ret i64 %c + %c = mul i64 %a, 50 + %d = add i64 %c, %b + ret i64 %d } -define i64 @mul13(i64 %a) { -; RV64I-LABEL: mul13: +define i64 @mul100(i64 %a) { +; RV64I-LABEL: mul100: ; RV64I: # %bb.0: -; RV64I-NEXT: li a1, 13 +; RV64I-NEXT: li a1, 100 ; RV64I-NEXT: mul a0, a0, a1 ; RV64I-NEXT: ret ; -; RV64XTHEADBA-LABEL: mul13: +; RV64XTHEADBA-LABEL: mul100: ; RV64XTHEADBA: # %bb.0: -; RV64XTHEADBA-NEXT: th.addsl a1, a0, a0, 1 -; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 2 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 2 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 2 +; RV64XTHEADBA-NEXT: slli a0, a0, 2 ; RV64XTHEADBA-NEXT: ret - %c = mul i64 %a, 13 + %c = mul i64 %a, 100 ret i64 %c } -define i64 @mul21(i64 %a) { -; RV64I-LABEL: mul21: +define i64 @addmul100(i64 %a, i64 %b) { +; RV64I-LABEL: addmul100: ; RV64I: # %bb.0: -; RV64I-NEXT: li a1, 21 -; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: li a2, 100 +; 
RV64I-NEXT: mul a0, a0, a2 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: ret ; -; RV64XTHEADBA-LABEL: mul21: +; RV64XTHEADBA-LABEL: addmul100: ; RV64XTHEADBA: # %bb.0: -; RV64XTHEADBA-NEXT: th.addsl a1, a0, a0, 2 -; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 2 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 2 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 2 +; RV64XTHEADBA-NEXT: th.addsl a0, a1, a0, 2 ; RV64XTHEADBA-NEXT: ret - %c = mul i64 %a, 21 - ret i64 %c + %c = mul i64 %a, 100 + %d = add i64 %c, %b + ret i64 %d } -define i64 @mul37(i64 %a) { -; RV64I-LABEL: mul37: +define i64 @mul162(i64 %a) { +; RV64I-LABEL: mul162: ; RV64I: # %bb.0: -; RV64I-NEXT: li a1, 37 +; RV64I-NEXT: li a1, 162 ; RV64I-NEXT: mul a0, a0, a1 ; RV64I-NEXT: ret ; -; RV64XTHEADBA-LABEL: mul37: +; RV64XTHEADBA-LABEL: mul162: ; RV64XTHEADBA: # %bb.0: -; RV64XTHEADBA-NEXT: th.addsl a1, a0, a0, 3 -; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 2 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 3 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 3 +; RV64XTHEADBA-NEXT: slli a0, a0, 1 ; RV64XTHEADBA-NEXT: ret - %c = mul i64 %a, 37 + %c = mul i64 %a, 162 ret i64 %c } -define i64 @mul25(i64 %a) { -; RV64I-LABEL: mul25: +define i64 @addmul162(i64 %a, i64 %b) { +; RV64I-LABEL: addmul162: ; RV64I: # %bb.0: -; RV64I-NEXT: li a1, 25 -; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: li a2, 162 +; RV64I-NEXT: mul a0, a0, a2 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: ret ; -; RV64XTHEADBA-LABEL: mul25: +; RV64XTHEADBA-LABEL: addmul162: ; RV64XTHEADBA: # %bb.0: -; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 2 -; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 2 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 3 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 3 +; RV64XTHEADBA-NEXT: th.addsl a0, a1, a0, 1 ; RV64XTHEADBA-NEXT: ret - %c = mul i64 %a, 25 - ret i64 %c + %c = mul i64 %a, 162 + %d = add i64 %c, %b + ret i64 %d } -define i64 @mul41(i64 %a) { -; RV64I-LABEL: mul41: +define i64 @mul180(i64 %a) { +; RV64I-LABEL: mul180: ; RV64I: # %bb.0: -; RV64I-NEXT: li a1, 
41 +; RV64I-NEXT: li a1, 180 ; RV64I-NEXT: mul a0, a0, a1 ; RV64I-NEXT: ret ; -; RV64XTHEADBA-LABEL: mul41: +; RV64XTHEADBA-LABEL: mul180: ; RV64XTHEADBA: # %bb.0: -; RV64XTHEADBA-NEXT: th.addsl a1, a0, a0, 2 -; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 3 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 2 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 3 +; RV64XTHEADBA-NEXT: slli a0, a0, 2 ; RV64XTHEADBA-NEXT: ret - %c = mul i64 %a, 41 + %c = mul i64 %a, 180 ret i64 %c } -define i64 @mul73(i64 %a) { -; RV64I-LABEL: mul73: +define i64 @addmul180(i64 %a, i64 %b) { +; RV64I-LABEL: addmul180: ; RV64I: # %bb.0: -; RV64I-NEXT: li a1, 73 -; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: li a2, 180 +; RV64I-NEXT: mul a0, a0, a2 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: ret ; -; RV64XTHEADBA-LABEL: mul73: +; RV64XTHEADBA-LABEL: addmul180: ; RV64XTHEADBA: # %bb.0: -; RV64XTHEADBA-NEXT: th.addsl a1, a0, a0, 3 -; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 3 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 2 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 3 +; RV64XTHEADBA-NEXT: th.addsl a0, a1, a0, 2 ; RV64XTHEADBA-NEXT: ret - %c = mul i64 %a, 73 - ret i64 %c + %c = mul i64 %a, 180 + %d = add i64 %c, %b + ret i64 %d } -define i64 @mul27(i64 %a) { -; RV64I-LABEL: mul27: +define i64 @add255mul180(i64 %a) { +; RV64I-LABEL: add255mul180: ; RV64I: # %bb.0: -; RV64I-NEXT: li a1, 27 +; RV64I-NEXT: li a1, 180 ; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: addi a0, a0, 255 ; RV64I-NEXT: ret ; -; RV64XTHEADBA-LABEL: mul27: +; RV64XTHEADBA-LABEL: add255mul180: ; RV64XTHEADBA: # %bb.0: -; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 1 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 2 ; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 3 +; RV64XTHEADBA-NEXT: li a1, 255 +; RV64XTHEADBA-NEXT: th.addsl a0, a1, a0, 2 ; RV64XTHEADBA-NEXT: ret - %c = mul i64 %a, 27 - ret i64 %c + %c = mul i64 %a, 180 + %d = add i64 %c, 255 + ret i64 %d } -define i64 @mul45(i64 %a) { -; RV64I-LABEL: mul45: +define i64 @mul200(i64 %a) { +; RV64I-LABEL: mul200: ; 
RV64I: # %bb.0: -; RV64I-NEXT: li a1, 45 +; RV64I-NEXT: li a1, 200 ; RV64I-NEXT: mul a0, a0, a1 ; RV64I-NEXT: ret ; -; RV64XTHEADBA-LABEL: mul45: +; RV64XTHEADBA-LABEL: mul200: ; RV64XTHEADBA: # %bb.0: ; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 2 -; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 3 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 2 +; RV64XTHEADBA-NEXT: slli a0, a0, 3 ; RV64XTHEADBA-NEXT: ret - %c = mul i64 %a, 45 + %c = mul i64 %a, 200 ret i64 %c } -define i64 @mul81(i64 %a) { -; RV64I-LABEL: mul81: +define i64 @addmul200(i64 %a, i64 %b) { +; RV64I-LABEL: addmul200: ; RV64I: # %bb.0: -; RV64I-NEXT: li a1, 81 -; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: li a2, 200 +; RV64I-NEXT: mul a0, a0, a2 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: ret ; -; RV64XTHEADBA-LABEL: mul81: +; RV64XTHEADBA-LABEL: addmul200: ; RV64XTHEADBA: # %bb.0: -; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 3 -; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 3 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 2 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 2 +; RV64XTHEADBA-NEXT: th.addsl a0, a1, a0, 3 ; RV64XTHEADBA-NEXT: ret - %c = mul i64 %a, 81 - ret i64 %c + %c = mul i64 %a, 200 + %d = add i64 %c, %b + ret i64 %d } +define i64 @addmul4096(i64 %a, i64 %b) { +; CHECK-LABEL: addmul4096: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a0, a0, 12 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i64 %a, 4096 + %d = add i64 %c, %b + ret i64 %d +} + +define i64 @addmul4230(i64 %a, i64 %b) { +; CHECK-LABEL: addmul4230: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a2, 1 +; CHECK-NEXT: addiw a2, a2, 134 +; CHECK-NEXT: mul a0, a0, a2 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i64 %a, 4230 + %d = add i64 %c, %b + ret i64 %d +} define i64 @mul96(i64 %a) { ; RV64I-LABEL: mul96: @@ -449,6 +508,91 @@ define i64 @mul96(i64 %a) { ret i64 %c } +define i64 @mul119(i64 %a) { +; RV64I-LABEL: mul119: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 119 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: 
mul119: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: th.addsl a1, a0, a0, 3 +; RV64XTHEADBA-NEXT: slli a0, a0, 7 +; RV64XTHEADBA-NEXT: sub a0, a0, a1 +; RV64XTHEADBA-NEXT: ret + %c = mul i64 %a, 119 + ret i64 %c +} + +define i64 @mul123(i64 %a) { +; RV64I-LABEL: mul123: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 123 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: mul123: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: th.addsl a1, a0, a0, 2 +; RV64XTHEADBA-NEXT: slli a0, a0, 7 +; RV64XTHEADBA-NEXT: sub a0, a0, a1 +; RV64XTHEADBA-NEXT: ret + %c = mul i64 %a, 123 + ret i64 %c +} + +define i64 @mul125(i64 %a) { +; RV64I-LABEL: mul125: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 125 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: mul125: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: th.addsl a1, a0, a0, 1 +; RV64XTHEADBA-NEXT: slli a0, a0, 7 +; RV64XTHEADBA-NEXT: sub a0, a0, a1 +; RV64XTHEADBA-NEXT: ret + %c = mul i64 %a, 125 + ret i64 %c +} + +define i64 @mul131(i64 %a) { +; RV64I-LABEL: mul131: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 131 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: mul131: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: th.addsl a1, a0, a0, 1 +; RV64XTHEADBA-NEXT: slli a0, a0, 7 +; RV64XTHEADBA-NEXT: add a0, a0, a1 +; RV64XTHEADBA-NEXT: ret + %c = mul i64 %a, 131 + ret i64 %c +} + +define i64 @mul133(i64 %a) { +; RV64I-LABEL: mul133: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 133 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: mul133: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: th.addsl a1, a0, a0, 2 +; RV64XTHEADBA-NEXT: slli a0, a0, 7 +; RV64XTHEADBA-NEXT: add a0, a0, a1 +; RV64XTHEADBA-NEXT: ret + %c = mul i64 %a, 133 + ret i64 %c +} + define i64 @mul137(i64 %a) { ; RV64I-LABEL: mul137: ; RV64I: # %bb.0: @@ -482,35 +626,1369 @@ define i64 @mul160(i64 %a) { ret i64 %c } -define i64 @mul200(i64 %a) { -; RV64I-LABEL: mul200: +define i64 
@mul288(i64 %a) { +; RV64I-LABEL: mul288: ; RV64I: # %bb.0: -; RV64I-NEXT: li a1, 200 +; RV64I-NEXT: li a1, 288 ; RV64I-NEXT: mul a0, a0, a1 ; RV64I-NEXT: ret ; -; RV64XTHEADBA-LABEL: mul200: +; RV64XTHEADBA-LABEL: mul288: ; RV64XTHEADBA: # %bb.0: -; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 2 -; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 2 -; RV64XTHEADBA-NEXT: slli a0, a0, 3 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 3 +; RV64XTHEADBA-NEXT: slli a0, a0, 5 ; RV64XTHEADBA-NEXT: ret - %c = mul i64 %a, 200 + %c = mul i64 %a, 288 ret i64 %c } -define i64 @mul288(i64 %a) { -; RV64I-LABEL: mul288: +define i64 @mul258(i64 %a) { +; RV64I-LABEL: mul258: ; RV64I: # %bb.0: -; RV64I-NEXT: li a1, 288 +; RV64I-NEXT: li a1, 258 ; RV64I-NEXT: mul a0, a0, a1 ; RV64I-NEXT: ret ; -; RV64XTHEADBA-LABEL: mul288: +; RV64XTHEADBA-LABEL: mul258: ; RV64XTHEADBA: # %bb.0: -; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 3 -; RV64XTHEADBA-NEXT: slli a0, a0, 5 +; RV64XTHEADBA-NEXT: slli a1, a0, 8 +; RV64XTHEADBA-NEXT: th.addsl a0, a1, a0, 1 ; RV64XTHEADBA-NEXT: ret - %c = mul i64 %a, 288 + %c = mul i64 %a, 258 + ret i64 %c +} + +define i64 @mul260(i64 %a) { +; RV64I-LABEL: mul260: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 260 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: mul260: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: slli a1, a0, 8 +; RV64XTHEADBA-NEXT: th.addsl a0, a1, a0, 2 +; RV64XTHEADBA-NEXT: ret + %c = mul i64 %a, 260 + ret i64 %c +} + +define i64 @mul264(i64 %a) { +; RV64I-LABEL: mul264: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 264 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: mul264: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: slli a1, a0, 8 +; RV64XTHEADBA-NEXT: th.addsl a0, a1, a0, 3 +; RV64XTHEADBA-NEXT: ret + %c = mul i64 %a, 264 ret i64 %c } +define i64 @mul11(i64 %a) { +; RV64I-LABEL: mul11: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 11 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: mul11: +; 
RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: th.addsl a1, a0, a0, 2 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 1 +; RV64XTHEADBA-NEXT: ret + %c = mul i64 %a, 11 + ret i64 %c +} + +define i64 @mul19(i64 %a) { +; RV64I-LABEL: mul19: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 19 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: mul19: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: th.addsl a1, a0, a0, 3 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 1 +; RV64XTHEADBA-NEXT: ret + %c = mul i64 %a, 19 + ret i64 %c +} + +define i64 @mul13(i64 %a) { +; RV64I-LABEL: mul13: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 13 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: mul13: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: th.addsl a1, a0, a0, 1 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 2 +; RV64XTHEADBA-NEXT: ret + %c = mul i64 %a, 13 + ret i64 %c +} + +define i64 @mul21(i64 %a) { +; RV64I-LABEL: mul21: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 21 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: mul21: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: th.addsl a1, a0, a0, 2 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 2 +; RV64XTHEADBA-NEXT: ret + %c = mul i64 %a, 21 + ret i64 %c +} + +define i64 @mul37(i64 %a) { +; RV64I-LABEL: mul37: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 37 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: mul37: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: th.addsl a1, a0, a0, 3 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 2 +; RV64XTHEADBA-NEXT: ret + %c = mul i64 %a, 37 + ret i64 %c +} + +define i64 @mul25(i64 %a) { +; RV64I-LABEL: mul25: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 25 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: mul25: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 2 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 2 +; RV64XTHEADBA-NEXT: ret + %c = mul i64 %a, 25 + ret i64 %c +} + +define i64 
@mul41(i64 %a) { +; RV64I-LABEL: mul41: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 41 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: mul41: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: th.addsl a1, a0, a0, 2 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 3 +; RV64XTHEADBA-NEXT: ret + %c = mul i64 %a, 41 + ret i64 %c +} + +define i64 @mul73(i64 %a) { +; RV64I-LABEL: mul73: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 73 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: mul73: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: th.addsl a1, a0, a0, 3 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 3 +; RV64XTHEADBA-NEXT: ret + %c = mul i64 %a, 73 + ret i64 %c +} + +define i64 @mul27(i64 %a) { +; RV64I-LABEL: mul27: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 27 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: mul27: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 1 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 3 +; RV64XTHEADBA-NEXT: ret + %c = mul i64 %a, 27 + ret i64 %c +} + +define i64 @mul45(i64 %a) { +; RV64I-LABEL: mul45: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 45 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: mul45: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 2 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 3 +; RV64XTHEADBA-NEXT: ret + %c = mul i64 %a, 45 + ret i64 %c +} + +define i64 @mul81(i64 %a) { +; RV64I-LABEL: mul81: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 81 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: mul81: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 3 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 3 +; RV64XTHEADBA-NEXT: ret + %c = mul i64 %a, 81 + ret i64 %c +} + +define i64 @mul4098(i64 %a) { +; RV64I-LABEL: mul4098: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 1 +; RV64I-NEXT: slli a0, a0, 12 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: 
mul4098: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: slli a1, a0, 12 +; RV64XTHEADBA-NEXT: th.addsl a0, a1, a0, 1 +; RV64XTHEADBA-NEXT: ret + %c = mul i64 %a, 4098 + ret i64 %c +} + +define i64 @mul4100(i64 %a) { +; RV64I-LABEL: mul4100: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 2 +; RV64I-NEXT: slli a0, a0, 12 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: mul4100: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: slli a1, a0, 12 +; RV64XTHEADBA-NEXT: th.addsl a0, a1, a0, 2 +; RV64XTHEADBA-NEXT: ret + %c = mul i64 %a, 4100 + ret i64 %c +} + +define i64 @mul4104(i64 %a) { +; RV64I-LABEL: mul4104: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 3 +; RV64I-NEXT: slli a0, a0, 12 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: mul4104: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: slli a1, a0, 12 +; RV64XTHEADBA-NEXT: th.addsl a0, a1, a0, 3 +; RV64XTHEADBA-NEXT: ret + %c = mul i64 %a, 4104 + ret i64 %c +} + +define signext i32 @mulw192(i32 signext %a) { +; RV64I-LABEL: mulw192: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 6 +; RV64I-NEXT: slli a0, a0, 8 +; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: mulw192: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 1 +; RV64XTHEADBA-NEXT: slliw a0, a0, 6 +; RV64XTHEADBA-NEXT: ret + %c = mul i32 %a, 192 + ret i32 %c +} + +define signext i32 @mulw320(i32 signext %a) { +; RV64I-LABEL: mulw320: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 320 +; RV64I-NEXT: mulw a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: mulw320: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 2 +; RV64XTHEADBA-NEXT: slliw a0, a0, 6 +; RV64XTHEADBA-NEXT: ret + %c = mul i32 %a, 320 + ret i32 %c +} + +define signext i32 @mulw576(i32 signext %a) { +; RV64I-LABEL: mulw576: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 576 +; RV64I-NEXT: mulw a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: mulw576: +; RV64XTHEADBA: # %bb.0: 
+; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 3 +; RV64XTHEADBA-NEXT: slliw a0, a0, 6 +; RV64XTHEADBA-NEXT: ret + %c = mul i32 %a, 576 + ret i32 %c +} + +define i64 @add4104(i64 %a) { +; CHECK-LABEL: add4104: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a1, 1 +; CHECK-NEXT: addiw a1, a1, 8 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: ret + %c = add i64 %a, 4104 + ret i64 %c +} + +define i64 @add4104_2(i64 %a) { +; CHECK-LABEL: add4104_2: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a1, 1 +; CHECK-NEXT: addiw a1, a1, 8 +; CHECK-NEXT: or a0, a0, a1 +; CHECK-NEXT: ret + %c = or disjoint i64 %a, 4104 + ret i64 %c +} + +define i64 @add8208(i64 %a) { +; CHECK-LABEL: add8208: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a1, 2 +; CHECK-NEXT: addiw a1, a1, 16 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: ret + %c = add i64 %a, 8208 + ret i64 %c +} + +; Make sure we prefer LUI for the 8192 instead of using sh3add. +define signext i32 @add8192_i32(i32 signext %a) { +; CHECK-LABEL: add8192_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a1, 2 +; CHECK-NEXT: addw a0, a0, a1 +; CHECK-NEXT: ret + %c = add i32 %a, 8192 + ret i32 %c +} + +; Make sure we prefer LUI for the 8192 instead of using sh3add. 
+define i64 @add8192(i64 %a) { +; CHECK-LABEL: add8192: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a1, 2 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: ret + %c = add i64 %a, 8192 + ret i64 %c +} + +define signext i32 @addshl32_5_6(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: addshl32_5_6: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: slli a1, a1, 6 +; CHECK-NEXT: addw a0, a0, a1 +; CHECK-NEXT: ret + %c = shl i32 %a, 5 + %d = shl i32 %b, 6 + %e = add i32 %c, %d + ret i32 %e +} + +define i64 @addshl64_5_6(i64 %a, i64 %b) { +; CHECK-LABEL: addshl64_5_6: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: slli a1, a1, 6 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: ret + %c = shl i64 %a, 5 + %d = shl i64 %b, 6 + %e = add i64 %c, %d + ret i64 %e +} + +define signext i32 @addshl32_5_7(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: addshl32_5_7: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: slli a1, a1, 7 +; CHECK-NEXT: addw a0, a0, a1 +; CHECK-NEXT: ret + %c = shl i32 %a, 5 + %d = shl i32 %b, 7 + %e = add i32 %c, %d + ret i32 %e +} + +define i64 @addshl64_5_7(i64 %a, i64 %b) { +; CHECK-LABEL: addshl64_5_7: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: slli a1, a1, 7 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: ret + %c = shl i64 %a, 5 + %d = shl i64 %b, 7 + %e = add i64 %c, %d + ret i64 %e +} + +define signext i32 @addshl32_5_8(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: addshl32_5_8: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: slli a1, a1, 8 +; CHECK-NEXT: addw a0, a0, a1 +; CHECK-NEXT: ret + %c = shl i32 %a, 5 + %d = shl i32 %b, 8 + %e = add i32 %c, %d + ret i32 %e +} + +define i64 @addshl64_5_8(i64 %a, i64 %b) { +; CHECK-LABEL: addshl64_5_8: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: slli a1, a1, 8 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: ret + %c = shl i64 %a, 5 + %d = shl i64 %b, 8 + %e = add i64 %c, %d + ret i64 %e +} + +define i64 @sh6_sh3_add1(i64 
noundef %x, i64 noundef %y, i64 noundef %z) { +; RV64I-LABEL: sh6_sh3_add1: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: slli a2, a2, 3 +; RV64I-NEXT: slli a1, a1, 6 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: sh6_sh3_add1: +; RV64XTHEADBA: # %bb.0: # %entry +; RV64XTHEADBA-NEXT: slli a1, a1, 6 +; RV64XTHEADBA-NEXT: th.addsl a1, a1, a2, 3 +; RV64XTHEADBA-NEXT: add a0, a1, a0 +; RV64XTHEADBA-NEXT: ret +entry: + %shl = shl i64 %z, 3 + %shl1 = shl i64 %y, 6 + %add = add nsw i64 %shl1, %shl + %add2 = add nsw i64 %add, %x + ret i64 %add2 +} + +define i64 @sh6_sh3_add2(i64 noundef %x, i64 noundef %y, i64 noundef %z) { +; RV64I-LABEL: sh6_sh3_add2: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: slli a2, a2, 3 +; RV64I-NEXT: slli a1, a1, 6 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: sh6_sh3_add2: +; RV64XTHEADBA: # %bb.0: # %entry +; RV64XTHEADBA-NEXT: slli a1, a1, 6 +; RV64XTHEADBA-NEXT: add a0, a1, a0 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a2, 3 +; RV64XTHEADBA-NEXT: ret +entry: + %shl = shl i64 %z, 3 + %shl1 = shl i64 %y, 6 + %add = add nsw i64 %shl1, %x + %add2 = add nsw i64 %add, %shl + ret i64 %add2 +} + +define i64 @sh6_sh3_add3(i64 noundef %x, i64 noundef %y, i64 noundef %z) { +; RV64I-LABEL: sh6_sh3_add3: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: slli a2, a2, 3 +; RV64I-NEXT: slli a1, a1, 6 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: sh6_sh3_add3: +; RV64XTHEADBA: # %bb.0: # %entry +; RV64XTHEADBA-NEXT: slli a1, a1, 6 +; RV64XTHEADBA-NEXT: th.addsl a1, a1, a2, 3 +; RV64XTHEADBA-NEXT: add a0, a0, a1 +; RV64XTHEADBA-NEXT: ret +entry: + %shl = shl i64 %z, 3 + %shl1 = shl i64 %y, 6 + %add = add nsw i64 %shl1, %shl + %add2 = add nsw i64 %x, %add + ret i64 %add2 +} + +define i64 @sh6_sh3_add4(i64 noundef %x, i64 noundef %y, i64 noundef %z) { +; RV64I-LABEL: sh6_sh3_add4: +; RV64I: # %bb.0: # 
%entry +; RV64I-NEXT: slli a2, a2, 3 +; RV64I-NEXT: slli a1, a1, 6 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: sh6_sh3_add4: +; RV64XTHEADBA: # %bb.0: # %entry +; RV64XTHEADBA-NEXT: slli a1, a1, 6 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a2, 3 +; RV64XTHEADBA-NEXT: add a0, a0, a1 +; RV64XTHEADBA-NEXT: ret +entry: + %shl = shl i64 %z, 3 + %shl1 = shl i64 %y, 6 + %add = add nsw i64 %x, %shl + %add2 = add nsw i64 %add, %shl1 + ret i64 %add2 +} + +define signext i16 @srliw_1_sh1add(ptr %0, i32 signext %1) { +; CHECK-LABEL: srliw_1_sh1add: +; CHECK: # %bb.0: +; CHECK-NEXT: srliw a1, a1, 1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: lh a0, 0(a0) +; CHECK-NEXT: ret + %3 = lshr i32 %1, 1 + %4 = zext i32 %3 to i64 + %5 = getelementptr inbounds i16, ptr %0, i64 %4 + %6 = load i16, ptr %5, align 2 + ret i16 %6 +} + +define signext i32 @srliw_2_sh2add(ptr %0, i32 signext %1) { +; CHECK-LABEL: srliw_2_sh2add: +; CHECK: # %bb.0: +; CHECK-NEXT: srliw a1, a1, 2 +; CHECK-NEXT: slli a1, a1, 2 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: lw a0, 0(a0) +; CHECK-NEXT: ret + %3 = lshr i32 %1, 2 + %4 = zext i32 %3 to i64 + %5 = getelementptr inbounds i32, ptr %0, i64 %4 + %6 = load i32, ptr %5, align 4 + ret i32 %6 +} + +define i64 @srliw_3_sh3add(ptr %0, i32 signext %1) { +; CHECK-LABEL: srliw_3_sh3add: +; CHECK: # %bb.0: +; CHECK-NEXT: srliw a1, a1, 3 +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: ld a0, 0(a0) +; CHECK-NEXT: ret + %3 = lshr i32 %1, 3 + %4 = zext i32 %3 to i64 + %5 = getelementptr inbounds i64, ptr %0, i64 %4 + %6 = load i64, ptr %5, align 8 + ret i64 %6 +} + +define signext i32 @srliw_1_sh2add(ptr %0, i32 signext %1) { +; RV64I-LABEL: srliw_1_sh2add: +; RV64I: # %bb.0: +; RV64I-NEXT: srliw a1, a1, 1 +; RV64I-NEXT: slli a1, a1, 2 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: lw a0, 0(a0) +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: srliw_1_sh2add: +; 
RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: srliw a1, a1, 1 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 2 +; RV64XTHEADBA-NEXT: lw a0, 0(a0) +; RV64XTHEADBA-NEXT: ret + %3 = lshr i32 %1, 1 + %4 = zext i32 %3 to i64 + %5 = getelementptr inbounds i32, ptr %0, i64 %4 + %6 = load i32, ptr %5, align 4 + ret i32 %6 +} + +define i64 @srliw_1_sh3add(ptr %0, i32 signext %1) { +; RV64I-LABEL: srliw_1_sh3add: +; RV64I: # %bb.0: +; RV64I-NEXT: srliw a1, a1, 1 +; RV64I-NEXT: slli a1, a1, 3 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ld a0, 0(a0) +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: srliw_1_sh3add: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: srliw a1, a1, 1 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 3 +; RV64XTHEADBA-NEXT: ld a0, 0(a0) +; RV64XTHEADBA-NEXT: ret + %3 = lshr i32 %1, 1 + %4 = zext i32 %3 to i64 + %5 = getelementptr inbounds i64, ptr %0, i64 %4 + %6 = load i64, ptr %5, align 8 + ret i64 %6 +} + +define i64 @srliw_2_sh3add(ptr %0, i32 signext %1) { +; RV64I-LABEL: srliw_2_sh3add: +; RV64I: # %bb.0: +; RV64I-NEXT: srliw a1, a1, 2 +; RV64I-NEXT: slli a1, a1, 3 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ld a0, 0(a0) +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: srliw_2_sh3add: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: srliw a1, a1, 2 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 3 +; RV64XTHEADBA-NEXT: ld a0, 0(a0) +; RV64XTHEADBA-NEXT: ret + %3 = lshr i32 %1, 2 + %4 = zext i32 %3 to i64 + %5 = getelementptr inbounds i64, ptr %0, i64 %4 + %6 = load i64, ptr %5, align 8 + ret i64 %6 +} + +define signext i16 @srliw_2_sh1add(ptr %0, i32 signext %1) { +; RV64I-LABEL: srliw_2_sh1add: +; RV64I: # %bb.0: +; RV64I-NEXT: srliw a1, a1, 2 +; RV64I-NEXT: slli a1, a1, 1 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: lh a0, 0(a0) +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: srliw_2_sh1add: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: srliw a1, a1, 2 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 1 +; RV64XTHEADBA-NEXT: lh a0, 0(a0) +; RV64XTHEADBA-NEXT: ret + %3 = lshr 
i32 %1, 2 + %4 = zext i32 %3 to i64 + %5 = getelementptr inbounds i16, ptr %0, i64 %4 + %6 = load i16, ptr %5, align 2 + ret i16 %6 +} + + +define signext i32 @srliw_3_sh2add(ptr %0, i32 signext %1) { +; RV64I-LABEL: srliw_3_sh2add: +; RV64I: # %bb.0: +; RV64I-NEXT: srliw a1, a1, 3 +; RV64I-NEXT: slli a1, a1, 2 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: lw a0, 0(a0) +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: srliw_3_sh2add: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: srliw a1, a1, 3 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 2 +; RV64XTHEADBA-NEXT: lw a0, 0(a0) +; RV64XTHEADBA-NEXT: ret + %3 = lshr i32 %1, 3 + %4 = zext i32 %3 to i64 + %5 = getelementptr inbounds i32, ptr %0, i64 %4 + %6 = load i32, ptr %5, align 4 + ret i32 %6 +} + +define i64 @srliw_4_sh3add(ptr %0, i32 signext %1) { +; RV64I-LABEL: srliw_4_sh3add: +; RV64I: # %bb.0: +; RV64I-NEXT: srliw a1, a1, 4 +; RV64I-NEXT: slli a1, a1, 3 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ld a0, 0(a0) +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: srliw_4_sh3add: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: srliw a1, a1, 4 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 3 +; RV64XTHEADBA-NEXT: ld a0, 0(a0) +; RV64XTHEADBA-NEXT: ret + %3 = lshr i32 %1, 4 + %4 = zext i32 %3 to i64 + %5 = getelementptr inbounds i64, ptr %0, i64 %4 + %6 = load i64, ptr %5, align 8 + ret i64 %6 +} + +define signext i32 @srli_1_sh2add(ptr %0, i64 %1) { +; RV64I-LABEL: srli_1_sh2add: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a1, 1 +; RV64I-NEXT: andi a1, a1, -4 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: lw a0, 0(a0) +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: srli_1_sh2add: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: srli a1, a1, 1 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 2 +; RV64XTHEADBA-NEXT: lw a0, 0(a0) +; RV64XTHEADBA-NEXT: ret + %3 = lshr i64 %1, 1 + %4 = getelementptr inbounds i32, ptr %0, i64 %3 + %5 = load i32, ptr %4, align 4 + ret i32 %5 +} + +define i64 @srli_2_sh3add(ptr %0, i64 %1) { +; RV64I-LABEL: 
srli_2_sh3add: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a1, 1 +; RV64I-NEXT: andi a1, a1, -8 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ld a0, 0(a0) +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: srli_2_sh3add: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: srli a1, a1, 2 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 3 +; RV64XTHEADBA-NEXT: ld a0, 0(a0) +; RV64XTHEADBA-NEXT: ret + %3 = lshr i64 %1, 2 + %4 = getelementptr inbounds i64, ptr %0, i64 %3 + %5 = load i64, ptr %4, align 8 + ret i64 %5 +} + +define signext i16 @srli_2_sh1add(ptr %0, i64 %1) { +; RV64I-LABEL: srli_2_sh1add: +; RV64I: # %bb.0: +; RV64I-NEXT: srli a1, a1, 1 +; RV64I-NEXT: andi a1, a1, -2 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: lh a0, 0(a0) +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: srli_2_sh1add: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: srli a1, a1, 2 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 1 +; RV64XTHEADBA-NEXT: lh a0, 0(a0) +; RV64XTHEADBA-NEXT: ret + %3 = lshr i64 %1, 2 + %4 = getelementptr inbounds i16, ptr %0, i64 %3 + %5 = load i16, ptr %4, align 2 + ret i16 %5 +} + +define signext i32 @srli_3_sh2add(ptr %0, i64 %1) { +; RV64I-LABEL: srli_3_sh2add: +; RV64I: # %bb.0: +; RV64I-NEXT: srli a1, a1, 1 +; RV64I-NEXT: andi a1, a1, -4 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: lw a0, 0(a0) +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: srli_3_sh2add: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: srli a1, a1, 3 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 2 +; RV64XTHEADBA-NEXT: lw a0, 0(a0) +; RV64XTHEADBA-NEXT: ret + %3 = lshr i64 %1, 3 + %4 = getelementptr inbounds i32, ptr %0, i64 %3 + %5 = load i32, ptr %4, align 4 + ret i32 %5 +} + +define i64 @srli_4_sh3add(ptr %0, i64 %1) { +; RV64I-LABEL: srli_4_sh3add: +; RV64I: # %bb.0: +; RV64I-NEXT: srli a1, a1, 1 +; RV64I-NEXT: andi a1, a1, -8 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ld a0, 0(a0) +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: srli_4_sh3add: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: srli a1, a1, 4 +; 
RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 3 +; RV64XTHEADBA-NEXT: ld a0, 0(a0) +; RV64XTHEADBA-NEXT: ret + %3 = lshr i64 %1, 4 + %4 = getelementptr inbounds i64, ptr %0, i64 %3 + %5 = load i64, ptr %4, align 8 + ret i64 %5 +} + +define i8 @array_index_sh1_sh0(ptr %p, i64 %idx1, i64 %idx2) { +; RV64I-LABEL: array_index_sh1_sh0: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a1, 1 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: lbu a0, 0(a0) +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: array_index_sh1_sh0: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 1 +; RV64XTHEADBA-NEXT: add a0, a0, a2 +; RV64XTHEADBA-NEXT: lbu a0, 0(a0) +; RV64XTHEADBA-NEXT: ret + %a = getelementptr inbounds [2 x i8], ptr %p, i64 %idx1, i64 %idx2 + %b = load i8, ptr %a, align 1 + ret i8 %b +} + +define i16 @array_index_sh1_sh1(ptr %p, i64 %idx1, i64 %idx2) { +; RV64I-LABEL: array_index_sh1_sh1: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a1, 2 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a2, a2, 1 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: lh a0, 0(a0) +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: array_index_sh1_sh1: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 2 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a2, 1 +; RV64XTHEADBA-NEXT: lh a0, 0(a0) +; RV64XTHEADBA-NEXT: ret + %a = getelementptr inbounds [2 x i16], ptr %p, i64 %idx1, i64 %idx2 + %b = load i16, ptr %a, align 2 + ret i16 %b +} + +define i32 @array_index_sh1_sh2(ptr %p, i64 %idx1, i64 %idx2) { +; RV64I-LABEL: array_index_sh1_sh2: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a1, 3 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a2, a2, 2 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: lw a0, 0(a0) +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: array_index_sh1_sh2: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 3 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a2, 2 +; RV64XTHEADBA-NEXT: lw a0, 0(a0) +; RV64XTHEADBA-NEXT: ret + %a = getelementptr inbounds [2 x 
i32], ptr %p, i64 %idx1, i64 %idx2 + %b = load i32, ptr %a, align 4 + ret i32 %b +} + +define i64 @array_index_sh1_sh3(ptr %p, i64 %idx1, i64 %idx2) { +; RV64I-LABEL: array_index_sh1_sh3: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a1, 4 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a2, a2, 3 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: ld a0, 0(a0) +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: array_index_sh1_sh3: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: slli a1, a1, 4 +; RV64XTHEADBA-NEXT: add a0, a0, a1 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a2, 3 +; RV64XTHEADBA-NEXT: ld a0, 0(a0) +; RV64XTHEADBA-NEXT: ret + %a = getelementptr inbounds [2 x i64], ptr %p, i64 %idx1, i64 %idx2 + %b = load i64, ptr %a, align 8 + ret i64 %b +} + +define i8 @array_index_sh2_sh0(ptr %p, i64 %idx1, i64 %idx2) { +; RV64I-LABEL: array_index_sh2_sh0: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a1, 2 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: lbu a0, 0(a0) +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: array_index_sh2_sh0: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 2 +; RV64XTHEADBA-NEXT: add a0, a0, a2 +; RV64XTHEADBA-NEXT: lbu a0, 0(a0) +; RV64XTHEADBA-NEXT: ret + %a = getelementptr inbounds [4 x i8], ptr %p, i64 %idx1, i64 %idx2 + %b = load i8, ptr %a, align 1 + ret i8 %b +} + +define i16 @array_index_sh2_sh1(ptr %p, i64 %idx1, i64 %idx2) { +; RV64I-LABEL: array_index_sh2_sh1: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a1, 3 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a2, a2, 1 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: lh a0, 0(a0) +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: array_index_sh2_sh1: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 3 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a2, 1 +; RV64XTHEADBA-NEXT: lh a0, 0(a0) +; RV64XTHEADBA-NEXT: ret + %a = getelementptr inbounds [4 x i16], ptr %p, i64 %idx1, i64 %idx2 + %b = load i16, ptr %a, align 2 + ret i16 %b +} + +define i32 
@array_index_sh2_sh2(ptr %p, i64 %idx1, i64 %idx2) { +; RV64I-LABEL: array_index_sh2_sh2: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a1, 4 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a2, a2, 2 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: lw a0, 0(a0) +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: array_index_sh2_sh2: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: slli a1, a1, 4 +; RV64XTHEADBA-NEXT: add a0, a0, a1 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a2, 2 +; RV64XTHEADBA-NEXT: lw a0, 0(a0) +; RV64XTHEADBA-NEXT: ret + %a = getelementptr inbounds [4 x i32], ptr %p, i64 %idx1, i64 %idx2 + %b = load i32, ptr %a, align 4 + ret i32 %b +} + +define i64 @array_index_sh2_sh3(ptr %p, i64 %idx1, i64 %idx2) { +; RV64I-LABEL: array_index_sh2_sh3: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a1, 5 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a2, a2, 3 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: ld a0, 0(a0) +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: array_index_sh2_sh3: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: slli a1, a1, 5 +; RV64XTHEADBA-NEXT: add a0, a0, a1 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a2, 3 +; RV64XTHEADBA-NEXT: ld a0, 0(a0) +; RV64XTHEADBA-NEXT: ret + %a = getelementptr inbounds [4 x i64], ptr %p, i64 %idx1, i64 %idx2 + %b = load i64, ptr %a, align 8 + ret i64 %b +} + +define i8 @array_index_sh3_sh0(ptr %p, i64 %idx1, i64 %idx2) { +; RV64I-LABEL: array_index_sh3_sh0: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a1, 3 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: lbu a0, 0(a0) +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: array_index_sh3_sh0: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 3 +; RV64XTHEADBA-NEXT: add a0, a0, a2 +; RV64XTHEADBA-NEXT: lbu a0, 0(a0) +; RV64XTHEADBA-NEXT: ret + %a = getelementptr inbounds [8 x i8], ptr %p, i64 %idx1, i64 %idx2 + %b = load i8, ptr %a, align 1 + ret i8 %b +} + +define i16 @array_index_sh3_sh1(ptr %p, i64 %idx1, i64 %idx2) { +; RV64I-LABEL: 
array_index_sh3_sh1: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a1, 4 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a2, a2, 1 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: lh a0, 0(a0) +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: array_index_sh3_sh1: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: slli a1, a1, 4 +; RV64XTHEADBA-NEXT: add a0, a0, a1 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a2, 1 +; RV64XTHEADBA-NEXT: lh a0, 0(a0) +; RV64XTHEADBA-NEXT: ret + %a = getelementptr inbounds [8 x i16], ptr %p, i64 %idx1, i64 %idx2 + %b = load i16, ptr %a, align 2 + ret i16 %b +} + +define i32 @array_index_sh3_sh2(ptr %p, i64 %idx1, i64 %idx2) { +; RV64I-LABEL: array_index_sh3_sh2: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a1, 5 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a2, a2, 2 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: lw a0, 0(a0) +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: array_index_sh3_sh2: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: slli a1, a1, 5 +; RV64XTHEADBA-NEXT: add a0, a0, a1 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a2, 2 +; RV64XTHEADBA-NEXT: lw a0, 0(a0) +; RV64XTHEADBA-NEXT: ret + %a = getelementptr inbounds [8 x i32], ptr %p, i64 %idx1, i64 %idx2 + %b = load i32, ptr %a, align 4 + ret i32 %b +} + +define i64 @array_index_sh3_sh3(ptr %p, i64 %idx1, i64 %idx2) { +; RV64I-LABEL: array_index_sh3_sh3: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a1, 6 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a2, a2, 3 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: ld a0, 0(a0) +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: array_index_sh3_sh3: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: slli a1, a1, 6 +; RV64XTHEADBA-NEXT: add a0, a0, a1 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a2, 3 +; RV64XTHEADBA-NEXT: ld a0, 0(a0) +; RV64XTHEADBA-NEXT: ret + %a = getelementptr inbounds [8 x i64], ptr %p, i64 %idx1, i64 %idx2 + %b = load i64, ptr %a, align 8 + ret i64 %b +} + +; Similar to above, but with a lshr on one of the indices. 
This requires +; special handling during isel to form a shift pair. +define i64 @array_index_lshr_sh3_sh3(ptr %p, i64 %idx1, i64 %idx2) { +; RV64I-LABEL: array_index_lshr_sh3_sh3: +; RV64I: # %bb.0: +; RV64I-NEXT: srli a1, a1, 58 +; RV64I-NEXT: slli a2, a2, 3 +; RV64I-NEXT: slli a1, a1, 6 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ld a0, 0(a0) +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: array_index_lshr_sh3_sh3: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: srli a1, a1, 58 +; RV64XTHEADBA-NEXT: slli a1, a1, 6 +; RV64XTHEADBA-NEXT: add a0, a0, a1 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a2, 3 +; RV64XTHEADBA-NEXT: ld a0, 0(a0) +; RV64XTHEADBA-NEXT: ret + %shr = lshr i64 %idx1, 58 + %a = getelementptr inbounds [8 x i64], ptr %p, i64 %shr, i64 %idx2 + %b = load i64, ptr %a, align 8 + ret i64 %b +} + +define i8 @array_index_sh4_sh0(ptr %p, i64 %idx1, i64 %idx2) { +; CHECK-LABEL: array_index_sh4_sh0: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: lbu a0, 0(a0) +; CHECK-NEXT: ret + %a = getelementptr inbounds [16 x i8], ptr %p, i64 %idx1, i64 %idx2 + %b = load i8, ptr %a, align 1 + ret i8 %b +} + +define i16 @array_index_sh4_sh1(ptr %p, i64 %idx1, i64 %idx2) { +; RV64I-LABEL: array_index_sh4_sh1: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a1, 5 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a2, a2, 1 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: lh a0, 0(a0) +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: array_index_sh4_sh1: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: slli a1, a1, 5 +; RV64XTHEADBA-NEXT: add a0, a0, a1 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a2, 1 +; RV64XTHEADBA-NEXT: lh a0, 0(a0) +; RV64XTHEADBA-NEXT: ret + %a = getelementptr inbounds [16 x i16], ptr %p, i64 %idx1, i64 %idx2 + %b = load i16, ptr %a, align 2 + ret i16 %b +} + +define i32 @array_index_sh4_sh2(ptr %p, i64 %idx1, i64 %idx2) { +; RV64I-LABEL: array_index_sh4_sh2: +; RV64I: # 
%bb.0: +; RV64I-NEXT: slli a1, a1, 6 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a2, a2, 2 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: lw a0, 0(a0) +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: array_index_sh4_sh2: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: slli a1, a1, 6 +; RV64XTHEADBA-NEXT: add a0, a0, a1 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a2, 2 +; RV64XTHEADBA-NEXT: lw a0, 0(a0) +; RV64XTHEADBA-NEXT: ret + %a = getelementptr inbounds [16 x i32], ptr %p, i64 %idx1, i64 %idx2 + %b = load i32, ptr %a, align 4 + ret i32 %b +} + +define i64 @array_index_sh4_sh3(ptr %p, i64 %idx1, i64 %idx2) { +; RV64I-LABEL: array_index_sh4_sh3: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a1, 7 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a2, a2, 3 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: ld a0, 0(a0) +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: array_index_sh4_sh3: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: slli a1, a1, 7 +; RV64XTHEADBA-NEXT: add a0, a0, a1 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a2, 3 +; RV64XTHEADBA-NEXT: ld a0, 0(a0) +; RV64XTHEADBA-NEXT: ret + %a = getelementptr inbounds [16 x i64], ptr %p, i64 %idx1, i64 %idx2 + %b = load i64, ptr %a, align 8 + ret i64 %b +} + +define i64 @mul_neg1(i64 %a) { +; CHECK-LABEL: mul_neg1: +; CHECK: # %bb.0: +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %c = mul i64 %a, -1 + ret i64 %c +} + +define i64 @mul_neg2(i64 %a) { +; CHECK-LABEL: mul_neg2: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %c = mul i64 %a, -2 + ret i64 %c +} + +define i64 @mul_neg3(i64 %a) { +; RV64I-LABEL: mul_neg3: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 1 +; RV64I-NEXT: neg a0, a0 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: mul_neg3: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 1 +; RV64XTHEADBA-NEXT: neg a0, a0 +; RV64XTHEADBA-NEXT: ret + %c = mul i64 %a, -3 + ret i64 %c +} + +define i64 @mul_neg4(i64 %a) { +; 
CHECK-LABEL: mul_neg4: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %c = mul i64 %a, -4 + ret i64 %c +} + +define i64 @mul_neg5(i64 %a) { +; RV64I-LABEL: mul_neg5: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 2 +; RV64I-NEXT: neg a0, a0 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: mul_neg5: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a0, 2 +; RV64XTHEADBA-NEXT: neg a0, a0 +; RV64XTHEADBA-NEXT: ret + %c = mul i64 %a, -5 + ret i64 %c +} + +define i64 @mul_neg6(i64 %a) { +; CHECK-LABEL: mul_neg6: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -6 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i64 %a, -6 + ret i64 %c +} + +define i64 @mul_neg7(i64 %a) { +; CHECK-LABEL: mul_neg7: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a1, a0, 3 +; CHECK-NEXT: sub a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i64 %a, -7 + ret i64 %c +} + +define i64 @mul_neg8(i64 %a) { +; CHECK-LABEL: mul_neg8: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %c = mul i64 %a, -8 + ret i64 %c +} + +define ptr @srai_srli_sh3add(ptr %0, i64 %1) nounwind { +; RV64I-LABEL: srai_srli_sh3add: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: srai a1, a1, 32 +; RV64I-NEXT: srli a1, a1, 6 +; RV64I-NEXT: slli a1, a1, 3 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: srai_srli_sh3add: +; RV64XTHEADBA: # %bb.0: # %entry +; RV64XTHEADBA-NEXT: srai a1, a1, 32 +; RV64XTHEADBA-NEXT: srli a1, a1, 6 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 3 +; RV64XTHEADBA-NEXT: ret +entry: + %2 = ashr i64 %1, 32 + %3 = lshr i64 %2, 6 + %4 = getelementptr i64, ptr %0, i64 %3 + ret ptr %4 +} + +define ptr @srai_srli_slli(ptr %0, i64 %1) nounwind { +; CHECK-LABEL: srai_srli_slli: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: srai a1, a1, 32 +; CHECK-NEXT: srli a1, a1, 6 +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: ret +entry: + %2 = ashr i64 
%1, 32 + %3 = lshr i64 %2, 6 + %4 = getelementptr i128, ptr %0, i64 %3 + ret ptr %4 +} + +; Negative to make sure the peephole added for srai_srli_slli and +; srai_srli_sh3add doesn't break this. +define i64 @srai_andi(i64 %x) nounwind { +; CHECK-LABEL: srai_andi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: srai a0, a0, 8 +; CHECK-NEXT: andi a0, a0, -8 +; CHECK-NEXT: ret +entry: + %y = ashr i64 %x, 8 + %z = and i64 %y, -8 + ret i64 %z +} + +; Negative to make sure the peephole added for srai_srli_slli and +; srai_srli_sh3add doesn't break this. +define i64 @srai_lui_and(i64 %x) nounwind { +; CHECK-LABEL: srai_lui_and: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: srai a0, a0, 8 +; CHECK-NEXT: lui a1, 1048574 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: ret +entry: + %y = ashr i64 %x, 8 + %z = and i64 %y, -8192 + ret i64 %z +} From 814902a03a2bb4114fd61c05e0e599fc98d61dbb Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 26 Dec 2024 21:44:47 -0800 Subject: [PATCH 104/567] [RISCV] Fix XTheadba patterns broken since cfc574a6cd13d2d0b77110b579c5cfcec744129f. Adding an OperandTransform to CSImm12MulBy4 and CSImm12MulBy8 for Zba broke these patterns. They should have been changed in the same, but we lacked sufficient testing. 
--- llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td | 4 +-- llvm/test/CodeGen/RISCV/rv32xtheadba.ll | 36 ++++++++++++------- llvm/test/CodeGen/RISCV/rv64xtheadba.ll | 36 ++++++++++++------- 3 files changed, 50 insertions(+), 26 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td index 99186ec7360e7..37b29eda2dc10 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td @@ -550,9 +550,9 @@ def : Pat<(add_non_imm12 sh3add_op:$rs1, (XLenVT GPR:$rs2)), (TH_ADDSL GPR:$rs2, sh3add_op:$rs1, 3)>; def : Pat<(add (XLenVT GPR:$r), CSImm12MulBy4:$i), - (TH_ADDSL GPR:$r, (XLenVT (ADDI (XLenVT X0), (SimmShiftRightBy2XForm CSImm12MulBy4:$i))), 2)>; + (TH_ADDSL GPR:$r, (XLenVT (ADDI (XLenVT X0), CSImm12MulBy4:$i)), 2)>; def : Pat<(add (XLenVT GPR:$r), CSImm12MulBy8:$i), - (TH_ADDSL GPR:$r, (XLenVT (ADDI (XLenVT X0), (SimmShiftRightBy3XForm CSImm12MulBy8:$i))), 3)>; + (TH_ADDSL GPR:$r, (XLenVT (ADDI (XLenVT X0), CSImm12MulBy8:$i)), 3)>; def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 200)), (SLLI (XLenVT (TH_ADDSL (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 2)), diff --git a/llvm/test/CodeGen/RISCV/rv32xtheadba.ll b/llvm/test/CodeGen/RISCV/rv32xtheadba.ll index effbcc0e08f3e..44ab0e1fef6c1 100644 --- a/llvm/test/CodeGen/RISCV/rv32xtheadba.ll +++ b/llvm/test/CodeGen/RISCV/rv32xtheadba.ll @@ -601,23 +601,35 @@ define i32 @mul4104(i32 %a) { } define i32 @add4104(i32 %a) { -; CHECK-LABEL: add4104: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, 1 -; CHECK-NEXT: addi a1, a1, 8 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: ret +; RV32I-LABEL: add4104: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a1, 1 +; RV32I-NEXT: addi a1, a1, 8 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32XTHEADBA-LABEL: add4104: +; RV32XTHEADBA: # %bb.0: +; RV32XTHEADBA-NEXT: li a1, 1026 +; RV32XTHEADBA-NEXT: th.addsl a0, a0, a1, 2 +; RV32XTHEADBA-NEXT: ret %c = add i32 %a, 4104 ret i32 %c } define i32 
@add8208(i32 %a) { -; CHECK-LABEL: add8208: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, 2 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: ret +; RV32I-LABEL: add8208: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a1, 2 +; RV32I-NEXT: addi a1, a1, 16 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32XTHEADBA-LABEL: add8208: +; RV32XTHEADBA: # %bb.0: +; RV32XTHEADBA-NEXT: li a1, 1026 +; RV32XTHEADBA-NEXT: th.addsl a0, a0, a1, 3 +; RV32XTHEADBA-NEXT: ret %c = add i32 %a, 8208 ret i32 %c } diff --git a/llvm/test/CodeGen/RISCV/rv64xtheadba.ll b/llvm/test/CodeGen/RISCV/rv64xtheadba.ll index 08449de913b98..1da76c1673d6a 100644 --- a/llvm/test/CodeGen/RISCV/rv64xtheadba.ll +++ b/llvm/test/CodeGen/RISCV/rv64xtheadba.ll @@ -966,12 +966,18 @@ define signext i32 @mulw576(i32 signext %a) { } define i64 @add4104(i64 %a) { -; CHECK-LABEL: add4104: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, 1 -; CHECK-NEXT: addiw a1, a1, 8 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: ret +; RV64I-LABEL: add4104: +; RV64I: # %bb.0: +; RV64I-NEXT: lui a1, 1 +; RV64I-NEXT: addiw a1, a1, 8 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: add4104: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: li a1, 1026 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 2 +; RV64XTHEADBA-NEXT: ret %c = add i64 %a, 4104 ret i64 %c } @@ -988,12 +994,18 @@ define i64 @add4104_2(i64 %a) { } define i64 @add8208(i64 %a) { -; CHECK-LABEL: add8208: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, 2 -; CHECK-NEXT: addiw a1, a1, 16 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: ret +; RV64I-LABEL: add8208: +; RV64I: # %bb.0: +; RV64I-NEXT: lui a1, 2 +; RV64I-NEXT: addiw a1, a1, 16 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: add8208: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: li a1, 1026 +; RV64XTHEADBA-NEXT: th.addsl a0, a0, a1, 3 +; RV64XTHEADBA-NEXT: ret %c = add i64 %a, 8208 ret i64 %c } From 5807d0efb963ab591a1ae569b538724299d6acdc Mon Sep 17 00:00:00 
2001 From: Vitaly Buka Date: Thu, 26 Dec 2024 23:16:10 -0800 Subject: [PATCH 105/567] [Drive] Don't match libclang_rt.builtins Fixes fuchsia bots. --- clang/test/Driver/sanitizer-ld.c | 37 +++----------------------------- 1 file changed, 3 insertions(+), 34 deletions(-) diff --git a/clang/test/Driver/sanitizer-ld.c b/clang/test/Driver/sanitizer-ld.c index 6b57fb144f421..5befbb159183e 100644 --- a/clang/test/Driver/sanitizer-ld.c +++ b/clang/test/Driver/sanitizer-ld.c @@ -1,6 +1,8 @@ // Test sanitizers ld flags. -// DEFINE: %{filecheck} = FileCheck %s --implicit-check-not="libclang_rt" +// Match all libclang_rt, excluding platform-inconsistent builtins. + +// DEFINE: %{filecheck} = FileCheck %s --implicit-check-not="libclang_rt.{{([^b]..|.[^u].|..[^i]).*}}" // RUN: %clang -### %s 2>&1 \ // RUN: --target=i386-unknown-linux -fuse-ld=ld -fsanitize=address \ @@ -250,8 +252,6 @@ // CHECK-ASAN-ANDROID-NOT: "-lresolv" // CHECK-ASAN-ANDROID: libclang_rt.asan.so" // CHECK-ASAN-ANDROID: libclang_rt.asan_static.a" -// CHECK-ASAN-ANDROID: libclang_rt.builtins.a -// CHECK-ASAN-ANDROID: libclang_rt.builtins.a // CHECK-ASAN-ANDROID-NOT: "-lpthread" // CHECK-ASAN-ANDROID-NOT: "-lresolv" @@ -272,8 +272,6 @@ // CHECK-ASAN-ANDROID-STATICLIBASAN: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" // CHECK-ASAN-ANDROID-STATICLIBASAN: libclang_rt.asan_static.a" // CHECK-ASAN-ANDROID-STATICLIBASAN: libclang_rt.asan.a" -// CHECK-ASAN-ANDROID-STATICLIBASAN: libclang_rt.builtins.a" -// CHECK-ASAN-ANDROID-STATICLIBASAN: libclang_rt.builtins.a" // CHECK-ASAN-ANDROID-STATICLIBASAN-NOT: "-lpthread" // CHECK-ASAN-ANDROID-STATICLIBASAN-NOT: "-lrt" // CHECK-ASAN-ANDROID-STATICLIBASAN-NOT: "-lresolv" @@ -290,8 +288,6 @@ // CHECK-UBSAN-ANDROID-NOT: "-lpthread" // CHECK-UBSAN-ANDROID-NOT: "-lresolv" // CHECK-UBSAN-ANDROID: libclang_rt.ubsan_standalone.so" -// CHECK-UBSAN-ANDROID: libclang_rt.builtins.a" -// CHECK-UBSAN-ANDROID: libclang_rt.builtins.a" // CHECK-UBSAN-ANDROID-NOT: "-lpthread" // 
CHECK-UBSAN-ANDROID-NOT: "-lresolv" @@ -304,8 +300,6 @@ // // CHECK-UBSAN-ANDROID-STATICLIBASAN: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" // CHECK-UBSAN-ANDROID-STATICLIBASAN: libclang_rt.ubsan_standalone.a" -// CHECK-UBSAN-ANDROID-STATICLIBASAN: libclang_rt.builtins.a" -// CHECK-UBSAN-ANDROID-STATICLIBASAN: libclang_rt.builtins.a" // CHECK-UBSAN-ANDROID-STATICLIBASAN-NOT: "-lpthread" // CHECK-UBSAN-ANDROID-STATICLIBASAN-NOT: "-lrt" // CHECK-UBSAN-ANDROID-STATICLIBASAN-NOT: "-lresolv" @@ -324,8 +318,6 @@ // CHECK-ASAN-ANDROID-X86-NOT: "-lresolv" // CHECK-ASAN-ANDROID-X86: libclang_rt.asan.so" // CHECK-ASAN-ANDROID-X86: libclang_rt.asan_static.a" -// CHECK-ASAN-ANDROID-X86: libclang_rt.builtins.a" -// CHECK-ASAN-ANDROID-X86: libclang_rt.builtins.a" // CHECK-ASAN-ANDROID-X86-NOT: "-lpthread" // CHECK-ASAN-ANDROID-X86-NOT: "-lresolv" // @@ -338,8 +330,6 @@ // CHECK-ASAN-ANDROID-SHARED-LIBASAN-NOT: argument unused during compilation: '-shared-libsan' // CHECK-ASAN-ANDROID-SHARED-LIBASAN: libclang_rt.asan.so" // CHECK-ASAN-ANDROID-SHARED-LIBASAN: libclang_rt.asan_static.a" -// CHECK-ASAN-ANDROID-SHARED-LIBASAN: libclang_rt.builtins.a" -// CHECK-ASAN-ANDROID-SHARED-LIBASAN: libclang_rt.builtins.a" // // RUN: %clang -### %s 2>&1 \ // RUN: --target=arm-linux-androideabi -fuse-ld=ld -fsanitize=address \ @@ -352,8 +342,6 @@ // CHECK-ASAN-ANDROID-SHARED-NOT: "-lc" // CHECK-ASAN-ANDROID-SHARED: libclang_rt.asan.so" // CHECK-ASAN-ANDROID-SHARED: libclang_rt.asan_static.a" -// CHECK-ASAN-ANDROID-SHARED: libclang_rt.builtins.a" -// CHECK-ASAN-ANDROID-SHARED: libclang_rt.builtins.a" // CHECK-ASAN-ANDROID-SHARED-NOT: "-lpthread" // CHECK-ASAN-ANDROID-SHARED-NOT: "-lresolv" @@ -829,8 +817,6 @@ // RUN: --sysroot=%S/Inputs/basic_android_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-CFI-CROSS-DSO-ANDROID // CHECK-CFI-CROSS-DSO-ANDROID: "{{.*}}ld{{(.exe)?}}" -// CHECK-CFI-CROSS-DSO-ANDROID: libclang_rt.builtins.a -// CHECK-CFI-CROSS-DSO-ANDROID: libclang_rt.builtins.a // 
Cross-DSO CFI with diagnostics on Android links just the UBSAN runtime. // RUN: not %clang -fsanitize=cfi -fsanitize-cfi-cross-dso -### %s 2>&1 \ @@ -842,8 +828,6 @@ // CHECK-CFI-CROSS-DSO-DIAG-ANDROID: "{{.*}}ld{{(.exe)?}}" // CHECK-CFI-CROSS-DSO-DIAG-ANDROID: "{{[^"]*}}libclang_rt.ubsan_standalone.so" // CHECK-CFI-CROSS-DSO-DIAG-ANDROID: "--export-dynamic-symbol=__cfi_check" -// CHECK-CFI-CROSS-DSO-DIAG-ANDROID: libclang_rt.builtins.a -// CHECK-CFI-CROSS-DSO-DIAG-ANDROID: libclang_rt.builtins.a // RUN: %clangxx -fsanitize=address -### %s 2>&1 \ // RUN: -mmacos-version-min=10.6 \ @@ -898,7 +882,6 @@ // RUN: | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-ELF-RISCV32 // CHECK-SHADOWCALLSTACK-ELF-RISCV32-NOT: error: // CHECK-SHADOWCALLSTACK-ELF-RISCV32: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" -// CHECK-SHADOWCALLSTACK-ELF-RISCV32: libclang_rt.builtins.a // RUN: %clang -fsanitize=shadow-call-stack -### %s 2>&1 \ // RUN: --target=riscv64-unknown-linux -fuse-ld=ld \ @@ -910,15 +893,12 @@ // RUN: | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-ANDROID-RISCV64 // CHECK-SHADOWCALLSTACK-ANDROID-RISCV64-NOT: error: // CHECK-SHADOWCALLSTACK-ANDROID-RISCV64: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" -// CHECK-SHADOWCALLSTACK-ANDROID-RISCV64: libclang_rt.builtins.a -// CHECK-SHADOWCALLSTACK-ANDROID-RISCV64: libclang_rt.builtins.a // RUN: %clang -fsanitize=shadow-call-stack -### %s 2>&1 \ // RUN: --target=riscv64-unknown-fuchsia -fuse-ld=ld \ // RUN: | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-FUCHSIA-RISCV64 // CHECK-SHADOWCALLSTACK-FUCHSIA-RISCV64-NOT: error: // CHECK-SHADOWCALLSTACK-FUCHSIA-RISCV64: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" -// CHECK-SHADOWCALLSTACK-FUCHSIA-RISCV64: libclang_rt.builtins.a // RUN: %clang -fsanitize=shadow-call-stack -### %s 2>&1 \ // RUN: --target=aarch64-unknown-linux -fuse-ld=ld -ffixed-x18 \ @@ -934,8 +914,6 @@ // RUN: | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-LINUX-AARCH64-X18-ANDROID // 
CHECK-SHADOWCALLSTACK-LINUX-AARCH64-X18-ANDROID-NOT: error: // CHECK-SHADOWCALLSTACK-LINUX-AARCH64-X18-ANDROID: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" -// CHECK-SHADOWCALLSTACK-LINUX-AARCH64-X18-ANDROID: libclang_rt.builtins.a -// CHECK-SHADOWCALLSTACK-LINUX-AARCH64-X18-ANDROID: libclang_rt.builtins.a // RUN: not %clang -fsanitize=shadow-call-stack -### %s 2>&1 \ // RUN: --target=x86-unknown-linux -fuse-ld=ld \ @@ -1003,8 +981,6 @@ // RUN: | %{filecheck} --check-prefix=CHECK-SAFESTACK-ANDROID-ARM // // CHECK-SAFESTACK-ANDROID-ARM: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" -// CHECK-SAFESTACK-ANDROID-ARM: libclang_rt.builtins.a -// CHECK-SAFESTACK-ANDROID-ARM: libclang_rt.builtins.a // RUN: %clang -### %s -shared 2>&1 \ // RUN: --target=arm-linux-androideabi -fuse-ld=ld -fsanitize=safe-stack \ @@ -1012,8 +988,6 @@ // RUN: | %{filecheck} --check-prefix=CHECK-SAFESTACK-SHARED-ANDROID-ARM // // CHECK-SAFESTACK-SHARED-ANDROID-ARM: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" -// CHECK-SAFESTACK-SHARED-ANDROID-ARM: libclang_rt.builtins.a -// CHECK-SAFESTACK-SHARED-ANDROID-ARM: libclang_rt.builtins.a // RUN: %clang -### %s 2>&1 \ // RUN: --target=aarch64-linux-android -fuse-ld=ld -fsanitize=safe-stack \ @@ -1021,8 +995,6 @@ // RUN: | %{filecheck} --check-prefix=CHECK-SAFESTACK-ANDROID-AARCH64 // // CHECK-SAFESTACK-ANDROID-AARCH64: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}" -// CHECK-SAFESTACK-ANDROID-AARCH64: libclang_rt.builtins.a -// CHECK-SAFESTACK-ANDROID-AARCH64: libclang_rt.builtins.a // RUN: not %clang -fsanitize=undefined -### %s 2>&1 \ // RUN: --target=x86_64-scei-ps4 -fuse-ld=ld \ @@ -1131,8 +1103,6 @@ // CHECK-SCUDO-ANDROID: libclang_rt.scudo_standalone.so" // CHECK-SCUDO-ANDROID-NOT: "-lpthread" // CHECK-SCUDO-ANDROID-NOT: "-lresolv" -// CHECK-SCUDO-ANDROID: libclang_rt.builtins.a" -// CHECK-SCUDO-ANDROID: libclang_rt.builtins.a" // RUN: %clang -### %s 2>&1 \ // RUN: --target=arm-linux-androideabi -fuse-ld=ld -fsanitize=scudo \ @@ -1146,7 +1116,6 @@ // 
CHECK-SCUDO-ANDROID-STATIC-NOT: "-lpthread" // CHECK-SCUDO-ANDROID-STATIC-NOT: "-lrt" // CHECK-SCUDO-ANDROID-STATIC-NOT: "-lresolv" -// CHECK-SCUDO-ANDROID-STATIC: "{{.*}}libclang_rt.builtins.a" // RUN: %clang -### %s 2>&1 \ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld -fsanitize=hwaddress \ From bca055f2ac075d43f6f316927947b2a493f93bdb Mon Sep 17 00:00:00 2001 From: Dhruv Srivastava Date: Fri, 27 Dec 2024 13:42:26 +0530 Subject: [PATCH 106/567] [lldb] AIX Changes for MainLoop polling (#120378) This PR is in reference to porting LLDB on AIX. Link to discussions on llvm discourse and github: 1. https://discourse.llvm.org/t/port-lldb-to-ibm-aix/80640 2. https://github.com/llvm/llvm-project/issues/101657 The complete changes for porting are present in this draft PR: https://github.com/llvm/llvm-project/pull/102601 Dropping changes for MainLoop polling in AIX, as `ppoll` is not supported in AIX currently. This change is part of the couple of minimal changes required to build a minimal `lldb` binary on AIX --- lldb/source/Host/posix/MainLoopPosix.cpp | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/lldb/source/Host/posix/MainLoopPosix.cpp b/lldb/source/Host/posix/MainLoopPosix.cpp index aecdeb9ba5d1c..ce7caa3041dd0 100644 --- a/lldb/source/Host/posix/MainLoopPosix.cpp +++ b/lldb/source/Host/posix/MainLoopPosix.cpp @@ -99,6 +99,7 @@ class MainLoopPosix::RunImpl { ~RunImpl() = default; Status Poll(); + void ProcessReadEvents(); private: @@ -159,6 +160,22 @@ MainLoopPosix::RunImpl::RunImpl(MainLoopPosix &loop) : loop(loop) { read_fds.reserve(loop.m_read_fds.size()); } +static int StartPoll(llvm::MutableArrayRef fds, + std::optional point) { +#if HAVE_PPOLL + return ppoll(fds.data(), fds.size(), ToTimeSpec(point), + /*sigmask=*/nullptr); +#else + using namespace std::chrono; + int timeout = -1; + if (point) { + nanoseconds dur = std::max(*point - steady_clock::now(), nanoseconds(0)); + timeout = ceil(dur).count(); + } + return 
poll(fds.data(), fds.size(), timeout); +#endif +} + Status MainLoopPosix::RunImpl::Poll() { read_fds.clear(); @@ -169,11 +186,9 @@ Status MainLoopPosix::RunImpl::Poll() { pfd.revents = 0; read_fds.push_back(pfd); } + int ready = StartPoll(read_fds, loop.GetNextWakeupTime()); - if (ppoll(read_fds.data(), read_fds.size(), - ToTimeSpec(loop.GetNextWakeupTime()), - /*sigmask=*/nullptr) == -1 && - errno != EINTR) + if (ready == -1 && errno != EINTR) return Status(errno, eErrorTypePOSIX); return Status(); From 2b5b3cf60d9e9e0c597bad1be1207b167ef15c9f Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Fri, 27 Dec 2024 09:13:15 +0100 Subject: [PATCH 107/567] [mlir][sparse_tensor] Migrate `SparseIterationToScf.cpp` to dialect conversion (#121054) Use the regular dialect conversion driver instead of the 1:N dialect conversion driver. The 1:N dialect conversion driver will be removed soon. --- .../Transforms/SparseIterationToScf.cpp | 123 +++++++++++------- .../Transforms/SparseTensorPasses.cpp | 11 +- 2 files changed, 81 insertions(+), 53 deletions(-) diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseIterationToScf.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseIterationToScf.cpp index e8a40b1e033dd..9e9fea76416b9 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseIterationToScf.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseIterationToScf.cpp @@ -7,11 +7,17 @@ #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/SparseTensor/IR/SparseTensor.h" #include "mlir/Dialect/SparseTensor/Transforms/Passes.h" -#include "mlir/Transforms/OneToNTypeConversion.h" +#include "mlir/Transforms/DialectConversion.h" using namespace mlir; using namespace mlir::sparse_tensor; +/// Assert that the given value range contains a single value and return it. 
+static Value getSingleValue(ValueRange values) { + assert(values.size() == 1 && "expected single value"); + return values.front(); +} + static void convertLevelType(SparseTensorEncodingAttr enc, Level lvl, SmallVectorImpl &fields) { // Position and coordinate buffer in the sparse structure. @@ -54,14 +60,17 @@ static ValueRange genCoIterateBranchNest(PatternRewriter &rewriter, Location loc, CoIterateOp op, Value loopCrd, ArrayRef> iters, - ArrayRef subCases, ArrayRef userReduc) { - if (subCases.empty()) + ArrayRef newBlocks, ArrayRef oldBlocks, + ArrayRef userReduc) { + if (newBlocks.empty()) return userReduc; // The current branch that we are handling. - Region *b = subCases.front(); + Block *newBlock = newBlocks.front(); + Block *oldBlock = oldBlocks.front(); Value casePred = constantI1(rewriter, loc, true); - I64BitSet caseBits = op.getRegionDefinedSpace(b->getRegionNumber()); + I64BitSet caseBits = + op.getRegionDefinedSpace(newBlock->getParent()->getRegionNumber()); for (unsigned i : caseBits.bits()) { SparseIterator *it = iters[i].get(); Value pred = rewriter.create(loc, arith::CmpIPredicate::eq, @@ -80,16 +89,20 @@ genCoIterateBranchNest(PatternRewriter &rewriter, Location loc, CoIterateOp op, for (unsigned idx : caseBits.bits()) llvm::append_range(blockArgs, iters[idx]->getCursor()); + // Map the old block arguments, because the dialect conversion driver does + // not immediately perform SSA value replacements. This function is still + // seeing the old uses. IRMapping mapping; - for (auto [from, to] : - llvm::zip_equal(b->front().getArguments(), blockArgs)) { + for (auto [from, to] : llvm::zip_equal(oldBlock->getArguments(), blockArgs)) { mapping.map(from, to); } // Clone the region, we can not erase the region now because the same region // might be a subcase for multiple lattice point. 
- rewriter.cloneRegionBefore(*b, ifOp.getThenRegion(), + rewriter.cloneRegionBefore(*newBlock->getParent(), ifOp.getThenRegion(), ifOp.getThenRegion().begin(), mapping); + // Remove the block arguments, they were already replaced via `mapping`. + ifOp.getThenRegion().front().eraseArguments(0, blockArgs.size()); // replace sparse_tensor::YieldOp -> scf::YieldOp auto spY = cast(&ifOp.getThenRegion().front().back()); @@ -101,7 +114,8 @@ genCoIterateBranchNest(PatternRewriter &rewriter, Location loc, CoIterateOp op, // Generates remaining case recursively. rewriter.setInsertionPointToStart(&ifOp.getElseRegion().front()); ValueRange res = genCoIterateBranchNest(rewriter, loc, op, loopCrd, iters, - subCases.drop_front(), userReduc); + newBlocks.drop_front(), + oldBlocks.drop_front(), userReduc); if (!res.empty()) rewriter.create(loc, res); @@ -119,15 +133,13 @@ static ValueRange genLoopWithIterator( if (it->iteratableByFor()) { auto [lo, hi] = it->genForCond(rewriter, loc); Value step = constantIndex(rewriter, loc, 1); - scf::ForOp forOp = rewriter.create(loc, lo, hi, step, reduc); + scf::ForOp forOp = rewriter.create( + loc, lo, hi, step, reduc, + [&](OpBuilder &b, Location loc, Value iv, ValueRange iterArgs) { + // Empty builder function to ensure that no terminator is created. + }); { OpBuilder::InsertionGuard guard(rewriter); - // Erase the implicit yield operation created by ForOp when there is no - // yielding values. - if (!forOp.getBody()->empty()) - rewriter.eraseOp(&forOp.getBody()->front()); - assert(forOp.getBody()->empty()); - it->linkNewScope(forOp.getInductionVar()); rewriter.setInsertionPointToStart(forOp.getBody()); SmallVector ret = bodyBuilder(rewriter, loc, forOp.getBodyRegion(), @@ -178,46 +190,47 @@ namespace { /// Sparse codegen rule for number of entries operator. 
class ExtractIterSpaceConverter - : public OneToNOpConversionPattern { + : public OpConversionPattern { public: - using OneToNOpConversionPattern::OneToNOpConversionPattern; + using OpConversionPattern::OpConversionPattern; LogicalResult - matchAndRewrite(ExtractIterSpaceOp op, OpAdaptor adaptor, - OneToNPatternRewriter &rewriter) const override { + matchAndRewrite(ExtractIterSpaceOp op, OneToNOpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { Location loc = op.getLoc(); - const OneToNTypeMapping &resultMapping = adaptor.getResultMapping(); // Construct the iteration space. - SparseIterationSpace space(loc, rewriter, op.getTensor(), 0, + SparseIterationSpace space(loc, rewriter, + getSingleValue(adaptor.getTensor()), 0, op.getLvlRange(), adaptor.getParentIter()); SmallVector result = space.toValues(); - rewriter.replaceOp(op, result, resultMapping); + rewriter.replaceOpWithMultiple(op, {result}); return success(); } }; /// Sparse codegen rule for number of entries operator. 
-class ExtractValOpConverter : public OneToNOpConversionPattern { +class ExtractValOpConverter : public OpConversionPattern { public: - using OneToNOpConversionPattern::OneToNOpConversionPattern; + using OpConversionPattern::OpConversionPattern; LogicalResult - matchAndRewrite(ExtractValOp op, OpAdaptor adaptor, - OneToNPatternRewriter &rewriter) const override { + matchAndRewrite(ExtractValOp op, OneToNOpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { Location loc = op.getLoc(); Value pos = adaptor.getIterator().back(); - Value valBuf = rewriter.create(loc, op.getTensor()); + Value valBuf = + rewriter.create(loc, getSingleValue(adaptor.getTensor())); rewriter.replaceOpWithNewOp(op, valBuf, pos); return success(); } }; -class SparseIterateOpConverter : public OneToNOpConversionPattern { +class SparseIterateOpConverter : public OpConversionPattern { public: - using OneToNOpConversionPattern::OneToNOpConversionPattern; + using OpConversionPattern::OpConversionPattern; LogicalResult - matchAndRewrite(IterateOp op, OpAdaptor adaptor, - OneToNPatternRewriter &rewriter) const override { + matchAndRewrite(IterateOp op, OneToNOpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { if (!op.getCrdUsedLvls().empty()) return rewriter.notifyMatchFailure( op, "non-empty coordinates list not implemented."); @@ -235,14 +248,15 @@ class SparseIterateOpConverter : public OneToNOpConversionPattern { llvm::append_range(ivs, inits); // Type conversion on iterate op block. 
- OneToNTypeMapping blockTypeMapping(op.getBody()->getArgumentTypes()); + unsigned numOrigArgs = op.getBody()->getArgumentTypes().size(); + TypeConverter::SignatureConversion signatureConversion(numOrigArgs); if (failed(typeConverter->convertSignatureArgs( - op.getBody()->getArgumentTypes(), blockTypeMapping))) + op.getBody()->getArgumentTypes(), signatureConversion))) return rewriter.notifyMatchFailure( op, "failed to convert iterate region argurment types"); - rewriter.applySignatureConversion(op.getBody(), blockTypeMapping); - Block *block = op.getBody(); + Block *block = rewriter.applySignatureConversion( + op.getBody(), signatureConversion, getTypeConverter()); ValueRange ret = genLoopWithIterator( rewriter, loc, it.get(), ivs, [block](PatternRewriter &rewriter, Location loc, Region &loopBody, @@ -263,19 +277,17 @@ class SparseIterateOpConverter : public OneToNOpConversionPattern { return result; }); - const OneToNTypeMapping &resultMapping = adaptor.getResultMapping(); - rewriter.replaceOp(op, ret, resultMapping); + rewriter.replaceOp(op, ret); return success(); } }; -class SparseCoIterateOpConverter - : public OneToNOpConversionPattern { - using OneToNOpConversionPattern::OneToNOpConversionPattern; +class SparseCoIterateOpConverter : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; LogicalResult - matchAndRewrite(CoIterateOp op, OpAdaptor adaptor, - OneToNPatternRewriter &rewriter) const override { + matchAndRewrite(CoIterateOp op, OneToNOpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { assert(op.getSpaceDim() == 1 && "Not implemented"); Location loc = op.getLoc(); @@ -299,18 +311,23 @@ class SparseCoIterateOpConverter assert(!needUniv && "Not implemented"); (void)needUniv; + SmallVector newBlocks; + DenseMap newToOldBlockMap; for (Region ®ion : op.getCaseRegions()) { // Do a one-shot type conversion on all region blocks, since the same // region might be used multiple time. 
Block *block = ®ion.getBlocks().front(); - OneToNTypeMapping blockTypeMapping(block->getArgumentTypes()); + TypeConverter::SignatureConversion blockTypeMapping( + block->getArgumentTypes().size()); if (failed(typeConverter->convertSignatureArgs(block->getArgumentTypes(), blockTypeMapping))) { return rewriter.notifyMatchFailure( op, "failed to convert coiterate region argurment types"); } - rewriter.applySignatureConversion(block, blockTypeMapping); + newBlocks.push_back(rewriter.applySignatureConversion( + block, blockTypeMapping, getTypeConverter())); + newToOldBlockMap[newBlocks.back()] = block; } SmallVector spaces; @@ -343,7 +360,7 @@ class SparseCoIterateOpConverter // Generates a loop sequence, one loop per case. for (auto [r, caseBits] : - llvm::zip_equal(op.getCaseRegions(), op.getRegionDefinedSpaces())) { + llvm::zip_equal(newBlocks, op.getRegionDefinedSpaces())) { assert(caseBits.count() > 0 && "Complement space not implemented"); // Retrives a vector of pointers to the iterators used in the case. @@ -359,11 +376,17 @@ class SparseCoIterateOpConverter // The subcases are never empty, it must contains at least the current // region itself. // TODO: these cases should be sorted. - SmallVector subCases = op.getSubCasesOf(r.getRegionNumber()); + SmallVector subCases = + op.getSubCasesOf(r->getParent()->getRegionNumber()); + SmallVector newBlocks, oldBlocks; + for (Region *r : subCases) { + newBlocks.push_back(&r->front()); + oldBlocks.push_back(newToOldBlockMap[newBlocks.back()]); + } assert(!subCases.empty()); - ValueRange res = genCoIterateBranchNest(rewriter, loc, op, loopCrd, - iters, subCases, userReduc); + ValueRange res = genCoIterateBranchNest( + rewriter, loc, op, loopCrd, iters, newBlocks, oldBlocks, userReduc); SmallVector nextIterYields(res); // 2nd. foward the loop. @@ -388,7 +411,7 @@ class SparseCoIterateOpConverter // This is a simple iteration loop. 
assert(caseBits.count() == 1); - Block *block = &r.getBlocks().front(); + Block *block = r; ValueRange curResult = genLoopWithIterator( rewriter, loc, validIters.front(), userReduc, /*bodyBuilder=*/ diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp index 1cac949b68c79..153b9b170e5d3 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp @@ -172,11 +172,16 @@ struct LowerSparseIterationToSCFPass ConversionTarget target(*ctx); // The actual conversion. - target.addIllegalOp(); + target.addLegalDialect(); + target.addIllegalOp(); + target.addLegalOp(); populateLowerSparseIterationToSCFPatterns(converter, patterns); - if (failed(applyPartialOneToNConversion(getOperation(), converter, - std::move(patterns)))) + if (failed(applyPartialConversion(getOperation(), target, + std::move(patterns)))) signalPassFailure(); } }; From 1ead15512872b1f9eec0b69a7d8283e752a747e6 Mon Sep 17 00:00:00 2001 From: Dhruv Srivastava Date: Fri, 27 Dec 2024 13:43:43 +0530 Subject: [PATCH 108/567] [lldb] clang-format changes for some basic #if _AIX changes (#120978) This PR is in reference to porting LLDB on AIX. Link to discussions on llvm discourse and github: 1. https://discourse.llvm.org/t/port-lldb-to-ibm-aix/80640 2. 
https://github.com/llvm/llvm-project/issues/101657 The complete changes for porting are present in this draft PR: https://github.com/llvm/llvm-project/pull/102601 Added clang-format changes for changes related to some base #if _AIX changes: - https://github.com/llvm/llvm-project/pull/120979 --- .../posix/ConnectionFileDescriptorPosix.cpp | 3 +- lldb/source/Host/posix/DomainSocket.cpp | 3 +- lldb/source/Plugins/Language/ObjC/Cocoa.cpp | 13 ++++----- .../BSD-Archive/ObjectContainerBSDArchive.cpp | 29 ++++++++++--------- 4 files changed, 25 insertions(+), 23 deletions(-) diff --git a/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp b/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp index 6bdc33f892328..ab4ddbfe1fb20 100644 --- a/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp +++ b/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp @@ -119,8 +119,7 @@ bool ConnectionFileDescriptor::IsConnected() const { ConnectionStatus ConnectionFileDescriptor::Connect(llvm::StringRef path, Status *error_ptr) { - return Connect( - path, [](llvm::StringRef) {}, error_ptr); + return Connect(path, [](llvm::StringRef) {}, error_ptr); } ConnectionStatus diff --git a/lldb/source/Host/posix/DomainSocket.cpp b/lldb/source/Host/posix/DomainSocket.cpp index 9a0b385d998bf..f85e1b9bbdc5c 100644 --- a/lldb/source/Host/posix/DomainSocket.cpp +++ b/lldb/source/Host/posix/DomainSocket.cpp @@ -86,7 +86,8 @@ Status DomainSocket::Connect(llvm::StringRef name) { if (error.Fail()) return error; if (llvm::sys::RetryAfterSignal(-1, ::connect, GetNativeSocket(), - (struct sockaddr *)&saddr_un, saddr_un_len) < 0) + (struct sockaddr *)&saddr_un, + saddr_un_len) < 0) SetLastError(error); return error; diff --git a/lldb/source/Plugins/Language/ObjC/Cocoa.cpp b/lldb/source/Plugins/Language/ObjC/Cocoa.cpp index bbe5d4c611f87..b35e27ad8123f 100644 --- a/lldb/source/Plugins/Language/ObjC/Cocoa.cpp +++ b/lldb/source/Plugins/Language/ObjC/Cocoa.cpp @@ -31,7 +31,6 @@ #include 
"llvm/ADT/APInt.h" #include "llvm/ADT/bit.h" - using namespace lldb; using namespace lldb_private; using namespace lldb_private::formatters; @@ -267,21 +266,21 @@ bool lldb_private::formatters::NSIndexSetSummaryProvider( if (class_name == "NSIndexSet" || class_name == "NSMutableIndexSet") { // Foundation version 2000 added a bitmask if the index set fit in 64 bits // and a Tagged Pointer version if the bitmask is small enough to fit in - // the tagged pointer payload. + // the tagged pointer payload. // It also changed the layout (but not the size) of the set descriptor. // First check whether this is a tagged pointer. The bitmask will be in // the payload of the tagged pointer. uint64_t payload; - if (runtime->GetFoundationVersion() >= 2000 - && descriptor->GetTaggedPointerInfo(nullptr, nullptr, &payload)) { + if (runtime->GetFoundationVersion() >= 2000 && + descriptor->GetTaggedPointerInfo(nullptr, nullptr, &payload)) { count = llvm::popcount(payload); break; } // The first 32 bits describe the index set in all cases: Status error; uint32_t mode = process_sp->ReadUnsignedIntegerFromMemory( - valobj_addr + ptr_size, 4, 0, error); + valobj_addr + ptr_size, 4, 0, error); if (error.Fail()) return false; // Now check if the index is held in a bitmask in the object: @@ -292,7 +291,7 @@ bool lldb_private::formatters::NSIndexSetSummaryProvider( if ((mode & 2) == 2) { // The bitfield is a 64 bit uint at the beginning of the data var. 
uint64_t bitfield = process_sp->ReadUnsignedIntegerFromMemory( - valobj_addr + 2 * ptr_size, 8, 0, error); + valobj_addr + 2 * ptr_size, 8, 0, error); if (error.Fail()) return false; count = llvm::popcount(bitfield); @@ -309,7 +308,7 @@ bool lldb_private::formatters::NSIndexSetSummaryProvider( count = 0; break; } - + if ((mode & 2) == 2) mode = 1; // this means the set only has one range else diff --git a/lldb/source/Plugins/ObjectContainer/BSD-Archive/ObjectContainerBSDArchive.cpp b/lldb/source/Plugins/ObjectContainer/BSD-Archive/ObjectContainerBSDArchive.cpp index 7aa5b8d81890a..3835f2b08a05f 100644 --- a/lldb/source/Plugins/ObjectContainer/BSD-Archive/ObjectContainerBSDArchive.cpp +++ b/lldb/source/Plugins/ObjectContainer/BSD-Archive/ObjectContainerBSDArchive.cpp @@ -81,10 +81,10 @@ size_t ObjectContainerBSDArchive::Archive::ParseObjects() { std::unique_ptr mem_buffer = llvm::MemoryBuffer::getMemBuffer( - llvm::StringRef((const char *)data.GetDataStart(), - data.GetByteSize()), - llvm::StringRef(), - /*RequiresNullTerminator=*/false); + llvm::StringRef((const char *)data.GetDataStart(), + data.GetByteSize()), + llvm::StringRef(), + /*RequiresNullTerminator=*/false); auto exp_ar = llvm::object::Archive::create(mem_buffer->getMemBufferRef()); if (!exp_ar) { @@ -95,7 +95,7 @@ size_t ObjectContainerBSDArchive::Archive::ParseObjects() { llvm::Error iter_err = llvm::Error::success(); Object obj; - for (const auto &child: llvm_archive->children(iter_err)) { + for (const auto &child : llvm_archive->children(iter_err)) { obj.Clear(); auto exp_name = child.getName(); if (exp_name) { @@ -111,7 +111,9 @@ size_t ObjectContainerBSDArchive::Archive::ParseObjects() { obj.modification_time = std::chrono::duration_cast( std::chrono::time_point_cast( - exp_mtime.get()).time_since_epoch()).count(); + exp_mtime.get()) + .time_since_epoch()) + .count(); } else { LLDB_LOG_ERROR(l, exp_mtime.takeError(), "failed to get archive object time: {0}"); @@ -331,21 +333,21 @@ ObjectContainer 
*ObjectContainerBSDArchive::CreateInstance( ArchiveType ObjectContainerBSDArchive::MagicBytesMatch(const DataExtractor &data) { uint32_t offset = 0; - const char *armag = (const char *)data.PeekData(offset, - sizeof(ar_hdr) + SARMAG); + const char *armag = + (const char *)data.PeekData(offset, sizeof(ar_hdr) + SARMAG); if (armag == nullptr) return ArchiveType::Invalid; ArchiveType result = ArchiveType::Invalid; if (strncmp(armag, ArchiveMagic, SARMAG) == 0) - result = ArchiveType::Archive; + result = ArchiveType::Archive; else if (strncmp(armag, ThinArchiveMagic, SARMAG) == 0) - result = ArchiveType::ThinArchive; + result = ArchiveType::ThinArchive; else - return ArchiveType::Invalid; + return ArchiveType::Invalid; armag += offsetof(struct ar_hdr, ar_fmag) + SARMAG; if (strncmp(armag, ARFMAG, 2) == 0) - return result; + return result; return ArchiveType::Invalid; } @@ -443,7 +445,8 @@ size_t ObjectContainerBSDArchive::GetModuleSpecifications( return 0; const size_t initial_count = specs.GetSize(); - llvm::sys::TimePoint<> file_mod_time = FileSystem::Instance().GetModificationTime(file); + llvm::sys::TimePoint<> file_mod_time = + FileSystem::Instance().GetModificationTime(file); Archive::shared_ptr archive_sp( Archive::FindCachedArchive(file, ArchSpec(), file_mod_time, file_offset)); bool set_archive_arch = false; From ac8bb7353a7fe79cd99b3c041d5a153517c31abc Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 27 Dec 2024 15:57:20 +0700 Subject: [PATCH 109/567] Attributor: Do not treat pointer vectors as valid for unsupported attributes (#121149) The memory attributes, noalias, and dereferenceable do not support vectors of pointers according to the IR verifier, so don't report them as valid. 
--- llvm/include/llvm/Transforms/IPO/Attributor.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h index 8915969f75466..a8ee3cd531e49 100644 --- a/llvm/include/llvm/Transforms/IPO/Attributor.h +++ b/llvm/include/llvm/Transforms/IPO/Attributor.h @@ -3853,7 +3853,7 @@ struct AANoAlias /// See AbstractAttribute::isValidIRPositionForInit static bool isValidIRPositionForInit(Attributor &A, const IRPosition &IRP) { - if (!IRP.getAssociatedType()->isPtrOrPtrVectorTy()) + if (!IRP.getAssociatedType()->isPointerTy()) return false; return IRAttribute::isValidIRPositionForInit(A, IRP); } @@ -4220,7 +4220,7 @@ struct AADereferenceable /// See AbstractAttribute::isValidIRPositionForInit static bool isValidIRPositionForInit(Attributor &A, const IRPosition &IRP) { - if (!IRP.getAssociatedType()->isPtrOrPtrVectorTy()) + if (!IRP.getAssociatedType()->isPointerTy()) return false; return IRAttribute::isValidIRPositionForInit(A, IRP); } @@ -4364,7 +4364,7 @@ struct AANoCapture /// See AbstractAttribute::isValidIRPositionForInit static bool isValidIRPositionForInit(Attributor &A, const IRPosition &IRP) { - if (!IRP.getAssociatedType()->isPtrOrPtrVectorTy()) + if (!IRP.getAssociatedType()->isPointerTy()) return false; return IRAttribute::isValidIRPositionForInit(A, IRP); } @@ -4635,8 +4635,7 @@ struct AAMemoryBehavior /// See AbstractAttribute::isValidIRPositionForInit static bool isValidIRPositionForInit(Attributor &A, const IRPosition &IRP) { - if (!IRP.isFunctionScope() && - !IRP.getAssociatedType()->isPtrOrPtrVectorTy()) + if (!IRP.isFunctionScope() && !IRP.getAssociatedType()->isPointerTy()) return false; return IRAttribute::isValidIRPositionForInit(A, IRP); } From 223521b13e7465bc177f43e22de526b777d6ff74 Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Fri, 27 Dec 2024 19:48:30 +0900 Subject: [PATCH 110/567] llvm-cov: Introduce `--binary-counters` (#120841) 
In `llvm-cov show`, this option rounds counters (line, branch) to `[1,0]` at rendering. This will be useful when the number of counts doesn't interest but **Covered/uncoverd** does. --- llvm/test/tools/llvm-cov/branch-macros.test | 1 + .../tools/llvm-cov/showLineExecutionCounts.test | 3 +++ llvm/tools/llvm-cov/CodeCoverage.cpp | 7 +++++++ llvm/tools/llvm-cov/CoverageViewOptions.h | 1 + llvm/tools/llvm-cov/SourceCoverageView.h | 13 ++++++++++++- llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp | 15 +++++++++------ llvm/tools/llvm-cov/SourceCoverageViewText.cpp | 6 +++--- 7 files changed, 36 insertions(+), 10 deletions(-) diff --git a/llvm/test/tools/llvm-cov/branch-macros.test b/llvm/test/tools/llvm-cov/branch-macros.test index e4bd14ec14f16..b16ef9d4846d8 100644 --- a/llvm/test/tools/llvm-cov/branch-macros.test +++ b/llvm/test/tools/llvm-cov/branch-macros.test @@ -1,5 +1,6 @@ // RUN: llvm-profdata merge %S/Inputs/branch-macros.proftext -o %t.profdata // RUN: llvm-cov show --show-expansions --show-branches=count %S/Inputs/branch-macros.o32l -instr-profile %t.profdata -path-equivalence=/tmp,%S/Inputs | FileCheck %S/Inputs/branch-macros.cpp -check-prefixes=CHECK,BRCOV -D#C=999 +// RUN: llvm-cov show --binary-counters --show-expansions --show-branches=count %S/Inputs/branch-macros.o32l -instr-profile %t.profdata -path-equivalence=/tmp,%S/Inputs | FileCheck %S/Inputs/branch-macros.cpp -check-prefixes=CHECK,BRCOV -D#C=1 // RUN: llvm-cov report --show-branch-summary %S/Inputs/branch-macros.o32l -instr-profile %t.profdata -show-functions -path-equivalence=/tmp,%S/Inputs %S/Inputs/branch-macros.cpp | FileCheck %s -check-prefix=REPORT // RUN: yaml2obj %S/Inputs/branch-macros-single.yaml -o %t.o diff --git a/llvm/test/tools/llvm-cov/showLineExecutionCounts.test b/llvm/test/tools/llvm-cov/showLineExecutionCounts.test index 4f505f9648eb8..a165d8d670e55 100644 --- a/llvm/test/tools/llvm-cov/showLineExecutionCounts.test +++ b/llvm/test/tools/llvm-cov/showLineExecutionCounts.test @@ 
-3,6 +3,7 @@ // RUN: llvm-profdata merge %S/Inputs/lineExecutionCounts.proftext -o %t.profdata // RUN: llvm-cov show %S/Inputs/lineExecutionCounts.covmapping -instr-profile %t.profdata -path-equivalence=/tmp,%S/Inputs | FileCheck -check-prefixes=TEXT,WHOLE-FILE -D#C=999 -DC16K2=16.2k -DC16K1=16.1k %S/Inputs/showLineExecutionCounts.cpp +// RUN: llvm-cov show %S/Inputs/lineExecutionCounts.covmapping -binary-counters -instr-profile %t.profdata -path-equivalence=/tmp,%S/Inputs | FileCheck -check-prefixes=TEXT,WHOLE-FILE -D#C=1 -DC16K2=1 -DC16K1=1 %S/Inputs/showLineExecutionCounts.cpp // RUN: llvm-cov show %S/Inputs/lineExecutionCounts.covmapping -instr-profile %t.profdata -path-equivalence=/tmp,%S/Inputs -name=main | FileCheck -check-prefixes=TEXT,FILTER -D#C=999 -DC16K2=16.2k -DC16K1=16.1k %S/Inputs/showLineExecutionCounts.cpp // Test -output-dir. @@ -16,8 +17,10 @@ // // Test html output. // RUN: llvm-cov show %S/Inputs/lineExecutionCounts.covmapping -format html -o %t.dir/html -instr-profile %t.profdata -path-equivalence=/tmp,%S/Inputs +// RUN: llvm-cov show %S/Inputs/lineExecutionCounts.covmapping -format html -o %t.dir/html.binary -binary-counters -instr-profile %t.profdata -path-equivalence=/tmp,%S/Inputs // RUN: llvm-cov show %S/Inputs/lineExecutionCounts.covmapping -format html -o %t.dir/html.filtered -instr-profile %t.profdata -path-equivalence=/tmp,%S/Inputs -name=main // RUN: FileCheck -check-prefixes=HTML,HTML-WHOLE-FILE -input-file %t.dir/html/coverage/tmp/showLineExecutionCounts.cpp.html %S/Inputs/showLineExecutionCounts.cpp +// RUN: FileCheck -check-prefixes=HTML-BINARY,HTML-WHOLE-FILE -input-file %t.dir/html.binary/coverage/tmp/showLineExecutionCounts.cpp.html %S/Inputs/showLineExecutionCounts.cpp // RUN: FileCheck -check-prefixes=HTML,HTML-FILTER -input-file %t.dir/html.filtered/coverage/tmp/showLineExecutionCounts.cpp.html %S/Inputs/showLineExecutionCounts.cpp // // Test index creation. 
diff --git a/llvm/tools/llvm-cov/CodeCoverage.cpp b/llvm/tools/llvm-cov/CodeCoverage.cpp index 5db5c2e023541..921f283deedc7 100644 --- a/llvm/tools/llvm-cov/CodeCoverage.cpp +++ b/llvm/tools/llvm-cov/CodeCoverage.cpp @@ -1023,6 +1023,12 @@ int CodeCoverageTool::doShow(int argc, const char **argv, cl::alias ShowOutputDirectoryA("o", cl::desc("Alias for --output-dir"), cl::aliasopt(ShowOutputDirectory)); + cl::opt BinaryCounters( + "binary-counters", cl::Optional, + cl::desc("Show binary counters (1/0) in lines and branches instead of " + "integer execution counts"), + cl::cat(ViewCategory)); + cl::opt TabSize( "tab-size", cl::init(2), cl::desc( @@ -1100,6 +1106,7 @@ int CodeCoverageTool::doShow(int argc, const char **argv, ViewOpts.ShowFunctionInstantiations = ShowInstantiations; ViewOpts.ShowDirectoryCoverage = ShowDirectoryCoverage; ViewOpts.ShowOutputDirectory = ShowOutputDirectory; + ViewOpts.BinaryCounters = BinaryCounters; ViewOpts.TabSize = TabSize; ViewOpts.ProjectTitle = ProjectTitle; diff --git a/llvm/tools/llvm-cov/CoverageViewOptions.h b/llvm/tools/llvm-cov/CoverageViewOptions.h index 015c92a1656be..81e69c3814e30 100644 --- a/llvm/tools/llvm-cov/CoverageViewOptions.h +++ b/llvm/tools/llvm-cov/CoverageViewOptions.h @@ -45,6 +45,7 @@ struct CoverageViewOptions { bool SkipExpansions; bool SkipFunctions; bool SkipBranches; + bool BinaryCounters; OutputFormat Format; BranchOutputType ShowBranches; std::string ShowOutputDirectory; diff --git a/llvm/tools/llvm-cov/SourceCoverageView.h b/llvm/tools/llvm-cov/SourceCoverageView.h index 2b1570d399dd0..0b4e3978a4ba9 100644 --- a/llvm/tools/llvm-cov/SourceCoverageView.h +++ b/llvm/tools/llvm-cov/SourceCoverageView.h @@ -180,6 +180,8 @@ class SourceCoverageView { /// on display. std::vector InstantiationSubViews; + bool BinaryCounters; + /// Get the first uncovered line number for the source file. unsigned getFirstUncoveredLineNo(); @@ -266,6 +268,14 @@ class SourceCoverageView { /// digits. 
static std::string formatCount(uint64_t N); + uint64_t BinaryCount(uint64_t N) const { + return (N && BinaryCounters ? 1 : N); + } + + std::string formatBinaryCount(uint64_t N) const { + return formatCount(BinaryCount(N)); + } + /// Check if region marker output is expected for a line. bool shouldRenderRegionMarkers(const LineCoverageStats &LCS) const; @@ -276,7 +286,8 @@ class SourceCoverageView { const CoverageViewOptions &Options, CoverageData &&CoverageInfo) : SourceName(SourceName), File(File), Options(Options), - CoverageInfo(std::move(CoverageInfo)) {} + CoverageInfo(std::move(CoverageInfo)), + BinaryCounters(Options.BinaryCounters) {} public: static std::unique_ptr diff --git a/llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp b/llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp index e2be576b93cda..c94d3853fc014 100644 --- a/llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp +++ b/llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp @@ -1019,19 +1019,22 @@ void SourceCoverageViewHTML::renderLine(raw_ostream &OS, LineRef L, // Just consider the segments which start *and* end on this line. 
for (unsigned I = 0, E = Segments.size() - 1; I < E; ++I) { const auto *CurSeg = Segments[I]; + auto CurSegCount = BinaryCount(CurSeg->Count); + auto LCSCount = BinaryCount(LCS.getExecutionCount()); if (!CurSeg->IsRegionEntry) continue; - if (CurSeg->Count == LCS.getExecutionCount()) + if (CurSegCount == LCSCount) continue; Snippets[I + 1] = - tag("div", Snippets[I + 1] + tag("span", formatCount(CurSeg->Count), - "tooltip-content"), + tag("div", + Snippets[I + 1] + + tag("span", formatCount(CurSegCount), "tooltip-content"), "tooltip"); if (getOptions().Debug) errs() << "Marker at " << CurSeg->Line << ":" << CurSeg->Col << " = " - << formatCount(CurSeg->Count) << "\n"; + << formatCount(CurSegCount) << "\n"; } } @@ -1051,7 +1054,7 @@ void SourceCoverageViewHTML::renderLineCoverageColumn( raw_ostream &OS, const LineCoverageStats &Line) { std::string Count; if (Line.isMapped()) - Count = tag("pre", formatCount(Line.getExecutionCount())); + Count = tag("pre", formatBinaryCount(Line.getExecutionCount())); std::string CoverageClass = (Line.getExecutionCount() > 0) ? "covered-line" @@ -1106,7 +1109,7 @@ void SourceCoverageViewHTML::renderBranchView(raw_ostream &OS, BranchView &BRV, OS << tag("span", Label, (Count ? "None" : "red branch")) << ": "; if (getOptions().ShowBranchCounts) - OS << tag("span", formatCount(Count), + OS << tag("span", formatBinaryCount(Count), (Count ? "covered-line" : "uncovered-line")); else OS << format("%0.2f", (Total != 0 ? 
100.0 * Count / Total : 0.0)) << "%"; diff --git a/llvm/tools/llvm-cov/SourceCoverageViewText.cpp b/llvm/tools/llvm-cov/SourceCoverageViewText.cpp index 63f8248e3387b..765f8bbbd8d1b 100644 --- a/llvm/tools/llvm-cov/SourceCoverageViewText.cpp +++ b/llvm/tools/llvm-cov/SourceCoverageViewText.cpp @@ -216,7 +216,7 @@ void SourceCoverageViewText::renderLineCoverageColumn( OS.indent(LineCoverageColumnWidth) << '|'; return; } - std::string C = formatCount(Line.getExecutionCount()); + std::string C = formatBinaryCount(Line.getExecutionCount()); OS.indent(LineCoverageColumnWidth - C.size()); colored_ostream(OS, raw_ostream::MAGENTA, Line.hasMultipleRegions() && getOptions().Colors) @@ -263,7 +263,7 @@ void SourceCoverageViewText::renderRegionMarkers(raw_ostream &OS, if (getOptions().Debug) errs() << "Marker at " << S->Line << ":" << S->Col << " = " - << formatCount(S->Count) << "\n"; + << formatBinaryCount(S->Count) << "\n"; } OS << '\n'; } @@ -307,7 +307,7 @@ void SourceCoverageViewText::renderBranchView(raw_ostream &OS, BranchView &BRV, << Label; if (getOptions().ShowBranchCounts) - OS << ": " << formatCount(Count); + OS << ": " << formatBinaryCount(Count); else OS << ": " << format("%0.2f", (Total != 0 ? 100.0 * Count / Total : 0.0)) << "%"; From aa2fdc69d35ff1c4a6de8a8d8edcc4c15236bb15 Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Fri, 27 Dec 2024 20:42:26 +0900 Subject: [PATCH 111/567] [Coverage] Move SingleByteCoverage out of CountedRegion (#110966) `SingleByteCoverage` is not per-region attribute at least. Move it into `CoverageData` since it comes from `profdata`. 
Depends on: #120841 --- .../ProfileData/Coverage/CoverageMapping.h | 28 +++++++++--------- .../ProfileData/Coverage/CoverageMapping.cpp | 29 +++++++++---------- llvm/tools/llvm-cov/SourceCoverageView.h | 3 +- 3 files changed, 30 insertions(+), 30 deletions(-) diff --git a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h index 42da188fef34e..0ad6f07bde989 100644 --- a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h +++ b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h @@ -364,19 +364,16 @@ struct CountedRegion : public CounterMappingRegion { uint64_t FalseExecutionCount; bool TrueFolded; bool FalseFolded; - bool HasSingleByteCoverage; - CountedRegion(const CounterMappingRegion &R, uint64_t ExecutionCount, - bool HasSingleByteCoverage) + CountedRegion(const CounterMappingRegion &R, uint64_t ExecutionCount) : CounterMappingRegion(R), ExecutionCount(ExecutionCount), - FalseExecutionCount(0), TrueFolded(false), FalseFolded(true), - HasSingleByteCoverage(HasSingleByteCoverage) {} + FalseExecutionCount(0), TrueFolded(false), FalseFolded(true) {} CountedRegion(const CounterMappingRegion &R, uint64_t ExecutionCount, - uint64_t FalseExecutionCount, bool HasSingleByteCoverage) + uint64_t FalseExecutionCount) : CounterMappingRegion(R), ExecutionCount(ExecutionCount), FalseExecutionCount(FalseExecutionCount), TrueFolded(false), - FalseFolded(false), HasSingleByteCoverage(HasSingleByteCoverage) {} + FalseFolded(false) {} }; /// MCDC Record grouping all information together. 
@@ -719,10 +716,9 @@ struct FunctionRecord { } void pushRegion(CounterMappingRegion Region, uint64_t Count, - uint64_t FalseCount, bool HasSingleByteCoverage) { + uint64_t FalseCount) { if (Region.isBranch()) { - CountedBranchRegions.emplace_back(Region, Count, FalseCount, - HasSingleByteCoverage); + CountedBranchRegions.emplace_back(Region, Count, FalseCount); // If either counter is hard-coded to zero, then this region represents a // constant-folded branch. CountedBranchRegions.back().TrueFolded = Region.Count.isZero(); @@ -731,8 +727,7 @@ struct FunctionRecord { } if (CountedRegions.empty()) ExecutionCount = Count; - CountedRegions.emplace_back(Region, Count, FalseCount, - HasSingleByteCoverage); + CountedRegions.emplace_back(Region, Count, FalseCount); } }; @@ -895,14 +890,19 @@ class CoverageData { std::vector BranchRegions; std::vector MCDCRecords; + bool SingleByteCoverage = false; + public: CoverageData() = default; - CoverageData(StringRef Filename) : Filename(Filename) {} + CoverageData(bool Single, StringRef Filename) + : Filename(Filename), SingleByteCoverage(Single) {} /// Get the name of the file this data covers. StringRef getFilename() const { return Filename; } + bool getSingleByteCoverage() const { return SingleByteCoverage; } + /// Get an iterator over the coverage segments for this object. The segments /// are guaranteed to be uniqued and sorted by location. std::vector::const_iterator begin() const { @@ -935,6 +935,8 @@ class CoverageMapping { DenseMap> FilenameHash2RecordIndices; std::vector> FuncHashMismatches; + std::optional SingleByteCoverage; + CoverageMapping() = default; // Load coverage records from readers. 
diff --git a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp index 87d8bb1bbb79c..1bf2e8d627bc4 100644 --- a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp +++ b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp @@ -805,7 +805,6 @@ Error CoverageMapping::loadFunctionRecord( else OrigFuncName = getFuncNameWithoutPrefix(OrigFuncName, Record.Filenames[0]); - bool SingleByteCoverage = ProfileReader.hasSingleByteCoverage(); CounterMappingContext Ctx(Record.Expressions); std::vector Counts; @@ -871,10 +870,7 @@ Error CoverageMapping::loadFunctionRecord( consumeError(std::move(E)); return Error::success(); } - Function.pushRegion( - Region, (SingleByteCoverage && *ExecutionCount ? 1 : *ExecutionCount), - (SingleByteCoverage && *AltExecutionCount ? 1 : *AltExecutionCount), - SingleByteCoverage); + Function.pushRegion(Region, *ExecutionCount, *AltExecutionCount); // Record ExpansionRegion. if (Region.Kind == CounterMappingRegion::ExpansionRegion) { @@ -936,6 +932,9 @@ Error CoverageMapping::loadFunctionRecord( Error CoverageMapping::loadFromReaders( ArrayRef> CoverageReaders, IndexedInstrProfReader &ProfileReader, CoverageMapping &Coverage) { + assert(!Coverage.SingleByteCoverage || + *Coverage.SingleByteCoverage == ProfileReader.hasSingleByteCoverage()); + Coverage.SingleByteCoverage = ProfileReader.hasSingleByteCoverage(); for (const auto &CoverageReader : CoverageReaders) { for (auto RecordOrErr : *CoverageReader) { if (Error E = RecordOrErr.takeError()) @@ -1296,14 +1295,8 @@ class SegmentBuilder { // value for that area. // We add counts of the regions of the same kind as the active region // to handle the both situations. 
- if (I->Kind == Active->Kind) { - assert(I->HasSingleByteCoverage == Active->HasSingleByteCoverage && - "Regions are generated in different coverage modes"); - if (I->HasSingleByteCoverage) - Active->ExecutionCount = Active->ExecutionCount || I->ExecutionCount; - else - Active->ExecutionCount += I->ExecutionCount; - } + if (I->Kind == Active->Kind) + Active->ExecutionCount += I->ExecutionCount; } return Regions.drop_back(std::distance(++Active, End)); } @@ -1396,7 +1389,8 @@ static bool isExpansion(const CountedRegion &R, unsigned FileID) { } CoverageData CoverageMapping::getCoverageForFile(StringRef Filename) const { - CoverageData FileCoverage(Filename); + assert(SingleByteCoverage); + CoverageData FileCoverage(*SingleByteCoverage, Filename); std::vector Regions; // Look up the function records in the given file. Due to hash collisions on @@ -1460,7 +1454,9 @@ CoverageMapping::getCoverageForFunction(const FunctionRecord &Function) const { if (!MainFileID) return CoverageData(); - CoverageData FunctionCoverage(Function.Filenames[*MainFileID]); + assert(SingleByteCoverage); + CoverageData FunctionCoverage(*SingleByteCoverage, + Function.Filenames[*MainFileID]); std::vector Regions; for (const auto &CR : Function.CountedRegions) if (CR.FileID == *MainFileID) { @@ -1487,8 +1483,9 @@ CoverageMapping::getCoverageForFunction(const FunctionRecord &Function) const { CoverageData CoverageMapping::getCoverageForExpansion( const ExpansionRecord &Expansion) const { + assert(SingleByteCoverage); CoverageData ExpansionCoverage( - Expansion.Function.Filenames[Expansion.FileID]); + *SingleByteCoverage, Expansion.Function.Filenames[Expansion.FileID]); std::vector Regions; for (const auto &CR : Expansion.Function.CountedRegions) if (CR.FileID == Expansion.FileID) { diff --git a/llvm/tools/llvm-cov/SourceCoverageView.h b/llvm/tools/llvm-cov/SourceCoverageView.h index 0b4e3978a4ba9..cff32b756ee32 100644 --- a/llvm/tools/llvm-cov/SourceCoverageView.h +++ 
b/llvm/tools/llvm-cov/SourceCoverageView.h @@ -287,7 +287,8 @@ class SourceCoverageView { CoverageData &&CoverageInfo) : SourceName(SourceName), File(File), Options(Options), CoverageInfo(std::move(CoverageInfo)), - BinaryCounters(Options.BinaryCounters) {} + BinaryCounters(Options.BinaryCounters || + CoverageInfo.getSingleByteCoverage()) {} public: static std::unique_ptr From ccfe0de0e1e37ed369c9bf89dd0188ba0afb2e9a Mon Sep 17 00:00:00 2001 From: Hassnaa Hamdi Date: Fri, 27 Dec 2024 12:42:07 +0000 Subject: [PATCH 112/567] [LV]: Teach LV to recursively (de)interleave. (#89018) Currently available intrinsics are only ld2/st2, which don't support interleaving factor > 2. This patch teaches the LV to use ld2/st2 recursively to support high interleaving factors. --- .../Transforms/Vectorize/LoopVectorize.cpp | 14 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 79 +- .../AArch64/sve-interleaved-accesses.ll | 260 +++- .../sve-interleaved-masked-accesses.ll | 252 ++++ .../RISCV/interleaved-accesses.ll | 1318 +++++++++-------- .../AArch64/sve-interleave-vectorization.ll | 135 ++ 6 files changed, 1387 insertions(+), 671 deletions(-) create mode 100644 llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 1b00e15ea28b7..355ff40ce770e 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3576,10 +3576,10 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( if (hasIrregularType(ScalarTy, DL)) return false; - // We currently only know how to emit interleave/deinterleave with - // Factor=2 for scalable vectors. This is purely an implementation - // limit. 
- if (VF.isScalable() && InterleaveFactor != 2) + // For scalable vectors, the only interleave factor currently supported + // must be power of 2 since we require the (de)interleave2 intrinsics + // instead of shufflevectors. + if (VF.isScalable() && !isPowerOf2_32(InterleaveFactor)) return false; // If the group involves a non-integral pointer, we may not be able to @@ -9364,9 +9364,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { CM.getWideningDecision(IG->getInsertPos(), VF) == LoopVectorizationCostModel::CM_Interleave); // For scalable vectors, the only interleave factor currently supported - // is 2 since we require the (de)interleave2 intrinsics instead of - // shufflevectors. - assert((!Result || !VF.isScalable() || IG->getFactor() == 2) && + // must be power of 2 since we require the (de)interleave2 intrinsics + // instead of shufflevectors. + assert((!Result || !VF.isScalable() || isPowerOf2_32(IG->getFactor())) && "Unsupported interleave factor for scalable vectors"); return Result; }; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 7fa5481fb3c95..aa1294f82c5f0 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2849,10 +2849,21 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef Vals, // Scalable vectors cannot use arbitrary shufflevectors (only splats), so // must use intrinsics to interleave. if (VecTy->isScalableTy()) { - VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy); - return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2, - Vals, - /*FMFSource=*/nullptr, Name); + assert(isPowerOf2_32(Factor) && "Unsupported interleave factor for " + "scalable vectors, must be power of 2"); + SmallVector InterleavingValues(Vals); + // When interleaving, the number of values will be shrunk until we have the + // single final interleaved value. 
+ auto *InterleaveTy = cast(InterleavingValues[0]->getType()); + for (unsigned Midpoint = Factor / 2; Midpoint > 0; Midpoint /= 2) { + InterleaveTy = VectorType::getDoubleElementsVectorType(InterleaveTy); + for (unsigned I = 0; I < Midpoint; ++I) + InterleavingValues[I] = Builder.CreateIntrinsic( + InterleaveTy, Intrinsic::vector_interleave2, + {InterleavingValues[I], InterleavingValues[Midpoint + I]}, + /*FMFSource=*/nullptr, Name); + } + return InterleavingValues[0]; } // Fixed length. Start by concatenating all vectors into a wide vector. @@ -2938,15 +2949,11 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { &InterleaveFactor](Value *MaskForGaps) -> Value * { if (State.VF.isScalable()) { assert(!MaskForGaps && "Interleaved groups with gaps are not supported."); - assert(InterleaveFactor == 2 && + assert(isPowerOf2_32(InterleaveFactor) && "Unsupported deinterleave factor for scalable vectors"); auto *ResBlockInMask = State.get(BlockInMask); - SmallVector Ops = {ResBlockInMask, ResBlockInMask}; - auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(), - State.VF.getKnownMinValue() * 2, true); - return State.Builder.CreateIntrinsic( - MaskTy, Intrinsic::vector_interleave2, Ops, - /*FMFSource=*/nullptr, "interleaved.mask"); + SmallVector Ops(InterleaveFactor, ResBlockInMask); + return interleaveVectors(State.Builder, Ops, "interleaved.mask"); } if (!BlockInMask) @@ -2986,22 +2993,48 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { ArrayRef VPDefs = definedValues(); const DataLayout &DL = State.CFG.PrevBB->getDataLayout(); if (VecTy->isScalableTy()) { - assert(InterleaveFactor == 2 && + assert(isPowerOf2_32(InterleaveFactor) && "Unsupported deinterleave factor for scalable vectors"); - // Scalable vectors cannot use arbitrary shufflevectors (only splats), - // so must use intrinsics to deinterleave. 
- Value *DI = State.Builder.CreateIntrinsic( - Intrinsic::vector_deinterleave2, VecTy, NewLoad, - /*FMFSource=*/nullptr, "strided.vec"); - unsigned J = 0; - for (unsigned I = 0; I < InterleaveFactor; ++I) { - Instruction *Member = Group->getMember(I); + // Scalable vectors cannot use arbitrary shufflevectors (only splats), + // so must use intrinsics to deinterleave. + SmallVector DeinterleavedValues(InterleaveFactor); + DeinterleavedValues[0] = NewLoad; + // For the case of InterleaveFactor > 2, we will have to do recursive + // deinterleaving, because the current available deinterleave intrinsic + // supports only Factor of 2, otherwise it will bailout after first + // iteration. + // When deinterleaving, the number of values will double until we + // have "InterleaveFactor". + for (unsigned NumVectors = 1; NumVectors < InterleaveFactor; + NumVectors *= 2) { + // Deinterleave the elements within the vector + SmallVector TempDeinterleavedValues(NumVectors); + for (unsigned I = 0; I < NumVectors; ++I) { + auto *DiTy = DeinterleavedValues[I]->getType(); + TempDeinterleavedValues[I] = State.Builder.CreateIntrinsic( + Intrinsic::vector_deinterleave2, DiTy, DeinterleavedValues[I], + /*FMFSource=*/nullptr, "strided.vec"); + } + // Extract the deinterleaved values: + for (unsigned I = 0; I < 2; ++I) + for (unsigned J = 0; J < NumVectors; ++J) + DeinterleavedValues[NumVectors * I + J] = + State.Builder.CreateExtractValue(TempDeinterleavedValues[J], I); + } - if (!Member) +#ifndef NDEBUG + for (Value *Val : DeinterleavedValues) + assert(Val && "NULL Deinterleaved Value"); +#endif + for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) { + Instruction *Member = Group->getMember(I); + Value *StridedVec = DeinterleavedValues[I]; + if (!Member) { + // This value is not needed as it's not used + static_cast(StridedVec)->eraseFromParent(); continue; - - Value *StridedVec = State.Builder.CreateExtractValue(DI, I); + } // If this member has different type, cast the result type. 
if (Member->getType() != ScalarTy) { VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll index 0924992028378..9b37ba588f5d6 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll @@ -396,8 +396,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP9]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) ; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP10]]) ; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP10]]) ; CHECK-NEXT: [[REVERSE1:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP11]]) ; CHECK-NEXT: [[TMP12:%.*]] = add nsw [[REVERSE]], [[VEC_IND]] ; CHECK-NEXT: [[TMP13:%.*]] = sub nsw [[REVERSE1]], [[VEC_IND]] @@ -1548,5 +1548,263 @@ end: ret void } +; Check vectorization on an interleaved load/store groups of factor 4 + +; for (int i = 0; i < 1024; ++i) { +; dst[i].x = a[i].x + b[i].x; +; dst[i].y = a[i].y - b[i].y; +; dst[i].z = a[i].z << b[i].z; +; dst[i].t = a[i].t >> b[i].t; +; } +%struct.xyzt = type { i32, i32, i32, i32 } + +define void @interleave_deinterleave(ptr writeonly noalias %dst, ptr readonly %a, ptr readonly %b) { +; CHECK-LABEL: @interleave_deinterleave( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], 1024 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: 
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 2 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP6]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP7]]) +; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP8]]) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC7]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC7]], 1 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_VEC8:%.*]] = load , ptr [[TMP13]], align 4 +; CHECK-NEXT: [[STRIDED_VEC9:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i32( [[WIDE_VEC8]]) +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC9]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC9]], 1 +; CHECK-NEXT: [[STRIDED_VEC10:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP14]]) +; CHECK-NEXT: [[STRIDED_VEC11:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP15]]) +; CHECK-NEXT: 
[[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC10]], 0 +; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , } [[STRIDED_VEC11]], 0 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { , } [[STRIDED_VEC10]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { , } [[STRIDED_VEC11]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = add nsw [[TMP16]], [[TMP9]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP22:%.*]] = sub nsw [[TMP10]], [[TMP17]] +; CHECK-NEXT: [[TMP23:%.*]] = shl [[TMP11]], [[TMP18]] +; CHECK-NEXT: [[TMP24:%.*]] = ashr [[TMP12]], [[TMP19]] +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP20]], [[TMP23]]) +; CHECK-NEXT: [[INTERLEAVED_VEC12:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP22]], [[TMP24]]) +; CHECK-NEXT: [[INTERLEAVED_VEC13:%.*]] = call @llvm.vector.interleave2.nxv16i32( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC12]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC13]], ptr [[TMP21]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[B]], i64 
[[INDVARS_IV]] +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP27]], [[TMP26]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[DST]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 4 +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[Y]], align 4 +; CHECK-NEXT: [[Y11:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 4 +; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[Y11]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP28]], [[TMP29]] +; CHECK-NEXT: [[Y14:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 4 +; CHECK-NEXT: store i32 [[SUB]], ptr [[Y14]], align 4 +; CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 8 +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[Z]], align 4 +; CHECK-NEXT: [[Z19:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 8 +; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[Z19]], align 4 +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP30]], [[TMP31]] +; CHECK-NEXT: [[Z22:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 8 +; CHECK-NEXT: store i32 [[SHL]], ptr [[Z22]], align 4 +; CHECK-NEXT: [[T:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 12 +; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[T]], align 4 +; CHECK-NEXT: [[T27:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 12 +; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[T27]], align 4 +; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[TMP32]], [[TMP33]] +; CHECK-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 12 +; CHECK-NEXT: store i32 [[SHR]], ptr [[T30]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], 
label [[FOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds %struct.xyzt, ptr %a, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %indvars.iv + %1 = load i32, ptr %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx5 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %indvars.iv + store i32 %add, ptr %arrayidx5, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i64 4 + %2 = load i32, ptr %y, align 4 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 4 + %3 = load i32, ptr %y11, align 4 + %sub = sub nsw i32 %2, %3 + %y14 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 4 + store i32 %sub, ptr %y14, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i64 8 + %4 = load i32, ptr %z, align 4 + %z19 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 8 + %5 = load i32, ptr %z19, align 4 + %shl = shl i32 %4, %5 + %z22 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 8 + store i32 %shl, ptr %z22, align 4 + %t = getelementptr inbounds nuw i8, ptr %arrayidx, i64 12 + %6 = load i32, ptr %t, align 4 + %t27 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 12 + %7 = load i32, ptr %t27, align 4 + %shr = ashr i32 %6, %7 + %t30 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 12 + store i32 %shr, ptr %t30, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret void +} + +; Check vectorization on a reverse interleaved load/store groups of factor 4 + +; for (int i = 1023; i >= 0; i--) { +; int a = A[i].x + i; +; int b = A[i].y - i; +; int c = A[i].z * i; +; int d = A[i].t << i; +; B[i].x = a; +; B[i].y = b; +; B[i].z = c; +; B[i].t = d; +; } + 
+define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A, ptr noalias nocapture %B) #1{ +; CHECK-LABEL: @interleave_deinterleave_reverse( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.stepvector.nxv4i32() +; CHECK-NEXT: [[INDUCTION:%.*]] = sub splat (i32 1023), [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = sub nsw i32 0, [[TMP3]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP4]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i32 [[TMP6]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = sub nsw i32 4, [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 [[TMP9]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP10]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP11]]) +; CHECK-NEXT: 
[[STRIDED_VEC2:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP12]]) +; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 1 +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 1 +; CHECK-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP13]]) +; CHECK-NEXT: [[REVERSE3:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP14]]) +; CHECK-NEXT: [[REVERSE4:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP15]]) +; CHECK-NEXT: [[REVERSE5:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP16]]) +; CHECK-NEXT: [[TMP17:%.*]] = add nsw [[REVERSE]], [[VEC_IND]] +; CHECK-NEXT: [[TMP18:%.*]] = sub nsw [[REVERSE3]], [[VEC_IND]] +; CHECK-NEXT: [[TMP19:%.*]] = mul nsw [[REVERSE4]], [[VEC_IND]] +; CHECK-NEXT: [[TMP20:%.*]] = shl nuw nsw [[REVERSE5]], [[VEC_IND]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0 +; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 4 +; CHECK-NEXT: [[TMP24:%.*]] = sub nsw i32 4, [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = sext i32 [[TMP24]] to i64 +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP25]] +; CHECK-NEXT: [[REVERSE6:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP17]]) +; CHECK-NEXT: [[REVERSE7:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP18]]) +; CHECK-NEXT: [[REVERSE8:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP19]]) +; CHECK-NEXT: [[REVERSE9:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP20]]) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[REVERSE6]], [[REVERSE8]]) +; CHECK-NEXT: [[INTERLEAVED_VEC10:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[REVERSE7]], [[REVERSE9]]) +; CHECK-NEXT: [[INTERLEAVED_VEC11:%.*]] = call @llvm.vector.interleave2.nxv16i32( 
[[INTERLEAVED_VEC]], [[INTERLEAVED_VEC10]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC11]], ptr [[TMP26]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: br i1 poison, label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP44:![0-9]+]] +; +entry: + br label %for.body +for.cond.cleanup: ; preds = %for.body + ret void +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ] + %x = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 0 + %load1 = load i32, ptr %x, align 4 + %trunc = trunc i64 %indvars.iv to i32 + %add = add nsw i32 %load1, %trunc + %y = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 1 + %load2 = load i32, ptr %y, align 4 + %sub = sub nsw i32 %load2, %trunc + %z = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 2 + %load3 = load i32, ptr %z, align 4 + %mul = mul nsw i32 %load3, %trunc + %t = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 3 + %load4 = load i32, ptr %t, align 4 + %shl = shl nuw nsw i32 %load4, %trunc + %x5 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 0 + store i32 %add, ptr %x5, align 4 + %y8 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 1 + store i32 %sub, ptr %y8, align 4 + %z5 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 2 + store i32 %mul, ptr %z5, align 4 + %t8 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 3 
+ store i32 %shl, ptr %t8, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +} attributes #1 = { "target-features"="+sve" vscale_range(1, 16) } attributes #0 = { "unsafe-fp-math"="true" "target-features"="+sve" vscale_range(1, 16) } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll index 1a281fe7c6f7f..d4392bebdf37b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll @@ -529,3 +529,255 @@ for.inc: for.end: ret void } + +; Expected to contain interleave2/deinterleave2 instructions +; +; void masked_strided_factor4(const unsigned char* restrict p, +; unsigned char* restrict q, +; unsigned char guard) { +; for(ix=0; ix < 1024; ++ix) { +; if (ix > guard) { +; char left1 = p[4*ix]; +; char right1 = p[4*ix + 1]; +; char left2 = p[4*ix + 2]; +; char right2 = p[4*ix + 3]; +; char max1 = max(left1, right1); +; char max2 = max(left2, right2); +; q[4*ix] = max1; +; q[4*ix + 1] = 0 - max1; +; q[4*ix + 2] = max2; +; q[4*ix + 3] = 0 - max2; +; } +; } +;} +define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr #0 { +; SCALAR_TAIL_FOLDING-LABEL: define dso_local void @masked_strided_factor4 +; SCALAR_TAIL_FOLDING-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) local_unnamed_addr #[[ATTR0]] { +; SCALAR_TAIL_FOLDING-NEXT: entry: +; SCALAR_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 
[[TMP1]], 1024 +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALAR_TAIL_FOLDING: vector.ph: +; SCALAR_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]] +; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv16i32() +; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 +; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 +; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALAR_TAIL_FOLDING: vector.body: +; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[INDEX]], 2 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]] +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: 
[[INTERLEAVED_MASK2:%.*]] = call @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK]], [[INTERLEAVED_MASK1]]) +; SCALAR_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv64i8.p0(ptr [[TMP10]], i32 1, [[INTERLEAVED_MASK2]], poison) +; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv64i8( [[WIDE_MASKED_VEC]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC3:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[TMP11]]) +; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC4:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[TMP12]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = call @llvm.smax.nxv16i8( [[TMP13]], [[TMP14]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = sub zeroinitializer, [[TMP17]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call @llvm.smax.nxv16i8( [[TMP15]], [[TMP16]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = sub zeroinitializer, [[TMP19]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = sext i32 [[TMP8]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP21]] +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP17]], [[TMP19]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC5:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP18]], [[TMP20]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC6:%.*]] = call @llvm.vector.interleave2.nxv64i8( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC5]]) +; SCALAR_TAIL_FOLDING-NEXT: 
[[INTERLEAVED_MASK7:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK8:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK9:%.*]] = call @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK7]], [[INTERLEAVED_MASK8]]) +; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0( [[INTERLEAVED_VEC6]], ptr [[TMP22]], i32 1, [[INTERLEAVED_MASK9]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]] +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; SCALAR_TAIL_FOLDING: middle.block: +; SCALAR_TAIL_FOLDING-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0 +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; SCALAR_TAIL_FOLDING: scalar.ph: +; SCALAR_TAIL_FOLDING-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; SCALAR_TAIL_FOLDING-NEXT: br label [[FOR_BODY:%.*]] +; SCALAR_TAIL_FOLDING: for.body: +; SCALAR_TAIL_FOLDING-NEXT: [[IX_024:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[IX_024]], [[CONV]] +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; SCALAR_TAIL_FOLDING: if.then: +; SCALAR_TAIL_FOLDING-NEXT: [[IDX0:%.*]] = shl nuw nsw i32 [[IX_024]], 2 +; SCALAR_TAIL_FOLDING-NEXT: [[IDX1:%.*]] = or disjoint i32 [[IDX0]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[IDX2:%.*]] = or disjoint i32 [[IDX0]], 2 +; SCALAR_TAIL_FOLDING-NEXT: [[IDX3:%.*]] = or disjoint i32 [[IDX0]], 3 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP24:%.*]] = zext nneg i32 [[IDX0]] to i64 +; 
SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP24]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP25:%.*]] = load i8, ptr [[ARRAY1IDX0]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP26:%.*]] = zext nneg i32 [[IDX1]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP26]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP27:%.*]] = load i8, ptr [[ARRAY1IDX1]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP28:%.*]] = zext nneg i32 [[IDX2]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP28]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP29:%.*]] = load i8, ptr [[ARRAY1IDX2]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP30:%.*]] = zext nneg i32 [[IDX3]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP30]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP31:%.*]] = load i8, ptr [[ARRAY1IDX3]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[SPEC_SELECT_I1:%.*]] = call i8 @llvm.smax.i8(i8 [[TMP25]], i8 [[TMP27]]) +; SCALAR_TAIL_FOLDING-NEXT: [[SUB1:%.*]] = sub i8 0, [[SPEC_SELECT_I1]] +; SCALAR_TAIL_FOLDING-NEXT: [[SPEC_SELECT_I2:%.*]] = call i8 @llvm.smax.i8(i8 [[TMP29]], i8 [[TMP31]]) +; SCALAR_TAIL_FOLDING-NEXT: [[SUB2:%.*]] = sub i8 0, [[SPEC_SELECT_I2]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP32:%.*]] = zext nneg i32 [[IDX0]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX0:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP32]] +; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SPEC_SELECT_I1]], ptr [[ARRAY3IDX0]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP33:%.*]] = zext nneg i32 [[IDX1]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP33]] +; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SUB1]], ptr [[ARRAY3IDX1]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP34:%.*]] = zext nneg i32 [[IDX2]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX2:%.*]] = getelementptr inbounds 
nuw i8, ptr [[Q]], i64 [[TMP34]] +; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SPEC_SELECT_I2]], ptr [[ARRAY3IDX2]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP35:%.*]] = zext nneg i32 [[IDX3]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP35]] +; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SUB2]], ptr [[ARRAY3IDX3]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: br label [[FOR_INC]] +; SCALAR_TAIL_FOLDING: for.inc: +; SCALAR_TAIL_FOLDING-NEXT: [[INC]] = add nuw nsw i32 [[IX_024]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1024 +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; SCALAR_TAIL_FOLDING: for.end: +; SCALAR_TAIL_FOLDING-NEXT: ret void +; +; PREDICATED_TAIL_FOLDING-LABEL: define dso_local void @masked_strided_factor4 +; PREDICATED_TAIL_FOLDING-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) local_unnamed_addr #[[ATTR0]] { +; PREDICATED_TAIL_FOLDING-NEXT: entry: +; PREDICATED_TAIL_FOLDING-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; PREDICATED_TAIL_FOLDING: vector.ph: +; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP3]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv16i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP1]], i64 0 +; 
PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] +; PREDICATED_TAIL_FOLDING: vector.body: +; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP5]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[INDEX]], 2 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]] +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK2:%.*]] = call @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK]], [[INTERLEAVED_MASK1]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv64i8.p0(ptr [[TMP10]], i32 1, [[INTERLEAVED_MASK2]], poison) +; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv64i8( [[WIDE_MASKED_VEC]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { , } 
[[STRIDED_VEC]], 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC3:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[TMP11]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC4:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[TMP12]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = call @llvm.smax.nxv16i8( [[TMP13]], [[TMP14]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = sub zeroinitializer, [[TMP17]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call @llvm.smax.nxv16i8( [[TMP15]], [[TMP16]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = sub zeroinitializer, [[TMP19]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = sext i32 [[TMP8]] to i64 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP21]] +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP17]], [[TMP19]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC5:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP18]], [[TMP20]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC6:%.*]] = call @llvm.vector.interleave2.nxv64i8( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC5]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK7:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK8:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK9:%.*]] = call @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK7]], [[INTERLEAVED_MASK8]]) +; 
PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0( [[INTERLEAVED_VEC6]], ptr [[TMP22]], i32 1, [[INTERLEAVED_MASK9]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP23]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP8:![0-9]+]] +; PREDICATED_TAIL_FOLDING: middle.block: +; PREDICATED_TAIL_FOLDING-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; PREDICATED_TAIL_FOLDING: scalar.ph: +; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_BODY:%.*]] +; PREDICATED_TAIL_FOLDING: for.body: +; PREDICATED_TAIL_FOLDING-NEXT: br i1 poison, label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; PREDICATED_TAIL_FOLDING: if.then: +; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_INC]] +; PREDICATED_TAIL_FOLDING: for.inc: +; PREDICATED_TAIL_FOLDING-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; PREDICATED_TAIL_FOLDING: for.end: +; PREDICATED_TAIL_FOLDING-NEXT: ret void +; +entry: + %conv = zext i8 %guard to i32 + br label %for.body + +for.body: + %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp1 = icmp ugt i32 %ix.024, %conv + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %idx0 = shl nuw nsw i32 %ix.024, 2 + %idx1 = add i32 %idx0, 1 + %idx2 = add i32 %idx0, 2 + %idx3 = add i32 %idx0, 3 + + %array1idx0 = getelementptr inbounds i8, ptr %p, i32 %idx0 + %0 = load i8, ptr %array1idx0, align 1 + %array1idx1 = getelementptr inbounds i8, ptr %p, i32 %idx1 + %1 = load i8, ptr %array1idx1, align 1 + %array1idx2 = getelementptr inbounds i8, ptr %p, i32 %idx2 + %2 = load i8, ptr %array1idx2, align 1 + 
%array1idx3 = getelementptr inbounds i8, ptr %p, i32 %idx3 + %3 = load i8, ptr %array1idx3, align 1 + + %cmp.i1 = icmp slt i8 %0, %1 + %spec.select.i1 = select i1 %cmp.i1, i8 %1, i8 %0 + %sub1 = sub i8 0, %spec.select.i1 + %cmp.i2 = icmp slt i8 %2, %3 + %spec.select.i2 = select i1 %cmp.i2, i8 %3, i8 %2 + %sub2 = sub i8 0, %spec.select.i2 + + %array3idx0 = getelementptr inbounds i8, ptr %q, i32 %idx0 + store i8 %spec.select.i1, ptr %array3idx0, align 1 + %array3idx1 = getelementptr inbounds i8, ptr %q, i32 %idx1 + store i8 %sub1, ptr %array3idx1, align 1 + %array3idx2 = getelementptr inbounds i8, ptr %q, i32 %idx2 + store i8 %spec.select.i2, ptr %array3idx2, align 1 + %array3idx3 = getelementptr inbounds i8, ptr %q, i32 %idx3 + store i8 %sub2, ptr %array3idx3, align 1 + + br label %for.inc + +for.inc: + %inc = add nuw nsw i32 %ix.024, 1 + %exitcond = icmp eq i32 %inc, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll index bda4839dead51..b1ff589fe51bf 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll @@ -9,7 +9,7 @@ define void @load_store_factor2_i32(ptr %p) { ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 @@ -17,88 +17,88 @@ define void @load_store_factor2_i32(ptr %p) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 
@llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 4 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i32 1) -; CHECK-NEXT: [[TMP15:%.*]] = add [[TMP11]], splat (i32 2) -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP12]], [[TMP15]]) -; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i32 1) +; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i32 2) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP11]], [[TMP12]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC]], 
ptr [[Q0]], align 4 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; CHECK-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 ; CHECK-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; CHECK-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i32 [[Y0]], ptr [[Q2]], align 4 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; CHECK-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 ; CHECK-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 
[[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor2_i32( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP2]], align 4 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[Q0]], align 4 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) -; FIXED-NEXT: [[TMP7:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) -; FIXED-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP7]], <16 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[TMP9]], <16 x i32> poison, <16 x i32> -; FIXED-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; FIXED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 
[[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) +; FIXED-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <16 x i32> +; FIXED-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; FIXED-NEXT: [[TMP6:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; FIXED-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 ; FIXED-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; FIXED-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i32 [[Y0]], ptr [[Q2]], align 4 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i32, 
ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; FIXED-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 ; FIXED-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP3:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; @@ -107,7 +107,7 @@ define void @load_store_factor2_i32(ptr %p) { ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 @@ -115,44 +115,44 @@ define void @load_store_factor2_i32(ptr %p) { ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 4 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: 
[[TMP6:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 4 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i32 1) -; SCALABLE-NEXT: [[TMP15:%.*]] = add [[TMP11]], splat (i32 2) -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP12]], [[TMP15]]) -; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i32 1) +; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i32 2) +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP11]], [[TMP12]]) +; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; 
SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 ; SCALABLE-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q2]], align 4 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; SCALABLE-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 ; SCALABLE-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP3:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -186,7 +186,7 @@ define void @load_store_factor2_i64(ptr %p) { ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, 
[[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -194,88 +194,88 @@ define void @load_store_factor2_i64(ptr %p) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 8 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i64 1) -; CHECK-NEXT: [[TMP15:%.*]] = add [[TMP11]], splat (i64 2) -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP12]], [[TMP15]]) -; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], 
label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i64 1) +; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i64 2) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP11]], [[TMP12]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; CHECK-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 ; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i64 [[Y0]], ptr [[Q2]], align 8 +; CHECK-NEXT: [[OFFSET1:%.*]] = 
add i64 [[OFFSET2]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 ; CHECK-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor2_i64( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP2]], align 8 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[Q0]], align 8 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) -; FIXED-NEXT: 
[[TMP7:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) -; FIXED-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP7]], <8 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> poison, <8 x i32> -; FIXED-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; FIXED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) +; FIXED-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <8 x i32> +; FIXED-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 4 +; FIXED-NEXT: [[TMP6:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: 
[[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; FIXED-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 ; FIXED-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; FIXED-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i64 [[Y0]], ptr [[Q2]], align 8 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; FIXED-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 ; FIXED-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP5:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; @@ -284,7 +284,7 @@ define void @load_store_factor2_i64(ptr %p) { ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 ; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -292,44 +292,44 @@ define void @load_store_factor2_i64(ptr %p) { ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: 
vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 8 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_VEC]]) -; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i64 1) -; SCALABLE-NEXT: [[TMP15:%.*]] = add [[TMP11]], splat (i64 2) -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP12]], [[TMP15]]) -; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i64 1) +; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i64 2) +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP11]], [[TMP12]]) +; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; 
SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 ; SCALABLE-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q2]], align 8 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; SCALABLE-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 ; SCALABLE-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; SCALABLE-NEXT: 
[[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP5:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -360,42 +360,42 @@ exit: define void @load_store_factor3_i32(ptr %p) { ; CHECK-LABEL: @load_store_factor3_i32( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[Q0]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) -; CHECK-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) -; CHECK-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP6]], <16 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = 
shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <24 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP13]], <24 x i32> poison, <24 x i32> -; CHECK-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) +; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) +; CHECK-NEXT: [[TMP5:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <24 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP8]], <24 x i32> poison, <24 x i32> +; CHECK-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: 
[[OFFSET0:%.*]] = mul i64 [[I]], 3 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; CHECK-NEXT: [[Q3:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET3]] +; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q3]], align 4 ; CHECK-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; CHECK-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i32 [[Y0]], ptr [[Q3]], align 4 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; CHECK-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 @@ -405,50 +405,50 @@ define void @load_store_factor3_i32(ptr %p) { ; CHECK-NEXT: [[X2:%.*]] = load i32, ptr [[Q2]], align 4 ; CHECK-NEXT: [[Y2:%.*]] = add i32 [[X2]], 3 ; CHECK-NEXT: store i32 [[Y2]], ptr [[Q2]], align 4 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor3_i32( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: 
[[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[TMP2]], align 4 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[Q0]], align 4 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) -; FIXED-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) -; FIXED-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) -; FIXED-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP6]], <16 x i32> -; FIXED-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <16 x i32> -; FIXED-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <24 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP13]], <24 x i32> poison, <24 x i32> -; FIXED-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) +; FIXED-NEXT: [[TMP5:%.*]] = add <8 
x i32> [[STRIDED_VEC2]], splat (i32 3) +; FIXED-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> +; FIXED-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> +; FIXED-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <24 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP8]], <24 x i32> poison, <24 x i32> +; FIXED-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; FIXED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; FIXED-NEXT: [[Q3:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET3]] +; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q3]], align 4 ; FIXED-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; FIXED-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i32 [[Y0]], ptr [[Q3]], align 4 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 
[[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; FIXED-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 @@ -458,50 +458,50 @@ define void @load_store_factor3_i32(ptr %p) { ; FIXED-NEXT: [[X2:%.*]] = load i32, ptr [[Q2]], align 4 ; FIXED-NEXT: [[Y2:%.*]] = add i32 [[X2]], 3 ; FIXED-NEXT: store i32 [[Y2]], ptr [[Q2]], align 4 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP7:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; ; SCALABLE-LABEL: @load_store_factor3_i32( ; SCALABLE-NEXT: entry: -; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; SCALABLE-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[TMP2]], align 4 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[Q0]], align 4 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector 
<24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> -; SCALABLE-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) -; SCALABLE-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) -; SCALABLE-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) -; SCALABLE-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP6]], <16 x i32> -; SCALABLE-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <16 x i32> -; SCALABLE-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <24 x i32> -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP13]], <24 x i32> poison, <24 x i32> -; SCALABLE-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; SCALABLE-NEXT: [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) +; SCALABLE-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) +; SCALABLE-NEXT: [[TMP5:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) +; SCALABLE-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> +; SCALABLE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> +; SCALABLE-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <24 x i32> +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP8]], <24 x i32> poison, <24 x i32> +; SCALABLE-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; SCALABLE-NEXT: br i1 [[TMP9]], label 
[[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; SCALABLE-NEXT: [[Q3:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET3]] +; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q3]], align 4 ; SCALABLE-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q3]], align 4 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; SCALABLE-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 @@ -511,9 +511,9 @@ define void @load_store_factor3_i32(ptr %p) { ; SCALABLE-NEXT: [[X2:%.*]] = load i32, ptr [[Q2]], align 4 ; SCALABLE-NEXT: [[Y2:%.*]] = add i32 [[X2]], 3 ; SCALABLE-NEXT: store i32 [[Y2]], ptr [[Q2]], align 4 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] 
= add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP7:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -550,42 +550,42 @@ exit: define void @load_store_factor3_i64(ptr %p) { ; CHECK-LABEL: @load_store_factor3_i64( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[Q0]], align 8 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) -; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) -; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP6]], <8 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x 
i64> [[TMP9]], <4 x i64> poison, <8 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> -; CHECK-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) +; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <12 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP8]], <12 x i64> poison, <12 x i32> +; CHECK-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], 4 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; 
CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; CHECK-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q3]], align 8 ; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i64 [[Y0]], ptr [[Q3]], align 8 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -595,50 +595,50 @@ define void @load_store_factor3_i64(ptr %p) { ; CHECK-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8 ; CHECK-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3 ; CHECK-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor3_i64( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; 
FIXED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP2]], align 8 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[Q0]], align 8 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) -; FIXED-NEXT: [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) -; FIXED-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) -; FIXED-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP6]], <8 x i32> -; FIXED-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <8 x i32> -; FIXED-NEXT: [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> -; FIXED-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) +; FIXED-NEXT: [[TMP5:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) +; 
FIXED-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> +; FIXED-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> +; FIXED-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <12 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP8]], <12 x i64> poison, <12 x i32> +; FIXED-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 4 +; FIXED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; FIXED-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]] +; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q3]], align 8 ; FIXED-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; FIXED-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i64 [[Y0]], ptr [[Q3]], align 8 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load 
i64, ptr [[Q1]], align 8 ; FIXED-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -648,50 +648,50 @@ define void @load_store_factor3_i64(ptr %p) { ; FIXED-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8 ; FIXED-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3 ; FIXED-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP9:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; ; SCALABLE-LABEL: @load_store_factor3_i64( ; SCALABLE-NEXT: entry: -; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; SCALABLE-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP2]], align 8 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[Q0]], align 8 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, 
<4 x i32> ; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> -; SCALABLE-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) -; SCALABLE-NEXT: [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) -; SCALABLE-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) -; SCALABLE-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP6]], <8 x i32> -; SCALABLE-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <8 x i32> -; SCALABLE-NEXT: [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> -; SCALABLE-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; SCALABLE-NEXT: [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) +; SCALABLE-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) +; SCALABLE-NEXT: [[TMP5:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) +; SCALABLE-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> +; SCALABLE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> +; SCALABLE-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <12 x i32> +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP8]], <12 x i64> poison, <12 x i32> +; SCALABLE-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], 4 +; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; SCALABLE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop 
[[LOOP8:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; SCALABLE-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]] +; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q3]], align 8 ; SCALABLE-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q3]], align 8 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; SCALABLE-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -701,9 +701,9 @@ define void @load_store_factor3_i64(ptr %p) { ; SCALABLE-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8 ; SCALABLE-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3 ; SCALABLE-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: 
[[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP9:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -740,56 +740,75 @@ exit: define void @load_store_factor8(ptr %p) { ; CHECK-LABEL: @load_store_factor8( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP2]], align 8 -; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector 
<16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1) -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2) -; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3) -; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4) -; CHECK-NEXT: [[TMP12:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5) -; CHECK-NEXT: [[TMP14:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6) -; CHECK-NEXT: [[TMP16:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7) -; CHECK-NEXT: [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8) -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <4 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP16]], <2 x i64> [[TMP19]], <4 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i64> [[TMP21]], <4 x i64> [[TMP22]], <8 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> [[TMP24]], <8 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <16 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP27]], <16 x i64> poison, <16 x i32> -; CHECK-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[I]], 0 +; 
CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP3]], 3 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i64( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[TMP6]]) +; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[TMP7]]) +; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 1 +; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP8]]) +; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP9]]) +; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP10]]) +; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP11]]) +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC5]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 1 +; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { , } [[STRIDED_VEC5]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = add [[TMP12]], splat (i64 1) +; CHECK-NEXT: [[TMP21:%.*]] = add [[TMP13]], splat (i64 2) +; CHECK-NEXT: [[TMP22:%.*]] = add [[TMP14]], 
splat (i64 3) +; CHECK-NEXT: [[TMP23:%.*]] = add [[TMP15]], splat (i64 4) +; CHECK-NEXT: [[TMP24:%.*]] = add [[TMP16]], splat (i64 5) +; CHECK-NEXT: [[TMP25:%.*]] = add [[TMP17]], splat (i64 6) +; CHECK-NEXT: [[TMP26:%.*]] = add [[TMP18]], splat (i64 7) +; CHECK-NEXT: [[TMP27:%.*]] = add [[TMP19]], splat (i64 8) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP20]], [[TMP24]]) +; CHECK-NEXT: [[INTERLEAVED_VEC7:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP21]], [[TMP25]]) +; CHECK-NEXT: [[INTERLEAVED_VEC8:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP22]], [[TMP26]]) +; CHECK-NEXT: [[INTERLEAVED_VEC9:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP23]], [[TMP27]]) +; CHECK-NEXT: [[INTERLEAVED_VEC10:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC8]]) +; CHECK-NEXT: [[INTERLEAVED_VEC11:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC7]], [[INTERLEAVED_VEC9]]) +; CHECK-NEXT: [[INTERLEAVED_VEC12:%.*]] = call @llvm.vector.interleave2.nxv8i64( [[INTERLEAVED_VEC10]], [[INTERLEAVED_VEC11]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC12]], ptr [[Q0]], align 8 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP2]] +; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ 
[[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 3 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET8:%.*]] = shl i64 [[I1]], 3 +; CHECK-NEXT: [[Q8:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET8]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q8]], align 8 ; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i64 [[Y0]], ptr [[Q8]], align 8 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET8]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -824,23 +843,23 @@ define void @load_store_factor8(ptr %p) { ; CHECK-NEXT: [[X7:%.*]] = load i64, ptr [[Q7]], align 8 ; CHECK-NEXT: [[Y7:%.*]] = add i64 [[X7]], 8 ; CHECK-NEXT: store i64 [[Y7]], ptr [[Q7]], align 8 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor8( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], 
[[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP2]], align 8 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 3 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[Q0]], align 8 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> ; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> @@ -849,39 +868,39 @@ define void @load_store_factor8(ptr %p) { ; FIXED-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> ; FIXED-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> ; FIXED-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1) -; FIXED-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2) -; FIXED-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3) -; FIXED-NEXT: [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4) -; FIXED-NEXT: [[TMP12:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5) -; FIXED-NEXT: [[TMP14:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6) -; FIXED-NEXT: [[TMP16:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7) -; FIXED-NEXT: [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8) -; FIXED-NEXT: [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> 
-; FIXED-NEXT: [[TMP22:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <4 x i32> -; FIXED-NEXT: [[TMP23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> -; FIXED-NEXT: [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP16]], <2 x i64> [[TMP19]], <4 x i32> -; FIXED-NEXT: [[TMP25:%.*]] = shufflevector <4 x i64> [[TMP21]], <4 x i64> [[TMP22]], <8 x i32> -; FIXED-NEXT: [[TMP26:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> [[TMP24]], <8 x i32> -; FIXED-NEXT: [[TMP27:%.*]] = shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <16 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP27]], <16 x i64> poison, <16 x i32> -; FIXED-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; FIXED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2) +; FIXED-NEXT: [[TMP5:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3) +; FIXED-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4) +; FIXED-NEXT: [[TMP7:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5) +; FIXED-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6) +; FIXED-NEXT: [[TMP9:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7) +; FIXED-NEXT: [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8) +; FIXED-NEXT: [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <4 x i32> +; FIXED-NEXT: [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; FIXED-NEXT: [[TMP13:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> [[TMP8]], <4 x i32> +; FIXED-NEXT: [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <4 x i32> +; 
FIXED-NEXT: [[TMP15:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> [[TMP12]], <8 x i32> +; FIXED-NEXT: [[TMP16:%.*]] = shufflevector <4 x i64> [[TMP13]], <4 x i64> [[TMP14]], <8 x i32> +; FIXED-NEXT: [[TMP17:%.*]] = shufflevector <8 x i64> [[TMP15]], <8 x i64> [[TMP16]], <16 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP17]], <16 x i64> poison, <16 x i32> +; FIXED-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 2 +; FIXED-NEXT: [[TMP18:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 3 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET8:%.*]] = shl i64 [[I1]], 3 +; FIXED-NEXT: [[Q8:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET8]] +; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q8]], align 8 ; FIXED-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; FIXED-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i64 [[Y0]], ptr [[Q8]], align 8 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET8]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: 
[[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; FIXED-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -916,64 +935,83 @@ define void @load_store_factor8(ptr %p) { ; FIXED-NEXT: [[X7:%.*]] = load i64, ptr [[Q7]], align 8 ; FIXED-NEXT: [[Y7:%.*]] = add i64 [[X7]], 8 ; FIXED-NEXT: store i64 [[Y7]], ptr [[Q7]], align 8 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP11:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; ; SCALABLE-LABEL: @load_store_factor8( ; SCALABLE-NEXT: entry: -; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3 -; SCALABLE-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP2]], align 8 -; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] 
= shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1) -; SCALABLE-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2) -; SCALABLE-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3) -; SCALABLE-NEXT: [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4) -; SCALABLE-NEXT: [[TMP12:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5) -; SCALABLE-NEXT: [[TMP14:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6) -; SCALABLE-NEXT: [[TMP16:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7) -; SCALABLE-NEXT: [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8) -; SCALABLE-NEXT: [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> -; SCALABLE-NEXT: [[TMP22:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <4 x i32> -; SCALABLE-NEXT: [[TMP23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> -; SCALABLE-NEXT: [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP16]], <2 x i64> [[TMP19]], <4 x i32> -; SCALABLE-NEXT: [[TMP25:%.*]] = shufflevector <4 x i64> [[TMP21]], <4 x i64> [[TMP22]], <8 x i32> -; SCALABLE-NEXT: [[TMP26:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> [[TMP24]], <8 x i32> -; SCALABLE-NEXT: [[TMP27:%.*]] = 
shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <16 x i32> -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP27]], <16 x i64> poison, <16 x i32> -; SCALABLE-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; SCALABLE-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; SCALABLE-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP3:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP3]], 3 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i64( [[WIDE_VEC]]) +; SCALABLE-NEXT: [[TMP6:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[TMP6]]) +; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[TMP7]]) +; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 0 +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 1 +; SCALABLE-NEXT: [[STRIDED_VEC3:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP8]]) +; SCALABLE-NEXT: [[STRIDED_VEC4:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP9]]) +; SCALABLE-NEXT: [[STRIDED_VEC5:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP10]]) +; SCALABLE-NEXT: [[STRIDED_VEC6:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP11]]) +; SCALABLE-NEXT: [[TMP12:%.*]] = 
extractvalue { , } [[STRIDED_VEC3]], 0 +; SCALABLE-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 0 +; SCALABLE-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC5]], 0 +; SCALABLE-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 0 +; SCALABLE-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 1 +; SCALABLE-NEXT: [[TMP17:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 +; SCALABLE-NEXT: [[TMP18:%.*]] = extractvalue { , } [[STRIDED_VEC5]], 1 +; SCALABLE-NEXT: [[TMP19:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 1 +; SCALABLE-NEXT: [[TMP20:%.*]] = add [[TMP12]], splat (i64 1) +; SCALABLE-NEXT: [[TMP21:%.*]] = add [[TMP13]], splat (i64 2) +; SCALABLE-NEXT: [[TMP22:%.*]] = add [[TMP14]], splat (i64 3) +; SCALABLE-NEXT: [[TMP23:%.*]] = add [[TMP15]], splat (i64 4) +; SCALABLE-NEXT: [[TMP24:%.*]] = add [[TMP16]], splat (i64 5) +; SCALABLE-NEXT: [[TMP25:%.*]] = add [[TMP17]], splat (i64 6) +; SCALABLE-NEXT: [[TMP26:%.*]] = add [[TMP18]], splat (i64 7) +; SCALABLE-NEXT: [[TMP27:%.*]] = add [[TMP19]], splat (i64 8) +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP20]], [[TMP24]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC7:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP21]], [[TMP25]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC8:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP22]], [[TMP26]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC9:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP23]], [[TMP27]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC10:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC8]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC11:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC7]], [[INTERLEAVED_VEC9]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC12:%.*]] = call @llvm.vector.interleave2.nxv8i64( [[INTERLEAVED_VEC10]], [[INTERLEAVED_VEC11]]) +; SCALABLE-NEXT: store [[INTERLEAVED_VEC12]], ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP2]] +; 
SCALABLE-NEXT: [[TMP28:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] ; SCALABLE: middle.block: -; SCALABLE-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 3 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET8:%.*]] = shl i64 [[I1]], 3 +; SCALABLE-NEXT: [[Q8:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET8]] +; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q8]], align 8 ; SCALABLE-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q8]], align 8 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET8]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; SCALABLE-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -1008,9 +1046,9 @@ define void @load_store_factor8(ptr %p) { ; SCALABLE-NEXT: [[X7:%.*]] = load i64, ptr [[Q7]], align 8 ; SCALABLE-NEXT: [[Y7:%.*]] = add i64 [[X7]], 8 ; SCALABLE-NEXT: store i64 [[Y7]], ptr 
[[Q7]], align 8 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP11:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -1080,7 +1118,7 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 @@ -1088,94 +1126,94 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 4 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 
[[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0 -; CHECK-NEXT: store [[TMP12]], ptr [[TMP14]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0 +; CHECK-NEXT: store [[TMP11]], ptr [[TMP13]], align 4 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] 
] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; CHECK-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; CHECK-NEXT: [[RES:%.*]] = add i32 [[X0]], [[X1]] -; CHECK-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I]] +; CHECK-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I1]] ; CHECK-NEXT: store i32 [[RES]], ptr [[DST]], align 4 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @combine_load_factor2_i32( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 -; FIXED-NEXT: [[TMP2:%.*]] = shl i64 [[TMP0]], 1 +; FIXED-NEXT: 
[[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[I]], 8 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1 ; FIXED-NEXT: [[TMP3:%.*]] = shl i64 [[TMP1]], 1 -; FIXED-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP2]] +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] ; FIXED-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP3]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP4]], align 4 +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[Q0]], align 4 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[WIDE_VEC1:%.*]] = load <16 x i32>, ptr [[TMP5]], align 4 -; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[TMP8:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC3]] -; FIXED-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], [[STRIDED_VEC4]] -; FIXED-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP0]] -; FIXED-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0 -; FIXED-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP10]], i32 8 -; FIXED-NEXT: store <8 x i32> [[TMP8]], ptr [[TMP12]], align 4 -; FIXED-NEXT: store <8 x i32> [[TMP9]], ptr [[TMP13]], align 4 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> +; FIXED-NEXT: 
[[WIDE_VEC2:%.*]] = load <16 x i32>, ptr [[TMP5]], align 4 +; FIXED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> +; FIXED-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> +; FIXED-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC1]] +; FIXED-NEXT: [[TMP7:%.*]] = add <8 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC4]] +; FIXED-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP0]] +; FIXED-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0 +; FIXED-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[TMP8]], i32 8 +; FIXED-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP9]], align 4 +; FIXED-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP10]], align 4 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 16 +; FIXED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; FIXED-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], 
align 4 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; FIXED-NEXT: [[RES:%.*]] = add i32 [[X0]], [[X1]] -; FIXED-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I]] +; FIXED-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I1]] ; FIXED-NEXT: store i32 [[RES]], ptr [[DST]], align 4 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP13:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; @@ -1184,7 +1222,7 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 @@ -1192,43 +1230,43 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 
[[INDEX]], 0 -; SCALABLE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 4 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 4 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], [[TMP11]] -; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]] -; SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0 -; SCALABLE-NEXT: store [[TMP12]], ptr [[TMP14]], align 4 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = add [[TMP9]], [[TMP10]] +; SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]] +; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0 +; SCALABLE-NEXT: store [[TMP11]], ptr [[TMP13]], align 4 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] ; SCALABLE: middle.block: ; 
SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; SCALABLE-NEXT: [[RES:%.*]] = add i32 [[X0]], [[X1]] -; SCALABLE-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I]] +; SCALABLE-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I1]] ; SCALABLE-NEXT: store i32 [[RES]], ptr [[DST]], align 4 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP13:![0-9]+]] ; SCALABLE: exit: ; 
SCALABLE-NEXT: ret void ; @@ -1263,7 +1301,7 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -1271,94 +1309,94 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 8 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]] -; 
CHECK-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[TMP13]], i32 0 -; CHECK-NEXT: store [[TMP12]], ptr [[TMP14]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0 +; CHECK-NEXT: store [[TMP11]], ptr [[TMP13]], align 8 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; CHECK-NEXT: [[Q2:%.*]] = getelementptr i64, ptr 
[[P]], i64 [[OFFSET2]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[RES:%.*]] = add i64 [[X0]], [[X1]] -; CHECK-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]] +; CHECK-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I1]] ; CHECK-NEXT: store i64 [[RES]], ptr [[DST]], align 8 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @combine_load_factor2_i64( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 -; FIXED-NEXT: [[TMP2:%.*]] = shl i64 [[TMP0]], 1 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[I]], 4 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1 ; FIXED-NEXT: [[TMP3:%.*]] = shl i64 [[TMP1]], 1 -; FIXED-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP2]] +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] ; FIXED-NEXT: [[TMP5:%.*]] = 
getelementptr i64, ptr [[P]], i64 [[TMP3]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8 +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[Q0]], align 8 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8 -; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[TMP8:%.*]] = add <4 x i64> [[STRIDED_VEC]], [[STRIDED_VEC3]] -; FIXED-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], [[STRIDED_VEC4]] -; FIXED-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP0]] -; FIXED-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[TMP10]], i32 0 -; FIXED-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP10]], i32 4 -; FIXED-NEXT: store <4 x i64> [[TMP8]], ptr [[TMP12]], align 8 -; FIXED-NEXT: store <4 x i64> [[TMP9]], ptr [[TMP13]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> +; FIXED-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8 +; FIXED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i64> [[WIDE_VEC2]], <8 x i64> poison, <4 x i32> +; FIXED-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i64> [[WIDE_VEC2]], <8 x i64> poison, <4 x i32> +; FIXED-NEXT: [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC]], [[STRIDED_VEC1]] +; FIXED-NEXT: [[TMP7:%.*]] = add <4 x i64> [[STRIDED_VEC3]], [[STRIDED_VEC4]] +; FIXED-NEXT: [[TMP8:%.*]] = 
getelementptr i64, ptr [[Q:%.*]], i64 [[TMP0]] +; FIXED-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i32 0 +; FIXED-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[TMP8]], i32 4 +; FIXED-NEXT: store <4 x i64> [[TMP6]], ptr [[TMP9]], align 8 +; FIXED-NEXT: store <4 x i64> [[TMP7]], ptr [[TMP10]], align 8 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; FIXED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; FIXED-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; FIXED-NEXT: [[RES:%.*]] = add i64 [[X0]], [[X1]] -; FIXED-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]] +; FIXED-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I1]] ; FIXED-NEXT: store i64 [[RES]], ptr [[DST]], align 8 -; FIXED-NEXT: [[NEXTI]] 
= add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP15:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; @@ -1367,7 +1405,7 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 ; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -1375,43 +1413,43 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 8 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], 
i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_VEC]]) -; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], [[TMP11]] -; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]] -; SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[TMP13]], i32 0 -; SCALABLE-NEXT: store [[TMP12]], ptr [[TMP14]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = add [[TMP9]], [[TMP10]] +; SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]] +; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0 +; SCALABLE-NEXT: store [[TMP11]], ptr [[TMP13]], align 8 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; 
SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; SCALABLE-NEXT: [[RES:%.*]] = add i64 [[X0]], [[X1]] -; SCALABLE-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]] +; SCALABLE-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I1]] ; SCALABLE-NEXT: store i64 [[RES]], ptr [[DST]], align 8 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP15:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll new file mode 100644 index 0000000000000..b400b27df0839 --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll @@ -0,0 +1,135 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt 
-passes=loop-vectorize,interleaved-access -mattr=+sve -S -o - %s | FileCheck %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "aarch64" + +%struct.xyzt = type { i32, i32, i32, i32 } +; for (int i = 0; i < 1024; ++i) { +; dst[i].x = a[i].x + b[i].x; +; dst[i].y = a[i].y - b[i].y; +; dst[i].z = a[i].z << b[i].z; +; dst[i].t = a[i].t >> b[i].t; +; } + +define void @interleave_deinterleave(ptr noalias %dst, ptr %a, ptr %b) { +; CHECK-LABEL: @interleave_deinterleave( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[LDN:%.*]] = call { , , , } @llvm.aarch64.sve.ld4.sret.nxv4i32( splat (i1 true), ptr [[TMP7]]) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[LDN]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[LDN]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[LDN]], 2 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , , , } [[LDN]], 3 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[LDN9:%.*]] = call { , , , } 
@llvm.aarch64.sve.ld4.sret.nxv4i32( splat (i1 true), ptr [[TMP13]]) +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , , , } [[LDN9]], 0 +; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , , , } [[LDN9]], 1 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { , , , } [[LDN9]], 2 +; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { , , , } [[LDN9]], 3 +; CHECK-NEXT: [[TMP20:%.*]] = add nsw [[TMP16]], [[TMP9]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP22:%.*]] = sub nsw [[TMP10]], [[TMP17]] +; CHECK-NEXT: [[TMP23:%.*]] = shl [[TMP11]], [[TMP18]] +; CHECK-NEXT: [[TMP24:%.*]] = ashr [[TMP12]], [[TMP19]] +; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP20]], [[TMP22]], [[TMP23]], [[TMP24]], splat (i1 true), ptr [[TMP21]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP32]], [[TMP31]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds 
[[STRUCT_XYZT]], ptr [[DST]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 4 +; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[Y]], align 4 +; CHECK-NEXT: [[Y11:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 4 +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[Y11]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP33]], [[TMP26]] +; CHECK-NEXT: [[Y14:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 4 +; CHECK-NEXT: store i32 [[SUB]], ptr [[Y14]], align 4 +; CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 8 +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[Z]], align 4 +; CHECK-NEXT: [[Z19:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 8 +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[Z19]], align 4 +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP27]], [[TMP28]] +; CHECK-NEXT: [[Z22:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 8 +; CHECK-NEXT: store i32 [[SHL]], ptr [[Z22]], align 4 +; CHECK-NEXT: [[T:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 12 +; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[T]], align 4 +; CHECK-NEXT: [[T27:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 12 +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[T27]], align 4 +; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[TMP29]], [[TMP30]] +; CHECK-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 12 +; CHECK-NEXT: store i32 [[SHR]], ptr [[T30]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + 
%arrayidx = getelementptr inbounds %struct.xyzt, ptr %a, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %indvars.iv + %1 = load i32, ptr %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx5 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %indvars.iv + store i32 %add, ptr %arrayidx5, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i64 4 + %2 = load i32, ptr %y, align 4 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 4 + %3 = load i32, ptr %y11, align 4 + %sub = sub nsw i32 %2, %3 + %y14 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 4 + store i32 %sub, ptr %y14, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i64 8 + %4 = load i32, ptr %z, align 4 + %z19 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 8 + %5 = load i32, ptr %z19, align 4 + %shl = shl i32 %4, %5 + %z22 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 8 + store i32 %shl, ptr %z22, align 4 + %t = getelementptr inbounds nuw i8, ptr %arrayidx, i64 12 + %6 = load i32, ptr %t, align 4 + %t27 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 12 + %7 = load i32, ptr %t27, align 4 + %shr = ashr i32 %6, %7 + %t30 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 12 + store i32 %shr, ptr %t30, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret void +} From 5ad4213ef48253a6be1f9880f17555fc36efdd19 Mon Sep 17 00:00:00 2001 From: Kunwar Grover Date: Fri, 27 Dec 2024 13:19:58 +0000 Subject: [PATCH 113/567] [mlir][Linalg] Allow PartialReductionOpInterface ops in tile_reduction_using_for (#120118) The API used internally expects PartialReductionOpInterface. This patch allows any operation implementing this interface to use this transform op (instead of just LinalgOp). 
--- .../Dialect/Linalg/TransformOps/LinalgTransformOps.td | 6 +++--- .../Linalg/TransformOps/LinalgTransformOps.cpp | 11 +++++++++-- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td index 2e713bca24efc..081bf9b6d3b23 100644 --- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td +++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td @@ -1765,8 +1765,8 @@ def TileReductionUsingForOp : Op:$tile_sizes); let results = (outs Variadic:$fill_op, - TransformHandleTypeInterface:$split_linalg_op, - TransformHandleTypeInterface:$combining_linalg_op, + TransformHandleTypeInterface:$split_op, + TransformHandleTypeInterface:$combining_op, TransformHandleTypeInterface:$for_op); let builders = [ @@ -1784,7 +1784,7 @@ def TileReductionUsingForOp : Op(target); + if (!partialReductionOp) { + return emitSilenceableFailure( + target->getLoc(), + "Operation should implement PartialReductionOpInterface"); + } FailureOr result = scf::tileReductionUsingScf( - rewriter, cast(target.getOperation()), + rewriter, partialReductionOp, getAsOpFoldResult(rewriter.getI64ArrayAttr(getTileSizes()))); if (failed(result)) From 9ab16d49c99966f33900d68ed5370f19927ca52c Mon Sep 17 00:00:00 2001 From: Ivan Butygin Date: Fri, 27 Dec 2024 14:50:21 +0100 Subject: [PATCH 114/567] [mlir][IntRangeInference] Fix `arith.ceildivsi` range inference when it includes `INT_MIN` (#121062) There is a special case in `arith.ceildivsi` range inference for handling `lhs.smin()==INT_MIN`, but when `lhs` is not a single value, it can cause it to skip entire negative range. Add `lhs.smin() + 1` check to handle it. 
--- mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp | 10 +++++++++- mlir/test/Dialect/Arith/int-range-interface.mlir | 12 ++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp b/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp index 7a73a94201f1d..1eab4139488bd 100644 --- a/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp +++ b/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp @@ -386,7 +386,15 @@ mlir::intrange::inferCeilDivS(ArrayRef argRanges) { } return result; }; - return inferDivSRange(lhs, rhs, ceilDivSIFix); + ConstantIntRanges result = inferDivSRange(lhs, rhs, ceilDivSIFix); + if (lhs.smin().isMinSignedValue() && lhs.smax().sgt(lhs.smin())) { + // If lhs range includes INT_MIN and lhs is not a single value, we can + // suddenly wrap to positive val, skipping entire negative range, add + // [INT_MIN + 1, smax()] range to the result to handle this. + auto newLhs = ConstantIntRanges::fromSigned(lhs.smin() + 1, lhs.smax()); + result = result.rangeUnion(inferDivSRange(newLhs, rhs, ceilDivSIFix)); + } + return result; } ConstantIntRanges diff --git a/mlir/test/Dialect/Arith/int-range-interface.mlir b/mlir/test/Dialect/Arith/int-range-interface.mlir index 48a3eb20eb7fb..090af3e79f4a1 100644 --- a/mlir/test/Dialect/Arith/int-range-interface.mlir +++ b/mlir/test/Dialect/Arith/int-range-interface.mlir @@ -249,6 +249,18 @@ func.func @ceil_divsi(%arg0 : index) -> i1 { func.return %10 : i1 } +// There was a bug, which was causing this expr errorneously fold to constant +// CHECK-LABEL: func @ceil_divsi_full_range +// CHECK-SAME: (%[[arg:.*]]: index) +// CHECK: %[[c64:.*]] = arith.constant 64 : index +// CHECK: %[[ret:.*]] = arith.ceildivsi %[[arg]], %[[c64]] : index +// CHECK: return %[[ret]] +func.func @ceil_divsi_full_range(%6: index) -> index { + %c64 = arith.constant 64 : index + %55 = arith.ceildivsi %6, %c64 : index + return %55 : index +} + // CHECK-LABEL: func 
@ceil_divsi_intmin_bug_115293 // CHECK: %[[ret:.*]] = arith.constant true // CHECK: return %[[ret]] From 07ba4575250b692b28d0fd5105e028b9f4c8e07f Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 27 Dec 2024 07:55:30 -0800 Subject: [PATCH 115/567] [SLP][NFC]Add dump of combined entries, where applicable --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index e9fc89fa242a7..7f4c3d44b0ec4 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -3311,7 +3311,7 @@ class BoUpSLP { /// For gather/buildvector/alt opcode (TODO) nodes, which are combined from /// other nodes as a series of insertvector instructions. - SmallVector, 0> CombinedEntriesWithIndices; + SmallVector, 2> CombinedEntriesWithIndices; private: /// The operands of each instruction in each lane Operands[op_index][lane]. @@ -3545,6 +3545,13 @@ class BoUpSLP { for (const auto &EInfo : UserTreeIndices) dbgs() << EInfo << ", "; dbgs() << "\n"; + if (!CombinedEntriesWithIndices.empty()) { + dbgs() << "Combined entries: "; + interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) { + dbgs() << "Entry index " << P.first << " with offset " << P.second; + }); + dbgs() << "\n"; + } } #endif }; From 91bbebc7e118cceae1fc0e349de08094a3cd2fe7 Mon Sep 17 00:00:00 2001 From: Kunwar Grover Date: Fri, 27 Dec 2024 16:52:34 +0000 Subject: [PATCH 116/567] [mlir][scf] Add getPartialResultTilePosition to PartialReductionOpInterface (#120465) This PR adds a new interface method to PartialReductionOpInterface which allows it to query the result tile position for the partial result. Previously, tiling the reduction dimension with SplitReductionOuterReduction when the result has transposed parallel dimensions would produce wrong results. 
Other fixes that were needed to make this PR work: - Instead of ad-hoc logic to decide where to place the new reduction dimensions in the partial result based on the iteration space, the reduction dimensions are always appended to the partial result tensor. - Remove usage of PartialReductionOpInterface in Mesh dialect. The implementation was trying to just get a neutral element, but ended up trying to use PartialReductionOpInterface for it, which is not right. It was also passing the wrong sizes to it. --- .../mlir/Interfaces/TilingInterface.td | 22 +++ .../Transforms/MeshShardingInterfaceImpl.cpp | 34 ++-- .../Linalg/Transforms/TilingInterfaceImpl.cpp | 165 ++++++++++++------ .../SCF/Transforms/TileUsingInterface.cpp | 28 +-- .../Linalg/transform-tile-reduction.mlir | 67 +++++-- 5 files changed, 225 insertions(+), 91 deletions(-) diff --git a/mlir/include/mlir/Interfaces/TilingInterface.td b/mlir/include/mlir/Interfaces/TilingInterface.td index b75fc5e806afb..50b69b8f8d833 100644 --- a/mlir/include/mlir/Interfaces/TilingInterface.td +++ b/mlir/include/mlir/Interfaces/TilingInterface.td @@ -427,6 +427,28 @@ def PartialReductionOpInterface : OpInterface<"PartialReductionOpInterface"> { /*defaultImplementation=*/[{ return failure(); }] + >, + InterfaceMethod< + /*desc=*/[{ + Method to return the position of the partial result tile computed by + the tiled operation. This is same as + TilingInterface:::getResultTilePosition, but determines the result + tile position for partial reduction. 
+ }], + /*retType=*/"::llvm::LogicalResult", + /*methodName=*/"getPartialResultTilePosition", + /*args=*/(ins + "::mlir::OpBuilder &":$b, + "unsigned":$resultNumber, + "::mlir::ArrayRef<::mlir::OpFoldResult> ":$offsets, + "::mlir::ArrayRef<::mlir::OpFoldResult> ":$sizes, + "::mlir::SmallVector<::mlir::OpFoldResult> &":$resultOffsets, + "::mlir::SmallVector<::mlir::OpFoldResult> &":$resultSizes, + "::mlir::ArrayRef":$reductionDims), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + return failure(); + }] > ]; } diff --git a/mlir/lib/Dialect/Linalg/Transforms/MeshShardingInterfaceImpl.cpp b/mlir/lib/Dialect/Linalg/Transforms/MeshShardingInterfaceImpl.cpp index 5bf2f91c2c7bc..92cfba2549a3f 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/MeshShardingInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/MeshShardingInterfaceImpl.cpp @@ -105,13 +105,13 @@ static ReductionKind getReductionKindOfLinalgOp(LinalgOp op) { static MeshOp getMesh(Operation *op, ArrayRef operandShardings, ArrayRef resultShardings, SymbolTableCollection &symbolTable) { - for (const MeshSharding& sharding : operandShardings) { + for (const MeshSharding &sharding : operandShardings) { if (sharding) { return mesh::getMesh(op, sharding.getMeshAttr(), symbolTable); } } - for (const MeshSharding& sharding : resultShardings) { + for (const MeshSharding &sharding : resultShardings) { if (sharding) { return mesh::getMesh(op, sharding.getMeshAttr(), symbolTable); } @@ -129,8 +129,9 @@ static MeshOp getMesh(Operation *op, ArrayRef operandShardings, // the original operand. // The other processes would use the reduction operation neutral tensor. 
static Value createDestinationPassingStyleInitOperand( - LinalgOp op, Value spmdizedOperand, ArrayRef reductionMeshAxes, - MeshOp meshOp, ImplicitLocOpBuilder &builder) { + LinalgOp op, int operandNumber, Value spmdizedOperand, + ArrayRef reductionMeshAxes, MeshOp meshOp, + ImplicitLocOpBuilder &builder) { Value processLinearIndexInReductionGroup = mesh::createProcessLinearIndex( meshOp.getSymName(), reductionMeshAxes, builder); Value zero = builder.create(0); @@ -152,14 +153,21 @@ static Value createDestinationPassingStyleInitOperand( builder.setInsertionPointToEnd(&ifOp.getElseRegion().front()); SmallVector shape = tensor::getMixedSizes(builder, builder.getLoc(), spmdizedOperand); - PartialReductionOpInterface partialReductionIface = - llvm::cast(op.getOperation()); - assert(op->getNumResults() == 1 && "Multiple results not supported."); - FailureOr> reductionNeutralTensor = - partialReductionIface.generateInitialTensorForPartialReduction( - builder, builder.getLoc(), shape, {}); - assert(succeeded(reductionNeutralTensor)); - builder.create(reductionNeutralTensor.value()); + + SmallVector combinerOps; + matchReduction(op.getRegionOutputArgs(), operandNumber, combinerOps); + assert(combinerOps.size() == 1); + std::optional neutralEl = + arith::getNeutralElement(combinerOps[0]); + + Value init = builder.create(op.getLoc(), shape, + neutralEl.value().getType()); + Value constant = + builder.create(op.getLoc(), neutralEl.value()); + Value fill = builder.create(op.getLoc(), constant, init) + .getResult(0); + + builder.create(fill); } return ifOp.getResult(0); } @@ -178,7 +186,7 @@ static SmallVector createDestinationPassingStyleInitOperands( Value spmdizedInitOperand = spmdizationMap.lookup(op->getOperands()[operandIdx]); newOperands[operandIdx] = createDestinationPassingStyleInitOperand( - op, spmdizedInitOperand, reductionMeshAxes, meshOp, builder); + op, 0, spmdizedInitOperand, reductionMeshAxes, meshOp, builder); return newOperands; } diff --git 
a/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp b/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp index f86715a94b268..b7764da26a7f4 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp @@ -324,7 +324,27 @@ struct LinalgOpTilingInterface // External Model for implementing `PartialReductionInterface` for `LinalgOp`s. //===----------------------------------------------------------------------===// -/// External model implementation of PartialReductionInterface for LinalgOps. +/// Return an AffineMap for a partial result for the given result number, +/// assuming the partial tiling strategy is outer-reduction loop + +/// inner-parallel tile. The returned AffineMap can be used as the replacement +/// AffineMap for the inner-parallel tile linalg op for the given result number. +/// +/// The new AffineMap is the old AffineMap with reduction dimensions appended +/// at end. +static AffineMap getPartialResultAffineMap(LinalgOp linalgOp, + ArrayRef reductionDims, + unsigned resultNumber) { + AffineMap map = + linalgOp.getMatchingIndexingMap(linalgOp.getDpsInitOperand(resultNumber)); + for (int redPos : reductionDims) { + map = map.insertResult(getAffineDimExpr(redPos, linalgOp.getContext()), + map.getNumResults()); + } + return map; +} + +/// External model implementation of PartialReductionInterface for +/// LinalgOps. template struct LinalgOpPartialReductionInterface : public PartialReductionOpInterface::ExternalModel< @@ -338,11 +358,24 @@ struct LinalgOpPartialReductionInterface if (linalgOp.hasPureBufferSemantics()) return op->emitOpError("expected operation to have tensor semantics"); + // LinalgOp implements TilingInterface. 
+ auto tilingInterfaceOp = cast(linalgOp.getOperation()); + SmallVector shape = + llvm::map_to_vector(tilingInterfaceOp.getIterationDomain(b), + [](Range x) { return x.size; }); + + SmallVector tiledShape; + for (auto [tileSize, dimSize] : llvm::zip_equal(sizes, shape)) { + if (isZeroIndex(tileSize)) { + tiledShape.push_back(dimSize); + } else { + tiledShape.push_back(tileSize); + } + } + SmallVector inits; for (int initIdx = 0, e = linalgOp.getNumDpsInits(); initIdx < e; ++initIdx) { - // Insert the new parallel dimension based on the index of the reduction - // loops. This could be controlled by user for more flexibility. SmallVector combinerOps; if (!matchReduction(linalgOp.getRegionOutputArgs(), initIdx, combinerOps) || @@ -355,33 +388,19 @@ struct LinalgOpPartialReductionInterface return op->emitOpError( "Failed to get an identity value for the reduction operation."); - ArrayRef oldShape = - linalgOp.getShape(linalgOp.getDpsInitOperand(initIdx)); - - // Calculate the new shape, we insert the new dimensions based on the - // index of the reduction dimensions. - SmallVector newOutputShape; - SmallVector dynamicDims; - int64_t currReductionDims = 0; - DenseSet reductionDimsSet(reductionDims.begin(), - reductionDims.end()); - for (int64_t idx : - llvm::seq(0, oldShape.size() + reductionDims.size())) { - if (reductionDimsSet.contains(idx)) { - dispatchIndexOpFoldResults(sizes[idx], dynamicDims, newOutputShape); - currReductionDims++; - continue; - } - int64_t oldIdx = idx - currReductionDims; - int64_t dim = oldShape[oldIdx]; - newOutputShape.push_back(dim); - if (ShapedType::isDynamic(dim)) - dynamicDims.push_back(b.create( - loc, linalgOp.getDpsInitOperand(initIdx)->get(), oldIdx)); + // Append the new partial result dimensions. 
+ AffineMap partialMap = + getPartialResultAffineMap(linalgOp, reductionDims, initIdx); + SmallVector partialResultShape; + for (AffineExpr dimExpr : partialMap.getResults()) { + auto dim = cast(dimExpr); + partialResultShape.push_back(tiledShape[dim.getPosition()]); } - Value emptyTensor = b.create( - loc, newOutputShape, - linalgOp.getRegionOutputArgs()[initIdx].getType(), dynamicDims); + + Type elType = + getElementTypeOrSelf(linalgOp->getResult(initIdx).getType()); + Value emptyTensor = + b.create(loc, partialResultShape, elType); Value constantOp = b.create(loc, *identity); auto identityTensor = b.create(loc, constantOp, emptyTensor); @@ -407,11 +426,7 @@ struct LinalgOpPartialReductionInterface // TODO: linalg::Generic doesn't have getDpsInitOperands. Can replace // this with a for range loop when we have it. AffineMap newMap = - linalgOp.getMatchingIndexingMap(linalgOp.getDpsInitOperand(idx)); - for (int redPos : reductionDims) { - newMap = newMap.insertResult(b.getAffineDimExpr(redPos), - newMap.getNumResults()); - } + getPartialResultAffineMap(linalgOp, reductionDims, idx); newInitMaps.push_back(newMap); } @@ -476,29 +491,75 @@ struct LinalgOpPartialReductionInterface Location loc, ValueRange partialReduce, ArrayRef reductionDims) const { auto linalgOp = cast(op); - SmallVector reductionDimsInt64(reductionDims); - auto reduction = b.create( - loc, partialReduce, linalgOp.getDpsInits(), reductionDimsInt64, - [&linalgOp](OpBuilder &b, Location loc, ValueRange inputs) { - int64_t numInits = linalgOp.getNumDpsInits(); - SmallVector yieldedValues; - for (int idx : llvm::seq(0, numInits)) { + + // Permute the reduction dims as permuted by the partial result map. + + int64_t numInits = linalgOp.getNumDpsInits(); + SmallVector mergeOperations; + SmallVector replacements; + for (int idx : llvm::seq(numInits)) { + // linalg.reduce's iteration space is the tiled result's iteration space + // (and not the tiled operation's iteration space). 
To account for this, + // permute the reduction dimensions based on the partial result map of the + // tiled result. + AffineMap partialMap = + getPartialResultAffineMap(linalgOp, reductionDims, idx); + SmallVector partialReductionDims; + for (auto [resultNum, dimExpr] : + llvm::enumerate(partialMap.getResults())) { + unsigned dim = cast(dimExpr).getPosition(); + if (llvm::find(reductionDims, dim) != reductionDims.end()) { + partialReductionDims.push_back(resultNum); + } + } + + Value partialResult = partialReduce[idx]; + Value init = linalgOp.getDpsInits()[idx]; + + auto reduction = b.create( + loc, partialResult, init, partialReductionDims, + [&linalgOp, &idx](OpBuilder &b, Location loc, ValueRange inputs) { // Get the combiner op. SmallVector combinerOps; matchReduction(linalgOp.getRegionOutputArgs(), idx, combinerOps); Operation *clonedReductionOp = b.clone(*combinerOps[0]); // Combine the input at idx and output at numInits + idx. - clonedReductionOp->setOperand(0, inputs[idx]); - clonedReductionOp->setOperand(1, inputs[numInits + idx]); - // Yield. 
- yieldedValues.push_back(clonedReductionOp->getResult(0)); - } - b.create(loc, yieldedValues); - }); - return MergeResult{ - {reduction.getOperation()}, - llvm::map_to_vector(reduction->getResults(), - [](OpResult r) -> Value { return r; })}; + clonedReductionOp->setOperand(0, inputs[0]); + clonedReductionOp->setOperand(1, inputs[1]); + b.create(loc, clonedReductionOp->getResult(0)); + }); + + mergeOperations.push_back(reduction); + replacements.push_back(reduction->getResult(0)); + } + + return MergeResult{mergeOperations, replacements}; + } + + LogicalResult getPartialResultTilePosition( + Operation *op, OpBuilder &b, unsigned resultNumber, + ArrayRef offsets, ArrayRef sizes, + SmallVector &resultOffsets, + SmallVector &resultSizes, + ArrayRef reductionDims) const { + auto linalgOp = cast(op); + + AffineMap partialMap = + getPartialResultAffineMap(linalgOp, reductionDims, resultNumber); + for (AffineExpr dimExpr : partialMap.getResults()) { + unsigned dim = cast(dimExpr).getPosition(); + resultSizes.push_back(sizes[dim]); + + if (llvm::find(reductionDims, dim) != reductionDims.end()) { + // Reduction dims are reduced, and are always output in the same + // place. So use offset 0 for them. + resultOffsets.push_back(b.getIndexAttr(0)); + } else { + resultOffsets.push_back(offsets[dim]); + } + } + + return success(); } }; diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp index 2277989bf8411..b548f8ce8b560 100644 --- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp @@ -657,21 +657,29 @@ getResultTilePosition(RewriterBase &rewriter, int64_t index, Value tiledResult, resultOffset, resultSize); case scf::SCFTilingOptions::ReductionTilingStrategy:: PartialReductionOuterReduction: {
- // The proper fix is to add a getPartialResultTilePosition method to - // PartialReductionOpInterface. - resultOffset = - SmallVector(offsets.size(), rewriter.getIndexAttr(0)); - for (size_t i = 0; i < offsets.size(); i++) { - resultSize.push_back( - tensor::getMixedSize(rewriter, op.getLoc(), tiledResult, i)); + auto redOp = dyn_cast(op.getOperation()); + if (!redOp) { + return rewriter.notifyMatchFailure( + op, "PartialReductionOuterReduction tiling strategy is only supported" + "for operations implementing PartialReductionOpInterface"); } - return success(); + // Get reduction dimensions. + // TODO: PartialReductionOpInterface should really query TilingInterface + // itself and find reduction dimensions. + SmallVector reductionDims; + for (auto [idx, iteratorType] : + llvm::enumerate(op.getLoopIteratorTypes())) { + if (iteratorType == utils::IteratorType::reduction) + reductionDims.push_back(idx); + } + return redOp.getPartialResultTilePosition(rewriter, index, offsets, sizes, + resultOffset, resultSize, + reductionDims); + } default: return rewriter.notifyMatchFailure(op, "unhandled reduction tiling strategy"); } - } } static FailureOr diff --git a/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir b/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir index cce4b4efa61c8..9d34c80822d0e 100644 --- a/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir +++ b/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir @@ -32,8 +32,7 @@ module attributes {transform.with_named_sequence} { // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[D0:.*]] = tensor.dim %[[ARG0]], %[[C0]] : tensor // CHECK-DAG: %[[D1:.*]] = tensor.dim %[[ARG0]], %[[C1]] : tensor -// CHECK-DAG: %[[D2:.*]] = tensor.dim %[[ARG1]], %[[C0]] : tensor -// CHECK: %[[E:.*]] = tensor.empty(%[[D2]]) : tensor +// CHECK: %[[E:.*]] = tensor.empty(%[[D0]]) : tensor // CHECK: %[[F:.*]] = linalg.fill ins(%[[I]] : f32) outs(%[[E]] : tensor) -> tensor // CHECK: %[[L:.*]] = scf.for 
%[[K:.*]] = %[[C0]] to %[[D1]] step %[[C5]] iter_args(%[[ARG3:.*]] = %[[F]]) -> (tensor) { // CHECK: %[[PS:.*]] = affine.min #[[MAP0]](%[[K]])[%[[D1]]] @@ -81,13 +80,13 @@ module attributes {transform.with_named_sequence} { // CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0, d1) -> (d0, d1)> // CHECK-DAG: #[[MAP2:.*]] = affine_map<(d0, d1) -> (d1, d0)> // CHECK: func @reduction_tile_transpose -// CHECK: tensor.empty(%{{.*}}) : tensor<5x?xf32> -// CHECK: linalg.fill {{.*}} : tensor<5x?xf32>) -> tensor<5x?xf32> +// CHECK: tensor.empty(%{{.*}}) : tensor +// CHECK: linalg.fill {{.*}} : tensor) -> tensor // CHECK: scf.for -// CHECK: %[[EXT:.*]] = tensor.extract_slice %[[ARG3:.*]][0, 0] [%[[D0:.*]], %[[D1:.*]]] [1, 1] : tensor<5x?xf32> to tensor +// CHECK: %[[EXT:.*]] = tensor.extract_slice %[[ARG3:.*]][0, 0] [%[[D0:.*]], %[[D1:.*]]] [1, 1] : tensor to tensor // CHECK: %[[R:.*]] = linalg.generic {indexing_maps = [#[[MAP1]], #[[MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[L:.*]] : tensor) outs(%[[EXT]] : tensor) -// CHECK: %[[INS:.*]] = tensor.insert_slice %[[R]] into %[[ARG3]][0, 0] [%[[D0]], %[[D1]]] [1, 1] : tensor into tensor<5x?xf32> -// CHECK: scf.yield {{.*}} : tensor<5x?xf32> +// CHECK: %[[INS:.*]] = tensor.insert_slice %[[R]] into %[[ARG3]][0, 0] [%[[D0]], %[[D1]]] [1, 1] : tensor into tensor +// CHECK: scf.yield {{.*}} : tensor // CHECK: } // CHECK: linalg.reduce // CHECK: return @@ -129,8 +128,7 @@ module attributes {transform.with_named_sequence} { // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index // CHECK-DAG: %[[D0:.*]] = tensor.dim %[[ARG0]], %[[C0]] : tensor // CHECK-DAG: %[[D1:.*]] = tensor.dim %[[ARG0]], %[[C1]] : tensor -// CHECK-DAG: %[[D2:.*]] = tensor.dim %[[ARG1]], %[[C0]] : tensor -// CHECK: %[[E:.*]] = tensor.empty(%[[D2]]) : tensor +// CHECK: %[[E:.*]] = tensor.empty(%[[D0]]) : tensor // CHECK: %[[F:.*]] = linalg.fill ins(%[[I]] : f32) outs(%[[E]] : tensor) -> tensor // CHECK: %[[L:.*]] = scf.forall (%[[IV:.+]]) in (5) 
shared_outs(%[[ARG3:.+]] = %[[F]]) -> (tensor) { // CHECK-DAG: %[[TS0:.+]] = affine.min #[[MAP0]](%[[IV]])[%[[D1]]] @@ -183,9 +181,7 @@ module attributes {transform.with_named_sequence} { // CHECK-DAG: %[[D0:.*]] = tensor.dim %[[ARG0]], %[[C0]] : tensor // CHECK-DAG: %[[D1:.*]] = tensor.dim %[[ARG0]], %[[C1]] : tensor // CHECK-DAG: %[[D2:.*]] = tensor.dim %[[ARG1]], %[[C1]] : tensor -// CHECK-DAG: %[[D3:.*]] = tensor.dim %[[ARG2]], %[[C0]] : tensor -// CHECK-DAG: %[[D4:.*]] = tensor.dim %[[ARG2]], %[[C1]] : tensor -// CHECK: %[[E:.*]] = tensor.empty(%[[D3]], %[[D4]]) : tensor +// CHECK: %[[E:.*]] = tensor.empty(%[[D0]], %[[D2]]) : tensor // CHECK: %[[F:.*]] = linalg.fill ins(%[[I]] : f32) outs(%[[E]] : tensor) -> tensor // CHECK: %[[L:.*]] = scf.forall (%[[IV:.+]]) in (5) shared_outs(%[[ARG3:.+]] = %[[F]]) -> (tensor) { // CHECK-DAG: %[[TS0:.+]] = affine.min #[[MAP0]](%[[IV]])[%[[D1]]] @@ -243,8 +239,7 @@ module attributes {transform.with_named_sequence} { // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index // CHECK-DAG: %[[C15:.*]] = arith.constant 15 : index // CHECK-DAG: %[[D0:.*]] = tensor.dim %[[ARG0]], %[[C0]] : tensor -// CHECK-DAG: %[[D2:.*]] = tensor.dim %[[ARG1]], %[[C0]] : tensor -// CHECK: %[[E:.*]] = tensor.empty(%[[D2]]) : tensor +// CHECK: %[[E:.*]] = tensor.empty(%[[D0]]) : tensor // CHECK: %[[F:.*]] = linalg.fill ins(%[[I]] : f32) outs(%[[E]] : tensor) -> tensor // CHECK: %[[L:.*]] = scf.forall (%[[IV:.+]]) in (5) shared_outs(%[[ARG3:.+]] = %[[F]]) -> (tensor) { // CHECK: %[[ET:.+]] = tensor.extract_slice %[[ARG3:.+]][0, %[[IV]]] [%[[D0]], 1] [1, 1] : tensor to tensor @@ -422,8 +417,8 @@ func.func @reduction_tile_multiple_results(%arg0: tensor, %out: tensor< module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %12, %2, %3, %loop = 
transform.structured.tile_reduction_using_for %0 - by tile_sizes = [0, 5] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) + %1, %12, %2, %3, %4, %loop = transform.structured.tile_reduction_using_for %0 + by tile_sizes = [0, 5] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) transform.yield } } @@ -444,4 +439,44 @@ module attributes {transform.with_named_sequence} { // CHECK: scf.yield %[[INSERT1]], %[[INSERT1]] // CHECK: linalg.reduce // CHECK: arith.addf +// CHECK: linalg.reduce // CHECK: arith.maximumf + +// ----- + +func.func @reduction_tile_multi_dim_transpose(%arg0: tensor, %out: tensor) -> tensor { + %red = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, + affine_map<(d0, d1, d2) -> (d2, d0)>], + iterator_types = ["parallel", "reduction", "parallel"]} + ins(%arg0 : tensor) + outs(%out : tensor) { + ^bb0(%arg7: f32, %arg9: f32): + %42 = arith.addf %arg7, %arg9 : f32 + linalg.yield %42 : f32 + } -> tensor + return %red : tensor +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1, %2, %3, %loop = transform.structured.tile_reduction_using_for %0 + by tile_sizes = [0, 5, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } +} + +// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)> +// CHECK-DAG: #[[MAP2:.*]] = affine_map<(d0, d1, d2) -> (d2, d0, d1)> +// CHECK: func @reduction_tile_multi_dim_transpose +// CHECK: tensor.empty(%{{.*}}) : tensor +// CHECK: linalg.fill {{.*}} : tensor) -> tensor +// CHECK: scf.for +// CHECK: %[[K:.*]] = affine.min +// CHECK: %[[EXT:.*]] = 
tensor.extract_slice %[[ARG3:.*]][0, 0, 0] [%[[D2:.*]], %[[D0:.*]], %[[K]]] [1, 1, 1] : tensor to tensor +// CHECK: %[[R:.*]] = linalg.generic {indexing_maps = [#[[MAP1]], #[[MAP2]]], iterator_types = ["parallel", "parallel", "parallel"]} ins(%[[L:.*]] : tensor) outs(%[[EXT]] : tensor) +// CHECK: %[[INS:.*]] = tensor.insert_slice %[[R]] into %[[ARG3]][0, 0, 0] [%[[D2]], %[[D0]], %[[K]]] [1, 1, 1] : tensor into tensor +// CHECK: scf.yield {{.*}} : tensor +// CHECK: } +// CHECK: linalg.reduce +// CHECK: return From 8caeb2e0c2fb8a5f1689c11775b81ceee76de958 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 27 Dec 2024 17:43:07 +0000 Subject: [PATCH 117/567] [VPlan] Always create initial blocks in constructor (NFC). Update C++ unit tests to use VPlanTestBase to construct initial VPlan, using a constructor that creates the VP blocks directly in the constructor. Split off from and in preparation for https://github.com/llvm/llvm-project/pull/120918. --- llvm/lib/Transforms/Vectorize/VPlan.h | 17 +- .../Transforms/Vectorize/VPlanHCFGBuilder.h | 4 +- .../Transforms/Vectorize/VPDomTreeTest.cpp | 42 ++-- .../Transforms/Vectorize/VPlanHCFGTest.cpp | 2 +- .../Transforms/Vectorize/VPlanSlpTest.cpp | 2 +- .../Transforms/Vectorize/VPlanTest.cpp | 210 ++++++------------ .../Transforms/Vectorize/VPlanTestBase.h | 20 +- .../Vectorize/VPlanVerifierTest.cpp | 89 +++----- 8 files changed, 141 insertions(+), 245 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index e2c0ff7954675..e62ace1980aa7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -3863,7 +3863,6 @@ class VPlan { /// been modeled in VPlan directly. DenseMap SCEVToExpansion; -public: /// Construct a VPlan with \p Entry to the plan and with \p ScalarHeader /// wrapping the original header of the scalar loop. 
VPlan(VPBasicBlock *Entry, VPIRBasicBlock *ScalarHeader) @@ -3873,18 +3872,20 @@ class VPlan { "scalar header must be a leaf node"); } - /// Construct a VPlan with \p Entry entering the plan, trip count \p TC and - /// with \p ScalarHeader wrapping the original header of the scalar loop. - VPlan(VPBasicBlock *Entry, VPValue *TC, VPIRBasicBlock *ScalarHeader) - : VPlan(Entry, ScalarHeader) { - TripCount = TC; - } - +public: /// Construct a VPlan for \p L. This will create VPIRBasicBlocks wrapping the /// original preheader and scalar header of \p L, to be used as entry and /// scalar header blocks of the new VPlan. VPlan(Loop *L); + /// Construct a VPlan with a new VPBasicBlock as entry, a VPIRBasicBlock + /// wrapping \p ScalarHeaderBB and a trip count of \p TC. + VPlan(BasicBlock *ScalarHeaderBB, VPValue *TC) { + setEntry(new VPBasicBlock("preheader")); + ScalarHeader = VPIRBasicBlock::fromBasicBlock(ScalarHeaderBB); + TripCount = TC; + } + ~VPlan(); void setEntry(VPBasicBlock *VPBB) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h index 9e8f9f3f40029..ad6e2ad90a961 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h @@ -32,11 +32,11 @@ class Loop; class LoopInfo; class VPRegionBlock; class VPlan; -class VPlanTestBase; +class VPlanTestIRBase; /// Main class to build the VPlan H-CFG for an incoming IR. class VPlanHCFGBuilder { - friend VPlanTestBase; + friend VPlanTestIRBase; private: // The outermost loop of the input loop nest considered for vectorization. 
diff --git a/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp b/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp index 847cca7714eff..6aa34a5fa431b 100644 --- a/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp @@ -9,12 +9,15 @@ #include "../lib/Transforms/Vectorize/VPlan.h" #include "../lib/Transforms/Vectorize/VPlanDominatorTree.h" +#include "VPlanTestBase.h" #include "gtest/gtest.h" namespace llvm { namespace { -TEST(VPDominatorTreeTest, DominanceNoRegionsTest) { +using VPDominatorTreeTest = VPlanTestBase; + +TEST_F(VPDominatorTreeTest, DominanceNoRegionsTest) { // VPBB0 // | // R1 { @@ -24,8 +27,8 @@ TEST(VPDominatorTreeTest, DominanceNoRegionsTest) { // \ / // VPBB4 // } - VPBasicBlock *VPPH = new VPBasicBlock("ph"); - VPBasicBlock *VPBB0 = new VPBasicBlock("VPBB0"); + VPlan &Plan = getPlan(); + VPBasicBlock *VPBB0 = Plan.getEntry(); VPBasicBlock *VPBB1 = new VPBasicBlock("VPBB1"); VPBasicBlock *VPBB2 = new VPBasicBlock("VPBB2"); VPBasicBlock *VPBB3 = new VPBasicBlock("VPBB3"); @@ -40,12 +43,7 @@ TEST(VPDominatorTreeTest, DominanceNoRegionsTest) { VPBlockUtils::connectBlocks(VPBB2, VPBB4); VPBlockUtils::connectBlocks(VPBB3, VPBB4); - LLVMContext C; - auto *ScalarHeader = BasicBlock::Create(C, ""); - VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB); - VPBlockUtils::connectBlocks(VPPH, VPBB0); - VPlan Plan(VPPH, ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader()); VPDominatorTree VPDT; VPDT.recalculate(Plan); @@ -62,7 +60,6 @@ TEST(VPDominatorTreeTest, DominanceNoRegionsTest) { EXPECT_EQ(VPDT.findNearestCommonDominator(VPBB2, VPBB3), VPBB1); EXPECT_EQ(VPDT.findNearestCommonDominator(VPBB2, VPBB4), VPBB1); EXPECT_EQ(VPDT.findNearestCommonDominator(VPBB4, VPBB4), VPBB4); - delete ScalarHeader; } static void @@ -76,9 +73,7 @@ checkDomChildren(VPDominatorTree &VPDT, VPBlockBase *Src, 
EXPECT_EQ(Children, ExpectedNodes); } -TEST(VPDominatorTreeTest, DominanceRegionsTest) { - LLVMContext C; - auto *ScalarHeader = BasicBlock::Create(C, ""); +TEST_F(VPDominatorTreeTest, DominanceRegionsTest) { { // 2 consecutive regions. // VPBB0 @@ -99,8 +94,8 @@ TEST(VPDominatorTreeTest, DominanceRegionsTest) { // R2BB2 // } // - VPBasicBlock *VPPH = new VPBasicBlock("ph"); - VPBasicBlock *VPBB0 = new VPBasicBlock("VPBB0"); + VPlan &Plan = getPlan(); + VPBasicBlock *VPBB0 = Plan.getEntry(); VPBasicBlock *R1BB1 = new VPBasicBlock(); VPBasicBlock *R1BB2 = new VPBasicBlock(); VPBasicBlock *R1BB3 = new VPBasicBlock(); @@ -122,10 +117,7 @@ TEST(VPDominatorTreeTest, DominanceRegionsTest) { VPBlockUtils::connectBlocks(R2BB1, R2BB2); VPBlockUtils::connectBlocks(R1, R2); - VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - VPBlockUtils::connectBlocks(R2, ScalarHeaderVPBB); - VPBlockUtils::connectBlocks(VPPH, VPBB0); - VPlan Plan(VPPH, ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(R2, Plan.getScalarHeader()); VPDominatorTree VPDT; VPDT.recalculate(Plan); @@ -177,7 +169,7 @@ TEST(VPDominatorTreeTest, DominanceRegionsTest) { // | // VPBB2 // - VPBasicBlock *VPPH = new VPBasicBlock("ph"); + VPlan &Plan = getPlan(); VPBasicBlock *R1BB1 = new VPBasicBlock("R1BB1"); VPBasicBlock *R1BB2 = new VPBasicBlock("R1BB2"); VPBasicBlock *R1BB3 = new VPBasicBlock("R1BB3"); @@ -199,15 +191,12 @@ TEST(VPDominatorTreeTest, DominanceRegionsTest) { VPBlockUtils::connectBlocks(R1BB2, R1BB3); VPBlockUtils::connectBlocks(R2, R1BB3); - VPBasicBlock *VPBB1 = new VPBasicBlock("VPBB1"); + VPBasicBlock *VPBB1 = Plan.getEntry(); VPBlockUtils::connectBlocks(VPBB1, R1); VPBasicBlock *VPBB2 = new VPBasicBlock("VPBB2"); VPBlockUtils::connectBlocks(R1, VPBB2); - VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - VPBlockUtils::connectBlocks(VPBB2, ScalarHeaderVPBB); - VPBlockUtils::connectBlocks(VPPH, VPBB1); - VPlan Plan(VPPH, ScalarHeaderVPBB); + 
VPBlockUtils::connectBlocks(VPBB2, Plan.getScalarHeader()); VPDominatorTree VPDT; VPDT.recalculate(Plan); @@ -220,9 +209,8 @@ TEST(VPDominatorTreeTest, DominanceRegionsTest) { checkDomChildren(VPDT, R2BB2, {R2BB3}); checkDomChildren(VPDT, R2BB3, {}); checkDomChildren(VPDT, R1BB3, {VPBB2}); - checkDomChildren(VPDT, VPBB2, {ScalarHeaderVPBB}); + checkDomChildren(VPDT, VPBB2, {Plan.getScalarHeader()}); } - delete ScalarHeader; } } // namespace diff --git a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp index 1b362d1d26bdd..19c2483d34ed1 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp @@ -17,7 +17,7 @@ namespace llvm { namespace { -class VPlanHCFGTest : public VPlanTestBase {}; +class VPlanHCFGTest : public VPlanTestIRBase {}; TEST_F(VPlanHCFGTest, testBuildHCFGInnerLoop) { const char *ModuleString = diff --git a/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp index 1b993b63898ca..e3c542ec5cac8 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp @@ -16,7 +16,7 @@ namespace llvm { namespace { -class VPlanSlpTest : public VPlanTestBase { +class VPlanSlpTest : public VPlanTestIRBase { protected: TargetLibraryInfoImpl TLII; TargetLibraryInfo TLI; diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index f3a1bba518c83..2ab55f64a2073 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -9,6 +9,7 @@ #include "../lib/Transforms/Vectorize/VPlan.h" #include "../lib/Transforms/Vectorize/VPlanCFG.h" +#include "VPlanTestBase.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/Analysis/VectorUtils.h" @@ -237,12 +238,13 @@ 
TEST(VPInstructionTest, releaseOperandsAtDeletion) { delete VPV1; delete VPV2; } -TEST(VPBasicBlockTest, getPlan) { - LLVMContext C; - auto *ScalarHeader = BasicBlock::Create(C, ""); + +using VPBasicBlockTest = VPlanTestBase; + +TEST_F(VPBasicBlockTest, getPlan) { { - VPBasicBlock *VPPH = new VPBasicBlock("ph"); - VPBasicBlock *VPBB1 = new VPBasicBlock(); + VPlan &Plan = getPlan(); + VPBasicBlock *VPBB1 = Plan.getEntry(); VPBasicBlock *VPBB2 = new VPBasicBlock(); VPBasicBlock *VPBB3 = new VPBasicBlock(); VPBasicBlock *VPBB4 = new VPBasicBlock(); @@ -256,11 +258,7 @@ TEST(VPBasicBlockTest, getPlan) { VPBlockUtils::connectBlocks(VPBB1, VPBB3); VPBlockUtils::connectBlocks(VPBB2, VPBB4); VPBlockUtils::connectBlocks(VPBB3, VPBB4); - - VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - VPBlockUtils::connectBlocks(VPBB4, ScalarHeaderVPBB); - VPBlockUtils::connectBlocks(VPPH, VPBB1); - VPlan Plan(VPPH, ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(VPBB4, Plan.getScalarHeader()); EXPECT_EQ(&Plan, VPBB1->getPlan()); EXPECT_EQ(&Plan, VPBB2->getPlan()); @@ -269,20 +267,17 @@ TEST(VPBasicBlockTest, getPlan) { } { - VPBasicBlock *VPPH = new VPBasicBlock("ph"); + VPlan &Plan = getPlan(); + VPBasicBlock *VPBB1 = Plan.getEntry(); // VPBasicBlock is the entry into the VPlan, followed by a region. 
VPBasicBlock *R1BB1 = new VPBasicBlock(); VPBasicBlock *R1BB2 = new VPBasicBlock(); VPRegionBlock *R1 = new VPRegionBlock(R1BB1, R1BB2, "R1"); VPBlockUtils::connectBlocks(R1BB1, R1BB2); - VPBasicBlock *VPBB1 = new VPBasicBlock(); VPBlockUtils::connectBlocks(VPBB1, R1); - VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB); - VPBlockUtils::connectBlocks(VPPH, VPBB1); - VPlan Plan(VPPH, ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader()); EXPECT_EQ(&Plan, VPBB1->getPlan()); EXPECT_EQ(&Plan, R1->getPlan()); @@ -291,8 +286,7 @@ TEST(VPBasicBlockTest, getPlan) { } { - VPBasicBlock *VPPH = new VPBasicBlock("ph"); - + VPlan &Plan = getPlan(); VPBasicBlock *R1BB1 = new VPBasicBlock(); VPBasicBlock *R1BB2 = new VPBasicBlock(); VPRegionBlock *R1 = new VPRegionBlock(R1BB1, R1BB2, "R1"); @@ -303,7 +297,7 @@ TEST(VPBasicBlockTest, getPlan) { VPRegionBlock *R2 = new VPRegionBlock(R2BB1, R2BB2, "R2"); VPBlockUtils::connectBlocks(R2BB1, R2BB2); - VPBasicBlock *VPBB1 = new VPBasicBlock(); + VPBasicBlock *VPBB1 = Plan.getEntry(); VPBlockUtils::connectBlocks(VPBB1, R1); VPBlockUtils::connectBlocks(VPBB1, R2); @@ -311,10 +305,7 @@ TEST(VPBasicBlockTest, getPlan) { VPBlockUtils::connectBlocks(R1, VPBB2); VPBlockUtils::connectBlocks(R2, VPBB2); - VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - VPBlockUtils::connectBlocks(R2, ScalarHeaderVPBB); - VPBlockUtils::connectBlocks(VPPH, VPBB1); - VPlan Plan(VPPH, ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(R2, Plan.getScalarHeader()); EXPECT_EQ(&Plan, VPBB1->getPlan()); EXPECT_EQ(&Plan, R1->getPlan()); @@ -325,12 +316,9 @@ TEST(VPBasicBlockTest, getPlan) { EXPECT_EQ(&Plan, R2BB2->getPlan()); EXPECT_EQ(&Plan, VPBB2->getPlan()); } - delete ScalarHeader; } -TEST(VPBasicBlockTest, TraversingIteratorTest) { - LLVMContext C; - auto *ScalarHeader = BasicBlock::Create(C, ""); +TEST_F(VPBasicBlockTest, TraversingIteratorTest) { { // 
VPBasicBlocks only // VPBB1 @@ -339,8 +327,8 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) { // \ / // VPBB4 // - VPBasicBlock *VPPH = new VPBasicBlock("ph"); - VPBasicBlock *VPBB1 = new VPBasicBlock(); + VPlan &Plan = getPlan(); + VPBasicBlock *VPBB1 = Plan.getEntry(); VPBasicBlock *VPBB2 = new VPBasicBlock(); VPBasicBlock *VPBB3 = new VPBasicBlock(); VPBasicBlock *VPBB4 = new VPBasicBlock(); @@ -356,11 +344,7 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) { EXPECT_EQ(VPBB1, FromIterator[0]); EXPECT_EQ(VPBB2, FromIterator[1]); - // Use Plan to properly clean up created blocks. - VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - VPBlockUtils::connectBlocks(VPBB4, ScalarHeaderVPBB); - VPBlockUtils::connectBlocks(VPPH, VPBB1); - VPlan Plan(VPPH, ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(VPBB4, Plan.getScalarHeader()); } { @@ -382,8 +366,8 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) { // | // R2BB2 // - VPBasicBlock *VPPH = new VPBasicBlock("ph"); - VPBasicBlock *VPBB0 = new VPBasicBlock("VPBB0"); + VPlan &Plan = getPlan(); + VPBasicBlock *VPBB0 = Plan.getEntry(); VPBasicBlock *R1BB1 = new VPBasicBlock(); VPBasicBlock *R1BB2 = new VPBasicBlock(); VPBasicBlock *R1BB3 = new VPBasicBlock(); @@ -458,11 +442,7 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) { EXPECT_EQ(R1BB1, FromIterator[6]); EXPECT_EQ(R1, FromIterator[7]); - // Use Plan to properly clean up created blocks. 
- VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - VPBlockUtils::connectBlocks(R2, ScalarHeaderVPBB); - VPBlockUtils::connectBlocks(VPPH, VPBB0); - VPlan Plan(VPPH, ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(R2, Plan.getScalarHeader()); } { @@ -486,7 +466,7 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) { // | // VPBB2 // - VPBasicBlock *VPPH = new VPBasicBlock("ph"); + VPlan &Plan = getPlan(); VPBasicBlock *R1BB1 = new VPBasicBlock("R1BB1"); VPBasicBlock *R1BB2 = new VPBasicBlock("R1BB2"); VPBasicBlock *R1BB3 = new VPBasicBlock("R1BB3"); @@ -508,7 +488,7 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) { VPBlockUtils::connectBlocks(R1BB2, R1BB3); VPBlockUtils::connectBlocks(R2, R1BB3); - VPBasicBlock *VPBB1 = new VPBasicBlock("VPBB1"); + VPBasicBlock *VPBB1 = Plan.getEntry(); VPBlockUtils::connectBlocks(VPBB1, R1); VPBasicBlock *VPBB2 = new VPBasicBlock("VPBB2"); VPBlockUtils::connectBlocks(R1, VPBB2); @@ -543,11 +523,7 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) { EXPECT_EQ(R1, FromIterator[8]); EXPECT_EQ(VPBB1, FromIterator[9]); - // Use Plan to properly clean up created blocks. 
- VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - VPBlockUtils::connectBlocks(VPBB2, ScalarHeaderVPBB); - VPBlockUtils::connectBlocks(VPPH, VPBB1); - VPlan Plan(VPPH, ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(VPBB2, Plan.getScalarHeader()); } { @@ -561,7 +537,7 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) { // R2BB2 // } // - VPBasicBlock *VPPH = new VPBasicBlock("ph"); + VPlan &Plan = getPlan(); VPBasicBlock *R2BB1 = new VPBasicBlock("R2BB1"); VPBasicBlock *R2BB2 = new VPBasicBlock("R2BB2"); VPRegionBlock *R2 = new VPRegionBlock(R2BB1, R2BB2, "R2"); @@ -570,7 +546,7 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) { VPRegionBlock *R1 = new VPRegionBlock(R2, R2, "R1"); R2->setParent(R1); - VPBasicBlock *VPBB1 = new VPBasicBlock("VPBB1"); + VPBasicBlock *VPBB1 = Plan.getEntry(); VPBlockUtils::connectBlocks(VPBB1, R1); // Depth-first. @@ -593,11 +569,7 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) { EXPECT_EQ(R1, FromIterator[3]); EXPECT_EQ(VPBB1, FromIterator[4]); - // Use Plan to properly clean up created blocks. 
- VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB); - VPBlockUtils::connectBlocks(VPPH, VPBB1); - VPlan Plan(VPPH, ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader()); } { @@ -619,7 +591,7 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) { // | // VPBB2 // - VPBasicBlock *VPPH = new VPBasicBlock("ph"); + VPlan &Plan = getPlan(); VPBasicBlock *R3BB1 = new VPBasicBlock("R3BB1"); VPRegionBlock *R3 = new VPRegionBlock(R3BB1, R3BB1, "R3"); @@ -631,7 +603,7 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) { VPRegionBlock *R1 = new VPRegionBlock(R2, R2, "R1"); R2->setParent(R1); - VPBasicBlock *VPBB1 = new VPBasicBlock("VPBB1"); + VPBasicBlock *VPBB1 = Plan.getEntry(); VPBasicBlock *VPBB2 = new VPBasicBlock("VPBB2"); VPBlockUtils::connectBlocks(VPBB1, R1); VPBlockUtils::connectBlocks(R1, VPBB2); @@ -687,19 +659,15 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) { EXPECT_EQ(R2BB1, FromIterator[2]); EXPECT_EQ(VPBB1, FromIterator[3]); - // Use Plan to properly clean up created blocks. 
- VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - VPBlockUtils::connectBlocks(VPBB2, ScalarHeaderVPBB); - VPBlockUtils::connectBlocks(VPPH, VPBB1); - VPlan Plan(VPPH, ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(VPBB2, Plan.getScalarHeader()); } - delete ScalarHeader; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -TEST(VPBasicBlockTest, print) { +TEST_F(VPBasicBlockTest, print) { VPInstruction *TC = new VPInstruction(Instruction::Add, {}); - VPBasicBlock *VPBB0 = new VPBasicBlock("preheader"); + VPlan &Plan = getPlan(TC); + VPBasicBlock *VPBB0 = Plan.getEntry(); VPBB0->appendRecipe(TC); VPInstruction *I1 = new VPInstruction(Instruction::Add, {}); @@ -730,12 +698,8 @@ TEST(VPBasicBlockTest, print) { EXPECT_EQ("EMIT br , ", I3Dump); } - LLVMContext C; - auto *ScalarHeader = BasicBlock::Create(C, "scalar.header"); - auto * ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - VPBlockUtils::connectBlocks(VPBB2, ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(VPBB2, Plan.getScalarHeader()); VPBlockUtils::connectBlocks(VPBB0, VPBB1); - VPlan Plan(VPBB0, TC, ScalarHeaderVPBB); std::string FullDump; raw_string_ostream OS(FullDump); Plan.printDOT(OS); @@ -810,13 +774,12 @@ Successor(s): ir-bb OS << *I4; EXPECT_EQ("EMIT vp<%5> = mul vp<%3>, vp<%2>", I4Dump); } - delete ScalarHeader; } -TEST(VPBasicBlockTest, printPlanWithVFsAndUFs) { - +TEST_F(VPBasicBlockTest, printPlanWithVFsAndUFs) { VPInstruction *TC = new VPInstruction(Instruction::Sub, {}); - VPBasicBlock *VPBB0 = new VPBasicBlock("preheader"); + VPlan &Plan = getPlan(TC); + VPBasicBlock *VPBB0 = Plan.getEntry(); VPBB0->appendRecipe(TC); VPInstruction *I1 = new VPInstruction(Instruction::Add, {}); @@ -824,12 +787,8 @@ TEST(VPBasicBlockTest, printPlanWithVFsAndUFs) { VPBB1->appendRecipe(I1); VPBB1->setName("bb1"); - LLVMContext C; - auto *ScalarHeader = BasicBlock::Create(C, ""); - VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - VPBlockUtils::connectBlocks(VPBB1, 
ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(VPBB1, Plan.getScalarHeader()); VPBlockUtils::connectBlocks(VPBB0, VPBB1); - VPlan Plan(VPBB0, TC, ScalarHeaderVPBB); Plan.setName("TestPlan"); Plan.addVF(ElementCount::getFixed(4)); @@ -847,9 +806,9 @@ Successor(s): bb1 bb1: EMIT vp<%2> = add -Successor(s): ir-bb<> +Successor(s): ir-bb -ir-bb<>: +ir-bb: No successors } )"; @@ -871,9 +830,9 @@ Successor(s): bb1 bb1: EMIT vp<%2> = add -Successor(s): ir-bb<> +Successor(s): ir-bb -ir-bb<>: +ir-bb: No successors } )"; @@ -895,19 +854,19 @@ Successor(s): bb1 bb1: EMIT vp<%2> = add -Successor(s): ir-bb<> +Successor(s): ir-bb -ir-bb<>: +ir-bb: No successors } )"; EXPECT_EQ(ExpectedStr, FullDump); } - delete ScalarHeader; } #endif -TEST(VPRecipeTest, CastVPInstructionToVPUser) { +using VPRecipeTest = VPlanTestBase; +TEST_F(VPRecipeTest, CastVPInstructionToVPUser) { VPValue Op1; VPValue Op2; VPInstruction Recipe(Instruction::Add, {&Op1, &Op2}); @@ -917,9 +876,7 @@ TEST(VPRecipeTest, CastVPInstructionToVPUser) { EXPECT_EQ(&Recipe, BaseR); } -TEST(VPRecipeTest, CastVPWidenRecipeToVPUser) { - LLVMContext C; - +TEST_F(VPRecipeTest, CastVPWidenRecipeToVPUser) { IntegerType *Int32 = IntegerType::get(C, 32); auto *AI = BinaryOperator::CreateAdd(PoisonValue::get(Int32), PoisonValue::get(Int32)); @@ -936,9 +893,7 @@ TEST(VPRecipeTest, CastVPWidenRecipeToVPUser) { delete AI; } -TEST(VPRecipeTest, CastVPWidenCallRecipeToVPUserAndVPDef) { - LLVMContext C; - +TEST_F(VPRecipeTest, CastVPWidenCallRecipeToVPUserAndVPDef) { IntegerType *Int32 = IntegerType::get(C, 32); FunctionType *FTy = FunctionType::get(Int32, false); Function *Fn = Function::Create(FTy, GlobalValue::ExternalLinkage, 0); @@ -964,9 +919,7 @@ TEST(VPRecipeTest, CastVPWidenCallRecipeToVPUserAndVPDef) { delete Fn; } -TEST(VPRecipeTest, CastVPWidenSelectRecipeToVPUserAndVPDef) { - LLVMContext C; - +TEST_F(VPRecipeTest, CastVPWidenSelectRecipeToVPUserAndVPDef) { IntegerType *Int1 = IntegerType::get(C, 1); IntegerType *Int32 = 
IntegerType::get(C, 32); auto *SelectI = SelectInst::Create( @@ -992,9 +945,7 @@ TEST(VPRecipeTest, CastVPWidenSelectRecipeToVPUserAndVPDef) { delete SelectI; } -TEST(VPRecipeTest, CastVPWidenGEPRecipeToVPUserAndVPDef) { - LLVMContext C; - +TEST_F(VPRecipeTest, CastVPWidenGEPRecipeToVPUserAndVPDef) { IntegerType *Int32 = IntegerType::get(C, 32); PointerType *Int32Ptr = PointerType::get(Int32, 0); auto *GEP = GetElementPtrInst::Create(Int32, PoisonValue::get(Int32Ptr), @@ -1017,9 +968,7 @@ TEST(VPRecipeTest, CastVPWidenGEPRecipeToVPUserAndVPDef) { delete GEP; } -TEST(VPRecipeTest, CastVPBlendRecipeToVPUser) { - LLVMContext C; - +TEST_F(VPRecipeTest, CastVPBlendRecipeToVPUser) { IntegerType *Int32 = IntegerType::get(C, 32); auto *Phi = PHINode::Create(Int32, 1); VPValue I1; @@ -1036,9 +985,7 @@ TEST(VPRecipeTest, CastVPBlendRecipeToVPUser) { delete Phi; } -TEST(VPRecipeTest, CastVPInterleaveRecipeToVPUser) { - LLVMContext C; - +TEST_F(VPRecipeTest, CastVPInterleaveRecipeToVPUser) { VPValue Addr; VPValue Mask; InterleaveGroup IG(4, false, Align(4)); @@ -1049,9 +996,7 @@ TEST(VPRecipeTest, CastVPInterleaveRecipeToVPUser) { EXPECT_EQ(&Recipe, BaseR); } -TEST(VPRecipeTest, CastVPReplicateRecipeToVPUser) { - LLVMContext C; - +TEST_F(VPRecipeTest, CastVPReplicateRecipeToVPUser) { VPValue Op1; VPValue Op2; SmallVector Args; @@ -1068,9 +1013,7 @@ TEST(VPRecipeTest, CastVPReplicateRecipeToVPUser) { delete Call; } -TEST(VPRecipeTest, CastVPBranchOnMaskRecipeToVPUser) { - LLVMContext C; - +TEST_F(VPRecipeTest, CastVPBranchOnMaskRecipeToVPUser) { VPValue Mask; VPBranchOnMaskRecipe Recipe(&Mask); EXPECT_TRUE(isa(&Recipe)); @@ -1079,9 +1022,7 @@ TEST(VPRecipeTest, CastVPBranchOnMaskRecipeToVPUser) { EXPECT_EQ(&Recipe, BaseR); } -TEST(VPRecipeTest, CastVPWidenMemoryRecipeToVPUserAndVPDef) { - LLVMContext C; - +TEST_F(VPRecipeTest, CastVPWidenMemoryRecipeToVPUserAndVPDef) { IntegerType *Int32 = IntegerType::get(C, 32); PointerType *Int32Ptr = PointerType::get(Int32, 0); auto *Load = 
@@ -1101,8 +1042,7 @@ TEST(VPRecipeTest, CastVPWidenMemoryRecipeToVPUserAndVPDef) { delete Load; } -TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { - LLVMContext C; +TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { IntegerType *Int1 = IntegerType::get(C, 1); IntegerType *Int32 = IntegerType::get(C, 32); PointerType *Int32Ptr = PointerType::get(Int32, 0); @@ -1242,7 +1182,6 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { { // Test for a call to a function without side-effects. - LLVMContext C; Module M("", C); Function *TheFn = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::thread_pointer); @@ -1296,15 +1235,12 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -TEST(VPRecipeTest, dumpRecipeInPlan) { - VPBasicBlock *VPBB0 = new VPBasicBlock("preheader"); +TEST_F(VPRecipeTest, dumpRecipeInPlan) { + VPlan &Plan = getPlan(); + VPBasicBlock *VPBB0 = Plan.getEntry(); VPBasicBlock *VPBB1 = new VPBasicBlock(); - LLVMContext C; - auto *ScalarHeader = BasicBlock::Create(C, ""); - VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - VPBlockUtils::connectBlocks(VPBB1, ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(VPBB1, Plan.getScalarHeader()); VPBlockUtils::connectBlocks(VPBB0, VPBB1); - VPlan Plan(VPBB0, ScalarHeaderVPBB); IntegerType *Int32 = IntegerType::get(C, 32); auto *AI = BinaryOperator::CreateAdd(PoisonValue::get(Int32), @@ -1366,18 +1302,14 @@ TEST(VPRecipeTest, dumpRecipeInPlan) { } delete AI; - delete ScalarHeader; } -TEST(VPRecipeTest, dumpRecipeUnnamedVPValuesInPlan) { - VPBasicBlock *VPBB0 = new VPBasicBlock("preheader"); +TEST_F(VPRecipeTest, dumpRecipeUnnamedVPValuesInPlan) { + VPlan &Plan = getPlan(); + VPBasicBlock *VPBB0 = Plan.getEntry(); VPBasicBlock *VPBB1 = new VPBasicBlock(); - LLVMContext C; - auto *ScalarHeader = BasicBlock::Create(C, ""); - VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - 
VPBlockUtils::connectBlocks(VPBB1, ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(VPBB1, Plan.getScalarHeader()); VPBlockUtils::connectBlocks(VPBB0, VPBB1); - VPlan Plan(VPBB0, ScalarHeaderVPBB); IntegerType *Int32 = IntegerType::get(C, 32); auto *AI = BinaryOperator::CreateAdd(PoisonValue::get(Int32), @@ -1456,11 +1388,9 @@ TEST(VPRecipeTest, dumpRecipeUnnamedVPValuesInPlan) { testing::ExitedWithCode(0), "EMIT vp<%2> = mul vp<%1>, vp<%1>"); } delete AI; - delete ScalarHeader; } -TEST(VPRecipeTest, dumpRecipeUnnamedVPValuesNotInPlanOrBlock) { - LLVMContext C; +TEST_F(VPRecipeTest, dumpRecipeUnnamedVPValuesNotInPlanOrBlock) { IntegerType *Int32 = IntegerType::get(C, 32); auto *AI = BinaryOperator::CreateAdd(PoisonValue::get(Int32), PoisonValue::get(Int32)); @@ -1543,9 +1473,7 @@ TEST(VPRecipeTest, dumpRecipeUnnamedVPValuesNotInPlanOrBlock) { #endif -TEST(VPRecipeTest, CastVPReductionRecipeToVPUser) { - LLVMContext C; - +TEST_F(VPRecipeTest, CastVPReductionRecipeToVPUser) { VPValue ChainOp; VPValue VecOp; VPValue CondOp; @@ -1556,9 +1484,7 @@ TEST(VPRecipeTest, CastVPReductionRecipeToVPUser) { EXPECT_TRUE(isa(BaseR)); } -TEST(VPRecipeTest, CastVPReductionEVLRecipeToVPUser) { - LLVMContext C; - +TEST_F(VPRecipeTest, CastVPReductionEVLRecipeToVPUser) { VPValue ChainOp; VPValue VecOp; VPValue CondOp; @@ -1630,7 +1556,7 @@ TEST(VPDoubleValueDefTest, traverseUseLists) { EXPECT_EQ(&DoubleValueDef, I3.getOperand(0)->getDefiningRecipe()); } -TEST(VPRecipeTest, CastToVPSingleDefRecipe) { +TEST_F(VPRecipeTest, CastToVPSingleDefRecipe) { VPValue Start; VPEVLBasedIVPHIRecipe R(&Start, {}); VPRecipeBase *B = &R; diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h index 06e091da9054e..1836a5e39a290 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h +++ b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h @@ -28,7 +28,7 @@ namespace llvm { /// Helper class to create a module from an assembly string 
and VPlans for a /// given loop entry block. -class VPlanTestBase : public testing::Test { +class VPlanTestIRBase : public testing::Test { protected: TargetLibraryInfoImpl TLII; TargetLibraryInfo TLI; @@ -41,7 +41,7 @@ class VPlanTestBase : public testing::Test { std::unique_ptr AC; std::unique_ptr SE; - VPlanTestBase() + VPlanTestIRBase() : TLII(), TLI(TLII), DL("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-" "f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:" @@ -92,6 +92,22 @@ class VPlanTestBase : public testing::Test { } }; +class VPlanTestBase : public testing::Test { +protected: + LLVMContext C; + std::unique_ptr ScalarHeader; + SmallVector> Plans; + + VPlanTestBase() : ScalarHeader(BasicBlock::Create(C, "scalar.header")) { + BranchInst::Create(&*ScalarHeader, &*ScalarHeader); + } + + VPlan &getPlan(VPValue *TC = nullptr) { + Plans.push_back(std::make_unique(&*ScalarHeader, TC)); + return *Plans.back(); + } +}; + } // namespace llvm #endif // LLVM_UNITTESTS_TRANSFORMS_VECTORIZE_VPLANTESTBASE_H diff --git a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp index 6448153de7821..174249a7e85e3 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp @@ -8,32 +8,29 @@ #include "../lib/Transforms/Vectorize/VPlanVerifier.h" #include "../lib/Transforms/Vectorize/VPlan.h" +#include "VPlanTestBase.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "gtest/gtest.h" using namespace llvm; +using VPVerifierTest = VPlanTestBase; + namespace { -TEST(VPVerifierTest, VPInstructionUseBeforeDefSameBB) { +TEST_F(VPVerifierTest, VPInstructionUseBeforeDefSameBB) { + VPlan &Plan = getPlan(); VPInstruction *DefI = new VPInstruction(Instruction::Add, {}); VPInstruction *UseI = new VPInstruction(Instruction::Sub, {DefI}); - VPBasicBlock *VPPH = new VPBasicBlock("ph"); - 
VPBasicBlock *VPBB1 = new VPBasicBlock(); + VPBasicBlock *VPBB1 = Plan.getEntry(); VPBB1->appendRecipe(UseI); VPBB1->appendRecipe(DefI); VPBasicBlock *VPBB2 = new VPBasicBlock(); VPRegionBlock *R1 = new VPRegionBlock(VPBB2, VPBB2, "R1"); VPBlockUtils::connectBlocks(VPBB1, R1); - - LLVMContext C; - auto *ScalarHeader = BasicBlock::Create(C, ""); - VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB); - VPBlockUtils::connectBlocks(VPPH, VPBB1); - VPlan Plan(VPPH, ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader()); #if GTEST_HAS_STREAM_REDIRECTION ::testing::internal::CaptureStderr(); @@ -43,18 +40,17 @@ TEST(VPVerifierTest, VPInstructionUseBeforeDefSameBB) { EXPECT_STREQ("Use before def!\n", ::testing::internal::GetCapturedStderr().c_str()); #endif - delete ScalarHeader; } -TEST(VPVerifierTest, VPInstructionUseBeforeDefDifferentBB) { +TEST_F(VPVerifierTest, VPInstructionUseBeforeDefDifferentBB) { + VPlan &Plan = getPlan(); VPInstruction *DefI = new VPInstruction(Instruction::Add, {}); VPInstruction *UseI = new VPInstruction(Instruction::Sub, {DefI}); auto *CanIV = new VPCanonicalIVPHIRecipe(UseI, {}); VPInstruction *BranchOnCond = new VPInstruction(VPInstruction::BranchOnCond, {CanIV}); - VPBasicBlock *VPPH = new VPBasicBlock("ph"); - VPBasicBlock *VPBB1 = new VPBasicBlock(); + VPBasicBlock *VPBB1 = Plan.getEntry(); VPBasicBlock *VPBB2 = new VPBasicBlock(); VPBB1->appendRecipe(UseI); @@ -64,13 +60,7 @@ TEST(VPVerifierTest, VPInstructionUseBeforeDefDifferentBB) { VPRegionBlock *R1 = new VPRegionBlock(VPBB2, VPBB2, "R1"); VPBlockUtils::connectBlocks(VPBB1, R1); - - LLVMContext C; - auto *ScalarHeader = BasicBlock::Create(C, ""); - VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB); - VPBlockUtils::connectBlocks(VPPH, VPBB1); - VPlan Plan(VPPH, ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(R1, 
Plan.getScalarHeader()); #if GTEST_HAS_STREAM_REDIRECTION ::testing::internal::CaptureStderr(); @@ -80,11 +70,9 @@ TEST(VPVerifierTest, VPInstructionUseBeforeDefDifferentBB) { EXPECT_STREQ("Use before def!\n", ::testing::internal::GetCapturedStderr().c_str()); #endif - delete ScalarHeader; } -TEST(VPVerifierTest, VPBlendUseBeforeDefDifferentBB) { - LLVMContext C; +TEST_F(VPVerifierTest, VPBlendUseBeforeDefDifferentBB) { IntegerType *Int32 = IntegerType::get(C, 32); auto *Phi = PHINode::Create(Int32, 1); @@ -95,8 +83,8 @@ TEST(VPVerifierTest, VPBlendUseBeforeDefDifferentBB) { new VPInstruction(VPInstruction::BranchOnCond, {CanIV}); auto *Blend = new VPBlendRecipe(Phi, {DefI}); - VPBasicBlock *VPPH = new VPBasicBlock("ph"); - VPBasicBlock *VPBB1 = new VPBasicBlock(); + VPlan &Plan = getPlan(); + VPBasicBlock *VPBB1 = Plan.getEntry(); VPBasicBlock *VPBB2 = new VPBasicBlock(); VPBasicBlock *VPBB3 = new VPBasicBlock(); VPBasicBlock *VPBB4 = new VPBasicBlock(); @@ -113,11 +101,7 @@ TEST(VPVerifierTest, VPBlendUseBeforeDefDifferentBB) { VPBlockUtils::connectBlocks(VPBB1, R1); VPBB3->setParent(R1); - auto *ScalarHeader = BasicBlock::Create(C, ""); - VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB); - VPBlockUtils::connectBlocks(VPPH, VPBB1); - VPlan Plan(VPPH, ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader()); #if GTEST_HAS_STREAM_REDIRECTION ::testing::internal::CaptureStderr(); @@ -129,10 +113,9 @@ TEST(VPVerifierTest, VPBlendUseBeforeDefDifferentBB) { #endif delete Phi; - delete ScalarHeader; } -TEST(VPVerifierTest, DuplicateSuccessorsOutsideRegion) { +TEST_F(VPVerifierTest, DuplicateSuccessorsOutsideRegion) { VPInstruction *I1 = new VPInstruction(Instruction::Add, {}); auto *CanIV = new VPCanonicalIVPHIRecipe(I1, {}); VPInstruction *BranchOnCond = @@ -140,8 +123,8 @@ TEST(VPVerifierTest, DuplicateSuccessorsOutsideRegion) { VPInstruction *BranchOnCond2 = new 
VPInstruction(VPInstruction::BranchOnCond, {I1}); - VPBasicBlock *VPPH = new VPBasicBlock("ph"); - VPBasicBlock *VPBB1 = new VPBasicBlock(); + VPlan &Plan = getPlan(); + VPBasicBlock *VPBB1 = Plan.getEntry(); VPBasicBlock *VPBB2 = new VPBasicBlock(); VPBB1->appendRecipe(I1); @@ -153,12 +136,7 @@ TEST(VPVerifierTest, DuplicateSuccessorsOutsideRegion) { VPBlockUtils::connectBlocks(VPBB1, R1); VPBlockUtils::connectBlocks(VPBB1, R1); - LLVMContext C; - auto *ScalarHeader = BasicBlock::Create(C, ""); - VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB); - VPBlockUtils::connectBlocks(VPPH, VPBB1); - VPlan Plan(VPPH, ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader()); #if GTEST_HAS_STREAM_REDIRECTION ::testing::internal::CaptureStderr(); @@ -168,10 +146,9 @@ TEST(VPVerifierTest, DuplicateSuccessorsOutsideRegion) { EXPECT_STREQ("Multiple instances of the same successor.\n", ::testing::internal::GetCapturedStderr().c_str()); #endif - delete ScalarHeader; } -TEST(VPVerifierTest, DuplicateSuccessorsInsideRegion) { +TEST_F(VPVerifierTest, DuplicateSuccessorsInsideRegion) { VPInstruction *I1 = new VPInstruction(Instruction::Add, {}); auto *CanIV = new VPCanonicalIVPHIRecipe(I1, {}); VPInstruction *BranchOnCond = @@ -179,8 +156,8 @@ TEST(VPVerifierTest, DuplicateSuccessorsInsideRegion) { VPInstruction *BranchOnCond2 = new VPInstruction(VPInstruction::BranchOnCond, {I1}); - VPBasicBlock *VPPH = new VPBasicBlock("ph"); - VPBasicBlock *VPBB1 = new VPBasicBlock(); + VPlan &Plan = getPlan(); + VPBasicBlock *VPBB1 = Plan.getEntry(); VPBasicBlock *VPBB2 = new VPBasicBlock(); VPBasicBlock *VPBB3 = new VPBasicBlock(); @@ -195,12 +172,7 @@ TEST(VPVerifierTest, DuplicateSuccessorsInsideRegion) { VPBlockUtils::connectBlocks(VPBB1, R1); VPBB3->setParent(R1); - LLVMContext C; - auto *ScalarHeader = BasicBlock::Create(C, ""); - VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - 
VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB); - VPBlockUtils::connectBlocks(VPPH, VPBB1); - VPlan Plan(VPPH, ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader()); #if GTEST_HAS_STREAM_REDIRECTION ::testing::internal::CaptureStderr(); @@ -210,12 +182,11 @@ TEST(VPVerifierTest, DuplicateSuccessorsInsideRegion) { EXPECT_STREQ("Multiple instances of the same successor.\n", ::testing::internal::GetCapturedStderr().c_str()); #endif - delete ScalarHeader; } -TEST(VPVerifierTest, BlockOutsideRegionWithParent) { - VPBasicBlock *VPPH = new VPBasicBlock("ph"); - VPBasicBlock *VPBB1 = new VPBasicBlock(); +TEST_F(VPVerifierTest, BlockOutsideRegionWithParent) { + VPlan &Plan = getPlan(); + VPBasicBlock *VPBB1 = Plan.getEntry(); VPBasicBlock *VPBB2 = new VPBasicBlock(); VPInstruction *DefI = new VPInstruction(Instruction::Add, {}); @@ -228,12 +199,7 @@ TEST(VPVerifierTest, BlockOutsideRegionWithParent) { VPRegionBlock *R1 = new VPRegionBlock(VPBB2, VPBB2, "R1"); VPBlockUtils::connectBlocks(VPBB1, R1); - LLVMContext C; - auto *ScalarHeader = BasicBlock::Create(C, ""); - VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB); - VPBlockUtils::connectBlocks(VPPH, VPBB1); - VPlan Plan(VPPH, ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader()); VPBB1->setParent(R1); #if GTEST_HAS_STREAM_REDIRECTION @@ -244,7 +210,6 @@ TEST(VPVerifierTest, BlockOutsideRegionWithParent) { EXPECT_STREQ("Predecessor is not in the same region.\n", ::testing::internal::GetCapturedStderr().c_str()); #endif - delete ScalarHeader; } } // namespace From 1f90797f6a9d91d61e0f66b465b0467e4c66d0e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?kleines=20Filmr=C3=B6llchen?= <28656157+kleinesfilmroellchen@users.noreply.github.com> Date: Fri, 27 Dec 2024 20:14:29 +0100 Subject: [PATCH 118/567] [clangd] Allow specifying what headers are always included via "" or <> (#67749) Projects can now add config 
fragments like this to their .clangd: ```yaml Style: QuotedHeaders: "src/.*" AngledHeaders: ["path/sdk/.*", "third-party/.*"] ``` to force headers inserted via the --header-insertion=iwyu mode matching at least one of the regexes to have <> (AngledHeaders) or "" (QuotedHeaders) around them, respectively. For other headers (and in conflicting cases where both styles have a matching regex), the current system header detection remains. Fixes https://github.com/clangd/clangd/issues/1247 --- clang-tools-extra/clangd/CodeComplete.cpp | 14 +++-- clang-tools-extra/clangd/Config.h | 4 ++ clang-tools-extra/clangd/ConfigCompile.cpp | 49 +++++++++++++++++ clang-tools-extra/clangd/ConfigFragment.h | 17 ++++++ clang-tools-extra/clangd/ConfigYAML.cpp | 8 +++ clang-tools-extra/clangd/Headers.cpp | 34 ++++++++++-- clang-tools-extra/clangd/Headers.h | 10 +++- clang-tools-extra/clangd/IncludeCleaner.h | 1 - clang-tools-extra/clangd/ParsedAST.cpp | 3 +- .../clangd/unittests/CodeCompleteTests.cpp | 55 +++++++++++++++---- .../clangd/unittests/ConfigCompileTests.cpp | 38 +++++++++++++ .../clangd/unittests/ConfigYAMLTests.cpp | 8 ++- .../clangd/unittests/HeadersTests.cpp | 29 +++++++++- 13 files changed, 242 insertions(+), 28 deletions(-) diff --git a/clang-tools-extra/clangd/CodeComplete.cpp b/clang-tools-extra/clangd/CodeComplete.cpp index 2c2d5f0b5ac92..fb39b7b292242 100644 --- a/clang-tools-extra/clangd/CodeComplete.cpp +++ b/clang-tools-extra/clangd/CodeComplete.cpp @@ -807,8 +807,8 @@ SpecifiedScope getQueryScopes(CodeCompletionContext &CCContext, llvm::StringRef SpelledSpecifier = Lexer::getSourceText( CharSourceRange::getCharRange(SemaSpecifier->getRange()), CCSema.SourceMgr, clang::LangOptions()); - if (SpelledSpecifier.consume_front("::")) - Scopes.QueryScopes = {""}; + if (SpelledSpecifier.consume_front("::")) + Scopes.QueryScopes = {""}; Scopes.UnresolvedQualifier = std::string(SpelledSpecifier); // Sema excludes the trailing "::". 
if (!Scopes.UnresolvedQualifier->empty()) @@ -1604,7 +1604,7 @@ class CodeCompleteFlow { CompletionPrefix HeuristicPrefix; std::optional Filter; // Initialized once Sema runs. Range ReplacedRange; - std::vector QueryScopes; // Initialized once Sema runs. + std::vector QueryScopes; // Initialized once Sema runs. std::vector AccessibleScopes; // Initialized once Sema runs. // Initialized once QueryScopes is initialized, if there are scopes. std::optional ScopeProximity; @@ -1663,7 +1663,9 @@ class CodeCompleteFlow { Inserter.emplace( SemaCCInput.FileName, SemaCCInput.ParseInput.Contents, Style, SemaCCInput.ParseInput.CompileCommand.Directory, - &Recorder->CCSema->getPreprocessor().getHeaderSearchInfo()); + &Recorder->CCSema->getPreprocessor().getHeaderSearchInfo(), + Config::current().Style.QuotedHeaders, + Config::current().Style.AngledHeaders); for (const auto &Inc : Includes.MainFileIncludes) Inserter->addExisting(Inc); @@ -1746,7 +1748,9 @@ class CodeCompleteFlow { auto Style = getFormatStyleForFile(FileName, Content, TFS, false); // This will only insert verbatim headers. Inserter.emplace(FileName, Content, Style, - /*BuildDir=*/"", /*HeaderSearchInfo=*/nullptr); + /*BuildDir=*/"", /*HeaderSearchInfo=*/nullptr, + Config::current().Style.QuotedHeaders, + Config::current().Style.AngledHeaders); auto Identifiers = collectIdentifiers(Content, Style); std::vector IdentifierResults; diff --git a/clang-tools-extra/clangd/Config.h b/clang-tools-extra/clangd/Config.h index e174f7fabe344..586d031d58481 100644 --- a/clang-tools-extra/clangd/Config.h +++ b/clang-tools-extra/clangd/Config.h @@ -124,6 +124,10 @@ struct Config { // declarations, always spell out the whole name (with or without leading // ::). All nested namespaces are affected as well. std::vector FullyQualifiedNamespaces; + + // List of matcher functions for inserting certain headers with <> or "". 
+ std::vector> QuotedHeaders; + std::vector> AngledHeaders; } Style; /// controls the completion options for argument lists. diff --git a/clang-tools-extra/clangd/ConfigCompile.cpp b/clang-tools-extra/clangd/ConfigCompile.cpp index fb7692998d05c..aa2561e081047 100644 --- a/clang-tools-extra/clangd/ConfigCompile.cpp +++ b/clang-tools-extra/clangd/ConfigCompile.cpp @@ -482,6 +482,55 @@ struct FragmentCompiler { FullyQualifiedNamespaces.begin(), FullyQualifiedNamespaces.end()); }); } + auto QuotedFilter = compileHeaderRegexes(F.QuotedHeaders); + if (QuotedFilter.has_value()) { + Out.Apply.push_back( + [QuotedFilter = *QuotedFilter](const Params &, Config &C) { + C.Style.QuotedHeaders.emplace_back(QuotedFilter); + }); + } + auto AngledFilter = compileHeaderRegexes(F.AngledHeaders); + if (AngledFilter.has_value()) { + Out.Apply.push_back( + [AngledFilter = *AngledFilter](const Params &, Config &C) { + C.Style.AngledHeaders.emplace_back(AngledFilter); + }); + } + } + + auto compileHeaderRegexes(llvm::ArrayRef> HeaderPatterns) + -> std::optional> { + // TODO: Share this code with Diagnostics.Includes.IgnoreHeader +#ifdef CLANGD_PATH_CASE_INSENSITIVE + static llvm::Regex::RegexFlags Flags = llvm::Regex::IgnoreCase; +#else + static llvm::Regex::RegexFlags Flags = llvm::Regex::NoFlags; +#endif + auto Filters = std::make_shared>(); + for (auto &HeaderPattern : HeaderPatterns) { + // Anchor on the right. 
+ std::string AnchoredPattern = "(" + *HeaderPattern + ")$"; + llvm::Regex CompiledRegex(AnchoredPattern, Flags); + std::string RegexError; + if (!CompiledRegex.isValid(RegexError)) { + diag(Warning, + llvm::formatv("Invalid regular expression '{0}': {1}", + *HeaderPattern, RegexError) + .str(), + HeaderPattern.Range); + continue; + } + Filters->push_back(std::move(CompiledRegex)); + } + if (Filters->empty()) + return std::nullopt; + auto Filter = [Filters](llvm::StringRef Path) { + for (auto &Regex : *Filters) + if (Regex.match(Path)) + return true; + return false; + }; + return Filter; } void appendTidyCheckSpec(std::string &CurSpec, diff --git a/clang-tools-extra/clangd/ConfigFragment.h b/clang-tools-extra/clangd/ConfigFragment.h index 36f7d04231c41..9535b20253b13 100644 --- a/clang-tools-extra/clangd/ConfigFragment.h +++ b/clang-tools-extra/clangd/ConfigFragment.h @@ -301,6 +301,23 @@ struct Fragment { // ::). All nested namespaces are affected as well. // Affects availability of the AddUsing tweak. std::vector> FullyQualifiedNamespaces; + + /// List of regexes for headers that should always be included with a + /// ""-style include. By default, and in case of a conflict with + /// AngledHeaders (i.e. a header matches a regex in both QuotedHeaders and + /// AngledHeaders), system headers use <> and non-system headers use "". + /// These can match any suffix of the header file in question. + /// Matching is performed against the header text, not its absolute path + /// within the project. + std::vector> QuotedHeaders; + /// List of regexes for headers that should always be included with a + /// <>-style include. By default, and in case of a conflict with + /// AngledHeaders (i.e. a header matches a regex in both QuotedHeaders and + /// AngledHeaders), system headers use <> and non-system headers use "". + /// These can match any suffix of the header file in question. 
+ /// Matching is performed against the header text, not its absolute path + /// within the project. + std::vector> AngledHeaders; }; StyleBlock Style; diff --git a/clang-tools-extra/clangd/ConfigYAML.cpp b/clang-tools-extra/clangd/ConfigYAML.cpp index 32e028981d424..95cc5c1f9f1cf 100644 --- a/clang-tools-extra/clangd/ConfigYAML.cpp +++ b/clang-tools-extra/clangd/ConfigYAML.cpp @@ -116,6 +116,14 @@ class Parser { if (auto Values = scalarValues(N)) F.FullyQualifiedNamespaces = std::move(*Values); }); + Dict.handle("QuotedHeaders", [&](Node &N) { + if (auto Values = scalarValues(N)) + F.QuotedHeaders = std::move(*Values); + }); + Dict.handle("AngledHeaders", [&](Node &N) { + if (auto Values = scalarValues(N)) + F.AngledHeaders = std::move(*Values); + }); Dict.parse(N); } diff --git a/clang-tools-extra/clangd/Headers.cpp b/clang-tools-extra/clangd/Headers.cpp index b537417bd1056..0ffd9ee4d2751 100644 --- a/clang-tools-extra/clangd/Headers.cpp +++ b/clang-tools-extra/clangd/Headers.cpp @@ -9,6 +9,7 @@ #include "Headers.h" #include "Preamble.h" #include "SourceCode.h" +#include "support/Logger.h" #include "clang/Basic/SourceLocation.h" #include "clang/Basic/SourceManager.h" #include "clang/Frontend/CompilerInstance.h" @@ -30,8 +31,7 @@ namespace clangd { class IncludeStructure::RecordHeaders : public PPCallbacks { public: RecordHeaders(const CompilerInstance &CI, IncludeStructure *Out) - : SM(CI.getSourceManager()), - Out(Out) {} + : SM(CI.getSourceManager()), Out(Out) {} // Record existing #includes - both written and resolved paths. Only #includes // in the main file are collected. 
@@ -287,11 +287,11 @@ IncludeInserter::calculateIncludePath(const HeaderFile &InsertedHeader, assert(InsertedHeader.valid()); if (InsertedHeader.Verbatim) return InsertedHeader.File; - bool IsAngled = false; + bool IsAngledByDefault = false; std::string Suggested; if (HeaderSearchInfo) { Suggested = HeaderSearchInfo->suggestPathToFileForDiagnostics( - InsertedHeader.File, BuildDir, IncludingFile, &IsAngled); + InsertedHeader.File, BuildDir, IncludingFile, &IsAngledByDefault); } else { // Calculate include relative to including file only. StringRef IncludingDir = llvm::sys::path::parent_path(IncludingFile); @@ -304,9 +304,33 @@ IncludeInserter::calculateIncludePath(const HeaderFile &InsertedHeader, // FIXME: should we allow (some limited number of) "../header.h"? if (llvm::sys::path::is_absolute(Suggested)) return std::nullopt; + bool IsAngled = false; + for (auto Filter : AngledHeaders) { + if (Filter(Suggested)) { + IsAngled = true; + break; + } + } + bool IsQuoted = false; + for (auto Filter : QuotedHeaders) { + if (Filter(Suggested)) { + IsQuoted = true; + break; + } + } + // No filters apply, or both filters apply (a bug), use system default. + if (IsAngled == IsQuoted) { + // Probably a bug in the config regex. + if (IsAngled && IsQuoted) { + elog("Header '{0}' matches both quoted and angled regexes, default will " + "be used.", + Suggested); + } + IsAngled = IsAngledByDefault; + } if (IsAngled) Suggested = "<" + Suggested + ">"; - else + else // if (IsQuoted) Suggested = "\"" + Suggested + "\""; return Suggested; } diff --git a/clang-tools-extra/clangd/Headers.h b/clang-tools-extra/clangd/Headers.h index 41cf3de6bba35..b91179da253e9 100644 --- a/clang-tools-extra/clangd/Headers.h +++ b/clang-tools-extra/clangd/Headers.h @@ -33,6 +33,8 @@ namespace clang { namespace clangd { +using HeaderFilter = llvm::ArrayRef>; + /// Returns true if \p Include is literal include like "path" or . 
bool isLiteralInclude(llvm::StringRef Include); @@ -211,10 +213,12 @@ class IncludeInserter { // include path of non-verbatim header will not be shortened. IncludeInserter(StringRef FileName, StringRef Code, const format::FormatStyle &Style, StringRef BuildDir, - HeaderSearch *HeaderSearchInfo) + HeaderSearch *HeaderSearchInfo, HeaderFilter QuotedHeaders, + HeaderFilter AngledHeaders) : FileName(FileName), Code(Code), BuildDir(BuildDir), HeaderSearchInfo(HeaderSearchInfo), - Inserter(FileName, Code, Style.IncludeStyle) {} + Inserter(FileName, Code, Style.IncludeStyle), + QuotedHeaders(QuotedHeaders), AngledHeaders(AngledHeaders) {} void addExisting(const Inclusion &Inc); @@ -258,6 +262,8 @@ class IncludeInserter { HeaderSearch *HeaderSearchInfo = nullptr; llvm::StringSet<> IncludedHeaders; // Both written and resolved. tooling::HeaderIncludes Inserter; // Computers insertion replacement. + HeaderFilter QuotedHeaders; + HeaderFilter AngledHeaders; }; } // namespace clangd diff --git a/clang-tools-extra/clangd/IncludeCleaner.h b/clang-tools-extra/clangd/IncludeCleaner.h index a01146d14e3c1..3f6e3b2fd45b6 100644 --- a/clang-tools-extra/clangd/IncludeCleaner.h +++ b/clang-tools-extra/clangd/IncludeCleaner.h @@ -57,7 +57,6 @@ IncludeCleanerFindings computeIncludeCleanerFindings(ParsedAST &AST, bool AnalyzeAngledIncludes = false); -using HeaderFilter = llvm::ArrayRef>; std::vector issueIncludeCleanerDiagnostics(ParsedAST &AST, llvm::StringRef Code, const IncludeCleanerFindings &Findings, diff --git a/clang-tools-extra/clangd/ParsedAST.cpp b/clang-tools-extra/clangd/ParsedAST.cpp index 045d32afbc938..725cbeb154cb8 100644 --- a/clang-tools-extra/clangd/ParsedAST.cpp +++ b/clang-tools-extra/clangd/ParsedAST.cpp @@ -639,7 +639,8 @@ ParsedAST::build(llvm::StringRef Filename, const ParseInputs &Inputs, getFormatStyleForFile(Filename, Inputs.Contents, *Inputs.TFS, false); auto Inserter = std::make_shared( Filename, Inputs.Contents, Style, BuildDir.get(), - 
&Clang->getPreprocessor().getHeaderSearchInfo()); + &Clang->getPreprocessor().getHeaderSearchInfo(), + Cfg.Style.QuotedHeaders, Cfg.Style.AngledHeaders); ArrayRef MainFileIncludes; if (Preamble) { MainFileIncludes = Preamble->Includes.MainFileIncludes; diff --git a/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp b/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp index 3acacf496e77f..9d48a6e09fc77 100644 --- a/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp +++ b/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp @@ -920,6 +920,41 @@ TEST(CompletionTest, NoIncludeInsertionWhenDeclFoundInFile) { AllOf(named("Y"), Not(insertInclude())))); } +TEST(CompletionTest, IncludeInsertionRespectsQuotedAngledConfig) { + TestTU TU; + TU.ExtraArgs.push_back("-I" + testPath("sub")); + TU.AdditionalFiles["sub/bar.h"] = ""; + auto BarURI = URI::create(testPath("sub/bar.h")).toString(); + + Symbol Sym = cls("ns::X"); + Sym.CanonicalDeclaration.FileURI = BarURI.c_str(); + Sym.IncludeHeaders.emplace_back(BarURI, 1, Symbol::Include); + Annotations Test("int main() { ns::^ }"); + TU.Code = Test.code().str(); + auto Results = completions(TU, Test.point(), {Sym}); + // Default for a local path is quoted include + EXPECT_THAT(Results.Completions, + ElementsAre(AllOf(named("X"), insertInclude("\"bar.h\"")))); + { + Config C; + C.Style.AngledHeaders.push_back( + [](auto header) { return header == "bar.h"; }); + WithContextValue WithCfg(Config::Key, std::move(C)); + Results = completions(TU, Test.point(), {Sym}); + EXPECT_THAT(Results.Completions, + ElementsAre(AllOf(named("X"), insertInclude("")))); + } + { + Config C; + C.Style.QuotedHeaders.push_back( + [](auto header) { return header == "bar.h"; }); + WithContextValue WithCfg(Config::Key, std::move(C)); + Results = completions(TU, Test.point(), {Sym}); + EXPECT_THAT(Results.Completions, + ElementsAre(AllOf(named("X"), insertInclude("\"bar.h\"")))); + } +} + TEST(CompletionTest, 
IndexSuppressesPreambleCompletions) { Annotations Test(R"cpp( #include "bar.h" @@ -1138,8 +1173,8 @@ TEST(CodeCompleteTest, NoColonColonAtTheEnd) { } TEST(CompletionTests, EmptySnippetDoesNotCrash) { - // See https://github.com/clangd/clangd/issues/1216 - auto Results = completions(R"cpp( + // See https://github.com/clangd/clangd/issues/1216 + auto Results = completions(R"cpp( int main() { auto w = [&](auto &&f) { return f(f); }; auto f = w([&](auto &&f) { @@ -1155,18 +1190,18 @@ TEST(CompletionTests, EmptySnippetDoesNotCrash) { } TEST(CompletionTest, Issue1427Crash) { - // Need to provide main file signals to ensure that the branch in - // SymbolRelevanceSignals::computeASTSignals() that tries to - // compute a symbol ID is taken. - ASTSignals MainFileSignals; - CodeCompleteOptions Opts; - Opts.MainFileSignals = &MainFileSignals; - completions(R"cpp( + // Need to provide main file signals to ensure that the branch in + // SymbolRelevanceSignals::computeASTSignals() that tries to + // compute a symbol ID is taken. 
+ ASTSignals MainFileSignals; + CodeCompleteOptions Opts; + Opts.MainFileSignals = &MainFileSignals; + completions(R"cpp( auto f = []() { 1.0_^ }; )cpp", - {}, Opts); + {}, Opts); } TEST(CompletionTest, BacktrackCrashes) { diff --git a/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp b/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp index 4ecfdf0184ab4..179960a02cade 100644 --- a/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp +++ b/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp @@ -545,6 +545,44 @@ TEST_F(ConfigCompileTests, Style) { Frag.Style.FullyQualifiedNamespaces.push_back(std::string("bar")); EXPECT_TRUE(compileAndApply()); EXPECT_THAT(Conf.Style.FullyQualifiedNamespaces, ElementsAre("foo", "bar")); + + { + Frag = {}; + EXPECT_TRUE(Conf.Style.QuotedHeaders.empty()) + << Conf.Style.QuotedHeaders.size(); + Frag.Style.QuotedHeaders.push_back(Located("foo.h")); + Frag.Style.QuotedHeaders.push_back(Located(".*inc")); + EXPECT_TRUE(compileAndApply()); + auto HeaderFilter = [this](llvm::StringRef Path) { + for (auto &Filter : Conf.Style.QuotedHeaders) { + if (Filter(Path)) + return true; + } + return false; + }; + EXPECT_TRUE(HeaderFilter("foo.h")); + EXPECT_TRUE(HeaderFilter("prefix/foo.h")); + EXPECT_FALSE(HeaderFilter("bar.h")); + EXPECT_FALSE(HeaderFilter("foo.h/bar.h")); + } + + { + Frag = {}; + EXPECT_TRUE(Conf.Style.AngledHeaders.empty()) + << Conf.Style.AngledHeaders.size(); + Frag.Style.AngledHeaders.push_back(Located("foo.h")); + Frag.Style.AngledHeaders.push_back(Located(".*inc")); + EXPECT_TRUE(compileAndApply()); + auto HeaderFilter = [this](llvm::StringRef Path) { + for (auto &Filter : Conf.Style.AngledHeaders) { + if (Filter(Path)) + return true; + } + return false; + }; + EXPECT_TRUE(HeaderFilter("foo.h")); + EXPECT_FALSE(HeaderFilter("bar.h")); + } } } // namespace } // namespace config diff --git a/clang-tools-extra/clangd/unittests/ConfigYAMLTests.cpp 
b/clang-tools-extra/clangd/unittests/ConfigYAMLTests.cpp index 10d67dead342c..979d725461fd0 100644 --- a/clang-tools-extra/clangd/unittests/ConfigYAMLTests.cpp +++ b/clang-tools-extra/clangd/unittests/ConfigYAMLTests.cpp @@ -297,13 +297,19 @@ TEST(ParseYAML, Style) { CapturedDiags Diags; Annotations YAML(R"yaml( Style: - FullyQualifiedNamespaces: [foo, bar])yaml"); + FullyQualifiedNamespaces: [foo, bar] + AngledHeaders: ["foo", "bar"] + QuotedHeaders: ["baz", "baar"])yaml"); auto Results = Fragment::parseYAML(YAML.code(), "config.yaml", Diags.callback()); ASSERT_THAT(Diags.Diagnostics, IsEmpty()); ASSERT_EQ(Results.size(), 1u); EXPECT_THAT(Results[0].Style.FullyQualifiedNamespaces, ElementsAre(val("foo"), val("bar"))); + EXPECT_THAT(Results[0].Style.AngledHeaders, + ElementsAre(val("foo"), val("bar"))); + EXPECT_THAT(Results[0].Style.QuotedHeaders, + ElementsAre(val("baz"), val("baar"))); } } // namespace } // namespace config diff --git a/clang-tools-extra/clangd/unittests/HeadersTests.cpp b/clang-tools-extra/clangd/unittests/HeadersTests.cpp index dc6adaee11257..751383e3b4650 100644 --- a/clang-tools-extra/clangd/unittests/HeadersTests.cpp +++ b/clang-tools-extra/clangd/unittests/HeadersTests.cpp @@ -107,7 +107,8 @@ class HeadersTest : public ::testing::Test { IncludeInserter Inserter(MainFile, /*Code=*/"", format::getLLVMStyle(), CDB.getCompileCommand(MainFile)->Directory, - &Clang->getPreprocessor().getHeaderSearchInfo()); + &Clang->getPreprocessor().getHeaderSearchInfo(), + QuotedHeaders, AngledHeaders); for (const auto &Inc : Inclusions) Inserter.addExisting(Inc); auto Inserted = ToHeaderFile(Preferred); @@ -127,7 +128,8 @@ class HeadersTest : public ::testing::Test { IncludeInserter Inserter(MainFile, /*Code=*/"", format::getLLVMStyle(), CDB.getCompileCommand(MainFile)->Directory, - &Clang->getPreprocessor().getHeaderSearchInfo()); + &Clang->getPreprocessor().getHeaderSearchInfo(), + QuotedHeaders, AngledHeaders); auto Edit = Inserter.insert(VerbatimHeader, 
Directive); Action.EndSourceFile(); return Edit; @@ -139,6 +141,8 @@ class HeadersTest : public ::testing::Test { std::string Subdir = testPath("sub"); std::string SearchDirArg = (llvm::Twine("-I") + Subdir).str(); IgnoringDiagConsumer IgnoreDiags; + std::vector> QuotedHeaders; + std::vector> AngledHeaders; std::unique_ptr Clang; }; @@ -304,6 +308,9 @@ TEST_F(HeadersTest, InsertInclude) { std::string Path = testPath("sub/bar.h"); FS.Files[Path] = ""; EXPECT_EQ(calculate(Path), "\"bar.h\""); + + AngledHeaders.push_back([](auto Path) { return true; }); + EXPECT_EQ(calculate(Path), ""); } TEST_F(HeadersTest, DoNotInsertIfInSameFile) { @@ -326,6 +333,17 @@ TEST_F(HeadersTest, ShortenIncludesInSearchPath) { EXPECT_EQ(calculate(BarHeader), "\"sub/bar.h\""); } +TEST_F(HeadersTest, ShortenIncludesInSearchPathBracketed) { + AngledHeaders.push_back([](auto Path) { return true; }); + std::string BarHeader = testPath("sub/bar.h"); + EXPECT_EQ(calculate(BarHeader), ""); + + SearchDirArg = (llvm::Twine("-I") + Subdir + "/..").str(); + CDB.ExtraClangFlags = {SearchDirArg.c_str()}; + BarHeader = testPath("sub/bar.h"); + EXPECT_EQ(calculate(BarHeader), ""); +} + TEST_F(HeadersTest, ShortenedIncludeNotInSearchPath) { std::string BarHeader = llvm::sys::path::convert_to_slash(testPath("sub-2/bar.h")); @@ -338,6 +356,10 @@ TEST_F(HeadersTest, PreferredHeader) { std::string BazHeader = testPath("sub/baz.h"); EXPECT_EQ(calculate(BarHeader, BazHeader), "\"baz.h\""); + + AngledHeaders.push_back([](auto Path) { return true; }); + std::string BiffHeader = testPath("sub/biff.h"); + EXPECT_EQ(calculate(BarHeader, BiffHeader), ""); } TEST_F(HeadersTest, DontInsertDuplicatePreferred) { @@ -370,7 +392,8 @@ TEST_F(HeadersTest, PreferInserted) { TEST(Headers, NoHeaderSearchInfo) { std::string MainFile = testPath("main.cpp"); IncludeInserter Inserter(MainFile, /*Code=*/"", format::getLLVMStyle(), - /*BuildDir=*/"", /*HeaderSearchInfo=*/nullptr); + /*BuildDir=*/"", /*HeaderSearchInfo=*/nullptr, + 
/*QuotedHeaders=*/{}, /*AngledHeaders=*/{}); auto HeaderPath = testPath("sub/bar.h"); auto Inserting = HeaderFile{HeaderPath, /*Verbatim=*/false}; From 4d8f9594b20ae7063863e948907f4f8ef8ce07bd Mon Sep 17 00:00:00 2001 From: Zequan Wu Date: Fri, 27 Dec 2024 10:18:33 -0800 Subject: [PATCH 119/567] Revert "Reland "[LoopVectorizer] Add support for partial reductions" (#120721)" This reverts commit c858bf620c3ab2a4db53e84b9365b553c3ad1aa6 as it casuse optimization crash on -O2, see https://github.com/llvm/llvm-project/pull/120721#issuecomment-2563192057 --- .../llvm/Analysis/TargetTransformInfo.h | 39 - .../llvm/Analysis/TargetTransformInfoImpl.h | 9 - llvm/lib/Analysis/TargetTransformInfo.cpp | 17 - .../AArch64/AArch64TargetTransformInfo.h | 56 - .../Transforms/Vectorize/LoopVectorize.cpp | 136 +- .../Transforms/Vectorize/VPRecipeBuilder.h | 59 +- llvm/lib/Transforms/Vectorize/VPlan.h | 63 +- .../Transforms/Vectorize/VPlanAnalysis.cpp | 8 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 74 +- llvm/lib/Transforms/Vectorize/VPlanValue.h | 1 - .../AArch64/fully-unrolled-cost.ll | 20 +- .../partial-reduce-dot-product-epilogue.ll | 213 -- .../partial-reduce-dot-product-neon.ll | 1375 ------------- .../AArch64/partial-reduce-dot-product.ll | 1733 ----------------- .../AArch64/partial-reduce-no-dotprod.ll | 61 - .../LoopVectorize/AArch64/vplan-printing.ll | 93 - 16 files changed, 30 insertions(+), 3927 deletions(-) delete mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll delete mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll delete mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll delete mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll delete mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h 
b/llvm/include/llvm/Analysis/TargetTransformInfo.h index c6b846f96f162..752313ab15858 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -211,12 +211,6 @@ typedef TargetTransformInfo TTI; /// for IR-level transformations. class TargetTransformInfo { public: - enum PartialReductionExtendKind { PR_None, PR_SignExtend, PR_ZeroExtend }; - - /// Get the kind of extension that an instruction represents. - static PartialReductionExtendKind - getPartialReductionExtendKind(Instruction *I); - /// Construct a TTI object using a type implementing the \c Concept /// API below. /// @@ -1286,18 +1280,6 @@ class TargetTransformInfo { /// \return if target want to issue a prefetch in address space \p AS. bool shouldPrefetchAddressSpace(unsigned AS) const; - /// \return The cost of a partial reduction, which is a reduction from a - /// vector to another vector with fewer elements of larger size. They are - /// represented by the llvm.experimental.partial.reduce.add intrinsic, which - /// takes an accumulator and a binary operation operand that itself is fed by - /// two extends. An example of an operation that uses a partial reduction is a - /// dot product, which reduces a vector to another of 4 times fewer elements. - InstructionCost - getPartialReductionCost(unsigned Opcode, Type *InputType, Type *AccumType, - ElementCount VF, PartialReductionExtendKind OpAExtend, - PartialReductionExtendKind OpBExtend, - std::optional BinOp = std::nullopt) const; - /// \return The maximum interleave factor that any transform should try to /// perform for this target. This number depends on the level of parallelism /// and the number of execution units in the CPU. @@ -2125,18 +2107,6 @@ class TargetTransformInfo::Concept { /// \return if target want to issue a prefetch in address space \p AS. 
virtual bool shouldPrefetchAddressSpace(unsigned AS) const = 0; - /// \return The cost of a partial reduction, which is a reduction from a - /// vector to another vector with fewer elements of larger size. They are - /// represented by the llvm.experimental.partial.reduce.add intrinsic, which - /// takes an accumulator and a binary operation operand that itself is fed by - /// two extends. An example of an operation that uses a partial reduction is a - /// dot product, which reduces a vector to another of 4 times fewer elements. - virtual InstructionCost - getPartialReductionCost(unsigned Opcode, Type *InputType, Type *AccumType, - ElementCount VF, PartialReductionExtendKind OpAExtend, - PartialReductionExtendKind OpBExtend, - std::optional BinOp) const = 0; - virtual unsigned getMaxInterleaveFactor(ElementCount VF) = 0; virtual InstructionCost getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, @@ -2816,15 +2786,6 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { return Impl.shouldPrefetchAddressSpace(AS); } - InstructionCost getPartialReductionCost( - unsigned Opcode, Type *InputType, Type *AccumType, ElementCount VF, - PartialReductionExtendKind OpAExtend, - PartialReductionExtendKind OpBExtend, - std::optional BinOp = std::nullopt) const override { - return Impl.getPartialReductionCost(Opcode, InputType, AccumType, VF, - OpAExtend, OpBExtend, BinOp); - } - unsigned getMaxInterleaveFactor(ElementCount VF) override { return Impl.getMaxInterleaveFactor(VF); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 5fa0c46ad292d..9c74b2a0c31df 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -585,15 +585,6 @@ class TargetTransformInfoImplBase { bool enableWritePrefetching() const { return false; } bool shouldPrefetchAddressSpace(unsigned AS) const { return 
!AS; } - InstructionCost - getPartialReductionCost(unsigned Opcode, Type *InputType, Type *AccumType, - ElementCount VF, - TTI::PartialReductionExtendKind OpAExtend, - TTI::PartialReductionExtendKind OpBExtend, - std::optional BinOp = std::nullopt) const { - return InstructionCost::getInvalid(); - } - unsigned getMaxInterleaveFactor(ElementCount VF) const { return 1; } InstructionCost getArithmeticInstrCost( diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index c62e40db0c577..b32dffa9f0fe8 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -863,14 +863,6 @@ bool TargetTransformInfo::shouldPrefetchAddressSpace(unsigned AS) const { return TTIImpl->shouldPrefetchAddressSpace(AS); } -InstructionCost TargetTransformInfo::getPartialReductionCost( - unsigned Opcode, Type *InputType, Type *AccumType, ElementCount VF, - PartialReductionExtendKind OpAExtend, PartialReductionExtendKind OpBExtend, - std::optional BinOp) const { - return TTIImpl->getPartialReductionCost(Opcode, InputType, AccumType, VF, - OpAExtend, OpBExtend, BinOp); -} - unsigned TargetTransformInfo::getMaxInterleaveFactor(ElementCount VF) const { return TTIImpl->getMaxInterleaveFactor(VF); } @@ -982,15 +974,6 @@ InstructionCost TargetTransformInfo::getShuffleCost( return Cost; } -TargetTransformInfo::PartialReductionExtendKind -TargetTransformInfo::getPartialReductionExtendKind(Instruction *I) { - if (isa(I)) - return PR_SignExtend; - if (isa(I)) - return PR_ZeroExtend; - return PR_None; -} - TTI::CastContextHint TargetTransformInfo::getCastContextHint(const Instruction *I) { if (!I) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 2a31cacc203f4..83b86e31565e4 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -23,7 +23,6 @@ #include 
"llvm/CodeGen/BasicTTIImpl.h" #include "llvm/IR/Function.h" #include "llvm/IR/Intrinsics.h" -#include "llvm/Support/InstructionCost.h" #include #include @@ -358,61 +357,6 @@ class AArch64TTIImpl : public BasicTTIImplBase { return BaseT::isLegalNTLoad(DataType, Alignment); } - InstructionCost - getPartialReductionCost(unsigned Opcode, Type *InputType, Type *AccumType, - ElementCount VF, - TTI::PartialReductionExtendKind OpAExtend, - TTI::PartialReductionExtendKind OpBExtend, - std::optional BinOp) const { - - InstructionCost Invalid = InstructionCost::getInvalid(); - InstructionCost Cost(TTI::TCC_Basic); - - if (Opcode != Instruction::Add) - return Invalid; - - EVT InputEVT = EVT::getEVT(InputType); - EVT AccumEVT = EVT::getEVT(AccumType); - - if (VF.isScalable() && !ST->isSVEorStreamingSVEAvailable()) - return Invalid; - if (VF.isFixed() && (!ST->isNeonAvailable() || !ST->hasDotProd())) - return Invalid; - - if (InputEVT == MVT::i8) { - switch (VF.getKnownMinValue()) { - default: - return Invalid; - case 8: - if (AccumEVT == MVT::i32) - Cost *= 2; - else if (AccumEVT != MVT::i64) - return Invalid; - break; - case 16: - if (AccumEVT == MVT::i64) - Cost *= 2; - else if (AccumEVT != MVT::i32) - return Invalid; - break; - } - } else if (InputEVT == MVT::i16) { - // FIXME: Allow i32 accumulator but increase cost, as we would extend - // it to i64. 
- if (VF.getKnownMinValue() != 8 || AccumEVT != MVT::i64) - return Invalid; - } else - return Invalid; - - if (OpAExtend == TTI::PR_None || OpBExtend == TTI::PR_None) - return Invalid; - - if (!BinOp || (*BinOp) != Instruction::Mul) - return Invalid; - - return Cost; - } - bool enableOrderedReductions() const { return true; } InstructionCost getInterleavedMemoryOpCost( diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 355ff40ce770e..af6fce4b15190 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7605,10 +7605,6 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan, } continue; } - // The VPlan-based cost model is more accurate for partial reduction and - // comparing against the legacy cost isn't desirable. - if (isa(&R)) - return true; if (Instruction *UI = GetInstructionForCost(&R)) SeenInstrs.insert(UI); } @@ -8831,103 +8827,6 @@ VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I, return Recipe; } -/// Find all possible partial reductions in the loop and track all of those that -/// are valid so recipes can be formed later. -void VPRecipeBuilder::collectScaledReductions(VFRange &Range) { - // Find all possible partial reductions. - SmallVector, 1> - PartialReductionChains; - for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) - if (std::optional> Pair = - getScaledReduction(Phi, RdxDesc, Range)) - PartialReductionChains.push_back(*Pair); - - // A partial reduction is invalid if any of its extends are used by - // something that isn't another partial reduction. This is because the - // extends are intended to be lowered along with the reduction itself. - - // Build up a set of partial reduction bin ops for efficient use checking. 
- SmallSet PartialReductionBinOps; - for (const auto &[PartialRdx, _] : PartialReductionChains) - PartialReductionBinOps.insert(PartialRdx.BinOp); - - auto ExtendIsOnlyUsedByPartialReductions = - [&PartialReductionBinOps](Instruction *Extend) { - return all_of(Extend->users(), [&](const User *U) { - return PartialReductionBinOps.contains(U); - }); - }; - - // Check if each use of a chain's two extends is a partial reduction - // and only add those that don't have non-partial reduction users. - for (auto Pair : PartialReductionChains) { - PartialReductionChain Chain = Pair.first; - if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) && - ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB)) - ScaledReductionExitInstrs.insert(std::make_pair(Chain.Reduction, Pair)); - } -} - -std::optional> -VPRecipeBuilder::getScaledReduction(PHINode *PHI, - const RecurrenceDescriptor &Rdx, - VFRange &Range) { - // TODO: Allow scaling reductions when predicating. The select at - // the end of the loop chooses between the phi value and most recent - // reduction result, both of which have different VFs to the active lane - // mask when scaling. - if (CM.blockNeedsPredicationForAnyReason(Rdx.getLoopExitInstr()->getParent())) - return std::nullopt; - - auto *Update = dyn_cast(Rdx.getLoopExitInstr()); - if (!Update) - return std::nullopt; - - Value *Op = Update->getOperand(0); - if (Op == PHI) - Op = Update->getOperand(1); - - auto *BinOp = dyn_cast(Op); - if (!BinOp || !BinOp->hasOneUse()) - return std::nullopt; - - using namespace llvm::PatternMatch; - Value *A, *B; - if (!match(BinOp->getOperand(0), m_ZExtOrSExt(m_Value(A))) || - !match(BinOp->getOperand(1), m_ZExtOrSExt(m_Value(B)))) - return std::nullopt; - - Instruction *ExtA = cast(BinOp->getOperand(0)); - Instruction *ExtB = cast(BinOp->getOperand(1)); - - // Check that the extends extend from the same type. 
- if (A->getType() != B->getType()) - return std::nullopt; - - TTI::PartialReductionExtendKind OpAExtend = - TargetTransformInfo::getPartialReductionExtendKind(ExtA); - TTI::PartialReductionExtendKind OpBExtend = - TargetTransformInfo::getPartialReductionExtendKind(ExtB); - - PartialReductionChain Chain(Rdx.getLoopExitInstr(), ExtA, ExtB, BinOp); - - unsigned TargetScaleFactor = - PHI->getType()->getPrimitiveSizeInBits().getKnownScalarFactor( - A->getType()->getPrimitiveSizeInBits()); - - if (LoopVectorizationPlanner::getDecisionAndClampRange( - [&](ElementCount VF) { - InstructionCost Cost = TTI->getPartialReductionCost( - Update->getOpcode(), A->getType(), PHI->getType(), VF, - OpAExtend, OpBExtend, std::make_optional(BinOp->getOpcode())); - return Cost.isValid(); - }, - Range)) - return std::make_pair(Chain, TargetScaleFactor); - - return std::nullopt; -} - VPRecipeBase * VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, ArrayRef Operands, @@ -8952,14 +8851,9 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, Legal->getReductionVars().find(Phi)->second; assert(RdxDesc.getRecurrenceStartValue() == Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); - - // If the PHI is used by a partial reduction, set the scale factor. - std::optional> Pair = - getScaledReductionForInstr(RdxDesc.getLoopExitInstr()); - unsigned ScaleFactor = Pair ? Pair->second : 1; - PhiRecipe = new VPReductionPHIRecipe( - Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi), - CM.useOrderedReductions(RdxDesc), ScaleFactor); + PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, + CM.isInLoopReduction(Phi), + CM.useOrderedReductions(RdxDesc)); } else { // TODO: Currently fixed-order recurrences are modeled as chains of // first-order recurrences. 
If there are no users of the intermediate @@ -8991,9 +8885,6 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, if (isa(Instr) || isa(Instr)) return tryToWidenMemory(Instr, Operands, Range); - if (getScaledReductionForInstr(Instr)) - return tryToCreatePartialReduction(Instr, Operands); - if (!shouldWiden(Instr, Range)) return nullptr; @@ -9014,21 +8905,6 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, return tryToWiden(Instr, Operands, VPBB); } -VPRecipeBase * -VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction, - ArrayRef Operands) { - assert(Operands.size() == 2 && - "Unexpected number of operands for partial reduction"); - - VPValue *BinOp = Operands[0]; - VPValue *Phi = Operands[1]; - if (isa(BinOp->getDefiningRecipe())) - std::swap(BinOp, Phi); - - return new VPPartialReductionRecipe(Reduction->getOpcode(), BinOp, Phi, - Reduction); -} - void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF) { assert(OrigLoop->isInnermost() && "Inner loop expected."); @@ -9346,8 +9222,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None; addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL); - VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE, - Builder); + VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder); // --------------------------------------------------------------------------- // Pre-construction: record ingredients whose recipes we'll need to further @@ -9393,9 +9268,6 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { bool NeedsBlends = BB != HeaderBB && !BB->phis().empty(); return Legal->blockNeedsPredication(BB) || NeedsBlends; }); - - RecipeBuilder.collectScaledReductions(Range); - auto *MiddleVPBB = Plan->getMiddleBlock(); VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi(); for 
(BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index cf653e2d3e658..5d4a3b555981c 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -21,28 +21,8 @@ namespace llvm { class LoopVectorizationLegality; class LoopVectorizationCostModel; class TargetLibraryInfo; -class TargetTransformInfo; struct HistogramInfo; -/// A chain of instructions that form a partial reduction. -/// Designed to match: reduction_bin_op (bin_op (extend (A), (extend (B))), -/// accumulator). -struct PartialReductionChain { - PartialReductionChain(Instruction *Reduction, Instruction *ExtendA, - Instruction *ExtendB, Instruction *BinOp) - : Reduction(Reduction), ExtendA(ExtendA), ExtendB(ExtendB), BinOp(BinOp) { - } - /// The top-level binary operation that forms the reduction to a scalar - /// after the loop body. - Instruction *Reduction; - /// The extension of each of the inner binary operation's operands. - Instruction *ExtendA; - Instruction *ExtendB; - - /// The binary operation using the extends that is then reduced. - Instruction *BinOp; -}; - /// Helper class to create VPRecipies from IR instructions. class VPRecipeBuilder { /// The VPlan new recipes are added to. @@ -54,9 +34,6 @@ class VPRecipeBuilder { /// Target Library Info. const TargetLibraryInfo *TLI; - // Target Transform Info. - const TargetTransformInfo *TTI; - /// The legality analysis. LoopVectorizationLegality *Legal; @@ -86,11 +63,6 @@ class VPRecipeBuilder { /// created. SmallVector PhisToFix; - /// The set of reduction exit instructions that will be scaled to - /// a smaller VF via partial reductions, paired with the scaling factor. - DenseMap> - ScaledReductionExitInstrs; - /// Check if \p I can be widened at the start of \p Range and possibly /// decrease the range such that the returned value holds for the entire \p /// Range. 
The function should not be called for memory instructions or calls. @@ -139,35 +111,13 @@ class VPRecipeBuilder { VPHistogramRecipe *tryToWidenHistogram(const HistogramInfo *HI, ArrayRef Operands); - /// Examines reduction operations to see if the target can use a cheaper - /// operation with a wider per-iteration input VF and narrower PHI VF. - /// Returns null if no scaled reduction was found, otherwise a pair with a - /// struct containing reduction information and the scaling factor between the - /// number of elements in the input and output. - std::optional> - getScaledReduction(PHINode *PHI, const RecurrenceDescriptor &Rdx, - VFRange &Range); - public: VPRecipeBuilder(VPlan &Plan, Loop *OrigLoop, const TargetLibraryInfo *TLI, - const TargetTransformInfo *TTI, LoopVectorizationLegality *Legal, LoopVectorizationCostModel &CM, PredicatedScalarEvolution &PSE, VPBuilder &Builder) - : Plan(Plan), OrigLoop(OrigLoop), TLI(TLI), TTI(TTI), Legal(Legal), - CM(CM), PSE(PSE), Builder(Builder) {} - - std::optional> - getScaledReductionForInstr(const Instruction *ExitInst) { - auto It = ScaledReductionExitInstrs.find(ExitInst); - return It == ScaledReductionExitInstrs.end() - ? std::nullopt - : std::make_optional(It->second); - } - - /// Find all possible partial reductions in the loop and track all of those - /// that are valid so recipes can be formed later. - void collectScaledReductions(VFRange &Range); + : Plan(Plan), OrigLoop(OrigLoop), TLI(TLI), Legal(Legal), CM(CM), + PSE(PSE), Builder(Builder) {} /// Create and return a widened recipe for \p I if one can be created within /// the given VF \p Range. @@ -175,11 +125,6 @@ class VPRecipeBuilder { ArrayRef Operands, VFRange &Range, VPBasicBlock *VPBB); - /// Create and return a partial reduction recipe for a reduction instruction - /// along with binary operation and reduction phi operands. 
- VPRecipeBase *tryToCreatePartialReduction(Instruction *Reduction, - ArrayRef Operands); - /// Set the recipe created for given ingredient. void setRecipe(Instruction *I, VPRecipeBase *R) { assert(!Ingredient2Recipe.contains(I) && diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index e62ace1980aa7..404202b7f3130 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -889,7 +889,6 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue { case VPRecipeBase::VPWidenPointerInductionSC: case VPRecipeBase::VPReductionPHISC: case VPRecipeBase::VPScalarCastSC: - case VPRecipeBase::VPPartialReductionSC: return true; case VPRecipeBase::VPBranchOnMaskSC: case VPRecipeBase::VPInterleaveSC: @@ -2377,28 +2376,23 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe, /// The phi is part of an ordered reduction. Requires IsInLoop to be true. bool IsOrdered; - /// When expanding the reduction PHI, the plan's VF element count is divided - /// by this factor to form the reduction phi's VF. - unsigned VFScaleFactor = 1; - public: /// Create a new VPReductionPHIRecipe for the reduction \p Phi described by \p /// RdxDesc. 
VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc, VPValue &Start, bool IsInLoop = false, - bool IsOrdered = false, unsigned VFScaleFactor = 1) + bool IsOrdered = false) : VPHeaderPHIRecipe(VPDef::VPReductionPHISC, Phi, &Start), - RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered), - VFScaleFactor(VFScaleFactor) { + RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered) { assert((!IsOrdered || IsInLoop) && "IsOrdered requires IsInLoop"); } ~VPReductionPHIRecipe() override = default; VPReductionPHIRecipe *clone() override { - auto *R = new VPReductionPHIRecipe(cast(getUnderlyingInstr()), - RdxDesc, *getOperand(0), IsInLoop, - IsOrdered, VFScaleFactor); + auto *R = + new VPReductionPHIRecipe(cast(getUnderlyingInstr()), RdxDesc, + *getOperand(0), IsInLoop, IsOrdered); R->addOperand(getBackedgeValue()); return R; } @@ -2429,51 +2423,6 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe, bool isInLoop() const { return IsInLoop; } }; -/// A recipe for forming partial reductions. In the loop, an accumulator and -/// vector operand are added together and passed to the next iteration as the -/// next accumulator. After the loop body, the accumulator is reduced to a -/// scalar value. 
-class VPPartialReductionRecipe : public VPSingleDefRecipe { - unsigned Opcode; - -public: - VPPartialReductionRecipe(Instruction *ReductionInst, VPValue *Op0, - VPValue *Op1) - : VPPartialReductionRecipe(ReductionInst->getOpcode(), Op0, Op1, - ReductionInst) {} - VPPartialReductionRecipe(unsigned Opcode, VPValue *Op0, VPValue *Op1, - Instruction *ReductionInst = nullptr) - : VPSingleDefRecipe(VPDef::VPPartialReductionSC, - ArrayRef({Op0, Op1}), ReductionInst), - Opcode(Opcode) { - assert(isa(getOperand(1)->getDefiningRecipe()) && - "Unexpected operand order for partial reduction recipe"); - } - ~VPPartialReductionRecipe() override = default; - - VPPartialReductionRecipe *clone() override { - return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1)); - } - - VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC) - - /// Generate the reduction in the loop. - void execute(VPTransformState &State) override; - - /// Return the cost of this VPPartialReductionRecipe. - InstructionCost computeCost(ElementCount VF, - VPCostContext &Ctx) const override; - - /// Get the binary op's opcode. - unsigned getOpcode() const { return Opcode; } - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; -#endif -}; - /// A recipe for vectorizing a phi-node as a sequence of mask-based select /// instructions. class VPBlendRecipe : public VPSingleDefRecipe { @@ -2683,7 +2632,7 @@ class VPReductionRecipe : public VPSingleDefRecipe { return R && classof(R); } - /// Generate the reduction in the loop. + /// Generate the reduction in the loop void execute(VPTransformState &State) override; /// Return the cost of VPReductionRecipe. 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 8fea2c6fd33b6..35497a7431f76 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -231,10 +231,10 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) { [](const auto *R) { return R->getScalarType(); }) .Case([this](const VPRecipeBase *R) { - return inferScalarType(R->getOperand(0)); - }) + VPReverseVectorPointerRecipe, VPWidenCanonicalIVRecipe>( + [this](const VPRecipeBase *R) { + return inferScalarType(R->getOperand(0)); + }) .Case( diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index aa1294f82c5f0..7038e52a643c4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -292,66 +292,6 @@ InstructionCost VPRecipeBase::computeCost(ElementCount VF, llvm_unreachable("subclasses should implement computeCost"); } -InstructionCost -VPPartialReductionRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { - std::optional Opcode = std::nullopt; - VPRecipeBase *BinOpR = getOperand(0)->getDefiningRecipe(); - if (auto *WidenR = dyn_cast(BinOpR)) - Opcode = std::make_optional(WidenR->getOpcode()); - - VPRecipeBase *ExtAR = BinOpR->getOperand(0)->getDefiningRecipe(); - VPRecipeBase *ExtBR = BinOpR->getOperand(1)->getDefiningRecipe(); - - auto GetExtendKind = [](VPRecipeBase *R) { - auto *WidenCastR = dyn_cast(R); - if (!WidenCastR) - return TargetTransformInfo::PR_None; - if (WidenCastR->getOpcode() == Instruction::CastOps::ZExt) - return TargetTransformInfo::PR_ZeroExtend; - if (WidenCastR->getOpcode() == Instruction::CastOps::SExt) - return TargetTransformInfo::PR_SignExtend; - return TargetTransformInfo::PR_None; - }; - - auto *PhiType = Ctx.Types.inferScalarType(getOperand(1)); - auto *ExtTy = Ctx.Types.inferScalarType(ExtAR->getOperand(0)); - - return 
Ctx.TTI.getPartialReductionCost(getOpcode(), ExtTy, PhiType, VF, - GetExtendKind(ExtAR), - GetExtendKind(ExtBR), Opcode); -} - -void VPPartialReductionRecipe::execute(VPTransformState &State) { - State.setDebugLocFrom(getDebugLoc()); - auto &Builder = State.Builder; - - assert(getOpcode() == Instruction::Add && - "Unhandled partial reduction opcode"); - - Value *BinOpVal = State.get(getOperand(0)); - Value *PhiVal = State.get(getOperand(1)); - assert(PhiVal && BinOpVal && "Phi and Mul must be set"); - - Type *RetTy = PhiVal->getType(); - - CallInst *V = Builder.CreateIntrinsic( - RetTy, Intrinsic::experimental_vector_partial_reduce_add, - {PhiVal, BinOpVal}, nullptr, "partial.reduce"); - - State.set(this, V); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "PARTIAL-REDUCE "; - printAsOperand(O, SlotTracker); - O << " = " << Instruction::getOpcodeName(getOpcode()) << " "; - printOperands(O, SlotTracker); -} -#endif - FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const { assert(OpType == OperationType::FPMathOp && "recipe doesn't have fast math flags"); @@ -3444,10 +3384,6 @@ void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent, void VPReductionPHIRecipe::execute(VPTransformState &State) { auto &Builder = State.Builder; - // If this phi is fed by a scaled reduction then it should output a - // vector with fewer elements than the VF. - ElementCount VF = State.VF.divideCoefficientBy(VFScaleFactor); - // Reductions do not have to start at zero. They can start with // any loop invariant values. VPValue *StartVPV = getStartValue(); @@ -3458,8 +3394,8 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) { // stage #1: We create a new vector PHI node with no incoming edges. We'll use // this value when we vectorize all of the instructions that use the PHI. 
bool ScalarPHI = State.VF.isScalar() || IsInLoop; - Type *VecTy = - ScalarPHI ? StartV->getType() : VectorType::get(StartV->getType(), VF); + Type *VecTy = ScalarPHI ? StartV->getType() + : VectorType::get(StartV->getType(), State.VF); BasicBlock *HeaderBB = State.CFG.PrevBB; assert(State.CurrentVectorLoop->getHeader() == HeaderBB && @@ -3509,13 +3445,13 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) { // Create start and identity vector values for the reduction in the // preheader. // TODO: Introduce recipes in VPlan preheader to create initial values. - Iden = Builder.CreateVectorSplat(VF, Iden); + Iden = Builder.CreateVectorSplat(State.VF, Iden); IRBuilderBase::InsertPointGuard IPBuilder(Builder); Builder.SetInsertPoint(VectorPH->getTerminator()); Constant *Zero = Builder.getInt32(0); StartV = Builder.CreateInsertElement(Iden, StartV, Zero); } else { - Iden = Builder.CreateVectorSplat(VF, Iden); + Iden = Builder.CreateVectorSplat(State.VF, Iden); } } } @@ -3533,8 +3469,6 @@ void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent, printAsOperand(O, SlotTracker); O << " = phi "; printOperands(O, SlotTracker); - if (VFScaleFactor != 1) - O << " (VF scaled by 1/" << VFScaleFactor << ")"; } #endif diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 7aaf4002b8b3e..957a602091c73 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -329,7 +329,6 @@ class VPDef { VPInterleaveSC, VPReductionEVLSC, VPReductionSC, - VPPartialReductionSC, VPReplicateSC, VPScalarCastSC, VPScalarIVStepsSC, diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll index c3e8c895fce24..1cfb507a74344 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll @@ -11,10 +11,10 @@ 
define i64 @test(ptr %a, ptr %b) #0 { ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16 ; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK: Cost for VF 8: 30 +; CHECK: Cost for VF 8: 26 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK: Cost for VF 16: 56 +; CHECK: Cost for VF 16: 48 ; CHECK: LV: Selecting VF: 16 entry: br label %for.body @@ -31,8 +31,8 @@ for.body: ; preds = %entry, %for.body %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %i.iv %1 = load i8, ptr %arrayidx2, align 1 %conv3 = zext i8 %1 to i64 - %div = udiv i64 %conv3, %conv - %add = add i64 %div, %sum + %mul = mul nuw nsw i64 %conv3, %conv + %add = add i64 %mul, %sum %i.iv.next = add nuw nsw i64 %i.iv, 1 %exitcond.not = icmp eq i64 %i.iv.next, 16 br i1 %exitcond.not, label %exit, label %for.body @@ -45,11 +45,11 @@ define i64 @test_external_iv_user(ptr %a, ptr %b) #0 { ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16 ; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK: Cost for VF 8: 30 +; CHECK: Cost for VF 8: 26 ; CHECK-NEXT: Cost of 1 for VF 16: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK: Cost for VF 16: 57 +; CHECK: Cost for VF 16: 49 ; CHECK: LV: Selecting VF: vscale x 2 
entry: br label %for.body @@ -64,8 +64,8 @@ for.body: ; preds = %entry, %for.body %arrayidx2 = getelementptr inbounds nuw i8, ptr %b, i64 %i.iv.next %1 = load i8, ptr %arrayidx2, align 1 %conv3 = zext i8 %1 to i64 - %div = udiv i64 %conv3, %conv - %add = add i64 %sum, %div + %mul = mul nuw nsw i64 %conv3, %conv + %add = add i64 %sum, %mul %exitcond.not = icmp eq i64 %i.iv.next, 16 br i1 %exitcond.not, label %exit, label %for.body @@ -82,11 +82,11 @@ define i64 @test_two_ivs(ptr %a, ptr %b, i64 %start) #0 { ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ] ; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16 ; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK: Cost for VF 8: 24 +; CHECK: Cost for VF 8: 27 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK: Cost for VF 16: 42 +; CHECK: Cost for VF 16: 48 ; CHECK: LV: Selecting VF: 16 entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll deleted file mode 100644 index 5cc00daab7ce5..0000000000000 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll +++ /dev/null @@ -1,213 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -mattr=+dotprod -passes=loop-vectorize -force-vector-interleave=1 -S < %s | FileCheck %s - -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -target triple = 
"aarch64-none-unknown-elf" - -define i32 @dotp(ptr %a, ptr %b) #0 { -; CHECK-LABEL: define i32 @dotp( -; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: iter.check: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] -; CHECK: vector.main.loop.iter.check: -; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 -; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 -; CHECK-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP8]], [[TMP5]] -; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP9]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; 
CHECK: middle.block: -; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) -; CHECK-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] -; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4 -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP13]] -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] -; CHECK: vec.epilog.ph: -; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP15]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] -; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0 -; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] -; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi [ [[TMP18]], [[VEC_EPILOG_PH]] ], [ [[TMP27:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX2]], 0 -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP21]], align 1 -; CHECK-NEXT: [[TMP22:%.*]] = zext [[WIDE_LOAD4]] to -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP19]] -; CHECK-NEXT: 
[[TMP24:%.*]] = getelementptr i8, ptr [[TMP23]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP24]], align 1 -; CHECK-NEXT: [[TMP25:%.*]] = zext [[WIDE_LOAD5]] to -; CHECK-NEXT: [[TMP26:%.*]] = mul [[TMP25]], [[TMP22]] -; CHECK-NEXT: [[TMP27]] = add [[TMP26]], [[VEC_PHI3]] -; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX2]], [[TMP17]] -; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] -; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP27]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT]], label [[VEC_EPILOG_SCALAR_PH]] -; -entry: - br label %for.body - -for.body: ; preds = %for.body, %entry - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] - %gep.a = getelementptr i8, ptr %a, i64 %iv - %load.a = load i8, ptr %gep.a, align 1 - %ext.a = zext i8 %load.a to i32 - %gep.b = getelementptr i8, ptr %b, i64 %iv - %load.b = load i8, ptr %gep.b, align 1 - %ext.b = zext i8 %load.b to i32 - %mul = mul i32 %ext.b, %ext.a - %add = add i32 %mul, %accum - %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 0 - br i1 %exitcond.not, label %for.exit, label %for.body - -for.exit: ; preds = %for.body - ret i32 %add -} - -define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 { -; CHECK-LABEL: define void @dotp_small_epilogue_vf( -; CHECK-SAME: i64 [[IDX_NEG:%.*]], i8 [[A:%.*]]) #[[ATTR1:[0-9]+]] { -; CHECK-NEXT: iter.check: -; CHECK-NEXT: [[TMP0:%.*]] = sub i64 1, [[IDX_NEG]] -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[ENTRY:%.*]] -; CHECK: vector.main.loop.iter.check: -; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult 
i64 [[TMP0]], 16 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16 -; CHECK-NEXT: [[IV_NEXT:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = sext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i64> -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr null, align 1 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <16 x i8> poison, i8 [[TMP2]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT2]], <16 x i8> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[BROADCAST_SPLAT3]] to <16 x i64> -; CHECK-NEXT: [[TMP4:%.*]] = mul <16 x i64> [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP4]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[IV_NEXT]] -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] -; CHECK: middle.block: -; CHECK-NEXT: [[ADD:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[IV_NEXT]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY:%.*]] -; CHECK: 
vec.epilog.iter.check: -; CHECK-NEXT: [[IND_END6:%.*]] = add i64 [[IDX_NEG]], [[IV_NEXT]] -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[IV_NEXT]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] -; CHECK: vec.epilog.ph: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT]], [[WHILE_BODY]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[ACCUM:%.*]] = phi i64 [ [[ADD]], [[WHILE_BODY]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[N_MOD_VF4:%.*]] = urem i64 [[TMP0]], 8 -; CHECK-NEXT: [[N_VEC5:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF4]] -; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[IDX_NEG]], [[N_VEC5]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT7]], <8 x i8> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = sext <8 x i8> [[BROADCAST_SPLAT8]] to <8 x i64> -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <1 x i64> zeroinitializer, i64 [[ACCUM]], i32 0 -; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] -; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ [[IV]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI10:%.*]] = phi <1 x i64> [ [[TMP8]], [[VEC_EPILOG_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr null, align 1 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <8 x i8> poison, i8 [[TMP9]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT11]], <8 x i8> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = sext <8 x i8> [[BROADCAST_SPLAT12]] to <8 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = mul <8 x i64> [[TMP10]], [[TMP7]] -; CHECK-NEXT: [[PARTIAL_REDUCE13]] = call <1 x i64> 
@llvm.experimental.vector.partial.reduce.add.v1i64.v8i64(<1 x i64> [[VEC_PHI10]], <8 x i64> [[TMP11]]) -; CHECK-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX9]], 8 -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC5]] -; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] -; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> [[PARTIAL_REDUCE13]]) -; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]] -; CHECK-NEXT: br i1 [[CMP_N15]], label [[WHILE_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] -; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IDX_NEG]], [[ITER_CHECK:%.*]] ], [ [[IND_END6]], [[WHILE_BODY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL16:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[IV_NEXT]], [[WHILE_BODY]] ] -; CHECK-NEXT: [[BC_MERGE_RDX17:%.*]] = phi i64 [ [[TMP13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[ADD]], [[WHILE_BODY]] ] -; CHECK-NEXT: br label [[WHILE_BODY1:%.*]] -; CHECK: while.body: -; CHECK-NEXT: [[IV_NEG:%.*]] = phi i64 [ [[IV_NEG_NEXT:%.*]], [[WHILE_BODY1]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] -; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[IV_NEXT1:%.*]], [[WHILE_BODY1]] ], [ [[BC_RESUME_VAL16]], [[VEC_EPILOG_SCALAR_PH]] ] -; CHECK-NEXT: [[ACCUM1:%.*]] = phi i64 [ [[ADD1:%.*]], [[WHILE_BODY1]] ], [ [[BC_MERGE_RDX17]], [[VEC_EPILOG_SCALAR_PH]] ] -; CHECK-NEXT: [[IV_NEG_NEXT]] = add i64 [[IV_NEG]], 1 -; CHECK-NEXT: [[EXT_A:%.*]] = sext i8 [[A]] to i64 -; CHECK-NEXT: [[IV_NEXT1]] = add i64 [[IV1]], 1 -; CHECK-NEXT: [[B:%.*]] = load i8, ptr null, align 1 -; CHECK-NEXT: [[EXT_B:%.*]] = sext i8 [[B]] to i64 -; CHECK-NEXT: [[MUL:%.*]] = mul i64 [[EXT_B]], [[EXT_A]] -; CHECK-NEXT: [[ADD1]] = add i64 [[MUL]], [[ACCUM1]] -; CHECK-NEXT: 
[[CMP_IV_NEG:%.*]] = icmp ugt i64 [[IV_NEG]], 0 -; CHECK-NEXT: [[CMP_IV:%.*]] = icmp ne i64 [[IV1]], -1 -; CHECK-NEXT: [[EXITCOND:%.*]] = and i1 [[CMP_IV_NEG]], [[CMP_IV]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_BODY1]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]] -; CHECK: while.end.loopexit: -; CHECK-NEXT: [[RESULT:%.*]] = phi i64 [ [[ADD1]], [[WHILE_BODY1]] ], [ [[ADD]], [[MIDDLE_BLOCK]] ], [ [[TMP13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret void -; -entry: - br label %while.body - -while.body: ; preds = %while.body, %entry - %iv.neg = phi i64 [ %iv.neg.next, %while.body ], [ %idx.neg, %entry ] - %iv = phi i64 [ %iv.next, %while.body ], [ 0, %entry ] - %accum = phi i64 [ %add, %while.body ], [ 0, %entry ] - %iv.neg.next = add i64 %iv.neg, 1 - %ext.a = sext i8 %a to i64 - %iv.next = add i64 %iv, 1 - %b = load i8, ptr null, align 1 - %ext.b = sext i8 %b to i64 - %mul = mul i64 %ext.b, %ext.a - %add = add i64 %mul, %accum - %cmp.iv.neg = icmp ugt i64 %iv.neg, 0 - %cmp.iv = icmp ne i64 %iv, -1 - %exitcond = and i1 %cmp.iv.neg, %cmp.iv - br i1 %exitcond, label %while.body, label %while.end.loopexit - -while.end.loopexit: ; preds = %while.body - %result = phi i64 [ %add, %while.body ] - ret void -} - -attributes #0 = { vscale_range(1,16) "target-features"="+sve" } -attributes #1 = { "target-cpu"="apple-m1" } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll deleted file mode 100644 index c66695f1b50f0..0000000000000 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll +++ /dev/null @@ -1,1375 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon,+dotprod -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVE1 -; RUN: opt 
-passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon,+dotprod -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVED -; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon,+dotprod -force-vector-interleave=1 -vectorizer-maximize-bandwidth -S < %s | FileCheck %s --check-prefixes=CHECK-MAXBW - -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -target triple = "aarch64-none-unknown-elf" - -define i32 @dotp(ptr %a, ptr %b) { -; CHECK-INTERLEAVE1-LABEL: define i32 @dotp( -; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-INTERLEAVE1-NEXT: entry: -; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-INTERLEAVE1: vector.body: -; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] -; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] -; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> 
@llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]]) -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) -; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] -; -; CHECK-INTERLEAVED-LABEL: define i32 @dotp( -; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-INTERLEAVED-NEXT: entry: -; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-INTERLEAVED: vector.body: -; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] 
to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] -; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]] -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]]) -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) -; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] -; -; CHECK-MAXBW-LABEL: define i32 @dotp( -; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-MAXBW-NEXT: entry: -; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-MAXBW: 
vector.body: -; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] -; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 -; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]]) -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-MAXBW-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) -; CHECK-MAXBW-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] -; -entry: - br label %for.body - -for.body: ; preds = %for.body, %entry - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] - %gep.a = getelementptr i8, ptr %a, i64 %iv - %load.a = load i8, ptr %gep.a, align 1 - %ext.a = zext i8 %load.a to i32 - %gep.b = getelementptr i8, ptr %b, i64 %iv - %load.b = load i8, ptr %gep.b, 
align 1 - %ext.b = zext i8 %load.b to i32 - %mul = mul i32 %ext.b, %ext.a - %add = add i32 %mul, %accum - %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 0 - br i1 %exitcond.not, label %for.exit, label %for.body - -for.exit: ; preds = %for.body - ret i32 %add -} - -define i32 @not_dotp_different_types(ptr %a, ptr %b) { -; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_different_types( -; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-INTERLEAVE1-NEXT: entry: -; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-INTERLEAVE1: vector.body: -; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP69:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 -; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 -; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 -; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 -; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 -; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 -; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 -; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 -; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 -; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 -; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 -; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 -; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 -; 
CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] -; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] -; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] -; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] -; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]] -; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]] -; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]] -; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]] -; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]] -; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] -; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]] -; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]] -; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]] -; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]] -; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]] -; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = load i16, ptr [[TMP19]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = load i16, ptr [[TMP20]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = load i16, ptr [[TMP21]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = load i16, ptr [[TMP22]], 
align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = load i16, ptr [[TMP23]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP40:%.*]] = load i16, ptr [[TMP24]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP41:%.*]] = load i16, ptr [[TMP25]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = load i16, ptr [[TMP26]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP43:%.*]] = load i16, ptr [[TMP27]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP44:%.*]] = load i16, ptr [[TMP28]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = load i16, ptr [[TMP29]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP46:%.*]] = load i16, ptr [[TMP30]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP47:%.*]] = load i16, ptr [[TMP31]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP48:%.*]] = load i16, ptr [[TMP32]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP49:%.*]] = load i16, ptr [[TMP33]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP50:%.*]] = load i16, ptr [[TMP34]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP51:%.*]] = insertelement <16 x i16> poison, i16 [[TMP35]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[TMP52:%.*]] = insertelement <16 x i16> [[TMP51]], i16 [[TMP36]], i32 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP53:%.*]] = insertelement <16 x i16> [[TMP52]], i16 [[TMP37]], i32 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP54:%.*]] = insertelement <16 x i16> [[TMP53]], i16 [[TMP38]], i32 3 -; CHECK-INTERLEAVE1-NEXT: [[TMP55:%.*]] = insertelement <16 x i16> [[TMP54]], i16 [[TMP39]], i32 4 -; CHECK-INTERLEAVE1-NEXT: [[TMP56:%.*]] = insertelement <16 x i16> [[TMP55]], i16 [[TMP40]], i32 5 -; CHECK-INTERLEAVE1-NEXT: [[TMP57:%.*]] = insertelement <16 x i16> [[TMP56]], i16 [[TMP41]], i32 6 -; CHECK-INTERLEAVE1-NEXT: [[TMP58:%.*]] = insertelement <16 x i16> [[TMP57]], i16 [[TMP42]], i32 7 -; CHECK-INTERLEAVE1-NEXT: [[TMP59:%.*]] = insertelement <16 x i16> [[TMP58]], i16 [[TMP43]], i32 8 -; CHECK-INTERLEAVE1-NEXT: [[TMP60:%.*]] = insertelement <16 x i16> [[TMP59]], i16 [[TMP44]], i32 9 -; CHECK-INTERLEAVE1-NEXT: [[TMP61:%.*]] = insertelement <16 x i16> [[TMP60]], i16 
[[TMP45]], i32 10 -; CHECK-INTERLEAVE1-NEXT: [[TMP62:%.*]] = insertelement <16 x i16> [[TMP61]], i16 [[TMP46]], i32 11 -; CHECK-INTERLEAVE1-NEXT: [[TMP63:%.*]] = insertelement <16 x i16> [[TMP62]], i16 [[TMP47]], i32 12 -; CHECK-INTERLEAVE1-NEXT: [[TMP64:%.*]] = insertelement <16 x i16> [[TMP63]], i16 [[TMP48]], i32 13 -; CHECK-INTERLEAVE1-NEXT: [[TMP65:%.*]] = insertelement <16 x i16> [[TMP64]], i16 [[TMP49]], i32 14 -; CHECK-INTERLEAVE1-NEXT: [[TMP66:%.*]] = insertelement <16 x i16> [[TMP65]], i16 [[TMP50]], i32 15 -; CHECK-INTERLEAVE1-NEXT: [[TMP67:%.*]] = zext <16 x i16> [[TMP66]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP68:%.*]] = mul <16 x i32> [[TMP67]], [[TMP18]] -; CHECK-INTERLEAVE1-NEXT: [[TMP69]] = add <16 x i32> [[TMP68]], [[VEC_PHI]] -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-INTERLEAVE1-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; -; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_different_types( -; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-INTERLEAVED-NEXT: entry: -; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-INTERLEAVED: vector.body: -; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP137:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP138:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 -; 
CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 -; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 -; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 -; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 -; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 16 -; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 17 -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 18 -; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], 19 -; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 20 -; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 21 -; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 22 -; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 23 -; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = add i64 [[INDEX]], 24 -; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], 25 -; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = add i64 [[INDEX]], 26 -; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = add i64 [[INDEX]], 27 -; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = add i64 [[INDEX]], 28 -; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = add i64 [[INDEX]], 29 -; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = add i64 [[INDEX]], 30 -; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 31 -; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] -; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = 
getelementptr i8, ptr [[TMP32]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[TMP32]], i32 16 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP33]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP34]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] -; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] -; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] -; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]] -; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]] -; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]] -; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]] -; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]] -; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] -; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]] -; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]] -; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]] -; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]] -; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP16]] -; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[B]], i64 
[[TMP17]] -; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP18]] -; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP19]] -; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP20]] -; CHECK-INTERLEAVED-NEXT: [[TMP58:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP21]] -; CHECK-INTERLEAVED-NEXT: [[TMP59:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP22]] -; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP23]] -; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP24]] -; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP25]] -; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP26]] -; CHECK-INTERLEAVED-NEXT: [[TMP64:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP27]] -; CHECK-INTERLEAVED-NEXT: [[TMP65:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP28]] -; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP29]] -; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP30]] -; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP31]] -; CHECK-INTERLEAVED-NEXT: [[TMP69:%.*]] = load i16, ptr [[TMP37]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP70:%.*]] = load i16, ptr [[TMP38]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP71:%.*]] = load i16, ptr [[TMP39]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = load i16, ptr [[TMP40]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = load i16, ptr [[TMP41]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP74:%.*]] = load i16, ptr [[TMP42]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP75:%.*]] = load i16, ptr [[TMP43]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP76:%.*]] = load i16, ptr [[TMP44]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP77:%.*]] = load i16, ptr [[TMP45]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP78:%.*]] = load i16, ptr [[TMP46]], align 2 -; 
CHECK-INTERLEAVED-NEXT: [[TMP79:%.*]] = load i16, ptr [[TMP47]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP80:%.*]] = load i16, ptr [[TMP48]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP81:%.*]] = load i16, ptr [[TMP49]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP82:%.*]] = load i16, ptr [[TMP50]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP83:%.*]] = load i16, ptr [[TMP51]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP84:%.*]] = load i16, ptr [[TMP52]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP85:%.*]] = insertelement <16 x i16> poison, i16 [[TMP69]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[TMP86:%.*]] = insertelement <16 x i16> [[TMP85]], i16 [[TMP70]], i32 1 -; CHECK-INTERLEAVED-NEXT: [[TMP87:%.*]] = insertelement <16 x i16> [[TMP86]], i16 [[TMP71]], i32 2 -; CHECK-INTERLEAVED-NEXT: [[TMP88:%.*]] = insertelement <16 x i16> [[TMP87]], i16 [[TMP72]], i32 3 -; CHECK-INTERLEAVED-NEXT: [[TMP89:%.*]] = insertelement <16 x i16> [[TMP88]], i16 [[TMP73]], i32 4 -; CHECK-INTERLEAVED-NEXT: [[TMP90:%.*]] = insertelement <16 x i16> [[TMP89]], i16 [[TMP74]], i32 5 -; CHECK-INTERLEAVED-NEXT: [[TMP91:%.*]] = insertelement <16 x i16> [[TMP90]], i16 [[TMP75]], i32 6 -; CHECK-INTERLEAVED-NEXT: [[TMP92:%.*]] = insertelement <16 x i16> [[TMP91]], i16 [[TMP76]], i32 7 -; CHECK-INTERLEAVED-NEXT: [[TMP93:%.*]] = insertelement <16 x i16> [[TMP92]], i16 [[TMP77]], i32 8 -; CHECK-INTERLEAVED-NEXT: [[TMP94:%.*]] = insertelement <16 x i16> [[TMP93]], i16 [[TMP78]], i32 9 -; CHECK-INTERLEAVED-NEXT: [[TMP95:%.*]] = insertelement <16 x i16> [[TMP94]], i16 [[TMP79]], i32 10 -; CHECK-INTERLEAVED-NEXT: [[TMP96:%.*]] = insertelement <16 x i16> [[TMP95]], i16 [[TMP80]], i32 11 -; CHECK-INTERLEAVED-NEXT: [[TMP97:%.*]] = insertelement <16 x i16> [[TMP96]], i16 [[TMP81]], i32 12 -; CHECK-INTERLEAVED-NEXT: [[TMP98:%.*]] = insertelement <16 x i16> [[TMP97]], i16 [[TMP82]], i32 13 -; CHECK-INTERLEAVED-NEXT: [[TMP99:%.*]] = insertelement <16 x i16> [[TMP98]], i16 [[TMP83]], i32 14 -; CHECK-INTERLEAVED-NEXT: [[TMP100:%.*]] = 
insertelement <16 x i16> [[TMP99]], i16 [[TMP84]], i32 15 -; CHECK-INTERLEAVED-NEXT: [[TMP101:%.*]] = load i16, ptr [[TMP53]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP102:%.*]] = load i16, ptr [[TMP54]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP103:%.*]] = load i16, ptr [[TMP55]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP104:%.*]] = load i16, ptr [[TMP56]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP105:%.*]] = load i16, ptr [[TMP57]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP106:%.*]] = load i16, ptr [[TMP58]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP107:%.*]] = load i16, ptr [[TMP59]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP108:%.*]] = load i16, ptr [[TMP60]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP109:%.*]] = load i16, ptr [[TMP61]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP110:%.*]] = load i16, ptr [[TMP62]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP111:%.*]] = load i16, ptr [[TMP63]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP112:%.*]] = load i16, ptr [[TMP64]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP113:%.*]] = load i16, ptr [[TMP65]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP114:%.*]] = load i16, ptr [[TMP66]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP115:%.*]] = load i16, ptr [[TMP67]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP116:%.*]] = load i16, ptr [[TMP68]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP117:%.*]] = insertelement <16 x i16> poison, i16 [[TMP101]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[TMP118:%.*]] = insertelement <16 x i16> [[TMP117]], i16 [[TMP102]], i32 1 -; CHECK-INTERLEAVED-NEXT: [[TMP119:%.*]] = insertelement <16 x i16> [[TMP118]], i16 [[TMP103]], i32 2 -; CHECK-INTERLEAVED-NEXT: [[TMP120:%.*]] = insertelement <16 x i16> [[TMP119]], i16 [[TMP104]], i32 3 -; CHECK-INTERLEAVED-NEXT: [[TMP121:%.*]] = insertelement <16 x i16> [[TMP120]], i16 [[TMP105]], i32 4 -; CHECK-INTERLEAVED-NEXT: [[TMP122:%.*]] = insertelement <16 x i16> [[TMP121]], i16 [[TMP106]], i32 5 -; CHECK-INTERLEAVED-NEXT: [[TMP123:%.*]] = insertelement <16 x i16> [[TMP122]], i16 
[[TMP107]], i32 6 -; CHECK-INTERLEAVED-NEXT: [[TMP124:%.*]] = insertelement <16 x i16> [[TMP123]], i16 [[TMP108]], i32 7 -; CHECK-INTERLEAVED-NEXT: [[TMP125:%.*]] = insertelement <16 x i16> [[TMP124]], i16 [[TMP109]], i32 8 -; CHECK-INTERLEAVED-NEXT: [[TMP126:%.*]] = insertelement <16 x i16> [[TMP125]], i16 [[TMP110]], i32 9 -; CHECK-INTERLEAVED-NEXT: [[TMP127:%.*]] = insertelement <16 x i16> [[TMP126]], i16 [[TMP111]], i32 10 -; CHECK-INTERLEAVED-NEXT: [[TMP128:%.*]] = insertelement <16 x i16> [[TMP127]], i16 [[TMP112]], i32 11 -; CHECK-INTERLEAVED-NEXT: [[TMP129:%.*]] = insertelement <16 x i16> [[TMP128]], i16 [[TMP113]], i32 12 -; CHECK-INTERLEAVED-NEXT: [[TMP130:%.*]] = insertelement <16 x i16> [[TMP129]], i16 [[TMP114]], i32 13 -; CHECK-INTERLEAVED-NEXT: [[TMP131:%.*]] = insertelement <16 x i16> [[TMP130]], i16 [[TMP115]], i32 14 -; CHECK-INTERLEAVED-NEXT: [[TMP132:%.*]] = insertelement <16 x i16> [[TMP131]], i16 [[TMP116]], i32 15 -; CHECK-INTERLEAVED-NEXT: [[TMP133:%.*]] = zext <16 x i16> [[TMP100]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP134:%.*]] = zext <16 x i16> [[TMP132]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP135:%.*]] = mul <16 x i32> [[TMP133]], [[TMP35]] -; CHECK-INTERLEAVED-NEXT: [[TMP136:%.*]] = mul <16 x i32> [[TMP134]], [[TMP36]] -; CHECK-INTERLEAVED-NEXT: [[TMP137]] = add <16 x i32> [[TMP135]], [[VEC_PHI]] -; CHECK-INTERLEAVED-NEXT: [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; CHECK-INTERLEAVED-NEXT: [[TMP139:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP139]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; -; CHECK-MAXBW-LABEL: define i32 @not_dotp_different_types( -; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-MAXBW-NEXT: entry: -; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: br 
label [[VECTOR_BODY:%.*]] -; CHECK-MAXBW: vector.body: -; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP69:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 -; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 -; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 -; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 -; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 -; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 -; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 -; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 -; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 -; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 -; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 -; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] -; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 -; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] -; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] -; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] -; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]] -; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]] -; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]] 
-; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]] -; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]] -; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]] -; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] -; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]] -; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]] -; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]] -; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]] -; CHECK-MAXBW-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]] -; CHECK-MAXBW-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]] -; CHECK-MAXBW-NEXT: [[TMP35:%.*]] = load i16, ptr [[TMP19]], align 2 -; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = load i16, ptr [[TMP20]], align 2 -; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = load i16, ptr [[TMP21]], align 2 -; CHECK-MAXBW-NEXT: [[TMP38:%.*]] = load i16, ptr [[TMP22]], align 2 -; CHECK-MAXBW-NEXT: [[TMP39:%.*]] = load i16, ptr [[TMP23]], align 2 -; CHECK-MAXBW-NEXT: [[TMP40:%.*]] = load i16, ptr [[TMP24]], align 2 -; CHECK-MAXBW-NEXT: [[TMP41:%.*]] = load i16, ptr [[TMP25]], align 2 -; CHECK-MAXBW-NEXT: [[TMP42:%.*]] = load i16, ptr [[TMP26]], align 2 -; CHECK-MAXBW-NEXT: [[TMP43:%.*]] = load i16, ptr [[TMP27]], align 2 -; CHECK-MAXBW-NEXT: [[TMP44:%.*]] = load i16, ptr [[TMP28]], align 2 -; CHECK-MAXBW-NEXT: [[TMP45:%.*]] = load i16, ptr [[TMP29]], align 2 -; CHECK-MAXBW-NEXT: [[TMP46:%.*]] = load i16, ptr [[TMP30]], align 2 -; CHECK-MAXBW-NEXT: [[TMP47:%.*]] = load i16, ptr [[TMP31]], align 2 -; CHECK-MAXBW-NEXT: [[TMP48:%.*]] = load i16, ptr [[TMP32]], align 2 -; CHECK-MAXBW-NEXT: [[TMP49:%.*]] = load i16, ptr [[TMP33]], align 2 -; CHECK-MAXBW-NEXT: [[TMP50:%.*]] = load i16, ptr [[TMP34]], align 2 -; CHECK-MAXBW-NEXT: [[TMP51:%.*]] = insertelement <16 x i16> poison, i16 
[[TMP35]], i32 0 -; CHECK-MAXBW-NEXT: [[TMP52:%.*]] = insertelement <16 x i16> [[TMP51]], i16 [[TMP36]], i32 1 -; CHECK-MAXBW-NEXT: [[TMP53:%.*]] = insertelement <16 x i16> [[TMP52]], i16 [[TMP37]], i32 2 -; CHECK-MAXBW-NEXT: [[TMP54:%.*]] = insertelement <16 x i16> [[TMP53]], i16 [[TMP38]], i32 3 -; CHECK-MAXBW-NEXT: [[TMP55:%.*]] = insertelement <16 x i16> [[TMP54]], i16 [[TMP39]], i32 4 -; CHECK-MAXBW-NEXT: [[TMP56:%.*]] = insertelement <16 x i16> [[TMP55]], i16 [[TMP40]], i32 5 -; CHECK-MAXBW-NEXT: [[TMP57:%.*]] = insertelement <16 x i16> [[TMP56]], i16 [[TMP41]], i32 6 -; CHECK-MAXBW-NEXT: [[TMP58:%.*]] = insertelement <16 x i16> [[TMP57]], i16 [[TMP42]], i32 7 -; CHECK-MAXBW-NEXT: [[TMP59:%.*]] = insertelement <16 x i16> [[TMP58]], i16 [[TMP43]], i32 8 -; CHECK-MAXBW-NEXT: [[TMP60:%.*]] = insertelement <16 x i16> [[TMP59]], i16 [[TMP44]], i32 9 -; CHECK-MAXBW-NEXT: [[TMP61:%.*]] = insertelement <16 x i16> [[TMP60]], i16 [[TMP45]], i32 10 -; CHECK-MAXBW-NEXT: [[TMP62:%.*]] = insertelement <16 x i16> [[TMP61]], i16 [[TMP46]], i32 11 -; CHECK-MAXBW-NEXT: [[TMP63:%.*]] = insertelement <16 x i16> [[TMP62]], i16 [[TMP47]], i32 12 -; CHECK-MAXBW-NEXT: [[TMP64:%.*]] = insertelement <16 x i16> [[TMP63]], i16 [[TMP48]], i32 13 -; CHECK-MAXBW-NEXT: [[TMP65:%.*]] = insertelement <16 x i16> [[TMP64]], i16 [[TMP49]], i32 14 -; CHECK-MAXBW-NEXT: [[TMP66:%.*]] = insertelement <16 x i16> [[TMP65]], i16 [[TMP50]], i32 15 -; CHECK-MAXBW-NEXT: [[TMP67:%.*]] = zext <16 x i16> [[TMP66]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP68:%.*]] = mul <16 x i32> [[TMP67]], [[TMP18]] -; CHECK-MAXBW-NEXT: [[TMP69]] = add <16 x i32> [[TMP68]], [[VEC_PHI]] -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-MAXBW-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-MAXBW-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; -entry: - br label %for.body - -for.body: ; preds = %for.body, %entry - %iv = phi i64 [ 0, 
%entry ], [ %iv.next, %for.body ] - %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] - %gep.a = getelementptr i8, ptr %a, i64 %iv - %load.a = load i8, ptr %gep.a, align 1 - %ext.a = zext i8 %load.a to i32 - %gep.b = getelementptr i8, ptr %b, i64 %iv - %load.b = load i16, ptr %gep.b, align 2 - %ext.b = zext i16 %load.b to i32 - %mul = mul i32 %ext.b, %ext.a - %add = add i32 %mul, %accum - %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 0 - br i1 %exitcond.not, label %for.exit, label %for.body - -for.exit: ; preds = %for.body - ret i32 %add -} - -define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) { -; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_not_loop_carried( -; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-INTERLEAVE1-NEXT: entry: -; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-INTERLEAVE1: vector.body: -; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ , [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] -; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] -; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; 
CHECK-INTERLEAVE1-NEXT: [[TMP7]] = mul <16 x i32> [[TMP6]], [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP7]], [[TMP8]] -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] -; -; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_loop_carried( -; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-INTERLEAVED-NEXT: entry: -; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-INTERLEAVED: vector.body: -; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ , [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP7]] = mul <16 x i32> [[TMP6]], [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = 
shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP7]], [[TMP8]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] -; -; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_loop_carried( -; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-MAXBW-NEXT: entry: -; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-MAXBW: vector.body: -; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ , [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] -; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 -; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP7]] = mul <16 x i32> [[TMP6]], [[TMP3]] -; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP7]], [[TMP8]] -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-MAXBW-NEXT: 
[[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-MAXBW-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] -; -entry: - br label %for.body - -for.body: ; preds = %for.body, %entry - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %accum = phi i32 [ 0, %entry ], [ %mul, %for.body ] - %gep.a = getelementptr i8, ptr %a, i64 %iv - %load.a = load i8, ptr %gep.a, align 1 - %ext.a = zext i8 %load.a to i32 - %gep.b = getelementptr i8, ptr %b, i64 %iv - %load.b = load i8, ptr %gep.b, align 1 - %ext.b = zext i8 %load.b to i32 - %mul = mul i32 %ext.b, %ext.a - %add = add i32 %mul, %accum - %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 0 - br i1 %exitcond.not, label %for.exit, label %for.body - -for.exit: ; preds = %for.body - ret i32 %add -} - -define i32 @not_dotp_not_phi(ptr %a, ptr %b) { -; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_not_phi( -; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-INTERLEAVE1-NEXT: entry: -; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-INTERLEAVE1: vector.body: -; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ , [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] -; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] -; CHECK-INTERLEAVE1-NEXT: 
[[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[TMP6]] -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] -; -; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_phi( -; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-INTERLEAVED-NEXT: entry: -; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-INTERLEAVED: vector.body: -; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ , [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; 
CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[TMP6]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] -; -; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_phi( -; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-MAXBW-NEXT: entry: -; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-MAXBW: vector.body: -; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ , [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] -; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 -; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] -; CHECK-MAXBW-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[TMP6]] -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-MAXBW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], 
!llvm.loop [[LOOP8:![0-9]+]] -; -entry: - br label %for.body - -for.body: ; preds = %for.body, %entry - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] - %gep.a = getelementptr i8, ptr %a, i64 %iv - %load.a = load i8, ptr %gep.a, align 1 - %ext.a = zext i8 %load.a to i32 - %gep.b = getelementptr i8, ptr %b, i64 %iv - %load.b = load i8, ptr %gep.b, align 1 - %ext.b = zext i8 %load.b to i32 - %mul = mul i32 %ext.b, %ext.a - %add = add i32 %mul, %ext.b - %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 0 - br i1 %exitcond.not, label %for.exit, label %for.body - -for.exit: ; preds = %for.body - ret i32 %add -} - -define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { -; CHECK-INTERLEAVE1-LABEL: define i32 @dotp_unrolled( -; CHECK-INTERLEAVE1-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-INTERLEAVE1-NEXT: entry: -; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16 -; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16 -; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]] -; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-INTERLEAVE1: vector.body: -; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI3:%.*]] 
= phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] -; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] -; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] -; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP0]], 3 -; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] -; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] -; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]] -; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 -; 
CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]] -; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]] -; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP29]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32> 
-; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]] -; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]]) -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] -; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE13]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE7]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) -; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] -; -; CHECK-INTERLEAVED-LABEL: define i32 @dotp_unrolled( -; CHECK-INTERLEAVED-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-INTERLEAVED-NEXT: entry: -; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16 -; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16 -; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]] -; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-INTERLEAVED: vector.body: -; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 
x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1 -; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 2 -; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] -; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP0]], 3 -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1 -; CHECK-INTERLEAVED-NEXT: 
[[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]]) -; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]]) -; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]]) -; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 -; CHECK-INTERLEAVED-NEXT: 
[[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP29]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]]) -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] -; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE13]]) -; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]]) -; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE7]]) -; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) -; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] -; -; CHECK-MAXBW-LABEL: define i32 @dotp_unrolled( -; CHECK-MAXBW-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-MAXBW-NEXT: entry: -; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16 -; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: 
[[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16 -; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]] -; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-MAXBW: vector.body: -; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1 -; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]] -; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] -; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 2 -; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] -; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP0]], 3 -; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] -; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] -; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1 -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <16 x 
i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1 -; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]]) -; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 -; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1 -; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]]) -; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1 -; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1 -; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]]) -; CHECK-MAXBW-NEXT: 
[[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1 -; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP29]], align 1 -; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]]) -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] -; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE13]]) -; CHECK-MAXBW-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]]) -; CHECK-MAXBW-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE7]]) -; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) -; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] -; -entry: - br label %for.body - -for.body: ; preds = %entry, %for.body - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %accum3 = phi i32 [ 0, %entry ], [ %add.a3, %for.body ] - %accum2 = phi i32 [ 0, %entry ], [ %add.a2, %for.body ] - %accum1 = phi i32 [ 0, %entry ], [ %add.a1, %for.body ] - %accum0 = phi i32 [ 0, %entry ], [ %add.a0, %for.body ] - %gep.a0 = getelementptr inbounds i8, ptr %a, i64 %iv - 
%gep.b0 = getelementptr inbounds i8, ptr %b, i64 %iv - %offset.1 = or disjoint i64 %iv, 1 - %gep.a1 = getelementptr inbounds i8, ptr %a, i64 %offset.1 - %gep.b1 = getelementptr inbounds i8, ptr %b, i64 %offset.1 - %offset.2 = or disjoint i64 %iv, 2 - %gep.a2 = getelementptr inbounds i8, ptr %a, i64 %offset.2 - %gep.b2 = getelementptr inbounds i8, ptr %b, i64 %offset.2 - %offset.3 = or disjoint i64 %iv, 3 - %gep.a3 = getelementptr inbounds i8, ptr %a, i64 %offset.3 - %gep.b3 = getelementptr inbounds i8, ptr %b, i64 %offset.3 - %load.a0 = load i8, ptr %gep.a0, align 1 - %ext.a0 = sext i8 %load.a0 to i32 - %load.b0 = load i8, ptr %gep.b0, align 1 - %ext.b0 = sext i8 %load.b0 to i32 - %mul.a0 = mul nsw i32 %ext.b0, %ext.a0 - %add.a0 = add nsw i32 %mul.a0, %accum0 - %load.a1 = load i8, ptr %gep.a1, align 1 - %ext.a1 = sext i8 %load.a1 to i32 - %load.b1 = load i8, ptr %gep.b1, align 1 - %ext.b1 = sext i8 %load.b1 to i32 - %mul.a1 = mul nsw i32 %ext.a1, %ext.b1 - %add.a1 = add nsw i32 %mul.a1, %accum1 - %load.a2 = load i8, ptr %gep.a2, align 1 - %ext.a2 = sext i8 %load.a2 to i32 - %load.b2 = load i8, ptr %gep.b2, align 1 - %ext.b2 = sext i8 %load.b2 to i32 - %mul.a2 = mul nsw i32 %ext.a2, %ext.b2 - %add.a2 = add nsw i32 %mul.a2, %accum2 - %load.a3 = load i8, ptr %gep.a3, align 1 - %ext.a3 = sext i8 %load.a3 to i32 - %load.b3 = load i8, ptr %gep.b3, align 1 - %ext.b3 = sext i8 %load.b3 to i32 - %mul.a3 = mul nsw i32 %ext.a3, %ext.b3 - %add.a3 = add nsw i32 %mul.a3, %accum3 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %num_in - br i1 %exitcond.not, label %exit, label %for.body - -exit: ; preds = %for.body - %result0 = add nsw i32 %add.a0, %add.a1 - %result1 = add nsw i32 %add.a2, %add.a3 - %result = add nsw i32 %result0, %result1 - ret i32 %result -} - -define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) { -; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_predicated( -; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) 
#[[ATTR0]] { -; CHECK-INTERLEAVE1-NEXT: entry: -; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16 -; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 -; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-INTERLEAVE1: vector.body: -; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] -; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] -; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul nsw <16 x i32> [[TMP6]], [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]]) -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], 
!llvm.loop [[LOOP12:![0-9]+]] -; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) -; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] -; -; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated( -; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-INTERLEAVED-NEXT: entry: -; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 32 -; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 32 -; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-INTERLEAVED: vector.body: -; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <16 x i8> 
[[WIDE_LOAD2]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] -; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 16 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP9]], [[TMP4]] -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP10]], [[TMP5]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]]) -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] -; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) -; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] -; -; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated( -; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-MAXBW-NEXT: entry: -; 
CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16 -; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 -; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-MAXBW: vector.body: -; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] -; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 -; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul nsw <16 x i32> [[TMP6]], [[TMP3]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]]) -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] -; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) -; CHECK-MAXBW-NEXT: 
[[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] -; -entry: - br label %for.body - -for.body: ; preds = %entry, %for.body - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] - %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv - %load.a = load i8, ptr %gep.a, align 1 - %ext.a = sext i8 %load.a to i32 - %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv - %load.b = load i8, ptr %gep.b, align 1 - %ext.b = sext i8 %load.b to i32 - %mul = mul nsw i32 %ext.b, %ext.a - %add = add nsw i32 %mul, %accum - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %for.body - -exit: ; preds = %for.body - ret i32 %add -} - -define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) { -; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_predicated_pragma( -; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-INTERLEAVE1-NEXT: entry: -; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 15 -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16 -; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-INTERLEAVE1-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 -; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer -; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-INTERLEAVE1: vector.body: -; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ] -; CHECK-INTERLEAVE1-NEXT: 
[[VEC_IND:%.*]] = phi <16 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP180:%.*]], [[PRED_LOAD_CONTINUE62]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 -; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 -; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 -; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 -; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 -; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 -; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 -; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 -; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 -; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 -; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 -; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 -; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 -; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] -; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] -; -; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated_pragma( -; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-INTERLEAVED-NEXT: entry: -; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 15 -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16 
-; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-INTERLEAVED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 -; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer -; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-INTERLEAVED: vector.body: -; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP180:%.*]], [[PRED_LOAD_CONTINUE62]] ] -; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 -; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 -; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 -; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 -; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = icmp ule <16 x 
i64> [[VEC_IND]], [[BROADCAST_SPLAT]] -; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] -; -; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated_pragma( -; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-MAXBW-NEXT: entry: -; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 15 -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16 -; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-MAXBW-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 -; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer -; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-MAXBW: vector.body: -; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ] -; CHECK-MAXBW-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP180:%.*]], [[PRED_LOAD_CONTINUE62]] ] -; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 -; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 -; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 -; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 -; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 -; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = add i64 
[[INDEX]], 8 -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 -; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 -; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 -; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 -; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 -; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 -; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] -; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 -; CHECK-MAXBW-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] -; -entry: - br label %for.body - -for.body: ; preds = %entry, %for.body - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] - %gep.a = getelementptr inbounds i8, ptr %b, i64 %iv - %load.a = load i8, ptr %gep.a, align 1 - %ext.a = sext i8 %load.a to i32 - %gep.a2 = getelementptr inbounds i8, ptr %a, i64 %iv - %load.b = load i8, ptr %gep.a2, align 1 - %ext.b = sext i8 %load.b to i32 - %mul = mul nsw i32 %ext.b, %ext.a - %add = add nsw i32 %mul, %accum - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !7 - -exit: ; preds = %for.body - ret i32 %add -} - -define i32 @not_dotp_extend_user(ptr %a, ptr %b) { -; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_extend_user( -; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-INTERLEAVE1-NEXT: entry: -; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-INTERLEAVE1: vector.body: -; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x 
i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] -; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] -; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[VEC_PHI]] -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] -; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15 -; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] -; CHECK-INTERLEAVE1: scalar.ph: -; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] -; CHECK-INTERLEAVE1: for.body: -; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; 
CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] -; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 -; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] -; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 -; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] -; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] -; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 -; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] -; CHECK-INTERLEAVE1: for.exit: -; CHECK-INTERLEAVE1-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] -; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] -; CHECK-INTERLEAVE1-NEXT: [[RESULT:%.*]] = add i32 [[ADD_LCSSA]], [[EXT_B_LCSSA]] -; CHECK-INTERLEAVE1-NEXT: ret i32 [[RESULT]] -; -; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_extend_user( -; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-INTERLEAVED-NEXT: entry: -; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-INTERLEAVED: vector.body: -; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] -; 
CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] -; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]] -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]] -; CHECK-INTERLEAVED-NEXT: [[TMP13]] = add <16 x i32> [[TMP11]], [[VEC_PHI]] -; CHECK-INTERLEAVED-NEXT: [[TMP14]] = add <16 x i32> [[TMP12]], [[VEC_PHI1]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] -; CHECK-INTERLEAVED: middle.block: -; 
CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP14]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) -; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = extractelement <16 x i32> [[TMP10]], i32 15 -; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] -; CHECK-INTERLEAVED: scalar.ph: -; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] -; CHECK-INTERLEAVED: for.body: -; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] -; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 -; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 -; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] -; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 -; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 -; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] -; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] -; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 -; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] -; CHECK-INTERLEAVED: for.exit: -; CHECK-INTERLEAVED-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] -; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] 
], [ [[TMP16]], [[MIDDLE_BLOCK]] ] -; CHECK-INTERLEAVED-NEXT: [[RESULT:%.*]] = add i32 [[ADD_LCSSA]], [[EXT_B_LCSSA]] -; CHECK-INTERLEAVED-NEXT: ret i32 [[RESULT]] -; -; CHECK-MAXBW-LABEL: define i32 @not_dotp_extend_user( -; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-MAXBW-NEXT: entry: -; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-MAXBW: vector.body: -; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] -; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 -; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] -; CHECK-MAXBW-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[VEC_PHI]] -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-MAXBW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] -; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]]) -; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> 
[[TMP6]], i32 15 -; CHECK-MAXBW-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] -; CHECK-MAXBW: scalar.ph: -; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] -; CHECK-MAXBW: for.body: -; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] -; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 -; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 -; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] -; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 -; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 -; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] -; CHECK-MAXBW-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] -; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 -; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] -; CHECK-MAXBW: for.exit: -; CHECK-MAXBW-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] -; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] -; CHECK-MAXBW-NEXT: [[RESULT:%.*]] = add i32 [[ADD_LCSSA]], [[EXT_B_LCSSA]] -; CHECK-MAXBW-NEXT: ret i32 [[RESULT]] -; -entry: - br label %for.body - -for.body: ; preds = %for.body, %entry - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] - %gep.a = getelementptr i8, ptr %a, i64 %iv - 
%load.a = load i8, ptr %gep.a, align 1 - %ext.a = zext i8 %load.a to i32 - %gep.b = getelementptr i8, ptr %b, i64 %iv - %load.b = load i8, ptr %gep.b, align 1 - %ext.b = zext i8 %load.b to i32 - %mul = mul i32 %ext.b, %ext.a - %add = add i32 %mul, %accum - %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 0 - br i1 %exitcond.not, label %for.exit, label %for.body - -for.exit: ; preds = %for.body - %result = add i32 %add, %ext.b - ret i32 %result -} - -!7 = distinct !{!7, !8, !9, !10} -!8 = !{!"llvm.loop.mustprogress"} -!9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} -!10 = !{!"llvm.loop.vectorize.enable", i1 true} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll deleted file mode 100644 index 9530947232192..0000000000000 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll +++ /dev/null @@ -1,1733 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVE1 -; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVED -; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -vectorizer-maximize-bandwidth -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-MAXBW - -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -target triple = "aarch64-none-unknown-elf" - -define i32 @dotp(ptr %a, ptr %b) #0 { -; CHECK-INTERLEAVE1-LABEL: define i32 @dotp( -; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-INTERLEAVE1-NEXT: entry: -; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; 
CHECK-INTERLEAVE1-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]] -; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] -; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY1:%.*]] -; CHECK-INTERLEAVE1: vector.body: -; CHECK-INTERLEAVE1-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY1]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP14:%.*]], [[VECTOR_BODY1]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP12]] -; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP17]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD]] to -; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]] -; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP21]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = zext [[WIDE_LOAD1]] to -; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul [[TMP18]], [[TMP9]] -; CHECK-INTERLEAVE1-NEXT: [[TMP14]] = add [[TMP13]], [[VEC_PHI]] -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], [[TMP5]] -; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP15]], label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]] -; CHECK-INTERLEAVE1: 
middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP14]]) -; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_PH]] -; CHECK-INTERLEAVE1: scalar.ph: -; CHECK-INTERLEAVE1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH:%.*]] ] -; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP27]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ] -; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-INTERLEAVE1: for.body: -; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[ADD:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 -; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 -; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] -; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] -; CHECK-INTERLEAVE1: for.exit: -; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[VECTOR_BODY]] ], [ [[TMP27]], [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]] -; -; 
CHECK-INTERLEAVED-LABEL: define i32 @dotp( -; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-INTERLEAVED-NEXT: entry: -; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 -; CHECK-INTERLEAVED-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]] -; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] -; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 -; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY1:%.*]] -; CHECK-INTERLEAVED: vector.body: -; CHECK-INTERLEAVED-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY1]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP23:%.*]], [[VECTOR_BODY1]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP24:%.*]], [[VECTOR_BODY1]] ] -; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP17]] -; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP14]], 4 -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP20]], i64 [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP21]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP11]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext [[WIDE_LOAD]] to -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext 
[[WIDE_LOAD2]] to -; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP17]] -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP28]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 4 -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP28]], i64 [[TMP27]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP9]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP18]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = zext [[WIDE_LOAD3]] to -; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = zext [[WIDE_LOAD4]] to -; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul [[TMP19]], [[TMP12]] -; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul [[TMP29]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add [[TMP30]], [[VEC_PHI]] -; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add [[TMP22]], [[VEC_PHI1]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], [[TMP5]] -; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP25]], label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]] -; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add [[TMP24]], [[TMP23]] -; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[BIN_RDX]]) -; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_PH]] -; CHECK-INTERLEAVED: scalar.ph: -; CHECK-INTERLEAVED-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH:%.*]] ] -; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ] -; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-INTERLEAVED: for.body: 
-; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[ADD:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 -; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 -; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 -; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 -; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] -; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] -; CHECK-INTERLEAVED: for.exit: -; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[VECTOR_BODY]] ], [ [[TMP16]], [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-INTERLEAVED-NEXT: ret i32 [[ADD_LCSSA]] -; -; CHECK-MAXBW-LABEL: define i32 @dotp( -; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-MAXBW-NEXT: entry: -; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 -; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] -; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] -; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 
@llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 -; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-MAXBW: vector.body: -; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]] -; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP8]], align 1 -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext [[WIDE_LOAD2]] to -; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]] -; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP14]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP15]], align 1 -; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = zext [[WIDE_LOAD4]] to -; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = mul [[TMP20]], [[TMP13]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE5]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI1]], [[TMP22]]) -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; -entry: - br label %for.body - -for.body: ; preds = %for.body, %entry - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] - %gep.a = getelementptr i8, ptr %a, i64 %iv - %load.a = load i8, ptr %gep.a, align 1 - %ext.a = zext i8 %load.a to i32 - %gep.b = getelementptr i8, ptr %b, i64 %iv - %load.b = load i8, ptr %gep.b, align 1 - %ext.b = zext i8 %load.b to i32 - %mul = mul i32 %ext.b, %ext.a - %add = add i32 %mul, %accum - %iv.next = add i64 %iv, 1 - 
%exitcond.not = icmp eq i64 %iv.next, 0 - br i1 %exitcond.not, label %for.exit, label %for.body - -for.exit: ; preds = %for.body - ret i32 %add -} - -define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 { -; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_different_types( -; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-INTERLEAVE1-NEXT: entry: -; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-INTERLEAVE1: vector.body: -; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP69:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 -; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 -; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 -; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 -; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 -; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 -; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 -; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 -; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 -; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 -; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 -; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 -; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 -; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] -; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr i8, 
ptr [[TMP16]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] -; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] -; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] -; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]] -; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]] -; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]] -; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]] -; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]] -; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] -; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]] -; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]] -; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]] -; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]] -; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]] -; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = load i16, ptr [[TMP19]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = load i16, ptr [[TMP20]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = load i16, ptr [[TMP21]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = load i16, ptr [[TMP22]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP41:%.*]] = load i16, ptr [[TMP23]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = load i16, ptr 
[[TMP24]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP43:%.*]] = load i16, ptr [[TMP25]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP44:%.*]] = load i16, ptr [[TMP26]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = load i16, ptr [[TMP27]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP46:%.*]] = load i16, ptr [[TMP28]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP55:%.*]] = load i16, ptr [[TMP29]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP56:%.*]] = load i16, ptr [[TMP30]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP47:%.*]] = load i16, ptr [[TMP31]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP48:%.*]] = load i16, ptr [[TMP32]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP49:%.*]] = load i16, ptr [[TMP33]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP50:%.*]] = load i16, ptr [[TMP34]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP51:%.*]] = insertelement <16 x i16> poison, i16 [[TMP35]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[TMP52:%.*]] = insertelement <16 x i16> [[TMP51]], i16 [[TMP36]], i32 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP53:%.*]] = insertelement <16 x i16> [[TMP52]], i16 [[TMP37]], i32 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP54:%.*]] = insertelement <16 x i16> [[TMP53]], i16 [[TMP38]], i32 3 -; CHECK-INTERLEAVE1-NEXT: [[TMP57:%.*]] = insertelement <16 x i16> [[TMP54]], i16 [[TMP41]], i32 4 -; CHECK-INTERLEAVE1-NEXT: [[TMP58:%.*]] = insertelement <16 x i16> [[TMP57]], i16 [[TMP42]], i32 5 -; CHECK-INTERLEAVE1-NEXT: [[TMP59:%.*]] = insertelement <16 x i16> [[TMP58]], i16 [[TMP43]], i32 6 -; CHECK-INTERLEAVE1-NEXT: [[TMP60:%.*]] = insertelement <16 x i16> [[TMP59]], i16 [[TMP44]], i32 7 -; CHECK-INTERLEAVE1-NEXT: [[TMP61:%.*]] = insertelement <16 x i16> [[TMP60]], i16 [[TMP45]], i32 8 -; CHECK-INTERLEAVE1-NEXT: [[TMP96:%.*]] = insertelement <16 x i16> [[TMP61]], i16 [[TMP46]], i32 9 -; CHECK-INTERLEAVE1-NEXT: [[TMP97:%.*]] = insertelement <16 x i16> [[TMP96]], i16 [[TMP55]], i32 10 -; CHECK-INTERLEAVE1-NEXT: [[TMP62:%.*]] = insertelement <16 x i16> [[TMP97]], i16 [[TMP56]], i32 11 -; 
CHECK-INTERLEAVE1-NEXT: [[TMP63:%.*]] = insertelement <16 x i16> [[TMP62]], i16 [[TMP47]], i32 12 -; CHECK-INTERLEAVE1-NEXT: [[TMP64:%.*]] = insertelement <16 x i16> [[TMP63]], i16 [[TMP48]], i32 13 -; CHECK-INTERLEAVE1-NEXT: [[TMP65:%.*]] = insertelement <16 x i16> [[TMP64]], i16 [[TMP49]], i32 14 -; CHECK-INTERLEAVE1-NEXT: [[TMP66:%.*]] = insertelement <16 x i16> [[TMP65]], i16 [[TMP50]], i32 15 -; CHECK-INTERLEAVE1-NEXT: [[TMP67:%.*]] = zext <16 x i16> [[TMP66]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP68:%.*]] = mul <16 x i32> [[TMP67]], [[TMP18]] -; CHECK-INTERLEAVE1-NEXT: [[TMP69]] = add <16 x i32> [[TMP68]], [[VEC_PHI]] -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-INTERLEAVE1-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP70]], label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]]) -; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] -; CHECK-INTERLEAVE1: scalar.ph: -; CHECK-INTERLEAVE1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ] -; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP71]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] -; CHECK-INTERLEAVE1: for.body: -; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] -; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i8 
[[LOAD_A]] to i32 -; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] -; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i16, ptr [[GEP_B]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i16 [[LOAD_B]] to i32 -; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] -; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] -; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 -; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] -; CHECK-INTERLEAVE1: for.exit: -; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP71]], [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]] -; -; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_different_types( -; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-INTERLEAVED-NEXT: entry: -; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-INTERLEAVED: vector.body: -; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP137:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP138:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 -; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 -; 
CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 -; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 -; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 -; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 16 -; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 17 -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 18 -; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], 19 -; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 20 -; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 21 -; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 22 -; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 23 -; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = add i64 [[INDEX]], 24 -; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], 25 -; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = add i64 [[INDEX]], 26 -; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = add i64 [[INDEX]], 27 -; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = add i64 [[INDEX]], 28 -; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = add i64 [[INDEX]], 29 -; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = add i64 [[INDEX]], 30 -; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 31 -; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] -; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[TMP32]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[TMP32]], i32 16 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr 
[[TMP33]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP34]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] -; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] -; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] -; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]] -; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]] -; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]] -; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]] -; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]] -; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] -; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]] -; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]] -; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]] -; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]] -; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP16]] -; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP17]] -; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP18]] -; CHECK-INTERLEAVED-NEXT: [[TMP58:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP19]] -; 
CHECK-INTERLEAVED-NEXT: [[TMP59:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP20]] -; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP21]] -; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP22]] -; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP23]] -; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP24]] -; CHECK-INTERLEAVED-NEXT: [[TMP64:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP25]] -; CHECK-INTERLEAVED-NEXT: [[TMP65:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP26]] -; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP27]] -; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP28]] -; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP29]] -; CHECK-INTERLEAVED-NEXT: [[TMP139:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP30]] -; CHECK-INTERLEAVED-NEXT: [[TMP140:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP31]] -; CHECK-INTERLEAVED-NEXT: [[TMP69:%.*]] = load i16, ptr [[TMP39]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP70:%.*]] = load i16, ptr [[TMP40]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP71:%.*]] = load i16, ptr [[TMP41]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = load i16, ptr [[TMP42]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = load i16, ptr [[TMP43]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP74:%.*]] = load i16, ptr [[TMP44]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP75:%.*]] = load i16, ptr [[TMP45]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP76:%.*]] = load i16, ptr [[TMP46]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP77:%.*]] = load i16, ptr [[TMP47]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP78:%.*]] = load i16, ptr [[TMP48]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP79:%.*]] = load i16, ptr [[TMP49]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP80:%.*]] = load i16, ptr [[TMP50]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP81:%.*]] = load i16, ptr 
[[TMP51]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP82:%.*]] = load i16, ptr [[TMP52]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP83:%.*]] = load i16, ptr [[TMP53]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP84:%.*]] = load i16, ptr [[TMP54]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP85:%.*]] = insertelement <16 x i16> poison, i16 [[TMP69]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[TMP86:%.*]] = insertelement <16 x i16> [[TMP85]], i16 [[TMP70]], i32 1 -; CHECK-INTERLEAVED-NEXT: [[TMP87:%.*]] = insertelement <16 x i16> [[TMP86]], i16 [[TMP71]], i32 2 -; CHECK-INTERLEAVED-NEXT: [[TMP88:%.*]] = insertelement <16 x i16> [[TMP87]], i16 [[TMP72]], i32 3 -; CHECK-INTERLEAVED-NEXT: [[TMP89:%.*]] = insertelement <16 x i16> [[TMP88]], i16 [[TMP73]], i32 4 -; CHECK-INTERLEAVED-NEXT: [[TMP90:%.*]] = insertelement <16 x i16> [[TMP89]], i16 [[TMP74]], i32 5 -; CHECK-INTERLEAVED-NEXT: [[TMP91:%.*]] = insertelement <16 x i16> [[TMP90]], i16 [[TMP75]], i32 6 -; CHECK-INTERLEAVED-NEXT: [[TMP92:%.*]] = insertelement <16 x i16> [[TMP91]], i16 [[TMP76]], i32 7 -; CHECK-INTERLEAVED-NEXT: [[TMP93:%.*]] = insertelement <16 x i16> [[TMP92]], i16 [[TMP77]], i32 8 -; CHECK-INTERLEAVED-NEXT: [[TMP94:%.*]] = insertelement <16 x i16> [[TMP93]], i16 [[TMP78]], i32 9 -; CHECK-INTERLEAVED-NEXT: [[TMP95:%.*]] = insertelement <16 x i16> [[TMP94]], i16 [[TMP79]], i32 10 -; CHECK-INTERLEAVED-NEXT: [[TMP96:%.*]] = insertelement <16 x i16> [[TMP95]], i16 [[TMP80]], i32 11 -; CHECK-INTERLEAVED-NEXT: [[TMP97:%.*]] = insertelement <16 x i16> [[TMP96]], i16 [[TMP81]], i32 12 -; CHECK-INTERLEAVED-NEXT: [[TMP98:%.*]] = insertelement <16 x i16> [[TMP97]], i16 [[TMP82]], i32 13 -; CHECK-INTERLEAVED-NEXT: [[TMP99:%.*]] = insertelement <16 x i16> [[TMP98]], i16 [[TMP83]], i32 14 -; CHECK-INTERLEAVED-NEXT: [[TMP100:%.*]] = insertelement <16 x i16> [[TMP99]], i16 [[TMP84]], i32 15 -; CHECK-INTERLEAVED-NEXT: [[TMP101:%.*]] = load i16, ptr [[TMP55]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP102:%.*]] = load i16, ptr [[TMP56]], 
align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP103:%.*]] = load i16, ptr [[TMP57]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP104:%.*]] = load i16, ptr [[TMP58]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP105:%.*]] = load i16, ptr [[TMP59]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP106:%.*]] = load i16, ptr [[TMP60]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP107:%.*]] = load i16, ptr [[TMP61]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP108:%.*]] = load i16, ptr [[TMP62]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP109:%.*]] = load i16, ptr [[TMP63]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP110:%.*]] = load i16, ptr [[TMP64]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP111:%.*]] = load i16, ptr [[TMP65]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP112:%.*]] = load i16, ptr [[TMP66]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP113:%.*]] = load i16, ptr [[TMP67]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP114:%.*]] = load i16, ptr [[TMP68]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP115:%.*]] = load i16, ptr [[TMP139]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP116:%.*]] = load i16, ptr [[TMP140]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP117:%.*]] = insertelement <16 x i16> poison, i16 [[TMP101]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[TMP118:%.*]] = insertelement <16 x i16> [[TMP117]], i16 [[TMP102]], i32 1 -; CHECK-INTERLEAVED-NEXT: [[TMP119:%.*]] = insertelement <16 x i16> [[TMP118]], i16 [[TMP103]], i32 2 -; CHECK-INTERLEAVED-NEXT: [[TMP120:%.*]] = insertelement <16 x i16> [[TMP119]], i16 [[TMP104]], i32 3 -; CHECK-INTERLEAVED-NEXT: [[TMP121:%.*]] = insertelement <16 x i16> [[TMP120]], i16 [[TMP105]], i32 4 -; CHECK-INTERLEAVED-NEXT: [[TMP122:%.*]] = insertelement <16 x i16> [[TMP121]], i16 [[TMP106]], i32 5 -; CHECK-INTERLEAVED-NEXT: [[TMP123:%.*]] = insertelement <16 x i16> [[TMP122]], i16 [[TMP107]], i32 6 -; CHECK-INTERLEAVED-NEXT: [[TMP124:%.*]] = insertelement <16 x i16> [[TMP123]], i16 [[TMP108]], i32 7 -; CHECK-INTERLEAVED-NEXT: [[TMP125:%.*]] = insertelement <16 x i16> [[TMP124]], i16 
[[TMP109]], i32 8 -; CHECK-INTERLEAVED-NEXT: [[TMP126:%.*]] = insertelement <16 x i16> [[TMP125]], i16 [[TMP110]], i32 9 -; CHECK-INTERLEAVED-NEXT: [[TMP127:%.*]] = insertelement <16 x i16> [[TMP126]], i16 [[TMP111]], i32 10 -; CHECK-INTERLEAVED-NEXT: [[TMP128:%.*]] = insertelement <16 x i16> [[TMP127]], i16 [[TMP112]], i32 11 -; CHECK-INTERLEAVED-NEXT: [[TMP129:%.*]] = insertelement <16 x i16> [[TMP128]], i16 [[TMP113]], i32 12 -; CHECK-INTERLEAVED-NEXT: [[TMP130:%.*]] = insertelement <16 x i16> [[TMP129]], i16 [[TMP114]], i32 13 -; CHECK-INTERLEAVED-NEXT: [[TMP131:%.*]] = insertelement <16 x i16> [[TMP130]], i16 [[TMP115]], i32 14 -; CHECK-INTERLEAVED-NEXT: [[TMP132:%.*]] = insertelement <16 x i16> [[TMP131]], i16 [[TMP116]], i32 15 -; CHECK-INTERLEAVED-NEXT: [[TMP133:%.*]] = zext <16 x i16> [[TMP100]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP134:%.*]] = zext <16 x i16> [[TMP132]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP135:%.*]] = mul <16 x i32> [[TMP133]], [[TMP35]] -; CHECK-INTERLEAVED-NEXT: [[TMP136:%.*]] = mul <16 x i32> [[TMP134]], [[TMP36]] -; CHECK-INTERLEAVED-NEXT: [[TMP137]] = add <16 x i32> [[TMP135]], [[VEC_PHI]] -; CHECK-INTERLEAVED-NEXT: [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; CHECK-INTERLEAVED-NEXT: [[TMP141:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP141]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP138]], [[TMP137]] -; CHECK-INTERLEAVED-NEXT: [[TMP142:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) -; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] -; -; CHECK-MAXBW-LABEL: define i32 @not_dotp_different_types( -; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-MAXBW-NEXT: entry: -; CHECK-MAXBW-NEXT: br i1 
true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-MAXBW: vector.body: -; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP138:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 -; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 -; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 -; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 -; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 -; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 -; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 -; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 -; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 -; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 -; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 -; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] -; CHECK-MAXBW-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[TMP32]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP33]], align 1 -; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] -; CHECK-MAXBW-NEXT: [[TMP38:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] -; CHECK-MAXBW-NEXT: [[TMP39:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] -; CHECK-MAXBW-NEXT: [[TMP40:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]] -; CHECK-MAXBW-NEXT: [[TMP41:%.*]] = 
getelementptr i8, ptr [[B]], i64 [[TMP4]] -; CHECK-MAXBW-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]] -; CHECK-MAXBW-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]] -; CHECK-MAXBW-NEXT: [[TMP44:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]] -; CHECK-MAXBW-NEXT: [[TMP45:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]] -; CHECK-MAXBW-NEXT: [[TMP46:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] -; CHECK-MAXBW-NEXT: [[TMP47:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]] -; CHECK-MAXBW-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]] -; CHECK-MAXBW-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]] -; CHECK-MAXBW-NEXT: [[TMP50:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]] -; CHECK-MAXBW-NEXT: [[TMP51:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]] -; CHECK-MAXBW-NEXT: [[TMP52:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]] -; CHECK-MAXBW-NEXT: [[TMP101:%.*]] = load i16, ptr [[TMP37]], align 2 -; CHECK-MAXBW-NEXT: [[TMP102:%.*]] = load i16, ptr [[TMP38]], align 2 -; CHECK-MAXBW-NEXT: [[TMP103:%.*]] = load i16, ptr [[TMP39]], align 2 -; CHECK-MAXBW-NEXT: [[TMP104:%.*]] = load i16, ptr [[TMP40]], align 2 -; CHECK-MAXBW-NEXT: [[TMP105:%.*]] = load i16, ptr [[TMP41]], align 2 -; CHECK-MAXBW-NEXT: [[TMP106:%.*]] = load i16, ptr [[TMP42]], align 2 -; CHECK-MAXBW-NEXT: [[TMP107:%.*]] = load i16, ptr [[TMP43]], align 2 -; CHECK-MAXBW-NEXT: [[TMP108:%.*]] = load i16, ptr [[TMP44]], align 2 -; CHECK-MAXBW-NEXT: [[TMP109:%.*]] = load i16, ptr [[TMP45]], align 2 -; CHECK-MAXBW-NEXT: [[TMP110:%.*]] = load i16, ptr [[TMP46]], align 2 -; CHECK-MAXBW-NEXT: [[TMP111:%.*]] = load i16, ptr [[TMP47]], align 2 -; CHECK-MAXBW-NEXT: [[TMP112:%.*]] = load i16, ptr [[TMP48]], align 2 -; CHECK-MAXBW-NEXT: [[TMP113:%.*]] = load i16, ptr [[TMP49]], align 2 -; CHECK-MAXBW-NEXT: [[TMP114:%.*]] = load i16, ptr [[TMP50]], align 2 -; CHECK-MAXBW-NEXT: [[TMP115:%.*]] = load i16, ptr [[TMP51]], align 2 -; 
CHECK-MAXBW-NEXT: [[TMP116:%.*]] = load i16, ptr [[TMP52]], align 2 -; CHECK-MAXBW-NEXT: [[TMP117:%.*]] = insertelement <16 x i16> poison, i16 [[TMP101]], i32 0 -; CHECK-MAXBW-NEXT: [[TMP118:%.*]] = insertelement <16 x i16> [[TMP117]], i16 [[TMP102]], i32 1 -; CHECK-MAXBW-NEXT: [[TMP119:%.*]] = insertelement <16 x i16> [[TMP118]], i16 [[TMP103]], i32 2 -; CHECK-MAXBW-NEXT: [[TMP120:%.*]] = insertelement <16 x i16> [[TMP119]], i16 [[TMP104]], i32 3 -; CHECK-MAXBW-NEXT: [[TMP121:%.*]] = insertelement <16 x i16> [[TMP120]], i16 [[TMP105]], i32 4 -; CHECK-MAXBW-NEXT: [[TMP122:%.*]] = insertelement <16 x i16> [[TMP121]], i16 [[TMP106]], i32 5 -; CHECK-MAXBW-NEXT: [[TMP123:%.*]] = insertelement <16 x i16> [[TMP122]], i16 [[TMP107]], i32 6 -; CHECK-MAXBW-NEXT: [[TMP124:%.*]] = insertelement <16 x i16> [[TMP123]], i16 [[TMP108]], i32 7 -; CHECK-MAXBW-NEXT: [[TMP125:%.*]] = insertelement <16 x i16> [[TMP124]], i16 [[TMP109]], i32 8 -; CHECK-MAXBW-NEXT: [[TMP126:%.*]] = insertelement <16 x i16> [[TMP125]], i16 [[TMP110]], i32 9 -; CHECK-MAXBW-NEXT: [[TMP127:%.*]] = insertelement <16 x i16> [[TMP126]], i16 [[TMP111]], i32 10 -; CHECK-MAXBW-NEXT: [[TMP128:%.*]] = insertelement <16 x i16> [[TMP127]], i16 [[TMP112]], i32 11 -; CHECK-MAXBW-NEXT: [[TMP129:%.*]] = insertelement <16 x i16> [[TMP128]], i16 [[TMP113]], i32 12 -; CHECK-MAXBW-NEXT: [[TMP130:%.*]] = insertelement <16 x i16> [[TMP129]], i16 [[TMP114]], i32 13 -; CHECK-MAXBW-NEXT: [[TMP131:%.*]] = insertelement <16 x i16> [[TMP130]], i16 [[TMP115]], i32 14 -; CHECK-MAXBW-NEXT: [[TMP132:%.*]] = insertelement <16 x i16> [[TMP131]], i16 [[TMP116]], i32 15 -; CHECK-MAXBW-NEXT: [[TMP134:%.*]] = zext <16 x i16> [[TMP132]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP136:%.*]] = mul <16 x i32> [[TMP134]], [[TMP36]] -; CHECK-MAXBW-NEXT: [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]] -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-MAXBW-NEXT: [[TMP139:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; 
CHECK-MAXBW-NEXT: br i1 [[TMP139]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; -entry: - br label %for.body - -for.body: ; preds = %for.body, %entry - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] - %gep.a = getelementptr i8, ptr %a, i64 %iv - %load.a = load i8, ptr %gep.a, align 1 - %ext.a = zext i8 %load.a to i32 - %gep.b = getelementptr i8, ptr %b, i64 %iv - %load.b = load i16, ptr %gep.b, align 2 - %ext.b = zext i16 %load.b to i32 - %mul = mul i32 %ext.b, %ext.a - %add = add i32 %mul, %accum - %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 0 - br i1 %exitcond.not, label %for.exit, label %for.body - -for.exit: ; preds = %for.body - ret i32 %add -} - -define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 { -; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_not_loop_carried( -; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-INTERLEAVE1-NEXT: entry: -; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 -; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] -; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 -; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8 -; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1 -; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 0, i32 [[TMP8]] -; CHECK-INTERLEAVE1-NEXT: br label 
[[VECTOR_BODY:%.*]] -; CHECK-INTERLEAVE1: vector.body: -; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 -; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]] -; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP11]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext [[WIDE_LOAD]] to -; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] -; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP14]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = zext [[WIDE_LOAD1]] to -; CHECK-INTERLEAVE1-NEXT: [[TMP16]] = mul [[TMP15]], [[TMP12]] -; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call @llvm.vector.splice.nxv8i32( [[VECTOR_RECUR]], [[TMP16]], i32 -1) -; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = add [[TMP16]], [[TMP17]] -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] -; -; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_loop_carried( -; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-INTERLEAVED-NEXT: entry: -; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 -; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 
@llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] -; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16 -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8 -; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1 -; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 0, i32 [[TMP8]] -; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-INTERLEAVED: vector.body: -; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]] -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 8 -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP11]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP14]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext [[WIDE_LOAD]] to -; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = zext [[WIDE_LOAD1]] to -; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = 
mul i64 [[TMP19]], 8 -; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP17]], i64 [[TMP20]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP18]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP21]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = zext [[WIDE_LOAD2]] to -; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = zext [[WIDE_LOAD3]] to -; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = mul [[TMP22]], [[TMP15]] -; CHECK-INTERLEAVED-NEXT: [[TMP25]] = mul [[TMP23]], [[TMP16]] -; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call @llvm.vector.splice.nxv8i32( [[TMP24]], [[TMP25]], i32 -1) -; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = add [[TMP25]], [[TMP26]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] -; -; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_loop_carried( -; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-MAXBW-NEXT: entry: -; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 -; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] -; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] -; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 -; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8 -; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1 -; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 0, i32 
[[TMP8]] -; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-MAXBW: vector.body: -; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 -; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]] -; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP11]], align 1 -; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = zext [[WIDE_LOAD1]] to -; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] -; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP18]], align 1 -; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = zext [[WIDE_LOAD3]] to -; CHECK-MAXBW-NEXT: [[TMP25]] = mul [[TMP23]], [[TMP16]] -; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = call @llvm.vector.splice.nxv8i32( [[VECTOR_RECUR]], [[TMP25]], i32 -1) -; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = add [[TMP25]], [[TMP26]] -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] -; -entry: - br label %for.body - -for.body: ; preds = %for.body, %entry - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %accum = phi i32 [ 0, %entry ], [ %mul, %for.body ] - %gep.a = getelementptr i8, ptr %a, i64 %iv - %load.a = load i8, ptr %gep.a, align 1 - %ext.a = zext i8 %load.a to i32 - %gep.b = getelementptr i8, ptr %b, i64 %iv - %load.b = load i8, ptr %gep.b, align 1 - %ext.b = zext i8 %load.b to i32 - %mul = mul i32 %ext.b, %ext.a - %add = add i32 %mul, %accum - %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 0 - 
br i1 %exitcond.not, label %for.exit, label %for.body - -for.exit: ; preds = %for.body - ret i32 %add -} - -define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 { -; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_not_phi( -; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-INTERLEAVE1-NEXT: entry: -; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 -; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] -; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 -; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8 -; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1 -; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 0, i32 [[TMP8]] -; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-INTERLEAVE1: vector.body: -; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 -; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]] -; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP11]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext [[WIDE_LOAD]] to -; CHECK-INTERLEAVE1-NEXT: 
[[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] -; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP14]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = zext [[WIDE_LOAD1]] to -; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul [[TMP15]], [[TMP12]] -; CHECK-INTERLEAVE1-NEXT: [[TMP17]] = add [[TMP16]], [[TMP15]] -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] -; -; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_phi( -; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-INTERLEAVED-NEXT: entry: -; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 -; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] -; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16 -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8 -; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1 -; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 0, i32 [[TMP8]] -; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-INTERLEAVED: vector.body: -; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], 
[[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]] -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 8 -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP14]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext [[WIDE_LOAD]] to -; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 8 -; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP16]], i64 [[TMP19]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP20]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = zext [[WIDE_LOAD1]] to -; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul [[TMP22]], [[TMP15]] -; CHECK-INTERLEAVED-NEXT: [[TMP21]] = add [[TMP30]], [[TMP22]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] -; -; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_phi( -; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-MAXBW-NEXT: entry: -; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 -; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: 
[[TMP3:%.*]] = mul i64 [[TMP2]], 8 -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] -; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] -; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 -; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8 -; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1 -; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 0, i32 [[TMP8]] -; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-MAXBW: vector.body: -; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 -; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]] -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP13]], align 1 -; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = zext [[WIDE_LOAD]] to -; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] -; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP15]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP18]], align 1 -; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = zext [[WIDE_LOAD1]] to -; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = mul [[TMP19]], [[TMP14]] -; CHECK-MAXBW-NEXT: [[TMP21]] = add [[TMP20]], [[TMP19]] -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] -; -entry: - br label %for.body - -for.body: ; preds = %for.body, %entry - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - 
%accum = phi i32 [ 0, %entry ], [ %add, %for.body ] - %gep.a = getelementptr i8, ptr %a, i64 %iv - %load.a = load i8, ptr %gep.a, align 1 - %ext.a = zext i8 %load.a to i32 - %gep.b = getelementptr i8, ptr %b, i64 %iv - %load.b = load i8, ptr %gep.b, align 1 - %ext.b = zext i8 %load.b to i32 - %mul = mul i32 %ext.b, %ext.a - %add = add i32 %mul, %ext.b - %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 0 - br i1 %exitcond.not, label %for.exit, label %for.body - -for.exit: ; preds = %for.body - ret i32 %add -} - -define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { -; CHECK-INTERLEAVE1-LABEL: define i32 @dotp_unrolled( -; CHECK-INTERLEAVE1-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-INTERLEAVE1-NEXT: entry: -; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = mul i64 [[TMP13]], 4 -; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP15]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], 4 -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP18]] -; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]] -; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = mul i64 [[TMP20]], 4 -; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-INTERLEAVE1: vector.body: -; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP41:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], 
[ [[TMP35:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI2:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI3:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] -; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] -; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] -; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP0]], 3 -; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] -; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] -; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = sext [[WIDE_LOAD]] to -; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP14]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = sext [[WIDE_LOAD4]] to -; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = mul nsw [[TMP21]], [[TMP36]] -; CHECK-INTERLEAVE1-NEXT: [[TMP23]] = add [[TMP38]], [[VEC_PHI3]] -; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 -; 
CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP17]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = sext [[WIDE_LOAD5]] to -; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP19]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = sext [[WIDE_LOAD6]] to -; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = mul nsw [[TMP25]], [[TMP42]] -; CHECK-INTERLEAVE1-NEXT: [[TMP30]] = add [[TMP28]], [[VEC_PHI2]] -; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD7:%.*]] = load , ptr [[TMP22]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = sext [[WIDE_LOAD7]] to -; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD8:%.*]] = load , ptr [[TMP24]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = sext [[WIDE_LOAD8]] to -; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = mul nsw [[TMP31]], [[TMP33]] -; CHECK-INTERLEAVE1-NEXT: [[TMP35]] = add [[TMP34]], [[VEC_PHI1]] -; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD9:%.*]] = load , ptr [[TMP27]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = sext [[WIDE_LOAD9]] to -; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD10:%.*]] = load , ptr [[TMP29]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = sext [[WIDE_LOAD10]] to -; CHECK-INTERLEAVE1-NEXT: [[TMP40:%.*]] = mul nsw [[TMP37]], [[TMP39]] -; CHECK-INTERLEAVE1-NEXT: [[TMP41]] = add [[TMP40]], [[VEC_PHI]] -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP26]] -; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop 
[[LOOP10:![0-9]+]] -; -; CHECK-INTERLEAVED-LABEL: define i32 @dotp_unrolled( -; CHECK-INTERLEAVED-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-INTERLEAVED-NEXT: entry: -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = mul i64 [[TMP13]], 8 -; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP15]] -; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], 8 -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP18]] -; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]] -; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = mul i64 [[TMP34]], 8 -; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-INTERLEAVED: vector.body: -; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP64:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP65:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI4:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI5:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP49:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI6:%.*]] = phi [ 
zeroinitializer, [[VECTOR_PH]] ], [ [[TMP50:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI7:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1 -; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 2 -; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] -; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP0]], 3 -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul i64 [[TMP56]], 4 -; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP20]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load , ptr [[TMP21]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = sext [[WIDE_LOAD]] to -; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext [[WIDE_LOAD8]] to -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() -; 
CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 4 -; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 [[TMP26]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load , ptr [[TMP14]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load , ptr [[TMP72]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext [[WIDE_LOAD9]] to -; CHECK-INTERLEAVED-NEXT: [[TMP82:%.*]] = sext [[WIDE_LOAD10]] to -; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul nsw [[TMP28]], [[TMP66]] -; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul nsw [[TMP82]], [[TMP23]] -; CHECK-INTERLEAVED-NEXT: [[TMP50]] = add [[TMP30]], [[VEC_PHI6]] -; CHECK-INTERLEAVED-NEXT: [[TMP33]] = add [[TMP31]], [[VEC_PHI7]] -; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = mul i64 [[TMP35]], 4 -; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 [[TMP36]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD11:%.*]] = load , ptr [[TMP17]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load , ptr [[TMP37]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = sext [[WIDE_LOAD11]] to -; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = sext [[WIDE_LOAD12]] to -; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = mul i64 [[TMP41]], 4 -; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP42]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD13:%.*]] = load , ptr [[TMP19]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD14:%.*]] = load , ptr [[TMP43]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = sext [[WIDE_LOAD13]] to -; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = sext [[WIDE_LOAD14]] to -; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = 
mul nsw [[TMP38]], [[TMP44]] -; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = mul nsw [[TMP39]], [[TMP45]] -; CHECK-INTERLEAVED-NEXT: [[TMP48]] = add [[TMP46]], [[VEC_PHI4]] -; CHECK-INTERLEAVED-NEXT: [[TMP49]] = add [[TMP47]], [[VEC_PHI5]] -; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = mul i64 [[TMP51]], 4 -; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP52]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD15:%.*]] = load , ptr [[TMP22]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD16:%.*]] = load , ptr [[TMP53]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = sext [[WIDE_LOAD15]] to -; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = sext [[WIDE_LOAD16]] to -; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP58:%.*]] = mul i64 [[TMP57]], 4 -; CHECK-INTERLEAVED-NEXT: [[TMP59:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 [[TMP58]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD17:%.*]] = load , ptr [[TMP24]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD18:%.*]] = load , ptr [[TMP59]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = sext [[WIDE_LOAD17]] to -; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = sext [[WIDE_LOAD18]] to -; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = mul nsw [[TMP54]], [[TMP60]] -; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = mul nsw [[TMP55]], [[TMP61]] -; CHECK-INTERLEAVED-NEXT: [[TMP64]] = add [[TMP62]], [[VEC_PHI2]] -; CHECK-INTERLEAVED-NEXT: [[TMP65]] = add [[TMP63]], [[VEC_PHI3]] -; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = mul i64 [[TMP67]], 4 -; CHECK-INTERLEAVED-NEXT: 
[[TMP69:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 [[TMP68]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD19:%.*]] = load , ptr [[TMP27]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD20:%.*]] = load , ptr [[TMP69]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP70:%.*]] = sext [[WIDE_LOAD19]] to -; CHECK-INTERLEAVED-NEXT: [[TMP71:%.*]] = sext [[WIDE_LOAD20]] to -; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP74:%.*]] = mul i64 [[TMP73]], 4 -; CHECK-INTERLEAVED-NEXT: [[TMP75:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 [[TMP74]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load , ptr [[TMP29]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD22:%.*]] = load , ptr [[TMP75]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP76:%.*]] = sext [[WIDE_LOAD21]] to -; CHECK-INTERLEAVED-NEXT: [[TMP77:%.*]] = sext [[WIDE_LOAD22]] to -; CHECK-INTERLEAVED-NEXT: [[TMP78:%.*]] = mul nsw [[TMP70]], [[TMP76]] -; CHECK-INTERLEAVED-NEXT: [[TMP79:%.*]] = mul nsw [[TMP71]], [[TMP77]] -; CHECK-INTERLEAVED-NEXT: [[TMP80]] = add [[TMP78]], [[VEC_PHI]] -; CHECK-INTERLEAVED-NEXT: [[TMP81]] = add [[TMP79]], [[VEC_PHI1]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP40]] -; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] -; -; CHECK-MAXBW-LABEL: define i32 @dotp_unrolled( -; CHECK-MAXBW-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-MAXBW-NEXT: entry: -; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 -; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP1]] -; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], 
label [[VECTOR_PH:%.*]] -; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP3]] -; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]] -; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 -; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-MAXBW: vector.body: -; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI4:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE16:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI5:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE17:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI6:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI7:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE11:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] -; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP6]], 1 -; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] -; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] -; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = or disjoint i64 [[TMP6]], 2 -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]] -; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]] -; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = or disjoint i64 [[TMP6]], 3 -; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]] -; 
CHECK-MAXBW-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]] -; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP18]], align 1 -; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = sext [[WIDE_LOAD]] to -; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD9:%.*]] = load , ptr [[TMP24]], align 1 -; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = sext [[WIDE_LOAD9]] to -; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = mul nsw [[TMP29]], [[TMP23]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE11]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI7]], [[TMP31]]) -; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD12:%.*]] = load , ptr [[TMP32]], align 1 -; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = sext [[WIDE_LOAD12]] to -; CHECK-MAXBW-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD14:%.*]] = load , ptr [[TMP38]], align 1 -; CHECK-MAXBW-NEXT: [[TMP43:%.*]] = sext [[WIDE_LOAD14]] to -; CHECK-MAXBW-NEXT: [[TMP45:%.*]] = mul nsw [[TMP37]], [[TMP43]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI6]], [[TMP45]]) -; CHECK-MAXBW-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD18:%.*]] = load , ptr [[TMP46]], align 1 -; CHECK-MAXBW-NEXT: [[TMP51:%.*]] = sext [[WIDE_LOAD18]] to -; CHECK-MAXBW-NEXT: [[TMP52:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD20:%.*]] = load , ptr [[TMP52]], align 1 -; CHECK-MAXBW-NEXT: [[TMP57:%.*]] = sext [[WIDE_LOAD20]] to -; CHECK-MAXBW-NEXT: [[TMP59:%.*]] = mul nsw [[TMP51]], [[TMP57]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE17]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI5]], [[TMP59]]) -; 
CHECK-MAXBW-NEXT: [[TMP60:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD24:%.*]] = load , ptr [[TMP60]], align 1 -; CHECK-MAXBW-NEXT: [[TMP65:%.*]] = sext [[WIDE_LOAD24]] to -; CHECK-MAXBW-NEXT: [[TMP66:%.*]] = getelementptr inbounds i8, ptr [[TMP17]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD26:%.*]] = load , ptr [[TMP66]], align 1 -; CHECK-MAXBW-NEXT: [[TMP71:%.*]] = sext [[WIDE_LOAD26]] to -; CHECK-MAXBW-NEXT: [[TMP73:%.*]] = mul nsw [[TMP65]], [[TMP71]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE16]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI4]], [[TMP73]]) -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-MAXBW-NEXT: [[TMP74:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] -; -entry: - br label %for.body - -for.body: ; preds = %entry, %for.body - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %accum3 = phi i32 [ 0, %entry ], [ %add.a3, %for.body ] - %accum2 = phi i32 [ 0, %entry ], [ %add.a2, %for.body ] - %accum1 = phi i32 [ 0, %entry ], [ %add.a1, %for.body ] - %accum0 = phi i32 [ 0, %entry ], [ %add.a0, %for.body ] - %gep.a0 = getelementptr inbounds i8, ptr %a, i64 %iv - %gep.b0 = getelementptr inbounds i8, ptr %b, i64 %iv - %offset.1 = or disjoint i64 %iv, 1 - %gep.a1 = getelementptr inbounds i8, ptr %a, i64 %offset.1 - %gep.b1 = getelementptr inbounds i8, ptr %b, i64 %offset.1 - %offset.2 = or disjoint i64 %iv, 2 - %gep.a2 = getelementptr inbounds i8, ptr %a, i64 %offset.2 - %gep.b2 = getelementptr inbounds i8, ptr %b, i64 %offset.2 - %offset.3 = or disjoint i64 %iv, 3 - %gep.a3 = getelementptr inbounds i8, ptr %a, i64 %offset.3 - %gep.b3 = getelementptr inbounds i8, ptr %b, i64 %offset.3 - %load.a0 = load i8, ptr %gep.a0, align 1 - %ext.a0 = sext i8 %load.a0 to i32 - %load.b0 = load i8, ptr %gep.b0, align 1 - %ext.b0 = sext i8 %load.b0 
to i32 - %mul.a0 = mul nsw i32 %ext.b0, %ext.a0 - %add.a0 = add nsw i32 %mul.a0, %accum0 - %load.a1 = load i8, ptr %gep.a1, align 1 - %ext.a1 = sext i8 %load.a1 to i32 - %load.b1 = load i8, ptr %gep.b1, align 1 - %ext.b1 = sext i8 %load.b1 to i32 - %mul.a1 = mul nsw i32 %ext.a1, %ext.b1 - %add.a1 = add nsw i32 %mul.a1, %accum1 - %load.a2 = load i8, ptr %gep.a2, align 1 - %ext.a2 = sext i8 %load.a2 to i32 - %load.b2 = load i8, ptr %gep.b2, align 1 - %ext.b2 = sext i8 %load.b2 to i32 - %mul.a2 = mul nsw i32 %ext.a2, %ext.b2 - %add.a2 = add nsw i32 %mul.a2, %accum2 - %load.a3 = load i8, ptr %gep.a3, align 1 - %ext.a3 = sext i8 %load.a3 to i32 - %load.b3 = load i8, ptr %gep.b3, align 1 - %ext.b3 = sext i8 %load.b3 to i32 - %mul.a3 = mul nsw i32 %ext.a3, %ext.b3 - %add.a3 = add nsw i32 %mul.a3, %accum3 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %num_in - br i1 %exitcond.not, label %exit, label %for.body - -exit: ; preds = %for.body - %result0 = add nsw i32 %add.a0, %add.a1 - %result1 = add nsw i32 %add.a2, %add.a3 - %result = add nsw i32 %result0, %result1 - ret i32 %result -} - -define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { -; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_predicated( -; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-INTERLEAVE1-NEXT: entry: -; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4 -; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], [[TMP7]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP10]], 4 -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] 
-; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = mul i64 [[TMP11]], 4 -; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-INTERLEAVE1: vector.body: -; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] -; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP2]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = sext [[WIDE_LOAD]] to -; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] -; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP5]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = sext [[WIDE_LOAD1]] to -; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul nsw [[TMP12]], [[TMP9]] -; CHECK-INTERLEAVE1-NEXT: [[TMP14]] = add [[TMP13]], [[VEC_PHI]] -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] -; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] -; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP14]]) -; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_PH]] -; -; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated( -; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], 
ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-INTERLEAVED-NEXT: entry: -; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul i64 [[TMP8]], 8 -; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], [[TMP14]] -; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP15]], 8 -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 -; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-INTERLEAVED: vector.body: -; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4 -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP2]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP11]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = sext [[WIDE_LOAD]] to -; CHECK-INTERLEAVED-NEXT: 
[[TMP25:%.*]] = sext [[WIDE_LOAD2]] to -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] -; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[TMP17]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP7]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP18]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = sext [[WIDE_LOAD3]] to -; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = sext [[WIDE_LOAD4]] to -; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul nsw [[TMP19]], [[TMP12]] -; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul nsw [[TMP20]], [[TMP25]] -; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add [[TMP21]], [[VEC_PHI]] -; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add [[TMP22]], [[VEC_PHI1]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] -; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add [[TMP24]], [[TMP23]] -; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[BIN_RDX]]) -; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_PH]] -; -; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated( -; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-MAXBW-NEXT: entry: -; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 -; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = 
icmp ult i64 [[N]], [[TMP1]] -; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] -; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 -; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-MAXBW: vector.body: -; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] -; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP8]], align 1 -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD2]] to -; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] -; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP15]], align 1 -; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = sext [[WIDE_LOAD4]] to -; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = mul nsw [[TMP20]], [[TMP13]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE5]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI1]], [[TMP22]]) -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] -; -entry: - br label %for.body - 
-for.body: ; preds = %entry, %for.body - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] - %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv - %load.a = load i8, ptr %gep.a, align 1 - %ext.a = sext i8 %load.a to i32 - %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv - %load.b = load i8, ptr %gep.b, align 1 - %ext.b = sext i8 %load.b to i32 - %mul = mul nsw i32 %ext.b, %ext.a - %add = add nsw i32 %mul, %accum - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %for.body - -exit: ; preds = %for.body - ret i32 %add -} - -define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) #0 { -; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_predicated_pragma( -; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-INTERLEAVE1-NEXT: entry: -; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4 -; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = sub i64 [[TMP10]], 1 -; CHECK-INTERLEAVE1-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP11]] -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = mul i64 [[TMP12]], 4 -; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = mul i64 [[TMP15]], 4 -; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[TMP6]] -; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[N]], [[TMP6]] -; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 [[TMP0]], i64 0 -; CHECK-INTERLEAVE1-NEXT: 
[[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) -; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-INTERLEAVE1: vector.body: -; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 -; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP5]], i32 1, [[ACTIVE_LANE_MASK]], poison) -; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sext [[WIDE_MASKED_LOAD]] to -; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP8]], i32 1, [[ACTIVE_LANE_MASK]], poison) -; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = sext [[WIDE_MASKED_LOAD1]] to -; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = mul nsw [[TMP16]], [[TMP13]] -; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = add [[TMP17]], [[VEC_PHI]] -; CHECK-INTERLEAVE1-NEXT: [[TMP19]] = select [[ACTIVE_LANE_MASK]], [[TMP18]], [[VEC_PHI]] -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]] -; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP2]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true) -; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = 
extractelement [[TMP20]], i32 0 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] -; -; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated_pragma( -; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-INTERLEAVED-NEXT: entry: -; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4 -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = sub i64 [[TMP10]], 1 -; CHECK-INTERLEAVED-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP11]] -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul i64 [[TMP12]], 4 -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP15]], 4 -; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[TMP6]] -; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[N]], [[TMP6]] -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 [[TMP0]], i64 0 -; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) -; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-INTERLEAVED: vector.body: -; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] -; 
CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 -; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP5]], i32 1, [[ACTIVE_LANE_MASK]], poison) -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext [[WIDE_MASKED_LOAD]] to -; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP8]], i32 1, [[ACTIVE_LANE_MASK]], poison) -; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = sext [[WIDE_MASKED_LOAD1]] to -; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul nsw [[TMP16]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = add [[TMP17]], [[VEC_PHI]] -; CHECK-INTERLEAVED-NEXT: [[TMP19]] = select [[ACTIVE_LANE_MASK]], [[TMP18]], [[VEC_PHI]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]] -; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP2]]) -; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true) -; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = extractelement [[TMP20]], i32 0 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] -; -; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated_pragma( -; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-MAXBW-NEXT: entry: -; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-MAXBW-NEXT: 
[[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; CHECK-MAXBW-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP2]] -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] -; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 -; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 -; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] -; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 -; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) -; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-MAXBW: vector.body: -; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0 -; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]] -; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]], poison) -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext [[WIDE_MASKED_LOAD]] to -; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]] -; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP15]], i32 1, [[ACTIVE_LANE_MASK]], poison) -; 
CHECK-MAXBW-NEXT: [[TMP16:%.*]] = sext [[WIDE_MASKED_LOAD1]] to -; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw [[TMP16]], [[TMP13]] -; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = add [[TMP17]], [[VEC_PHI]] -; CHECK-MAXBW-NEXT: [[TMP19]] = select [[ACTIVE_LANE_MASK]], [[TMP18]], [[VEC_PHI]] -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] -; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) -; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true) -; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = extractelement [[TMP20]], i32 0 -; CHECK-MAXBW-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] -; -entry: - br label %for.body - -for.body: ; preds = %entry, %for.body - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] - %gep.a = getelementptr inbounds i8, ptr %b, i64 %iv - %load.a = load i8, ptr %gep.a, align 1 - %ext.a = sext i8 %load.a to i32 - %gep.a2 = getelementptr inbounds i8, ptr %a, i64 %iv - %load.b = load i8, ptr %gep.a2, align 1 - %ext.b = sext i8 %load.b to i32 - %mul = mul nsw i32 %ext.b, %ext.a - %add = add nsw i32 %mul, %accum - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !7 - -exit: ; preds = %for.body - ret i32 %add -} - -define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 { -; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_extend_user( -; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-INTERLEAVE1-NEXT: entry: -; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; 
CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = mul i64 [[TMP8]], 4 -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] -; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP11]], 4 -; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-INTERLEAVE1: vector.body: -; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]] -; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD]] to -; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] -; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP7]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext [[WIDE_LOAD1]] to -; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul [[TMP12]], [[TMP9]] -; CHECK-INTERLEAVE1-NEXT: [[TMP14]] = add [[TMP13]], [[VEC_PHI]] -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] -; -; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_extend_user( -; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-INTERLEAVED-NEXT: entry: -; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; 
CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 -; CHECK-INTERLEAVED-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8 -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP7]] -; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP14]], 8 -; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-INTERLEAVED: vector.body: -; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]] -; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP15]], 4 -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP11]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext [[WIDE_LOAD]] to -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext [[WIDE_LOAD2]] to -; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() 
-; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP17]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP9]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP18]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = zext [[WIDE_LOAD3]] to -; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = zext [[WIDE_LOAD4]] to -; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul [[TMP19]], [[TMP12]] -; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul [[TMP20]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add [[TMP21]], [[VEC_PHI]] -; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add [[TMP22]], [[VEC_PHI1]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] -; -; CHECK-MAXBW-LABEL: define i32 @not_dotp_extend_user( -; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-MAXBW-NEXT: entry: -; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 -; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] -; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] -; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 -; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-MAXBW: vector.body: -; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], 
[ [[TMP24:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]] -; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP8]], align 1 -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext [[WIDE_LOAD2]] to -; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]] -; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP14]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP15]], align 1 -; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = zext [[WIDE_LOAD4]] to -; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = mul [[TMP20]], [[TMP13]] -; CHECK-MAXBW-NEXT: [[TMP24]] = add [[TMP22]], [[VEC_PHI1]] -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] -; -entry: - br label %for.body - -for.body: ; preds = %for.body, %entry - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] - %gep.a = getelementptr i8, ptr %a, i64 %iv - %load.a = load i8, ptr %gep.a, align 1 - %ext.a = zext i8 %load.a to i32 - %gep.b = getelementptr i8, ptr %b, i64 %iv - %load.b = load i8, ptr %gep.b, align 1 - %ext.b = zext i8 %load.b to i32 - %mul = mul i32 %ext.b, %ext.a - %add = add i32 %mul, %accum - %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 0 - br i1 %exitcond.not, label %for.exit, label %for.body - -for.exit: ; preds = %for.body - %result = add i32 %add, %ext.b - ret i32 %result -} - -define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { -; CHECK-INTERLEAVE1-LABEL: define i64 @dotp_cost_disagreement( -; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-INTERLEAVE1-NEXT: entry: -; 
CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 -; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]] -; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 -; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-INTERLEAVE1: vector.body: -; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] -; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD]] to -; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[TMP6]], 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP11]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP12]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = zext [[WIDE_LOAD1]] to -; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = mul nuw nsw [[TMP13]], [[TMP9]] -; CHECK-INTERLEAVE1-NEXT: [[TMP15]] = add 
[[VEC_PHI]], [[TMP14]] -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; -; CHECK-INTERLEAVED-LABEL: define i64 @dotp_cost_disagreement( -; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-INTERLEAVED-NEXT: entry: -; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]] -; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]] -; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-INTERLEAVED: vector.body: -; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] -; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; 
CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2 -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i64 [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP11]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext [[WIDE_LOAD]] to -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext [[WIDE_LOAD2]] to -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add nuw nsw i64 [[TMP6]], 1 -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP14]] -; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP15]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 2 -; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP15]], i64 [[TMP18]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP16]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP19]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = zext [[WIDE_LOAD3]] to -; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = zext [[WIDE_LOAD4]] to -; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul nuw nsw [[TMP20]], [[TMP12]] -; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = mul nuw nsw [[TMP21]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add [[VEC_PHI]], [[TMP22]] -; CHECK-INTERLEAVED-NEXT: [[TMP25]] = add [[VEC_PHI1]], [[TMP23]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; -; CHECK-MAXBW-LABEL: define i64 @dotp_cost_disagreement( -; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-MAXBW-NEXT: entry: -; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = 
call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 -; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]] -; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]] -; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]] -; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 -; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-MAXBW: vector.body: -; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] -; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 1 -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD]] to -; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[TMP6]], 1 -; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP10]] -; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP11]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP12]], align 1 -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext [[WIDE_LOAD1]] to -; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = mul nuw nsw [[TMP13]], [[TMP9]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.experimental.vector.partial.reduce.add.nxv1i64.nxv8i64( [[VEC_PHI]], [[TMP14]]) -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-MAXBW-NEXT: 
[[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.add.nxv1i64( [[PARTIAL_REDUCE]]) -; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] -; -entry: - br label %for.body - -for.body: ; preds = %entry, %for.body - %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] - %sum = phi i64 [ 0, %entry ], [ %add, %for.body ] - %arrayidx = getelementptr inbounds nuw i8, ptr %a, i64 %i.iv - %0 = load i8, ptr %arrayidx, align 1 - %conv = zext i8 %0 to i64 - %i.iv.next = add nuw nsw i64 %i.iv, 1 - %arrayidx2 = getelementptr inbounds nuw i8, ptr %b, i64 %i.iv.next - %1 = load i8, ptr %arrayidx2, align 1 - %conv3 = zext i8 %1 to i64 - %mul = mul nuw nsw i64 %conv3, %conv - %add = add i64 %sum, %mul - %exitcond.not = icmp eq i64 %i.iv.next, 16 - br i1 %exitcond.not, label %exit, label %for.body - -exit: ; preds = %for.body - ret i64 %add -} - -!7 = distinct !{!7, !8, !9, !10} -!8 = !{!"llvm.loop.mustprogress"} -!9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} -!10 = !{!"llvm.loop.vectorize.enable", i1 true} -attributes #0 = { vscale_range(1,16) "target-features"="+sve" } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll deleted file mode 100644 index f24b115ab9f99..0000000000000 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll +++ /dev/null @@ -1,61 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon -S < %s | FileCheck %s - -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" 
-target triple = "aarch64-none-unknown-elf" - -define i32 @not_dotp(ptr %a, ptr %b) { -; CHECK-LABEL: define i32 @not_dotp( -; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br i1 true, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 -; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 -; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16 -; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 -; CHECK-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]] -; CHECK-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]] -; CHECK-NEXT: [[TMP13]] = add <16 x i32> [[TMP11]], [[VEC_PHI]] -; 
CHECK-NEXT: [[TMP14]] = add <16 x i32> [[TMP12]], [[VEC_PHI1]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; -entry: - br label %for.body - -for.body: ; preds = %for.body, %entry - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] - %gep.a = getelementptr i8, ptr %a, i64 %iv - %load.a = load i8, ptr %gep.a, align 1 - %ext.a = zext i8 %load.a to i32 - %gep.b = getelementptr i8, ptr %b, i64 %iv - %load.b = load i8, ptr %gep.b, align 1 - %ext.b = zext i8 %load.b to i32 - %mul = mul i32 %ext.b, %ext.a - %add = add i32 %mul, %accum - %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 0 - br i1 %exitcond.not, label %for.exit, label %for.body - -for.exit: ; preds = %for.body - ret i32 %add -} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll deleted file mode 100644 index 06aaf29b382a2..0000000000000 --- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll +++ /dev/null @@ -1,93 +0,0 @@ -; REQUIRES: asserts -; RUN: opt -mattr=+neon,+dotprod -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-interleave=1 -disable-output %s 2>&1 | FileCheck %s - -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -target triple = "aarch64-none-unknown-elf" - -; Tests for printing VPlans that are enabled under AArch64 - -define i32 @print_partial_reduction(ptr %a, ptr %b) { -; CHECK: VPlan 'Initial VPlan for VF={8,16},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF -; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count -; CHECK-NEXT: Live-in ir<0> = original trip-count -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: Successor(s): vector.ph -; CHECK-EMPTY: -; CHECK-NEXT: vector.ph: 
-; CHECK-NEXT: Successor(s): vector loop -; CHECK-EMPTY: -; CHECK-NEXT: vector loop: { -; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> -; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[ACC:%.+]]> = phi ir<0>, ir<[[REDUCE:%.+]]> (VF scaled by 1/4) -; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> -; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[STEPS]]> -; CHECK-NEXT: vp<[[PTR_A:%.+]]> = vector-pointer ir<%gep.a> -; CHECK-NEXT: WIDEN ir<%load.a> = load vp<[[PTR_A]]> -; CHECK-NEXT: WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32 -; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<[[STEPS]]> -; CHECK-NEXT: vp<[[PTR_B:%.+]]> = vector-pointer ir<%gep.b> -; CHECK-NEXT: WIDEN ir<%load.b> = load vp<[[PTR_B]]> -; CHECK-NEXT: WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32 -; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a> -; CHECK-NEXT: PARTIAL-REDUCE ir<[[REDUCE]]> = add ir<%mul>, ir<[[ACC]]> -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK-NEXT: Successor(s): middle.block -; CHECK-EMPTY: -; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<[[ACC]]>, ir<[[REDUCE]]> -; CHECK-NEXT: EMIT vp<[[EXTRACT:%.+]]> = extract-from-end vp<[[RED_RESULT]]>, ir<1> -; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<0>, vp<%1> -; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> -; CHECK-NEXT: Successor(s): ir-bb, scalar.ph -; CHECK-EMPTY: -; CHECK-NEXT: scalar.ph: -; CHECK-NEXT: EMIT vp<%bc.merge.rdx> = resume-phi vp<[[RED_RESULT]]>, ir<0> -; CHECK-NEXT: Successor(s): ir-bb -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] -; CHECK-NEXT: IR %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] (extra operand: vp<%bc.merge.rdx> from 
scalar.ph) -; CHECK-NEXT: IR %gep.a = getelementptr i8, ptr %a, i64 %iv -; CHECK-NEXT: IR %load.a = load i8, ptr %gep.a, align 1 -; CHECK-NEXT: IR %ext.a = zext i8 %load.a to i32 -; CHECK-NEXT: IR %gep.b = getelementptr i8, ptr %b, i64 %iv -; CHECK-NEXT: IR %load.b = load i8, ptr %gep.b, align 1 -; CHECK-NEXT: IR %ext.b = zext i8 %load.b to i32 -; CHECK-NEXT: IR %mul = mul i32 %ext.b, %ext.a -; CHECK-NEXT: IR %add = add i32 %mul, %accum -; CHECK-NEXT: IR %iv.next = add i64 %iv, 1 -; CHECK-NEXT: IR %exitcond.not = icmp eq i64 %iv.next, 0 -; CHECK-NEXT: No successors -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[EXTRACT]]> from middle.block) -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; -entry: - br label %for.body - -for.body: ; preds = %for.body, %entry - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] - %gep.a = getelementptr i8, ptr %a, i64 %iv - %load.a = load i8, ptr %gep.a, align 1 - %ext.a = zext i8 %load.a to i32 - %gep.b = getelementptr i8, ptr %b, i64 %iv - %load.b = load i8, ptr %gep.b, align 1 - %ext.b = zext i8 %load.b to i32 - %mul = mul i32 %ext.b, %ext.a - %add = add i32 %mul, %accum - %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 0 - br i1 %exitcond.not, label %exit, label %for.body - -exit: - ret i32 %add -} From 815343e7dd32cc4c5c903ac52daf87aaa4e4cd6e Mon Sep 17 00:00:00 2001 From: Kyungwoo Lee Date: Fri, 27 Dec 2024 11:59:25 -0800 Subject: [PATCH 120/567] [CGData][Merger] Avoid merging the attached call target (#121030) For global function merging, the target of the arc-attached call must be a constant and cannot be parameterized. This change adds a check to bypass this case in `canParameterizeCallOperand()`. 
--- llvm/lib/CodeGen/GlobalMergeFunctions.cpp | 12 ++++-- .../cgdata-no-merge-attached-call-garget.ll | 37 +++++++++++++++++++ 2 files changed, 46 insertions(+), 3 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/cgdata-no-merge-attached-call-garget.ll diff --git a/llvm/lib/CodeGen/GlobalMergeFunctions.cpp b/llvm/lib/CodeGen/GlobalMergeFunctions.cpp index 1187ad06b7372..e920b1be6822c 100644 --- a/llvm/lib/CodeGen/GlobalMergeFunctions.cpp +++ b/llvm/lib/CodeGen/GlobalMergeFunctions.cpp @@ -60,11 +60,17 @@ static bool canParameterizeCallOperand(const CallBase *CI, unsigned OpIdx) { if (Name.starts_with("__dtrace")) return false; } - if (isCalleeOperand(CI, OpIdx) && - CI->getOperandBundle(LLVMContext::OB_ptrauth).has_value()) { + if (isCalleeOperand(CI, OpIdx)) { // The operand is the callee and it has already been signed. Ignore this // because we cannot add another ptrauth bundle to the call instruction. - return false; + if (CI->getOperandBundle(LLVMContext::OB_ptrauth).has_value()) + return false; + } else { + // The target of the arc-attached call must be a constant and cannot be + // parameterized. + if (CI->isOperandBundleOfType(LLVMContext::OB_clang_arc_attachedcall, + OpIdx)) + return false; } return true; } diff --git a/llvm/test/CodeGen/AArch64/cgdata-no-merge-attached-call-garget.ll b/llvm/test/CodeGen/AArch64/cgdata-no-merge-attached-call-garget.ll new file mode 100644 index 0000000000000..1163314062df1 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/cgdata-no-merge-attached-call-garget.ll @@ -0,0 +1,37 @@ +; This test verifies that two similar functions, f1 and f2, are not merged +; when their attached call targets differ, since these targets cannot be parameterized. 
+ +; RUN: llc -mtriple=arm64-apple-darwin -enable-global-merge-func=true < %s | FileCheck %s + +; CHECK-NOT: _f1.Tgm +; CHECK-NOT: _f2.Tgm + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-darwin" + +define i64 @f1(ptr %0) { + %2 = call ptr @g1(ptr %0, i32 0) minsize [ "clang.arc.attachedcall"(ptr @llvm.objc.unsafeClaimAutoreleasedReturnValue) ] + tail call void (...) @llvm.objc.clang.arc.noop.use(ptr %2) + %3 = call i64 @g2(ptr %2) + tail call void @objc_release(ptr %2) + %4 = tail call i64 @g3(i64 %3) + ret i64 %4 +} + +define i64 @f2(ptr %0) { + %2 = call ptr @g1(ptr %0, i32 0) minsize [ "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ] + tail call void (...) @llvm.objc.clang.arc.noop.use(ptr %2) + %3 = call i64 @g2(ptr %2) + tail call void @objc_release(ptr %2) + %4 = tail call i64 @g3(i64 %3) + ret i64 %4 +} + +declare ptr @g1(ptr, i32) +declare i64 @g2(ptr) +declare i64 @g3(i64) + +declare void @llvm.objc.clang.arc.noop.use(...) +declare ptr @llvm.objc.unsafeClaimAutoreleasedReturnValue(ptr) +declare ptr @llvm.objc.retainAutoreleasedReturnValue(ptr) +declare void @objc_release(ptr) From b2fd0a7a7065658ab4a3355399978523c1370615 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Fri, 27 Dec 2024 13:02:51 -0800 Subject: [PATCH 121/567] [Driver] Fix implicit-check-not regex (#121221) We need to exclude more than builtins, but it's tricky with FileCheck regex. So switching to list of libs we want to check. --- clang/test/Driver/sanitizer-ld.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/clang/test/Driver/sanitizer-ld.c b/clang/test/Driver/sanitizer-ld.c index 5befbb159183e..4e4cfbae27e11 100644 --- a/clang/test/Driver/sanitizer-ld.c +++ b/clang/test/Driver/sanitizer-ld.c @@ -1,8 +1,9 @@ // Test sanitizers ld flags. -// Match all libclang_rt, excluding platform-inconsistent builtins. 
+// Match all libclang_rt, excluding platform-inconsistent libs, like +// libclang_rt.builtins, libclang_rt.osx etc. -// DEFINE: %{filecheck} = FileCheck %s --implicit-check-not="libclang_rt.{{([^b]..|.[^u].|..[^i]).*}}" +// DEFINE: %{filecheck} = FileCheck %s --implicit-check-not="libclang_rt.{{([^.]+san|scudo|cfi|safestack|stats|fuzzer|undefined)}}" // RUN: %clang -### %s 2>&1 \ // RUN: --target=i386-unknown-linux -fuse-ld=ld -fsanitize=address \ @@ -37,7 +38,6 @@ // RUN: | %{filecheck} --check-prefix=CHECK-ASAN-NO-LINK-RUNTIME-DARWIN // // CHECK-ASAN-NO-LINK-RUNTIME-DARWIN: "{{.*}}ld" -// CHECK-ASAN-NO-LINK-RUNTIME-DARWIN: libclang_rt.osx.a" // RUN: %clang -fsanitize=address -### %s 2>&1 \ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ @@ -366,7 +366,6 @@ // RUN: | %{filecheck} --check-prefix=CHECK-TYSAN-DARWIN-CXX // CHECK-TYSAN-DARWIN-CXX: "{{.*}}ld" // CHECK-TYSAN-DARWIN-CXX: libclang_rt.tysan_osx_dynamic.dylib -// CHECK-TYSAN-DARWIN-CXX: libclang_rt.osx.a // CHECK-TYSAN-DARWIN-CXX-NOT: -lc++abi // RUN: %clangxx -### %s 2>&1 \ @@ -403,7 +402,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | %{filecheck} --check-prefix=CHECK-TSAN-NO-LINK-RUNTIME-DARWIN // -// CHECK-TSAN-NO-LINK-RUNTIME-DARWIN: libclang_rt.ios.a +// CHECK-TSAN-NO-LINK-RUNTIME-DARWIN: "{{(.*[^-.0-9A-Z_a-z])?}}ld" // RUN: %clangxx -### %s 2>&1 \ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld -stdlib=platform -lstdc++ \ @@ -473,7 +472,6 @@ // RUN: | %{filecheck} --check-prefix=CHECK-UBSAN-NO-LINK-RUNTIME-DARWIN // // CHECK-UBSAN-NO-LINK-RUNTIME-DARWIN: "{{.*}}ld" -// CHECK-UBSAN-NO-LINK-RUNTIME-DARWIN: libclang_rt.osx.a // RUN: %clang -fsanitize=fuzzer -fno-sanitize-link-runtime -### %s 2>&1 \ // RUN: --target=arm64e-apple-watchos -fuse-ld=ld \ @@ -482,7 +480,6 @@ // RUN: | %{filecheck} --check-prefix=CHECK-FUZZER-NO-LINK-RUNTIME-DARWIN // // CHECK-FUZZER-NO-LINK-RUNTIME-DARWIN: "{{.*}}ld" -// CHECK-FUZZER-NO-LINK-RUNTIME-DARWIN: libclang_rt.watchos.a // RUN: %clang 
-fsanitize=undefined -### %s 2>&1 \ // RUN: --target=i386-unknown-linux -fuse-ld=ld \ @@ -838,7 +835,6 @@ // CHECK-ASAN-DARWIN106-CXX: "{{.*}}ld" // CHECK-ASAN-DARWIN106-CXX: libclang_rt.asan_osx_dynamic.dylib // CHECK-ASAN-DARWIN106-CXX-NOT: -lc++abi -// CHECK-ASAN-DARWIN106-CXX: libclang_rt.osx.a // RUN: %clangxx -fsanitize=leak -### %s 2>&1 \ // RUN: -mmacos-version-min=10.6 \ @@ -849,7 +845,6 @@ // CHECK-LSAN-DARWIN106-CXX: "{{.*}}ld" // CHECK-LSAN-DARWIN106-CXX: libclang_rt.lsan_osx_dynamic.dylib // CHECK-LSAN-DARWIN106-CXX-NOT: -lc++abi -// CHECK-LSAN-DARWIN106-CXX: libclang_rt.osx.a // RUN: %clang -### %s 2>&1 \ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld -fsanitize=safe-stack \ From 48bf0a9457fd60d0872d9b9b4804a95c833a72e1 Mon Sep 17 00:00:00 2001 From: Amr Hesham Date: Sat, 28 Dec 2024 01:10:24 +0100 Subject: [PATCH 122/567] [Clang][ASTMatcher] Add `dependentScopeDeclRefExpr` matcher (#120996) Fixes https://github.com/llvm/llvm-project/issues/120937 --- clang/docs/LibASTMatchersReference.html | 6 ++++++ clang/docs/ReleaseNotes.rst | 2 ++ clang/include/clang/ASTMatchers/ASTMatchers.h | 10 +++++++++ clang/lib/ASTMatchers/ASTMatchersInternal.cpp | 2 ++ clang/lib/ASTMatchers/Dynamic/Registry.cpp | 1 + clang/unittests/AST/ASTImporterTest.cpp | 3 --- .../ASTMatchers/ASTMatchersNodeTest.cpp | 21 +++++++++++++++---- 7 files changed, 38 insertions(+), 7 deletions(-) diff --git a/clang/docs/LibASTMatchersReference.html b/clang/docs/LibASTMatchersReference.html index f18e9cf134169..ddc99020604c9 100644 --- a/clang/docs/LibASTMatchersReference.html +++ b/clang/docs/LibASTMatchersReference.html @@ -1842,6 +1842,12 @@

Node Matchers

if (x) {} +Matcher<
Stmt>dependentScopeDeclRefExprMatcher<DependentScopeDeclRefExpr>... +
Matches expressions that refer to dependent scope declarations.
+
+Example matches T::v
+   template  class X : T { void f() { T::v; } };
+
Matcher<Stmt>declStmtMatcher<DeclStmt>...
Matches declaration statements.
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 4410b9f99e802..983c1da20ed4c 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -1108,6 +1108,8 @@ AST Matchers
 
 - Ensure ``pointee`` matches Objective-C pointer types.
 
+- Add ``dependentScopeDeclRefExpr`` matcher to match expressions that refer to dependent scope declarations.
+
 clang-format
 ------------
 
diff --git a/clang/include/clang/ASTMatchers/ASTMatchers.h b/clang/include/clang/ASTMatchers/ASTMatchers.h
index 897aa25dc95cc..22e2546ab81e0 100644
--- a/clang/include/clang/ASTMatchers/ASTMatchers.h
+++ b/clang/include/clang/ASTMatchers/ASTMatchers.h
@@ -2125,6 +2125,16 @@ extern const internal::VariadicDynCastAllOfMatcher expr;
 extern const internal::VariadicDynCastAllOfMatcher
     declRefExpr;
 
+/// Matches expressions that refer to dependent scope declarations.
+///
+/// example matches T::v;
+/// \code
+///  template  class X : T { void f() { T::v; } };
+/// \endcode
+extern const internal::VariadicDynCastAllOfMatcher
+    dependentScopeDeclRefExpr;
+
 /// Matches a reference to an ObjCIvar.
 ///
 /// Example: matches "a" in "init" method:
diff --git a/clang/lib/ASTMatchers/ASTMatchersInternal.cpp b/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
index bf9dc5f2373f9..8c744eebbdfb5 100644
--- a/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
+++ b/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
@@ -924,6 +924,8 @@ const internal::VariadicDynCastAllOfMatcher
 const internal::VariadicDynCastAllOfMatcher cxxFoldExpr;
 const internal::VariadicDynCastAllOfMatcher expr;
 const internal::VariadicDynCastAllOfMatcher declRefExpr;
+const internal::VariadicDynCastAllOfMatcher
+    dependentScopeDeclRefExpr;
 const internal::VariadicDynCastAllOfMatcher objcIvarRefExpr;
 const internal::VariadicDynCastAllOfMatcher blockExpr;
 const internal::VariadicDynCastAllOfMatcher ifStmt;
diff --git a/clang/lib/ASTMatchers/Dynamic/Registry.cpp b/clang/lib/ASTMatchers/Dynamic/Registry.cpp
index 837633fb2f060..685d626d2978b 100644
--- a/clang/lib/ASTMatchers/Dynamic/Registry.cpp
+++ b/clang/lib/ASTMatchers/Dynamic/Registry.cpp
@@ -222,6 +222,7 @@ RegistryMaps::RegistryMaps() {
   REGISTER_MATCHER(decompositionDecl);
   REGISTER_MATCHER(declCountIs);
   REGISTER_MATCHER(declRefExpr);
+  REGISTER_MATCHER(dependentScopeDeclRefExpr);
   REGISTER_MATCHER(declStmt);
   REGISTER_MATCHER(declaratorDecl);
   REGISTER_MATCHER(decltypeType);
diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp
index f3f314b723dfc..ec062a5cc953b 100644
--- a/clang/unittests/AST/ASTImporterTest.cpp
+++ b/clang/unittests/AST/ASTImporterTest.cpp
@@ -3172,9 +3172,6 @@ TEST_P(ImportDecl, ImportFieldOrder) {
              recordDecl(hasFieldOrder({"b", "a"})));
 }
 
-const internal::VariadicDynCastAllOfMatcher
-    dependentScopeDeclRefExpr;
-
 TEST_P(ImportExpr, DependentScopeDeclRefExpr) {
   MatchVerifier Verifier;
   testImport("template  struct S { static T foo; };"
diff --git a/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp b/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp
index 9bc287e07224a..a3baad367a27b 100644
--- a/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp
+++ b/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp
@@ -556,6 +556,21 @@ TEST_P(ASTMatchersTest, DeclRefExpr) {
                          Reference));
 }
 
+TEST_P(ASTMatchersTest, DependentScopeDeclRefExpr) {
+  if (!GetParam().isCXX() || GetParam().hasDelayedTemplateParsing()) {
+    // FIXME: Fix this test to work with delayed template parsing.
+    return;
+  }
+
+  EXPECT_TRUE(matches("template  class X : T { void f() { T::v; } };",
+                      dependentScopeDeclRefExpr()));
+
+  EXPECT_TRUE(
+      matches("template  struct S { static T Foo; };"
+              "template  void declToImport() { (void)S::Foo; }",
+              dependentScopeDeclRefExpr()));
+}
+
 TEST_P(ASTMatchersTest, CXXMemberCallExpr) {
   if (!GetParam().isCXX()) {
     return;
@@ -629,10 +644,8 @@ TEST_P(ASTMatchersTest, MemberExpr_MatchesVariable) {
   EXPECT_TRUE(matches("template "
                       "class X : T { void f() { this->T::v; } };",
                       cxxDependentScopeMemberExpr()));
-  // FIXME: Add a matcher for DependentScopeDeclRefExpr.
-  EXPECT_TRUE(
-      notMatches("template  class X : T { void f() { T::v; } };",
-                 cxxDependentScopeMemberExpr()));
+  EXPECT_TRUE(matches("template  class X : T { void f() { T::v; } };",
+                      dependentScopeDeclRefExpr()));
   EXPECT_TRUE(matches("template  void x() { T t; t.v; }",
                       cxxDependentScopeMemberExpr()));
 }

From 092966a44d1793e6bd53c191efeb5a5425fecde3 Mon Sep 17 00:00:00 2001
From: Vitaly Buka 
Date: Fri, 27 Dec 2024 19:32:06 -0800
Subject: [PATCH 123/567] [Driver] Make regex stricter (#121243)

Follow up to #121221.
---
 clang/test/Driver/sanitizer-ld.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/clang/test/Driver/sanitizer-ld.c b/clang/test/Driver/sanitizer-ld.c
index 4e4cfbae27e11..17766cef86d2a 100644
--- a/clang/test/Driver/sanitizer-ld.c
+++ b/clang/test/Driver/sanitizer-ld.c
@@ -1,9 +1,13 @@
 // Test sanitizers ld flags.
 
-// Match all libclang_rt, excluding platform-inconsistent libs, like
-// libclang_rt.builtins, libclang_rt.osx etc.
-
-// DEFINE: %{filecheck} = FileCheck %s --implicit-check-not="libclang_rt.{{([^.]+san|scudo|cfi|safestack|stats|fuzzer|undefined)}}"
+// Match all sanitizer related libclang_rt, we are not interested in
+// libclang_rt.builtins, libclang_rt.osx, libclang_rt.ios, libclang_rt.watchos
+// etc.
+//
+// If we need to add sanitizer with name starting with excluded laters, e.g.
+// `bsan`, we can extend expression like this: `([^iow]|b[^u])`.
+//
+// DEFINE: %{filecheck} = FileCheck %s --implicit-check-not="libclang_rt.{{([^biow])}}"
 
 // RUN: %clang -### %s 2>&1 \
 // RUN:     --target=i386-unknown-linux -fuse-ld=ld -fsanitize=address \

From edc42b2dc18c1bf413fc393eeb05abd6076d8bbe Mon Sep 17 00:00:00 2001
From: Fangrui Song 
Date: Fri, 27 Dec 2024 21:01:08 -0800
Subject: [PATCH 124/567] [SLP] Migrate away from PointerUnion::get

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 32 +++++++++----------
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 7f4c3d44b0ec4..f52ddfda5e64c 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -10776,23 +10776,21 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
     }
     if (ForExtracts) {
       // No need to add vectors here, already handled them in adjustExtracts.
-      assert(
-          InVectors.size() == 1 && isa(InVectors.front()) &&
-          !CommonMask.empty() &&
-          all_of(enumerate(CommonMask),
-                 [&](auto P) {
-                   Value *Scalar =
-                       InVectors.front().get()->getOrdered(
-                           P.index());
-                   if (P.value() == PoisonMaskElem)
-                     return P.value() == Mask[P.index()] ||
-                            isa(Scalar);
-                   if (isa(V1))
-                     return true;
-                   auto *EI = cast(Scalar);
-                   return EI->getVectorOperand() == V1;
-                 }) &&
-          "Expected only tree entry for extractelement vectors.");
+      assert(InVectors.size() == 1 && isa(InVectors[0]) &&
+             !CommonMask.empty() &&
+             all_of(enumerate(CommonMask),
+                    [&](auto P) {
+                      Value *Scalar = cast(InVectors[0])
+                                          ->getOrdered(P.index());
+                      if (P.value() == PoisonMaskElem)
+                        return P.value() == Mask[P.index()] ||
+                               isa(Scalar);
+                      if (isa(V1))
+                        return true;
+                      auto *EI = cast(Scalar);
+                      return EI->getVectorOperand() == V1;
+                    }) &&
+             "Expected only tree entry for extractelement vectors.");
       return;
     }
     assert(!InVectors.empty() && !CommonMask.empty() &&

From 39e8953f892a51816aa1fde70829a61d7e756a51 Mon Sep 17 00:00:00 2001
From: Timm Baeder 
Date: Sat, 28 Dec 2024 07:43:15 +0100
Subject: [PATCH 125/567] [clang][bytecode] Move a local variable to a later
 point (#121250)

We don't need `E` before.
---
 clang/lib/AST/ByteCode/Pointer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/AST/ByteCode/Pointer.cpp b/clang/lib/AST/ByteCode/Pointer.cpp
index 54484853fcdae..01e642310aad3 100644
--- a/clang/lib/AST/ByteCode/Pointer.cpp
+++ b/clang/lib/AST/ByteCode/Pointer.cpp
@@ -476,10 +476,10 @@ bool Pointer::pointsToLiteral() const {
   if (isZero() || !isBlockPointer())
     return false;
 
-  const Expr *E = block()->getDescriptor()->asExpr();
   if (block()->isDynamic())
     return false;
 
+  const Expr *E = block()->getDescriptor()->asExpr();
   return E && !isa(E);
 }
 

From 5bec2b71b44ddff44aa4d8534b58a5561389bb1d Mon Sep 17 00:00:00 2001
From: 4m4n-x-B4w4ne <125849251+4m4n-x-B4w4ne@users.noreply.github.com>
Date: Sat, 28 Dec 2024 13:05:30 +0530
Subject: [PATCH 126/567] Added options to readability-implicit-bool-conversion
  (#120087)

As given in issue #36323, I added two new options in the
clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp
and header file.
I have also written new test cases to test these new options in the
test/readability directory.
---
 .../ImplicitBoolConversionCheck.cpp           | 145 ++++++++++--------
 .../readability/ImplicitBoolConversionCheck.h |   2 +
 clang-tools-extra/docs/ReleaseNotes.rst       |   9 +-
 .../readability/implicit-bool-conversion.rst  |  38 +++++
 .../implicit-bool-conversion-check.cpp        |  57 +++++++
 5 files changed, 183 insertions(+), 68 deletions(-)
 create mode 100644 clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion-check.cpp

diff --git a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp
index f9fd1d903e231..48851da143068 100644
--- a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp
@@ -259,13 +259,17 @@ ImplicitBoolConversionCheck::ImplicitBoolConversionCheck(
       AllowIntegerConditions(Options.get("AllowIntegerConditions", false)),
       AllowPointerConditions(Options.get("AllowPointerConditions", false)),
       UseUpperCaseLiteralSuffix(
-          Options.get("UseUpperCaseLiteralSuffix", false)) {}
+          Options.get("UseUpperCaseLiteralSuffix", false)),
+      CheckConversionsToBool(Options.get("CheckConversionsToBool", true)),
+      CheckConversionsFromBool(Options.get("CheckConversionsFromBool", true)) {}
 
 void ImplicitBoolConversionCheck::storeOptions(
     ClangTidyOptions::OptionMap &Opts) {
   Options.store(Opts, "AllowIntegerConditions", AllowIntegerConditions);
   Options.store(Opts, "AllowPointerConditions", AllowPointerConditions);
   Options.store(Opts, "UseUpperCaseLiteralSuffix", UseUpperCaseLiteralSuffix);
+  Options.store(Opts, "CheckConversionsToBool", CheckConversionsToBool);
+  Options.store(Opts, "CheckConversionsFromBool", CheckConversionsFromBool);
 }
 
 void ImplicitBoolConversionCheck::registerMatchers(MatchFinder *Finder) {
@@ -277,6 +281,7 @@ void ImplicitBoolConversionCheck::registerMatchers(MatchFinder *Finder) {
                  expr(hasType(qualType().bind("type")),
                       hasParent(initListExpr(hasParent(explicitCastExpr(
                           hasType(qualType(equalsBoundNode("type"))))))))));
+
   auto ImplicitCastFromBool = implicitCastExpr(
       anyOf(hasCastKind(CK_IntegralCast), hasCastKind(CK_IntegralToFloating),
             // Prior to C++11 cast from bool literal to pointer was allowed.
@@ -287,72 +292,84 @@ void ImplicitBoolConversionCheck::registerMatchers(MatchFinder *Finder) {
   auto BoolXor =
       binaryOperator(hasOperatorName("^"), hasLHS(ImplicitCastFromBool),
                      hasRHS(ImplicitCastFromBool));
-  auto ComparisonInCall = allOf(
-      hasParent(callExpr()),
-      hasSourceExpression(binaryOperator(hasAnyOperatorName("==", "!="))));
-
   auto IsInCompilerGeneratedFunction = hasAncestor(namedDecl(anyOf(
       isImplicit(), functionDecl(isDefaulted()), functionTemplateDecl())));
 
-  Finder->addMatcher(
-      traverse(TK_AsIs,
-               implicitCastExpr(
-                   anyOf(hasCastKind(CK_IntegralToBoolean),
-                         hasCastKind(CK_FloatingToBoolean),
-                         hasCastKind(CK_PointerToBoolean),
-                         hasCastKind(CK_MemberPointerToBoolean)),
-                   // Exclude cases of C23 comparison result.
-                   unless(allOf(isC23(),
-                                hasSourceExpression(ignoringParens(
-                                    binaryOperator(hasAnyOperatorName(
-                                        ">", ">=", "==", "!=", "<", "<=")))))),
-                   // Exclude case of using if or while statements with variable
-                   // declaration, e.g.:
-                   //   if (int var = functionCall()) {}
-                   unless(hasParent(
-                       stmt(anyOf(ifStmt(), whileStmt()), has(declStmt())))),
-                   // Exclude cases common to implicit cast to and from bool.
-                   unless(ExceptionCases), unless(has(BoolXor)),
-                   // Exclude C23 cases common to implicit cast to bool.
-                   unless(ComparisonInCall),
-                   // Retrieve also parent statement, to check if we need
-                   // additional parens in replacement.
-                   optionally(hasParent(stmt().bind("parentStmt"))),
-                   unless(isInTemplateInstantiation()),
-                   unless(IsInCompilerGeneratedFunction))
-                   .bind("implicitCastToBool")),
-      this);
-
-  auto BoolComparison = binaryOperator(hasAnyOperatorName("==", "!="),
-                                       hasLHS(ImplicitCastFromBool),
-                                       hasRHS(ImplicitCastFromBool));
-  auto BoolOpAssignment = binaryOperator(hasAnyOperatorName("|=", "&="),
-                                         hasLHS(expr(hasType(booleanType()))));
-  auto BitfieldAssignment = binaryOperator(
-      hasLHS(memberExpr(hasDeclaration(fieldDecl(hasBitWidth(1))))));
-  auto BitfieldConstruct = cxxConstructorDecl(hasDescendant(cxxCtorInitializer(
-      withInitializer(equalsBoundNode("implicitCastFromBool")),
-      forField(hasBitWidth(1)))));
-  Finder->addMatcher(
-      traverse(
-          TK_AsIs,
-          implicitCastExpr(
-              ImplicitCastFromBool, unless(ExceptionCases),
-              // Exclude comparisons of bools, as they are always cast to
-              // integers in such context:
-              //   bool_expr_a == bool_expr_b
-              //   bool_expr_a != bool_expr_b
-              unless(hasParent(
-                  binaryOperator(anyOf(BoolComparison, BoolXor,
-                                       BoolOpAssignment, BitfieldAssignment)))),
-              implicitCastExpr().bind("implicitCastFromBool"),
-              unless(hasParent(BitfieldConstruct)),
-              // Check also for nested casts, for example: bool -> int -> float.
-              anyOf(hasParent(implicitCastExpr().bind("furtherImplicitCast")),
-                    anything()),
-              unless(isInTemplateInstantiation()),
-              unless(IsInCompilerGeneratedFunction))),
-      this);
+  if (CheckConversionsToBool) {
+    auto ComparisonInCall = allOf(
+        hasParent(callExpr()),
+        hasSourceExpression(binaryOperator(hasAnyOperatorName("==", "!="))));
+
+    Finder->addMatcher(
+        traverse(
+            TK_AsIs,
+            implicitCastExpr(
+                anyOf(hasCastKind(CK_IntegralToBoolean),
+                      hasCastKind(CK_FloatingToBoolean),
+                      hasCastKind(CK_PointerToBoolean),
+                      hasCastKind(CK_MemberPointerToBoolean)),
+                // Exclude cases of C23 comparison result.
+                unless(allOf(isC23(),
+                             hasSourceExpression(ignoringParens(
+                                 binaryOperator(hasAnyOperatorName(
+                                     ">", ">=", "==", "!=", "<", "<=")))))),
+                // Exclude case of using if or while statements with variable
+                // declaration, e.g.:
+                //   if (int var = functionCall()) {}
+                unless(hasParent(
+                    stmt(anyOf(ifStmt(), whileStmt()), has(declStmt())))),
+                // Exclude cases common to implicit cast to and from bool.
+                unless(ExceptionCases), unless(has(BoolXor)),
+                // Exclude C23 cases common to implicit cast to bool.
+                unless(ComparisonInCall),
+                // Retrieve also parent statement, to check if we need
+                // additional parens in replacement.
+                optionally(hasParent(stmt().bind("parentStmt"))),
+                unless(isInTemplateInstantiation()),
+                unless(IsInCompilerGeneratedFunction))
+                .bind("implicitCastToBool")),
+        this);
+  }
+
+  if (CheckConversionsFromBool) {
+
+    auto BoolComparison = binaryOperator(hasAnyOperatorName("==", "!="),
+                                         hasLHS(ImplicitCastFromBool),
+                                         hasRHS(ImplicitCastFromBool));
+
+    auto BoolOpAssignment = binaryOperator(
+        hasAnyOperatorName("|=", "&="), hasLHS(expr(hasType(booleanType()))));
+
+    auto BitfieldAssignment = binaryOperator(
+        hasLHS(memberExpr(hasDeclaration(fieldDecl(hasBitWidth(1))))));
+
+    auto BitfieldConstruct =
+        cxxConstructorDecl(hasDescendant(cxxCtorInitializer(
+            withInitializer(equalsBoundNode("implicitCastFromBool")),
+            forField(hasBitWidth(1)))));
+
+    Finder->addMatcher(
+        traverse(
+            TK_AsIs,
+            implicitCastExpr(
+                ImplicitCastFromBool, unless(ExceptionCases),
+                // Exclude comparisons of bools, as they are
+                // always cast to integers in such context:
+                //   bool_expr_a == bool_expr_b
+                //   bool_expr_a != bool_expr_b
+                unless(hasParent(binaryOperator(anyOf(BoolComparison, BoolXor,
+                                                      BoolOpAssignment,
+                                                      BitfieldAssignment)))),
+                implicitCastExpr().bind("implicitCastFromBool"),
+                unless(hasParent(BitfieldConstruct)),
+                // Check also for nested casts, for example:
+                // bool -> int -> float.
+                anyOf(hasParent(implicitCastExpr().bind("furtherImplicitCast")),
+                      anything()),
+                unless(isInTemplateInstantiation()),
+                unless(IsInCompilerGeneratedFunction))),
+        this);
+  }
 }
 
 void ImplicitBoolConversionCheck::check(
diff --git a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.h b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.h
index 5947f7316e67c..b0c3c2943e649 100644
--- a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.h
+++ b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.h
@@ -37,6 +37,8 @@ class ImplicitBoolConversionCheck : public ClangTidyCheck {
   const bool AllowIntegerConditions;
   const bool AllowPointerConditions;
   const bool UseUpperCaseLiteralSuffix;
+  const bool CheckConversionsToBool;
+  const bool CheckConversionsFromBool;
 };
 
 } // namespace clang::tidy::readability
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index fabd0cc78ac64..8c360222ce43d 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -342,10 +342,11 @@ Changes in existing checks
   diagnostic.
 
 - Improved :doc:`readability-implicit-bool-conversion
-  ` check
-  by adding the option `UseUpperCaseLiteralSuffix` to select the
-  case of the literal suffix in fixes and fixing false positive for implicit
-  conversion of comparison result in C23.
+  ` check by adding the
+  option `UseUpperCaseLiteralSuffix` to select the case of the literal suffix in 
+  fixes and fixing false positive for implicit conversion of comparison result in 
+  C23, and by adding the option `CheckConversionsToBool` or 
+  `CheckConversionsFromBool` to configure checks for conversions involving ``bool``.
 
 - Improved :doc:`readability-redundant-smartptr-get
   ` check to
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/implicit-bool-conversion.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/implicit-bool-conversion.rst
index 88cff387f4c16..f7c15ffa2da51 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/readability/implicit-bool-conversion.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/readability/implicit-bool-conversion.rst
@@ -147,3 +147,41 @@ Options
       if (foo) {}
       // ^ propose replacement default: if (foo != 0u) {}
       // ^ propose replacement with option `UseUpperCaseLiteralSuffix`: if (foo != 0U) {}
+
+.. option:: CheckConversionsToBool
+
+   When `true`, the check diagnoses implicit conversions to ``bool``.
+   Default is `true`.
+
+   Example
+
+   .. code-block:: c++
+
+      int x = 42;
+      if (x) {}
+      // ^ propose replacement: if (x != 0) {}
+
+      float f = 3.14;
+      if (f) {}
+      // ^ propose replacement: if (f != 0.0f) {}
+
+.. option:: CheckConversionsFromBool
+
+   When `true`, the check diagnoses implicit conversions from ``bool``.
+   Default is `true`.
+
+   Example
+
+   .. code-block:: c++
+
+      bool b = true;
+
+      int x = b;
+      // ^ propose replacement: int x = b ? 1 : 0;
+
+      float f = b;
+      // ^ propose replacement: float f = b ? 1.0f : 0.0f;
+
+      int* p = b;
+      // ^ propose replacement: int* p = b ? some_ptr : nullptr;
+
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion-check.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion-check.cpp
new file mode 100644
index 0000000000000..8ba4635de1704
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion-check.cpp
@@ -0,0 +1,57 @@
+// RUN: %check_clang_tidy -check-suffix=FROM %s readability-implicit-bool-conversion %t -- \
+// RUN:     -config='{CheckOptions: { \
+// RUN:         readability-implicit-bool-conversion.CheckConversionsToBool: false, \
+// RUN:         readability-implicit-bool-conversion.CheckConversionsFromBool: true \
+// RUN:     }}' -- -std=c23
+// RUN: %check_clang_tidy -check-suffix=TO %s readability-implicit-bool-conversion %t -- \
+// RUN:     -config='{CheckOptions: { \
+// RUN:         readability-implicit-bool-conversion.CheckConversionsToBool: true, \
+// RUN:         readability-implicit-bool-conversion.CheckConversionsFromBool: false \
+// RUN:     }}' -- -std=c23
+// RUN: %check_clang_tidy -check-suffix=NORMAL %s readability-implicit-bool-conversion %t -- \
+// RUN:     -config='{CheckOptions: { \
+// RUN:         readability-implicit-bool-conversion.CheckConversionsToBool: false, \
+// RUN:         readability-implicit-bool-conversion.CheckConversionsFromBool: false \
+// RUN:     }}' -- -std=c23
+// RUN: %check_clang_tidy -check-suffix=TO,FROM %s readability-implicit-bool-conversion %t -- \
+// RUN:     -config='{CheckOptions: { \
+// RUN:         readability-implicit-bool-conversion.CheckConversionsToBool: true, \
+// RUN:         readability-implicit-bool-conversion.CheckConversionsFromBool: true \
+// RUN:     }}' -- -std=c23
+
+// Test various implicit bool conversions in different contexts
+void TestImplicitBoolConversion() {
+    // Basic type conversions to bool
+    int intValue = 42;
+    if (intValue) // CHECK-MESSAGES-TO: :[[@LINE]]:9: warning: implicit conversion 'int' -> 'bool' [readability-implicit-bool-conversion]
+                  // CHECK-FIXES-TO: if (intValue != 0)
+        (void)0;
+
+    float floatValue = 3.14f;
+    while (floatValue) // CHECK-MESSAGES-TO: :[[@LINE]]:12: warning: implicit conversion 'float' -> 'bool' [readability-implicit-bool-conversion]
+                       // CHECK-FIXES-TO: while (floatValue != 0.0f)
+        break;
+
+    char charValue = 'a';
+    do {
+        break;
+    } while (charValue); // CHECK-MESSAGES-TO: :[[@LINE]]:14: warning: implicit conversion 'char' -> 'bool' [readability-implicit-bool-conversion]
+                         // CHECK-FIXES-TO: } while (charValue != 0);
+
+    // Pointer conversions to bool
+    int* ptrValue = &intValue;
+    if (ptrValue) // CHECK-MESSAGES-TO: :[[@LINE]]:9: warning: implicit conversion 'int *' -> 'bool' [readability-implicit-bool-conversion]
+                  // CHECK-FIXES-TO: if (ptrValue != nullptr)
+        (void)0;
+
+    // Conversions from bool to other types
+    bool boolValue = true;
+    int intFromBool = boolValue; // CHECK-MESSAGES-FROM: :[[@LINE]]:23: warning: implicit conversion 'bool' -> 'int' [readability-implicit-bool-conversion]
+                                 // CHECK-FIXES-FROM: int intFromBool = static_cast(boolValue);
+                                 
+    float floatFromBool = boolValue; // CHECK-MESSAGES-FROM: :[[@LINE]]:27: warning: implicit conversion 'bool' -> 'float' [readability-implicit-bool-conversion]
+                                     // CHECK-FIXES-FROM: float floatFromBool = static_cast(boolValue);
+
+    char charFromBool = boolValue; // CHECK-MESSAGES-FROM: :[[@LINE]]:25: warning: implicit conversion 'bool' -> 'char' [readability-implicit-bool-conversion]
+                                   // CHECK-FIXES-FROM: char charFromBool = static_cast(boolValue);
+}

From 537d4e9d21be1f5e40a780f570663b04572765af Mon Sep 17 00:00:00 2001
From: NAKAMURA Takumi 
Date: Sat, 28 Dec 2024 17:47:00 +0900
Subject: [PATCH 127/567] Revert "Added options to
 readability-implicit-bool-conversion  (#120087)"

This reverts commit 5bec2b71b44ddff44aa4d8534b58a5561389bb1d.
(llvmorg-20-init-16425-g5bec2b71b44d)

This broke tests.
---
 .../ImplicitBoolConversionCheck.cpp           | 145 ++++++++----------
 .../readability/ImplicitBoolConversionCheck.h |   2 -
 clang-tools-extra/docs/ReleaseNotes.rst       |   9 +-
 .../readability/implicit-bool-conversion.rst  |  38 -----
 .../implicit-bool-conversion-check.cpp        |  57 -------
 5 files changed, 68 insertions(+), 183 deletions(-)
 delete mode 100644 clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion-check.cpp

diff --git a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp
index 48851da143068..f9fd1d903e231 100644
--- a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp
@@ -259,17 +259,13 @@ ImplicitBoolConversionCheck::ImplicitBoolConversionCheck(
       AllowIntegerConditions(Options.get("AllowIntegerConditions", false)),
       AllowPointerConditions(Options.get("AllowPointerConditions", false)),
       UseUpperCaseLiteralSuffix(
-          Options.get("UseUpperCaseLiteralSuffix", false)),
-      CheckConversionsToBool(Options.get("CheckConversionsToBool", true)),
-      CheckConversionsFromBool(Options.get("CheckConversionsFromBool", true)) {}
+          Options.get("UseUpperCaseLiteralSuffix", false)) {}
 
 void ImplicitBoolConversionCheck::storeOptions(
     ClangTidyOptions::OptionMap &Opts) {
   Options.store(Opts, "AllowIntegerConditions", AllowIntegerConditions);
   Options.store(Opts, "AllowPointerConditions", AllowPointerConditions);
   Options.store(Opts, "UseUpperCaseLiteralSuffix", UseUpperCaseLiteralSuffix);
-  Options.store(Opts, "CheckConversionsToBool", CheckConversionsToBool);
-  Options.store(Opts, "CheckConversionsFromBool", CheckConversionsFromBool);
 }
 
 void ImplicitBoolConversionCheck::registerMatchers(MatchFinder *Finder) {
@@ -281,7 +277,6 @@ void ImplicitBoolConversionCheck::registerMatchers(MatchFinder *Finder) {
                  expr(hasType(qualType().bind("type")),
                       hasParent(initListExpr(hasParent(explicitCastExpr(
                           hasType(qualType(equalsBoundNode("type"))))))))));
-
   auto ImplicitCastFromBool = implicitCastExpr(
       anyOf(hasCastKind(CK_IntegralCast), hasCastKind(CK_IntegralToFloating),
             // Prior to C++11 cast from bool literal to pointer was allowed.
@@ -292,84 +287,72 @@ void ImplicitBoolConversionCheck::registerMatchers(MatchFinder *Finder) {
   auto BoolXor =
       binaryOperator(hasOperatorName("^"), hasLHS(ImplicitCastFromBool),
                      hasRHS(ImplicitCastFromBool));
+  auto ComparisonInCall = allOf(
+      hasParent(callExpr()),
+      hasSourceExpression(binaryOperator(hasAnyOperatorName("==", "!="))));
+
   auto IsInCompilerGeneratedFunction = hasAncestor(namedDecl(anyOf(
       isImplicit(), functionDecl(isDefaulted()), functionTemplateDecl())));
 
-  if (CheckConversionsToBool) {
-    auto ComparisonInCall = allOf(
-        hasParent(callExpr()),
-        hasSourceExpression(binaryOperator(hasAnyOperatorName("==", "!="))));
-
-    Finder->addMatcher(
-        traverse(
-            TK_AsIs,
-            implicitCastExpr(
-                anyOf(hasCastKind(CK_IntegralToBoolean),
-                      hasCastKind(CK_FloatingToBoolean),
-                      hasCastKind(CK_PointerToBoolean),
-                      hasCastKind(CK_MemberPointerToBoolean)),
-                // Exclude cases of C23 comparison result.
-                unless(allOf(isC23(),
-                             hasSourceExpression(ignoringParens(
-                                 binaryOperator(hasAnyOperatorName(
-                                     ">", ">=", "==", "!=", "<", "<=")))))),
-                // Exclude case of using if or while statements with variable
-                // declaration, e.g.:
-                //   if (int var = functionCall()) {}
-                unless(hasParent(
-                    stmt(anyOf(ifStmt(), whileStmt()), has(declStmt())))),
-                // Exclude cases common to implicit cast to and from bool.
-                unless(ExceptionCases), unless(has(BoolXor)),
-                // Exclude C23 cases common to implicit cast to bool.
-                unless(ComparisonInCall),
-                // Retrieve also parent statement, to check if we need
-                // additional parens in replacement.
-                optionally(hasParent(stmt().bind("parentStmt"))),
-                unless(isInTemplateInstantiation()),
-                unless(IsInCompilerGeneratedFunction))
-                .bind("implicitCastToBool")),
-        this);
-  }
-
-  if (CheckConversionsFromBool) {
-
-    auto BoolComparison = binaryOperator(hasAnyOperatorName("==", "!="),
-                                         hasLHS(ImplicitCastFromBool),
-                                         hasRHS(ImplicitCastFromBool));
-
-    auto BoolOpAssignment = binaryOperator(
-        hasAnyOperatorName("|=", "&="), hasLHS(expr(hasType(booleanType()))));
-
-    auto BitfieldAssignment = binaryOperator(
-        hasLHS(memberExpr(hasDeclaration(fieldDecl(hasBitWidth(1))))));
-
-    auto BitfieldConstruct =
-        cxxConstructorDecl(hasDescendant(cxxCtorInitializer(
-            withInitializer(equalsBoundNode("implicitCastFromBool")),
-            forField(hasBitWidth(1)))));
-
-    Finder->addMatcher(
-        traverse(
-            TK_AsIs,
-            implicitCastExpr(
-                ImplicitCastFromBool, unless(ExceptionCases),
-                // Exclude comparisons of bools, as they are
-                // always cast to integers in such context:
-                //   bool_expr_a == bool_expr_b
-                //   bool_expr_a != bool_expr_b
-                unless(hasParent(binaryOperator(anyOf(BoolComparison, BoolXor,
-                                                      BoolOpAssignment,
-                                                      BitfieldAssignment)))),
-                implicitCastExpr().bind("implicitCastFromBool"),
-                unless(hasParent(BitfieldConstruct)),
-                // Check also for nested casts, for example:
-                // bool -> int -> float.
-                anyOf(hasParent(implicitCastExpr().bind("furtherImplicitCast")),
-                      anything()),
-                unless(isInTemplateInstantiation()),
-                unless(IsInCompilerGeneratedFunction))),
-        this);
-  }
+  Finder->addMatcher(
+      traverse(TK_AsIs,
+               implicitCastExpr(
+                   anyOf(hasCastKind(CK_IntegralToBoolean),
+                         hasCastKind(CK_FloatingToBoolean),
+                         hasCastKind(CK_PointerToBoolean),
+                         hasCastKind(CK_MemberPointerToBoolean)),
+                   // Exclude cases of C23 comparison result.
+                   unless(allOf(isC23(),
+                                hasSourceExpression(ignoringParens(
+                                    binaryOperator(hasAnyOperatorName(
+                                        ">", ">=", "==", "!=", "<", "<=")))))),
+                   // Exclude case of using if or while statements with variable
+                   // declaration, e.g.:
+                   //   if (int var = functionCall()) {}
+                   unless(hasParent(
+                       stmt(anyOf(ifStmt(), whileStmt()), has(declStmt())))),
+                   // Exclude cases common to implicit cast to and from bool.
+                   unless(ExceptionCases), unless(has(BoolXor)),
+                   // Exclude C23 cases common to implicit cast to bool.
+                   unless(ComparisonInCall),
+                   // Retrieve also parent statement, to check if we need
+                   // additional parens in replacement.
+                   optionally(hasParent(stmt().bind("parentStmt"))),
+                   unless(isInTemplateInstantiation()),
+                   unless(IsInCompilerGeneratedFunction))
+                   .bind("implicitCastToBool")),
+      this);
+
+  auto BoolComparison = binaryOperator(hasAnyOperatorName("==", "!="),
+                                       hasLHS(ImplicitCastFromBool),
+                                       hasRHS(ImplicitCastFromBool));
+  auto BoolOpAssignment = binaryOperator(hasAnyOperatorName("|=", "&="),
+                                         hasLHS(expr(hasType(booleanType()))));
+  auto BitfieldAssignment = binaryOperator(
+      hasLHS(memberExpr(hasDeclaration(fieldDecl(hasBitWidth(1))))));
+  auto BitfieldConstruct = cxxConstructorDecl(hasDescendant(cxxCtorInitializer(
+      withInitializer(equalsBoundNode("implicitCastFromBool")),
+      forField(hasBitWidth(1)))));
+  Finder->addMatcher(
+      traverse(
+          TK_AsIs,
+          implicitCastExpr(
+              ImplicitCastFromBool, unless(ExceptionCases),
+              // Exclude comparisons of bools, as they are always cast to
+              // integers in such context:
+              //   bool_expr_a == bool_expr_b
+              //   bool_expr_a != bool_expr_b
+              unless(hasParent(
+                  binaryOperator(anyOf(BoolComparison, BoolXor,
+                                       BoolOpAssignment, BitfieldAssignment)))),
+              implicitCastExpr().bind("implicitCastFromBool"),
+              unless(hasParent(BitfieldConstruct)),
+              // Check also for nested casts, for example: bool -> int -> float.
+              anyOf(hasParent(implicitCastExpr().bind("furtherImplicitCast")),
+                    anything()),
+              unless(isInTemplateInstantiation()),
+              unless(IsInCompilerGeneratedFunction))),
+      this);
 }
 
 void ImplicitBoolConversionCheck::check(
diff --git a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.h b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.h
index b0c3c2943e649..5947f7316e67c 100644
--- a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.h
+++ b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.h
@@ -37,8 +37,6 @@ class ImplicitBoolConversionCheck : public ClangTidyCheck {
   const bool AllowIntegerConditions;
   const bool AllowPointerConditions;
   const bool UseUpperCaseLiteralSuffix;
-  const bool CheckConversionsToBool;
-  const bool CheckConversionsFromBool;
 };
 
 } // namespace clang::tidy::readability
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 8c360222ce43d..fabd0cc78ac64 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -342,11 +342,10 @@ Changes in existing checks
   diagnostic.
 
 - Improved :doc:`readability-implicit-bool-conversion
-  ` check by adding the
-  option `UseUpperCaseLiteralSuffix` to select the case of the literal suffix in 
-  fixes and fixing false positive for implicit conversion of comparison result in 
-  C23, and by adding the option `CheckConversionsToBool` or 
-  `CheckConversionsFromBool` to configure checks for conversions involving ``bool``.
+  ` check
+  by adding the option `UseUpperCaseLiteralSuffix` to select the
+  case of the literal suffix in fixes and fixing false positive for implicit
+  conversion of comparison result in C23.
 
 - Improved :doc:`readability-redundant-smartptr-get
   ` check to
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/implicit-bool-conversion.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/implicit-bool-conversion.rst
index f7c15ffa2da51..88cff387f4c16 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/readability/implicit-bool-conversion.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/readability/implicit-bool-conversion.rst
@@ -147,41 +147,3 @@ Options
       if (foo) {}
       // ^ propose replacement default: if (foo != 0u) {}
       // ^ propose replacement with option `UseUpperCaseLiteralSuffix`: if (foo != 0U) {}
-
-.. option:: CheckConversionsToBool
-
-   When `true`, the check diagnoses implicit conversions to ``bool``.
-   Default is `true`.
-
-   Example
-
-   .. code-block:: c++
-
-      int x = 42;
-      if (x) {}
-      // ^ propose replacement: if (x != 0) {}
-
-      float f = 3.14;
-      if (f) {}
-      // ^ propose replacement: if (f != 0.0f) {}
-
-.. option:: CheckConversionsFromBool
-
-   When `true`, the check diagnoses implicit conversions from ``bool``.
-   Default is `true`.
-
-   Example
-
-   .. code-block:: c++
-
-      bool b = true;
-
-      int x = b;
-      // ^ propose replacement: int x = b ? 1 : 0;
-
-      float f = b;
-      // ^ propose replacement: float f = b ? 1.0f : 0.0f;
-
-      int* p = b;
-      // ^ propose replacement: int* p = b ? some_ptr : nullptr;
-
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion-check.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion-check.cpp
deleted file mode 100644
index 8ba4635de1704..0000000000000
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion-check.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-// RUN: %check_clang_tidy -check-suffix=FROM %s readability-implicit-bool-conversion %t -- \
-// RUN:     -config='{CheckOptions: { \
-// RUN:         readability-implicit-bool-conversion.CheckConversionsToBool: false, \
-// RUN:         readability-implicit-bool-conversion.CheckConversionsFromBool: true \
-// RUN:     }}' -- -std=c23
-// RUN: %check_clang_tidy -check-suffix=TO %s readability-implicit-bool-conversion %t -- \
-// RUN:     -config='{CheckOptions: { \
-// RUN:         readability-implicit-bool-conversion.CheckConversionsToBool: true, \
-// RUN:         readability-implicit-bool-conversion.CheckConversionsFromBool: false \
-// RUN:     }}' -- -std=c23
-// RUN: %check_clang_tidy -check-suffix=NORMAL %s readability-implicit-bool-conversion %t -- \
-// RUN:     -config='{CheckOptions: { \
-// RUN:         readability-implicit-bool-conversion.CheckConversionsToBool: false, \
-// RUN:         readability-implicit-bool-conversion.CheckConversionsFromBool: false \
-// RUN:     }}' -- -std=c23
-// RUN: %check_clang_tidy -check-suffix=TO,FROM %s readability-implicit-bool-conversion %t -- \
-// RUN:     -config='{CheckOptions: { \
-// RUN:         readability-implicit-bool-conversion.CheckConversionsToBool: true, \
-// RUN:         readability-implicit-bool-conversion.CheckConversionsFromBool: true \
-// RUN:     }}' -- -std=c23
-
-// Test various implicit bool conversions in different contexts
-void TestImplicitBoolConversion() {
-    // Basic type conversions to bool
-    int intValue = 42;
-    if (intValue) // CHECK-MESSAGES-TO: :[[@LINE]]:9: warning: implicit conversion 'int' -> 'bool' [readability-implicit-bool-conversion]
-                  // CHECK-FIXES-TO: if (intValue != 0)
-        (void)0;
-
-    float floatValue = 3.14f;
-    while (floatValue) // CHECK-MESSAGES-TO: :[[@LINE]]:12: warning: implicit conversion 'float' -> 'bool' [readability-implicit-bool-conversion]
-                       // CHECK-FIXES-TO: while (floatValue != 0.0f)
-        break;
-
-    char charValue = 'a';
-    do {
-        break;
-    } while (charValue); // CHECK-MESSAGES-TO: :[[@LINE]]:14: warning: implicit conversion 'char' -> 'bool' [readability-implicit-bool-conversion]
-                         // CHECK-FIXES-TO: } while (charValue != 0);
-
-    // Pointer conversions to bool
-    int* ptrValue = &intValue;
-    if (ptrValue) // CHECK-MESSAGES-TO: :[[@LINE]]:9: warning: implicit conversion 'int *' -> 'bool' [readability-implicit-bool-conversion]
-                  // CHECK-FIXES-TO: if (ptrValue != nullptr)
-        (void)0;
-
-    // Conversions from bool to other types
-    bool boolValue = true;
-    int intFromBool = boolValue; // CHECK-MESSAGES-FROM: :[[@LINE]]:23: warning: implicit conversion 'bool' -> 'int' [readability-implicit-bool-conversion]
-                                 // CHECK-FIXES-FROM: int intFromBool = static_cast(boolValue);
-                                 
-    float floatFromBool = boolValue; // CHECK-MESSAGES-FROM: :[[@LINE]]:27: warning: implicit conversion 'bool' -> 'float' [readability-implicit-bool-conversion]
-                                     // CHECK-FIXES-FROM: float floatFromBool = static_cast(boolValue);
-
-    char charFromBool = boolValue; // CHECK-MESSAGES-FROM: :[[@LINE]]:25: warning: implicit conversion 'bool' -> 'char' [readability-implicit-bool-conversion]
-                                   // CHECK-FIXES-FROM: char charFromBool = static_cast(boolValue);
-}

From 811e1f4661bca4a2b5c93d30f54c3aa338f175e9 Mon Sep 17 00:00:00 2001
From: NAKAMURA Takumi 
Date: Sat, 28 Dec 2024 17:33:29 +0900
Subject: [PATCH 128/567] clang/test/CoverageMapping/single-byte-counters.cpp:
 Align to the final form by filling in linefeeds.

---
 .../CoverageMapping/single-byte-counters.cpp  | 69 +++++++++++--------
 1 file changed, 41 insertions(+), 28 deletions(-)

diff --git a/clang/test/CoverageMapping/single-byte-counters.cpp b/clang/test/CoverageMapping/single-byte-counters.cpp
index 4c0987eea4b98..f09e13038d900 100644
--- a/clang/test/CoverageMapping/single-byte-counters.cpp
+++ b/clang/test/CoverageMapping/single-byte-counters.cpp
@@ -1,20 +1,22 @@
 // RUN: %clang_cc1 -mllvm -emptyline-comment-coverage=false -mllvm -enable-single-byte-coverage=true -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name single-byte-counters.cpp %s | FileCheck %s
 
 // CHECK: testIf
-int testIf(int x) { // CHECK-NEXT: File 0, [[@LINE]]:19 -> [[@LINE+7]]:2 = [[C00:#0]]
+int testIf(int x) { // CHECK-NEXT: File 0, [[@LINE]]:19 -> [[@LINE+8]]:2 = [[C00:#0]]
   int result = 0;
   if (x == 0)       // CHECK-NEXT: File 0, [[@LINE]]:7 -> [[@LINE]]:13 = [[C00]]
-                    // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:14 -> [[@LINE+1]]:5 = [[C0T:#1]]
+
+                    // CHECK-NEXT: Gap,File 0, [[@LINE-2]]:14 -> [[@LINE+1]]:5 = [[C0T:#1]]
     result = -1;    // CHECK-NEXT: File 0, [[@LINE]]:5 -> [[@LINE]]:16 = [[C0T]]
 
   return result;    // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE]]:16 = [[C0E:#2]]
 }
 
 // CHECK-NEXT: testIfElse
-int testIfElse(int x) { // CHECK-NEXT: File 0, [[@LINE]]:23 -> [[@LINE+8]]:2 = [[C10:#0]]
+int testIfElse(int x) { // CHECK-NEXT: File 0, [[@LINE]]:23 -> [[@LINE+9]]:2 = [[C10:#0]]
   int result = 0;
   if (x < 0)            // CHECK-NEXT: File 0, [[@LINE]]:7 -> [[@LINE]]:12 = [[C10]]
-                        // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:13 -> [[@LINE+1]]:5 = [[C1T:#1]]
+
+                        // CHECK-NEXT: Gap,File 0, [[@LINE-2]]:13 -> [[@LINE+1]]:5 = [[C1T:#1]]
     result = 0;         // CHECK-NEXT: File 0, [[@LINE]]:5 -> [[@LINE]]:15 = [[C1T]]
   else                  // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:16 -> [[@LINE+1]]:5 = [[C1F:#2]]
     result = x * x;     // CHECK-NEXT: File 0, [[@LINE]]:5 -> [[@LINE]]:19 = [[C1F]]
@@ -22,10 +24,11 @@ int testIfElse(int x) { // CHECK-NEXT: File 0, [[@LINE]]:23 -> [[@LINE+8]]:2 = [
 }
 
 // CHECK-NEXT: testIfElseReturn
-int testIfElseReturn(int x) { // CHECK-NEXT: File 0, [[@LINE]]:29 -> [[@LINE+9]]:2 = [[C20:#0]]
+int testIfElseReturn(int x) { // CHECK-NEXT: File 0, [[@LINE]]:29 -> [[@LINE+10]]:2 = [[C20:#0]]
   int result = 0;
   if (x > 0)                  // CHECK-NEXT: File 0, [[@LINE]]:7 -> [[@LINE]]:12 = [[C20]]
-                              // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:13 -> [[@LINE+1]]:5 = [[C2T:#1]]
+
+                              // CHECK-NEXT: Gap,File 0, [[@LINE-2]]:13 -> [[@LINE+1]]:5 = [[C2T:#1]]
     result = x * x;           // CHECK-NEXT: File 0, [[@LINE]]:5 -> [[@LINE]]:19 = [[C2T]]
   else                        // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:20 -> [[@LINE+1]]:5 = [[C2F:#2]]
     return 0;                 // CHECK-NEXT: File 0, [[@LINE]]:5 -> [[@LINE]]:13 = [[C2F]]
@@ -34,10 +37,11 @@ int testIfElseReturn(int x) { // CHECK-NEXT: File 0, [[@LINE]]:29 -> [[@LINE+9]]
 }
 
 // CHECK-NEXT: testIfBothReturn
-int testIfBothReturn(int x) { // CHECK-NEXT: File 0, [[@LINE]]:29 -> [[@LINE+9]]:2 = [[C20:#0]]
+int testIfBothReturn(int x) { // CHECK-NEXT: File 0, [[@LINE]]:29 -> [[@LINE+10]]:2 = [[C20:#0]]
   int result = 0;
   if (x > 0)                  // CHECK-NEXT: File 0, [[@LINE]]:7 -> [[@LINE]]:12 = [[C20]]
-                              // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:13 -> [[@LINE+1]]:5 = [[C2T:#1]]
+
+                              // CHECK-NEXT: Gap,File 0, [[@LINE-2]]:13 -> [[@LINE+1]]:5 = [[C2T:#1]]
     return 42;                // CHECK-NEXT: File 0, [[@LINE]]:5 -> [[@LINE]]:14 = [[C2T]]
   else                        // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:15 -> [[@LINE+1]]:5 = [[C2F:#2]]
     return 0;                 // CHECK-NEXT: File 0, [[@LINE]]:5 -> [[@LINE]]:13 = [[C2F]]
@@ -46,19 +50,22 @@ int testIfBothReturn(int x) { // CHECK-NEXT: File 0, [[@LINE]]:29 -> [[@LINE+9]]
 }
 
 // CHECK-NEXT: testSwitch
-int testSwitch(int x) { // CHECK-NEXT: File 0, [[@LINE]]:23 -> [[@LINE+17]]:2 = [[C30:#0]]
+int testSwitch(int x) { // CHECK-NEXT: File 0, [[@LINE]]:23 -> [[@LINE+20]]:2 = [[C30:#0]]
   int result;
   switch (x) {
-                        // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:14 -> [[@LINE+10]]:15 = 0
-  case 1:               // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+2]]:10 = [[C31:#2]]
+                        // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:14 -> [[@LINE+13]]:15 = 0
+  case 1:               // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+3]]:10 = [[C31:#2]]
+
     result = 1;
     break;
                         // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:11 -> [[@LINE+1]]:3 = 0
-  case 2:               // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+2]]:10 = [[C32:#3]]
+  case 2:               // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+3]]:10 = [[C32:#3]]
+
     result = 2;
     break;
                         // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:11 -> [[@LINE+1]]:3 = 0
-  default:              // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+1]]:15 = [[C3D:#4]]
+  default:              // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+2]]:15 = [[C3D:#4]]
+
     result = 0;
   }
                         // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:4 -> [[@LINE+1]]:3 = [[C3E:#1]]
@@ -66,12 +73,13 @@ int testSwitch(int x) { // CHECK-NEXT: File 0, [[@LINE]]:23 -> [[@LINE+17]]:2 =
 }
 
 // CHECK-NEXT: testWhile
-int testWhile() {       // CHECK-NEXT: File 0, [[@LINE]]:17 -> [[@LINE+11]]:2 = [[C40:#0]]
+int testWhile() {       // CHECK-NEXT: File 0, [[@LINE]]:17 -> [[@LINE+12]]:2 = [[C40:#0]]
   int i = 0;
   int sum = 0;
   while (i < 10) {      // CHECK-NEXT: File 0, [[@LINE]]:10 -> [[@LINE]]:16 = [[C4C:#1]]
-                        // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:17 -> [[@LINE-1]]:18 = [[C4T:#2]]
-                        // CHECK-NEXT: File 0, [[@LINE-2]]:18 -> [[@LINE+3]]:4 = [[C4T]]
+
+                        // CHECK-NEXT: Gap,File 0, [[@LINE-2]]:17 -> [[@LINE-2]]:18 = [[C4T:#2]]
+                        // CHECK-NEXT: File 0, [[@LINE-3]]:18 -> [[@LINE+3]]:4 = [[C4T]]
     sum += i;
     i++;
   }
@@ -80,19 +88,22 @@ int testWhile() {       // CHECK-NEXT: File 0, [[@LINE]]:17 -> [[@LINE+11]]:2 =
 }
 
 // CHECK-NEXT: testContinueBreak
-int testContinueBreak() { // CHECK-NEXT: File 0, [[@LINE]]:25 -> [[@LINE+20]]:2 = #0
+int testContinueBreak() { // CHECK-NEXT: File 0, [[@LINE]]:25 -> [[@LINE+23]]:2 = #0
   int i = 0;
   int sum = 0;
   while (i < 10) {   // CHECK-NEXT: File 0, [[@LINE]]:10 -> [[@LINE]]:16 = #1
-                     // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:17 -> [[@LINE-1]]:18 = [[C5B:#2]]
-                     // CHECK-NEXT: File 0, [[@LINE-2]]:18 -> [[@LINE+12]]:4 = [[C5B]]
+
+                     // CHECK-NEXT: Gap,File 0, [[@LINE-2]]:17 -> [[@LINE-2]]:18 = [[C5B:#2]]
+                     // CHECK-NEXT: File 0, [[@LINE-3]]:18 -> [[@LINE+14]]:4 = [[C5B]]
     if (i == 4)      // CHECK-NEXT: File 0, [[@LINE]]:9 -> [[@LINE]]:15 = [[C5B]]
-                     // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:16 -> [[@LINE+1]]:7 = [[C5T:#4]]
+
+                     // CHECK-NEXT: Gap,File 0, [[@LINE-2]]:16 -> [[@LINE+1]]:7 = [[C5T:#4]]
       continue;      // CHECK-NEXT: File 0, [[@LINE]]:7 -> [[@LINE]]:15 = [[C5T]]
                      // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:16 -> [[@LINE+2]]:5 = [[C5F:#5]]
-                     // CHECK-NEXT: File 0, [[@LINE+1]]:5 -> [[@LINE+7]]:4 = [[C5F]]
+                     // CHECK-NEXT: File 0, [[@LINE+1]]:5 -> [[@LINE+8]]:4 = [[C5F]]
     if (i == 5)      // CHECK-NEXT: File 0, [[@LINE]]:9 -> [[@LINE]]:15 = [[C5F]]
-                     // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:16 -> [[@LINE+1]]:7 = [[C5T1:#6]]
+
+                     // CHECK-NEXT: Gap,File 0, [[@LINE-2]]:16 -> [[@LINE+1]]:7 = [[C5T1:#6]]
       break;         // CHECK-NEXT: File 0, [[@LINE]]:7 -> [[@LINE]]:12 = [[C5T1]]
                      // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:13 -> [[@LINE+1]]:5 = [[C5F1:#7]]
     sum += i;        // CHECK-NEXT: File 0, [[@LINE]]:5 -> [[@LINE+2]]:4 = [[C5F1]]
@@ -103,10 +114,11 @@ int testContinueBreak() { // CHECK-NEXT: File 0, [[@LINE]]:25 -> [[@LINE+20]]:2
 }
 
 // CHECK-NEXT: testFor
-int testFor() { // CHECK-NEXT: File 0, [[@LINE]]:15 -> [[@LINE+12]]:2 = [[C60:#0]]
+int testFor() { // CHECK-NEXT: File 0, [[@LINE]]:15 -> [[@LINE+13]]:2 = [[C60:#0]]
   int i;
   int sum = 0;
-                // CHECK-NEXT: File 0, [[@LINE+2]]:19 -> [[@LINE+2]]:25 = [[C61:#1]]
+                // CHECK-NEXT: File 0, [[@LINE+3]]:19 -> [[@LINE+3]]:25 = [[C61:#1]]
+
                 // CHECK-NEXT: File 0, [[@LINE+1]]:27 -> [[@LINE+1]]:30 = [[C6C:#2]]
   for (int i = 0; i < 10; i++) {
                 // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:31 -> [[@LINE-1]]:32 = [[C6B:#3]]
@@ -144,10 +156,11 @@ int testDo() {          // CHECK-NEXT: File 0, [[@LINE]]:14 -> [[@LINE+9]]:2 = [
 }
 
 // CHECK-NEXT: testConditional
-int testConditional(int x) {    // CHECK-NEXT: File 0, [[@LINE]]:28 -> [[@LINE+6]]:2 = [[C90:#0]]
+int testConditional(int x) {    // CHECK-NEXT: File 0, [[@LINE]]:28 -> [[@LINE+7]]:2 = [[C90:#0]]
  int result = (x > 0) ? 1 : -1; // CHECK-NEXT: File 0, [[@LINE]]:15 -> [[@LINE]]:22 = [[C90]]
-                                // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:24 -> [[@LINE-1]]:25 = [[C9T:#2]]
-                                // CHECK-NEXT: File 0, [[@LINE-2]]:25 -> [[@LINE-2]]:26 = [[C9T]]
-                                // CHECK-NEXT: File 0, [[@LINE-3]]:29 -> [[@LINE-3]]:31 = [[C9F:#3]]
+
+                                // CHECK-NEXT: Gap,File 0, [[@LINE-2]]:24 -> [[@LINE-2]]:25 = [[C9T:#2]]
+                                // CHECK-NEXT: File 0, [[@LINE-3]]:25 -> [[@LINE-3]]:26 = [[C9T]]
+                                // CHECK-NEXT: File 0, [[@LINE-4]]:29 -> [[@LINE-4]]:31 = [[C9F:#3]]
  return result;                 // CHECK-NEXT: File 0, [[@LINE]]:2 -> [[@LINE]]:15 = [[C9E:#1]]
 }

From ee6f10d37232627137ce97388a5eb21b90907bfb Mon Sep 17 00:00:00 2001
From: NAKAMURA Takumi 
Date: Sat, 28 Dec 2024 17:48:30 +0900
Subject: [PATCH 129/567] [Coverage] Make `MCDCRecord::Folded` as
 `[false/true]` with BitVector. NFC. (#121190)

When merging `MCDCRecord`s, `Folded` is expected to be promoted to
"Non-folded".
---
 .../llvm/ProfileData/Coverage/CoverageMapping.h       |  6 ++++--
 llvm/lib/ProfileData/Coverage/CoverageMapping.cpp     | 11 ++++++-----
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
index 0ad6f07bde989..3a018d2a95c6b 100644
--- a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
+++ b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
@@ -439,7 +439,7 @@ struct MCDCRecord {
   };
 
   using TestVectors = llvm::SmallVector>;
-  using BoolVector = llvm::SmallVector;
+  using BoolVector = std::array;
   using TVRowPair = std::pair;
   using TVPairMap = llvm::DenseMap;
   using CondIDMap = llvm::DenseMap;
@@ -467,7 +467,9 @@ struct MCDCRecord {
     return Region.getDecisionParams().NumConditions;
   }
   unsigned getNumTestVectors() const { return TV.size(); }
-  bool isCondFolded(unsigned Condition) const { return Folded[Condition]; }
+  bool isCondFolded(unsigned Condition) const {
+    return Folded[false][Condition] || Folded[true][Condition];
+  }
 
   /// Return the evaluation of a condition (indicated by Condition) in an
   /// executed test vector (indicated by TestVectorIndex), which will be True,
diff --git a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
index 1bf2e8d627bc4..f95e311e09de6 100644
--- a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
+++ b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
@@ -392,8 +392,9 @@ class MCDCRecordProcessor : NextIDsBuilder, mcdc::TVIdxBuilder {
       : NextIDsBuilder(Branches), TVIdxBuilder(this->NextIDs), Bitmap(Bitmap),
         Region(Region), DecisionParams(Region.getDecisionParams()),
         Branches(Branches), NumConditions(DecisionParams.NumConditions),
-        Folded(NumConditions, false), IndependencePairs(NumConditions),
-        ExecVectors(ExecVectorsByCond[false]), IsVersion11(IsVersion11) {}
+        Folded{{BitVector(NumConditions), BitVector(NumConditions)}},
+        IndependencePairs(NumConditions), ExecVectors(ExecVectorsByCond[false]),
+        IsVersion11(IsVersion11) {}
 
 private:
   // Walk the binary decision diagram and try assigning both false and true to
@@ -485,7 +486,6 @@ class MCDCRecordProcessor : NextIDsBuilder, mcdc::TVIdxBuilder {
   /// location is also tracked, as well as whether it is constant folded (in
   /// which case it is excuded from the metric).
   MCDCRecord processMCDCRecord() {
-    unsigned I = 0;
     MCDCRecord::CondIDMap PosToID;
     MCDCRecord::LineColPairMap CondLoc;
 
@@ -499,11 +499,12 @@ class MCDCRecordProcessor : NextIDsBuilder, mcdc::TVIdxBuilder {
     //   visualize where the condition is.
     // - Record whether the condition is constant folded so that we exclude it
     //   from being measured.
-    for (const auto *B : Branches) {
+    for (auto [I, B] : enumerate(Branches)) {
       const auto &BranchParams = B->getBranchParams();
       PosToID[I] = BranchParams.ID;
       CondLoc[I] = B->startLoc();
-      Folded[I++] = (B->Count.isZero() || B->FalseCount.isZero());
+      Folded[false][I] = B->FalseCount.isZero();
+      Folded[true][I] = B->Count.isZero();
     }
 
     // Using Profile Bitmap from runtime, mark the executed test vectors.

From a21f13bde2564a691a2da49adb773816f6c4e06b Mon Sep 17 00:00:00 2001
From: NAKAMURA Takumi 
Date: Sat, 28 Dec 2024 17:49:10 +0900
Subject: [PATCH 130/567] llvm-cov: Refactor CoverageSummaryInfo. NFC.
 (#121189)

- Let subfunctions return `CoverageInfo` objects w/o accumulating
  reference values.
- Introduce `CoverageDataSummary` for handling `CoverageData`-oriented
  metrics.
---
 llvm/tools/llvm-cov/CoverageSummaryInfo.cpp | 51 +++++++++++----------
 llvm/tools/llvm-cov/CoverageSummaryInfo.h   | 45 +++++++++---------
 2 files changed, 50 insertions(+), 46 deletions(-)

diff --git a/llvm/tools/llvm-cov/CoverageSummaryInfo.cpp b/llvm/tools/llvm-cov/CoverageSummaryInfo.cpp
index ad7561d3dc62c..5c002a694f66a 100644
--- a/llvm/tools/llvm-cov/CoverageSummaryInfo.cpp
+++ b/llvm/tools/llvm-cov/CoverageSummaryInfo.cpp
@@ -16,8 +16,9 @@
 using namespace llvm;
 using namespace coverage;
 
-static void sumBranches(size_t &NumBranches, size_t &CoveredBranches,
-                        const ArrayRef &Branches) {
+static auto sumBranches(const ArrayRef &Branches) {
+  size_t NumBranches = 0;
+  size_t CoveredBranches = 0;
   for (const auto &BR : Branches) {
     if (!BR.TrueFolded) {
       // "True" Condition Branches.
@@ -32,20 +33,22 @@ static void sumBranches(size_t &NumBranches, size_t &CoveredBranches,
         ++CoveredBranches;
     }
   }
+  return BranchCoverageInfo(CoveredBranches, NumBranches);
 }
 
-static void sumBranchExpansions(size_t &NumBranches, size_t &CoveredBranches,
-                                const CoverageMapping &CM,
-                                ArrayRef Expansions) {
+static BranchCoverageInfo
+sumBranchExpansions(const CoverageMapping &CM,
+                    ArrayRef Expansions) {
+  BranchCoverageInfo BranchCoverage;
   for (const auto &Expansion : Expansions) {
     auto CE = CM.getCoverageForExpansion(Expansion);
-    sumBranches(NumBranches, CoveredBranches, CE.getBranches());
-    sumBranchExpansions(NumBranches, CoveredBranches, CM, CE.getExpansions());
+    BranchCoverage += sumBranches(CE.getBranches());
+    BranchCoverage += sumBranchExpansions(CM, CE.getExpansions());
   }
+  return BranchCoverage;
 }
 
-static std::pair
-sumMCDCPairs(const ArrayRef &Records) {
+auto sumMCDCPairs(const ArrayRef &Records) {
   size_t NumPairs = 0, CoveredPairs = 0;
   for (const auto &Record : Records) {
     const auto NumConditions = Record.getNumConditions();
@@ -56,7 +59,7 @@ sumMCDCPairs(const ArrayRef &Records) {
         ++CoveredPairs;
     }
   }
-  return {NumPairs, CoveredPairs};
+  return MCDCCoverageInfo(CoveredPairs, NumPairs);
 }
 
 static std::pair
@@ -85,24 +88,27 @@ sumRegions(ArrayRef CodeRegions, const CoverageData &CD) {
           LineCoverageInfo(CoveredLines, NumLines)};
 }
 
+CoverageDataSummary::CoverageDataSummary(const CoverageData &CD,
+                                         ArrayRef CodeRegions) {
+  std::tie(RegionCoverage, LineCoverage) = sumRegions(CodeRegions, CD);
+  BranchCoverage = sumBranches(CD.getBranches());
+  MCDCCoverage = sumMCDCPairs(CD.getMCDCRecords());
+}
+
 FunctionCoverageSummary
 FunctionCoverageSummary::get(const CoverageMapping &CM,
                              const coverage::FunctionRecord &Function) {
   CoverageData CD = CM.getCoverageForFunction(Function);
-  auto [RegionCoverage, LineCoverage] = sumRegions(Function.CountedRegions, CD);
 
-  // Compute the branch coverage, including branches from expansions.
-  size_t NumBranches = 0, CoveredBranches = 0;
-  sumBranches(NumBranches, CoveredBranches, CD.getBranches());
-  sumBranchExpansions(NumBranches, CoveredBranches, CM, CD.getExpansions());
+  auto Summary =
+      FunctionCoverageSummary(Function.Name, Function.ExecutionCount);
 
-  size_t NumPairs = 0, CoveredPairs = 0;
-  std::tie(NumPairs, CoveredPairs) = sumMCDCPairs(CD.getMCDCRecords());
+  Summary += CoverageDataSummary(CD, Function.CountedRegions);
 
-  return FunctionCoverageSummary(
-      Function.Name, Function.ExecutionCount, RegionCoverage, LineCoverage,
-      BranchCoverageInfo(CoveredBranches, NumBranches),
-      MCDCCoverageInfo(CoveredPairs, NumPairs));
+  // Compute the branch coverage, including branches from expansions.
+  Summary.BranchCoverage += sumBranchExpansions(CM, CD.getExpansions());
+
+  return Summary;
 }
 
 FunctionCoverageSummary
@@ -117,8 +123,7 @@ FunctionCoverageSummary::get(const InstantiationGroup &Group,
        << Group.getColumn();
   }
 
-  FunctionCoverageSummary Summary(Name);
-  Summary.ExecutionCount = Group.getTotalExecutionCount();
+  FunctionCoverageSummary Summary(Name, Group.getTotalExecutionCount());
   Summary.RegionCoverage = Summaries[0].RegionCoverage;
   Summary.LineCoverage = Summaries[0].LineCoverage;
   Summary.BranchCoverage = Summaries[0].BranchCoverage;
diff --git a/llvm/tools/llvm-cov/CoverageSummaryInfo.h b/llvm/tools/llvm-cov/CoverageSummaryInfo.h
index 64c2c8406cf3e..d9210676c41bf 100644
--- a/llvm/tools/llvm-cov/CoverageSummaryInfo.h
+++ b/llvm/tools/llvm-cov/CoverageSummaryInfo.h
@@ -223,26 +223,32 @@ class FunctionCoverageInfo {
   }
 };
 
-/// A summary of function's code coverage.
-struct FunctionCoverageSummary {
-  std::string Name;
-  uint64_t ExecutionCount;
+struct CoverageDataSummary {
   RegionCoverageInfo RegionCoverage;
   LineCoverageInfo LineCoverage;
   BranchCoverageInfo BranchCoverage;
   MCDCCoverageInfo MCDCCoverage;
 
-  FunctionCoverageSummary(const std::string &Name)
-      : Name(Name), ExecutionCount(0) {}
+  CoverageDataSummary() = default;
+  CoverageDataSummary(const coverage::CoverageData &CD,
+                      ArrayRef CodeRegions);
 
-  FunctionCoverageSummary(const std::string &Name, uint64_t ExecutionCount,
-                          const RegionCoverageInfo &RegionCoverage,
-                          const LineCoverageInfo &LineCoverage,
-                          const BranchCoverageInfo &BranchCoverage,
-                          const MCDCCoverageInfo &MCDCCoverage)
-      : Name(Name), ExecutionCount(ExecutionCount),
-        RegionCoverage(RegionCoverage), LineCoverage(LineCoverage),
-        BranchCoverage(BranchCoverage), MCDCCoverage(MCDCCoverage) {}
+  auto &operator+=(const CoverageDataSummary &RHS) {
+    RegionCoverage += RHS.RegionCoverage;
+    LineCoverage += RHS.LineCoverage;
+    BranchCoverage += RHS.BranchCoverage;
+    MCDCCoverage += RHS.MCDCCoverage;
+    return *this;
+  }
+};
+
+/// A summary of function's code coverage.
+struct FunctionCoverageSummary : CoverageDataSummary {
+  std::string Name;
+  uint64_t ExecutionCount;
+
+  FunctionCoverageSummary(const std::string &Name, uint64_t ExecutionCount = 0)
+      : Name(Name), ExecutionCount(ExecutionCount) {}
 
   /// Compute the code coverage summary for the given function coverage
   /// mapping record.
@@ -257,12 +263,8 @@ struct FunctionCoverageSummary {
 };
 
 /// A summary of file's code coverage.
-struct FileCoverageSummary {
+struct FileCoverageSummary : CoverageDataSummary {
   StringRef Name;
-  RegionCoverageInfo RegionCoverage;
-  LineCoverageInfo LineCoverage;
-  BranchCoverageInfo BranchCoverage;
-  MCDCCoverageInfo MCDCCoverage;
   FunctionCoverageInfo FunctionCoverage;
   FunctionCoverageInfo InstantiationCoverage;
 
@@ -270,11 +272,8 @@ struct FileCoverageSummary {
   FileCoverageSummary(StringRef Name) : Name(Name) {}
 
   FileCoverageSummary &operator+=(const FileCoverageSummary &RHS) {
-    RegionCoverage += RHS.RegionCoverage;
-    LineCoverage += RHS.LineCoverage;
+    *static_cast(this) += RHS;
     FunctionCoverage += RHS.FunctionCoverage;
-    BranchCoverage += RHS.BranchCoverage;
-    MCDCCoverage += RHS.MCDCCoverage;
     InstantiationCoverage += RHS.InstantiationCoverage;
     return *this;
   }

From 52bbe20eb40f45bc64c614b6b3d7fe13bbacb0ff Mon Sep 17 00:00:00 2001
From: Daniil Kovalev 
Date: Sat, 28 Dec 2024 12:13:39 +0300
Subject: [PATCH 131/567] [PAC][CodeGen][ELF][AArch64] Support signed TLSDESC
 (#113813)

Depends on #120010

`TLSDESC_AUTH_CALLSEQ` pseudo-instruction is introduced which is later expanded
to actual instruction sequence like the following.

```
adrp  x0, :tlsdesc_auth:var
ldr   x16, [x0, #:tlsdesc_auth_lo12:var]
add   x0, x0, #:tlsdesc_auth_lo12:var
blraa x16, x0
(TPIDR_EL0 offset now in x0)
```

Only SelectionDAG ISel is supported.

Tests starting with 'ptrauth-' have corresponding variants w/o this prefix.
---
 llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp |  48 ++++++++
 .../Target/AArch64/AArch64ISelLowering.cpp    |  16 ++-
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |   1 +
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  11 ++
 .../AArch64/ptrauth-arm64-tls-dynamics.ll     | 114 ++++++++++++++++++
 5 files changed, 185 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/ptrauth-arm64-tls-dynamics.ll

diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 69d07f27fa8e1..9bec782ca8ce9 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -2730,6 +2730,54 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) {
     EmitToStreamer(*OutStreamer, TmpInstSB);
     return;
   }
+  case AArch64::TLSDESC_AUTH_CALLSEQ: {
+    /// lower this to:
+    ///    adrp  x0, :tlsdesc_auth:var
+    ///    ldr   x16, [x0, #:tlsdesc_auth_lo12:var]
+    ///    add   x0, x0, #:tlsdesc_auth_lo12:var
+    ///    blraa x16, x0
+    ///    (TPIDR_EL0 offset now in x0)
+    const MachineOperand &MO_Sym = MI->getOperand(0);
+    MachineOperand MO_TLSDESC_LO12(MO_Sym), MO_TLSDESC(MO_Sym);
+    MCOperand SymTLSDescLo12, SymTLSDesc;
+    MO_TLSDESC_LO12.setTargetFlags(AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
+    MO_TLSDESC.setTargetFlags(AArch64II::MO_TLS | AArch64II::MO_PAGE);
+    MCInstLowering.lowerOperand(MO_TLSDESC_LO12, SymTLSDescLo12);
+    MCInstLowering.lowerOperand(MO_TLSDESC, SymTLSDesc);
+
+    MCInst Adrp;
+    Adrp.setOpcode(AArch64::ADRP);
+    Adrp.addOperand(MCOperand::createReg(AArch64::X0));
+    Adrp.addOperand(SymTLSDesc);
+    EmitToStreamer(*OutStreamer, Adrp);
+
+    MCInst Ldr;
+    Ldr.setOpcode(AArch64::LDRXui);
+    Ldr.addOperand(MCOperand::createReg(AArch64::X16));
+    Ldr.addOperand(MCOperand::createReg(AArch64::X0));
+    Ldr.addOperand(SymTLSDescLo12);
+    Ldr.addOperand(MCOperand::createImm(0));
+    EmitToStreamer(*OutStreamer, Ldr);
+
+    MCInst Add;
+    Add.setOpcode(AArch64::ADDXri);
+    Add.addOperand(MCOperand::createReg(AArch64::X0));
+    Add.addOperand(MCOperand::createReg(AArch64::X0));
+    Add.addOperand(SymTLSDescLo12);
+    Add.addOperand(MCOperand::createImm(AArch64_AM::getShiftValue(0)));
+    EmitToStreamer(*OutStreamer, Add);
+
+    // Authenticated TLSDESC accesses are not relaxed.
+    // Thus, do not emit .tlsdesccall for AUTH TLSDESC.
+
+    MCInst Blraa;
+    Blraa.setOpcode(AArch64::BLRAA);
+    Blraa.addOperand(MCOperand::createReg(AArch64::X16));
+    Blraa.addOperand(MCOperand::createReg(AArch64::X0));
+    EmitToStreamer(*OutStreamer, Blraa);
+
+    return;
+  }
   case AArch64::TLSDESC_CALLSEQ: {
     /// lower this to:
     ///    adrp  x0, :tlsdesc:var
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a6f8f47f31fa5..24e1ebd8421fb 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2669,6 +2669,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
     MAKE_CASE(AArch64ISD::CSINC)
     MAKE_CASE(AArch64ISD::THREAD_POINTER)
     MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ)
+    MAKE_CASE(AArch64ISD::TLSDESC_AUTH_CALLSEQ)
     MAKE_CASE(AArch64ISD::PROBED_ALLOCA)
     MAKE_CASE(AArch64ISD::ABDS_PRED)
     MAKE_CASE(AArch64ISD::ABDU_PRED)
@@ -10123,8 +10124,11 @@ SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
   SDValue Chain = DAG.getEntryNode();
   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
 
-  Chain =
-      DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
+  unsigned Opcode =
+      DAG.getMachineFunction().getInfo()->hasELFSignedGOT()
+          ? AArch64ISD::TLSDESC_AUTH_CALLSEQ
+          : AArch64ISD::TLSDESC_CALLSEQ;
+  Chain = DAG.getNode(Opcode, DL, NodeTys, {Chain, SymAddr});
   SDValue Glue = Chain.getValue(1);
 
   return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
@@ -10136,8 +10140,12 @@ AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
   assert(Subtarget->isTargetELF() && "This function expects an ELF target");
 
   const GlobalAddressSDNode *GA = cast(Op);
+  AArch64FunctionInfo *MFI =
+      DAG.getMachineFunction().getInfo();
 
-  TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
+  TLSModel::Model Model = MFI->hasELFSignedGOT()
+                              ? TLSModel::GeneralDynamic
+                              : getTargetMachine().getTLSModel(GA->getGlobal());
 
   if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
     if (Model == TLSModel::LocalDynamic)
@@ -10174,8 +10182,6 @@ AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
     // calculation.
 
     // These accesses will need deduplicating if there's more than one.
-    AArch64FunctionInfo *MFI =
-        DAG.getMachineFunction().getInfo();
     MFI->incNumLocalDynamicTLSAccesses();
 
     // The call needs a relocation too for linker relaxation. It doesn't make
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 1b7f328fa729a..85b62be5dd30d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -83,6 +83,7 @@ enum NodeType : unsigned {
   // Produces the full sequence of instructions for getting the thread pointer
   // offset of a variable into X0, using the TLSDesc model.
   TLSDESC_CALLSEQ,
+  TLSDESC_AUTH_CALLSEQ,
   ADRP,     // Page address of a TargetGlobalAddress operand.
   ADR,      // ADR
   ADDlow,   // Add the low 12 bits of a TargetGlobalAddress operand.
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 629098cda0c4e..ec891ea4bac85 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -883,6 +883,9 @@ def AArch64tlsdesc_callseq : SDNode<"AArch64ISD::TLSDESC_CALLSEQ",
                                     SDT_AArch64TLSDescCallSeq,
                                     [SDNPOutGlue, SDNPHasChain, SDNPVariadic]>;
 
+def AArch64tlsdesc_auth_callseq : SDNode<"AArch64ISD::TLSDESC_AUTH_CALLSEQ",
+                                    SDT_AArch64TLSDescCallSeq,
+                                    [SDNPOutGlue, SDNPHasChain, SDNPVariadic]>;
 
 def AArch64WrapperLarge : SDNode<"AArch64ISD::WrapperLarge",
                                  SDT_AArch64WrapperLarge>;
@@ -3312,8 +3315,16 @@ def TLSDESC_CALLSEQ
     : Pseudo<(outs), (ins i64imm:$sym),
              [(AArch64tlsdesc_callseq tglobaltlsaddr:$sym)]>,
       Sched<[WriteI, WriteLD, WriteI, WriteBrReg]>;
+let isCall = 1, Defs = [NZCV, LR, X0, X16], hasSideEffects = 1, Size = 16,
+    isCodeGenOnly = 1 in
+def TLSDESC_AUTH_CALLSEQ
+    : Pseudo<(outs), (ins i64imm:$sym),
+             [(AArch64tlsdesc_auth_callseq tglobaltlsaddr:$sym)]>,
+      Sched<[WriteI, WriteLD, WriteI, WriteBrReg]>;
 def : Pat<(AArch64tlsdesc_callseq texternalsym:$sym),
           (TLSDESC_CALLSEQ texternalsym:$sym)>;
+def : Pat<(AArch64tlsdesc_auth_callseq texternalsym:$sym),
+          (TLSDESC_AUTH_CALLSEQ texternalsym:$sym)>;
 
 //===----------------------------------------------------------------------===//
 // Conditional branch (immediate) instruction.
diff --git a/llvm/test/CodeGen/AArch64/ptrauth-arm64-tls-dynamics.ll b/llvm/test/CodeGen/AArch64/ptrauth-arm64-tls-dynamics.ll
new file mode 100644
index 0000000000000..89731e62dcc1e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/ptrauth-arm64-tls-dynamics.ll
@@ -0,0 +1,114 @@
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu -mattr=+pauth -relocation-model=pic \
+; RUN:   -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu -mattr=+pauth -relocation-model=pic \
+; RUN:   -filetype=obj < %s | llvm-readelf -r -s - | FileCheck --check-prefix=CHECK-OBJ %s
+; RUN: not --crash llc -mtriple=aarch64-unknown-linux-gnu -mattr=+pauth -relocation-model=pic \
+; RUN:   -global-isel=1 < %s 2>&1 | FileCheck --check-prefix=CHECK-ERR %s
+
+@general_dynamic_var = external thread_local global i32
+
+define i32 @test_generaldynamic() {
+; CHECK-LABEL: test_generaldynamic:
+
+  %val = load i32, ptr @general_dynamic_var
+  ret i32 %val
+
+; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc_auth:general_dynamic_var
+; CHECK-NEXT: ldr x16, [x[[TLSDESC_HI]], :tlsdesc_auth_lo12:general_dynamic_var]
+; CHECK-NEXT: add x0, x[[TLSDESC_HI]], :tlsdesc_auth_lo12:general_dynamic_var
+; CHECK-NEXT: blraa x16, x0
+; CHECK-NEXT: mrs x[[TPIDR:[0-9]+]], TPIDR_EL0
+; CHECK-NEXT: ldr w0, [x[[TPIDR]], x0]
+
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_ADR_PAGE21
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_LD64_LO12
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_ADD_LO12
+; CHECK-OBJ-NOT: R_AARCH64_TLSDESC_CALL
+
+; CHECK-ERR: LLVM ERROR: cannot select: %1:gpr64sp(p0) = G_GLOBAL_VALUE @general_dynamic_var (in function: test_generaldynamic)
+}
+
+define ptr @test_generaldynamic_addr() {
+; CHECK-LABEL: test_generaldynamic_addr:
+
+  ret ptr @general_dynamic_var
+
+; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc_auth:general_dynamic_var
+; CHECK-NEXT: ldr x16, [x[[TLSDESC_HI]], :tlsdesc_auth_lo12:general_dynamic_var]
+; CHECK-NEXT: add x0, x[[TLSDESC_HI]], :tlsdesc_auth_lo12:general_dynamic_var
+; CHECK-NEXT: blraa x16, x0
+; CHECK-NEXT: mrs [[TP:x[0-9]+]], TPIDR_EL0
+; CHECK-NEXT: add x0, [[TP]], x0
+
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_ADR_PAGE21
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_LD64_LO12
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_ADD_LO12
+; CHECK-OBJ-NOT: R_AARCH64_TLSDESC_CALL
+}
+
+;; Note: with signed TLSDESC, general dynamic model is always used,
+;; even when local dynamic is requested.
+
+@local_dynamic_var = external thread_local(localdynamic) global i32
+
+define i32 @test_localdynamic() {
+; CHECK-LABEL: test_localdynamic:
+
+  %val = load i32, ptr @local_dynamic_var
+  ret i32 %val
+
+; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc_auth:local_dynamic_var
+; CHECK-NEXT: ldr x16, [x[[TLSDESC_HI]], :tlsdesc_auth_lo12:local_dynamic_var]
+; CHECK-NEXT: add x0, x[[TLSDESC_HI]], :tlsdesc_auth_lo12:local_dynamic_var
+; CHECK-NEXT: blraa x16, x0
+; CHECK-NEXT: mrs x[[TPIDR:[0-9]+]], TPIDR_EL0
+; CHECK-NEXT: ldr w0, [x[[TPIDR]], x0]
+
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_ADR_PAGE21
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_LD64_LO12
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_ADD_LO12
+; CHECK-OBJ-NOT: R_AARCH64_TLSDESC_CALL
+}
+
+define ptr @test_localdynamic_addr() {
+; CHECK-LABEL: test_localdynamic_addr:
+
+  ret ptr @local_dynamic_var
+
+; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc_auth:local_dynamic_var
+; CHECK-NEXT: ldr x16, [x[[TLSDESC_HI]], :tlsdesc_auth_lo12:local_dynamic_var]
+; CHECK-NEXT: add x0, x[[TLSDESC_HI]], :tlsdesc_auth_lo12:local_dynamic_var
+; CHECK-NEXT: blraa x16, x0
+; CHECK-NEXT: mrs x[[TPIDR:[0-9]+]], TPIDR_EL0
+; CHECK-NEXT: add x0, x[[TPIDR]], x0
+
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_ADR_PAGE21
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_LD64_LO12
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_ADD_LO12
+; CHECK-OBJ-NOT: R_AARCH64_TLSDESC_CALL
+}
+
+@extern_weak_var = extern_weak thread_local global i32
+
+define i32 @test_extern_weak() {
+; CHECK-LABEL: test_extern_weak:
+
+  %val = load i32, ptr @extern_weak_var
+  ret i32 %val
+
+; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc_auth:extern_weak_var
+; CHECK-NEXT: ldr x16, [x[[TLSDESC_HI]], :tlsdesc_auth_lo12:extern_weak_var]
+; CHECK-NEXT: add x0, x[[TLSDESC_HI]], :tlsdesc_auth_lo12:extern_weak_var
+; CHECK-NEXT: blraa x16, x0
+; CHECK-NEXT: mrs x[[TPIDR:[0-9]+]], TPIDR_EL0
+; CHECK-NEXT: ldr w0, [x[[TPIDR]], x0]
+
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_ADR_PAGE21
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_LD64_LO12
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_ADD_LO12
+; CHECK-OBJ-NOT: R_AARCH64_TLSDESC_CALL
+; CHECK-OBJ: 0000000000000000     0 TLS     WEAK   DEFAULT   UND extern_weak_var
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 8, !"ptrauth-elf-got", i32 1}

From f68dbbbd57dd0947730300d1e827ad16c2dfffb5 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim 
Date: Fri, 27 Dec 2024 08:39:53 +0000
Subject: [PATCH 132/567] [VectorCombine] Add test coverage for #121110

---
 .../VectorCombine/X86/shuffle-of-cmps.ll      | 23 +++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll
index b3360b61e66e8..6ee60287e62dc 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll
@@ -275,3 +275,26 @@ define <4 x i32> @shuf_icmp_ugt_v4i32_use(<4 x i32> %x, <4 x i32> %y, <4 x i32>
   %r = sext <4 x i1> %s to <4 x i32>
   ret <4 x i32> %r
 }
+
+; TODO: PR121110 - don't merge equivalent (but not matching) predicates
+define <2 x i1> @PR121110() {
+; CHECK-LABEL: define <2 x i1> @PR121110(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    ret <2 x i1> zeroinitializer
+;
+  %ugt = icmp samesign ugt <2 x i32> < i32 0, i32 0 >, < i32 0, i32 0 >
+  %sgt = icmp sgt <2 x i32> < i32 0, i32 0 >, < i32 6, i32 4294967292 >
+  %res = shufflevector <2 x i1> %ugt, <2 x i1> %sgt, <2 x i32> 
+  ret <2 x i1> %res
+}
+
+define <2 x i1> @PR121110_commute() {
+; CHECK-LABEL: define <2 x i1> @PR121110_commute(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    ret <2 x i1> zeroinitializer
+;
+  %sgt = icmp sgt <2 x i32> < i32 0, i32 0 >, < i32 6, i32 4294967292 >
+  %ugt = icmp samesign ugt <2 x i32> < i32 0, i32 0 >, < i32 0, i32 0 >
+  %res = shufflevector <2 x i1> %sgt, <2 x i1> %ugt, <2 x i32> 
+  ret <2 x i1> %res
+}

From f2f02b21cd581057e3c9b4a7a27e0014eeb9ba15 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim 
Date: Sat, 28 Dec 2024 09:21:22 +0000
Subject: [PATCH 133/567] [VectorCombine] foldShuffleOfBinops - only accept
 exact matching cmp predicates

m_SpecificCmp allowed equivalent predicate+flags which don't necessarily work after being folded from "shuffle (cmpop), (cmpop)" into "cmpop (shuffle), (shuffle)"

Fixes #121110
---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp  | 16 +++++++++-------
 .../VectorCombine/X86/shuffle-of-cmps.ll         | 13 ++++++++++---
 2 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index ecbc13d489eb3..2460ccc61d84d 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1669,7 +1669,8 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
 
   Value *X, *Y, *Z, *W;
   bool IsCommutative = false;
-  CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
+  CmpPredicate PredLHS = CmpInst::BAD_ICMP_PREDICATE;
+  CmpPredicate PredRHS = CmpInst::BAD_ICMP_PREDICATE;
   if (match(LHS, m_BinOp(m_Value(X), m_Value(Y))) &&
       match(RHS, m_BinOp(m_Value(Z), m_Value(W)))) {
     auto *BO = cast(LHS);
@@ -1677,8 +1678,9 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
     if (llvm::is_contained(OldMask, PoisonMaskElem) && BO->isIntDivRem())
       return false;
     IsCommutative = BinaryOperator::isCommutative(BO->getOpcode());
-  } else if (match(LHS, m_Cmp(Pred, m_Value(X), m_Value(Y))) &&
-             match(RHS, m_SpecificCmp(Pred, m_Value(Z), m_Value(W)))) {
+  } else if (match(LHS, m_Cmp(PredLHS, m_Value(X), m_Value(Y))) &&
+             match(RHS, m_Cmp(PredRHS, m_Value(Z), m_Value(W))) &&
+             (CmpInst::Predicate)PredLHS == (CmpInst::Predicate)PredRHS) {
     IsCommutative = cast(LHS)->isCommutative();
   } else
     return false;
@@ -1727,14 +1729,14 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
       TTI.getShuffleCost(SK0, BinOpTy, NewMask0, CostKind, 0, nullptr, {X, Z}) +
       TTI.getShuffleCost(SK1, BinOpTy, NewMask1, CostKind, 0, nullptr, {Y, W});
 
-  if (Pred == CmpInst::BAD_ICMP_PREDICATE) {
+  if (PredLHS == CmpInst::BAD_ICMP_PREDICATE) {
     NewCost +=
         TTI.getArithmeticInstrCost(LHS->getOpcode(), ShuffleDstTy, CostKind);
   } else {
     auto *ShuffleCmpTy =
         FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy);
     NewCost += TTI.getCmpSelInstrCost(LHS->getOpcode(), ShuffleCmpTy,
-                                      ShuffleDstTy, Pred, CostKind);
+                                      ShuffleDstTy, PredLHS, CostKind);
   }
 
   LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I
@@ -1750,10 +1752,10 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
 
   Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0);
   Value *Shuf1 = Builder.CreateShuffleVector(Y, W, NewMask1);
-  Value *NewBO = Pred == CmpInst::BAD_ICMP_PREDICATE
+  Value *NewBO = PredLHS == CmpInst::BAD_ICMP_PREDICATE
                      ? Builder.CreateBinOp(
                            cast(LHS)->getOpcode(), Shuf0, Shuf1)
-                     : Builder.CreateCmp(Pred, Shuf0, Shuf1);
+                     : Builder.CreateCmp(PredLHS, Shuf0, Shuf1);
 
   // Intersect flags from the old binops.
   if (auto *NewInst = dyn_cast(NewBO)) {
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll
index 6ee60287e62dc..b8b2c6aef74a3 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll
@@ -276,11 +276,15 @@ define <4 x i32> @shuf_icmp_ugt_v4i32_use(<4 x i32> %x, <4 x i32> %y, <4 x i32>
   ret <4 x i32> %r
 }
 
-; TODO: PR121110 - don't merge equivalent (but not matching) predicates
+; PR121110 - don't merge equivalent (but not matching) predicates
+
 define <2 x i1> @PR121110() {
 ; CHECK-LABEL: define <2 x i1> @PR121110(
 ; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT:    ret <2 x i1> zeroinitializer
+; CHECK-NEXT:    [[UGT:%.*]] = icmp samesign ugt <2 x i32> zeroinitializer, zeroinitializer
+; CHECK-NEXT:    [[SGT:%.*]] = icmp sgt <2 x i32> zeroinitializer, 
+; CHECK-NEXT:    [[RES:%.*]] = shufflevector <2 x i1> [[UGT]], <2 x i1> [[SGT]], <2 x i32> 
+; CHECK-NEXT:    ret <2 x i1> [[RES]]
 ;
   %ugt = icmp samesign ugt <2 x i32> < i32 0, i32 0 >, < i32 0, i32 0 >
   %sgt = icmp sgt <2 x i32> < i32 0, i32 0 >, < i32 6, i32 4294967292 >
@@ -291,7 +295,10 @@ define <2 x i1> @PR121110() {
 define <2 x i1> @PR121110_commute() {
 ; CHECK-LABEL: define <2 x i1> @PR121110_commute(
 ; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT:    ret <2 x i1> zeroinitializer
+; CHECK-NEXT:    [[SGT:%.*]] = icmp sgt <2 x i32> zeroinitializer, 
+; CHECK-NEXT:    [[UGT:%.*]] = icmp samesign ugt <2 x i32> zeroinitializer, zeroinitializer
+; CHECK-NEXT:    [[RES:%.*]] = shufflevector <2 x i1> [[SGT]], <2 x i1> [[UGT]], <2 x i32> 
+; CHECK-NEXT:    ret <2 x i1> [[RES]]
 ;
   %sgt = icmp sgt <2 x i32> < i32 0, i32 0 >, < i32 6, i32 4294967292 >
   %ugt = icmp samesign ugt <2 x i32> < i32 0, i32 0 >, < i32 0, i32 0 >

From 8e965d89c9624c184c48806dc39d50209265f0f8 Mon Sep 17 00:00:00 2001
From: JOSTAR <52376093+shenjunjiekoda@users.noreply.github.com>
Date: Sat, 28 Dec 2024 18:09:29 +0800
Subject: [PATCH 134/567] [analyzer] Fix zext assertion failure in loop
 unrolling (#121203)

The current implementation of APInt extension in the code can trigger an
assertion failure when the `zext` function is called with a target width
smaller than the current bit width. For example:
```cpp
if (InitNum.getBitWidth() != BoundNum.getBitWidth()) {
    InitNum = InitNum.zext(BoundNum.getBitWidth());
    BoundNum = BoundNum.zext(InitNum.getBitWidth());
}
```

This logic does not guarantee that the `zext` target width is always
greater than or equal to the current bit width, leading to potential
crashes.

Expected Behavior:
- Ensure InitNum and BoundNum are extended to the maximum of their respective widths.
- Prevent assertion failures by enforcing correct `zext` usage.

Fixes #121201
---
 .../lib/StaticAnalyzer/Core/LoopUnrolling.cpp |  8 +--
 clang/test/Analysis/PR121201.cpp              | 67 +++++++++++++++++++
 2 files changed, 71 insertions(+), 4 deletions(-)
 create mode 100644 clang/test/Analysis/PR121201.cpp

diff --git a/clang/lib/StaticAnalyzer/Core/LoopUnrolling.cpp b/clang/lib/StaticAnalyzer/Core/LoopUnrolling.cpp
index 96f5d7c44baf8..01d87b02fcdbd 100644
--- a/clang/lib/StaticAnalyzer/Core/LoopUnrolling.cpp
+++ b/clang/lib/StaticAnalyzer/Core/LoopUnrolling.cpp
@@ -283,10 +283,10 @@ static bool shouldCompletelyUnroll(const Stmt *LoopStmt, ASTContext &ASTCtx,
   llvm::APInt InitNum =
       Matches[0].getNodeAs("initNum")->getValue();
   auto CondOp = Matches[0].getNodeAs("conditionOperator");
-  if (InitNum.getBitWidth() != BoundNum.getBitWidth()) {
-    InitNum = InitNum.zext(BoundNum.getBitWidth());
-    BoundNum = BoundNum.zext(InitNum.getBitWidth());
-  }
+  unsigned MaxWidth = std::max(InitNum.getBitWidth(), BoundNum.getBitWidth());
+
+  InitNum = InitNum.zext(MaxWidth);
+  BoundNum = BoundNum.zext(MaxWidth);
 
   if (CondOp->getOpcode() == BO_GE || CondOp->getOpcode() == BO_LE)
     maxStep = (BoundNum - InitNum + 1).abs().getZExtValue();
diff --git a/clang/test/Analysis/PR121201.cpp b/clang/test/Analysis/PR121201.cpp
new file mode 100644
index 0000000000000..acd2492d011fa
--- /dev/null
+++ b/clang/test/Analysis/PR121201.cpp
@@ -0,0 +1,67 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=core -verify %s \
+// RUN:    -analyzer-config unroll-loops=true
+
+// expected-no-diagnostics
+
+template  using conditional_t = T;
+class basic_format_arg;
+template  struct formatter;
+
+template  struct value {
+  template  value(T) {
+    using value_type = T;
+    (void)format_custom_arg>;
+  }
+
+  template  static void format_custom_arg() {
+    Context ctx;
+    auto f = Formatter();
+    f.format(0, ctx);
+  }
+};
+
+struct context {
+  template  using formatter_type = formatter;
+};
+
+enum { max_packed_args };
+
+template 
+using arg_t = conditional_t, basic_format_arg>;
+
+template  struct format_arg_store {
+  arg_t args;
+};
+
+template 
+auto make_format_args(T... args) -> format_arg_store {
+  return {args...};
+}
+
+template  void write_padded(F write) { write(0); }
+
+template  void format(T... args) { make_format_args(args...); }
+
+template  struct bitset {
+  bitset(long);
+};
+
+template  struct formatter> {
+  struct writer {
+    bitset bs;
+
+    template  void operator()(OutputIt) {
+      for (auto pos = N; pos > 0; --pos) // no-crash
+        ;
+    }
+  };
+
+  template  void format(bitset bs, FormatContext) {
+    write_padded(writer{bs});
+  }
+};
+
+bitset<6> TestBody_bs(2);
+
+void TestBody() { format(TestBody_bs); }

From 7e749d4fb7327ce2da307ed020c02a07e8279992 Mon Sep 17 00:00:00 2001
From: Amir Bishara <139038766+amirBish@users.noreply.github.com>
Date: Sat, 28 Dec 2024 13:28:09 +0200
Subject: [PATCH 135/567] [mlir][bufferization]-Add
 ControlBuildSubsetExtractionFn to TensorEmptyElimination (#120851)

This PR adds a `ControlBuildSubsetExtractionFn` to the tensor empty
elimination util. This will control the building of the subsets
extraction of the
`SubsetInsertionOpInterface`.

This control function returns the subsets extraction value that will
replace the `emptyTensorOp` use
which is being consumed by a specific user (which the
 util expects to eliminate).

The default control function will stay like today's behavior without any
additional changes.
---
 .../Bufferization/Transforms/Transforms.h     | 28 +++++++++++-
 .../Transforms/EmptyTensorElimination.cpp     | 45 +++++++++++--------
 2 files changed, 53 insertions(+), 20 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Bufferization/Transforms/Transforms.h
index 892675954493b..a4ee893ca5341 100644
--- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Transforms.h
@@ -10,7 +10,9 @@
 #define MLIR_DIALECT_BUFFERIZATION_TRANSFORMS_TRANSFORMS_H
 
 #include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/IR/Operation.h"
+#include "mlir/Interfaces/SubsetOpInterface.h"
 
 namespace mlir {
 namespace bufferization {
@@ -34,13 +36,35 @@ struct OneShotBufferizationOptions;
 /// "tensor.empty" op.
 LogicalResult eliminateEmptyTensors(RewriterBase &rewriter, Operation *op);
 
+/// A function type that defines a callback to control the construction
+/// of the subset extraction of the `SubsetInsertionOpInterface`.
+/// The subset extraction value can be used as a replacement for the
+/// `emptyTensorOp` value which is being consumed by `user`; failure
+/// to build such a value should be indicated with an empty value.
+/// This function should guarantee the legality of the replacement,
+/// i.e. the replacement should dominate the user of the `emptyTensorOp`
+/// being eliminated.
+using ControlBuildSubsetExtractionFn =
+    std::function;
+
+/// This method builds and returns a subset extraction value for the
+/// destination tensor that the given `op` inserts into.
+/// It returns a value which should replace the `emptyTensorOp` use
+/// that is being consumed by `user`.
+/// If no such value is found, it returns an empty Value.
+Value buildSubsetExtraction(RewriterBase &rewriter,
+                            SubsetInsertionOpInterface op,
+                            tensor::EmptyOp emptyTensorOp, Operation *user);
+
 /// Try to eliminate "tensor.empty" ops inside `op`.
 ///
 /// This function overload accepts an existing `OneShotAnalysisState`, which
 /// contains in-place bufferization decisions. This overload is useful if an
 /// existing analysis should be reused for empty tensor elimination.
-LogicalResult eliminateEmptyTensors(RewriterBase &rewriter, Operation *op,
-                                    OneShotAnalysisState &state);
+LogicalResult eliminateEmptyTensors(
+    RewriterBase &rewriter, Operation *op, OneShotAnalysisState &state,
+    ControlBuildSubsetExtractionFn subsetsExtractionFn = buildSubsetExtraction);
 
 /// Within the given operation, hoist buffers from loops where possible. See
 /// "BufferLoopHoistingPass" for more information.
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/EmptyTensorElimination.cpp b/mlir/lib/Dialect/Bufferization/Transforms/EmptyTensorElimination.cpp
index abc0635a2cdff..98c3d8d0adc6d 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/EmptyTensorElimination.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/EmptyTensorElimination.cpp
@@ -93,8 +93,31 @@ findValidInsertionPoint(Operation *emptyTensorOp, Operation *user,
   return nullptr;
 }
 
+Value mlir::bufferization::buildSubsetExtraction(RewriterBase &rewriter,
+                                                 SubsetInsertionOpInterface op,
+                                                 tensor::EmptyOp emptyTensorOp,
+                                                 Operation *user) {
+
+  mlir::OpBuilder::InsertionGuard guard(rewriter);
+  // All values that are needed to create the replacement op.
+  SmallVector neededValues = op.getValuesNeededToBuildSubsetExtraction();
+  // Find a suitable insertion point. If no suitable insertion point
+  // for the replacement can be found, return an empty value to skip
+  // this replacement.
+  Operation *insertionPoint =
+      findValidInsertionPoint(emptyTensorOp, user, neededValues);
+  if (!insertionPoint)
+    return {};
+
+  rewriter.setInsertionPoint(insertionPoint);
+  Value replacement =
+      op.buildSubsetExtraction(rewriter, emptyTensorOp->getLoc());
+  return replacement;
+}
+
 LogicalResult mlir::bufferization::eliminateEmptyTensors(
-    RewriterBase &rewriter, Operation *op, OneShotAnalysisState &state) {
+    RewriterBase &rewriter, Operation *op, OneShotAnalysisState &state,
+    ControlBuildSubsetExtractionFn subsetsExtractionFn) {
   OpBuilder::InsertionGuard g(rewriter);
   llvm::DenseSet visitedOpOperands;
   op->walk([&](SubsetInsertionOpInterface op) {
@@ -105,10 +128,6 @@ LogicalResult mlir::bufferization::eliminateEmptyTensors(
     if (!state.isInPlace(source))
       return WalkResult::skip();
 
-    // All values that are needed to create the replacement op.
-    SmallVector neededValues =
-        op.getValuesNeededToBuildSubsetExtraction();
-
     // Find tensor.empty ops on the reverse SSA use-def chain. Only follow
     // equivalent tensors. I.e., stop when there are ops such as extract_slice
     // on the path.
@@ -129,8 +148,8 @@ LogicalResult mlir::bufferization::eliminateEmptyTensors(
         &visitedOpOperands);
 
     for (Value v : emptyTensors) {
-      Operation *emptyTensorOp = v.getDefiningOp();
-
+      auto emptyTensorOp = v.getDefiningOp();
+      assert(emptyTensorOp && "expected tensor.empty op");
       // Find the use to be replaced from the use-def chain.
       auto iter = llvm::find_if(
           visitedOpOperands, [&emptyTensorOp](OpOperand *opOperand) {
@@ -142,17 +161,7 @@ LogicalResult mlir::bufferization::eliminateEmptyTensors(
         continue;
       OpOperand *useToBeReplaced = *iter;
       Operation *user = useToBeReplaced->getOwner();
-
-      // Find a suitable insertion point. If no suitable insertion point for
-      // the replacement can be found, skip this replacement.
-      Operation *insertionPoint =
-          findValidInsertionPoint(emptyTensorOp, user, neededValues);
-      if (!insertionPoint)
-        continue;
-
-      rewriter.setInsertionPoint(insertionPoint);
-      Value replacement =
-          op.buildSubsetExtraction(rewriter, emptyTensorOp->getLoc());
+      auto replacement = subsetsExtractionFn(rewriter, op, emptyTensorOp, user);
       if (!replacement)
         continue;
       if (emptyTensorOp == replacement.getDefiningOp())

From e86b68ff560aaf5fc723eaa8d8418892b2456e12 Mon Sep 17 00:00:00 2001
From: Timm Baeder 
Date: Sat, 28 Dec 2024 14:07:01 +0100
Subject: [PATCH 136/567] [clang][bytecode] Add support for typeid pointers
 (#121251)

Add it as another kind of pointer, saving both a `Type*` for the result
of the typeid() expression as well as one for the type of the typeid
expression.
---
 clang/lib/AST/ByteCode/Compiler.cpp | 32 +++++++++++
 clang/lib/AST/ByteCode/Compiler.h   |  1 +
 clang/lib/AST/ByteCode/Interp.cpp   | 82 +++++++++++++++++++++++++++++
 clang/lib/AST/ByteCode/Interp.h     | 63 +++-------------------
 clang/lib/AST/ByteCode/Opcodes.td   |  4 ++
 clang/lib/AST/ByteCode/Pointer.cpp  | 16 ++++++
 clang/lib/AST/ByteCode/Pointer.h    | 25 +++++++--
 clang/test/AST/ByteCode/cxx2a.cpp   | 60 +++++++++++++++++++++
 8 files changed, 224 insertions(+), 59 deletions(-)

diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp
index 68c75b01e6f6d..036f9608bf3ca 100644
--- a/clang/lib/AST/ByteCode/Compiler.cpp
+++ b/clang/lib/AST/ByteCode/Compiler.cpp
@@ -3426,6 +3426,38 @@ bool Compiler::VisitBlockExpr(const BlockExpr *E) {
   return this->emitGetFnPtr(Func, E);
 }
 
+template 
+bool Compiler::VisitCXXTypeidExpr(const CXXTypeidExpr *E) {
+  const Type *TypeInfoType = E->getType().getTypePtr();
+
+  if (!E->isPotentiallyEvaluated()) {
+    if (DiscardResult)
+      return true;
+
+    if (E->isTypeOperand())
+      return this->emitGetTypeid(
+          E->getTypeOperand(Ctx.getASTContext()).getTypePtr(), TypeInfoType, E);
+    return this->emitGetTypeid(E->getExprOperand()->getType().getTypePtr(),
+                               TypeInfoType, E);
+  }
+
+  // Otherwise, we need to evaluate the expression operand.
+  assert(E->getExprOperand());
+  assert(E->getExprOperand()->isLValue());
+
+  if (!Ctx.getLangOpts().CPlusPlus20 && !this->emitDiagTypeid(E))
+    return false;
+
+  if (!this->visit(E->getExprOperand()))
+    return false;
+
+  if (!this->emitGetTypeidPtr(TypeInfoType, E))
+    return false;
+  if (DiscardResult)
+    return this->emitPopPtr(E);
+  return true;
+}
+
 template 
 bool Compiler::VisitExpressionTraitExpr(const ExpressionTraitExpr *E) {
   assert(Ctx.getLangOpts().CPlusPlus);
diff --git a/clang/lib/AST/ByteCode/Compiler.h b/clang/lib/AST/ByteCode/Compiler.h
index 2a94f5ec76b6c..71765b18cb1a9 100644
--- a/clang/lib/AST/ByteCode/Compiler.h
+++ b/clang/lib/AST/ByteCode/Compiler.h
@@ -205,6 +205,7 @@ class Compiler : public ConstStmtVisitor, bool>,
   bool VisitCXXNewExpr(const CXXNewExpr *E);
   bool VisitCXXDeleteExpr(const CXXDeleteExpr *E);
   bool VisitBlockExpr(const BlockExpr *E);
+  bool VisitCXXTypeidExpr(const CXXTypeidExpr *E);
 
   // Statements.
   bool visitCompoundStmt(const CompoundStmt *S);
diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp
index 7c7752080746e..cb0ce886f6680 100644
--- a/clang/lib/AST/ByteCode/Interp.cpp
+++ b/clang/lib/AST/ByteCode/Interp.cpp
@@ -1154,6 +1154,53 @@ bool CheckLiteralType(InterpState &S, CodePtr OpPC, const Type *T) {
   return false;
 }
 
+static bool getField(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
+                     uint32_t Off) {
+  if (S.getLangOpts().CPlusPlus && S.inConstantContext() &&
+      !CheckNull(S, OpPC, Ptr, CSK_Field))
+    return false;
+
+  if (!CheckExtern(S, OpPC, Ptr))
+    return false;
+  if (!CheckRange(S, OpPC, Ptr, CSK_Field))
+    return false;
+  if (!CheckArray(S, OpPC, Ptr))
+    return false;
+  if (!CheckSubobject(S, OpPC, Ptr, CSK_Field))
+    return false;
+
+  if (Ptr.isIntegralPointer()) {
+    S.Stk.push(Ptr.asIntPointer().atOffset(S.getASTContext(), Off));
+    return true;
+  }
+
+  if (!Ptr.isBlockPointer()) {
+    // FIXME: The only time we (seem to) get here is when trying to access a
+    // field of a typeid pointer. In that case, we're supposed to diagnose e.g.
+    // `typeid(int).name`, but we currently diagnose `&typeid(int)`.
+    S.FFDiag(S.Current->getSource(OpPC),
+             diag::note_constexpr_access_unreadable_object)
+        << AK_Read << Ptr.toDiagnosticString(S.getASTContext());
+    return false;
+  }
+
+  if (Off > Ptr.block()->getSize())
+    return false;
+
+  S.Stk.push(Ptr.atField(Off));
+  return true;
+}
+
+bool GetPtrField(InterpState &S, CodePtr OpPC, uint32_t Off) {
+  const auto &Ptr = S.Stk.peek();
+  return getField(S, OpPC, Ptr, Off);
+}
+
+bool GetPtrFieldPop(InterpState &S, CodePtr OpPC, uint32_t Off) {
+  const auto &Ptr = S.Stk.pop();
+  return getField(S, OpPC, Ptr, Off);
+}
+
 static bool checkConstructor(InterpState &S, CodePtr OpPC, const Function *Func,
                              const Pointer &ThisPtr) {
   assert(Func->isConstructor());
@@ -1595,6 +1642,41 @@ bool CheckBitCast(InterpState &S, CodePtr OpPC, bool HasIndeterminateBits,
   return false;
 }
 
+bool GetTypeid(InterpState &S, CodePtr OpPC, const Type *TypePtr,
+               const Type *TypeInfoType) {
+  S.Stk.push(TypePtr, TypeInfoType);
+  return true;
+}
+
+bool GetTypeidPtr(InterpState &S, CodePtr OpPC, const Type *TypeInfoType) {
+  const auto &P = S.Stk.pop();
+
+  if (!P.isBlockPointer())
+    return false;
+
+  if (P.isDummy()) {
+    QualType StarThisType =
+        S.getASTContext().getLValueReferenceType(P.getType());
+    S.FFDiag(S.Current->getSource(OpPC),
+             diag::note_constexpr_polymorphic_unknown_dynamic_type)
+        << AK_TypeId
+        << P.toAPValue(S.getASTContext())
+               .getAsString(S.getASTContext(), StarThisType);
+    return false;
+  }
+
+  S.Stk.push(P.getType().getTypePtr(), TypeInfoType);
+  return true;
+}
+
+bool DiagTypeid(InterpState &S, CodePtr OpPC) {
+  const auto *E = cast(S.Current->getExpr(OpPC));
+  S.CCEDiag(E, diag::note_constexpr_typeid_polymorphic)
+      << E->getExprOperand()->getType()
+      << E->getExprOperand()->getSourceRange();
+  return false;
+}
+
 // https://github.com/llvm/llvm-project/issues/102513
 #if defined(_MSC_VER) && !defined(__clang__) && !defined(NDEBUG)
 #pragma optimize("", off)
diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h
index 8461d1e98f977..d2aec69072e04 100644
--- a/clang/lib/AST/ByteCode/Interp.h
+++ b/clang/lib/AST/ByteCode/Interp.h
@@ -1526,61 +1526,8 @@ inline bool GetPtrGlobal(InterpState &S, CodePtr OpPC, uint32_t I) {
 
 /// 1) Peeks a Pointer
 /// 2) Pushes Pointer.atField(Off) on the stack
-inline bool GetPtrField(InterpState &S, CodePtr OpPC, uint32_t Off) {
-  const Pointer &Ptr = S.Stk.peek();
-
-  if (S.getLangOpts().CPlusPlus && S.inConstantContext() &&
-      !CheckNull(S, OpPC, Ptr, CSK_Field))
-    return false;
-
-  if (!CheckExtern(S, OpPC, Ptr))
-    return false;
-  if (!CheckRange(S, OpPC, Ptr, CSK_Field))
-    return false;
-  if (!CheckArray(S, OpPC, Ptr))
-    return false;
-  if (!CheckSubobject(S, OpPC, Ptr, CSK_Field))
-    return false;
-
-  if (Ptr.isBlockPointer() && Off > Ptr.block()->getSize())
-    return false;
-
-  if (Ptr.isIntegralPointer()) {
-    S.Stk.push(Ptr.asIntPointer().atOffset(S.getASTContext(), Off));
-    return true;
-  }
-
-  S.Stk.push(Ptr.atField(Off));
-  return true;
-}
-
-inline bool GetPtrFieldPop(InterpState &S, CodePtr OpPC, uint32_t Off) {
-  const Pointer &Ptr = S.Stk.pop();
-
-  if (S.getLangOpts().CPlusPlus && S.inConstantContext() &&
-      !CheckNull(S, OpPC, Ptr, CSK_Field))
-    return false;
-
-  if (!CheckExtern(S, OpPC, Ptr))
-    return false;
-  if (!CheckRange(S, OpPC, Ptr, CSK_Field))
-    return false;
-  if (!CheckArray(S, OpPC, Ptr))
-    return false;
-  if (!CheckSubobject(S, OpPC, Ptr, CSK_Field))
-    return false;
-
-  if (Ptr.isBlockPointer() && Off > Ptr.block()->getSize())
-    return false;
-
-  if (Ptr.isIntegralPointer()) {
-    S.Stk.push(Ptr.asIntPointer().atOffset(S.getASTContext(), Off));
-    return true;
-  }
-
-  S.Stk.push(Ptr.atField(Off));
-  return true;
-}
+bool GetPtrField(InterpState &S, CodePtr OpPC, uint32_t Off);
+bool GetPtrFieldPop(InterpState &S, CodePtr OpPC, uint32_t Off);
 
 inline bool GetPtrThisField(InterpState &S, CodePtr OpPC, uint32_t Off) {
   if (S.checkingPotentialConstantExpression())
@@ -3087,6 +3034,12 @@ inline bool BitCast(InterpState &S, CodePtr OpPC) {
   return true;
 }
 
+/// Typeid support.
+bool GetTypeid(InterpState &S, CodePtr OpPC, const Type *TypePtr,
+               const Type *TypeInfoType);
+bool GetTypeidPtr(InterpState &S, CodePtr OpPC, const Type *TypeInfoType);
+bool DiagTypeid(InterpState &S, CodePtr OpPC);
+
 //===----------------------------------------------------------------------===//
 // Read opcode arguments
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/AST/ByteCode/Opcodes.td b/clang/lib/AST/ByteCode/Opcodes.td
index 123c21fa43ece..4b0c902ab2926 100644
--- a/clang/lib/AST/ByteCode/Opcodes.td
+++ b/clang/lib/AST/ByteCode/Opcodes.td
@@ -850,3 +850,7 @@ def BitCastPrim : Opcode {
 }
 
 def BitCast : Opcode;
+
+def GetTypeid : Opcode { let Args = [ArgTypePtr, ArgTypePtr]; }
+def GetTypeidPtr : Opcode { let Args = [ArgTypePtr]; }
+def DiagTypeid : Opcode;
diff --git a/clang/lib/AST/ByteCode/Pointer.cpp b/clang/lib/AST/ByteCode/Pointer.cpp
index 01e642310aad3..da202598b363a 100644
--- a/clang/lib/AST/ByteCode/Pointer.cpp
+++ b/clang/lib/AST/ByteCode/Pointer.cpp
@@ -96,6 +96,8 @@ void Pointer::operator=(const Pointer &P) {
     PointeeStorage.Int = P.PointeeStorage.Int;
   } else if (P.isFunctionPointer()) {
     PointeeStorage.Fn = P.PointeeStorage.Fn;
+  } else if (P.isTypeidPointer()) {
+    PointeeStorage.Typeid = P.PointeeStorage.Typeid;
   } else {
     assert(false && "Unhandled storage kind");
   }
@@ -132,6 +134,8 @@ void Pointer::operator=(Pointer &&P) {
     PointeeStorage.Int = P.PointeeStorage.Int;
   } else if (P.isFunctionPointer()) {
     PointeeStorage.Fn = P.PointeeStorage.Fn;
+  } else if (P.isTypeidPointer()) {
+    PointeeStorage.Typeid = P.PointeeStorage.Typeid;
   } else {
     assert(false && "Unhandled storage kind");
   }
@@ -151,6 +155,14 @@ APValue Pointer::toAPValue(const ASTContext &ASTCtx) const {
   if (isFunctionPointer())
     return asFunctionPointer().toAPValue(ASTCtx);
 
+  if (isTypeidPointer()) {
+    TypeInfoLValue TypeInfo(PointeeStorage.Typeid.TypePtr);
+    return APValue(
+        APValue::LValueBase::getTypeInfo(
+            TypeInfo, QualType(PointeeStorage.Typeid.TypeInfoType, 0)),
+        CharUnits::Zero(), APValue::NoLValuePath{});
+  }
+
   // Build the lvalue base from the block.
   const Descriptor *Desc = getDeclDesc();
   APValue::LValueBase Base;
@@ -304,6 +316,8 @@ void Pointer::print(llvm::raw_ostream &OS) const {
   case Storage::Fn:
     OS << "(Fn) { " << asFunctionPointer().getFunction() << " + " << Offset
        << " }";
+  case Storage::Typeid:
+    OS << "(Typeid)";
   }
 }
 
@@ -450,6 +464,8 @@ bool Pointer::hasSameBase(const Pointer &A, const Pointer &B) {
     return true;
   if (A.isFunctionPointer() && B.isFunctionPointer())
     return true;
+  if (A.isTypeidPointer() && B.isTypeidPointer())
+    return true;
 
   if (A.isIntegralPointer() || B.isIntegralPointer())
     return A.getSource() == B.getSource();
diff --git a/clang/lib/AST/ByteCode/Pointer.h b/clang/lib/AST/ByteCode/Pointer.h
index 0d467c2abf083..ef03c12e86c10 100644
--- a/clang/lib/AST/ByteCode/Pointer.h
+++ b/clang/lib/AST/ByteCode/Pointer.h
@@ -49,7 +49,12 @@ struct IntPointer {
   IntPointer baseCast(const ASTContext &ASTCtx, unsigned BaseOffset) const;
 };
 
-enum class Storage { Block, Int, Fn };
+struct TypeidPointer {
+  const Type *TypePtr;
+  const Type *TypeInfoType;
+};
+
+enum class Storage { Block, Int, Fn, Typeid };
 
 /// A pointer to a memory block, live or dead.
 ///
@@ -107,6 +112,11 @@ class Pointer {
       : Offset(Offset), StorageKind(Storage::Fn) {
     PointeeStorage.Fn = FunctionPointer(F);
   }
+  Pointer(const Type *TypePtr, const Type *TypeInfoType, uint64_t Offset = 0)
+      : Offset(Offset), StorageKind(Storage::Typeid) {
+    PointeeStorage.Typeid.TypePtr = TypePtr;
+    PointeeStorage.Typeid.TypeInfoType = TypeInfoType;
+  }
   Pointer(Block *Pointee, unsigned Base, uint64_t Offset);
   ~Pointer();
 
@@ -263,6 +273,8 @@ class Pointer {
       return asBlockPointer().Pointee == nullptr;
     if (isFunctionPointer())
       return asFunctionPointer().isZero();
+    if (isTypeidPointer())
+      return false;
     assert(isIntegralPointer());
     return asIntPointer().Value == 0 && Offset == 0;
   }
@@ -284,7 +296,7 @@ class Pointer {
   const Descriptor *getDeclDesc() const {
     if (isIntegralPointer())
       return asIntPointer().Desc;
-    if (isFunctionPointer())
+    if (isFunctionPointer() || isTypeidPointer())
       return nullptr;
 
     assert(isBlockPointer());
@@ -337,6 +349,9 @@ class Pointer {
 
   /// Returns the type of the innermost field.
   QualType getType() const {
+    if (isTypeidPointer())
+      return QualType(PointeeStorage.Typeid.TypeInfoType, 0);
+
     if (inPrimitiveArray() && Offset != asBlockPointer().Base) {
       // Unfortunately, complex and vector types are not array types in clang,
       // but they are for us.
@@ -437,7 +452,7 @@ class Pointer {
   }
   /// Pointer points directly to a block.
   bool isRoot() const {
-    if (isZero() || isIntegralPointer())
+    if (isZero() || !isBlockPointer())
       return true;
     return (asBlockPointer().Base ==
                 asBlockPointer().Pointee->getDescriptor()->getMetadataSize() ||
@@ -467,6 +482,7 @@ class Pointer {
   bool isBlockPointer() const { return StorageKind == Storage::Block; }
   bool isIntegralPointer() const { return StorageKind == Storage::Int; }
   bool isFunctionPointer() const { return StorageKind == Storage::Fn; }
+  bool isTypeidPointer() const { return StorageKind == Storage::Typeid; }
 
   /// Returns the record descriptor of a class.
   const Record *getRecord() const { return getFieldDesc()->ElemRecord; }
@@ -605,7 +621,7 @@ class Pointer {
 
   /// Checks if the index is one past end.
   bool isOnePastEnd() const {
-    if (isIntegralPointer() || isFunctionPointer())
+    if (!isBlockPointer())
       return false;
 
     if (!asBlockPointer().Pointee)
@@ -746,6 +762,7 @@ class Pointer {
     BlockPointer BS;
     IntPointer Int;
     FunctionPointer Fn;
+    TypeidPointer Typeid;
   } PointeeStorage;
   Storage StorageKind = Storage::Int;
 };
diff --git a/clang/test/AST/ByteCode/cxx2a.cpp b/clang/test/AST/ByteCode/cxx2a.cpp
index eaae978e01184..f6006881cee4d 100644
--- a/clang/test/AST/ByteCode/cxx2a.cpp
+++ b/clang/test/AST/ByteCode/cxx2a.cpp
@@ -110,3 +110,63 @@ namespace DtorOrder {
   }
   static_assert(check_abnormal_termination());
 }
+
+namespace std {
+  struct type_info;
+}
+
+namespace TypeId {
+  struct A {
+    const std::type_info &ti = typeid(*this);
+  };
+  struct A2 : A {};
+  static_assert(&A().ti == &typeid(A));
+  static_assert(&typeid((A2())) == &typeid(A2));
+  extern A2 extern_a2;
+  static_assert(&typeid(extern_a2) == &typeid(A2));
+
+  constexpr A2 a2;
+  constexpr const A &a1 = a2;
+  static_assert(&typeid(a1) == &typeid(A));
+
+  struct B {
+    virtual void f();
+    const std::type_info &ti1 = typeid(*this);
+  };
+  struct B2 : B {
+    const std::type_info &ti2 = typeid(*this);
+  };
+  static_assert(&B2().ti1 == &typeid(B));
+  static_assert(&B2().ti2 == &typeid(B2));
+  extern B2 extern_b2;
+  static_assert(&typeid(extern_b2) == &typeid(B2)); // both-error {{constant expression}} \
+                                                    // both-note{{typeid applied to object 'extern_b2' whose dynamic type is not constant}}
+
+
+  constexpr B2 b2;
+  constexpr const B &b1 = b2;
+  static_assert(&typeid(b1) == &typeid(B2));
+
+  constexpr bool side_effects() {
+    // Not polymorphic nor a glvalue.
+    bool OK = true;
+    (void)typeid(OK = false, A2()); // both-warning {{has no effect}}
+    if (!OK) return false;
+
+    // Not polymorphic.
+    A2 a2;
+    (void)typeid(OK = false, a2); // both-warning {{has no effect}}
+    if (!OK) return false;
+
+    // Not a glvalue.
+    (void)typeid(OK = false, B2()); // both-warning {{has no effect}}
+    if (!OK) return false;
+
+    // Polymorphic glvalue: operand evaluated.
+    OK = false;
+    B2 b2;
+    (void)typeid(OK = true, b2); // both-warning {{will be evaluated}}
+    return OK;
+  }
+  static_assert(side_effects());
+}

From 3496e96f78c46f5b94c1892f97c470fd89293795 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= 
Date: Sat, 28 Dec 2024 14:17:06 +0100
Subject: [PATCH 137/567] [clang][bytecode] Add a missing break

---
 clang/lib/AST/ByteCode/Pointer.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/clang/lib/AST/ByteCode/Pointer.cpp b/clang/lib/AST/ByteCode/Pointer.cpp
index da202598b363a..ec4756fe4f87d 100644
--- a/clang/lib/AST/ByteCode/Pointer.cpp
+++ b/clang/lib/AST/ByteCode/Pointer.cpp
@@ -316,6 +316,7 @@ void Pointer::print(llvm::raw_ostream &OS) const {
   case Storage::Fn:
     OS << "(Fn) { " << asFunctionPointer().getFunction() << " + " << Offset
        << " }";
+    break;
   case Storage::Typeid:
     OS << "(Typeid)";
   }

From 088d636136a42d738e15f2f0a85e1b77a8f2de35 Mon Sep 17 00:00:00 2001
From: adam-bzowski 
Date: Sat, 28 Dec 2024 18:21:47 +0100
Subject: [PATCH 138/567] [ValueTracking] Fix a bug for signed min-max clamping
 (#121206)

Correctly handle the case where the clamp is over the full range.
This fixes an issue introduced in an earlier signed min-max clamping change
(the range could be empty, so ConstantRange::getNonEmpty must be used).
---
 llvm/lib/Analysis/ValueTracking.cpp           |   3 +-
 .../knownbits-trunc-with-min-max-clamp.ll     | 196 ++++++++++++++----
 2 files changed, 161 insertions(+), 38 deletions(-)

diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 78fec25a6e502..2f6e869ae7b73 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -1119,7 +1119,8 @@ static void unionWithMinMaxIntrinsicClamp(const IntrinsicInst *II,
                                           KnownBits &Known) {
   const APInt *CLow, *CHigh;
   if (isSignedMinMaxIntrinsicClamp(II, CLow, CHigh))
-    Known = Known.unionWith(ConstantRange(*CLow, *CHigh + 1).toKnownBits());
+    Known = Known.unionWith(
+        ConstantRange::getNonEmpty(*CLow, *CHigh + 1).toKnownBits());
 }
 
 static void computeKnownBitsFromOperator(const Operator *I,
diff --git a/llvm/test/Analysis/ValueTracking/knownbits-trunc-with-min-max-clamp.ll b/llvm/test/Analysis/ValueTracking/knownbits-trunc-with-min-max-clamp.ll
index 1ff8a41b3459b..52f12a6df9193 100644
--- a/llvm/test/Analysis/ValueTracking/knownbits-trunc-with-min-max-clamp.ll
+++ b/llvm/test/Analysis/ValueTracking/knownbits-trunc-with-min-max-clamp.ll
@@ -2,7 +2,8 @@
 ; RUN: opt < %s -passes=aggressive-instcombine -S | FileCheck %s
 
 ; The LIT tests rely on i32, i16 and i8 being valid machine types.
-target datalayout = "n8:16:32"
+; The bounds checking tests require also i64 and i128.
+target datalayout = "n8:16:32:64:128"
 
 ; This LIT test checks if TruncInstCombine pass correctly recognizes the
 ; constraints from a signed min-max clamp. The clamp is a sequence of smin and
@@ -12,6 +13,11 @@ target datalayout = "n8:16:32"
 ; of smin and smax:
 ; a) y = smax(smin(x, upper_limit), lower_limit)
 ; b) y = smin(smax(x, lower_limit), upper_limit)
+;
+; The clamp is used in TruncInstCombine.cpp pass (as part of aggressive-instcombine)
+; to optimize extensions and truncations of lshr. This is what is tested here.
+; The pass also optimizes extensions and truncations of other binary operators,
+; but in such cases the smin-smax clamp may not be used.
 
 define i8 @test_0a(i16 %x) {
 ; CHECK-LABEL: define i8 @test_0a(
@@ -47,6 +53,8 @@ define i8 @test_0b(i16 %x) {
   ret i8 %b.trunc
 }
 
+; The following two tests contain add instead of lshr.
+; The optimization works here as well.
 define i8 @test_1a(i16 %x) {
 ; CHECK-LABEL: define i8 @test_1a(
 ; CHECK-SAME: i16 [[X:%.*]]) {
@@ -81,19 +89,23 @@ define i8 @test_1b(i16 %x) {
   ret i8 %b.trunc
 }
 
+; Tests for clamping with negative min and max.
+
+; With sext no optimization occurs.
 define i8 @test_2a(i16 %x) {
 ; CHECK-LABEL: define i8 @test_2a(
 ; CHECK-SAME: i16 [[X:%.*]]) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[X]], i16 -1)
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP1]], i16 -31)
-; CHECK-NEXT:    [[A:%.*]] = trunc i16 [[TMP2]] to i8
-; CHECK-NEXT:    [[B:%.*]] = add i8 [[A]], 2
-; CHECK-NEXT:    ret i8 [[B]]
+; CHECK-NEXT:    [[A:%.*]] = sext i16 [[TMP2]] to i32
+; CHECK-NEXT:    [[B:%.*]] = lshr i32 [[A]], 2
+; CHECK-NEXT:    [[B_TRUNC:%.*]] = trunc i32 [[B]] to i8
+; CHECK-NEXT:    ret i8 [[B_TRUNC]]
 ;
   %1 = tail call i16 @llvm.smin.i16(i16 %x, i16 -1)
   %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 -31)
   %a = sext i16 %2 to i32
-  %b = add i32 %a, 2
+  %b = lshr i32 %a, 2
   %b.trunc = trunc i32 %b to i8
   ret i8 %b.trunc
 }
@@ -103,31 +115,69 @@ define i8 @test_2b(i16 %x) {
 ; CHECK-SAME: i16 [[X:%.*]]) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smax.i16(i16 [[X]], i16 -31)
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP1]], i16 -1)
-; CHECK-NEXT:    [[A:%.*]] = trunc i16 [[TMP2]] to i8
-; CHECK-NEXT:    [[B:%.*]] = add i8 [[A]], 2
-; CHECK-NEXT:    ret i8 [[B]]
+; CHECK-NEXT:    [[A:%.*]] = sext i16 [[TMP2]] to i32
+; CHECK-NEXT:    [[B:%.*]] = lshr i32 [[A]], 2
+; CHECK-NEXT:    [[B_TRUNC:%.*]] = trunc i32 [[B]] to i8
+; CHECK-NEXT:    ret i8 [[B_TRUNC]]
 ;
   %1 = tail call i16 @llvm.smax.i16(i16 %x, i16 -31)
   %2 = tail call i16 @llvm.smin.i16(i16 %1, i16 -1)
   %a = sext i16 %2 to i32
-  %b = add i32 %a, 2
+  %b = lshr i32 %a, 2
+  %b.trunc = trunc i32 %b to i8
+  ret i8 %b.trunc
+}
+
+; With zext the optimization occurs.
+define i8 @test_2c(i16 %x) {
+; CHECK-LABEL: define i8 @test_2c(
+; CHECK-SAME: i16 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[X]], i16 -1)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP1]], i16 -31)
+; CHECK-NEXT:    [[B:%.*]] = lshr i16 [[TMP2]], 2
+; CHECK-NEXT:    [[B_TRUNC:%.*]] = trunc i16 [[B]] to i8
+; CHECK-NEXT:    ret i8 [[B_TRUNC]]
+;
+  %1 = tail call i16 @llvm.smin.i16(i16 %x, i16 -1)
+  %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 -31)
+  %a = zext i16 %2 to i32
+  %b = lshr i32 %a, 2
   %b.trunc = trunc i32 %b to i8
   ret i8 %b.trunc
 }
 
+define i8 @test_2d(i16 %x) {
+; CHECK-LABEL: define i8 @test_2d(
+; CHECK-SAME: i16 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smax.i16(i16 [[X]], i16 -31)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP1]], i16 -1)
+; CHECK-NEXT:    [[B:%.*]] = lshr i16 [[TMP2]], 2
+; CHECK-NEXT:    [[B_TRUNC:%.*]] = trunc i16 [[B]] to i8
+; CHECK-NEXT:    ret i8 [[B_TRUNC]]
+;
+  %1 = tail call i16 @llvm.smax.i16(i16 %x, i16 -31)
+  %2 = tail call i16 @llvm.smin.i16(i16 %1, i16 -1)
+  %a = zext i16 %2 to i32
+  %b = lshr i32 %a, 2
+  %b.trunc = trunc i32 %b to i8
+  ret i8 %b.trunc
+}
+
+; Tests for clamping with mixed-signed min and max.
+; With zext the optimization occurs.
 define i8 @test_3a(i16 %x) {
 ; CHECK-LABEL: define i8 @test_3a(
 ; CHECK-SAME: i16 [[X:%.*]]) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[X]], i16 31)
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP1]], i16 -31)
-; CHECK-NEXT:    [[A:%.*]] = trunc i16 [[TMP2]] to i8
-; CHECK-NEXT:    [[B:%.*]] = add i8 [[A]], 2
-; CHECK-NEXT:    ret i8 [[B]]
+; CHECK-NEXT:    [[B:%.*]] = lshr i16 [[TMP2]], 2
+; CHECK-NEXT:    [[B_TRUNC:%.*]] = trunc i16 [[B]] to i8
+; CHECK-NEXT:    ret i8 [[B_TRUNC]]
 ;
   %1 = tail call i16 @llvm.smin.i16(i16 %x, i16 31)
   %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 -31)
-  %a = sext i16 %2 to i32
-  %b = add i32 %a, 2
+  %a = zext i16 %2 to i32
+  %b = lshr i32 %a, 2
   %b.trunc = trunc i32 %b to i8
   ret i8 %b.trunc
 }
@@ -137,31 +187,32 @@ define i8 @test_3b(i16 %x) {
 ; CHECK-SAME: i16 [[X:%.*]]) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smax.i16(i16 [[X]], i16 -31)
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP1]], i16 31)
-; CHECK-NEXT:    [[A:%.*]] = trunc i16 [[TMP2]] to i8
-; CHECK-NEXT:    [[B:%.*]] = add i8 [[A]], 2
-; CHECK-NEXT:    ret i8 [[B]]
+; CHECK-NEXT:    [[B:%.*]] = lshr i16 [[TMP2]], 2
+; CHECK-NEXT:    [[B_TRUNC:%.*]] = trunc i16 [[B]] to i8
+; CHECK-NEXT:    ret i8 [[B_TRUNC]]
 ;
   %1 = tail call i16 @llvm.smax.i16(i16 %x, i16 -31)
   %2 = tail call i16 @llvm.smin.i16(i16 %1, i16 31)
-  %a = sext i16 %2 to i32
-  %b = add i32 %a, 2
+  %a = zext i16 %2 to i32
+  %b = lshr i32 %a, 2
   %b.trunc = trunc i32 %b to i8
   ret i8 %b.trunc
 }
 
+; Optimizations with vector types.
 define <16 x i8> @test_vec_1a(<16 x i16> %x) {
 ; CHECK-LABEL: define <16 x i8> @test_vec_1a(
 ; CHECK-SAME: <16 x i16> [[X:%.*]]) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <16 x i16> @llvm.smin.v16i16(<16 x i16> [[X]], <16 x i16> splat (i16 127))
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> [[TMP1]], <16 x i16> zeroinitializer)
 ; CHECK-NEXT:    [[A:%.*]] = trunc <16 x i16> [[TMP2]] to <16 x i8>
-; CHECK-NEXT:    [[B:%.*]] = add <16 x i8> [[A]], splat (i8 2)
+; CHECK-NEXT:    [[B:%.*]] = lshr <16 x i8> [[A]], splat (i8 2)
 ; CHECK-NEXT:    ret <16 x i8> [[B]]
 ;
   %1 = tail call <16 x i16> @llvm.smin.v16i16(<16 x i16> %x, <16 x i16> splat (i16 127))
   %2 = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> %1, <16 x i16> zeroinitializer)
   %a = sext <16 x i16> %2 to <16 x i32>
-  %b = add <16 x i32> %a, splat (i32 2)
+  %b = lshr <16 x i32> %a, splat (i32 2)
   %b.trunc = trunc <16 x i32> %b to <16 x i8>
   ret <16 x i8> %b.trunc
 }
@@ -172,13 +223,13 @@ define <16 x i8> @test_vec_1b(<16 x i16> %x) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> [[X]], <16 x i16> zeroinitializer)
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call <16 x i16> @llvm.smin.v16i16(<16 x i16> [[TMP1]], <16 x i16> splat (i16 127))
 ; CHECK-NEXT:    [[A:%.*]] = trunc <16 x i16> [[TMP2]] to <16 x i8>
-; CHECK-NEXT:    [[B:%.*]] = add <16 x i8> [[A]], splat (i8 2)
+; CHECK-NEXT:    [[B:%.*]] = lshr <16 x i8> [[A]], splat (i8 2)
 ; CHECK-NEXT:    ret <16 x i8> [[B]]
 ;
   %1 = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> %x, <16 x i16> zeroinitializer)
   %2 = tail call <16 x i16> @llvm.smin.v16i16(<16 x i16> %1, <16 x i16> splat (i16 127))
   %a = sext <16 x i16> %2 to <16 x i32>
-  %b = add <16 x i32> %a, splat (i32 2)
+  %b = lshr <16 x i32> %a, splat (i32 2)
   %b.trunc = trunc <16 x i32> %b to <16 x i8>
   ret <16 x i8> %b.trunc
 }
@@ -217,14 +268,14 @@ define i8 @test_bounds_1(i16 %x) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[X]], i16 127)
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP1]], i16 0)
 ; CHECK-NEXT:    [[A:%.*]] = trunc i16 [[TMP2]] to i8
-; CHECK-NEXT:    [[SHR:%.*]] = ashr i8 [[A]], 7
-; CHECK-NEXT:    ret i8 [[SHR]]
+; CHECK-NEXT:    [[B:%.*]] = lshr i8 [[A]], 7
+; CHECK-NEXT:    ret i8 [[B]]
 ;
   %1 = tail call i16 @llvm.smin.i16(i16 %x, i16 127)
   %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 0)
   %a = sext i16 %2 to i32
-  %shr = ashr i32 %a, 7
-  %b.trunc = trunc i32 %shr to i8
+  %b = lshr i32 %a, 7
+  %b.trunc = trunc i32 %b to i8
   ret i8 %b.trunc
 }
 
@@ -234,15 +285,15 @@ define i8 @test_bounds_2(i16 %x) {
 ; CHECK-SAME: i16 [[X:%.*]]) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[X]], i16 128)
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP1]], i16 0)
-; CHECK-NEXT:    [[SHR:%.*]] = ashr i16 [[TMP2]], 7
-; CHECK-NEXT:    [[B_TRUNC:%.*]] = trunc i16 [[SHR]] to i8
-; CHECK-NEXT:    ret i8 [[B_TRUNC]]
+; CHECK-NEXT:    [[A:%.*]] = trunc i16 [[TMP2]] to i8
+; CHECK-NEXT:    [[B:%.*]] = lshr i8 [[A]], 7
+; CHECK-NEXT:    ret i8 [[B]]
 ;
   %1 = tail call i16 @llvm.smin.i16(i16 %x, i16 128)
   %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 0)
   %a = sext i16 %2 to i32
-  %shr = ashr i32 %a, 7
-  %b.trunc = trunc i32 %shr to i8
+  %b = lshr i32 %a, 7
+  %b.trunc = trunc i32 %b to i8
   ret i8 %b.trunc
 }
 
@@ -253,14 +304,85 @@ define i8 @test_bounds_3(i16 %x) {
 ; CHECK-SAME: i16 [[X:%.*]]) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[X]], i16 32767)
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP1]], i16 32752)
-; CHECK-NEXT:    [[A:%.*]] = trunc i16 [[TMP2]] to i8
-; CHECK-NEXT:    [[AND:%.*]] = and i8 [[A]], -1
-; CHECK-NEXT:    ret i8 [[AND]]
+; CHECK-NEXT:    [[B:%.*]] = lshr i16 [[TMP2]], 2
+; CHECK-NEXT:    [[B_TRUNC:%.*]] = trunc i16 [[B]] to i8
+; CHECK-NEXT:    ret i8 [[B_TRUNC]]
 ;
   %1 = tail call i16 @llvm.smin.i16(i16 %x, i16 32767)
   %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 32752)
   %a = sext i16 %2 to i32
-  %and = and i32 %a, 255
-  %b.trunc = trunc i32 %and to i8
+  %b = lshr i32 %a, 2
+  %b.trunc = trunc i32 %b to i8
+  ret i8 %b.trunc
+}
+
+; Here min = 128 is greater than max = 0.
+define i8 @test_bounds_4(i16 %x) {
+; CHECK-LABEL: define i8 @test_bounds_4(
+; CHECK-SAME: i16 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[X]], i16 0)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP1]], i16 128)
+; CHECK-NEXT:    [[B:%.*]] = lshr i16 [[TMP2]], 2
+; CHECK-NEXT:    [[B_TRUNC:%.*]] = trunc i16 [[B]] to i8
+; CHECK-NEXT:    ret i8 [[B_TRUNC]]
+;
+  %1 = tail call i16 @llvm.smin.i16(i16 %x, i16 0)
+  %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 128)
+  %a = sext i16 %2 to i32
+  %b = lshr i32 %a, 2
+  %b.trunc = trunc i32 %b to i8
+  ret i8 %b.trunc
+}
+
+; The following 3 tests check the situation where min and max are minimal and
+; maximal signed values. No transformations should occur here.
+define i8 @test_bounds_5(i16 %x) {
+; CHECK-LABEL: define i8 @test_bounds_5(
+; CHECK-SAME: i16 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[X]], i16 32767)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP1]], i16 -32768)
+; CHECK-NEXT:    [[B:%.*]] = lshr i16 [[TMP2]], 2
+; CHECK-NEXT:    [[B_TRUNC:%.*]] = trunc i16 [[B]] to i8
+; CHECK-NEXT:    ret i8 [[B_TRUNC]]
+;
+  %1 = tail call i16 @llvm.smin.i16(i16 %x, i16 32767)
+  %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 -32768)
+  %a = zext i16 %2 to i32
+  %b = lshr i32 %a, 2
+  %b.trunc = trunc i32 %b to i8
+  ret i8 %b.trunc
+}
+
+define i8 @test_bounds_6(i32 %x) {
+; CHECK-LABEL: define i8 @test_bounds_6(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.smin.i32(i32 [[X]], i32 2147483647)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i32 @llvm.smax.i32(i32 [[TMP1]], i32 -2147483648)
+; CHECK-NEXT:    [[B:%.*]] = lshr i32 [[TMP2]], 2
+; CHECK-NEXT:    [[B_TRUNC:%.*]] = trunc i32 [[B]] to i8
+; CHECK-NEXT:    ret i8 [[B_TRUNC]]
+;
+  %1 = tail call i32 @llvm.smin.i32(i32 %x, i32 2147483647)
+  %2 = tail call i32 @llvm.smax.i32(i32 %1, i32 -2147483648)
+  %a = zext i32 %2 to i64
+  %b = lshr i64 %a, 2
+  %b.trunc = trunc i64 %b to i8
+  ret i8 %b.trunc
+}
+
+define i8 @test_bounds_7(i64 %x) {
+; CHECK-LABEL: define i8 @test_bounds_7(
+; CHECK-SAME: i64 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.smin.i64(i64 [[X]], i64 9223372036854775807)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP1]], i64 -9223372036854775808)
+; CHECK-NEXT:    [[B:%.*]] = lshr i64 [[TMP2]], 2
+; CHECK-NEXT:    [[B_TRUNC:%.*]] = trunc i64 [[B]] to i8
+; CHECK-NEXT:    ret i8 [[B_TRUNC]]
+;
+  %1 = tail call i64 @llvm.smin.i64(i64 %x, i64 9223372036854775807)
+  %2 = tail call i64 @llvm.smax.i64(i64 %1, i64 -9223372036854775808)
+  %a = zext i64 %2 to i128
+  %b = lshr i128 %a, 2
+  %b.trunc = trunc i128 %b to i8
   ret i8 %b.trunc
 }

From 4a6fcc17c6d5ab9bf2b9629b49fce607a4eb4e25 Mon Sep 17 00:00:00 2001
From: Maksim Levental 
Date: Sat, 28 Dec 2024 10:42:16 -0800
Subject: [PATCH 139/567] [mlir][emitc] DCE unimplemented decls (#121253)

---
 mlir/include/mlir/Dialect/EmitC/IR/EmitC.td | 2 --
 1 file changed, 2 deletions(-)

diff --git a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td
index fc5a33541533a..729a573b71c97 100644
--- a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td
+++ b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td
@@ -1305,8 +1305,6 @@ def EmitC_IfOp : EmitC_Op<"if",
       Block* body = getBody(1);
       return OpBuilder::atBlockEnd(body, listener);
     }
-    Block* thenBlock();
-    Block* elseBlock();
   }];
   let hasCustomAssemblyFormat = 1;
 }

From f1bc3afb6cb07d6cc9bb3dce963130a879978aa1 Mon Sep 17 00:00:00 2001
From: Maksim Levental 
Date: Sat, 28 Dec 2024 12:53:05 -0800
Subject: [PATCH 140/567] [mlir][scf] DCE unimplemented decls in TDs (#121237)

More dead code in headers...
---
 mlir/include/mlir/Dialect/SCF/IR/SCF.h | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/mlir/include/mlir/Dialect/SCF/IR/SCF.h b/mlir/include/mlir/Dialect/SCF/IR/SCF.h
index b62c941797947..ba648181daecb 100644
--- a/mlir/include/mlir/Dialect/SCF/IR/SCF.h
+++ b/mlir/include/mlir/Dialect/SCF/IR/SCF.h
@@ -40,12 +40,6 @@ void buildTerminatedBody(OpBuilder &builder, Location loc);
 namespace mlir {
 namespace scf {
 
-// Insert `loop.yield` at the end of the only region's only block if it
-// does not have a terminator already.  If a new `loop.yield` is inserted,
-// the location is specified by `loc`. If the region is empty, insert a new
-// block first.
-void ensureLoopTerminator(Region &region, Builder &builder, Location loc);
-
 /// Returns the loop parent of an induction variable. If the provided value is
 /// not an induction variable, then return nullptr.
 ForOp getForInductionVarOwner(Value val);

From 8e329593313bb792592529ee825a52683108df99 Mon Sep 17 00:00:00 2001
From: David Olsen 
Date: Sat, 28 Dec 2024 14:02:15 -0800
Subject: [PATCH 141/567] [CIR] Upstream initial attribute support (#121069)

Upstream several ClangIR-specific MLIR attributes, in particular
attributes for integer, floating-point, and null pointer constants.
These are the first ClangIR attributes to be upstreamed, so
infrastructure changes are included, such as the table-gen file
`CIRAttrs.td`.

Attributes can be used as the initial values for global variables. The
existing automated test global-var-simple.cpp includes initial values
for some of the global variables in the test.
---
 .../CIR/Dialect/Builder/CIRBaseBuilder.h      |  11 ++
 clang/include/clang/CIR/Dialect/IR/CIRAttrs.h |  36 ++++
 .../include/clang/CIR/Dialect/IR/CIRAttrs.td  | 142 +++++++++++++++
 .../include/clang/CIR/Dialect/IR/CIRDialect.h |   1 +
 clang/include/clang/CIR/Dialect/IR/CIROps.td  |  54 +++++-
 .../include/clang/CIR/Dialect/IR/CIRTypes.td  |  12 +-
 .../clang/CIR/Dialect/IR/CMakeLists.txt       |   3 +
 clang/lib/CIR/CodeGen/CIRGenModule.cpp        |  42 +++++
 clang/lib/CIR/Dialect/IR/CIRAttrs.cpp         | 172 +++++++++++++++++-
 clang/lib/CIR/Dialect/IR/CIRDialect.cpp       | 107 ++++++++++-
 clang/lib/CIR/Dialect/IR/CMakeLists.txt       |   1 +
 clang/lib/CIR/Interfaces/CMakeLists.txt       |   1 +
 clang/test/CIR/global-var-simple.cpp          |  24 +--
 13 files changed, 583 insertions(+), 23 deletions(-)
 create mode 100644 clang/include/clang/CIR/Dialect/IR/CIRAttrs.h
 create mode 100644 clang/include/clang/CIR/Dialect/IR/CIRAttrs.td

diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
index 0e414921324b7..b4a961de224aa 100644
--- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
+++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
@@ -9,7 +9,11 @@
 #ifndef LLVM_CLANG_CIR_DIALECT_BUILDER_CIRBASEBUILDER_H
 #define LLVM_CLANG_CIR_DIALECT_BUILDER_CIRBASEBUILDER_H
 
+#include "clang/CIR/Dialect/IR/CIRAttrs.h"
+
 #include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Types.h"
 
 namespace cir {
 
@@ -26,6 +30,13 @@ class CIRBaseBuilderTy : public mlir::OpBuilder {
   cir::PointerType getVoidPtrTy() {
     return getPointerTo(cir::VoidType::get(getContext()));
   }
+
+  mlir::TypedAttr getConstPtrAttr(mlir::Type type, int64_t value) {
+    auto valueAttr = mlir::IntegerAttr::get(
+        mlir::IntegerType::get(type.getContext(), 64), value);
+    return cir::ConstPtrAttr::get(
+        getContext(), mlir::cast<cir::PointerType>(type), valueAttr);
+  }
 };
 
 } // namespace cir
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.h b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.h
new file mode 100644
index 0000000000000..438fb7d09608d
--- /dev/null
+++ b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.h
@@ -0,0 +1,36 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the attributes in the CIR dialect.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_CIR_DIALECT_IR_CIRATTRS_H
+#define LLVM_CLANG_CIR_DIALECT_IR_CIRATTRS_H
+
+#include "clang/CIR/Dialect/IR/CIRTypes.h"
+
+#include "mlir/IR/Attributes.h"
+#include "mlir/IR/BuiltinAttributeInterfaces.h"
+
+#include "llvm/ADT/SmallVector.h"
+
+//===----------------------------------------------------------------------===//
+// CIR Dialect Attrs
+//===----------------------------------------------------------------------===//
+
+namespace clang {
+class FunctionDecl;
+class VarDecl;
+class RecordDecl;
+} // namespace clang
+
+#define GET_ATTRDEF_CLASSES
+#include "clang/CIR/Dialect/IR/CIROpsAttributes.h.inc"
+
+#endif // LLVM_CLANG_CIR_DIALECT_IR_CIRATTRS_H
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
new file mode 100644
index 0000000000000..bd1665e1ac1a0
--- /dev/null
+++ b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
@@ -0,0 +1,142 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the CIR dialect attributes.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_CIR_DIALECT_IR_CIRATTRS_TD
+#define LLVM_CLANG_CIR_DIALECT_IR_CIRATTRS_TD
+
+include "mlir/IR/BuiltinAttributeInterfaces.td"
+include "mlir/IR/EnumAttr.td"
+
+include "clang/CIR/Dialect/IR/CIRDialect.td"
+
+//===----------------------------------------------------------------------===//
+// CIR Attrs
+//===----------------------------------------------------------------------===//
+
+class CIR_Attr<string name, string attrMnemonic, list<Trait> traits = []>
+    : AttrDef<CIR_Dialect, name, traits> {
+  let mnemonic = attrMnemonic;
+}
+
+class CIRUnitAttr<string name, string attrMnemonic, list<Trait> traits = []>
+    : CIR_Attr<name, attrMnemonic, traits> {
+  let returnType = "bool";
+  let defaultValue = "false";
+  let valueType = NoneType;
+  let isOptional = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// IntegerAttr
+//===----------------------------------------------------------------------===//
+
+def IntAttr : CIR_Attr<"Int", "int", [TypedAttrInterface]> {
+  let summary = "An attribute containing an integer value";
+  let description = [{
+    An integer attribute is a literal attribute that represents an integral
+    value of the specified integer type.
+  }];
+  let parameters = (ins AttributeSelfTypeParameter<"">:$type,
+                        "llvm::APInt":$value);
+  let builders = [
+    AttrBuilderWithInferredContext<(ins "mlir::Type":$type,
+                                        "const llvm::APInt &":$value), [{
+      return $_get(type.getContext(), type, value);
+    }]>,
+    AttrBuilderWithInferredContext<(ins "mlir::Type":$type,
+                                        "int64_t":$value), [{
+      IntType intType = mlir::cast<IntType>(type);
+      mlir::APInt apValue(intType.getWidth(), value, intType.isSigned());
+      return $_get(intType.getContext(), intType, apValue);
+    }]>,
+  ];
+  let extraClassDeclaration = [{
+    int64_t getSInt() const { return getValue().getSExtValue(); }
+    uint64_t getUInt() const { return getValue().getZExtValue(); }
+    bool isNullValue() const { return getValue() == 0; }
+    uint64_t getBitWidth() const {
+      return mlir::cast<IntType>(getType()).getWidth();
+    }
+  }];
+  let genVerifyDecl = 1;
+  let hasCustomAssemblyFormat = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// FPAttr
+//===----------------------------------------------------------------------===//
+
+def FPAttr : CIR_Attr<"FP", "fp", [TypedAttrInterface]> {
+  let summary = "An attribute containing a floating-point value";
+  let description = [{
+    An fp attribute is a literal attribute that represents a floating-point
+    value of the specified floating-point type. Supporting only CIR FP types.
+  }];
+  let parameters = (ins
+    AttributeSelfTypeParameter<"", "::cir::CIRFPTypeInterface">:$type,
+    APFloatParameter<"">:$value
+  );
+  let builders = [
+    AttrBuilderWithInferredContext<(ins "mlir::Type":$type,
+                                        "const llvm::APFloat &":$value), [{
+      return $_get(type.getContext(), mlir::cast<cir::CIRFPTypeInterface>(type),
+                   value);
+    }]>,
+    AttrBuilder<(ins "mlir::Type":$type,
+                     "const llvm::APFloat &":$value), [{
+      return $_get($_ctxt, mlir::cast<cir::CIRFPTypeInterface>(type), value);
+    }]>,
+  ];
+  let extraClassDeclaration = [{
+    static FPAttr getZero(mlir::Type type);
+  }];
+  let genVerifyDecl = 1;
+
+  let assemblyFormat = [{
+    `<` custom<FloatLiteral>($value, ref($type)) `>`
+  }];
+}
+
+//===----------------------------------------------------------------------===//
+// ConstPtrAttr
+//===----------------------------------------------------------------------===//
+
+def ConstPtrAttr : CIR_Attr<"ConstPtr", "ptr", [TypedAttrInterface]> {
+  let summary = "Holds a constant pointer value";
+  let parameters = (ins
+    AttributeSelfTypeParameter<"", "::cir::PointerType">:$type,
+    "mlir::IntegerAttr":$value);
+  let description = [{
+    A pointer attribute is a literal attribute that represents an integral
+    value of a pointer type.
+  }];
+  let builders = [
+    AttrBuilderWithInferredContext<(ins "mlir::Type":$type,
+                                        "mlir::IntegerAttr":$value), [{
+      return $_get(type.getContext(), mlir::cast<cir::PointerType>(type),
+                   value);
+    }]>,
+    AttrBuilder<(ins "mlir::Type":$type,
+                     "mlir::IntegerAttr":$value), [{
+      return $_get($_ctxt, mlir::cast<cir::PointerType>(type), value);
+    }]>,
+  ];
+  let extraClassDeclaration = [{
+    bool isNullValue() const { return getValue().getInt() == 0; }
+  }];
+
+  let assemblyFormat = [{
+    `<` custom<ConstPtr>($value) `>`
+  }];
+}
+
+#endif // LLVM_CLANG_CIR_DIALECT_IR_CIRATTRS_TD
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRDialect.h b/clang/include/clang/CIR/Dialect/IR/CIRDialect.h
index 0b71bdad29a3a..683176b139ca4 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRDialect.h
+++ b/clang/include/clang/CIR/Dialect/IR/CIRDialect.h
@@ -26,6 +26,7 @@
 #include "mlir/Interfaces/MemorySlotInterfaces.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 
+#include "clang/CIR/Dialect/IR/CIRAttrs.h"
 #include "clang/CIR/Dialect/IR/CIROpsDialect.h.inc"
 
 // TableGen'erated files for MLIR dialects require that a macro be defined when
diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index 0d6c65ecf4102..b15e0415360ea 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -16,6 +16,7 @@
 
 include "clang/CIR/Dialect/IR/CIRDialect.td"
 include "clang/CIR/Dialect/IR/CIRTypes.td"
+include "clang/CIR/Dialect/IR/CIRAttrs.td"
 
 include "mlir/IR/BuiltinAttributeInterfaces.td"
 include "mlir/IR/EnumAttr.td"
@@ -75,6 +76,45 @@ class LLVMLoweringInfo {
 class CIR_Op<string mnemonic, list<Trait> traits = []> :
     Op<CIR_Dialect, mnemonic, traits>, LLVMLoweringInfo;
 
+//===----------------------------------------------------------------------===//
+// ConstantOp
+//===----------------------------------------------------------------------===//
+
+def ConstantOp : CIR_Op<"const",
+                        [ConstantLike, Pure, AllTypesMatch<["value", "res"]>]> {
+  let summary = "Defines a CIR constant";
+  let description = [{
+    The `cir.const` operation turns a literal into an SSA value. The data is
+    attached to the operation as an attribute.
+
+    ```mlir
+      %0 = cir.const 42 : i32
+      %1 = cir.const 4.2 : f32
+      %2 = cir.const nullptr : !cir.ptr
+    ```
+  }];
+
+  // The constant operation takes an attribute as the only input.
+  let arguments = (ins TypedAttrInterface:$value);
+
+  // The constant operation returns a single value of CIR_AnyType.
+  let results = (outs CIR_AnyType:$res);
+
+  let assemblyFormat = "attr-dict $value";
+
+  let hasVerifier = 1;
+
+  let extraClassDeclaration = [{
+    bool isNullPtr() {
+      if (const auto ptrAttr = mlir::dyn_cast<cir::ConstPtrAttr>(getValue()))
+        return ptrAttr.isNullValue();
+      return false;
+    }
+  }];
+
+  let hasFolder = 1;
+}
+
 //===----------------------------------------------------------------------===//
 // GlobalOp
 //===----------------------------------------------------------------------===//
@@ -92,9 +132,19 @@ def GlobalOp : CIR_Op<"global"> {
     described by the type of the variable.
   }];
 
-  let arguments = (ins SymbolNameAttr:$sym_name, TypeAttr:$sym_type);
+  let arguments = (ins SymbolNameAttr:$sym_name, TypeAttr:$sym_type,
+                       OptionalAttr<AnyAttr>:$initial_value);
+
+  let assemblyFormat = [{
+    $sym_name
+    custom<GlobalOpTypeAndInitialValue>($sym_type, $initial_value)
+    attr-dict
+  }];
 
-  let assemblyFormat = [{ $sym_name `:` $sym_type attr-dict }];
+  let extraClassDeclaration = [{
+    bool isDeclaration() { return !getInitialValue(); }
+    bool hasInitializer() { return !isDeclaration(); }
+  }];
 
   let skipDefaultBuilders = 1;
 
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRTypes.td b/clang/include/clang/CIR/Dialect/IR/CIRTypes.td
index ef00b26c1fd98..a32fb3c801114 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRTypes.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIRTypes.td
@@ -220,8 +220,8 @@ def CIR_LongDouble : CIR_FloatType<"LongDouble", "long_double"> {
 
 // Constraints
 
-def CIR_AnyFloat: AnyTypeOf<[CIR_Single, CIR_Double, CIR_FP80, CIR_FP128, CIR_LongDouble,
-    CIR_FP16, CIR_BFloat16]>;
+def CIR_AnyFloat: AnyTypeOf<[CIR_Single, CIR_Double, CIR_FP80, CIR_FP128,
+                             CIR_LongDouble, CIR_FP16, CIR_BFloat16]>;
 def CIR_AnyIntOrFloat: AnyTypeOf<[CIR_AnyFloat, CIR_IntType]>;
 
 //===----------------------------------------------------------------------===//
@@ -350,4 +350,12 @@ def VoidPtr : Type<
       "cir::VoidType::get($_builder.getContext()))"> {
 }
 
+//===----------------------------------------------------------------------===//
+// Global type constraints
+//===----------------------------------------------------------------------===//
+
+def CIR_AnyType : AnyTypeOf<[
+  CIR_VoidType, CIR_IntType, CIR_AnyFloat, CIR_PointerType, CIR_FuncType
+]>;
+
 #endif // MLIR_CIR_DIALECT_CIR_TYPES
diff --git a/clang/include/clang/CIR/Dialect/IR/CMakeLists.txt b/clang/include/clang/CIR/Dialect/IR/CMakeLists.txt
index 28ae30dab8dfb..1fdbc24ba6b4a 100644
--- a/clang/include/clang/CIR/Dialect/IR/CMakeLists.txt
+++ b/clang/include/clang/CIR/Dialect/IR/CMakeLists.txt
@@ -14,3 +14,6 @@ mlir_tablegen(CIROpsDialect.cpp.inc -gen-dialect-defs)
 add_public_tablegen_target(MLIRCIROpsIncGen)
 add_dependencies(mlir-headers MLIRCIROpsIncGen)
 
+mlir_tablegen(CIROpsAttributes.h.inc -gen-attrdef-decls)
+mlir_tablegen(CIROpsAttributes.cpp.inc -gen-attrdef-defs)
+add_public_tablegen_target(MLIRCIRAttrsEnumsGen)
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index 416d532028d09..2615ae382cb8b 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -115,6 +115,48 @@ void CIRGenModule::emitGlobalVarDefinition(const clang::VarDecl *vd,
   if (clang::IdentifierInfo *identifier = vd->getIdentifier()) {
    auto varOp = builder.create<cir::GlobalOp>(getLoc(vd->getSourceRange()),
                                                identifier->getName(), type);
+    // TODO(CIR): This code for processing initial values is a placeholder
+    // until class ConstantEmitter is upstreamed and the code for processing
+    // constant expressions is filled out.  Only the most basic handling of
+    // certain constant expressions is implemented for now.
+    const VarDecl *initDecl;
+    const Expr *initExpr = vd->getAnyInitializer(initDecl);
+    if (initExpr) {
+      mlir::Attribute initializer;
+      if (APValue *value = initDecl->evaluateValue()) {
+        switch (value->getKind()) {
+        case APValue::Int: {
+          initializer = builder.getAttr<cir::IntAttr>(type, value->getInt());
+          break;
+        }
+        case APValue::Float: {
+          initializer = builder.getAttr<cir::FPAttr>(type, value->getFloat());
+          break;
+        }
+        case APValue::LValue: {
+          if (value->getLValueBase()) {
+            errorNYI(initExpr->getSourceRange(),
+                     "non-null pointer initialization");
+          } else {
+            if (auto ptrType = mlir::dyn_cast<cir::PointerType>(type)) {
+              initializer = builder.getConstPtrAttr(
+                  ptrType, value->getLValueOffset().getQuantity());
+            } else {
+              llvm_unreachable(
+                  "non-pointer variable initialized with a pointer");
+            }
+          }
+          break;
+        }
+        default:
+          errorNYI(initExpr->getSourceRange(), "unsupported initializer kind");
+          break;
+        }
+      } else {
+        errorNYI(initExpr->getSourceRange(), "non-constant initializer");
+      }
+      varOp.setInitialValueAttr(initializer);
+    }
     theModule.push_back(varOp);
   } else {
     errorNYI(vd->getSourceRange().getBegin(),
diff --git a/clang/lib/CIR/Dialect/IR/CIRAttrs.cpp b/clang/lib/CIR/Dialect/IR/CIRAttrs.cpp
index 7d42da1ab20d7..8e8f7d5b7d7cb 100644
--- a/clang/lib/CIR/Dialect/IR/CIRAttrs.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRAttrs.cpp
@@ -12,6 +12,24 @@
 
 #include "clang/CIR/Dialect/IR/CIRDialect.h"
 
+#include "mlir/IR/DialectImplementation.h"
+#include "llvm/ADT/TypeSwitch.h"
+
+static void printFloatLiteral(mlir::AsmPrinter &p, llvm::APFloat value,
+                              mlir::Type ty);
+static mlir::ParseResult
+parseFloatLiteral(mlir::AsmParser &parser,
+                  mlir::FailureOr<llvm::APFloat> &value,
+                  cir::CIRFPTypeInterface fpType);
+
+static mlir::ParseResult parseConstPtr(mlir::AsmParser &parser,
+                                       mlir::IntegerAttr &value);
+
+static void printConstPtr(mlir::AsmPrinter &p, mlir::IntegerAttr value);
+
+#define GET_ATTRDEF_CLASSES
+#include "clang/CIR/Dialect/IR/CIROpsAttributes.cpp.inc"
+
 using namespace mlir;
 using namespace cir;
 
@@ -21,12 +39,155 @@ using namespace cir;
 
 Attribute CIRDialect::parseAttribute(DialectAsmParser &parser,
                                      Type type) const {
-  // No attributes yet to parse
-  return Attribute{};
+  llvm::SMLoc typeLoc = parser.getCurrentLocation();
+  llvm::StringRef mnemonic;
+  Attribute genAttr;
+  OptionalParseResult parseResult =
+      generatedAttributeParser(parser, &mnemonic, type, genAttr);
+  if (parseResult.has_value())
+    return genAttr;
+  parser.emitError(typeLoc, "unknown attribute in CIR dialect");
+  return Attribute();
 }
 
 void CIRDialect::printAttribute(Attribute attr, DialectAsmPrinter &os) const {
-  // No attributes yet to print
+  if (failed(generatedAttributePrinter(attr, os)))
+    llvm_unreachable("unexpected CIR type kind");
+}
+
+//===----------------------------------------------------------------------===//
+// ConstPtrAttr definitions
+//===----------------------------------------------------------------------===//
+
+// TODO(CIR): Consider encoding the null value differently and use conditional
+// assembly format instead of custom parsing/printing.
+static ParseResult parseConstPtr(AsmParser &parser, mlir::IntegerAttr &value) {
+
+  if (parser.parseOptionalKeyword("null").succeeded()) {
+    value = mlir::IntegerAttr::get(
+        mlir::IntegerType::get(parser.getContext(), 64), 0);
+    return success();
+  }
+
+  return parser.parseAttribute(value);
+}
+
+static void printConstPtr(AsmPrinter &p, mlir::IntegerAttr value) {
+  if (!value.getInt())
+    p << "null";
+  else
+    p << value;
+}
+
+//===----------------------------------------------------------------------===//
+// IntAttr definitions
+//===----------------------------------------------------------------------===//
+
+Attribute IntAttr::parse(AsmParser &parser, Type odsType) {
+  mlir::APInt apValue;
+
+  if (!mlir::isa<IntType>(odsType))
+    return {};
+  auto type = mlir::cast<IntType>(odsType);
+
+  // Consume the '<' symbol.
+  if (parser.parseLess())
+    return {};
+
+  // Fetch arbitrary precision integer value.
+  if (type.isSigned()) {
+    int64_t value = 0;
+    if (parser.parseInteger(value)) {
+      parser.emitError(parser.getCurrentLocation(), "expected integer value");
+    } else {
+      apValue = mlir::APInt(type.getWidth(), value, type.isSigned(),
+                            /*implicitTrunc=*/true);
+      if (apValue.getSExtValue() != value)
+        parser.emitError(parser.getCurrentLocation(),
+                         "integer value too large for the given type");
+    }
+  } else {
+    uint64_t value = 0;
+    if (parser.parseInteger(value)) {
+      parser.emitError(parser.getCurrentLocation(), "expected integer value");
+    } else {
+      apValue = mlir::APInt(type.getWidth(), value, type.isSigned(),
+                            /*implicitTrunc=*/true);
+      if (apValue.getZExtValue() != value)
+        parser.emitError(parser.getCurrentLocation(),
+                         "integer value too large for the given type");
+    }
+  }
+
+  // Consume the '>' symbol.
+  if (parser.parseGreater())
+    return {};
+
+  return IntAttr::get(type, apValue);
+}
+
+void IntAttr::print(AsmPrinter &printer) const {
+  auto type = mlir::cast<IntType>(getType());
+  printer << '<';
+  if (type.isSigned())
+    printer << getSInt();
+  else
+    printer << getUInt();
+  printer << '>';
+}
+
+LogicalResult IntAttr::verify(function_ref<InFlightDiagnostic()> emitError,
+                              Type type, APInt value) {
+  if (!mlir::isa<IntType>(type)) {
+    emitError() << "expected 'simple.int' type";
+    return failure();
+  }
+
+  auto intType = mlir::cast<IntType>(type);
+  if (value.getBitWidth() != intType.getWidth()) {
+    emitError() << "type and value bitwidth mismatch: " << intType.getWidth()
+                << " != " << value.getBitWidth();
+    return failure();
+  }
+
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
+// FPAttr definitions
+//===----------------------------------------------------------------------===//
+
+static void printFloatLiteral(AsmPrinter &p, APFloat value, Type ty) {
+  p << value;
+}
+
+static ParseResult parseFloatLiteral(AsmParser &parser,
+                                     FailureOr<APFloat> &value,
+                                     CIRFPTypeInterface fpType) {
+
+  APFloat parsedValue(0.0);
+  if (parser.parseFloat(fpType.getFloatSemantics(), parsedValue))
+    return failure();
+
+  value.emplace(parsedValue);
+  return success();
+}
+
+FPAttr FPAttr::getZero(Type type) {
+  return get(type,
+             APFloat::getZero(
+                 mlir::cast<CIRFPTypeInterface>(type).getFloatSemantics()));
+}
+
+LogicalResult FPAttr::verify(function_ref<InFlightDiagnostic()> emitError,
+                             CIRFPTypeInterface fpType, APFloat value) {
+  if (APFloat::SemanticsToEnum(fpType.getFloatSemantics()) !=
+      APFloat::SemanticsToEnum(value.getSemantics())) {
+    emitError() << "floating-point semantics mismatch";
+    return failure();
+  }
+
+  return success();
 }
 
 //===----------------------------------------------------------------------===//
@@ -34,5 +195,8 @@ void CIRDialect::printAttribute(Attribute attr, DialectAsmPrinter &os) const {
 //===----------------------------------------------------------------------===//
 
 void CIRDialect::registerAttributes() {
-  // No attributes yet to register
+  addAttributes<
+#define GET_ATTRDEF_LIST
+#include "clang/CIR/Dialect/IR/CIROpsAttributes.cpp.inc"
+      >();
 }
diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
index dbdca1f840166..f98d8b60f6ff8 100644
--- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
@@ -12,6 +12,8 @@
 
 #include "clang/CIR/Dialect/IR/CIRDialect.h"
 
+#include "clang/CIR/Dialect/IR/CIRTypes.h"
+
 #include "mlir/Support/LogicalResult.h"
 
 #include "clang/CIR/Dialect/IR/CIROpsDialect.cpp.inc"
@@ -32,13 +34,73 @@ void cir::CIRDialect::initialize() {
       >();
 }
 
+//===----------------------------------------------------------------------===//
+// ConstantOp
+//===----------------------------------------------------------------------===//
+
+static LogicalResult checkConstantTypes(mlir::Operation *op, mlir::Type opType,
+                                        mlir::Attribute attrType) {
+  if (isa<cir::ConstPtrAttr>(attrType)) {
+    if (!mlir::isa<cir::PointerType>(opType))
+      return op->emitOpError(
+          "pointer constant initializing a non-pointer type");
+    return success();
+  }
+
+  if (mlir::isa<cir::IntAttr, cir::FPAttr>(attrType)) {
+    auto at = cast<TypedAttr>(attrType);
+    if (at.getType() != opType) {
+      return op->emitOpError("result type (")
+             << opType << ") does not match value type (" << at.getType()
+             << ")";
+    }
+    return success();
+  }
+
+  assert(isa<TypedAttr>(attrType) && "What else could we be looking at here?");
+  return op->emitOpError("global with type ")
+         << cast<TypedAttr>(attrType).getType() << " not yet supported";
+}
+
+LogicalResult cir::ConstantOp::verify() {
+  // ODS already generates checks to make sure the result type is valid. We just
+  // need to additionally check that the value's attribute type is consistent
+  // with the result type.
+  return checkConstantTypes(getOperation(), getType(), getValue());
+}
+
+OpFoldResult cir::ConstantOp::fold(FoldAdaptor /*adaptor*/) {
+  return getValue();
+}
+
 //===----------------------------------------------------------------------===//
 // GlobalOp
 //===----------------------------------------------------------------------===//
 
-// TODO(CIR): The properties of global variables that require verification
-// haven't been implemented yet.
-mlir::LogicalResult cir::GlobalOp::verify() { return success(); }
+static ParseResult parseConstantValue(OpAsmParser &parser,
+                                      mlir::Attribute &valueAttr) {
+  NamedAttrList attr;
+  return parser.parseAttribute(valueAttr, "value", attr);
+}
+
+static void printConstant(OpAsmPrinter &p, Attribute value) {
+  p.printAttribute(value);
+}
+
+mlir::LogicalResult cir::GlobalOp::verify() {
+  // Verify that the initial value, if present, is either a unit attribute or
+  // an attribute CIR supports.
+  if (getInitialValue().has_value()) {
+    if (checkConstantTypes(getOperation(), getSymType(), *getInitialValue())
+            .failed())
+      return failure();
+  }
+
+  // TODO(CIR): Many other checks for properties that haven't been upstreamed
+  // yet.
+
+  return success();
+}
 
 void cir::GlobalOp::build(OpBuilder &odsBuilder, OperationState &odsState,
                           llvm::StringRef sym_name, mlir::Type sym_type) {
@@ -48,6 +110,45 @@ void cir::GlobalOp::build(OpBuilder &odsBuilder, OperationState &odsState,
                         mlir::TypeAttr::get(sym_type));
 }
 
+static void printGlobalOpTypeAndInitialValue(OpAsmPrinter &p, cir::GlobalOp op,
+                                             TypeAttr type,
+                                             Attribute initAttr) {
+  if (!op.isDeclaration()) {
+    p << "= ";
+    // This also prints the type...
+    if (initAttr)
+      printConstant(p, initAttr);
+  } else {
+    p << ": " << type;
+  }
+}
+
+static ParseResult
+parseGlobalOpTypeAndInitialValue(OpAsmParser &parser, TypeAttr &typeAttr,
+                                 Attribute &initialValueAttr) {
+  mlir::Type opTy;
+  if (parser.parseOptionalEqual().failed()) {
+    // Absence of equal means a declaration, so we need to parse the type.
+    //  cir.global @a : !cir.int
+    if (parser.parseColonType(opTy))
+      return failure();
+  } else {
+    // Parse constant with initializer, examples:
+    //  cir.global @y = #cir.fp<1.250000e+00> : !cir.double
+    //  cir.global @rgb = #cir.const_array<[...] : !cir.array>
+    if (parseConstantValue(parser, initialValueAttr).failed())
+      return failure();
+
+    assert(mlir::isa<mlir::TypedAttr>(initialValueAttr) &&
+           "Non-typed attrs shouldn't appear here.");
+    auto typedAttr = mlir::cast<mlir::TypedAttr>(initialValueAttr);
+    opTy = typedAttr.getType();
+  }
+
+  typeAttr = TypeAttr::get(opTy);
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // FuncOp
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/CIR/Dialect/IR/CMakeLists.txt b/clang/lib/CIR/Dialect/IR/CMakeLists.txt
index df60f69df6fc0..baf8bff185221 100644
--- a/clang/lib/CIR/Dialect/IR/CMakeLists.txt
+++ b/clang/lib/CIR/Dialect/IR/CMakeLists.txt
@@ -5,6 +5,7 @@ add_clang_library(MLIRCIR
 
   DEPENDS
   MLIRCIROpsIncGen
+  MLIRCIRAttrsEnumsGen
 
   LINK_LIBS PUBLIC
   MLIRIR
diff --git a/clang/lib/CIR/Interfaces/CMakeLists.txt b/clang/lib/CIR/Interfaces/CMakeLists.txt
index fcd8b6963d06c..b826bf612cc35 100644
--- a/clang/lib/CIR/Interfaces/CMakeLists.txt
+++ b/clang/lib/CIR/Interfaces/CMakeLists.txt
@@ -5,6 +5,7 @@ add_clang_library(MLIRCIRInterfaces
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Interfaces
 
   DEPENDS
+  MLIRCIRAttrsEnumsGen
   MLIRCIRFPTypeInterfaceIncGen
 
   LINK_LIBS
diff --git a/clang/test/CIR/global-var-simple.cpp b/clang/test/CIR/global-var-simple.cpp
index bbd452655a01b..ffcc3ef71a6c7 100644
--- a/clang/test/CIR/global-var-simple.cpp
+++ b/clang/test/CIR/global-var-simple.cpp
@@ -13,11 +13,11 @@ unsigned char uc;
 short ss;
 // CHECK: cir.global @ss : !cir.int
 
-unsigned short us;
-// CHECK: cir.global @us : !cir.int<u, 16>
+unsigned short us = 100;
+// CHECK: cir.global @us = #cir.int<100> : !cir.int<u, 16>
 
-int si;
-// CHECK: cir.global @si : !cir.int<s, 32>
+int si = 42;
+// CHECK: cir.global @si = #cir.int<42> : !cir.int<s, 32>
 
 unsigned ui;
 // CHECK: cir.global @ui : !cir.int
@@ -31,8 +31,8 @@ unsigned long ul;
 long long sll;
 // CHECK: cir.global @sll : !cir.int
 
-unsigned long long ull;
-// CHECK: cir.global @ull : !cir.int<u, 64>
+unsigned long long ull = 123456;
+// CHECK: cir.global @ull = #cir.int<123456> : !cir.int<u, 64>
 
 __int128 s128;
 // CHECK: cir.global @s128 : !cir.int
@@ -67,8 +67,8 @@ __bf16 bf16;
 float f;
 // CHECK: cir.global @f : !cir.float
 
-double d;
-// CHECK: cir.global @d : !cir.double
+double d = 1.25;
+// CHECK: cir.global @d = #cir.fp<1.250000e+00> : !cir.double
 
 long double ld;
 // CHECK: cir.global @ld : !cir.long_double
@@ -79,8 +79,8 @@ __float128 f128;
 void *vp;
 // CHECK: cir.global @vp : !cir.ptr
 
-int *ip;
-// CHECK: cir.global @ip : !cir.ptr<!cir.int<s, 32>>
+int *ip = 0;
+// CHECK: cir.global @ip = #cir.ptr<null> : !cir.ptr<!cir.int<s, 32>>
 
 double *dp;
 // CHECK: cir.global @dp : !cir.ptr
@@ -91,8 +91,8 @@ char **cpp;
 void (*fp)();
 // CHECK: cir.global @fp : !cir.ptr>
 
-int (*fpii)(int);
-// CHECK: cir.global @fpii : !cir.ptr<!cir.func<!cir.int<s, 32> (!cir.int<s, 32>)>>
+int (*fpii)(int) = 0;
+// CHECK: cir.global @fpii = #cir.ptr<null> : !cir.ptr<!cir.func<!cir.int<s, 32> (!cir.int<s, 32>)>>
 
 void (*fpvar)(int, ...);
 // CHECK: cir.global @fpvar : !cir.ptr, ...)>>

From cb1ad985b53c87b53974e37bba60129acb294f0d Mon Sep 17 00:00:00 2001
From: Maksim Levental 
Date: Sat, 28 Dec 2024 14:39:52 -0800
Subject: [PATCH 142/567] [mlir][llvmir] implement missing attrs `getChecked`
 (#121248)

---
 mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp
index c7ddc1b36f4d4..28e8b81a05576 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp
@@ -48,6 +48,7 @@ void LLVMDialect::registerAttributes() {
   addAttributes<
 #define GET_ATTRDEF_LIST
 #include "mlir/Dialect/LLVMIR/LLVMOpsAttrDefs.cpp.inc"
+
       >();
 }
 
@@ -288,6 +289,16 @@ TargetFeaturesAttr TargetFeaturesAttr::get(MLIRContext *context,
                    }));
 }
 
+TargetFeaturesAttr
+TargetFeaturesAttr::getChecked(function_ref<InFlightDiagnostic()> emitError,
+                               MLIRContext *context,
+                               llvm::ArrayRef<StringRef> features) {
+  return Base::getChecked(emitError, context,
+                          llvm::map_to_vector(features, [&](StringRef feature) {
+                            return StringAttr::get(context, feature);
+                          }));
+}
+
 TargetFeaturesAttr TargetFeaturesAttr::get(MLIRContext *context,
                                            StringRef targetFeatures) {
   SmallVector<StringRef> features;
@@ -296,6 +307,16 @@ TargetFeaturesAttr TargetFeaturesAttr::get(MLIRContext *context,
   return get(context, features);
 }
 
+TargetFeaturesAttr
+TargetFeaturesAttr::getChecked(function_ref<InFlightDiagnostic()> emitError,
+                               MLIRContext *context, StringRef targetFeatures) {
+  SmallVector<StringRef> features;
+  targetFeatures.split(features, ',', /*MaxSplit=*/-1,
+                       /*KeepEmpty=*/false);
+  ArrayRef<StringRef> featuresRef(features);
+  return getChecked(emitError, context, featuresRef);
+}
+
 LogicalResult
 TargetFeaturesAttr::verify(function_ref<InFlightDiagnostic()> emitError,
                            llvm::ArrayRef<StringRef> features) {

From de294c968bf292794ca9f0a6a481d3dff3bcc2eb Mon Sep 17 00:00:00 2001
From: NAKAMURA Takumi 
Date: Sun, 29 Dec 2024 12:02:13 +0900
Subject: [PATCH 143/567] [profile] Enable testing Continuous mode on Linux
 (#121238)

Based on #115987, with the introduction of `REQUIRES: continuous-mode`.
Also Linux assumes `runtime_reloc`.

FIXME: image-with-no-counters.c is still excluded.
---
 compiler-rt/test/profile/ContinuousSyncMode/basic.c          | 2 +-
 compiler-rt/test/profile/ContinuousSyncMode/get-filename.c   | 2 +-
 .../test/profile/ContinuousSyncMode/image-with-mcdc.c        | 2 +-
 .../test/profile/ContinuousSyncMode/multi-threaded.cpp       | 2 +-
 compiler-rt/test/profile/ContinuousSyncMode/online-merging.c | 2 +-
 .../test/profile/ContinuousSyncMode/pid-substitution.c       | 2 +-
 compiler-rt/test/profile/ContinuousSyncMode/set-filename.c   | 2 +-
 compiler-rt/test/profile/lit.cfg.py                          | 5 ++++-
 8 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/compiler-rt/test/profile/ContinuousSyncMode/basic.c b/compiler-rt/test/profile/ContinuousSyncMode/basic.c
index e8bd087a0f59d..531877b78a1a2 100644
--- a/compiler-rt/test/profile/ContinuousSyncMode/basic.c
+++ b/compiler-rt/test/profile/ContinuousSyncMode/basic.c
@@ -1,4 +1,4 @@
-// REQUIRES: target={{.*(darwin|aix).*}}
+// REQUIRES: continuous-mode
 
 // RUN: %clang_profgen_cont -fcoverage-mapping -o %t.exe %s
 // RUN: echo "garbage" > %t.profraw
diff --git a/compiler-rt/test/profile/ContinuousSyncMode/get-filename.c b/compiler-rt/test/profile/ContinuousSyncMode/get-filename.c
index 40a0cc5ffd688..e341dd429eb84 100644
--- a/compiler-rt/test/profile/ContinuousSyncMode/get-filename.c
+++ b/compiler-rt/test/profile/ContinuousSyncMode/get-filename.c
@@ -1,4 +1,4 @@
-// REQUIRES: target={{.*(darwin|aix).*}}
+// REQUIRES: continuous-mode
 
 // RUN: %clang_pgogen_cont -o %t.exe %s
 // RUN: env LLVM_PROFILE_FILE="%c%t.profraw" %run %t.exe %t.profraw
diff --git a/compiler-rt/test/profile/ContinuousSyncMode/image-with-mcdc.c b/compiler-rt/test/profile/ContinuousSyncMode/image-with-mcdc.c
index d171badbf4d33..fa24e26c4c53b 100644
--- a/compiler-rt/test/profile/ContinuousSyncMode/image-with-mcdc.c
+++ b/compiler-rt/test/profile/ContinuousSyncMode/image-with-mcdc.c
@@ -1,4 +1,4 @@
-// REQUIRES: target={{.*(darwin|aix).*}}
+// REQUIRES: continuous-mode
 
 // RUN: %clang_profgen_cont -fcoverage-mapping -fcoverage-mcdc -O3 -o %t.exe %s
 // RUN: env LLVM_PROFILE_FILE="%c%t.profraw" %run %t.exe 3 3
diff --git a/compiler-rt/test/profile/ContinuousSyncMode/multi-threaded.cpp b/compiler-rt/test/profile/ContinuousSyncMode/multi-threaded.cpp
index ff05a69a5e7d4..aa0a46e0fc396 100644
--- a/compiler-rt/test/profile/ContinuousSyncMode/multi-threaded.cpp
+++ b/compiler-rt/test/profile/ContinuousSyncMode/multi-threaded.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: target={{.*(darwin|aix).*}}
+// REQUIRES: continuous-mode
 
 // RUN: rm -f %t.profraw
 // RUN: %clangxx_pgogen_cont -lpthread %s -o %t.exe -mllvm -disable-vp -fprofile-update=atomic
diff --git a/compiler-rt/test/profile/ContinuousSyncMode/online-merging.c b/compiler-rt/test/profile/ContinuousSyncMode/online-merging.c
index 54346487a5c79..c1931410f8c76 100644
--- a/compiler-rt/test/profile/ContinuousSyncMode/online-merging.c
+++ b/compiler-rt/test/profile/ContinuousSyncMode/online-merging.c
@@ -1,4 +1,4 @@
-// REQUIRES: target={{.*(darwin|aix).*}}
+// REQUIRES: continuous-mode
 
 // Test the online merging mode (%m) along with continuous mode (%c).
 //
diff --git a/compiler-rt/test/profile/ContinuousSyncMode/pid-substitution.c b/compiler-rt/test/profile/ContinuousSyncMode/pid-substitution.c
index 309b685a95c5b..8a00b28825cae 100644
--- a/compiler-rt/test/profile/ContinuousSyncMode/pid-substitution.c
+++ b/compiler-rt/test/profile/ContinuousSyncMode/pid-substitution.c
@@ -1,4 +1,4 @@
-// REQUIRES: target={{.*(darwin|aix).*}}
+// REQUIRES: continuous-mode
 
 // RUN: rm -rf %t.dir && mkdir -p %t.dir
 // RUN: %clang_pgogen_cont -o %t.exe %s
diff --git a/compiler-rt/test/profile/ContinuousSyncMode/set-filename.c b/compiler-rt/test/profile/ContinuousSyncMode/set-filename.c
index 106e12e4e3b6e..abc72646d16b4 100644
--- a/compiler-rt/test/profile/ContinuousSyncMode/set-filename.c
+++ b/compiler-rt/test/profile/ContinuousSyncMode/set-filename.c
@@ -1,4 +1,4 @@
-// REQUIRES: target={{.*(darwin|aix).*}}
+// REQUIRES: continuous-mode
 
 // RUN: %clang_pgogen_cont -o %t.exe %s
 // RUN: env LLVM_PROFILE_FILE="%c%t.profraw" %run %t.exe %t.profraw %t.bad
diff --git a/compiler-rt/test/profile/lit.cfg.py b/compiler-rt/test/profile/lit.cfg.py
index 72a389eaf0dfb..fc2baf7c40b8f 100644
--- a/compiler-rt/test/profile/lit.cfg.py
+++ b/compiler-rt/test/profile/lit.cfg.py
@@ -31,7 +31,7 @@ def get_required_attr(config, attr_name):
 target_is_msvc = bool(re.match(r".*-windows-msvc$", config.target_triple))
 
 # Whether continous profile collection (%c) requires runtime counter relocation on this platform
-runtime_reloc = bool(config.host_os in ["AIX"])
+runtime_reloc = bool(config.host_os in ["AIX", "Linux"])
 
 if config.host_os in ["Linux"]:
     extra_link_flags = ["-ldl"]
@@ -210,3 +210,6 @@ def exclude_unsupported_files_for_aix(dirname):
 
 if config.have_curl:
     config.available_features.add("curl")
+
+if config.host_os in ("AIX", "Darwin", "Linux"):
+    config.available_features.add("continuous-mode")

From 02e8972c378bf60cc16a85815d29faafdbad7180 Mon Sep 17 00:00:00 2001
From: Justin Bogner 
Date: Sat, 28 Dec 2024 21:48:25 -0700
Subject: [PATCH 144/567] Revert "[nfc][Driver] Remove {{(.exe)?}} from
 sanitizer test" again (#121280)

This reverts #121162, which was a reapply of the previous revert earlier
in #121160 - The change blatantly breaks tests on windows and it isn't
clear why it's being made.

Note that I needed to add the optional .exe suffix to a few more check
lines that were added as a follow up.

This reverts commit 8e9fda1c1140e067c5344c61df56c34167296f17.
---
 clang/test/Driver/sanitizer-ld.c | 132 +++++++++++++++----------------
 1 file changed, 66 insertions(+), 66 deletions(-)

diff --git a/clang/test/Driver/sanitizer-ld.c b/clang/test/Driver/sanitizer-ld.c
index 17766cef86d2a..a82c45136d7bf 100644
--- a/clang/test/Driver/sanitizer-ld.c
+++ b/clang/test/Driver/sanitizer-ld.c
@@ -15,7 +15,7 @@
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-LINUX
 //
-// CHECK-ASAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld"
+// CHECK-ASAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-ASAN-LINUX-NOT: "-lc"
 // CHECK-ASAN-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan_static.a" "--no-whole-archive"
 // CHECK-ASAN-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive"
@@ -33,7 +33,7 @@
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-NO-LINK-RUNTIME-LINUX
 //
-// CHECK-ASAN-NO-LINK-RUNTIME-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld"
+// CHECK-ASAN-NO-LINK-RUNTIME-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 
 // RUN: %clang -fsanitize=address -fno-sanitize-link-runtime -### %s 2>&1 \
 // RUN:     --target=arm64e-apple-macosx -fuse-ld=ld \
@@ -41,7 +41,7 @@
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-NO-LINK-RUNTIME-DARWIN
 //
-// CHECK-ASAN-NO-LINK-RUNTIME-DARWIN: "{{.*}}ld"
+// CHECK-ASAN-NO-LINK-RUNTIME-DARWIN: "{{.*}}ld{{(.exe)?}}"
 
 // RUN: %clang -fsanitize=address -### %s 2>&1 \
 // RUN:     --target=x86_64-unknown-linux -fuse-ld=ld \
@@ -80,7 +80,7 @@
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-SHARED-ASAN-LINUX
 //
-// CHECK-SHARED-ASAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld"
+// CHECK-SHARED-ASAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-SHARED-ASAN-LINUX-NOT: "-lc"
 // CHECK-SHARED-ASAN-LINUX: libclang_rt.asan.so"
 // CHECK-SHARED-ASAN-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan-preinit.a" "--no-whole-archive"
@@ -98,7 +98,7 @@
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-DSO-SHARED-ASAN-LINUX
 //
-// CHECK-DSO-SHARED-ASAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld"
+// CHECK-DSO-SHARED-ASAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-DSO-SHARED-ASAN-LINUX-NOT: "-lc"
 // CHECK-DSO-SHARED-ASAN-LINUX: libclang_rt.asan.so"
 // CHECK-DSO-SHARED-ASAN-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan_static.a" "--no-whole-archive"
@@ -115,7 +115,7 @@
 // RUN:     --sysroot=%S/Inputs/basic_freebsd_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-FREEBSD
 //
-// CHECK-ASAN-FREEBSD: "{{(.*[^-.0-9A-Z_a-z])?}}ld"
+// CHECK-ASAN-FREEBSD: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-ASAN-FREEBSD-NOT: "-lc"
 // CHECK-ASAN-FREEBSD: freebsd{{/|\\+}}libclang_rt.asan_static.a"
 // CHECK-ASAN-FREEBSD: freebsd{{/|\\+}}libclang_rt.asan.a"
@@ -130,7 +130,7 @@
 // RUN:     --sysroot=%S/Inputs/basic_freebsd_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-FREEBSD-LDL
 //
-// CHECK-ASAN-FREEBSD-LDL: "{{(.*[^-.0-9A-Z_a-z])?}}ld"
+// CHECK-ASAN-FREEBSD-LDL: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-ASAN-FREEBSD-LDL-NOT: "-ldl"
 // CHECK-ASAN-FREEBSD-LDL: "--whole-archive" "{{.*}}libclang_rt.asan_static.a" "--no-whole-archive"
 // CHECK-ASAN-FREEBSD-LDL: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive"
@@ -148,7 +148,7 @@
 // RUN:     -fsanitize-link-c++-runtime \
 // RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-LINUX-CXX
 
-// CHECK-ASAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld"
+// CHECK-ASAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-ASAN-LINUX-CXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive"
 // CHECK-ASAN-LINUX-CXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan_cxx.a" "--no-whole-archive"
 // CHECK-ASAN-LINUX-CXX-NOT: "--dynamic-list"
@@ -167,7 +167,7 @@
 // RUN:     -fno-sanitize-link-c++-runtime \
 // RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-LINUX-CNOCXX
 
-// CHECK-ASAN-LINUX-CNOCXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld"
+// CHECK-ASAN-LINUX-CNOCXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-ASAN-LINUX-CNOCXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive"
 // CHECK-ASAN-LINUX-CNOCXX-SAME: "--export-dynamic"
 // CHECK-ASAN-LINUX-CNOCXX-NOT: stdc++
@@ -184,7 +184,7 @@
 // RUN:     -fno-sanitize-link-c++-runtime \
 // RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-LINUX-NOCXX
 
-// CHECK-ASAN-LINUX-NOCXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld"
+// CHECK-ASAN-LINUX-NOCXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-ASAN-LINUX-NOCXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive"
 // CHECK-ASAN-LINUX-NOCXX-SAME: "--export-dynamic"
 // CHECK-ASAN-LINUX-NOCXX-SAME: "-lstdc++"
@@ -201,7 +201,7 @@
 // RUN:     -nostdlib++ \
 // RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-LINUX-NOSTDCXX
 
-// CHECK-ASAN-LINUX-NOSTDCXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld"
+// CHECK-ASAN-LINUX-NOSTDCXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-ASAN-LINUX-NOSTDCXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive"
 // CHECK-ASAN-LINUX-NOSTDCXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan_cxx.a" "--no-whole-archive"
 // CHECK-ASAN-LINUX-NOSTDCXX-SAME: "--export-dynamic"
@@ -217,7 +217,7 @@
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree -lstdc++ -static 2>&1 \
 // RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-LINUX-CXX-STATIC
 //
-// CHECK-ASAN-LINUX-CXX-STATIC: "{{(.*[^-.0-9A-Z_a-z])?}}ld"
+// CHECK-ASAN-LINUX-CXX-STATIC: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-ASAN-LINUX-CXX-STATIC-NOT: stdc++
 // CHECK-ASAN-LINUX-CXX-STATIC: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive"
 // CHECK-ASAN-LINUX-CXX-STATIC: "--dynamic-list={{.*}}libclang_rt.asan.a.syms"
@@ -228,7 +228,7 @@
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-ARM
 //
-// CHECK-ASAN-ARM: "{{(.*[^.0-9A-Z_a-z])?}}ld"
+// CHECK-ASAN-ARM: "{{(.*[^.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-ASAN-ARM-NOT: "-lc"
 // CHECK-ASAN-ARM: libclang_rt.asan_static.a"
 // CHECK-ASAN-ARM: libclang_rt.asan.a"
@@ -238,7 +238,7 @@
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-ARMv7
 //
-// CHECK-ASAN-ARMv7: "{{(.*[^.0-9A-Z_a-z])?}}ld"
+// CHECK-ASAN-ARMv7: "{{(.*[^.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-ASAN-ARMv7-NOT: "-lc"
 // CHECK-ASAN-ARMv7: libclang_rt.asan_static.a"
 // CHECK-ASAN-ARMv7: libclang_rt.asan.a"
@@ -357,7 +357,7 @@
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-TYSAN-LINUX-CXX
 //
-// CHECK-TYSAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld"
+// CHECK-TYSAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-TYSAN-LINUX-CXX-NOT: stdc++
 // CHECK-TYSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.tysan{{[^.]*}}.a" "--no-whole-archive"
 // CHECK-TYSAN-LINUX-CXX: stdc++
@@ -368,7 +368,7 @@
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-TYSAN-DARWIN-CXX
-// CHECK-TYSAN-DARWIN-CXX: "{{.*}}ld"
+// CHECK-TYSAN-DARWIN-CXX: "{{.*}}ld{{(.exe)?}}"
 // CHECK-TYSAN-DARWIN-CXX: libclang_rt.tysan_osx_dynamic.dylib
 // CHECK-TYSAN-DARWIN-CXX-NOT: -lc++abi
 
@@ -379,7 +379,7 @@
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-TSAN-LINUX-CXX
 //
-// CHECK-TSAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld"
+// CHECK-TSAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-TSAN-LINUX-CXX-NOT: stdc++
 // CHECK-TSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.tsan.a" "--no-whole-archive"
 // CHECK-TSAN-LINUX-CXX: "--dynamic-list={{.*}}libclang_rt.tsan.a.syms"
@@ -398,7 +398,7 @@
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-TSAN-NO-LINK-RUNTIME-LINUX
 //
-// CHECK-TSAN-NO-LINK-RUNTIME-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld"
+// CHECK-TSAN-NO-LINK-RUNTIME-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 
 // RUN: not %clang -fsanitize=thread -fno-sanitize-link-runtime -### %s 2>&1 \
 // RUN:     --target=arm64e-apple-ios -fuse-ld=ld \
@@ -406,7 +406,7 @@
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-TSAN-NO-LINK-RUNTIME-DARWIN
 //
-// CHECK-TSAN-NO-LINK-RUNTIME-DARWIN: "{{(.*[^-.0-9A-Z_a-z])?}}ld"
+// CHECK-TSAN-NO-LINK-RUNTIME-DARWIN: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 
 // RUN: %clangxx -### %s 2>&1 \
 // RUN:     --target=x86_64-unknown-linux -fuse-ld=ld -stdlib=platform -lstdc++ \
@@ -415,7 +415,7 @@
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-MSAN-LINUX-CXX
 //
-// CHECK-MSAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld"
+// CHECK-MSAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-MSAN-LINUX-CXX-NOT: stdc++
 // CHECK-MSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.msan.a" "--no-whole-archive"
 // CHECK-MSAN-LINUX-CXX: "--dynamic-list={{.*}}libclang_rt.msan.a.syms"
@@ -434,7 +434,7 @@
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-MSAN-NO-LINK-RUNTIME-LINUX
 //
-// CHECK-MSAN-NO-LINK-RUNTIME-LINUX: "{{.*}}ld"
+// CHECK-MSAN-NO-LINK-RUNTIME-LINUX: "{{.*}}ld{{(.exe)?}}"
 
 // RUN: %clang -fsanitize=undefined -### %s 2>&1 \
 // RUN:     --target=x86_64-unknown-linux-gnux32 -fuse-ld=ld \
@@ -455,7 +455,7 @@
 // RUN:     -static-libsan \
 // RUN:   | %{filecheck} --check-prefix=CHECK-UBSAN-LINUX
 
-// CHECK-UBSAN-LINUX: "{{.*}}ld"
+// CHECK-UBSAN-LINUX: "{{.*}}ld{{(.exe)?}}"
 // CHECK-UBSAN-LINUX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive"
 // CHECK-UBSAN-LINUX-NOT: "-lstdc++"
 // CHECK-UBSAN-LINUX: "-lpthread"
@@ -467,7 +467,7 @@
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-UBSAN-NO-LINK-RUNTIME-LINUX
 //
-// CHECK-UBSAN-NO-LINK-RUNTIME-LINUX: "{{.*}}ld"
+// CHECK-UBSAN-NO-LINK-RUNTIME-LINUX: "{{.*}}ld{{(.exe)?}}"
 
 // RUN: %clang -fsanitize=undefined -fno-sanitize-link-runtime -### %s 2>&1 \
 // RUN:     --target=x86_64-apple-darwin -fuse-ld=ld \
@@ -475,7 +475,7 @@
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-UBSAN-NO-LINK-RUNTIME-DARWIN
 //
-// CHECK-UBSAN-NO-LINK-RUNTIME-DARWIN: "{{.*}}ld"
+// CHECK-UBSAN-NO-LINK-RUNTIME-DARWIN: "{{.*}}ld{{(.exe)?}}"
 
 // RUN: %clang -fsanitize=fuzzer -fno-sanitize-link-runtime -### %s 2>&1 \
 // RUN:     --target=arm64e-apple-watchos -fuse-ld=ld \
@@ -483,7 +483,7 @@
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-FUZZER-NO-LINK-RUNTIME-DARWIN
 //
-// CHECK-FUZZER-NO-LINK-RUNTIME-DARWIN: "{{.*}}ld"
+// CHECK-FUZZER-NO-LINK-RUNTIME-DARWIN: "{{.*}}ld{{(.exe)?}}"
 
 // RUN: %clang -fsanitize=undefined -### %s 2>&1 \
 // RUN:     --target=i386-unknown-linux -fuse-ld=ld \
@@ -506,7 +506,7 @@
 // RUN:     -shared -shared-libsan \
 // RUN:   | %{filecheck} --check-prefix=CHECK-UBSAN-LINUX-SHAREDLIBASAN
 
-// CHECK-UBSAN-LINUX-SHAREDLIBASAN: "{{.*}}ld"
+// CHECK-UBSAN-LINUX-SHAREDLIBASAN: "{{.*}}ld{{(.exe)?}}"
 // CHECK-UBSAN-LINUX-SHAREDLIBASAN: "{{.*}}libclang_rt.ubsan_standalone.so{{.*}}"
 
 // RUN: %clang -fsanitize=undefined -fsanitize-link-c++-runtime -### %s 2>&1 \
@@ -523,7 +523,7 @@
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-UBSAN-LINUX-CXX
-// CHECK-UBSAN-LINUX-CXX: "{{.*}}ld"
+// CHECK-UBSAN-LINUX-CXX: "{{.*}}ld{{(.exe)?}}"
 // CHECK-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive"
 // CHECK-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone_cxx.a" "--no-whole-archive"
 // CHECK-UBSAN-LINUX-CXX: "-lstdc++"
@@ -535,7 +535,7 @@
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-UBSAN-MINIMAL-LINUX
-// CHECK-UBSAN-MINIMAL-LINUX: "{{.*}}ld"
+// CHECK-UBSAN-MINIMAL-LINUX: "{{.*}}ld{{(.exe)?}}"
 // CHECK-UBSAN-MINIMAL-LINUX: "--whole-archive" "{{.*}}libclang_rt.ubsan_minimal.a" "--no-whole-archive"
 // CHECK-UBSAN-MINIMAL-LINUX: "-lpthread"
 // CHECK-UBSAN-MINIMAL-LINUX: "-lresolv"
@@ -544,7 +544,7 @@
 // RUN:     --target=x86_64-apple-darwin -fuse-ld=ld \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-UBSAN-MINIMAL-DARWIN
-// CHECK-UBSAN-MINIMAL-DARWIN: "{{.*}}ld"
+// CHECK-UBSAN-MINIMAL-DARWIN: "{{.*}}ld{{(.exe)?}}"
 // CHECK-UBSAN-MINIMAL-DARWIN: "{{.*}}libclang_rt.ubsan_minimal_osx_dynamic.dylib"
 
 // RUN: not %clang -fsanitize=undefined -### %s 2>&1 \
@@ -570,7 +570,7 @@
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-UBSAN-LINUX
-// CHECK-ASAN-UBSAN-LINUX: "{{.*}}ld"
+// CHECK-ASAN-UBSAN-LINUX: "{{.*}}ld{{(.exe)?}}"
 // CHECK-ASAN-UBSAN-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan_static.a" "--no-whole-archive"
 // CHECK-ASAN-UBSAN-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive"
 // CHECK-ASAN-UBSAN-LINUX: "--dynamic-list={{.*}}libclang_rt.asan.a.syms"
@@ -583,7 +583,7 @@
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-UBSAN-LINUX-CXX
-// CHECK-ASAN-UBSAN-LINUX-CXX: "{{.*}}ld"
+// CHECK-ASAN-UBSAN-LINUX-CXX: "{{.*}}ld{{(.exe)?}}"
 // CHECK-ASAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.asan_static.a" "--no-whole-archive"
 // CHECK-ASAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive"
 // CHECK-ASAN-UBSAN-LINUX-CXX: "--dynamic-list={{.*}}libclang_rt.asan.a.syms"
@@ -598,7 +598,7 @@
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX
-// CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX: "{{.*}}ld"
+// CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX: "{{.*}}ld{{(.exe)?}}"
 // CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan_static.a" "--no-whole-archive"
 // CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive"
 // CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX-SAME: "--dynamic-list={{.*}}libclang_rt.asan.a.syms"
@@ -612,7 +612,7 @@
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-MSAN-UBSAN-LINUX-CXX
-// CHECK-MSAN-UBSAN-LINUX-CXX: "{{.*}}ld"
+// CHECK-MSAN-UBSAN-LINUX-CXX: "{{.*}}ld{{(.exe)?}}"
 // CHECK-MSAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.msan.a" "--no-whole-archive"
 // CHECK-MSAN-UBSAN-LINUX-CXX: "--dynamic-list={{.*}}libclang_rt.msan.a.syms"
 // CHECK-MSAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.msan_cxx.a" "--no-whole-archive"
@@ -624,7 +624,7 @@
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-TSAN-UBSAN-LINUX-CXX
-// CHECK-TSAN-UBSAN-LINUX-CXX: "{{.*}}ld"
+// CHECK-TSAN-UBSAN-LINUX-CXX: "{{.*}}ld{{(.exe)?}}"
 // CHECK-TSAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.tsan.a" "--no-whole-archive"
 // CHECK-TSAN-UBSAN-LINUX-CXX: "--dynamic-list={{.*}}libclang_rt.tsan.a.syms"
 // CHECK-TSAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.tsan_cxx.a" "--no-whole-archive"
@@ -637,7 +637,7 @@
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:     -shared \
 // RUN:   | %{filecheck} --check-prefix=CHECK-UBSAN-LINUX-SHARED
-// CHECK-UBSAN-LINUX-SHARED: "{{.*}}ld"
+// CHECK-UBSAN-LINUX-SHARED: "{{.*}}ld{{(.exe)?}}"
 // CHECK-UBSAN-LINUX-SHARED-NOT: --export-dynamic
 // CHECK-UBSAN-LINUX-SHARED-NOT: --dynamic-list
 
@@ -647,7 +647,7 @@
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-LSAN-LINUX
 //
-// CHECK-LSAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld"
+// CHECK-LSAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-LSAN-LINUX-NOT: "-lc"
 // CHECK-LSAN-LINUX: libclang_rt.lsan.a"
 // CHECK-LSAN-LINUX: "-lpthread"
@@ -660,7 +660,7 @@
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-LSAN-NO-LINK-RUNTIME-LINUX
 //
-// CHECK-LSAN-NO-LINK-RUNTIME-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld"
+// CHECK-LSAN-NO-LINK-RUNTIME-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 
 // RUN: %clang -### %s 2>&1 \
 // RUN:  --target=x86_64-unknown-linux -fuse-ld=ld -fsanitize=leak -fsanitize-coverage=func \
@@ -668,7 +668,7 @@
 // RUN:  --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-LSAN-COV-LINUX
 //
-// CHECK-LSAN-COV-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld"
+// CHECK-LSAN-COV-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-LSAN-COV-LINUX-NOT: "-lc"
 // CHECK-LSAN-COV-LINUX: libclang_rt.lsan.a
 // CHECK-LSAV-COV-LINUX: libclang_rt.lsan-x86_64.a"
@@ -681,7 +681,7 @@
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-LSAN-ASAN-LINUX
-// CHECK-LSAN-ASAN-LINUX: "{{.*}}ld"
+// CHECK-LSAN-ASAN-LINUX: "{{.*}}ld{{(.exe)?}}"
 // CHECK-LSAN-ASAN-LINUX: libclang_rt.asan_static
 // CHECK-LSAN-ASAN-LINUX: libclang_rt.asan
 // CHECK-LSAN-ASAN-LINUX: "--dynamic-list={{.*}}libclang_rt.asan.a.syms"
@@ -691,7 +691,7 @@
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-COV-LINUX
-// CHECK-ASAN-COV-LINUX: "{{.*}}ld"
+// CHECK-ASAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}"
 // CHECK-ASAN-COV-LINUX: libclang_rt.asan_static
 // CHECK-ASAN-COV-LINUX: libclang_rt.asan
 // CHECK-ASAN-COV-LINUX: "--dynamic-list={{.*}}libclang_rt.asan.a.syms"
@@ -704,7 +704,7 @@
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-MSAN-COV-LINUX
-// CHECK-MSAN-COV-LINUX: "{{.*}}ld"
+// CHECK-MSAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}"
 // CHECK-MSAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.msan.a" "--no-whole-archive"
 // CHECK-MSAN-COV-LINUX: "--dynamic-list={{.*}}libclang_rt.msan.a.syms"
 // CHECK-MSAN-COV-LINUX-NOT: "-lstdc++"
@@ -716,7 +716,7 @@
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-DFSAN-COV-LINUX
-// CHECK-DFSAN-COV-LINUX: "{{.*}}ld"
+// CHECK-DFSAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}"
 // CHECK-DFSAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.dfsan.a" "--no-whole-archive"
 // CHECK-DFSAN-COV-LINUX-NOT: "-lstdc++"
 // CHECK-DFSAN-COV-LINUX: "-lpthread"
@@ -727,7 +727,7 @@
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-UBSAN-COV-LINUX
-// CHECK-UBSAN-COV-LINUX: "{{.*}}ld"
+// CHECK-UBSAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}"
 // CHECK-UBSAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive"
 // CHECK-UBSAN-COV-LINUX-NOT: "-lstdc++"
 // CHECK-UBSAN-COV-LINUX: "-lpthread"
@@ -738,7 +738,7 @@
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-COV-LINUX
-// CHECK-COV-LINUX: "{{.*}}ld"
+// CHECK-COV-LINUX: "{{.*}}ld{{(.exe)?}}"
 // CHECK-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive"
 // CHECK-COV-LINUX-NOT: "-lstdc++"
 // CHECK-COV-LINUX: "-lpthread"
@@ -750,7 +750,7 @@
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-NSAN-LINUX
 //
-// CHECK-NSAN-LINUX: "{{.*}}ld"
+// CHECK-NSAN-LINUX: "{{.*}}ld{{(.exe)?}}"
 // CHECK-NSAN-LINUX-NOT: "-lc"
 // CHECK-NSAN-LINUX: libclang_rt.nsan.a"
 // CHECK-NSAN-LINUX: "-lpthread" "-lrt" "-lm" "-ldl" "-lresolv"
@@ -778,7 +778,7 @@
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-CFI-LINUX
-// CHECK-CFI-LINUX: "{{.*}}ld"
+// CHECK-CFI-LINUX: "{{.*}}ld{{(.exe)?}}"
 
 // CFI with diagnostics links the UBSan runtime.
 // RUN: not %clang -fsanitize=cfi -fno-sanitize-trap=cfi -fsanitize-recover=cfi \
@@ -787,7 +787,7 @@
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-CFI-DIAG-LINUX
-// CHECK-CFI-DIAG-LINUX: "{{.*}}ld"
+// CHECK-CFI-DIAG-LINUX: "{{.*}}ld{{(.exe)?}}"
 // CHECK-CFI-DIAG-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive"
 
 // Cross-DSO CFI links the CFI runtime.
@@ -796,7 +796,7 @@
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-CFI-CROSS-DSO-LINUX
-// CHECK-CFI-CROSS-DSO-LINUX: "{{.*}}ld"
+// CHECK-CFI-CROSS-DSO-LINUX: "{{.*}}ld{{(.exe)?}}"
 // CHECK-CFI-CROSS-DSO-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.cfi.a" "--no-whole-archive"
 // CHECK-CFI-CROSS-DSO-LINUX: -export-dynamic
 
@@ -807,7 +807,7 @@
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-CFI-CROSS-DSO-DIAG-LINUX
-// CHECK-CFI-CROSS-DSO-DIAG-LINUX: "{{.*}}ld"
+// CHECK-CFI-CROSS-DSO-DIAG-LINUX: "{{.*}}ld{{(.exe)?}}"
 // CHECK-CFI-CROSS-DSO-DIAG-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.cfi_diag.a" "--no-whole-archive"
 // CHECK-CFI-CROSS-DSO-DIAG-LINUX: -export-dynamic
 
@@ -836,7 +836,7 @@
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-DARWIN106-CXX
-// CHECK-ASAN-DARWIN106-CXX: "{{.*}}ld"
+// CHECK-ASAN-DARWIN106-CXX: "{{.*}}ld{{(.exe)?}}"
 // CHECK-ASAN-DARWIN106-CXX: libclang_rt.asan_osx_dynamic.dylib
 // CHECK-ASAN-DARWIN106-CXX-NOT: -lc++abi
 
@@ -846,7 +846,7 @@
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-LSAN-DARWIN106-CXX
-// CHECK-LSAN-DARWIN106-CXX: "{{.*}}ld"
+// CHECK-LSAN-DARWIN106-CXX: "{{.*}}ld{{(.exe)?}}"
 // CHECK-LSAN-DARWIN106-CXX: libclang_rt.lsan_osx_dynamic.dylib
 // CHECK-LSAN-DARWIN106-CXX-NOT: -lc++abi
 
@@ -856,7 +856,7 @@
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-SAFESTACK-LINUX
 //
-// CHECK-SAFESTACK-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld"
+// CHECK-SAFESTACK-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-SAFESTACK-LINUX-NOT: "-lc"
 // CHECK-SAFESTACK-LINUX-NOT: whole-archive
 // CHECK-SAFESTACK-LINUX: "-u" "__safestack_init"
@@ -869,7 +869,7 @@
 // RUN:     --target=x86_64-unknown-linux -fuse-ld=ld \
 // RUN:   | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-LINUX-X86-64
 // CHECK-SHADOWCALLSTACK-LINUX-X86-64-NOT: error:
-// CHECK-SHADOWCALLSTACK-LINUX-X86-64: "{{(.*[^-.0-9A-Z_a-z])?}}ld"
+// CHECK-SHADOWCALLSTACK-LINUX-X86-64: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 
 // RUN: not %clang -fsanitize=shadow-call-stack -### %s 2>&1 \
 // RUN:     --target=aarch64-unknown-linux -fuse-ld=ld \
@@ -886,7 +886,7 @@
 // RUN:     --target=riscv64-unknown-linux -fuse-ld=ld \
 // RUN:   | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-LINUX-RISCV64
 // CHECK-SHADOWCALLSTACK-LINUX-RISCV64-NOT: error:
-// CHECK-SHADOWCALLSTACK-LINUX-RISCV64: "{{(.*[^-.0-9A-Z_a-z])?}}ld"
+// CHECK-SHADOWCALLSTACK-LINUX-RISCV64: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 
 // RUN: %clang -target riscv64-linux-android -fsanitize=shadow-call-stack %s -### 2>&1 \
 // RUN:   | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-ANDROID-RISCV64
@@ -906,7 +906,7 @@
 // RUN:     --target=arm64-unknown-ios -fuse-ld=ld \
 // RUN:   | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-LINUX-AARCH64-X18
 // CHECK-SHADOWCALLSTACK-LINUX-AARCH64-X18-NOT: error:
-// CHECK-SHADOWCALLSTACK-LINUX-AARCH64-X18: "{{(.*[^-.0-9A-Z_a-z])?}}ld"
+// CHECK-SHADOWCALLSTACK-LINUX-AARCH64-X18: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 
 // RUN: %clang -fsanitize=shadow-call-stack -### %s 2>&1 \
 // RUN:     --target=aarch64-unknown-linux-android -fuse-ld=ld \
@@ -923,7 +923,7 @@
 // RUN:     -fsanitize=safe-stack --target=x86_64-unknown-linux -fuse-ld=ld \
 // RUN:   | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-SAFESTACK
 // CHECK-SHADOWCALLSTACK-SAFESTACK-NOT: error:
-// CHECK-SHADOWCALLSTACK-SAFESTACK: "{{(.*[^-.0-9A-Z_a-z])?}}ld"
+// CHECK-SHADOWCALLSTACK-SAFESTACK: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-SHADOWCALLSTACK-SAFESTACK: libclang_rt.safestack.a
 
 // RUN: not %clang -fsanitize=cfi -fsanitize-stats -### %s 2>&1 \
@@ -931,7 +931,7 @@
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-CFI-STATS-LINUX
-// CHECK-CFI-STATS-LINUX: "{{.*}}ld"
+// CHECK-CFI-STATS-LINUX: "{{.*}}ld{{(.exe)?}}"
 // CHECK-CFI-STATS-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.stats_client.a" "--no-whole-archive"
 // CHECK-CFI-STATS-LINUX-NOT: "--whole-archive"
 // CHECK-CFI-STATS-LINUX: "{{[^"]*}}libclang_rt.stats.a"
@@ -940,7 +940,7 @@
 // RUN:     --target=x86_64-apple-darwin -fuse-ld=ld \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-CFI-STATS-DARWIN
-// CHECK-CFI-STATS-DARWIN: "{{.*}}ld"
+// CHECK-CFI-STATS-DARWIN: "{{.*}}ld{{(.exe)?}}"
 // CHECK-CFI-STATS-DARWIN: "{{[^"]*}}libclang_rt.stats_client_osx.a"
 // CHECK-CFI-STATS-DARWIN: "{{[^"]*}}libclang_rt.stats_osx_dynamic.dylib"
 
@@ -1066,7 +1066,7 @@
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-SCUDO-LINUX
-// CHECK-SCUDO-LINUX: "{{.*}}ld"
+// CHECK-SCUDO-LINUX: "{{.*}}ld{{(.exe)?}}"
 // CHECK-SCUDO-LINUX: "--whole-archive" "{{.*}}libclang_rt.scudo_standalone.a" "--no-whole-archive"
 // CHECK-SCUDO-LINUX-NOT: "-lstdc++"
 // CHECK-SCUDO-LINUX: "-lpthread"
@@ -1079,7 +1079,7 @@
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-SCUDO-SHARED-LINUX
 //
-// CHECK-SCUDO-SHARED-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld"
+// CHECK-SCUDO-SHARED-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-SCUDO-SHARED-LINUX-NOT: "-lc"
 // CHECK-SCUDO-SHARED-LINUX: libclang_rt.scudo_standalone.so"
 // CHECK-SCUDO-SHARED-LINUX-NOT: "-lpthread"
@@ -1122,7 +1122,7 @@
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-HWASAN-X86-64-LINUX
 //
-// CHECK-HWASAN-X86-64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld"
+// CHECK-HWASAN-X86-64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-HWASAN-X86-64-LINUX-NOT: "-lc"
 // CHECK-HWASAN-X86-64-LINUX: libclang_rt.hwasan.a"
 // CHECK-HWASAN-X86-64-LINUX-NOT: "--export-dynamic"
@@ -1139,7 +1139,7 @@
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-SHARED-HWASAN-X86-64-LINUX
 //
-// CHECK-SHARED-HWASAN-X86-64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld"
+// CHECK-SHARED-HWASAN-X86-64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-SHARED-HWASAN-X86-64-LINUX-NOT: "-lc"
 // CHECK-SHARED-HWASAN-X86-64-LINUX: libclang_rt.hwasan.so"
 // CHECK-SHARED-HWASAN-X86-64-LINUX: libclang_rt.hwasan-preinit.a"
@@ -1156,7 +1156,7 @@
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-DSO-SHARED-HWASAN-X86-64-LINUX
 //
-// CHECK-DSO-SHARED-HWASAN-X86-64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld"
+// CHECK-DSO-SHARED-HWASAN-X86-64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-DSO-SHARED-HWASAN-X86-64-LINUX-NOT: "-lc"
 // CHECK-DSO-SHARED-HWASAN-X86-64-LINUX: libclang_rt.hwasan.so"
 // CHECK-DSO-SHARED-HWASAN-X86-64-LINUX-NOT: "-lpthread"
@@ -1172,7 +1172,7 @@
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-HWASAN-AARCH64-LINUX
 //
-// CHECK-HWASAN-AARCH64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld"
+// CHECK-HWASAN-AARCH64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-HWASAN-AARCH64-LINUX-NOT: "-lc"
 // CHECK-HWASAN-AARCH64-LINUX: libclang_rt.hwasan.a"
 // CHECK-HWASAN-AARCH64-LINUX-NOT: "--export-dynamic"
@@ -1190,7 +1190,7 @@
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-SHARED-HWASAN-AARCH64-LINUX
 //
-// CHECK-SHARED-HWASAN-AARCH64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld"
+// CHECK-SHARED-HWASAN-AARCH64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-SHARED-HWASAN-AARCH64-LINUX-NOT: "-lc"
 // CHECK-SHARED-HWASAN-AARCH64-LINUX: libclang_rt.hwasan.so"
 // CHECK-SHARED-HWASAN-AARCH64-LINUX: libclang_rt.hwasan-preinit.a"
@@ -1207,7 +1207,7 @@
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | %{filecheck} --check-prefix=CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX
 //
-// CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld"
+// CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX-NOT: "-lc"
 // CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX: libclang_rt.hwasan.so"
 // CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX-NOT: "-lpthread"

From 1557eeda738d7dbe51d2f52fce28a1fd6f5844ce Mon Sep 17 00:00:00 2001
From: quic_hchandel <165007698+hchandel@users.noreply.github.com>
Date: Sun, 29 Dec 2024 11:14:12 +0530
Subject: [PATCH 145/567] [RISCV] Add Qualcomm uC Xqciac (Load-Store Address
 calculation) extension (#121037)

This extension adds 3 instructions that perform load-store address
calculation.

The current spec can be found at:
https://github.com/quic/riscv-unified-db/releases/latest

This patch adds assembler only support.

---------

Co-authored-by: Harsh Chandel 
Co-authored-by: Sudharsan Veeravalli 
---
 .../Driver/print-supported-extensions-riscv.c |  1 +
 llvm/docs/RISCVUsage.rst                      |  3 ++
 llvm/docs/ReleaseNotes.md                     |  2 +
 .../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 12 +++++
 .../RISCV/Disassembler/RISCVDisassembler.cpp  |  6 +++
 .../Target/RISCV/MCTargetDesc/RISCVBaseInfo.h |  1 +
 llvm/lib/Target/RISCV/RISCVFeatures.td        |  8 +++
 llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td   | 38 ++++++++++++++
 llvm/lib/TargetParser/RISCVISAInfo.cpp        |  2 +-
 llvm/test/CodeGen/RISCV/attributes.ll         |  2 +
 llvm/test/MC/RISCV/xqciac-invalid.s           | 43 ++++++++++++++++
 llvm/test/MC/RISCV/xqciac-valid.s             | 49 +++++++++++++++++++
 .../TargetParser/RISCVISAInfoTest.cpp         |  5 +-
 13 files changed, 169 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/MC/RISCV/xqciac-invalid.s
 create mode 100644 llvm/test/MC/RISCV/xqciac-valid.s

diff --git a/clang/test/Driver/print-supported-extensions-riscv.c b/clang/test/Driver/print-supported-extensions-riscv.c
index 8344c1aa39973..8e46690cce5a6 100644
--- a/clang/test/Driver/print-supported-extensions-riscv.c
+++ b/clang/test/Driver/print-supported-extensions-riscv.c
@@ -189,6 +189,7 @@
 // CHECK-NEXT:     ssctr                1.0       'Ssctr' (Control Transfer Records Supervisor Level)
 // CHECK-NEXT:     svukte               0.3       'Svukte' (Address-Independent Latency of User-Mode Faults to Supervisor Addresses)
 // CHECK-NEXT:     xqcia                0.2       'Xqcia' (Qualcomm uC Arithmetic Extension)
+// CHECK-NEXT:     xqciac               0.2       'Xqciac' (Qualcomm uC Load-Store Address Calculation Extension)
 // CHECK-NEXT:     xqcics               0.2       'Xqcics' (Qualcomm uC Conditional Select Extension)
 // CHECK-NEXT:     xqcicsr              0.2       'Xqcicsr' (Qualcomm uC CSR Extension)
 // CHECK-NEXT:     xqcilsm              0.2       'Xqcilsm' (Qualcomm uC Load Store Multiple Extension)
diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst
index f6a0dd4bf2383..22600f5720553 100644
--- a/llvm/docs/RISCVUsage.rst
+++ b/llvm/docs/RISCVUsage.rst
@@ -429,6 +429,9 @@ The current vendor extensions supported are:
 ``experimental-Xqcia``
   LLVM implements `version 0.2 of the Qualcomm uC Arithmetic extension specification `__ by Qualcomm.  All instructions are prefixed with `qc.` as described in the specification. These instructions are only available for riscv32.
 
+``experimental-Xqciac``
+  LLVM implements `version 0.2 of the Qualcomm uC Load-Store Address Calculation extension specification `__ by Qualcomm.  All instructions are prefixed with `qc.` as described in the specification. These instructions are only available for riscv32.
+
 ``experimental-Xqcics``
   LLVM implements `version 0.2 of the Qualcomm uC Conditional Select extension specification `__ by Qualcomm.  All instructions are prefixed with `qc.` as described in the specification. These instructions are only available for riscv32.
 
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index 5999f78f7e067..99a93b0467602 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -224,6 +224,8 @@ Changes to the RISC-V Backend
   extension.
 * Adds experimental assembler support for the Qualcomm uC 'Xqcia` (Arithmetic)
   extension.
+* Adds experimental assembler support for the Qualcomm uC 'Xqciac` (Load-Store Address Calculation)
+  extension.
 * Adds experimental assembler support for the Qualcomm uC 'Xqcics` (Conditonal Select)
   extension.
 * Adds experimental assembler support for the Qualcomm uC 'Xqcilsm` (Load Store Multiple)
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 9dcf2e973e6c5..4c1fd5aa41e2b 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -734,6 +734,16 @@ struct RISCVOperand final : public MCParsedAsmOperand {
            VK == RISCVMCExpr::VK_RISCV_None;
   }
 
+  bool isUImm5GT3() const {
+    if (!isImm())
+      return false;
+    RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
+    int64_t Imm;
+    bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
+    return IsConstantImm && isUInt<5>(Imm) && (Imm > 3) &&
+           VK == RISCVMCExpr::VK_RISCV_None;
+  }
+
   bool isUImm8GE32() const {
     int64_t Imm;
     RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
@@ -1520,6 +1530,8 @@ bool RISCVAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
     return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 5) - 1);
   case Match_InvalidUImm5NonZero:
     return generateImmOutOfRangeError(Operands, ErrorInfo, 1, (1 << 5) - 1);
+  case Match_InvalidUImm5GT3:
+    return generateImmOutOfRangeError(Operands, ErrorInfo, 4, (1 << 5) - 1);
   case Match_InvalidUImm6:
     return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 6) - 1);
   case Match_InvalidUImm7:
diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index 99017195185fd..57443d3f38e3c 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -692,6 +692,9 @@ DecodeStatus RISCVDisassembler::getInstruction32(MCInst &MI, uint64_t &Size,
                         "Qualcomm uC Conditional Select custom opcode table");
   TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXqcilsm, DecoderTableXqcilsm32,
                         "Qualcomm uC Load Store Multiple custom opcode table");
+  TRY_TO_DECODE_FEATURE(
+      RISCV::FeatureVendorXqciac, DecoderTableXqciac32,
+      "Qualcomm uC Load-Store Address Calculation custom opcode table");
   TRY_TO_DECODE(true, DecoderTable32, "RISCV32 table");
 
   return MCDisassembler::Fail;
@@ -718,6 +721,9 @@ DecodeStatus RISCVDisassembler::getInstruction16(MCInst &MI, uint64_t &Size,
   TRY_TO_DECODE_FEATURE(
       RISCV::FeatureStdExtZcmp, DecoderTableRVZcmp16,
       "Zcmp table (16-bit Push/Pop & Double Move Instructions)");
+  TRY_TO_DECODE_FEATURE(
+      RISCV::FeatureVendorXqciac, DecoderTableXqciac16,
+      "Qualcomm uC Load-Store Address Calculation custom 16bit opcode table");
   TRY_TO_DECODE_AND_ADD_SP(STI.hasFeature(RISCV::FeatureVendorXwchc),
                            DecoderTableXwchc16,
                            "WCH QingKe XW custom opcode table");
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
index b9f4db065f215..7fb5fc7a83130 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
@@ -302,6 +302,7 @@ enum OperandType : unsigned {
   OPERAND_UIMM4,
   OPERAND_UIMM5,
   OPERAND_UIMM5_NONZERO,
+  OPERAND_UIMM5_GT3,
   OPERAND_UIMM5_LSB0,
   OPERAND_UIMM6,
   OPERAND_UIMM6_LSB0,
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index dfc5658806abb..916b140c5bde7 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1274,6 +1274,14 @@ def HasVendorXqcilsm
       AssemblerPredicate<(all_of FeatureVendorXqcilsm),
                          "'Xqcilsm' (Qualcomm uC Load Store Multiple Extension)">;
 
+def FeatureVendorXqciac
+    : RISCVExperimentalExtension<0, 2, "Qualcomm uC Load-Store Address Calculation Extension",
+                                 [FeatureStdExtZca]>;
+def HasVendorXqciac
+    : Predicate<"Subtarget->hasVendorXqciac()">,
+      AssemblerPredicate<(all_of FeatureVendorXqciac),
+                         "'Xqciac' (Qualcomm uC Load-Store Address Calculation Extension)">;
+
 //===----------------------------------------------------------------------===//
 // LLVM specific features and extensions
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
index 05b559178bfe6..ca73fbccd9d2d 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
@@ -21,6 +21,13 @@ def uimm5nonzero : RISCVOp,
   let OperandType = "OPERAND_UIMM5_NONZERO";
 }
 
+def uimm5gt3 : RISCVOp, ImmLeaf 3) && isUInt<5>(Imm);}]> {
+  let ParserMatchClass = UImmAsmOperand<5, "GT3">;
+  let DecoderMethod = "decodeUImmOperand<5>";
+  let OperandType = "OPERAND_UIMM5_GT3";
+}
+
 def uimm11 : RISCVUImmLeafOp<11>;
 
 //===----------------------------------------------------------------------===//
@@ -184,6 +191,37 @@ let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
 } // hasSideEffects = 0, mayLoad = 0, mayStore = 0
 } // Predicates = [HasVendorXqcia, IsRV32], DecoderNamespace = "Xqcia"
 
+let Predicates = [HasVendorXqciac, IsRV32], DecoderNamespace = "Xqciac" in {
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
+  def QC_C_MULADDI : RVInst16CL<0b001, 0b10, (outs GPRC:$rd_wb),
+                               (ins GPRC:$rd, GPRC:$rs1, uimm5:$uimm),
+                               "qc.c.muladdi", "$rd, $rs1, $uimm"> {
+    let Constraints = "$rd = $rd_wb";
+    bits<5> uimm;
+
+    let Inst{12-10} = uimm{3-1};
+    let Inst{6} = uimm{0};
+    let Inst{5} = uimm{4};
+  }
+
+  def QC_MULADDI : RVInstI<0b110, OPC_CUSTOM_0, (outs GPRNoX0:$rd_wb),
+                           (ins GPRNoX0:$rd, GPRNoX0:$rs1, simm12:$imm12),
+                           "qc.muladdi", "$rd, $rs1, $imm12"> {
+    let Constraints = "$rd = $rd_wb";
+  }
+
+  def QC_SHLADD : RVInstRBase<0b011, OPC_CUSTOM_0, (outs GPRNoX0:$rd),
+                              (ins GPRNoX0:$rs1, GPRNoX0:$rs2, uimm5gt3:$shamt),
+                              "qc.shladd", "$rd, $rs1, $rs2, $shamt"> {
+    bits<5> shamt;
+
+    let Inst{31-30} = 0b01;
+    let Inst{29-25} = shamt;
+  }
+
+} // hasSideEffects = 0, mayLoad = 0, mayStore = 0
+} // Predicates = [HasVendorXqciac, IsRV32], DecoderNamespace = "Xqciac"
+
 let Predicates = [HasVendorXqcics, IsRV32], DecoderNamespace = "Xqcics" in {
   def QC_SELECTIIEQ : QCISELECTIICC <0b010, "qc.selectiieq">;
   def QC_SELECTIINE : QCISELECTIICC <0b011, "qc.selectiine">;
diff --git a/llvm/lib/TargetParser/RISCVISAInfo.cpp b/llvm/lib/TargetParser/RISCVISAInfo.cpp
index cafc9d304e83a..e4e459a77b5f8 100644
--- a/llvm/lib/TargetParser/RISCVISAInfo.cpp
+++ b/llvm/lib/TargetParser/RISCVISAInfo.cpp
@@ -742,7 +742,7 @@ Error RISCVISAInfo::checkDependency() {
   bool HasZvl = MinVLen != 0;
   bool HasZcmt = Exts.count("zcmt") != 0;
   static constexpr StringLiteral XqciExts[] = {
-      {"xqcia"}, {"xqcics"}, {"xqcicsr"}, {"xqcilsm"}, {"xqcisls"}};
+      {"xqcia"}, {"xqciac"}, {"xqcics"}, {"xqcicsr"}, {"xqcilsm"}, {"xqcisls"}};
 
   if (HasI && HasE)
     return getIncompatibleError("i", "e");
diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll
index f63bc944ccf22..3f2b2c9470783 100644
--- a/llvm/test/CodeGen/RISCV/attributes.ll
+++ b/llvm/test/CodeGen/RISCV/attributes.ll
@@ -82,6 +82,7 @@
 ; RUN: llc -mtriple=riscv32 -mattr=+xtheadsync %s -o - | FileCheck --check-prefix=RV32XTHEADSYNC %s
 ; RUN: llc -mtriple=riscv32 -mattr=+xwchc %s -o - | FileCheck --check-prefix=RV32XWCHC %s
 ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcia %s -o - | FileCheck --check-prefix=RV32XQCIA %s
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqciac %s -o - | FileCheck --check-prefix=RV32XQCIAC %s
 ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcics %s -o - | FileCheck --check-prefix=RV32XQCICS %s
 ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicsr %s -o - | FileCheck --check-prefix=RV32XQCICSR %s
 ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcilsm %s -o - | FileCheck --check-prefix=RV32XQCILSM %s
@@ -391,6 +392,7 @@
 ; RV32XTHEADSYNC: .attribute 5, "rv32i2p1_xtheadsync1p0"
 ; RV32XWCHC: .attribute 5, "rv32i2p1_xwchc2p2"
 ; RV32XQCIA: .attribute 5, "rv32i2p1_xqcia0p2"
+; RV32XQCIAC: .attribute 5, "rv32i2p1_zca1p0_xqciac0p2"
 ; RV32XQCICS: .attribute 5, "rv32i2p1_xqcics0p2"
 ; RV32XQCICSR: .attribute 5, "rv32i2p1_xqcicsr0p2"
 ; RV32XQCILSM: .attribute 5, "rv32i2p1_xqcilsm0p2"
diff --git a/llvm/test/MC/RISCV/xqciac-invalid.s b/llvm/test/MC/RISCV/xqciac-invalid.s
new file mode 100644
index 0000000000000..4e0182aff9cc2
--- /dev/null
+++ b/llvm/test/MC/RISCV/xqciac-invalid.s
@@ -0,0 +1,43 @@
+# Xqciac - Qualcomm uC Load-Store Address Calculation Extension
+# RUN: not llvm-mc -triple riscv32 -mattr=+experimental-xqciac < %s 2>&1 \
+# RUN:     | FileCheck -check-prefixes=CHECK,CHECK-IMM %s
+# RUN: not llvm-mc -triple riscv32 -mattr=-experimental-xqciac < %s 2>&1 \
+# RUN:     | FileCheck -check-prefixes=CHECK,CHECK-EXT %s
+
+# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction
+qc.c.muladdi x5, x10, 4
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.c.muladdi x15
+
+# CHECK-IMM: :[[@LINE+1]]:24: error: immediate must be an integer in the range [0, 31]
+qc.c.muladdi x10, x15, 32
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciac' (Qualcomm uC Load-Store Address Calculation Extension)
+qc.c.muladdi x10, x15, 20
+
+
+# CHECK: :[[@LINE+1]]:12: error: invalid operand for instruction
+qc.muladdi x0, x10, 1048577
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.muladdi x10
+
+# CHECK-IMM: :[[@LINE+1]]:22: error: operand must be a symbol with %lo/%pcrel_lo/%tprel_lo modifier or an integer in the range [-2048, 2047]
+qc.muladdi x10, x15, 8589934592
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciac' (Qualcomm uC Load-Store Address Calculation Extension)
+qc.muladdi x10, x15, 577
+
+
+# CHECK: :[[@LINE+1]]:11: error: invalid operand for instruction
+qc.shladd 0, x10, 1048577
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.shladd x10
+
+# CHECK-IMM: :[[@LINE+1]]:26: error: immediate must be an integer in the range [4, 31]
+qc.shladd x10, x15, x11, 2
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciac' (Qualcomm uC Load-Store Address Calculation Extension)
+qc.shladd x10, x15, x11, 5
diff --git a/llvm/test/MC/RISCV/xqciac-valid.s b/llvm/test/MC/RISCV/xqciac-valid.s
new file mode 100644
index 0000000000000..6e97d8cc447e1
--- /dev/null
+++ b/llvm/test/MC/RISCV/xqciac-valid.s
@@ -0,0 +1,49 @@
+# Xqciac - Qualcomm uC Load-Store Address Calculation Extension
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqciac -riscv-no-aliases -show-encoding \
+# RUN:     | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
+# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqciac < %s \
+# RUN:     | llvm-objdump --mattr=+experimental-xqciac -M no-aliases --no-print-imm-hex -d - \
+# RUN:     | FileCheck -check-prefix=CHECK-INST %s
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqciac -show-encoding \
+# RUN:     | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
+# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqciac < %s \
+# RUN:     | llvm-objdump --mattr=+experimental-xqciac --no-print-imm-hex -d - \
+# RUN:     | FileCheck -check-prefix=CHECK-INST %s
+
+# CHECK-INST: qc.c.muladdi    a0, a1, 0
+# CHECK-ENC: encoding: [0x8a,0x21]
+qc.c.muladdi x10, x11, 0
+
+# CHECK-INST: qc.c.muladdi    a0, a1, 31
+# CHECK-ENC: encoding: [0xea,0x3d]
+qc.c.muladdi x10, x11, 31
+
+# CHECK-INST: qc.c.muladdi    a0, a1, 16
+# CHECK-ENC: encoding: [0xaa,0x21]
+qc.c.muladdi x10, x11, 16
+
+
+# CHECK-INST: qc.muladdi      tp, t0, 1234
+# CHECK-ENC: encoding: [0x0b,0xe2,0x22,0x4d]
+qc.muladdi x4, x5, 1234
+
+# CHECK-INST: qc.muladdi      a0, a1, -2048
+# CHECK-ENC: encoding: [0x0b,0xe5,0x05,0x80]
+qc.muladdi x10, x11, -2048
+
+# CHECK-INST: qc.muladdi      a0, a1, 2047
+# CHECK-ENC: encoding: [0x0b,0xe5,0xf5,0x7f]
+qc.muladdi x10, x11, 2047
+
+
+# CHECK-INST: qc.shladd       tp, t0, t1, 12
+# CHECK-ENC: encoding: [0x0b,0xb2,0x62,0x58]
+qc.shladd x4, x5, x6, 12
+
+# CHECK-INST: qc.shladd       a0, a1, a2, 4
+# CHECK-ENC: encoding: [0x0b,0xb5,0xc5,0x48]
+qc.shladd x10, x11, x12, 4
+
+# CHECK-INST: qc.shladd       a0, a1, a2, 31
+# CHECK-ENC: encoding: [0x0b,0xb5,0xc5,0x7e]
+qc.shladd x10, x11, x12, 31
diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
index ed334f00eb93a..176cf82ac34b1 100644
--- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
+++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
@@ -654,8 +654,8 @@ TEST(ParseArchString, RejectsConflictingExtensions) {
   }
 
   for (StringRef Input :
-       {"rv64i_xqcisls0p2", "rv64i_xqcia0p2", "rv64i_xqcicsr0p2",
-        "rv64i_xqcilsm0p2", "rv64i_xqcics0p2"}) {
+       {"rv64i_xqcisls0p2", "rv64i_xqcia0p2", "rv64i_xqciac0p2",
+        "rv64i_xqcicsr0p2", "rv64i_xqcilsm0p2", "rv64i_xqcics0p2"}) {
     EXPECT_THAT(
         toString(RISCVISAInfo::parseArchString(Input, true).takeError()),
         ::testing::EndsWith(" is only supported for 'rv32'"));
@@ -1113,6 +1113,7 @@ Experimental extensions
     ssctr                1.0
     svukte               0.3
     xqcia                0.2
+    xqciac               0.2
     xqcics               0.2
     xqcicsr              0.2
     xqcilsm              0.2

From 66dd7e63d8a1860c11f3b9acf011cad4e18f0195 Mon Sep 17 00:00:00 2001
From: Fangrui Song 
Date: Sat, 28 Dec 2024 23:51:44 -0800
Subject: [PATCH 146/567] Simplify enablePostRAScheduler and test
 enablePostRAScheduler() early

---
 llvm/lib/CodeGen/PostRASchedulerList.cpp | 39 ++++++++----------------
 1 file changed, 12 insertions(+), 27 deletions(-)

diff --git a/llvm/lib/CodeGen/PostRASchedulerList.cpp b/llvm/lib/CodeGen/PostRASchedulerList.cpp
index 2f7cfdd275b4f..badfd9a68d6a0 100644
--- a/llvm/lib/CodeGen/PostRASchedulerList.cpp
+++ b/llvm/lib/CodeGen/PostRASchedulerList.cpp
@@ -98,12 +98,6 @@ namespace {
     }
 
     bool runOnMachineFunction(MachineFunction &Fn) override;
-
-  private:
-    bool enablePostRAScheduler(
-        const TargetSubtargetInfo &ST, CodeGenOptLevel OptLevel,
-        TargetSubtargetInfo::AntiDepBreakMode &Mode,
-        TargetSubtargetInfo::RegClassVector &CriticalPathRCs) const;
   };
   char PostRAScheduler::ID = 0;
 
@@ -259,13 +253,8 @@ LLVM_DUMP_METHOD void SchedulePostRATDList::dumpSchedule() const {
 }
 #endif
 
-bool PostRAScheduler::enablePostRAScheduler(
-    const TargetSubtargetInfo &ST, CodeGenOptLevel OptLevel,
-    TargetSubtargetInfo::AntiDepBreakMode &Mode,
-    TargetSubtargetInfo::RegClassVector &CriticalPathRCs) const {
-  Mode = ST.getAntiDepBreakMode();
-  ST.getCriticalPathRCs(CriticalPathRCs);
-
+static bool enablePostRAScheduler(const TargetSubtargetInfo &ST,
+                                  CodeGenOptLevel OptLevel) {
   // Check for explicit enable/disable of post-ra scheduling.
   if (EnablePostRAScheduler.getPosition() > 0)
     return EnablePostRAScheduler;
@@ -278,24 +267,17 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) {
   if (skipFunction(Fn.getFunction()))
     return false;
 
-  TII = Fn.getSubtarget().getInstrInfo();
-  MachineLoopInfo &MLI = getAnalysis().getLI();
-  AliasAnalysis *AA = &getAnalysis().getAAResults();
+  const auto &Subtarget = Fn.getSubtarget();
   TargetPassConfig *PassConfig = &getAnalysis();
-
-  RegClassInfo.runOnMachineFunction(Fn);
-
-  TargetSubtargetInfo::AntiDepBreakMode AntiDepMode =
-    TargetSubtargetInfo::ANTIDEP_NONE;
-  SmallVector CriticalPathRCs;
-
   // Check that post-RA scheduling is enabled for this target.
-  // This may upgrade the AntiDepMode.
-  if (!enablePostRAScheduler(Fn.getSubtarget(), PassConfig->getOptLevel(),
-                             AntiDepMode, CriticalPathRCs))
+  if (!enablePostRAScheduler(Subtarget, PassConfig->getOptLevel()))
     return false;
 
-  // Check for antidep breaking override...
+  TII = Subtarget.getInstrInfo();
+  MachineLoopInfo &MLI = getAnalysis().getLI();
+  AliasAnalysis *AA = &getAnalysis().getAAResults();
+  TargetSubtargetInfo::AntiDepBreakMode AntiDepMode =
+      Subtarget.getAntiDepBreakMode();
   if (EnableAntiDepBreaking.getPosition() > 0) {
     AntiDepMode = (EnableAntiDepBreaking == "all")
       ? TargetSubtargetInfo::ANTIDEP_ALL
@@ -303,6 +285,9 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) {
          ? TargetSubtargetInfo::ANTIDEP_CRITICAL
          : TargetSubtargetInfo::ANTIDEP_NONE);
   }
+  SmallVector CriticalPathRCs;
+  Subtarget.getCriticalPathRCs(CriticalPathRCs);
+  RegClassInfo.runOnMachineFunction(Fn);
 
   LLVM_DEBUG(dbgs() << "PostRAScheduler\n");
 

From ff29f38c02eb425a6809dec26f221cea3d99b57c Mon Sep 17 00:00:00 2001
From: Jacek Caban 
Date: Sun, 29 Dec 2024 11:43:45 +0100
Subject: [PATCH 147/567] [LLD][COFF] Store and validate load config in
 SymbolTable (#120324)

Improve diagnostics for invalid load configurations.
---
 lld/COFF/Driver.cpp                      |  1 +
 lld/COFF/SymbolTable.cpp                 | 47 +++++++++++++++++++++++
 lld/COFF/SymbolTable.h                   |  4 ++
 lld/COFF/Writer.cpp                      | 49 +++++-------------------
 lld/test/COFF/guard-warnings.s           |  2 +-
 lld/test/COFF/loadcfg-short.test         | 33 ++++++++++++++++
 lld/test/COFF/loadcfg-size.test          | 33 ++++++++++++++++
 lld/test/COFF/loadcfg-uninitialized.test | 33 ++++++++++++++++
 8 files changed, 162 insertions(+), 40 deletions(-)
 create mode 100644 lld/test/COFF/loadcfg-short.test
 create mode 100644 lld/test/COFF/loadcfg-size.test
 create mode 100644 lld/test/COFF/loadcfg-uninitialized.test

diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp
index a4d6b94812d93..ae5b095fba772 100644
--- a/lld/COFF/Driver.cpp
+++ b/lld/COFF/Driver.cpp
@@ -2824,6 +2824,7 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) {
 
   if (ctx.symtabEC)
     ctx.symtabEC->initializeECThunks();
+  ctx.forEachSymtab([](SymbolTable &symtab) { symtab.initializeLoadConfig(); });
 
   // Identify unreferenced COMDAT sections.
   if (config->doGC) {
diff --git a/lld/COFF/SymbolTable.cpp b/lld/COFF/SymbolTable.cpp
index b1d375b226583..fc78afb4c9e40 100644
--- a/lld/COFF/SymbolTable.cpp
+++ b/lld/COFF/SymbolTable.cpp
@@ -27,6 +27,7 @@
 #include 
 
 using namespace llvm;
+using namespace llvm::support;
 
 namespace lld::coff {
 
@@ -596,6 +597,52 @@ std::pair SymbolTable::insert(StringRef name, InputFile *file) {
   return result;
 }
 
+void SymbolTable::initializeLoadConfig() {
+  auto sym =
+      dyn_cast_or_null(findUnderscore("_load_config_used"));
+  if (!sym) {
+    if (ctx.config.guardCF != GuardCFLevel::Off)
+      Warn(ctx)
+          << "Control Flow Guard is enabled but '_load_config_used' is missing";
+    if (ctx.config.dependentLoadFlags)
+      Warn(ctx) << "_load_config_used not found, /dependentloadflag will have "
+                   "no effect";
+    return;
+  }
+
+  SectionChunk *sc = sym->getChunk();
+  if (!sc->hasData) {
+    Err(ctx) << "_load_config_used points to uninitialized data";
+    return;
+  }
+  uint64_t offsetInChunk = sym->getValue();
+  if (offsetInChunk + 4 > sc->getSize()) {
+    Err(ctx) << "_load_config_used section chunk is too small";
+    return;
+  }
+
+  ArrayRef secContents = sc->getContents();
+  loadConfigSize =
+      *reinterpret_cast(&secContents[offsetInChunk]);
+  if (offsetInChunk + loadConfigSize > sc->getSize()) {
+    Err(ctx) << "_load_config_used specifies a size larger than its containing "
+                "section chunk";
+    return;
+  }
+
+  uint32_t expectedAlign = ctx.config.is64() ? 8 : 4;
+  if (sc->getAlignment() < expectedAlign)
+    Warn(ctx) << "'_load_config_used' is misaligned (expected alignment to be "
+              << expectedAlign << " bytes, got " << sc->getAlignment()
+              << " instead)";
+  else if (!isAligned(Align(expectedAlign), offsetInChunk))
+    Warn(ctx) << "'_load_config_used' is misaligned (section offset is 0x"
+              << Twine::utohexstr(sym->getValue()) << " not aligned to "
+              << expectedAlign << " bytes)";
+
+  loadConfigSym = sym;
+}
+
 void SymbolTable::addEntryThunk(Symbol *from, Symbol *to) {
   entryThunks.push_back({from, to});
 }
diff --git a/lld/COFF/SymbolTable.h b/lld/COFF/SymbolTable.h
index b694893b903aa..8548a6d036a9d 100644
--- a/lld/COFF/SymbolTable.h
+++ b/lld/COFF/SymbolTable.h
@@ -138,6 +138,10 @@ class SymbolTable {
       callback(pair.second);
   }
 
+  DefinedRegular *loadConfigSym = nullptr;
+  uint32_t loadConfigSize = 0;
+  void initializeLoadConfig();
+
 private:
   /// Given a name without "__imp_" prefix, returns a defined symbol
   /// with the "__imp_" prefix, if it exists.
diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp
index 3c6112b7fc89a..e6b239c83dd4a 100644
--- a/lld/COFF/Writer.cpp
+++ b/lld/COFF/Writer.cpp
@@ -1837,22 +1837,10 @@ template  void Writer::writeHeader() {
     dir[DEBUG_DIRECTORY].RelativeVirtualAddress = debugDirectory->getRVA();
     dir[DEBUG_DIRECTORY].Size = debugDirectory->getSize();
   }
-  if (Symbol *sym = ctx.symtab.findUnderscore("_load_config_used")) {
-    if (auto *b = dyn_cast(sym)) {
-      SectionChunk *sc = b->getChunk();
-      assert(b->getRVA() >= sc->getRVA());
-      uint64_t offsetInChunk = b->getRVA() - sc->getRVA();
-      if (!sc->hasData || offsetInChunk + 4 > sc->getSize())
-        Fatal(ctx) << "_load_config_used is malformed";
-
-      ArrayRef secContents = sc->getContents();
-      uint32_t loadConfigSize =
-          *reinterpret_cast(&secContents[offsetInChunk]);
-      if (offsetInChunk + loadConfigSize > sc->getSize())
-        Fatal(ctx) << "_load_config_used is too large";
-      dir[LOAD_CONFIG_TABLE].RelativeVirtualAddress = b->getRVA();
-      dir[LOAD_CONFIG_TABLE].Size = loadConfigSize;
-    }
+  if (ctx.symtab.loadConfigSym) {
+    dir[LOAD_CONFIG_TABLE].RelativeVirtualAddress =
+        ctx.symtab.loadConfigSym->getRVA();
+    dir[LOAD_CONFIG_TABLE].Size = ctx.symtab.loadConfigSize;
   }
   if (!delayIdata.empty()) {
     dir[DELAY_IMPORT_DESCRIPTOR].RelativeVirtualAddress =
@@ -2649,31 +2637,14 @@ void Writer::fixTlsAlignment() {
 }
 
 void Writer::prepareLoadConfig() {
-  Symbol *sym = ctx.symtab.findUnderscore("_load_config_used");
-  auto *b = cast_if_present(sym);
-  if (!b) {
-    if (ctx.config.guardCF != GuardCFLevel::Off)
-      Warn(ctx)
-          << "Control Flow Guard is enabled but '_load_config_used' is missing";
-    if (ctx.config.dependentLoadFlags)
-      Warn(ctx) << "_load_config_used not found, /dependentloadflag will have "
-                   "no effect";
+  if (!ctx.symtab.loadConfigSym)
     return;
-  }
 
-  OutputSection *sec = ctx.getOutputSection(b->getChunk());
-  uint8_t *buf = buffer->getBufferStart();
-  uint8_t *secBuf = buf + sec->getFileOff();
-  uint8_t *symBuf = secBuf + (b->getRVA() - sec->getRVA());
-  uint32_t expectedAlign = ctx.config.is64() ? 8 : 4;
-  if (b->getChunk()->getAlignment() < expectedAlign)
-    Warn(ctx) << "'_load_config_used' is misaligned (expected alignment to be "
-              << expectedAlign << " bytes, got "
-              << b->getChunk()->getAlignment() << " instead)";
-  else if (!isAligned(Align(expectedAlign), b->getRVA()))
-    Warn(ctx) << "'_load_config_used' is misaligned (RVA is 0x"
-              << Twine::utohexstr(b->getRVA()) << " not aligned to "
-              << expectedAlign << " bytes)";
+  OutputSection *sec =
+      ctx.getOutputSection(ctx.symtab.loadConfigSym->getChunk());
+  uint8_t *secBuf = buffer->getBufferStart() + sec->getFileOff();
+  uint8_t *symBuf =
+      secBuf + (ctx.symtab.loadConfigSym->getRVA() - sec->getRVA());
 
   if (ctx.config.is64())
     prepareLoadConfig(reinterpret_cast(symBuf));
diff --git a/lld/test/COFF/guard-warnings.s b/lld/test/COFF/guard-warnings.s
index 77448ee95c009..092871597d1f8 100644
--- a/lld/test/COFF/guard-warnings.s
+++ b/lld/test/COFF/guard-warnings.s
@@ -38,7 +38,7 @@
 
 # RUN: llvm-mc -triple x86_64-windows-msvc %t/loadcfg-misaligned2.s -filetype=obj -o %t/loadcfg-misaligned2.obj
 # RUN: lld-link %t/main.obj %t/loadcfg-misaligned2.obj -guard:cf,longjmp,ehcont -out:%t-misaligned2.exe -entry:main %basename_t-exp.lib 2>&1 | FileCheck %s --check-prefix=WARN_ALIGN2
-# WARN_ALIGN2: warning: '_load_config_used' is misaligned (RVA is 0x{{[0-9A-F]*}}2 not aligned to 8 bytes)
+# WARN_ALIGN2: warning: '_load_config_used' is misaligned (section offset is 0x{{[0-9A-F]*}}2 not aligned to 8 bytes)
 
 # RUN: llvm-mc -triple x86_64-windows-msvc %t/loadcfg-full.s -filetype=obj -o %t/loadcfg-full.obj
 # RUN: lld-link %t/main.obj %t/loadcfg-full.obj -guard:cf,longjmp,ehcont -out:%t.exe -entry:main %basename_t-exp.lib 2>&1 | FileCheck %s --check-prefix=NOWARN --allow-empty
diff --git a/lld/test/COFF/loadcfg-short.test b/lld/test/COFF/loadcfg-short.test
new file mode 100644
index 0000000000000..dd4d4389ddc1c
--- /dev/null
+++ b/lld/test/COFF/loadcfg-short.test
@@ -0,0 +1,33 @@
+# RUN: yaml2obj %s -o %t.obj
+# RUN: not lld-link -out:%t.dll %t.obj -dll -noentry 2>&1 | FileCheck %s
+# CHECK: lld-link: error: _load_config_used section chunk is too small
+
+--- !COFF
+header:
+  Machine:         IMAGE_FILE_MACHINE_AMD64
+  Characteristics: []
+sections:
+  - Name:            .rdata
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ ]
+    Alignment:       16
+    SectionData:     '030000'
+symbols:
+  - Name:            .rdata
+    Value:           0
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition:
+      Length:          112
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        0
+      Number:          3
+  - Name:            _load_config_used
+    Value:           0
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_EXTERNAL
+...
diff --git a/lld/test/COFF/loadcfg-size.test b/lld/test/COFF/loadcfg-size.test
new file mode 100644
index 0000000000000..871590f2328b6
--- /dev/null
+++ b/lld/test/COFF/loadcfg-size.test
@@ -0,0 +1,33 @@
+# RUN: yaml2obj %s -o %t.obj
+# RUN: not lld-link -out:%t.dll %t.obj -dll -noentry 2>&1 | FileCheck %s
+# CHECK: lld-link: error: _load_config_used specifies a size larger than its containing section chunk
+
+--- !COFF
+header:
+  Machine:         IMAGE_FILE_MACHINE_AMD64
+  Characteristics: []
+sections:
+  - Name:            .rdata
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ ]
+    Alignment:       16
+    SectionData:     '0c00000000000000'
+symbols:
+  - Name:            .rdata
+    Value:           0
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition:
+      Length:          112
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        0
+      Number:          3
+  - Name:            _load_config_used
+    Value:           0
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_EXTERNAL
+...
diff --git a/lld/test/COFF/loadcfg-uninitialized.test b/lld/test/COFF/loadcfg-uninitialized.test
new file mode 100644
index 0000000000000..5f956bc7224bc
--- /dev/null
+++ b/lld/test/COFF/loadcfg-uninitialized.test
@@ -0,0 +1,33 @@
+# RUN: yaml2obj %s -o %t.obj
+# RUN: not lld-link -out:%t.dll %t.obj -dll -noentry 2>&1 | FileCheck %s
+# CHECK: lld-link: error: _load_config_used points to uninitialized data
+
+--- !COFF
+header:
+  Machine:         IMAGE_FILE_MACHINE_AMD64
+  Characteristics: []
+sections:
+  - Name:            .rdata
+    Characteristics: [ IMAGE_SCN_CNT_UNINITIALIZED_DATA, IMAGE_SCN_MEM_READ ]
+    Alignment:       16
+    VirtualSize:     0x140
+symbols:
+  - Name:            .rdata
+    Value:           0
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition:
+      Length:          112
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        0
+      Number:          3
+  - Name:            _load_config_used
+    Value:           0
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_EXTERNAL
+...

From e45e091b90896023584b303539bd8ae16d8932b3 Mon Sep 17 00:00:00 2001
From: Congcong Cai 
Date: Sun, 29 Dec 2024 19:22:25 +0800
Subject: [PATCH 148/567] [clang-tidy] swap
 cppcoreguidelines-narrowing-conversions and bugprone-narrowing-conversions
 (#120245)

According to #116591.
> Coding guidelines should "cherry-pick" (and possibly
configure/harden/make more strict) base checks.
We should move narrowing conversion to bugprone and keep alias in
cppcoreguidelines
---
 .../bugprone/BugproneTidyModule.cpp           |   4 +-
 .../clang-tidy/bugprone/CMakeLists.txt        |   1 +
 .../NarrowingConversionsCheck.cpp             |   4 +-
 .../NarrowingConversionsCheck.h               |  12 +-
 .../cppcoreguidelines/CMakeLists.txt          |   1 -
 .../CppCoreGuidelinesTidyModule.cpp           |   4 +-
 clang-tools-extra/docs/ReleaseNotes.rst       |   7 +
 .../checks/bugprone/narrowing-conversions.rst | 126 ++++++++++++++++-
 .../narrowing-conversions.rst                 | 127 +-----------------
 .../docs/clang-tidy/checks/list.rst           |   4 +-
 .../narrowing-conversions-bitfields.cpp       |   4 +-
 ...-conversions-equivalentbitwidth-option.cpp |  20 +--
 ...sions-ignoreconversionfromtypes-option.cpp |  28 ++--
 ...rrowing-conversions-intemplates-option.cpp |  12 +-
 .../narrowing-conversions-long-is-32bits.cpp  |   6 +-
 ...versions-narrowingfloatingpoint-option.cpp |  24 ++--
 ...ng-conversions-narrowinginteger-option.cpp |  16 +--
 ...narrowingintegertofloatingpoint-option.cpp |  19 +++
 ...rowing-conversions-pedanticmode-option.cpp |  12 +-
 .../narrowing-conversions-unsigned-char.cpp   |  22 +--
 .../narrowing-conversions.cpp                 |  98 +++++++-------
 ...narrowingintegertofloatingpoint-option.cpp |  19 ---
 22 files changed, 289 insertions(+), 281 deletions(-)
 rename clang-tools-extra/clang-tidy/{cppcoreguidelines => bugprone}/NarrowingConversionsCheck.cpp (99%)
 rename clang-tools-extra/clang-tidy/{cppcoreguidelines => bugprone}/NarrowingConversionsCheck.h (90%)
 rename clang-tools-extra/test/clang-tidy/checkers/{cppcoreguidelines => bugprone}/narrowing-conversions-bitfields.cpp (97%)
 rename clang-tools-extra/test/clang-tidy/checkers/{cppcoreguidelines => bugprone}/narrowing-conversions-equivalentbitwidth-option.cpp (66%)
 rename clang-tools-extra/test/clang-tidy/checkers/{cppcoreguidelines => bugprone}/narrowing-conversions-ignoreconversionfromtypes-option.cpp (75%)
 rename clang-tools-extra/test/clang-tidy/checkers/{cppcoreguidelines => bugprone}/narrowing-conversions-intemplates-option.cpp (73%)
 rename clang-tools-extra/test/clang-tidy/checkers/{cppcoreguidelines => bugprone}/narrowing-conversions-long-is-32bits.cpp (84%)
 rename clang-tools-extra/test/clang-tidy/checkers/{cppcoreguidelines => bugprone}/narrowing-conversions-narrowingfloatingpoint-option.cpp (70%)
 rename clang-tools-extra/test/clang-tidy/checkers/{cppcoreguidelines => bugprone}/narrowing-conversions-narrowinginteger-option.cpp (59%)
 create mode 100644 clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-narrowingintegertofloatingpoint-option.cpp
 rename clang-tools-extra/test/clang-tidy/checkers/{cppcoreguidelines => bugprone}/narrowing-conversions-pedanticmode-option.cpp (52%)
 rename clang-tools-extra/test/clang-tidy/checkers/{cppcoreguidelines => bugprone}/narrowing-conversions-unsigned-char.cpp (80%)
 rename clang-tools-extra/test/clang-tidy/checkers/{cppcoreguidelines => bugprone}/narrowing-conversions.cpp (77%)
 delete mode 100644 clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-narrowingintegertofloatingpoint-option.cpp

diff --git a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp
index 33ac65e715ce8..b27616f3dcc65 100644
--- a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp
@@ -9,7 +9,6 @@
 #include "../ClangTidy.h"
 #include "../ClangTidyModule.h"
 #include "../ClangTidyModuleRegistry.h"
-#include "../cppcoreguidelines/NarrowingConversionsCheck.h"
 #include "ArgumentCommentCheck.h"
 #include "AssertSideEffectCheck.h"
 #include "AssignmentInIfConditionCheck.h"
@@ -47,6 +46,7 @@
 #include "MultiLevelImplicitPointerConversionCheck.h"
 #include "MultipleNewInOneExpressionCheck.h"
 #include "MultipleStatementMacroCheck.h"
+#include "NarrowingConversionsCheck.h"
 #include "NoEscapeCheck.h"
 #include "NonZeroEnumToBoolConversionCheck.h"
 #include "NondeterministicPointerIterationOrderCheck.h"
@@ -183,7 +183,7 @@ class BugproneModule : public ClangTidyModule {
         "bugprone-pointer-arithmetic-on-polymorphic-object");
     CheckFactories.registerCheck<RedundantBranchConditionCheck>(
         "bugprone-redundant-branch-condition");
-    CheckFactories.registerCheck<cppcoreguidelines::NarrowingConversionsCheck>(
+    CheckFactories.registerCheck<NarrowingConversionsCheck>(
         "bugprone-narrowing-conversions");
     CheckFactories.registerCheck<NoEscapeCheck>("bugprone-no-escape");
     CheckFactories.registerCheck(
diff --git a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt
index 13adad7c3dadb..73ab22381631c 100644
--- a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt
+++ b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt
@@ -42,6 +42,7 @@ add_clang_library(clangTidyBugproneModule STATIC
   MultiLevelImplicitPointerConversionCheck.cpp
   MultipleNewInOneExpressionCheck.cpp
   MultipleStatementMacroCheck.cpp
+  NarrowingConversionsCheck.cpp
   NoEscapeCheck.cpp
   NonZeroEnumToBoolConversionCheck.cpp
   NondeterministicPointerIterationOrderCheck.cpp
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/NarrowingConversionsCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.cpp
similarity index 99%
rename from clang-tools-extra/clang-tidy/cppcoreguidelines/NarrowingConversionsCheck.cpp
rename to clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.cpp
index 45fef9471d521..a950704208c73 100644
--- a/clang-tools-extra/clang-tidy/cppcoreguidelines/NarrowingConversionsCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.cpp
@@ -22,7 +22,7 @@
 
 using namespace clang::ast_matchers;
 
-namespace clang::tidy::cppcoreguidelines {
+namespace clang::tidy::bugprone {
 
 namespace {
 
@@ -614,4 +614,4 @@ void NarrowingConversionsCheck::check(const MatchFinder::MatchResult &Result) {
     return handleImplicitCast(*Result.Context, *Cast);
   llvm_unreachable("must be binary operator or cast expression");
 }
-} // namespace clang::tidy::cppcoreguidelines
+} // namespace clang::tidy::bugprone
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/NarrowingConversionsCheck.h b/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.h
similarity index 90%
rename from clang-tools-extra/clang-tidy/cppcoreguidelines/NarrowingConversionsCheck.h
rename to clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.h
index 1add40b91778a..20403f920b925 100644
--- a/clang-tools-extra/clang-tidy/cppcoreguidelines/NarrowingConversionsCheck.h
+++ b/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.h
@@ -6,19 +6,19 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_NARROWING_CONVERSIONS_H
-#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_NARROWING_CONVERSIONS_H
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NARROWING_CONVERSIONS_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NARROWING_CONVERSIONS_H
 
 #include "../ClangTidyCheck.h"
 
-namespace clang::tidy::cppcoreguidelines {
+namespace clang::tidy::bugprone {
 
 /// Checks for narrowing conversions, e.g:
 ///   int i = 0;
 ///   i += 0.1;
 ///
 /// For the user-facing documentation see:
-/// http://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/narrowing-conversions.html
+/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/narrowing-conversions.html
 class NarrowingConversionsCheck : public ClangTidyCheck {
 public:
   NarrowingConversionsCheck(StringRef Name, ClangTidyContext *Context);
@@ -104,6 +104,6 @@ class NarrowingConversionsCheck : public ClangTidyCheck {
   const bool PedanticMode;
 };
 
-} // namespace clang::tidy::cppcoreguidelines
+} // namespace clang::tidy::bugprone
 
-#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_NARROWING_CONVERSIONS_H
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NARROWING_CONVERSIONS_H
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/CMakeLists.txt b/clang-tools-extra/clang-tidy/cppcoreguidelines/CMakeLists.txt
index 07bb89ec7937a..1f4107c0b35e7 100644
--- a/clang-tools-extra/clang-tidy/cppcoreguidelines/CMakeLists.txt
+++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/CMakeLists.txt
@@ -16,7 +16,6 @@ add_clang_library(clangTidyCppCoreGuidelinesModule STATIC
   MacroUsageCheck.cpp
   MisleadingCaptureDefaultByValueCheck.cpp
   MissingStdForwardCheck.cpp
-  NarrowingConversionsCheck.cpp
   NoMallocCheck.cpp
   NoSuspendWithLockCheck.cpp
   OwningMemoryCheck.cpp
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp
index e9f0201615616..6adef04264347 100644
--- a/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp
@@ -9,6 +9,7 @@
 #include "../ClangTidy.h"
 #include "../ClangTidyModule.h"
 #include "../ClangTidyModuleRegistry.h"
+#include "../bugprone/NarrowingConversionsCheck.h"
 #include "../misc/NonPrivateMemberVariablesInClassesCheck.h"
 #include "../misc/UnconventionalAssignOperatorCheck.h"
 #include "../modernize/AvoidCArraysCheck.h"
@@ -30,7 +31,6 @@
 #include "MacroUsageCheck.h"
 #include "MisleadingCaptureDefaultByValueCheck.h"
 #include "MissingStdForwardCheck.h"
-#include "NarrowingConversionsCheck.h"
 #include "NoMallocCheck.h"
 #include "NoSuspendWithLockCheck.h"
 #include "OwningMemoryCheck.h"
@@ -87,7 +87,7 @@ class CppCoreGuidelinesModule : public ClangTidyModule {
         "cppcoreguidelines-misleading-capture-default-by-value");
     CheckFactories.registerCheck<MissingStdForwardCheck>(
         "cppcoreguidelines-missing-std-forward");
-    CheckFactories.registerCheck<NarrowingConversionsCheck>(
+    CheckFactories.registerCheck<bugprone::NarrowingConversionsCheck>(
         "cppcoreguidelines-narrowing-conversions");
     CheckFactories.registerCheck<NoMallocCheck>("cppcoreguidelines-no-malloc");
     CheckFactories.registerCheck(
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index fabd0cc78ac64..3cab440155250 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -361,6 +361,13 @@ Removed checks
 Miscellaneous
 ^^^^^^^^^^^^^
 
+- The :doc:`bugprone-narrowing-conversions <clang-tidy/checks/bugprone/narrowing-conversions>`
+  check is no longer an alias of :doc:`cppcoreguidelines-narrowing-conversions
+  <clang-tidy/checks/cppcoreguidelines/narrowing-conversions>`. Instead,
+  :doc:`cppcoreguidelines-narrowing-conversions
+  <clang-tidy/checks/cppcoreguidelines/narrowing-conversions>` is now an alias
+  of :doc:`bugprone-narrowing-conversions <clang-tidy/checks/bugprone/narrowing-conversions>`.
+
 Improvements to include-fixer
 -----------------------------
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/narrowing-conversions.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/narrowing-conversions.rst
index f4bb40b341bcd..1a1217ed5a21c 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/narrowing-conversions.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/narrowing-conversions.rst
@@ -1,10 +1,126 @@
 .. title:: clang-tidy - bugprone-narrowing-conversions
-.. meta::
-   :http-equiv=refresh: 5;URL=../cppcoreguidelines/narrowing-conversions.html
 
 bugprone-narrowing-conversions
 ==============================
 
-The bugprone-narrowing-conversions check is an alias, please see
-:doc:`cppcoreguidelines-narrowing-conversions <../cppcoreguidelines/narrowing-conversions>`
-for more information.
+`cppcoreguidelines-narrowing-conversions` redirects here as an alias for this check.
+
+Checks for silent narrowing conversions, e.g: ``int i = 0; i += 0.1;``. While
+the issue is obvious in this former example, it might not be so in the
+following: ``void MyClass::f(double d) { int_member_ += d; }``.
+
+We flag narrowing conversions from:
+ - an integer to a narrower integer (e.g. ``char`` to ``unsigned char``)
+   if WarnOnIntegerNarrowingConversion Option is set,
+ - an integer to a narrower floating-point (e.g. ``uint64_t`` to ``float``)
+   if WarnOnIntegerToFloatingPointNarrowingConversion Option is set,
+ - a floating-point to an integer (e.g. ``double`` to ``int``),
+ - a floating-point to a narrower floating-point (e.g. ``double`` to ``float``)
+   if WarnOnFloatingPointNarrowingConversion Option is set.
+
+This check will flag:
+ - All narrowing conversions that are not marked by an explicit cast (c-style or
+   ``static_cast``). For example: ``int i = 0; i += 0.1;``,
+   ``void f(int); f(0.1);``,
+ - All applications of binary operators with a narrowing conversions.
+   For example: ``int i; i+= 0.1;``.
+
+Arithmetic with smaller integer types than ``int`` trigger implicit conversions,
+as explained under `"Integral Promotion" on cppreference.com
+<https://en.cppreference.com/w/cpp/language/implicit_conversion#Integral_promotion>`_.
+This check diagnoses more instances of narrowing than the compiler warning
+`-Wconversion` does. The example below demonstrates this behavior.
+
+.. code-block:: c++
+
+  // The following function definition demonstrates usage of arithmetic with
+  // integer types smaller than `int` and how the narrowing conversion happens
+  // implicitly.
+  void computation(short argument1, short argument2) {
+    // Arithmetic written by humans:
+    short result = argument1 + argument2;
+    // Arithmetic actually performed by C++:
+    short result = static_cast<short>(static_cast<int>(argument1) + static_cast<int>(argument2));
+  }
+
+  void recommended_resolution(short argument1, short argument2) {
+    short result = argument1 + argument2;
+    //           ^ warning: narrowing conversion from 'int' to signed type 'short' is implementation-defined
+
+    // The cppcoreguidelines recommend to resolve this issue by using the GSL
+    // in one of two ways. Either by a cast that throws if a loss of precision
+    // would occur.
+    short result = gsl::narrow<short>(argument1 + argument2);
+    // Or it can be resolved without checking the result risking invalid results.
+    short result = gsl::narrow_cast<short>(argument1 + argument2);
+
+    // A classical `static_cast` will silence the warning as well if the GSL
+    // is not available.
+    short result = static_cast<short>(argument1 + argument2);
+  }
+
+Options
+-------
+
+.. option:: WarnOnIntegerNarrowingConversion
+
+    When `true`, the check will warn on narrowing integer conversion
+    (e.g. ``int`` to ``size_t``). `true` by default.
+
+.. option:: WarnOnIntegerToFloatingPointNarrowingConversion
+
+    When `true`, the check will warn on narrowing integer to floating-point
+    conversion (e.g. ``size_t`` to ``double``). `true` by default.
+
+.. option:: WarnOnFloatingPointNarrowingConversion
+
+    When `true`, the check will warn on narrowing floating point conversion
+    (e.g. ``double`` to ``float``). `true` by default.
+
+.. option:: WarnWithinTemplateInstantiation
+
+    When `true`, the check will warn on narrowing conversions within template
+    instantiations. `false` by default.
+
+.. option:: WarnOnEquivalentBitWidth
+
+    When `true`, the check will warn on narrowing conversions that arise from
+    casting between types of equivalent bit width. (e.g.
+    `int n = uint(0);` or `long long n = double(0);`) `true` by default.
+
+.. option:: IgnoreConversionFromTypes
+
+   Narrowing conversions from any type in this semicolon-separated list will be
+   ignored. This may be useful to weed out commonly occurring, but less commonly
+   problematic assignments such as `int n = std::vector().size();` or
+   `int n = std::difference(it1, it2);`. The default list is empty, but one
+   suggested list for a legacy codebase would be
+   `size_t;ptrdiff_t;size_type;difference_type`.
+
+.. option:: PedanticMode
+
+    When `true`, the check will warn on assigning a floating point constant
+    to an integer value even if the floating point value is exactly
+    representable in the destination type (e.g. ``int i = 1.0;``).
+    `false` by default.
+
+FAQ
+---
+
+ - What does "narrowing conversion from 'int' to 'float'" mean?
+
+An IEEE754 Floating Point number can represent all integer values in the range
+[-2^PrecisionBits, 2^PrecisionBits] where PrecisionBits is the number of bits in
+the mantissa.
+
+For ``float`` this would be [-2^23, 2^23], where ``int`` can represent values in
+the range [-2^31, 2^31-1].
+
+ - What does "implementation-defined" mean?
+
+You may have encountered messages like "narrowing conversion from 'unsigned int'
+to signed type 'int' is implementation-defined".
+The C/C++ standard does not mandate two's complement for signed integers, and so
+the compiler is free to define what the semantics are for converting an unsigned
+integer to signed integer. Clang's implementation uses the two's complement
+format.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/narrowing-conversions.rst b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/narrowing-conversions.rst
index 7cc0b2809b458..ea24e870d32d4 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/narrowing-conversions.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/narrowing-conversions.rst
@@ -1,129 +1,14 @@
 .. title:: clang-tidy - cppcoreguidelines-narrowing-conversions
+.. meta::
+   :http-equiv=refresh: 5;URL=../bugprone/narrowing-conversions.html
 
 cppcoreguidelines-narrowing-conversions
 =======================================
 
-Checks for silent narrowing conversions, e.g: ``int i = 0; i += 0.1;``. While
-the issue is obvious in this former example, it might not be so in the
-following: ``void MyClass::f(double d) { int_member_ += d; }``.
-
-This check implements `ES.46
+This check implements part of  `ES.46
 <https://github.com/isocpp/CppCoreGuidelines/blob/master/CppCoreGuidelines.md#es46-avoid-lossy-narrowing-truncating-arithmetic-conversions>`_
 from the C++ Core Guidelines.
 
-We enforce only part of the guideline, more specifically, we flag narrowing conversions from:
- - an integer to a narrower integer (e.g. ``char`` to ``unsigned char``)
-   if WarnOnIntegerNarrowingConversion Option is set,
- - an integer to a narrower floating-point (e.g. ``uint64_t`` to ``float``)
-   if WarnOnIntegerToFloatingPointNarrowingConversion Option is set,
- - a floating-point to an integer (e.g. ``double`` to ``int``),
- - a floating-point to a narrower floating-point (e.g. ``double`` to ``float``)
-   if WarnOnFloatingPointNarrowingConversion Option is set.
-
-This check will flag:
- - All narrowing conversions that are not marked by an explicit cast (c-style or
-   ``static_cast``). For example: ``int i = 0; i += 0.1;``,
-   ``void f(int); f(0.1);``,
- - All applications of binary operators with a narrowing conversions.
-   For example: ``int i; i+= 0.1;``.
-
-Arithmetic with smaller integer types than ``int`` trigger implicit conversions,
-as explained under `"Integral Promotion" on cppreference.com
-<https://en.cppreference.com/w/cpp/language/implicit_conversion#Integral_promotion>`_.
-This check diagnoses more instances of narrowing than the compiler warning
-`-Wconversion` does. The example below demonstrates this behavior.
-
-.. code-block:: c++
-
-   // The following function definition demonstrates usage of arithmetic with
-   // integer types smaller than `int` and how the narrowing conversion happens
-   // implicitly.
-   void computation(short argument1, short argument2) {
-     // Arithmetic written by humans:
-     short result = argument1 + argument2;
-     // Arithmetic actually performed by C++:
-     short result = static_cast<short>(static_cast<int>(argument1) + static_cast<int>(argument2));
-   }
-
-   void recommended_resolution(short argument1, short argument2) {
-     short result = argument1 + argument2;
-     //           ^ warning: narrowing conversion from 'int' to signed type 'short' is implementation-defined
-
-     // The cppcoreguidelines recommend to resolve this issue by using the GSL
-     // in one of two ways. Either by a cast that throws if a loss of precision
-     // would occur.
-     short result = gsl::narrow<short>(argument1 + argument2);
-     // Or it can be resolved without checking the result risking invalid results.
-     short result = gsl::narrow_cast<short>(argument1 + argument2);
-
-     // A classical `static_cast` will silence the warning as well if the GSL
-     // is not available.
-     short result = static_cast<short>(argument1 + argument2);
-   }
-
-
-Options
--------
-
-.. option:: WarnOnIntegerNarrowingConversion
-
-    When `true`, the check will warn on narrowing integer conversion
-    (e.g. ``int`` to ``size_t``). `true` by default.
-
-.. option:: WarnOnIntegerToFloatingPointNarrowingConversion
-
-    When `true`, the check will warn on narrowing integer to floating-point
-    conversion (e.g. ``size_t`` to ``double``). `true` by default.
-
-.. option:: WarnOnFloatingPointNarrowingConversion
-
-    When `true`, the check will warn on narrowing floating point conversion
-    (e.g. ``double`` to ``float``). `true` by default.
-
-.. option:: WarnWithinTemplateInstantiation
-
-    When `true`, the check will warn on narrowing conversions within template
-    instantiations. `false` by default.
-
-.. option:: WarnOnEquivalentBitWidth
-
-    When `true`, the check will warn on narrowing conversions that arise from
-    casting between types of equivalent bit width. (e.g.
-    `int n = uint(0);` or `long long n = double(0);`) `true` by default.
-
-.. option:: IgnoreConversionFromTypes
-
-   Narrowing conversions from any type in this semicolon-separated list will be
-   ignored. This may be useful to weed out commonly occurring, but less commonly
-   problematic assignments such as `int n = std::vector().size();` or
-   `int n = std::difference(it1, it2);`. The default list is empty, but one
-   suggested list for a legacy codebase would be
-   `size_t;ptrdiff_t;size_type;difference_type`.
-
-.. option:: PedanticMode
-
-    When `true`, the check will warn on assigning a floating point constant
-    to an integer value even if the floating point value is exactly
-    representable in the destination type (e.g. ``int i = 1.0;``).
-    `false` by default.
-
-FAQ
----
-
- - What does "narrowing conversion from 'int' to 'float'" mean?
-
-An IEEE754 Floating Point number can represent all integer values in the range
-[-2^PrecisionBits, 2^PrecisionBits] where PrecisionBits is the number of bits in
-the mantissa.
-
-For ``float`` this would be [-2^23, 2^23], where ``int`` can represent values in
-the range [-2^31, 2^31-1].
-
- - What does "implementation-defined" mean?
-
-You may have encountered messages like "narrowing conversion from 'unsigned int'
-to signed type 'int' is implementation-defined".
-The C/C++ standard does not mandate two's complement for signed integers, and so
-the compiler is free to define what the semantics are for converting an unsigned
-integer to signed integer. Clang's implementation uses the two's complement
-format.
+The cppcoreguidelines-narrowing-conversions check is an alias, please see
+:doc:`bugprone-narrowing-conversions <../bugprone/narrowing-conversions>`
+for more information.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst
index 4d8853a0f6d86..e8f9b4e829634 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/list.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst
@@ -114,6 +114,7 @@ Clang-Tidy Checks
    :doc:`bugprone-multi-level-implicit-pointer-conversion `,
    :doc:`bugprone-multiple-new-in-one-expression `,
    :doc:`bugprone-multiple-statement-macro `,
+   :doc:`bugprone-narrowing-conversions <bugprone/narrowing-conversions>`,
    :doc:`bugprone-no-escape `,
    :doc:`bugprone-non-zero-enum-to-bool-conversion `,
    :doc:`bugprone-nondeterministic-pointer-iteration-order `,
@@ -190,7 +191,6 @@ Clang-Tidy Checks
    :doc:`cppcoreguidelines-macro-usage `,
    :doc:`cppcoreguidelines-misleading-capture-default-by-value `, "Yes"
    :doc:`cppcoreguidelines-missing-std-forward `,
-   :doc:`cppcoreguidelines-narrowing-conversions <cppcoreguidelines/narrowing-conversions>`,
    :doc:`cppcoreguidelines-no-malloc `,
    :doc:`cppcoreguidelines-no-suspend-with-lock `,
    :doc:`cppcoreguidelines-owning-memory `,
@@ -411,7 +411,6 @@ Check aliases
 .. csv-table::
    :header: "Name", "Redirect", "Offers fixes"
 
-   :doc:`bugprone-narrowing-conversions <bugprone/narrowing-conversions>`, :doc:`cppcoreguidelines-narrowing-conversions <cppcoreguidelines/narrowing-conversions>`,
    :doc:`cert-arr39-c `, :doc:`bugprone-sizeof-expression `,
    :doc:`cert-con36-c `, :doc:`bugprone-spuriously-wake-up-functions `,
    :doc:`cert-con54-cpp `, :doc:`bugprone-spuriously-wake-up-functions `,
@@ -541,6 +540,7 @@ Check aliases
    :doc:`cppcoreguidelines-c-copy-assignment-signature `, :doc:`misc-unconventional-assign-operator `,
    :doc:`cppcoreguidelines-explicit-virtual-functions `, :doc:`modernize-use-override `, "Yes"
    :doc:`cppcoreguidelines-macro-to-enum `, :doc:`modernize-macro-to-enum `, "Yes"
+   :doc:`cppcoreguidelines-narrowing-conversions <cppcoreguidelines/narrowing-conversions>`, :doc:`bugprone-narrowing-conversions <bugprone/narrowing-conversions>`,
    :doc:`cppcoreguidelines-noexcept-destructor `, :doc:`performance-noexcept-destructor `, "Yes"
    :doc:`cppcoreguidelines-noexcept-move-operations `, :doc:`performance-noexcept-move-constructor `, "Yes"
    :doc:`cppcoreguidelines-noexcept-swap `, :doc:`performance-noexcept-swap `, "Yes"
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-bitfields.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-bitfields.cpp
similarity index 97%
rename from clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-bitfields.cpp
rename to clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-bitfields.cpp
index 36fde38202efc..a7bb3c8d0c0c7 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-bitfields.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-bitfields.cpp
@@ -1,4 +1,4 @@
-// RUN: %check_clang_tidy %s cppcoreguidelines-narrowing-conversions %t \
+// RUN: %check_clang_tidy %s bugprone-narrowing-conversions %t \
 // RUN:   -std=c++17 -- -target x86_64-unknown-linux
 
 #define CHAR_BITS 8
@@ -31,7 +31,7 @@ struct CompleteBitfield {
 };
 
 int example_warning(unsigned x) {
-  // CHECK-MESSAGES: :[[@LINE+1]]:10: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE+1]]:10: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
   return x;
 }
 
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-equivalentbitwidth-option.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-equivalentbitwidth-option.cpp
similarity index 66%
rename from clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-equivalentbitwidth-option.cpp
rename to clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-equivalentbitwidth-option.cpp
index fb5c7e36eeb0d..0deb006711367 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-equivalentbitwidth-option.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-equivalentbitwidth-option.cpp
@@ -1,35 +1,35 @@
 // RUN: %check_clang_tidy -check-suffix=DEFAULT %s \
-// RUN: cppcoreguidelines-narrowing-conversions %t -- 
+// RUN: bugprone-narrowing-conversions %t -- 
 
 // RUN: %check_clang_tidy -check-suffix=DISABLED %s \
-// RUN: cppcoreguidelines-narrowing-conversions %t -- \
+// RUN: bugprone-narrowing-conversions %t -- \
 // RUN: -config='{CheckOptions: { \
-// RUN:   cppcoreguidelines-narrowing-conversions.WarnOnEquivalentBitWidth: 0}}'
+// RUN:   bugprone-narrowing-conversions.WarnOnEquivalentBitWidth: 0}}'
 
 void narrowing_equivalent_bitwidth() {
   int i;
   unsigned int ui;
   i = ui;
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
   // DISABLED: Warning disabled with WarnOnEquivalentBitWidth=0.
 
   float f;
   i = f;
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'float' to 'int' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'float' to 'int' [bugprone-narrowing-conversions]
   // DISABLED: Warning disabled with WarnOnEquivalentBitWidth=0.
 
   f = i;
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'int' to 'float' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'int' to 'float' [bugprone-narrowing-conversions]
   // DISABLED: Warning disabled with WarnOnEquivalentBitWidth=0.
 
   long long ll;
   double d;
   ll = d;
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:8: warning: narrowing conversion from 'double' to 'long long' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:8: warning: narrowing conversion from 'double' to 'long long' [bugprone-narrowing-conversions]
   // DISABLED: Warning disabled with WarnOnEquivalentBitWidth=0.
 
   d = ll;
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'long long' to 'double' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'long long' to 'double' [bugprone-narrowing-conversions]
   // DISABLED: Warning disabled with WarnOnEquivalentBitWidth=0.
 }
 
@@ -37,6 +37,6 @@ void most_narrowing_is_not_ok() {
   int i;
   long long ui;
   i = ui;
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'long long' to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
-  // CHECK-MESSAGES-DISABLED: :[[@LINE-2]]:7: warning: narrowing conversion from 'long long' to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'long long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
+  // CHECK-MESSAGES-DISABLED: :[[@LINE-2]]:7: warning: narrowing conversion from 'long long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
 }
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-ignoreconversionfromtypes-option.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-ignoreconversionfromtypes-option.cpp
similarity index 75%
rename from clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-ignoreconversionfromtypes-option.cpp
rename to clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-ignoreconversionfromtypes-option.cpp
index 91e908f535a0d..6d93f5d642b5e 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-ignoreconversionfromtypes-option.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-ignoreconversionfromtypes-option.cpp
@@ -1,10 +1,10 @@
 // RUN: %check_clang_tidy -check-suffix=DEFAULT %s \
-// RUN: cppcoreguidelines-narrowing-conversions %t --
+// RUN: bugprone-narrowing-conversions %t --
 
 // RUN: %check_clang_tidy -check-suffix=IGNORED %s \
-// RUN: cppcoreguidelines-narrowing-conversions %t -- \
+// RUN: bugprone-narrowing-conversions %t -- \
 // RUN: -config='{CheckOptions: { \
-// RUN:   cppcoreguidelines-narrowing-conversions.IgnoreConversionFromTypes: "global_size_t;nested_size_type;long" \
+// RUN:   bugprone-narrowing-conversions.IgnoreConversionFromTypes: "global_size_t;nested_size_type;long" \
 // RUN: }}'
 
 // We use global_size_t instead of 'size_t' because windows predefines size_t.
@@ -20,7 +20,7 @@ void narrowing_global_size_t() {
   int i;
   global_size_t j;
   i = j;
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'global_size_t' (aka 'long long') to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'global_size_t' (aka 'long long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
   // IGNORED: Warning is disabled with IgnoreConversionFromTypes=global_size_t.
 }
 
@@ -28,7 +28,7 @@ void narrowing_size_type() {
   int i;
   vector::nested_size_type j;
   i = j;
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'vector::nested_size_type' (aka 'long long') to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'vector::nested_size_type' (aka 'long long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
   // IGNORED: Warning is disabled with IgnoreConversionFromTypes=nested_size_type.
 }
 
@@ -36,11 +36,11 @@ void narrowing_size_method() {
   vector v;
   int i, j;
   i = v.size();
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'global_size_t' (aka 'long long') to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'global_size_t' (aka 'long long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
   // IGNORED: Warning is disabled with IgnoreConversionFromTypes=global_size_t.
 
   i = j + v.size();
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'global_size_t' (aka 'long long') to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'global_size_t' (aka 'long long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
   // IGNORED: Warning is disabled with IgnoreConversionFromTypes=global_size_t.
 }
 
@@ -49,7 +49,7 @@ void narrowing_size_method_binary_expr() {
   int j;
   vector v;
   i = j + v.size();
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'global_size_t' (aka 'long long') to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'global_size_t' (aka 'long long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
   // IGNORED: Warning is disabled with IgnoreConversionFromTypes=global_size_t.
 }
 
@@ -57,11 +57,11 @@ void narrowing_size_method_binary_op() {
   int i, j;
   vector v;
   i += v.size();
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:8: warning: narrowing conversion from 'global_size_t' (aka 'long long') to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:8: warning: narrowing conversion from 'global_size_t' (aka 'long long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
   // IGNORED: Warning is disabled with IgnoreConversionFromTypes=global_size_t.
 
   i += j + v.size();
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:8: warning: narrowing conversion from 'global_size_t' (aka 'long long') to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:8: warning: narrowing conversion from 'global_size_t' (aka 'long long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
   // IGNORED: Warning is disabled with IgnoreConversionFromTypes=global_size_t.
 }
 
@@ -69,13 +69,13 @@ void most_narrowing_is_not_ok() {
   int i;
   long long j;
   i = j;
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'long long' to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
-  // CHECK-MESSAGES-IGNORED: :[[@LINE-2]]:7: warning: narrowing conversion from 'long long' to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'long long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
+  // CHECK-MESSAGES-IGNORED: :[[@LINE-2]]:7: warning: narrowing conversion from 'long long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
 }
 
 void test_ignore_builtin_type_pr58809() {
   long x = 123;
   short y = x;
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:13: warning: narrowing conversion from 'long' to signed type 'short' is implementation-defined [cppcoreguidelines-narrowing-conversions]
-  // CHECK-MESSAGES-NOT-IGNORED: :[[@LINE-2]]:13: warning: narrowing conversion from 'long' to signed type 'short' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:13: warning: narrowing conversion from 'long' to signed type 'short' is implementation-defined [bugprone-narrowing-conversions]
+  // CHECK-MESSAGES-NOT-IGNORED: :[[@LINE-2]]:13: warning: narrowing conversion from 'long' to signed type 'short' is implementation-defined [bugprone-narrowing-conversions]
 }
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-intemplates-option.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-intemplates-option.cpp
similarity index 73%
rename from clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-intemplates-option.cpp
rename to clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-intemplates-option.cpp
index cb19ed78cce8a..625dc45abcbec 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-intemplates-option.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-intemplates-option.cpp
@@ -1,10 +1,10 @@
 // RUN: %check_clang_tidy -check-suffix=DEFAULT %s \
-// RUN: cppcoreguidelines-narrowing-conversions %t --
+// RUN: bugprone-narrowing-conversions %t --
 
 // RUN: %check_clang_tidy -check-suffix=WARN %s \
-// RUN: cppcoreguidelines-narrowing-conversions %t -- \
+// RUN: bugprone-narrowing-conversions %t -- \
 // RUN: -config='{CheckOptions: { \
-// RUN:   cppcoreguidelines-narrowing-conversions.WarnWithinTemplateInstantiation: 1 \
+// RUN:   bugprone-narrowing-conversions.WarnWithinTemplateInstantiation: 1 \
 // RUN: }}'
 
 template 
@@ -12,7 +12,7 @@ void assign_in_template(OrigType jj) {
   int ii;
   ii = jj;
   // DEFAULT: Warning disabled because WarnWithinTemplateInstantiation=0.
-  // CHECK-MESSAGES-WARN: :[[@LINE-2]]:8: warning: narrowing conversion from 'long long' to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES-WARN: :[[@LINE-2]]:8: warning: narrowing conversion from 'long long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
 }
 
 void narrow_inside_template_not_ok() {
@@ -23,8 +23,8 @@ void narrow_inside_template_not_ok() {
 void assign_outside_template(long long jj) {
   int ii;
   ii = jj;
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:8: warning: narrowing conversion from 'long long' to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
-  // CHECK-MESSAGES-WARN: :[[@LINE-2]]:8: warning: narrowing conversion from 'long long' to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:8: warning: narrowing conversion from 'long long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
+  // CHECK-MESSAGES-WARN: :[[@LINE-2]]:8: warning: narrowing conversion from 'long long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
 }
 
 void narrow_outside_template_not_ok() {
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-long-is-32bits.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-long-is-32bits.cpp
similarity index 84%
rename from clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-long-is-32bits.cpp
rename to clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-long-is-32bits.cpp
index dcf1848a30f66..8e801a0eeea37 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-long-is-32bits.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-long-is-32bits.cpp
@@ -1,4 +1,4 @@
-// RUN: %check_clang_tidy %s cppcoreguidelines-narrowing-conversions %t \
+// RUN: %check_clang_tidy %s bugprone-narrowing-conversions %t \
 // RUN: -- -- -target x86_64-unknown-linux -m32
 
 static_assert(sizeof(int) * 8 == 32, "int is 32-bits");
@@ -16,8 +16,8 @@ void narrow_integer_to_signed_integer_is_not_ok() {
 
   i = l;  // int and long are the same type.
   i = ll; // int64_t does not fit in an int32_t
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'long long' to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'long long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
   ll = ul;  // uint32_t fits into int64_t
   ll = ull; // uint64_t does not fit in an int64_t
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'unsigned long long' to signed type 'long long' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'unsigned long long' to signed type 'long long' is implementation-defined [bugprone-narrowing-conversions]
 }
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-narrowingfloatingpoint-option.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-narrowingfloatingpoint-option.cpp
similarity index 70%
rename from clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-narrowingfloatingpoint-option.cpp
rename to clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-narrowingfloatingpoint-option.cpp
index 6cad3204c18e4..9ded2f0923f4e 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-narrowingfloatingpoint-option.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-narrowingfloatingpoint-option.cpp
@@ -1,4 +1,4 @@
-// RUN: %check_clang_tidy %s cppcoreguidelines-narrowing-conversions %t \
+// RUN: %check_clang_tidy %s bugprone-narrowing-conversions %t \
 // RUN: -- -- -target x86_64-unknown-linux -fsigned-char
 
 namespace floats {
@@ -6,15 +6,15 @@ namespace floats {
 void narrow_constant_floating_point_to_int_not_ok(double d) {
   int i = 0;
   i += 0.5;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'double' to 'int' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'double' to 'int' [bugprone-narrowing-conversions]
   i += 0.5f;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'float' to 'int' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'float' to 'int' [bugprone-narrowing-conversions]
   i *= 0.5f;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'float' to 'int' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'float' to 'int' [bugprone-narrowing-conversions]
   i /= 0.5f;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'float' to 'int' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'float' to 'int' [bugprone-narrowing-conversions]
   i += (double)0.5f;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'double' to 'int' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'double' to 'int' [bugprone-narrowing-conversions]
   i += 2.0;
   i += 2.0f;
 }
@@ -28,11 +28,11 @@ float narrow_double_to_float_return() {
 void narrow_double_to_float_not_ok(double d) {
   float f;
   f = d;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'double' to 'float' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'double' to 'float' [bugprone-narrowing-conversions]
   f = 15_double;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'double' to 'float' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'double' to 'float' [bugprone-narrowing-conversions]
   f += d;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'double' to 'float' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'double' to 'float' [bugprone-narrowing-conversions]
   f = narrow_double_to_float_return();
 }
 
@@ -46,11 +46,11 @@ void narrow_fp_constants() {
   f = __builtin_nanf("0");    // float NaN is not narrowing.
 
   f = __builtin_huge_val(); // max double is not within-range of float.
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from constant 'double' to 'float' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from constant 'double' to 'float' [bugprone-narrowing-conversions]
   f = -__builtin_huge_val(); // -max double is not within-range of float.
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from constant 'double' to 'float' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from constant 'double' to 'float' [bugprone-narrowing-conversions]
   f = __builtin_inf(); // double infinity is not within-range of float.
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from constant 'double' to 'float' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from constant 'double' to 'float' [bugprone-narrowing-conversions]
   f = __builtin_nan("0"); // double NaN is not narrowing.
 }
 
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-narrowinginteger-option.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-narrowinginteger-option.cpp
similarity index 59%
rename from clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-narrowinginteger-option.cpp
rename to clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-narrowinginteger-option.cpp
index f58de65f04232..fce90ecf0881d 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-narrowinginteger-option.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-narrowinginteger-option.cpp
@@ -1,23 +1,23 @@
 // RUN: %check_clang_tidy -check-suffix=DEFAULT %s \
-// RUN: cppcoreguidelines-narrowing-conversions %t -- \
-// RUN: -config='{CheckOptions: {cppcoreguidelines-narrowing-conversions.WarnOnIntegerNarrowingConversion: true}}'
+// RUN: bugprone-narrowing-conversions %t -- \
+// RUN: -config='{CheckOptions: {bugprone-narrowing-conversions.WarnOnIntegerNarrowingConversion: true}}'
 
 // RUN: %check_clang_tidy -check-suffix=DISABLED %s \
-// RUN: cppcoreguidelines-narrowing-conversions %t -- \
-// RUN: -config='{CheckOptions: {cppcoreguidelines-narrowing-conversions.WarnOnIntegerNarrowingConversion: false}}'
+// RUN: bugprone-narrowing-conversions %t -- \
+// RUN: -config='{CheckOptions: {bugprone-narrowing-conversions.WarnOnIntegerNarrowingConversion: false}}'
 
 void foo(unsigned long long value) {
   int a = value;
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:11: warning: narrowing conversion from 'unsigned long long' to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:11: warning: narrowing conversion from 'unsigned long long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
   // DISABLED: No warning for integer narrowing conversions when WarnOnIntegerNarrowingConversion = false.
   long long b = value;
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:17: warning: narrowing conversion from 'unsigned long long' to signed type 'long long' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:17: warning: narrowing conversion from 'unsigned long long' to signed type 'long long' is implementation-defined [bugprone-narrowing-conversions]
   // DISABLED: No warning for integer narrowing conversions when WarnOnIntegerNarrowingConversion = false.
 }
 
 void casting_float_to_bool_is_still_operational_when_integer_narrowing_is_disabled(float f) {
   if (f) {
-    // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'float' to 'bool' [cppcoreguidelines-narrowing-conversions]
-    // CHECK-MESSAGES-DISABLED: :[[@LINE-2]]:7: warning: narrowing conversion from 'float' to 'bool' [cppcoreguidelines-narrowing-conversions]
+    // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'float' to 'bool' [bugprone-narrowing-conversions]
+    // CHECK-MESSAGES-DISABLED: :[[@LINE-2]]:7: warning: narrowing conversion from 'float' to 'bool' [bugprone-narrowing-conversions]
   }
 }
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-narrowingintegertofloatingpoint-option.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-narrowingintegertofloatingpoint-option.cpp
new file mode 100644
index 0000000000000..704d24dbb973d
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-narrowingintegertofloatingpoint-option.cpp
@@ -0,0 +1,19 @@
+// RUN: %check_clang_tidy -check-suffix=DEFAULT %s \
+// RUN: bugprone-narrowing-conversions %t -- \
+// RUN: -config='{CheckOptions: {bugprone-narrowing-conversions.WarnOnIntegerToFloatingPointNarrowingConversion: true}}'
+
+// RUN: %check_clang_tidy -check-suffix=DISABLED %s \
+// RUN: bugprone-narrowing-conversions %t -- \
+// RUN: -config='{CheckOptions: {bugprone-narrowing-conversions.WarnOnIntegerToFloatingPointNarrowingConversion: false}}'
+
+void foo(unsigned long long value) {
+  double a = value;
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:14: warning: narrowing conversion from 'unsigned long long' to 'double' [bugprone-narrowing-conversions]
+  // DISABLED: No warning for integer to floating-point narrowing conversions when WarnOnIntegerToFloatingPointNarrowingConversion = false.
+}
+
+void floating_point_to_integer_is_still_not_ok(double f) {
+  int a = f;
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:11: warning: narrowing conversion from 'double' to 'int' [bugprone-narrowing-conversions]
+  // CHECK-MESSAGES-DISABLED: :[[@LINE-2]]:11: warning: narrowing conversion from 'double' to 'int' [bugprone-narrowing-conversions]
+}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-pedanticmode-option.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-pedanticmode-option.cpp
similarity index 52%
rename from clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-pedanticmode-option.cpp
rename to clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-pedanticmode-option.cpp
index eb1a5a67ee118..d2e2eada96c4b 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-pedanticmode-option.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-pedanticmode-option.cpp
@@ -1,22 +1,22 @@
-// RUN: %check_clang_tidy %s cppcoreguidelines-narrowing-conversions %t \
+// RUN: %check_clang_tidy %s bugprone-narrowing-conversions %t \
 // RUN: -config="{CheckOptions: { \
-// RUN:   cppcoreguidelines-narrowing-conversions.PedanticMode: true}}" \
+// RUN:   bugprone-narrowing-conversions.PedanticMode: true}}" \
 // RUN: -- -target x86_64-unknown-linux -fsigned-char
 
 namespace floats {
 
 void triggers_wrong_constant_type_warning(double d) {
   int i = 0.0;
-  // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: constant value should be of type of type 'int' instead of 'double' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: constant value should be of type of type 'int' instead of 'double' [bugprone-narrowing-conversions]
   i += 2.0;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: constant value should be of type of type 'int' instead of 'double' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: constant value should be of type of type 'int' instead of 'double' [bugprone-narrowing-conversions]
   i += 2.0f;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: constant value should be of type of type 'int' instead of 'float' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: constant value should be of type of type 'int' instead of 'float' [bugprone-narrowing-conversions]
 }
 
 void triggers_narrowing_warning_when_overflowing() {
   unsigned short us = 65537.0;
-  // CHECK-MESSAGES: :[[@LINE-1]]:23: warning: narrowing conversion from constant 'double' to 'unsigned short' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:23: warning: narrowing conversion from constant 'double' to 'unsigned short' [bugprone-narrowing-conversions]
 }
 
 } // namespace floats
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-unsigned-char.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-unsigned-char.cpp
similarity index 80%
rename from clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-unsigned-char.cpp
rename to clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-unsigned-char.cpp
index 6bd437f98d44c..6a544b46b65d0 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-unsigned-char.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-unsigned-char.cpp
@@ -1,4 +1,4 @@
-// RUN: %check_clang_tidy %s cppcoreguidelines-narrowing-conversions %t \
+// RUN: %check_clang_tidy %s bugprone-narrowing-conversions %t \
 // RUN: -- -- -target x86_64-unknown-linux -funsigned-char
 
 void narrow_integer_to_unsigned_integer_is_ok() {
@@ -42,24 +42,24 @@ void narrow_integer_to_signed_integer_is_not_ok() {
 
   sc = sc;
   sc = s;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'short' to signed type 'signed char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'short' to signed type 'signed char' is implementation-defined [bugprone-narrowing-conversions]
   sc = i;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'int' to signed type 'signed char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'int' to signed type 'signed char' is implementation-defined [bugprone-narrowing-conversions]
   sc = l;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'long' to signed type 'signed char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'long' to signed type 'signed char' is implementation-defined [bugprone-narrowing-conversions]
   sc = ll;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'long long' to signed type 'signed char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'long long' to signed type 'signed char' is implementation-defined [bugprone-narrowing-conversions]
 
   sc = c;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'char' to signed type 'signed char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'char' to signed type 'signed char' is implementation-defined [bugprone-narrowing-conversions]
   sc = us;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'unsigned short' to signed type 'signed char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'unsigned short' to signed type 'signed char' is implementation-defined [bugprone-narrowing-conversions]
   sc = ui;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'unsigned int' to signed type 'signed char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'unsigned int' to signed type 'signed char' is implementation-defined [bugprone-narrowing-conversions]
   sc = ul;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'unsigned long' to signed type 'signed char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'unsigned long' to signed type 'signed char' is implementation-defined [bugprone-narrowing-conversions]
   sc = ull;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'unsigned long long' to signed type 'signed char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'unsigned long long' to signed type 'signed char' is implementation-defined [bugprone-narrowing-conversions]
 }
 
 void narrow_constant_to_unsigned_integer_is_ok() {
@@ -72,7 +72,7 @@ void narrow_constant_to_unsigned_integer_is_ok() {
   unsigned char uc3 = -1;  // unsigned dst type is well defined.
   unsigned char uc4 = 256; // unsigned dst type is well defined.
   signed char sc = 128;
-  // CHECK-MESSAGES: :[[@LINE-1]]:20: warning: narrowing conversion from constant value 128 (0x00000080) of type 'int' to signed type 'signed char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:20: warning: narrowing conversion from constant value 128 (0x00000080) of type 'int' to signed type 'signed char' is implementation-defined [bugprone-narrowing-conversions]
 }
 
 void narrow_conditional_operator_contant_to_unsigned_is_ok(bool b) {
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions.cpp
similarity index 77%
rename from clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions.cpp
rename to clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions.cpp
index 29b38e74e1a22..39875264bd1e6 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions.cpp
@@ -1,6 +1,6 @@
-// RUN: %check_clang_tidy %s cppcoreguidelines-narrowing-conversions %t \
+// RUN: %check_clang_tidy %s bugprone-narrowing-conversions %t \
 // RUN: -config="{CheckOptions: { \
-// RUN:   cppcoreguidelines-narrowing-conversions.WarnOnFloatingPointNarrowingConversion: false}}" \
+// RUN:   bugprone-narrowing-conversions.WarnOnFloatingPointNarrowingConversion: false}}" \
 // RUN: -- -target x86_64-unknown-linux -fsigned-char
 
 float ceil(float);
@@ -20,27 +20,27 @@ float operator"" _float(unsigned long long);
 void narrow_fp_to_int_not_ok(double d) {
   int i = 0;
   i = d;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'double' to 'int' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'double' to 'int' [bugprone-narrowing-conversions]
   i = 0.5f;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from constant 'float' to 'int' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from constant 'float' to 'int' [bugprone-narrowing-conversions]
   i = static_cast(d);
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'float' to 'int' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'float' to 'int' [bugprone-narrowing-conversions]
   i = ConvertsToFloat();
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'float' to 'int' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'float' to 'int' [bugprone-narrowing-conversions]
   i = 15_float;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'float' to 'int' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'float' to 'int' [bugprone-narrowing-conversions]
   i += d;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'double' to 'int' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'double' to 'int' [bugprone-narrowing-conversions]
   i += 0.5;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'double' to 'int' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'double' to 'int' [bugprone-narrowing-conversions]
   i += 0.5f;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'float' to 'int' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'float' to 'int' [bugprone-narrowing-conversions]
   i *= 0.5f;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'float' to 'int' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'float' to 'int' [bugprone-narrowing-conversions]
   i /= 0.5f;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'float' to 'int' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'float' to 'int' [bugprone-narrowing-conversions]
   i += (double)0.5f;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'double' to 'int' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'double' to 'int' [bugprone-narrowing-conversions]
   i += 2.0;
   i += 2.0f;
 }
@@ -84,29 +84,29 @@ void narrow_double_to_float_not_ok_binary_ops(double d) {
 
 void narrow_fp_constant_to_bool_not_ok() {
   bool b1 = 1.0;
-  // CHECK-MESSAGES: :[[@LINE-1]]:13: warning: narrowing conversion from constant 'double' to 'bool' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:13: warning: narrowing conversion from constant 'double' to 'bool' [bugprone-narrowing-conversions]
   bool b2 = 1.0f;
-  // CHECK-MESSAGES: :[[@LINE-1]]:13: warning: narrowing conversion from constant 'float' to 'bool' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:13: warning: narrowing conversion from constant 'float' to 'bool' [bugprone-narrowing-conversions]
 }
 
 void narrow_integer_to_floating() {
   {
     long long ll; // 64 bits
     float f = ll; // doesn't fit in 24 bits
-    // CHECK-MESSAGES: :[[@LINE-1]]:15: warning: narrowing conversion from 'long long' to 'float' [cppcoreguidelines-narrowing-conversions]
+    // CHECK-MESSAGES: :[[@LINE-1]]:15: warning: narrowing conversion from 'long long' to 'float' [bugprone-narrowing-conversions]
     double d = ll; // doesn't fit in 53 bits.
-    // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: narrowing conversion from 'long long' to 'double' [cppcoreguidelines-narrowing-conversions]
+    // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: narrowing conversion from 'long long' to 'double' [bugprone-narrowing-conversions]
   }
   {
     int i;       // 32 bits
     float f = i; // doesn't fit in 24 bits
-    // CHECK-MESSAGES: :[[@LINE-1]]:15: warning: narrowing conversion from 'int' to 'float' [cppcoreguidelines-narrowing-conversions]
+    // CHECK-MESSAGES: :[[@LINE-1]]:15: warning: narrowing conversion from 'int' to 'float' [bugprone-narrowing-conversions]
     double d = i; // fits in 53 bits.
   }
   {
     short n1, n2;
     float f = n1 + n2; // 'n1 + n2' is of type 'int' because of integer rules
-    // CHECK-MESSAGES: :[[@LINE-1]]:15: warning: narrowing conversion from 'int' to 'float' [cppcoreguidelines-narrowing-conversions]
+    // CHECK-MESSAGES: :[[@LINE-1]]:15: warning: narrowing conversion from 'int' to 'float' [bugprone-narrowing-conversions]
   }
   {
     short s;      // 16 bits
@@ -156,41 +156,41 @@ void narrow_integer_to_signed_integer_is_not_ok() {
 
   c = c;
   c = s;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'short' to signed type 'char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'short' to signed type 'char' is implementation-defined [bugprone-narrowing-conversions]
   c = i;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'int' to signed type 'char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'int' to signed type 'char' is implementation-defined [bugprone-narrowing-conversions]
   c = l;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'long' to signed type 'char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'long' to signed type 'char' is implementation-defined [bugprone-narrowing-conversions]
   c = ll;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'long long' to signed type 'char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'long long' to signed type 'char' is implementation-defined [bugprone-narrowing-conversions]
 
   c = uc;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'unsigned char' to signed type 'char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'unsigned char' to signed type 'char' is implementation-defined [bugprone-narrowing-conversions]
   c = us;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'unsigned short' to signed type 'char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'unsigned short' to signed type 'char' is implementation-defined [bugprone-narrowing-conversions]
   c = ui;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'unsigned int' to signed type 'char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'unsigned int' to signed type 'char' is implementation-defined [bugprone-narrowing-conversions]
   c = ul;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'unsigned long' to signed type 'char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'unsigned long' to signed type 'char' is implementation-defined [bugprone-narrowing-conversions]
   c = ull;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'unsigned long long' to signed type 'char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'unsigned long long' to signed type 'char' is implementation-defined [bugprone-narrowing-conversions]
 
   i = c;
   i = s;
   i = i;
   i = l;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'long' to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
   i = ll;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'long long' to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'long long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
 
   i = uc;
   i = us;
   i = ui;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
   i = ul;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'unsigned long' to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'unsigned long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
   i = ull;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'unsigned long long' to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'unsigned long long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
 
   ll = c;
   ll = s;
@@ -202,9 +202,9 @@ void narrow_integer_to_signed_integer_is_not_ok() {
   ll = us;
   ll = ui;
   ll = ul;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'unsigned long' to signed type 'long long' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'unsigned long' to signed type 'long long' is implementation-defined [bugprone-narrowing-conversions]
   ll = ull;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'unsigned long long' to signed type 'long long' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'unsigned long long' to signed type 'long long' is implementation-defined [bugprone-narrowing-conversions]
 }
 
 void narrow_constant_to_unsigned_integer_is_ok() {
@@ -222,16 +222,16 @@ void narrow_constant_to_signed_integer_is_not_ok() {
   char c1 = -128;
   char c2 = 127;
   char c3 = -129;
-  // CHECK-MESSAGES: :[[@LINE-1]]:13: warning: narrowing conversion from constant value -129 (0xFFFFFF7F) of type 'int' to signed type 'char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:13: warning: narrowing conversion from constant value -129 (0xFFFFFF7F) of type 'int' to signed type 'char' is implementation-defined [bugprone-narrowing-conversions]
   char c4 = 128;
-  // CHECK-MESSAGES: :[[@LINE-1]]:13: warning: narrowing conversion from constant value 128 (0x00000080) of type 'int' to signed type 'char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:13: warning: narrowing conversion from constant value 128 (0x00000080) of type 'int' to signed type 'char' is implementation-defined [bugprone-narrowing-conversions]
 
   short s1 = -32768;
   short s2 = 32767;
   short s3 = -32769;
-  // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: narrowing conversion from constant value -32769 (0xFFFF7FFF) of type 'int' to signed type 'short' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: narrowing conversion from constant value -32769 (0xFFFF7FFF) of type 'int' to signed type 'short' is implementation-defined [bugprone-narrowing-conversions]
   short s4 = 32768;
-  // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: narrowing conversion from constant value 32768 (0x00008000) of type 'int' to signed type 'short' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: narrowing conversion from constant value 32768 (0x00008000) of type 'int' to signed type 'short' is implementation-defined [bugprone-narrowing-conversions]
 }
 
 void narrow_conditional_operator_contant_to_unsigned_is_ok(bool b) {
@@ -244,22 +244,22 @@ void narrow_conditional_operator_contant_to_unsigned_is_ok(bool b) {
 void narrow_conditional_operator_contant_to_signed_is_not_ok(bool b) {
   char uc1 = b ? 1 : 0;
   char uc2 = b ? 1 : 128;
-  // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: narrowing conversion from constant value 128 (0x00000080) of type 'int' to signed type 'char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: narrowing conversion from constant value 128 (0x00000080) of type 'int' to signed type 'char' is implementation-defined [bugprone-narrowing-conversions]
   char uc3 = b ? -129 : 0;
-  // CHECK-MESSAGES: :[[@LINE-1]]:18: warning: narrowing conversion from constant value -129 (0xFFFFFF7F) of type 'int' to signed type 'char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:18: warning: narrowing conversion from constant value -129 (0xFFFFFF7F) of type 'int' to signed type 'char' is implementation-defined [bugprone-narrowing-conversions]
   unsigned long long ysize;
   long long mirror = b ? -1 : ysize - 1;
-  // CHECK-MESSAGES: :[[@LINE-1]]:26: warning: narrowing conversion from constant value 18446744073709551615 (0xFFFFFFFFFFFFFFFF) of type 'unsigned long long' to signed type 'long long' is implementation-defined [cppcoreguidelines-narrowing-conversions]
-  // CHECK-MESSAGES: :[[@LINE-2]]:37: warning: narrowing conversion from 'unsigned long long' to signed type 'long long' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:26: warning: narrowing conversion from constant value 18446744073709551615 (0xFFFFFFFFFFFFFFFF) of type 'unsigned long long' to signed type 'long long' is implementation-defined [bugprone-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-2]]:37: warning: narrowing conversion from 'unsigned long long' to signed type 'long long' is implementation-defined [bugprone-narrowing-conversions]
 }
 
 void narrow_constant_to_floating_point() {
   float f_ok = 1ULL << 24;              // fits in 24 bits mantissa.
   float f_not_ok = (1ULL << 24) + 1ULL; // doesn't fit in 24 bits mantissa.
-  // CHECK-MESSAGES: :[[@LINE-1]]:20: warning: narrowing conversion from constant value 16777217 of type 'unsigned long long' to 'float' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:20: warning: narrowing conversion from constant value 16777217 of type 'unsigned long long' to 'float' [bugprone-narrowing-conversions]
   double d_ok = 1ULL << 53;              // fits in 53 bits mantissa.
   double d_not_ok = (1ULL << 53) + 1ULL; // doesn't fit in 53 bits mantissa.
-  // CHECK-MESSAGES: :[[@LINE-1]]:21: warning: narrowing conversion from constant value 9007199254740993 of type 'unsigned long long' to 'double' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:21: warning: narrowing conversion from constant value 9007199254740993 of type 'unsigned long long' to 'double' [bugprone-narrowing-conversions]
 }
 
 void casting_integer_to_bool_is_ok() {
@@ -275,13 +275,13 @@ void casting_integer_to_bool_is_ok() {
 void casting_float_to_bool_is_not_ok() {
   float f;
   while (f) {
-    // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: narrowing conversion from 'float' to 'bool' [cppcoreguidelines-narrowing-conversions]
+    // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: narrowing conversion from 'float' to 'bool' [bugprone-narrowing-conversions]
   }
   for (; f;) {
-    // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: narrowing conversion from 'float' to 'bool' [cppcoreguidelines-narrowing-conversions]
+    // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: narrowing conversion from 'float' to 'bool' [bugprone-narrowing-conversions]
   }
   if (f) {
-    // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'float' to 'bool' [cppcoreguidelines-narrowing-conversions]
+    // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'float' to 'bool' [bugprone-narrowing-conversions]
   }
 }
 
@@ -352,7 +352,7 @@ void typedef_context() {
   i64 = i;   // Okay, no narrowing.
 
   i = i64;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'myint64_t' (aka 'long long') to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'myint64_t' (aka 'long long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
 }
 
 } // namespace floats
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-narrowingintegertofloatingpoint-option.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-narrowingintegertofloatingpoint-option.cpp
deleted file mode 100644
index 35ca61b6a9a8c..0000000000000
--- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-narrowingintegertofloatingpoint-option.cpp
+++ /dev/null
@@ -1,19 +0,0 @@
-// RUN: %check_clang_tidy -check-suffix=DEFAULT %s \
-// RUN: cppcoreguidelines-narrowing-conversions %t -- \
-// RUN: -config='{CheckOptions: {cppcoreguidelines-narrowing-conversions.WarnOnIntegerToFloatingPointNarrowingConversion: true}}'
-
-// RUN: %check_clang_tidy -check-suffix=DISABLED %s \
-// RUN: cppcoreguidelines-narrowing-conversions %t -- \
-// RUN: -config='{CheckOptions: {cppcoreguidelines-narrowing-conversions.WarnOnIntegerToFloatingPointNarrowingConversion: false}}'
-
-void foo(unsigned long long value) {
-  double a = value;
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:14: warning: narrowing conversion from 'unsigned long long' to 'double' [cppcoreguidelines-narrowing-conversions]
-  // DISABLED: No warning for integer to floating-point narrowing conversions when WarnOnIntegerToFloatingPointNarrowingConversion = false.
-}
-
-void floating_point_to_integer_is_still_not_ok(double f) {
-  int a = f;
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:11: warning: narrowing conversion from 'double' to 'int' [cppcoreguidelines-narrowing-conversions]
-  // CHECK-MESSAGES-DISABLED: :[[@LINE-2]]:11: warning: narrowing conversion from 'double' to 'int' [cppcoreguidelines-narrowing-conversions]
-}

From 412e30b2274a134c01c2140ac7c7579be70f0896 Mon Sep 17 00:00:00 2001
From: Matthias Springer 
Date: Sun, 29 Dec 2024 12:35:33 +0100
Subject: [PATCH 149/567] [mlir][Transforms] Dialect Conversion: Add 1:N op
 replacement test case (#121271)

This commit adds a test case that performs two back-to-back 1:N
replacements: `(i16) -> (i16, i16) -> ((i16, i16), (i16, i16))`. For the
moment, 3 argument materializations are inserted. In the future (when
the conversion value mapping supports 1:N), a single target
materialization will be inserted. Addresses a
[comment](https://github.com/llvm/llvm-project/pull/116524#discussion_r1894629711)
in #116524.
---
 mlir/test/Transforms/test-legalizer.mlir    | 24 +++++++++-
 mlir/test/lib/Dialect/Test/TestPatterns.cpp | 51 +++++++++++++++++++--
 2 files changed, 70 insertions(+), 5 deletions(-)

diff --git a/mlir/test/Transforms/test-legalizer.mlir b/mlir/test/Transforms/test-legalizer.mlir
index 2ca5f49637523..297eb5acef21b 100644
--- a/mlir/test/Transforms/test-legalizer.mlir
+++ b/mlir/test/Transforms/test-legalizer.mlir
@@ -450,7 +450,7 @@ func.func @fold_legalization() -> i32 {
 // -----
 
 // CHECK-LABEL: func @convert_detached_signature()
-//       CHECK:   "test.legal_op_with_region"() ({
+//       CHECK:   "test.legal_op"() ({
 //       CHECK:   ^bb0(%arg0: f64):
 //       CHECK:     "test.return"() : () -> ()
 //       CHECK:   }) : () -> ()
@@ -483,3 +483,25 @@ func.func @test_1_to_n_block_signature_conversion() {
   "test.return"() : () -> ()
 }
 
+// -----
+
+// CHECK: notifyOperationInserted: test.step_1
+// CHECK: notifyOperationReplaced: test.multiple_1_to_n_replacement
+// CHECK: notifyOperationErased: test.multiple_1_to_n_replacement
+// CHECK: notifyOperationInserted: test.legal_op
+// CHECK: notifyOperationReplaced: test.step_1
+// CHECK: notifyOperationErased: test.step_1
+
+// CHECK-LABEL: func @test_multiple_1_to_n_replacement()
+//       CHECK:   %[[legal_op:.*]]:4 = "test.legal_op"() : () -> (f16, f16, f16, f16)
+// TODO: There should be a single cast (i.e., a single target materialization).
+// This is currently not possible due to 1:N limitations of the conversion
+// mapping. Instead, we have 3 argument materializations.
+//       CHECK:   %[[cast1:.*]] = "test.cast"(%[[legal_op]]#2, %[[legal_op]]#3) : (f16, f16) -> f16
+//       CHECK:   %[[cast2:.*]] = "test.cast"(%[[legal_op]]#0, %[[legal_op]]#1) : (f16, f16) -> f16
+//       CHECK:   %[[cast3:.*]] = "test.cast"(%[[cast2]], %[[cast1]]) : (f16, f16) -> f16
+//       CHECK:   "test.valid"(%[[cast3]]) : (f16) -> ()
+func.func @test_multiple_1_to_n_replacement() {
+  %0 = "test.multiple_1_to_n_replacement"() : () -> (f16)
+  "test.invalid"(%0) : (f16) -> ()
+}
diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp
index a470497fdbb56..826c222990be4 100644
--- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp
+++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp
@@ -785,7 +785,7 @@ struct TestDetachedSignatureConversion : public ConversionPattern {
                   ConversionPatternRewriter &rewriter) const final {
     if (op->getNumRegions() != 1)
       return failure();
-    OperationState state(op->getLoc(), "test.legal_op_with_region", operands,
+    OperationState state(op->getLoc(), "test.legal_op", operands,
                          op->getResultTypes(), {}, BlockRange());
     Region *newRegion = state.addRegion();
     rewriter.inlineRegionBefore(op->getRegion(0), *newRegion,
@@ -1234,6 +1234,49 @@ class TestRepetitive1ToNConsumer : public ConversionPattern {
   }
 };
 
+/// A pattern that tests two back-to-back 1 -> 2 op replacements.
+class TestMultiple1ToNReplacement : public ConversionPattern {
+public:
+  TestMultiple1ToNReplacement(MLIRContext *ctx, const TypeConverter &converter)
+      : ConversionPattern(converter, "test.multiple_1_to_n_replacement", 1,
+                          ctx) {}
+  LogicalResult
+  matchAndRewrite(Operation *op, ArrayRef operands,
+                  ConversionPatternRewriter &rewriter) const final {
+    // Helper function that replaces the given op with a new op of the given
+    // name and doubles each result (1 -> 2 replacement of each result).
+    auto replaceWithDoubleResults = [&](Operation *op, StringRef name) {
+      SmallVector types;
+      for (Type t : op->getResultTypes()) {
+        types.push_back(t);
+        types.push_back(t);
+      }
+      OperationState state(op->getLoc(), name,
+                           /*operands=*/{}, types, op->getAttrs());
+      auto *newOp = rewriter.create(state);
+      SmallVector repls;
+      for (size_t i = 0, e = op->getNumResults(); i < e; ++i)
+        repls.push_back(newOp->getResults().slice(2 * i, 2));
+      rewriter.replaceOpWithMultiple(op, repls);
+      return newOp;
+    };
+
+    // Replace test.multiple_1_to_n_replacement with test.step_1.
+    Operation *repl1 = replaceWithDoubleResults(op, "test.step_1");
+    // Now replace test.step_1 with test.legal_op.
+    // TODO: Ideally, it should not be necessary to reset the insertion point
+    // here. Based on the API calls, it looks like test.step_1 is entirely
+    // erased. But that's not the case: an argument materialization will
+    // survive. And that argument materialization will be used by the users of
+    // `op`. If we don't reset the insertion point here, we get dominance
+    // errors. This will be fixed when we have 1:N support in the conversion
+    // value mapping.
+    rewriter.setInsertionPoint(repl1);
+    replaceWithDoubleResults(repl1, "test.legal_op");
+    return success();
+  }
+};
+
 } // namespace
 
 namespace {
@@ -1319,7 +1362,8 @@ struct TestLegalizePatternDriver
              TestUndoPropertiesModification, TestEraseOp,
              TestRepetitive1ToNConsumer>(&getContext());
     patterns.add(&getContext(), converter);
+                 TestPassthroughInvalidOp, TestMultiple1ToNReplacement>(
+        &getContext(), converter);
     patterns.add(converter, &getContext());
     mlir::populateAnyFunctionOpInterfaceTypeConversionPattern(patterns,
                                                               converter);
@@ -1330,8 +1374,7 @@ struct TestLegalizePatternDriver
     target.addLegalOp();
     target.addLegalOp();
-    target.addLegalOp(
-        OperationName("test.legal_op_with_region", &getContext()));
+    target.addLegalOp(OperationName("test.legal_op", &getContext()));
     target
         .addIllegalOp();
     target.addDynamicallyLegalOp([](TestReturnOp op) {

From b34ed25dd5c74abcb46872cbaab34b91e27a0bda Mon Sep 17 00:00:00 2001
From: Congcong Cai 
Date: Sun, 29 Dec 2024 19:51:55 +0800
Subject: [PATCH 150/567] [clang-tidy][NFC] add link libs for bugprone module

Fixes a build issue introduced in #120245
---
 clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt          | 1 -
 clang-tools-extra/clang-tidy/cppcoreguidelines/CMakeLists.txt | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt
index 73ab22381631c..8bd5646c5fe05 100644
--- a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt
+++ b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt
@@ -96,7 +96,6 @@ add_clang_library(clangTidyBugproneModule STATIC
 
   LINK_LIBS
   clangTidy
-  clangTidyCppCoreGuidelinesModule
   clangTidyUtils
 
   DEPENDS
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/CMakeLists.txt b/clang-tools-extra/clang-tidy/cppcoreguidelines/CMakeLists.txt
index 1f4107c0b35e7..b023f76a25432 100644
--- a/clang-tools-extra/clang-tidy/cppcoreguidelines/CMakeLists.txt
+++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/CMakeLists.txt
@@ -37,6 +37,7 @@ add_clang_library(clangTidyCppCoreGuidelinesModule STATIC
 
   LINK_LIBS
   clangTidy
+  clangTidyBugproneModule
   clangTidyMiscModule
   clangTidyModernizeModule
   clangTidyPerformanceModule

From 7144325109353f9153780f93ec5a8fee8fdc0927 Mon Sep 17 00:00:00 2001
From: Jacek Caban 
Date: Sun, 29 Dec 2024 12:55:10 +0100
Subject: [PATCH 151/567] [LLD][COFF] Prepare both load configs on ARM64X
 (#120326)

---
 lld/COFF/Writer.cpp               | 34 +++++++++++++++++--------------
 lld/test/COFF/arm64x-loadconfig.s |  2 ++
 2 files changed, 21 insertions(+), 15 deletions(-)

diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp
index e6b239c83dd4a..5946c2944aa21 100644
--- a/lld/COFF/Writer.cpp
+++ b/lld/COFF/Writer.cpp
@@ -282,7 +282,8 @@ class Writer {
   uint32_t getSizeOfInitializedData();
 
   void prepareLoadConfig();
-  template  void prepareLoadConfig(T *loadConfig);
+  template 
+  void prepareLoadConfig(SymbolTable &symtab, T *loadConfig);
 
   std::unique_ptr &buffer;
   std::map partialSections;
@@ -2637,22 +2638,25 @@ void Writer::fixTlsAlignment() {
 }
 
 void Writer::prepareLoadConfig() {
-  if (!ctx.symtab.loadConfigSym)
-    return;
+  ctx.forEachSymtab([&](SymbolTable &symtab) {
+    if (!symtab.loadConfigSym)
+      return;
 
-  OutputSection *sec =
-      ctx.getOutputSection(ctx.symtab.loadConfigSym->getChunk());
-  uint8_t *secBuf = buffer->getBufferStart() + sec->getFileOff();
-  uint8_t *symBuf =
-      secBuf + (ctx.symtab.loadConfigSym->getRVA() - sec->getRVA());
+    OutputSection *sec = ctx.getOutputSection(symtab.loadConfigSym->getChunk());
+    uint8_t *secBuf = buffer->getBufferStart() + sec->getFileOff();
+    uint8_t *symBuf = secBuf + (symtab.loadConfigSym->getRVA() - sec->getRVA());
 
-  if (ctx.config.is64())
-    prepareLoadConfig(reinterpret_cast(symBuf));
-  else
-    prepareLoadConfig(reinterpret_cast(symBuf));
+    if (ctx.config.is64())
+      prepareLoadConfig(symtab,
+                        reinterpret_cast(symBuf));
+    else
+      prepareLoadConfig(symtab,
+                        reinterpret_cast(symBuf));
+  });
 }
 
-template  void Writer::prepareLoadConfig(T *loadConfig) {
+template 
+void Writer::prepareLoadConfig(SymbolTable &symtab, T *loadConfig) {
   size_t loadConfigSize = loadConfig->Size;
 
 #define RETURN_IF_NOT_CONTAINS(field)                                          \
@@ -2665,12 +2669,12 @@ template  void Writer::prepareLoadConfig(T *loadConfig) {
   if (loadConfigSize >= offsetof(T, field) + sizeof(T::field))
 
 #define CHECK_VA(field, sym)                                                   \
-  if (auto *s = dyn_cast(ctx.symtab.findUnderscore(sym)))    \
+  if (auto *s = dyn_cast(symtab.findUnderscore(sym)))        \
     if (loadConfig->field != ctx.config.imageBase + s->getRVA())               \
       Warn(ctx) << #field " not set correctly in '_load_config_used'";
 
 #define CHECK_ABSOLUTE(field, sym)                                             \
-  if (auto *s = dyn_cast(ctx.symtab.findUnderscore(sym)))     \
+  if (auto *s = dyn_cast(symtab.findUnderscore(sym)))         \
     if (loadConfig->field != s->getVA())                                       \
       Warn(ctx) << #field " not set correctly in '_load_config_used'";
 
diff --git a/lld/test/COFF/arm64x-loadconfig.s b/lld/test/COFF/arm64x-loadconfig.s
index 0d4fe0ed6d6e0..6023828a2746f 100644
--- a/lld/test/COFF/arm64x-loadconfig.s
+++ b/lld/test/COFF/arm64x-loadconfig.s
@@ -4,6 +4,7 @@
 // RUN: llvm-mc -filetype=obj -triple=aarch64-windows test.s -o test.obj
 // RUN: llvm-mc -filetype=obj -triple=aarch64-windows loadconfig.s -o loadconfig.obj
 // RUN: llvm-mc -filetype=obj -triple=aarch64-windows loadconfig-short.s -o loadconfig-short.obj
+// RUN: llvm-mc -filetype=obj -triple=arm64ec-windows loadconfig-short.s -o loadconfig-short-arm64ec.obj
 
 // RUN: lld-link -machine:arm64x -out:out.dll -dll -noentry loadconfig.obj test.obj
 
@@ -43,6 +44,7 @@
 // HEADERS-NEXT: VirtualSize: 0x38
 
 // RUN: lld-link -machine:arm64x -out:out-short.dll -dll -noentry loadconfig-short.obj 2>&1 | FileCheck --check-prefix=WARN-RELOC-SIZE %s
+// RUN: lld-link -machine:arm64x -out:out-short.dll -dll -noentry loadconfig-short-arm64ec.obj 2>&1 | FileCheck --check-prefix=WARN-RELOC-SIZE %s
 // WARN-RELOC-SIZE: lld-link: warning: '_load_config_used' structure too small to include dynamic relocations
 
 #--- test.s

From db7123fbbc530587941ea3c78666103233282120 Mon Sep 17 00:00:00 2001
From: Jacek Caban 
Date: Sun, 29 Dec 2024 14:04:00 +0100
Subject: [PATCH 152/567] [LLD][COFF] Use EC symbol table for CHPE metadata
 (#120328)

Copy CHPE metadata pointer from EC load config to native configuration.
---
 lld/COFF/Driver.cpp                     |   2 +-
 lld/COFF/Writer.cpp                     |  60 +++++++++----
 lld/test/COFF/Inputs/loadconfig-arm64.s |  15 ++++
 lld/test/COFF/arm64ec-codemap.test      |   2 +-
 lld/test/COFF/arm64ec-entry-thunk.s     |   2 +-
 lld/test/COFF/arm64ec-lib.test          |   2 +-
 lld/test/COFF/arm64ec-range-thunks.s    |   2 +-
 lld/test/COFF/arm64x-loadconfig.s       | 115 +++++++++++++++++++++++-
 8 files changed, 174 insertions(+), 26 deletions(-)
 create mode 100644 lld/test/COFF/Inputs/loadconfig-arm64.s

diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp
index ae5b095fba772..be01ee41c9a2f 100644
--- a/lld/COFF/Driver.cpp
+++ b/lld/COFF/Driver.cpp
@@ -2548,7 +2548,7 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) {
     symtab.addAbsolute(mangle("__guard_eh_cont_count"), 0);
     symtab.addAbsolute(mangle("__guard_eh_cont_table"), 0);
 
-    if (isArm64EC(ctx.config.machine)) {
+    if (symtab.isEC()) {
       symtab.addAbsolute("__arm64x_extra_rfe_table", 0);
       symtab.addAbsolute("__arm64x_extra_rfe_table_size", 0);
       symtab.addAbsolute("__arm64x_redirection_metadata", 0);
diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp
index 5946c2944aa21..b3dd5f6cf4926 100644
--- a/lld/COFF/Writer.cpp
+++ b/lld/COFF/Writer.cpp
@@ -575,7 +575,7 @@ bool Writer::createThunks(OutputSection *os, int margin) {
 
 // Create a code map for CHPE metadata.
 void Writer::createECCodeMap() {
-  if (!isArm64EC(ctx.config.machine))
+  if (!ctx.symtabEC)
     return;
 
   // Clear the map in case we were're recomputing the map after adding
@@ -611,7 +611,8 @@ void Writer::createECCodeMap() {
 
   closeRange();
 
-  Symbol *tableCountSym = ctx.symtab.findUnderscore("__hybrid_code_map_count");
+  Symbol *tableCountSym =
+      ctx.symtabEC->findUnderscore("__hybrid_code_map_count");
   cast(tableCountSym)->setVA(codeMap.size());
 }
 
@@ -1229,8 +1230,7 @@ void Writer::createMiscChunks() {
   // Create /guard:cf tables if requested.
   createGuardCFTables();
 
-  if (isArm64EC(config->machine))
-    createECChunks();
+  createECChunks();
 
   if (config->autoImport)
     createRuntimePseudoRelocs();
@@ -2158,7 +2158,11 @@ void Writer::maybeAddRVATable(SymbolRVASet tableSymbols, StringRef tableSym,
 
 // Create CHPE metadata chunks.
 void Writer::createECChunks() {
-  for (Symbol *s : ctx.symtab.expSymbols) {
+  SymbolTable *symtab = ctx.symtabEC;
+  if (!symtab)
+    return;
+
+  for (Symbol *s : symtab->expSymbols) {
     auto sym = dyn_cast(s);
     if (!sym || !sym->getChunk())
       continue;
@@ -2177,9 +2181,9 @@ void Writer::createECChunks() {
       // we should use the #foo$hp_target symbol as the redirection target.
       // First, try to look up the $hp_target symbol. If it can't be found,
       // assume it's a regular function and look for #foo instead.
-      Symbol *targetSym = ctx.symtab.find((targetName + "$hp_target").str());
+      Symbol *targetSym = symtab->find((targetName + "$hp_target").str());
       if (!targetSym)
-        targetSym = ctx.symtab.find(targetName);
+        targetSym = symtab->find(targetName);
       Defined *t = dyn_cast_or_null(targetSym);
       if (t && isArm64EC(t->getChunk()->getMachine()))
         exportThunks.push_back({chunk, t});
@@ -2188,20 +2192,20 @@ void Writer::createECChunks() {
 
   auto codeMapChunk = make(codeMap);
   rdataSec->addChunk(codeMapChunk);
-  Symbol *codeMapSym = ctx.symtab.findUnderscore("__hybrid_code_map");
+  Symbol *codeMapSym = symtab->findUnderscore("__hybrid_code_map");
   replaceSymbol(codeMapSym, codeMapSym->getName(),
                                   codeMapChunk);
 
   CHPECodeRangesChunk *ranges = make(exportThunks);
   rdataSec->addChunk(ranges);
   Symbol *rangesSym =
-      ctx.symtab.findUnderscore("__x64_code_ranges_to_entry_points");
+      symtab->findUnderscore("__x64_code_ranges_to_entry_points");
   replaceSymbol(rangesSym, rangesSym->getName(), ranges);
 
   CHPERedirectionChunk *entryPoints = make(exportThunks);
   a64xrmSec->addChunk(entryPoints);
   Symbol *entryPointsSym =
-      ctx.symtab.findUnderscore("__arm64x_redirection_metadata");
+      symtab->findUnderscore("__arm64x_redirection_metadata");
   replaceSymbol(entryPointsSym, entryPointsSym->getName(),
                                   entryPoints);
 }
@@ -2294,7 +2298,8 @@ void Writer::setSectionPermissions() {
 
 // Set symbols used by ARM64EC metadata.
 void Writer::setECSymbols() {
-  if (!isArm64EC(ctx.config.machine))
+  SymbolTable *symtab = ctx.symtabEC;
+  if (!symtab)
     return;
 
   llvm::stable_sort(exportThunks, [](const std::pair &a,
@@ -2302,45 +2307,45 @@ void Writer::setECSymbols() {
     return a.first->getRVA() < b.first->getRVA();
   });
 
-  Symbol *rfeTableSym = ctx.symtab.findUnderscore("__arm64x_extra_rfe_table");
+  Symbol *rfeTableSym = symtab->findUnderscore("__arm64x_extra_rfe_table");
   replaceSymbol(rfeTableSym, "__arm64x_extra_rfe_table",
                                   pdata.first);
 
   if (pdata.first) {
     Symbol *rfeSizeSym =
-        ctx.symtab.findUnderscore("__arm64x_extra_rfe_table_size");
+        symtab->findUnderscore("__arm64x_extra_rfe_table_size");
     cast(rfeSizeSym)
         ->setVA(pdata.last->getRVA() + pdata.last->getSize() -
                 pdata.first->getRVA());
   }
 
   Symbol *rangesCountSym =
-      ctx.symtab.findUnderscore("__x64_code_ranges_to_entry_points_count");
+      symtab->findUnderscore("__x64_code_ranges_to_entry_points_count");
   cast(rangesCountSym)->setVA(exportThunks.size());
 
   Symbol *entryPointCountSym =
-      ctx.symtab.findUnderscore("__arm64x_redirection_metadata_count");
+      symtab->findUnderscore("__arm64x_redirection_metadata_count");
   cast(entryPointCountSym)->setVA(exportThunks.size());
 
-  Symbol *iatSym = ctx.symtab.findUnderscore("__hybrid_auxiliary_iat");
+  Symbol *iatSym = symtab->findUnderscore("__hybrid_auxiliary_iat");
   replaceSymbol(iatSym, "__hybrid_auxiliary_iat",
                                   idata.auxIat.empty() ? nullptr
                                                        : idata.auxIat.front());
 
-  Symbol *iatCopySym = ctx.symtab.findUnderscore("__hybrid_auxiliary_iat_copy");
+  Symbol *iatCopySym = symtab->findUnderscore("__hybrid_auxiliary_iat_copy");
   replaceSymbol(
       iatCopySym, "__hybrid_auxiliary_iat_copy",
       idata.auxIatCopy.empty() ? nullptr : idata.auxIatCopy.front());
 
   Symbol *delayIatSym =
-      ctx.symtab.findUnderscore("__hybrid_auxiliary_delayload_iat");
+      symtab->findUnderscore("__hybrid_auxiliary_delayload_iat");
   replaceSymbol(
       delayIatSym, "__hybrid_auxiliary_delayload_iat",
       delayIdata.getAuxIat().empty() ? nullptr
                                      : delayIdata.getAuxIat().front());
 
   Symbol *delayIatCopySym =
-      ctx.symtab.findUnderscore("__hybrid_auxiliary_delayload_iat_copy");
+      symtab->findUnderscore("__hybrid_auxiliary_delayload_iat_copy");
   replaceSymbol(
       delayIatCopySym, "__hybrid_auxiliary_delayload_iat_copy",
       delayIdata.getAuxIatCopy().empty() ? nullptr
@@ -2695,6 +2700,23 @@ void Writer::prepareLoadConfig(SymbolTable &symtab, T *loadConfig) {
     }
   }
 
+  IF_CONTAINS(CHPEMetadataPointer) {
+    // On ARM64X, only the EC version of the load config contains
+    // CHPEMetadataPointer. Copy its value to the native load config.
+    if (ctx.hybridSymtab && !symtab.isEC() &&
+        ctx.hybridSymtab->loadConfigSize >=
+            offsetof(T, CHPEMetadataPointer) + sizeof(T::CHPEMetadataPointer)) {
+      OutputSection *sec =
+          ctx.getOutputSection(ctx.hybridSymtab->loadConfigSym->getChunk());
+      uint8_t *secBuf = buffer->getBufferStart() + sec->getFileOff();
+      auto hybridLoadConfig =
+          reinterpret_cast(
+              secBuf +
+              (ctx.hybridSymtab->loadConfigSym->getRVA() - sec->getRVA()));
+      loadConfig->CHPEMetadataPointer = hybridLoadConfig->CHPEMetadataPointer;
+    }
+  }
+
   if (ctx.config.guardCF == GuardCFLevel::Off)
     return;
   RETURN_IF_NOT_CONTAINS(GuardFlags)
diff --git a/lld/test/COFF/Inputs/loadconfig-arm64.s b/lld/test/COFF/Inputs/loadconfig-arm64.s
new file mode 100644
index 0000000000000..67d1a0aea50e4
--- /dev/null
+++ b/lld/test/COFF/Inputs/loadconfig-arm64.s
@@ -0,0 +1,15 @@
+        .section .rdata,"dr"
+        .globl _load_config_used
+        .p2align 3, 0
+_load_config_used:
+        .word 0x140
+        .fill 0x7c,1,0
+        .xword __guard_fids_table
+        .xword __guard_fids_count
+        .xword __guard_flags
+        .xword 0
+        .xword __guard_iat_table
+        .xword __guard_iat_count
+        .xword __guard_longjmp_table
+        .xword __guard_longjmp_count
+        .fill 0x80,1,0
diff --git a/lld/test/COFF/arm64ec-codemap.test b/lld/test/COFF/arm64ec-codemap.test
index 2d79538f0a7eb..050261117be2e 100644
--- a/lld/test/COFF/arm64ec-codemap.test
+++ b/lld/test/COFF/arm64ec-codemap.test
@@ -9,7 +9,7 @@ RUN: llvm-mc -filetype=obj -triple=arm64ec-windows data-sec2.s -o data-sec2.obj
 RUN: llvm-mc -filetype=obj -triple=arm64ec-windows empty-sec.s -o arm64ec-empty-sec.obj
 RUN: llvm-mc -filetype=obj -triple=x86_64-windows x86_64-func-sym.s -o x86_64-func-sym.obj
 RUN: llvm-mc -filetype=obj -triple=x86_64-windows empty-sec.s -o x86_64-empty-sec.obj
-RUN: llvm-mc -filetype=obj -triple=aarch64-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig-arm64.obj
+RUN: llvm-mc -filetype=obj -triple=aarch64-windows %S/Inputs/loadconfig-arm64.s -o loadconfig-arm64.obj
 RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig-arm64ec.obj
 
 Link ARM64EC DLL and verify that the code is arranged as expected.
diff --git a/lld/test/COFF/arm64ec-entry-thunk.s b/lld/test/COFF/arm64ec-entry-thunk.s
index bf5cb42755b62..b31d315eeb7a8 100644
--- a/lld/test/COFF/arm64ec-entry-thunk.s
+++ b/lld/test/COFF/arm64ec-entry-thunk.s
@@ -27,7 +27,7 @@ thunk:
     .rva func
 
 // RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %S/Inputs/loadconfig-arm64ec.s -o loadcfg.obj
-// RUN: llvm-mc -filetype=obj -triple=aarch64-windows %S/Inputs/loadconfig-arm64ec.s -o native-loadcfg.obj
+// RUN: llvm-mc -filetype=obj -triple=aarch64-windows %S/Inputs/loadconfig-arm64.s -o native-loadcfg.obj
 // RUN: llvm-mc -filetype=obj -triple=arm64ec-windows test-simple.s -o test-simple.obj
 // RUN: lld-link -machine:arm64ec -dll -noentry -out:out-simple.dll loadcfg.obj test-simple.obj
 // RUN: llvm-objdump -d out-simple.dll | FileCheck --check-prefix=DISASM %s
diff --git a/lld/test/COFF/arm64ec-lib.test b/lld/test/COFF/arm64ec-lib.test
index ea07d28f1a411..8698a5ceccbe7 100644
--- a/lld/test/COFF/arm64ec-lib.test
+++ b/lld/test/COFF/arm64ec-lib.test
@@ -11,7 +11,7 @@ RUN: llvm-mc -filetype=obj -triple=arm64ec-windows ref-alias.s -o ref-alias.obj
 RUN: llvm-mc -filetype=obj -triple=arm64ec-windows ref-thunk.s -o ref-thunk.obj
 RUN: llvm-mc -filetype=obj -triple=arm64ec-windows func.s -o func.obj
 RUN: llvm-mc -filetype=obj -triple=x86_64-windows func-x86_64.s -o func-x86_64.obj
-RUN: llvm-mc -filetype=obj -triple=aarch64-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig-arm64.obj
+RUN: llvm-mc -filetype=obj -triple=aarch64-windows %S/Inputs/loadconfig-arm64.s -o loadconfig-arm64.obj
 RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig-arm64ec.obj
 
 RUN: llvm-lib -machine:arm64ec -out:sym-arm64ec.lib sym-arm64ec.obj nsym-aarch64.obj
diff --git a/lld/test/COFF/arm64ec-range-thunks.s b/lld/test/COFF/arm64ec-range-thunks.s
index 09d9b013f97a5..dcfa6365b4e3a 100644
--- a/lld/test/COFF/arm64ec-range-thunks.s
+++ b/lld/test/COFF/arm64ec-range-thunks.s
@@ -5,7 +5,7 @@
 # RUN: llvm-mc -filetype=obj -triple=aarch64-windows native-funcs.s -o funcs-aarch64.obj
 # RUN: llvm-mc -filetype=obj -triple=x86_64-windows space.s -o space-x86_64.obj
 # RUN: llvm-mc -filetype=obj -triple=aarch64-windows space.s -o space-aarch64.obj
-# RUN: llvm-mc -filetype=obj -triple=aarch64-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig-arm64.obj
+# RUN: llvm-mc -filetype=obj -triple=aarch64-windows %S/Inputs/loadconfig-arm64.s -o loadconfig-arm64.obj
 # RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig-arm64ec.obj
 
 
diff --git a/lld/test/COFF/arm64x-loadconfig.s b/lld/test/COFF/arm64x-loadconfig.s
index 6023828a2746f..8d2ab55554634 100644
--- a/lld/test/COFF/arm64x-loadconfig.s
+++ b/lld/test/COFF/arm64x-loadconfig.s
@@ -2,13 +2,15 @@
 // RUN: split-file %s %t.dir && cd %t.dir
 
 // RUN: llvm-mc -filetype=obj -triple=aarch64-windows test.s -o test.obj
+// RUN: llvm-mc -filetype=obj -triple=arm64ec-windows chpe.s -o chpe.obj
 // RUN: llvm-mc -filetype=obj -triple=aarch64-windows loadconfig.s -o loadconfig.obj
+// RUN: llvm-mc -filetype=obj -triple=arm64ec-windows loadconfig-ec.s -o loadconfig-ec.obj
 // RUN: llvm-mc -filetype=obj -triple=aarch64-windows loadconfig-short.s -o loadconfig-short.obj
 // RUN: llvm-mc -filetype=obj -triple=arm64ec-windows loadconfig-short.s -o loadconfig-short-arm64ec.obj
 
 // RUN: lld-link -machine:arm64x -out:out.dll -dll -noentry loadconfig.obj test.obj
 
-// RUN: llvm-readobj --coff-load-config out.dll | FileCheck -check-prefix=DYNRELOCS %s
+// RUN: llvm-readobj --coff-load-config out.dll | FileCheck --check-prefix=DYNRELOCS %s
 // DYNRELOCS:      DynamicValueRelocTableOffset: 0xC
 // DYNRELOCS-NEXT: DynamicValueRelocTableSection: 4
 // DYNRELOCS:      DynamicRelocations [
@@ -35,7 +37,7 @@
 // DYNRELOCS-NEXT:   ]
 // DYNRELOCS-NEXT: ]
 
-// RUN: llvm-readobj --headers out.dll | FileCheck -check-prefix=HEADERS %s
+// RUN: llvm-readobj --headers out.dll | FileCheck --check-prefix=HEADERS %s
 // HEADERS:      BaseRelocationTableRVA: 0x4000
 // HEADERS-NEXT: BaseRelocationTableSize: 0xC
 // HEADERS:      LoadConfigTableRVA: 0x1000
@@ -47,6 +49,70 @@
 // RUN: lld-link -machine:arm64x -out:out-short.dll -dll -noentry loadconfig-short-arm64ec.obj 2>&1 | FileCheck --check-prefix=WARN-RELOC-SIZE %s
 // WARN-RELOC-SIZE: lld-link: warning: '_load_config_used' structure too small to include dynamic relocations
 
+// Check that the CHPE metadata pointer is correctly copied from the EC load config to the native load config.
+
+// RUN: lld-link -machine:arm64x -out:out-hyb.dll -dll -noentry loadconfig.obj loadconfig-ec.obj chpe.obj test.obj
+
+// RUN: llvm-readobj --coff-load-config out-hyb.dll | FileCheck --check-prefix=LOADCFG %s
+// LOADCFG:      Format: COFF-ARM64X
+// LOADCFG-NEXT: Arch: aarch64
+// LOADCFG-NEXT: AddressSize: 64bit
+// LOADCFG-NEXT: LoadConfig [
+// LOADCFG-NEXT:   Size: 0x140
+// LOADCFG:      CHPEMetadata [
+// LOADCFG-NEXT:   Version: 0x2
+// LOADCFG:        RedirectionMetadata: 12288
+// LOADCFG:        AlternateEntryPoint: 0x0
+// LOADCFG-NEXT:   AuxiliaryIAT: 0x0
+// LOADCFG-NEXT:   GetX64InformationFunctionPointer: 0x0
+// LOADCFG-NEXT:   SetX64InformationFunctionPointer: 0x0
+// LOADCFG-NEXT:   ExtraRFETable: 0x0
+// LOADCFG-NEXT:   ExtraRFETableSize: 0x0
+// LOADCFG-NEXT:   __os_arm64x_dispatch_fptr: 0x0
+// LOADCFG-NEXT:   AuxiliaryIATCopy: 0x0
+// LOADCFG-NEXT:   AuxiliaryDelayloadIAT: 0x0
+// LOADCFG-NEXT:   AuxiliaryDelayloadIATCopy: 0x0
+// LOADCFG-NEXT:   HybridImageInfoBitfield: 0x0
+// LOADCFG:      ]
+// LOADCFG-NEXT: DynamicRelocations [
+// LOADCFG-NEXT:   Version: 0x1
+// LOADCFG-NEXT:   Arm64X [
+// LOADCFG-NEXT:     Entry [
+// LOADCFG-NEXT:       RVA: 0x7C
+// LOADCFG-NEXT:       Type: VALUE
+// LOADCFG-NEXT:       Size: 0x2
+// LOADCFG-NEXT:       Value: 0x8664
+// LOADCFG-NEXT:     ]
+// LOADCFG-NEXT:     Entry [
+// LOADCFG-NEXT:       RVA: 0x150
+// LOADCFG-NEXT:       Type: VALUE
+// LOADCFG-NEXT:       Size: 0x4
+// LOADCFG-NEXT:       Value: 0x0
+// LOADCFG-NEXT:     ]
+// LOADCFG-NEXT:     Entry [
+// LOADCFG-NEXT:       RVA: 0x154
+// LOADCFG-NEXT:       Type: VALUE
+// LOADCFG-NEXT:       Size: 0x4
+// LOADCFG-NEXT:       Value: 0x0
+// LOADCFG-NEXT:     ]
+// LOADCFG-NEXT:   ]
+// LOADCFG-NEXT: ]
+// LOADCFG-NEXT: HybridObject {
+// LOADCFG-NEXT:   Format: COFF-x86-64
+// LOADCFG-NEXT:   Arch: x86_64
+// LOADCFG-NEXT:   AddressSize: 64bit
+
+// RUN: llvm-readobj --coff-basereloc out-hyb.dll | FileCheck --check-prefix=BASERELOC %s
+// BASERELOC:      BaseReloc [
+// BASERELOC-NEXT:   Entry {
+// BASERELOC-NEXT:     Type: DIR64
+// BASERELOC-NEXT:     Address: 0x1208
+// BASERELOC-NEXT:   }
+// BASERELOC-NEXT:   Entry {
+// BASERELOC:          Type: DIR64
+// BASERELOC-NEXT:     Address: 0x2074
+// BASERELOC-NEXT:   }
+
 #--- test.s
         .data
 sym:
@@ -61,6 +127,16 @@ _load_config_used:
         .word 0x140
         .fill 0x13c,1,0
 
+#--- loadconfig-ec.s
+        .section .rdata,"dr"
+        .globl _load_config_used
+        .p2align 3, 0
+_load_config_used:
+        .word 0x140
+        .fill 0xc4,1,0
+        .xword __chpe_metadata
+        .fill 0x70,1,0
+
 #--- loadconfig-short.s
         .section .rdata,"dr"
         .globl _load_config_used
@@ -68,3 +144,38 @@ _load_config_used:
 _load_config_used:
         .word 0xe4
         .fill 0xe0,1,0
+
+#--- chpe.s
+        .data
+        .globl __chpe_metadata
+        .p2align 3, 0
+__chpe_metadata:
+        .word 2
+        .rva __hybrid_code_map
+        .word __hybrid_code_map_count
+        .rva __x64_code_ranges_to_entry_points
+        .rva __arm64x_redirection_metadata
+        .word 0 // __os_arm64x_dispatch_call_no_redirect
+        .word 0 // __os_arm64x_dispatch_ret
+        .word 0 // __os_arm64x_check_call
+        .word 0 // __os_arm64x_check_icall
+        .word 0 // __os_arm64x_check_icall_cfg
+        .rva __arm64x_native_entrypoint
+        .rva __hybrid_auxiliary_iat
+        .word __x64_code_ranges_to_entry_points_count
+        .word __arm64x_redirection_metadata_count
+        .word 0 // __os_arm64x_get_x64_information
+        .word 0 // __os_arm64x_set_x64_information
+        .rva __arm64x_extra_rfe_table
+        .word __arm64x_extra_rfe_table_size
+        .word 0 // __os_arm64x_dispatch_fptr
+        .rva __hybrid_auxiliary_iat_copy
+        .rva __hybrid_auxiliary_delayload_iat
+        .rva __hybrid_auxiliary_delayload_iat_copy
+        .word __hybrid_image_info_bitfield
+        .word 0 // __os_arm64x_helper3
+        .word 0 // __os_arm64x_helper4
+        .word 0 // __os_arm64x_helper5
+        .word 0 // __os_arm64x_helper6
+        .word 0 // __os_arm64x_helper7
+        .word 0 // __os_arm64x_helper8

From 6cbc64ed922cc69bc292d394ba5c681fa309f404 Mon Sep 17 00:00:00 2001
From: Sergei Barannikov 
Date: Sun, 29 Dec 2024 16:29:55 +0300
Subject: [PATCH 153/567] [TableGen][GISel] Fix IMPLICIT_DEF operand being
 added as a use (#121283)

`IMPLICIT_DEF` has one operand that is a def, not a use.
---
 .../GlobalISelEmitter/undef-tied-input.td     | 24 +++++++++++++++++++
 llvm/utils/TableGen/GlobalISelEmitter.cpp     |  2 +-
 2 files changed, 25 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/TableGen/GlobalISelEmitter/undef-tied-input.td

diff --git a/llvm/test/TableGen/GlobalISelEmitter/undef-tied-input.td b/llvm/test/TableGen/GlobalISelEmitter/undef-tied-input.td
new file mode 100644
index 0000000000000..a2ee3dc311772
--- /dev/null
+++ b/llvm/test/TableGen/GlobalISelEmitter/undef-tied-input.td
@@ -0,0 +1,24 @@
+// RUN: llvm-tblgen -gen-global-isel -I %p/../../../include -I %p/../Common %s | FileCheck %s
+
+include "llvm/Target/Target.td"
+include "GlobalISelEmitterCommon.td"
+
+def undef_tied : OperandWithDefaultOps {
+  let MIOperandInfo = (ops GPR32:$inactive);
+}
+
+let Constraints = "$opt.inactive = $rd" in
+def I1 : I<(outs GPR32:$rd), (ins GPR32:$rs, undef_tied:$opt),
+           [(set GPR32:$rd, (abs i32:$rs))]>;
+
+// CHECK-LABEL: // (abs:{ *:[i32] } i32:{ *:[i32] }:$rs)  =>  (I1:{ *:[i32] } i32:{ *:[i32] }:$rs)
+// CHECK-NEXT: GIR_MakeTempReg, /*TempRegID*/0, /*TypeID*/GILLT_s32,
+// CHECK-NEXT: GIR_BuildMI, /*InsnID*/1, /*Opcode*/GIMT_Encode2(TargetOpcode::IMPLICIT_DEF),
+// CHECK-NEXT: GIR_AddTempRegister, /*InsnID*/1, /*TempRegID*/0, /*TempRegFlags*/GIMT_Encode2(RegState::Define),
+// CHECK-NEXT: GIR_BuildRootMI, /*Opcode*/GIMT_Encode2(MyTarget::I1),
+// CHECK-NEXT: GIR_RootToRootCopy, /*OpIdx*/0, // DstI[rd]
+// CHECK-NEXT: GIR_RootToRootCopy, /*OpIdx*/1, // rs
+// CHECK-NEXT: GIR_AddSimpleTempRegister, /*InsnID*/0, /*TempRegID*/0,
+// CHECK-NEXT: GIR_RootConstrainSelectedInstOperands,
+// CHECK-NEXT: // GIR_Coverage, 0,
+// CHECK-NEXT: GIR_EraseRootFromParent_Done,
diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp
index f0fb11625883e..092cdd4ad5b43 100644
--- a/llvm/utils/TableGen/GlobalISelEmitter.cpp
+++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp
@@ -1756,7 +1756,7 @@ Error GlobalISelEmitter::importDefaultOperandRenderers(
             &Target.getInstruction(RK.getDef("IMPLICIT_DEF")));
         BuildMIAction &IDMIBuilder =
             *static_cast(InsertPt->get());
-        IDMIBuilder.addRenderer(TempRegID);
+        IDMIBuilder.addRenderer(TempRegID, /*IsDef=*/true);
         DstMIBuilder.addRenderer(TempRegID);
       } else {
         DstMIBuilder.addRenderer(Target, Def);

From 01c8cd664a9bea23a49c863a39351949ac11a4fd Mon Sep 17 00:00:00 2001
From: David Green 
Date: Sun, 29 Dec 2024 15:56:12 +0000
Subject: [PATCH 154/567] [AArch64][GlobalISel] Full reverse shuffles.
 (#119083)

A full vector reverse shuffle needs to use EXT+REV64. This adds handling for more types than SDAG does, requiring only that the mask satisfies isReverseMask, which keeps the patterns simpler.
---
 llvm/lib/Target/AArch64/AArch64Combine.td     |  11 +-
 .../GISel/AArch64PostLegalizerLowering.cpp    |  13 ++
 llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll  |  14 +--
 .../CodeGen/AArch64/neon-reverseshuffle.ll    | 114 +++++++-----------
 4 files changed, 74 insertions(+), 78 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 1b1d81fcd07a2..ce1980697abbb 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -131,6 +131,15 @@ def ext: GICombineRule <
   (apply [{ applyEXT(*${root}, ${matchinfo}); }])
 >;
 
+def fullrev: GICombineRule <
+  (defs root:$root, shuffle_matchdata:$matchinfo),
+  (match (G_IMPLICIT_DEF $src2),
+         (G_SHUFFLE_VECTOR $src, $src1, $src2, $mask):$root,
+         [{ return ShuffleVectorInst::isReverseMask(${mask}.getShuffleMask(),
+                                                    ${mask}.getShuffleMask().size()); }]),
+  (apply [{ applyFullRev(*${root}, MRI); }])
+>;
+
 def insertelt_nonconst: GICombineRule <
   (defs root:$root, shuffle_matchdata:$matchinfo),
   (match (wip_match_opcode G_INSERT_VECTOR_ELT):$root,
@@ -163,7 +172,7 @@ def form_duplane : GICombineRule <
   (apply [{ applyDupLane(*${root}, MRI, B, ${matchinfo}); }])
 >;
 
-def shuffle_vector_lowering : GICombineGroup<[dup, rev, ext, zip, uzp, trn,
+def shuffle_vector_lowering : GICombineGroup<[dup, rev, ext, zip, uzp, trn, fullrev,
                                               form_duplane, shuf_to_ins]>;
 
 // Turn G_UNMERGE_VALUES -> G_EXTRACT_VECTOR_ELT's
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
index 5fe2e3cefa112..6bba70d45a61d 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
@@ -405,6 +405,19 @@ void applyEXT(MachineInstr &MI, ShuffleVectorPseudo &MatchInfo) {
   MI.eraseFromParent();
 }
 
+void applyFullRev(MachineInstr &MI, MachineRegisterInfo &MRI) {
+  Register Dst = MI.getOperand(0).getReg();
+  Register Src = MI.getOperand(1).getReg();
+  LLT DstTy = MRI.getType(Dst);
+  assert(DstTy.getSizeInBits() == 128 &&
+         "Expected 128bit vector in applyFullRev");
+  MachineIRBuilder MIRBuilder(MI);
+  auto Cst = MIRBuilder.buildConstant(LLT::scalar(32), 8);
+  auto Rev = MIRBuilder.buildInstr(AArch64::G_REV64, {DstTy}, {Src});
+  MIRBuilder.buildInstr(AArch64::G_EXT, {Dst}, {Rev, Rev, Cst});
+  MI.eraseFromParent();
+}
+
 bool matchNonConstInsert(MachineInstr &MI, MachineRegisterInfo &MRI) {
   assert(MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT);
 
diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
index ee9fff7ceebc6..f0c9dccb21d84 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
@@ -440,11 +440,10 @@ define <8 x i16> @shufsext_v8i8_v8i16(<8 x i8> %src, <8 x i8> %b) {
 ;
 ; CHECK-GI-LABEL: shufsext_v8i8_v8i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    adrp x8, .LCPI14_0
-; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
 ; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI14_0]
-; CHECK-GI-NEXT:    tbl v0.16b, { v2.16b, v3.16b }, v0.16b
+; CHECK-GI-NEXT:    rev64 v0.8h, v0.8h
+; CHECK-GI-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
 ; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v1.8h
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -493,11 +492,10 @@ define <8 x i16> @shufzext_v8i8_v8i16(<8 x i8> %src, <8 x i8> %b) {
 ;
 ; CHECK-GI-LABEL: shufzext_v8i8_v8i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    adrp x8, .LCPI16_0
-; CHECK-GI-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI16_0]
-; CHECK-GI-NEXT:    tbl v0.16b, { v2.16b, v3.16b }, v0.16b
+; CHECK-GI-NEXT:    rev64 v0.8h, v0.8h
+; CHECK-GI-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
 ; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v1.8h
 ; CHECK-GI-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/neon-reverseshuffle.ll b/llvm/test/CodeGen/AArch64/neon-reverseshuffle.ll
index abdfb996fa166..db5b93282e9c4 100644
--- a/llvm/test/CodeGen/AArch64/neon-reverseshuffle.ll
+++ b/llvm/test/CodeGen/AArch64/neon-reverseshuffle.ll
@@ -23,19 +23,11 @@ entry:
 }
 
 define <4 x i32> @v4i32(<4 x i32> %a) {
-; CHECK-SD-LABEL: v4i32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    rev64 v0.4s, v0.4s
-; CHECK-SD-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: v4i32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    adrp x8, .LCPI2_0
-; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 def $q0_q1
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI2_0]
-; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: v4i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    rev64 v0.4s, v0.4s
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ret
 entry:
   %V128 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> 
   ret <4 x i32> %V128
@@ -52,19 +44,11 @@ entry:
 }
 
 define <8 x i16> @v8i16(<8 x i16> %a) {
-; CHECK-SD-LABEL: v8i16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    rev64 v0.8h, v0.8h
-; CHECK-SD-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: v8i16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    adrp x8, .LCPI4_0
-; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 def $q0_q1
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI4_0]
-; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: v8i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    rev64 v0.8h, v0.8h
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ret
 entry:
   %V128 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> 
   ret <8 x i16> %V128
@@ -93,6 +77,22 @@ entry:
   ret <8 x i16> %V128
 }
 
+define <4 x i16> @v8i16_3(<8 x i16> %a) {
+; CHECK-SD-LABEL: v8i16_3:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    rev64 v0.4h, v0.4h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v8i16_3:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    rev64 v0.8h, v0.8h
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+entry:
+  %V128 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> 
+  ret <4 x i16> %V128
+}
+
 define <4 x i16> @v4i16(<4 x i16> %a) {
 ; CHECK-LABEL: v4i16:
 ; CHECK:       // %bb.0: // %entry
@@ -104,19 +104,11 @@ entry:
 }
 
 define <16 x i8> @v16i8(<16 x i8> %a) {
-; CHECK-SD-LABEL: v16i8:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    rev64 v0.16b, v0.16b
-; CHECK-SD-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: v16i8:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    adrp x8, .LCPI7_0
-; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 def $q0_q1
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI7_0]
-; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: v16i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    rev64 v0.16b, v0.16b
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ret
 entry:
   %V128 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> 
   ret <16 x i8> %V128
@@ -125,18 +117,18 @@ entry:
 define <16 x i8> @v16i8_2(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK-SD-LABEL: v16i8_2:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    adrp x8, .LCPI8_0
+; CHECK-SD-NEXT:    adrp x8, .LCPI9_0
 ; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
-; CHECK-SD-NEXT:    ldr q2, [x8, :lo12:.LCPI8_0]
+; CHECK-SD-NEXT:    ldr q2, [x8, :lo12:.LCPI9_0]
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
 ; CHECK-SD-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: v16i8_2:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    adrp x8, .LCPI8_0
+; CHECK-GI-NEXT:    adrp x8, .LCPI9_0
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI8_0]
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI9_0]
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
 ; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-GI-NEXT:    ret
@@ -166,19 +158,11 @@ entry:
 }
 
 define <4 x float> @v4f32(<4 x float> %a) {
-; CHECK-SD-LABEL: v4f32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    rev64 v0.4s, v0.4s
-; CHECK-SD-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: v4f32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    adrp x8, .LCPI11_0
-; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 def $q0_q1
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI11_0]
-; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: v4f32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    rev64 v0.4s, v0.4s
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ret
 entry:
   %V128 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> 
   ret <4 x float> %V128
@@ -195,19 +179,11 @@ entry:
 }
 
 define <8 x half> @v8f16(<8 x half> %a) {
-; CHECK-SD-LABEL: v8f16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    rev64 v0.8h, v0.8h
-; CHECK-SD-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: v8f16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    adrp x8, .LCPI13_0
-; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 def $q0_q1
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI13_0]
-; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: v8f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    rev64 v0.8h, v0.8h
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ret
 entry:
   %V128 = shufflevector <8 x half> %a, <8 x half> undef, <8 x i32> 
   ret <8 x half> %V128

From 08e2c15a287df132ca2186f2d56669219a7ed8a1 Mon Sep 17 00:00:00 2001
From: Maksim Levental 
Date: Sun, 29 Dec 2024 09:13:10 -0800
Subject: [PATCH 155/567] [mlir][python] disable nanobind leak warnings
 (#121099)

---
 mlir/lib/Bindings/Python/IRCore.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp
index 86afa956398ae..05c000bfd8ca0 100644
--- a/mlir/lib/Bindings/Python/IRCore.cpp
+++ b/mlir/lib/Bindings/Python/IRCore.cpp
@@ -2587,6 +2587,8 @@ class PyOpAttributeMap {
 //------------------------------------------------------------------------------
 
 void mlir::python::populateIRCore(nb::module_ &m) {
+  // disable leak warnings which tend to be false positives.
+  nb::set_leak_warnings(false);
   //----------------------------------------------------------------------------
   // Enums.
   //----------------------------------------------------------------------------

From 8487d2460e8cf80c7c3b240cf46969eeeb4ed18d Mon Sep 17 00:00:00 2001
From: Maksim Levental 
Date: Sun, 29 Dec 2024 09:13:46 -0800
Subject: [PATCH 156/567] [mlir][shape] DCE unimplemented extra decl (#121275)

---
 mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td b/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td
index 08a0398e74b0c..8bccba426ab12 100644
--- a/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td
+++ b/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td
@@ -321,11 +321,6 @@ def Shape_DimOp : Shape_Op<"dim",
   let assemblyFormat = "$value `,` $index attr-dict `:` type($value) `,`"
                        "type($index) `->` type($extent)";
 
-  let builders = [
-    // Builder that allows passing a constant dimension as a simple integer.
-    OpBuilder<(ins "Value":$value, "int64_t":$index)>
-  ];
-
   let extraClassDeclaration = [{
     /// Get the `index` value as integer if it is constant.
     std::optional getConstantIndex();

From c3ef6d469d89accd152c216ed8644783fb221c90 Mon Sep 17 00:00:00 2001
From: Fangrui Song 
Date: Sun, 29 Dec 2024 10:55:07 -0800
Subject: [PATCH 157/567] Move two LLVM_DEBUG banners after skippers

so that they don't show in -debug output when they are not run.
---
 llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp             | 4 ++--
 llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp b/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp
index 687acd90b405c..8437422843147 100644
--- a/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp
+++ b/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp
@@ -106,8 +106,6 @@ bool StackMapLiveness::runOnMachineFunction(MachineFunction &MF) {
   if (!EnablePatchPointLiveness)
     return false;
 
-  LLVM_DEBUG(dbgs() << "********** COMPUTING STACKMAP LIVENESS: "
-                    << MF.getName() << " **********\n");
   TRI = MF.getSubtarget().getRegisterInfo();
   ++NumStackMapFuncVisited;
 
@@ -121,6 +119,8 @@ bool StackMapLiveness::runOnMachineFunction(MachineFunction &MF) {
 
 /// Performs the actual liveness calculation for the function.
 bool StackMapLiveness::calculateLiveness(MachineFunction &MF) {
+  LLVM_DEBUG(dbgs() << "********** COMPUTING STACKMAP LIVENESS: "
+                    << MF.getName() << " **********\n");
   bool HasChanged = false;
   // For all basic blocks in the function.
   for (auto &MBB : MF) {
diff --git a/llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp b/llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp
index 3b370d8c3eb15..64728a20bd076 100644
--- a/llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp
+++ b/llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp
@@ -57,8 +57,6 @@ char X86LoadValueInjectionRetHardeningPass::ID = 0;
 
 bool X86LoadValueInjectionRetHardeningPass::runOnMachineFunction(
     MachineFunction &MF) {
-  LLVM_DEBUG(dbgs() << "***** " << getPassName() << " : " << MF.getName()
-                    << " *****\n");
   const X86Subtarget *Subtarget = &MF.getSubtarget();
   if (!Subtarget->useLVIControlFlowIntegrity() || !Subtarget->is64Bit())
     return false; // FIXME: support 32-bit
@@ -68,6 +66,8 @@ bool X86LoadValueInjectionRetHardeningPass::runOnMachineFunction(
   if (!F.hasOptNone() && skipFunction(F))
     return false;
 
+  LLVM_DEBUG(dbgs() << "***** " << getPassName() << " : " << MF.getName()
+                    << " *****\n");
   ++NumFunctionsConsidered;
   const X86RegisterInfo *TRI = Subtarget->getRegisterInfo();
   const X86InstrInfo *TII = Subtarget->getInstrInfo();

From 7f3428d3ed71d87a2088b77b6cab9f3d86544234 Mon Sep 17 00:00:00 2001
From: Florian Hahn 
Date: Sun, 29 Dec 2024 19:05:08 +0000
Subject: [PATCH 158/567] [VPlan] Compute induction end values in VPlan.
 (#112145)

Use createDerivedIV to compute IV end values directly in VPlan, instead
of creating them up-front.

This allows updating IV users outside the loop as follow-up.

Depends on https://github.com/llvm/llvm-project/pull/110004 and
https://github.com/llvm/llvm-project/pull/109975.

PR: https://github.com/llvm/llvm-project/pull/112145
---
 .../Vectorize/LoopVectorizationPlanner.h      |   9 +-
 .../Transforms/Vectorize/LoopVectorize.cpp    | 332 ++++++++++--------
 .../AArch64/clamped-trip-count.ll             |   4 +-
 .../AArch64/conditional-branches-cost.ll      |   4 +-
 .../epilog-vectorization-widen-inductions.ll  |   4 +-
 .../AArch64/fixed-order-recurrence.ll         |   4 +-
 .../AArch64/force-target-instruction-cost.ll  |   6 +-
 .../AArch64/induction-costs-sve.ll            |   2 +-
 .../LoopVectorize/AArch64/induction-costs.ll  |   4 +-
 .../LoopVectorize/AArch64/intrinsiccost.ll    |  12 +-
 .../AArch64/low_trip_count_predicates.ll      |  10 +-
 .../AArch64/mul-simplification.ll             |   2 +-
 .../AArch64/reduction-recurrence-costs-sve.ll |   8 +-
 .../AArch64/scalable-avoid-scalarization.ll   |   2 +-
 .../AArch64/scalable-strict-fadd.ll           |  12 +-
 .../LoopVectorize/AArch64/sve-epilog-vect.ll  |   2 +-
 .../AArch64/sve-inductions-unusual-types.ll   |   4 +-
 .../AArch64/sve-interleaved-accesses.ll       |  16 +-
 .../AArch64/sve-live-out-pointer-induction.ll |   4 +-
 .../AArch64/sve-tail-folding-forced.ll        |   2 +-
 .../LoopVectorize/AArch64/sve-tail-folding.ll |   2 +-
 .../LoopVectorize/AArch64/sve-widen-gep.ll    |   9 +-
 .../LoopVectorize/AArch64/sve-widen-phi.ll    |  12 +-
 .../AArch64/sve2-histcnt-vplan.ll             |   6 +-
 .../AArch64/synthesize-mask-for-call.ll       |  14 +-
 .../widen-call-with-intrinsic-or-libfunc.ll   |   6 +-
 .../LoopVectorize/PowerPC/exit-branch-cost.ll |  54 +--
 .../RISCV/blend-any-of-reduction-cost.ll      |   2 +-
 .../RISCV/blocks-with-dead-instructions.ll    |  14 +-
 .../LoopVectorize/RISCV/dead-ops-cost.ll      |  32 +-
 .../first-order-recurrence-scalable-vf1.ll    |   2 +-
 .../LoopVectorize/RISCV/induction-costs.ll    |   8 +-
 .../RISCV/masked_gather_scatter.ll            |   4 +-
 .../RISCV/riscv-vector-reverse.ll             |  34 +-
 .../LoopVectorize/RISCV/strided-accesses.ll   |  16 +-
 ...-force-tail-with-evl-reverse-load-store.ll |   8 +-
 ...orize-force-tail-with-evl-uniform-store.ll |   2 +-
 .../RISCV/vplan-vp-intrinsics-reduction.ll    |  12 +-
 .../predicated-first-order-recurrence.ll      |   4 +-
 .../LoopVectorize/X86/conversion-cost.ll      |   2 +-
 .../LoopVectorize/X86/cost-model.ll           |   8 +-
 .../X86/epilog-vectorization-inductions.ll    |   8 +-
 .../X86/fixed-order-recurrence.ll             |   4 +-
 .../LoopVectorize/X86/float-induction-x86.ll  |   4 +-
 .../LoopVectorize/X86/gather_scatter.ll       |  14 +-
 .../LoopVectorize/X86/induction-costs.ll      |   2 +-
 .../LoopVectorize/X86/intrinsiccost.ll        |  12 +-
 .../Transforms/LoopVectorize/X86/optsize.ll   |   4 +-
 .../X86/pr109581-unused-blend.ll              |   2 +-
 .../Transforms/LoopVectorize/X86/pr72969.ll   |   4 +-
 .../LoopVectorize/X86/scatter_crash.ll        |  72 ++--
 .../LoopVectorize/X86/small-size.ll           |  72 ++--
 .../LoopVectorize/branch-weights.ll           |   2 +-
 .../epilog-vectorization-any-of-reductions.ll |   8 +-
 .../first-order-recurrence-chains-vplan.ll    |  12 +-
 .../first-order-recurrence-chains.ll          |   2 +-
 .../first-order-recurrence-complex.ll         |  34 +-
 ...-order-recurrence-sink-replicate-region.ll |  19 +-
 .../LoopVectorize/first-order-recurrence.ll   |  72 ++--
 .../Transforms/LoopVectorize/induction.ll     |  52 +--
 .../interleave-and-scalarize-only.ll          |  11 +-
 .../LoopVectorize/interleaved-accesses.ll     |   4 +-
 .../invariant-store-vectorization.ll          |   2 +-
 .../iv-select-cmp-nested-loop.ll              |   6 +-
 .../optimal-epilog-vectorization.ll           |   4 +-
 .../pr59319-loop-access-info-invalidation.ll  |  34 +-
 llvm/test/Transforms/LoopVectorize/pr66616.ll |   2 +-
 .../LoopVectorize/reduction-align.ll          |   8 +-
 .../LoopVectorize/reduction-inloop-cond.ll    |   8 +-
 .../LoopVectorize/scalable-inductions.ll      |   8 +-
 .../LoopVectorize/select-reduction.ll         |   4 +-
 ...e-reduction-results-in-tail-folded-loop.ll |   2 +-
 .../uncountable-early-exit-vplan.ll           |   9 +-
 .../LoopVectorize/vplan-iv-transforms.ll      |   3 +-
 .../LoopVectorize/vplan-predicate-switch.ll   |   4 +-
 .../vplan-printing-before-execute.ll          |  16 +-
 .../LoopVectorize/vplan-printing.ll           |  47 ++-
 .../vplan-sink-scalars-and-merge-vf1.ll       |   7 +-
 .../vplan-sink-scalars-and-merge.ll           |   9 +-
 .../PhaseOrdering/ARM/arm_mult_q15.ll         |  14 +-
 80 files changed, 695 insertions(+), 600 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 650a4859780da..26a2de8c80977 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -231,12 +231,15 @@ class VPBuilder {
         new VPInstruction(Ptr, Offset, GEPNoWrapFlags::inBounds(), DL, Name));
   }
 
+  /// Convert the input value \p Current to the corresponding value of an
+  /// induction with \p Start and \p Step values, using \p Start + \p Current *
+  /// \p Step.
   VPDerivedIVRecipe *createDerivedIV(InductionDescriptor::InductionKind Kind,
                                      FPMathOperator *FPBinOp, VPValue *Start,
-                                     VPCanonicalIVPHIRecipe *CanonicalIV,
-                                     VPValue *Step, const Twine &Name = "") {
+                                     VPValue *Current, VPValue *Step,
+                                     const Twine &Name = "") {
     return tryInsertInstruction(
-        new VPDerivedIVRecipe(Kind, FPBinOp, Start, CanonicalIV, Step, Name));
+        new VPDerivedIVRecipe(Kind, FPBinOp, Start, Current, Step, Name));
   }
 
   VPScalarCastRecipe *createScalarCast(Instruction::CastOps Opcode, VPValue *Op,
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index af6fce4b15190..1975df3cacbca 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -517,22 +517,6 @@ class InnerLoopVectorizer {
   /// Fix the non-induction PHIs in \p Plan.
   void fixNonInductionPHIs(VPTransformState &State);
 
-  /// Create a ResumePHI VPInstruction for the induction \p InductionPhiIRI to
-  /// resume iteration count in the scalar epilogue from where the vectorized
-  /// loop left off, and add it to the scalar preheader of VPlan. Also creates
-  /// the induction resume value, and the value for the bypass block, if needed.
-  /// \p Step is the SCEV-expanded induction step to use. In cases where the
-  /// loop skeleton is more complicated (i.e., epilogue vectorization) and the
-  /// resume values can come from an additional bypass block,
-  /// \p MainVectorTripCount provides the trip count of the main vector loop,
-  /// used to compute the resume value reaching the scalar loop preheader
-  /// directly from this additional bypass block.
-  void createInductionResumeVPValue(VPIRInstruction *InductionPhiIRI,
-                                    const InductionDescriptor &ID, Value *Step,
-                                    ArrayRef BypassBlocks,
-                                    VPBuilder &ScalarPHBuilder,
-                                    Value *MainVectorTripCount = nullptr);
-
   /// Returns the original loop trip count.
   Value *getTripCount() const { return TripCount; }
 
@@ -588,17 +572,10 @@ class InnerLoopVectorizer {
   /// vector loop preheader, middle block and scalar preheader.
   void createVectorLoopSkeleton(StringRef Prefix);
 
-  /// Create new phi nodes for the induction variables to resume iteration count
-  /// in the scalar epilogue, from where the vectorized loop left off.
-  /// In cases where the loop skeleton is more complicated (i.e. epilogue
-  /// vectorization), \p MainVectorTripCount provides the trip count of the main
-  /// loop, used to compute these resume values. If \p IVSubset is provided, it
-  /// contains the phi nodes for which resume values are needed, because they
-  /// will generate wide induction phis in the epilogue loop.
-  void
-  createInductionResumeVPValues(const SCEV2ValueTy &ExpandedSCEVs,
-                                Value *MainVectorTripCount = nullptr,
-                                SmallPtrSetImpl *IVSubset = nullptr);
+  /// Create and record the values for induction variables to resume coming from
+  /// the additional bypass block.
+  void createInductionAdditionalBypassValues(const SCEV2ValueTy &ExpandedSCEVs,
+                                             Value *MainVectorTripCount);
 
   /// Allow subclasses to override and print debug traces before/after vplan
   /// execution, when trace information is requested.
@@ -2641,61 +2618,6 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
                  nullptr, Twine(Prefix) + "scalar.ph");
 }
 
-void InnerLoopVectorizer::createInductionResumeVPValue(
-    VPIRInstruction *InductionPhiRI, const InductionDescriptor &II, Value *Step,
-    ArrayRef BypassBlocks, VPBuilder &ScalarPHBuilder,
-    Value *MainVectorTripCount) {
-  // TODO: Move to LVP or general VPlan construction, once no IR values are
-  // generated.
-  auto *OrigPhi = cast(&InductionPhiRI->getInstruction());
-  Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
-  assert(VectorTripCount && "Expected valid arguments");
-
-  Instruction *OldInduction = Legal->getPrimaryInduction();
-  // For the primary induction the end values are known.
-  Value *EndValue = VectorTripCount;
-  Value *EndValueFromAdditionalBypass = MainVectorTripCount;
-  // Otherwise compute them accordingly.
-  if (OrigPhi != OldInduction) {
-    IRBuilder<> B(LoopVectorPreHeader->getTerminator());
-
-    // Fast-math-flags propagate from the original induction instruction.
-    if (isa_and_nonnull(II.getInductionBinOp()))
-      B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
-
-    EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(),
-                                    Step, II.getKind(), II.getInductionBinOp());
-    EndValue->setName("ind.end");
-
-    // Compute the end value for the additional bypass (if applicable).
-    if (MainVectorTripCount) {
-      B.SetInsertPoint(getAdditionalBypassBlock(),
-                       getAdditionalBypassBlock()->getFirstInsertionPt());
-      EndValueFromAdditionalBypass =
-          emitTransformedIndex(B, MainVectorTripCount, II.getStartValue(), Step,
-                               II.getKind(), II.getInductionBinOp());
-      EndValueFromAdditionalBypass->setName("ind.end");
-    }
-  }
-
-  auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp(
-      VPInstruction::ResumePhi,
-      {Plan.getOrAddLiveIn(EndValue), Plan.getOrAddLiveIn(II.getStartValue())},
-      OrigPhi->getDebugLoc(), "bc.resume.val");
-  assert(InductionPhiRI->getNumOperands() == 0 &&
-         "InductionPhiRI should not have any operands");
-  InductionPhiRI->addOperand(ResumePhiRecipe);
-
-  if (EndValueFromAdditionalBypass) {
-    // Store the bypass value here, as it needs to be added as operand to its
-    // scalar preheader phi node after the epilogue skeleton has been created.
-    // TODO: Directly add as extra operand to the VPResumePHI recipe.
-    assert(!Induction2AdditionalBypassValue.contains(OrigPhi) &&
-           "entry for OrigPhi already exits");
-    Induction2AdditionalBypassValue[OrigPhi] = EndValueFromAdditionalBypass;
-  }
-}
-
 /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
 /// expansion results.
 static Value *getExpandedStep(const InductionDescriptor &ID,
@@ -2733,46 +2655,40 @@ static void addFullyUnrolledInstructionsToIgnore(
   }
 }
 
-void InnerLoopVectorizer::createInductionResumeVPValues(
-    const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount,
-    SmallPtrSetImpl *IVSubset) {
-  // We are going to resume the execution of the scalar loop.
-  // Go over all of the induction variable PHIs of the scalar loop header and
-  // fix their starting values, which depend on the counter of the last
-  // iteration of the vectorized loop. If we come from one of the
-  // LoopBypassBlocks then we need to start from the original start value.
-  // Otherwise we provide the trip count from the main vector loop.
-  VPBasicBlock *ScalarPHVPBB = Plan.getScalarPreheader();
-  VPBuilder ScalarPHBuilder(ScalarPHVPBB, ScalarPHVPBB->begin());
-  bool HasCanonical = false;
-  for (VPRecipeBase &R : *Plan.getScalarHeader()) {
-    auto *PhiR = cast(&R);
-    auto *Phi = dyn_cast(&PhiR->getInstruction());
-    if (!Phi)
-      break;
-    if (!Legal->getInductionVars().contains(Phi) ||
-        (IVSubset && !IVSubset->contains(Phi)))
-      continue;
-    const InductionDescriptor &II = Legal->getInductionVars().find(Phi)->second;
-    createInductionResumeVPValue(PhiR, II, getExpandedStep(II, ExpandedSCEVs),
-                                 LoopBypassBlocks, ScalarPHBuilder,
-                                 MainVectorTripCount);
-    auto *ConstStart = dyn_cast(II.getStartValue());
-    auto *ConstStep = II.getConstIntStepValue();
-    if (Phi->getType() == VectorTripCount->getType() && ConstStart &&
-        ConstStart->isZero() && ConstStep && ConstStep->isOne())
-      HasCanonical = true;
-  }
-
-  if (!IVSubset || HasCanonical)
-    return;
-  // When vectorizing the epilogue, create a resume phi for the canonical IV if
-  // no suitable resume phi was already created.
-  ScalarPHBuilder.createNaryOp(
-      VPInstruction::ResumePhi,
-      {&Plan.getVectorTripCount(),
-       Plan.getOrAddLiveIn(ConstantInt::get(VectorTripCount->getType(), 0))},
-      {}, "vec.epilog.resume.val");
+void InnerLoopVectorizer::createInductionAdditionalBypassValues(
+    const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount) {
+  assert(MainVectorTripCount && "Must have bypass information");
+
+  Instruction *OldInduction = Legal->getPrimaryInduction();
+  IRBuilder<> BypassBuilder(getAdditionalBypassBlock(),
+                            getAdditionalBypassBlock()->getFirstInsertionPt());
+  for (const auto &InductionEntry : Legal->getInductionVars()) {
+    PHINode *OrigPhi = InductionEntry.first;
+    const InductionDescriptor &II = InductionEntry.second;
+    Value *Step = getExpandedStep(II, ExpandedSCEVs);
+    // For the primary induction the additional bypass end value is known.
+    // Otherwise it is computed.
+    Value *EndValueFromAdditionalBypass = MainVectorTripCount;
+    if (OrigPhi != OldInduction) {
+      auto *BinOp = II.getInductionBinOp();
+      // Fast-math-flags propagate from the original induction instruction.
+      if (isa_and_nonnull(BinOp))
+        BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags());
+
+      // Compute the end value for the additional bypass.
+      EndValueFromAdditionalBypass =
+          emitTransformedIndex(BypassBuilder, MainVectorTripCount,
+                               II.getStartValue(), Step, II.getKind(), BinOp);
+      EndValueFromAdditionalBypass->setName("ind.end");
+    }
+
+    // Store the bypass value here, as it needs to be added as operand to its
+    // scalar preheader phi node after the epilogue skeleton has been created.
+    // TODO: Directly add as extra operand to the VPResumePHI recipe.
+    assert(!Induction2AdditionalBypassValue.contains(OrigPhi) &&
+           "entry for OrigPhi already exits");
+    Induction2AdditionalBypassValue[OrigPhi] = EndValueFromAdditionalBypass;
+  }
 }
 
 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton(
@@ -2832,9 +2748,6 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton(
   // faster.
   emitMemRuntimeChecks(LoopScalarPreHeader);
 
-  // Emit phis for the new starting index of the scalar loop.
-  createInductionResumeVPValues(ExpandedSCEVs);
-
   return LoopVectorPreHeader;
 }
 
@@ -7968,17 +7881,6 @@ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
   // Generate the induction variable.
   EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
 
-  // Generate VPValues and ResumePhi recipes for wide inductions in the epilogue
-  // plan only. Other inductions only need a resume value for the canonical
-  // induction, which will get created during epilogue skeleton construction.
-  SmallPtrSet WideIVs;
-  for (VPRecipeBase &H :
-       EPI.EpiloguePlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
-    if (auto *WideIV = dyn_cast(&H))
-      WideIVs.insert(WideIV->getPHINode());
-  }
-  createInductionResumeVPValues(ExpandedSCEVs, nullptr, &WideIVs);
-
   return LoopVectorPreHeader;
 }
 
@@ -8128,14 +8030,11 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
       Phi->removeIncomingValue(EPI.MemSafetyCheck);
   }
 
-  // Generate induction resume values. These variables save the new starting
-  // indexes for the scalar loop. They are used to test if there are any tail
-  // iterations left once the vector loop has completed.
-  // Note that when the vectorized epilogue is skipped due to iteration count
-  // check, then the resume value for the induction variable comes from
-  // the trip count of the main vector loop, passed as the second argument.
-  createInductionResumeVPValues(ExpandedSCEVs, EPI.VectorTripCount);
-
+  // Generate bypass values from the additional bypass block. Note that when the
+  // vectorized epilogue is skipped due to iteration count check, then the
+  // resume value for the induction variable comes from the trip count of the
+  // main vector loop, passed as the second argument.
+  createInductionAdditionalBypassValues(ExpandedSCEVs, EPI.VectorTripCount);
   return LoopVectorPreHeader;
 }
 
@@ -8955,14 +8854,55 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
                        {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
 }
 
-/// Create resume phis in the scalar preheader for first-order recurrences and
-/// reductions and update the VPIRInstructions wrapping the original phis in the
-/// scalar header.
+/// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
+/// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
+/// the end value of the induction.
+static VPValue *addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV,
+                                               VPBuilder &VectorPHBuilder,
+                                               VPBuilder &ScalarPHBuilder,
+                                               VPTypeAnalysis &TypeInfo,
+                                               VPValue *VectorTC) {
+  auto *WideIntOrFp = dyn_cast(WideIV);
+  // Truncated wide inductions resume from the last lane of their vector value
+  // in the last vector iteration which is handled elsewhere.
+  if (WideIntOrFp && WideIntOrFp->getTruncInst())
+    return nullptr;
+
+  VPValue *Start = WideIV->getStartValue();
+  VPValue *Step = WideIV->getStepValue();
+  const InductionDescriptor &ID = WideIV->getInductionDescriptor();
+  VPValue *EndValue = VectorTC;
+  if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
+    EndValue = VectorPHBuilder.createDerivedIV(
+        ID.getKind(), dyn_cast_or_null(ID.getInductionBinOp()),
+        Start, VectorTC, Step);
+  }
+
+  // EndValue is derived from the vector trip count (which has the same type as
+  // the widest induction) and thus may be wider than the induction here.
+  Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
+  if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
+    EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
+                                                ScalarTypeOfWideIV);
+  }
+
+  auto *ResumePhiRecipe =
+      ScalarPHBuilder.createNaryOp(VPInstruction::ResumePhi, {EndValue, Start},
+                                   WideIV->getDebugLoc(), "bc.resume.val");
+  return ResumePhiRecipe;
+}
+
+/// Create resume phis in the scalar preheader for first-order recurrences,
+/// reductions and inductions, and update the VPIRInstructions wrapping the
+/// original phis in the scalar header.
 static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
+  VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
   auto *ScalarPH = Plan.getScalarPreheader();
   auto *MiddleVPBB = cast(ScalarPH->getSinglePredecessor());
-  VPBuilder ScalarPHBuilder(ScalarPH);
+  VPBuilder VectorPHBuilder(
+      cast(Plan.getVectorLoopRegion()->getSinglePredecessor()));
   VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
+  VPBuilder ScalarPHBuilder(ScalarPH);
   VPValue *OneVPV = Plan.getOrAddLiveIn(
       ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
   for (VPRecipeBase &ScalarPhiR : *Plan.getScalarHeader()) {
@@ -8970,9 +8910,23 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
     auto *ScalarPhiI = dyn_cast(&ScalarPhiIRI->getInstruction());
     if (!ScalarPhiI)
       break;
+
     auto *VectorPhiR = cast(Builder.getRecipe(ScalarPhiI));
-    if (!isa(VectorPhiR))
+    if (auto *WideIVR = dyn_cast(VectorPhiR)) {
+      if (VPValue *ResumePhi = addResumePhiRecipeForInduction(
+              WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
+              &Plan.getVectorTripCount())) {
+        ScalarPhiIRI->addOperand(ResumePhi);
+        continue;
+      }
+      // TODO: Also handle truncated inductions here. Computing end-values
+      // separately should be done as VPlan-to-VPlan optimization, after
+      // legalizing all resume values to use the last lane from the loop.
+      assert(cast(VectorPhiR)->getTruncInst() &&
+             "should only skip truncated wide inductions");
       continue;
+    }
+
     // The backedge value provides the value to resume coming out of a loop,
     // which for FORs is a vector whose last element needs to be extracted. The
     // start value provides the value if the loop is bypassed.
@@ -9474,6 +9428,18 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
   bool HasNUW = true;
   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
                         DebugLoc());
+
+  // Collect mapping of IR header phis to header phi recipes, to be used in
+  // addScalarResumePhis.
+  VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
+  for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
+    if (isa(&R))
+      continue;
+    auto *HeaderR = cast(&R);
+    RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR);
+  }
+  addScalarResumePhis(RecipeBuilder, *Plan);
+
   assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
   return Plan;
 }
@@ -9762,13 +9728,18 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) {
     State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
 
   Value *Step = State.get(getStepValue(), VPLane(0));
-  Value *CanonicalIV = State.get(getOperand(1), VPLane(0));
+  Value *Index = State.get(getOperand(1), VPLane(0));
   Value *DerivedIV = emitTransformedIndex(
-      State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
-      Kind, cast_if_present(FPBinOp));
+      State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind,
+      cast_if_present(FPBinOp));
   DerivedIV->setName(Name);
-  assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
-
+  // If index is the vector trip count, the concrete value will only be set in
+  // prepareToExecute, leading to missed simplifications, e.g. if it is 0.
+  // TODO: Remove the special case for the vector trip count once it is computed
+  // in VPlan and can be used during VPlan simplification.
+  assert((DerivedIV != Index ||
+          getOperand(1) == &getParent()->getPlan()->getVectorTripCount()) &&
+         "IV didn't need transforming?");
   State.set(this, DerivedIV, VPLane(0));
 }
 
@@ -10078,6 +10049,57 @@ LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
                               !EnableLoopVectorization) {}
 
+/// Prepare \p MainPlan for vectorizing the main vector loop during epilogue
+/// vectorization. Remove ResumePhis from \p MainPlan for inductions that
+/// don't have a corresponding wide induction in \p EpiPlan.
+static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
+  // Collect PHI nodes of widened phis in the VPlan for the epilogue. Those
+  // will need their resume-values computed in the main vector loop. Others
+  // can be removed from the main VPlan.
+  SmallPtrSet EpiWidenedPhis;
+  for (VPRecipeBase &R :
+       EpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
+    if (isa(&R))
+      continue;
+    EpiWidenedPhis.insert(
+        cast(R.getVPSingleValue()->getUnderlyingValue()));
+  }
+  for (VPRecipeBase &R : make_early_inc_range(
+           *cast(MainPlan.getScalarHeader()))) {
+    auto *VPIRInst = cast(&R);
+    auto *IRI = dyn_cast(&VPIRInst->getInstruction());
+    if (!IRI)
+      break;
+    if (EpiWidenedPhis.contains(IRI))
+      continue;
+    // There is no corresponding wide induction in the epilogue plan that would
+    // need a resume value. Remove the VPIRInst wrapping the scalar header phi
+    // together with the corresponding ResumePhi. The resume values for the
+    // scalar loop will be created during execution of EpiPlan.
+    VPRecipeBase *ResumePhi = VPIRInst->getOperand(0)->getDefiningRecipe();
+    VPIRInst->eraseFromParent();
+    ResumePhi->eraseFromParent();
+  }
+  VPlanTransforms::removeDeadRecipes(MainPlan);
+
+  using namespace VPlanPatternMatch;
+  VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
+  VPValue *VectorTC = &MainPlan.getVectorTripCount();
+  // If there is a suitable resume value for the canonical induction in the
+  // scalar (which will become vector) epilogue loop we are done. Otherwise
+  // create it below.
+  if (any_of(*MainScalarPH, [VectorTC](VPRecipeBase &R) {
+        return match(&R, m_VPInstruction(
+                             m_Specific(VectorTC), m_SpecificInt(0)));
+      }))
+    return;
+  VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
+  ScalarPHBuilder.createNaryOp(
+      VPInstruction::ResumePhi,
+      {VectorTC, MainPlan.getCanonicalIV()->getStartValue()}, {},
+      "vec.epilog.resume.val");
+}
+
 /// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
 /// SCEVs from \p ExpandedSCEVs and set resume values for header recipes.
 static void
@@ -10542,12 +10564,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
         // to be vectorized by executing the plan (potentially with a different
         // factor) again shortly afterwards.
         VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width);
+        preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
                                           BestEpiPlan);
         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                            EPI, &LVL, &CM, BFI, PSI, Checks,
                                            *BestMainPlan);
-
         auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
                                              *BestMainPlan, MainILV, DT, false);
         ++LoopsVectorized;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
index ac7f1478cf68c..5b77ced73bce0 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
@@ -13,9 +13,9 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 8, [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call  @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 8)
 ; CHECK-NEXT:    [[TMP8:%.*]] = call  @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul  [[TMP8]], splat (i64 1)
@@ -99,9 +99,9 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call  @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
 ; CHECK-NEXT:    [[TMP8:%.*]] = call  @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul  [[TMP8]], splat (i64 1)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index 867355952cafe..caa98d766a8c3 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -821,11 +821,11 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
 ; PRED-NEXT:    [[N_RND_UP:%.*]] = add i64 257, [[TMP2]]
 ; PRED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; PRED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; PRED-NEXT:    [[TMP3:%.*]] = mul i64 [[N_VEC]], 8
 ; PRED-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]]
 ; PRED-NEXT:    [[IND_END1:%.*]] = mul i64 [[N_VEC]], 2
-; PRED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; PRED-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 ; PRED-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 2
 ; PRED-NEXT:    [[TMP8:%.*]] = sub i64 257, [[TMP7]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
index 03de9acaf499b..88b14b18c1588 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
@@ -233,8 +233,8 @@ define void @test_widen_induction_variable_start(ptr %A, i64 %N, i64 %start) {
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF2]]
 ; CHECK-NEXT:    [[IND_END4:%.*]] = add i64 [[START]], [[N_VEC3]]
@@ -409,8 +409,8 @@ define void @test_widen_extended_induction(ptr %dst) {
 ; CHECK:       vec.epilog.iter.check:
 ; CHECK-NEXT:    br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ 10000, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i8> poison, i8 [[BC_RESUME_VAL]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <2 x i8> [[DOTSPLAT]], 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fixed-order-recurrence.ll
index f7a1eb455fc1a..a939969af852e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fixed-order-recurrence.ll
@@ -48,8 +48,8 @@ define void @firstorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
@@ -154,10 +154,10 @@ define void @thirdorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE45]], [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT7:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT6]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE44]], [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT10:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT9]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
index cbf9bf08c2a20..08a6001431903 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
@@ -91,8 +91,8 @@ define void @test_iv_cost(ptr %ptr.start, i8 %a, i64 %b) {
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[START]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
 ; CHECK:       [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-NEXT:    [[IND_END6:%.*]] = getelementptr i8, ptr [[PTR_START]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[IND_END:%.*]] = sub i64 [[START]], [[N_VEC]]
+; CHECK-NEXT:    [[IND_END2:%.*]] = getelementptr i8, ptr [[PTR_START]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[START]], [[N_VEC]]
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
@@ -117,11 +117,11 @@ define void @test_iv_cost(ptr %ptr.start, i8 %a, i64 %b) {
 ; CHECK-NEXT:    br i1 [[CMP_N11]], label %[[EXIT_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       [[VEC_EPILOG_SCALAR_PH]]:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END1]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[START]], %[[ITER_CHECK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL11:%.*]] = phi ptr [ [[IND_END5]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[PTR_START]], %[[ITER_CHECK]] ], [ [[IND_END6]], %[[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL9:%.*]] = phi ptr [ [[IND_END5]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[PTR_START]], %[[ITER_CHECK]] ], [ [[IND_END2]], %[[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ]
-; CHECK-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL11]], %[[VEC_EPILOG_SCALAR_PH]] ]
+; CHECK-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL9]], %[[VEC_EPILOG_SCALAR_PH]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], -1
 ; CHECK-NEXT:    [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 1
 ; CHECK-NEXT:    store i8 0, ptr [[PTR_IV]], align 1
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index d42e6af1cec0c..56a468ed1310b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -819,8 +819,8 @@ define void @exit_cond_zext_iv(ptr %dst, i64 %N) {
 ; PRED-NEXT:    [[N_RND_UP:%.*]] = add i64 [[UMAX1]], 1
 ; PRED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2
 ; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; PRED-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32
 ; PRED-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[UMAX1]], 1
+; PRED-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32
 ; PRED-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
 ; PRED-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; PRED-NEXT:    br label [[VECTOR_BODY:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
index bf27f9e6be65e..f9cc195e36702 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
@@ -333,10 +333,10 @@ define i64 @test_ptr_ivs_and_widened_ivs(ptr %src, i32 %N) {
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SRC]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SRC]], [[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL4:%.*]] = phi i32 [ [[IND_END3]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[P:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[SHL:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
index 393ee8d30433b..9dceb0167a4ac 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
@@ -50,12 +50,12 @@ define void @saddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr noca
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
+; CHECK-NEXT:    [[DOTCAST1:%.*]] = trunc nuw i64 [[N_VEC]] to i32
+; CHECK-NEXT:    [[IND_END8:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST1]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[N_VEC]], 1
-; CHECK-NEXT:    [[IND_END13:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP6]]
+; CHECK-NEXT:    [[IND_END10:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = shl nuw nsw i64 [[N_VEC]], 1
-; CHECK-NEXT:    [[IND_END10:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP7]]
-; CHECK-NEXT:    [[DOTCAST7:%.*]] = trunc nuw i64 [[N_VEC]] to i32
-; CHECK-NEXT:    [[IND_END8:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST7]]
+; CHECK-NEXT:    [[IND_END13:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 12
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
@@ -171,10 +171,10 @@ define void @umin(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocaptur
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
-; CHECK-NEXT:    [[IND_END12:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC]]
-; CHECK-NEXT:    [[IND_END9:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[DOTCAST6:%.*]] = trunc nuw i64 [[N_VEC]] to i32
 ; CHECK-NEXT:    [[IND_END7:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST6]]
+; CHECK-NEXT:    [[IND_END9:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[IND_END12:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 24
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
index 528e202b4997f..6d57f212fd88c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
@@ -105,9 +105,9 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
 ; CHECK-VS1-NEXT:    [[TMP29:%.*]] = mul i64 [[TMP28]], 8
 ; CHECK-VS1-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[TMP3]], [[TMP29]]
 ; CHECK-VS1-NEXT:    [[N_VEC3:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF2]]
-; CHECK-VS1-NEXT:    [[IND_END:%.*]] = add i64 [[TMP0]], [[N_VEC3]]
 ; CHECK-VS1-NEXT:    [[TMP30:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-VS1-NEXT:    [[TMP31:%.*]] = mul i64 [[TMP30]], 8
+; CHECK-VS1-NEXT:    [[TMP39:%.*]] = add i64 [[TMP0]], [[N_VEC3]]
 ; CHECK-VS1-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement  poison, i8 [[CONV]], i64 0
 ; CHECK-VS1-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector  [[BROADCAST_SPLATINSERT7]],  poison,  zeroinitializer
 ; CHECK-VS1-NEXT:    br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
@@ -127,7 +127,7 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
 ; CHECK-VS1-NEXT:    [[CMP_N10:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC3]]
 ; CHECK-VS1-NEXT:    br i1 [[CMP_N10]], label %[[WHILE_END_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
 ; CHECK-VS1:       [[VEC_EPILOG_SCALAR_PH]]:
-; CHECK-VS1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[ITER_CHECK]] ], [ [[IND_END4]], %[[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-VS1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP39]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[ITER_CHECK]] ], [ [[IND_END4]], %[[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-VS1-NEXT:    br label %[[WHILE_BODY:.*]]
 ; CHECK-VS1:       [[WHILE_BODY]]:
 ; CHECK-VS1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[WHILE_BODY]] ]
@@ -213,9 +213,9 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
 ; CHECK-VS2-NEXT:    [[TMP29:%.*]] = mul i64 [[TMP28]], 4
 ; CHECK-VS2-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[TMP3]], [[TMP29]]
 ; CHECK-VS2-NEXT:    [[N_VEC3:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF2]]
-; CHECK-VS2-NEXT:    [[IND_END:%.*]] = add i64 [[TMP0]], [[N_VEC3]]
 ; CHECK-VS2-NEXT:    [[TMP30:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-VS2-NEXT:    [[TMP31:%.*]] = mul i64 [[TMP30]], 4
+; CHECK-VS2-NEXT:    [[TMP39:%.*]] = add i64 [[TMP0]], [[N_VEC3]]
 ; CHECK-VS2-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement  poison, i8 [[CONV]], i64 0
 ; CHECK-VS2-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector  [[BROADCAST_SPLATINSERT7]],  poison,  zeroinitializer
 ; CHECK-VS2-NEXT:    br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
@@ -235,7 +235,7 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
 ; CHECK-VS2-NEXT:    [[CMP_N10:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC3]]
 ; CHECK-VS2-NEXT:    br i1 [[CMP_N10]], label %[[WHILE_END_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
 ; CHECK-VS2:       [[VEC_EPILOG_SCALAR_PH]]:
-; CHECK-VS2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[ITER_CHECK]] ], [ [[IND_END4]], %[[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-VS2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP39]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[ITER_CHECK]] ], [ [[IND_END4]], %[[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-VS2-NEXT:    br label %[[WHILE_BODY:.*]]
 ; CHECK-VS2:       [[WHILE_BODY]]:
 ; CHECK-VS2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[WHILE_BODY]] ]
@@ -428,9 +428,9 @@ define void @overflow_indvar_known_false(ptr nocapture noundef %p, i32 noundef %
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP1]], [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 16
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call  @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[TMP1]])
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement  poison, i8 [[CONV]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector  [[BROADCAST_SPLATINSERT]],  poison,  zeroinitializer
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/mul-simplification.ll b/llvm/test/Transforms/LoopVectorize/AArch64/mul-simplification.ll
index 771dd00034c70..0ff98d2abe776 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/mul-simplification.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/mul-simplification.ll
@@ -19,8 +19,8 @@ define i64 @mul_select_operand_known_1_via_scev() {
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> [[VEC_PHI]])
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ 12, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
index 0cea16d103678..3d4f7e0e4924b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
@@ -137,9 +137,9 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2)
 ; VSCALEFORTUNING2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; VSCALEFORTUNING2-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VSCALEFORTUNING2:       scalar.ph:
-; VSCALEFORTUNING2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VSCALEFORTUNING2-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP24]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; VSCALEFORTUNING2-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP24]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; VSCALEFORTUNING2-NEXT:    [[SCALAR_RECUR_INIT11:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; VSCALEFORTUNING2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; VSCALEFORTUNING2-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP50]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; VSCALEFORTUNING2-NEXT:    br label [[LOOP:%.*]]
 ; VSCALEFORTUNING2:       loop:
@@ -260,9 +260,9 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2)
 ; PRED-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement  [[TMP29]], i32 [[TMP47]]
 ; PRED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; PRED:       scalar.ph:
-; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; PRED-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP28]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1]] ]
+; PRED-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP28]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
 ; PRED-NEXT:    [[SCALAR_RECUR_INIT8:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1]] ]
+; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1]] ]
 ; PRED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP44]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1]] ]
 ; PRED-NEXT:    br label [[LOOP1:%.*]]
 ; PRED:       loop:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
index a426cdf08062c..a83c62b04afc7 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
@@ -23,9 +23,9 @@ define void @test_no_scalarization(ptr %a, ptr noalias %b, i32 %idx, i32 %n) #0
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP4]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], [[TMP5]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = add i32 [[IDX]], [[N_VEC]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = add i32 [[IDX]], [[N_VEC]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement  poison, i32 [[IDX]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector  [[DOTSPLATINSERT]],  poison,  zeroinitializer
 ; CHECK-NEXT:    [[TMP8:%.*]] = call  @llvm.stepvector.nxv2i32()
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
index 6ecaff048ca3c..cb4fd04d1bc4f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -512,9 +512,9 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
 ; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]]
 ; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; CHECK-UNORDERED-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = insertelement  splat (float -0.000000e+00), float [[A2]], i32 0
 ; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = insertelement  splat (float -0.000000e+00), float [[A1]], i32 0
 ; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -540,9 +540,9 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-UNORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-UNORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK-UNORDERED:       scalar.ph:
-; CHECK-UNORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-UNORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ [[A2]], [[ENTRY]] ]
 ; CHECK-UNORDERED-NEXT:    [[BC_MERGE_RDX2:%.*]] = phi float [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[A1]], [[ENTRY]] ]
+; CHECK-UNORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-UNORDERED-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK-UNORDERED:       for.body:
 ; CHECK-UNORDERED-NEXT:    [[ADD_PHI1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ]
@@ -583,9 +583,9 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
 ; CHECK-ORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]]
 ; CHECK-ORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-ORDERED-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-ORDERED-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; CHECK-ORDERED-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-ORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-ORDERED:       vector.body:
 ; CHECK-ORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -607,9 +607,9 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-ORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-ORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK-ORDERED:       scalar.ph:
-; CHECK-ORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ [[A2]], [[ENTRY]] ]
 ; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX2:%.*]] = phi float [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ [[A1]], [[ENTRY]] ]
+; CHECK-ORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK-ORDERED:       for.body:
 ; CHECK-ORDERED-NEXT:    [[ADD_PHI1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ]
@@ -649,9 +649,9 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-ORDERED-TF-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP2]], [[TMP5]]
 ; CHECK-ORDERED-TF-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]]
 ; CHECK-ORDERED-TF-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-ORDERED-TF-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-ORDERED-TF-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-ORDERED-TF-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-ORDERED-TF-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-ORDERED-TF-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-ORDERED-TF-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
 ; CHECK-ORDERED-TF-NEXT:    [[TMP10:%.*]] = sub i64 [[TMP2]], [[TMP9]]
@@ -684,9 +684,9 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-ORDERED-TF:       middle.block:
 ; CHECK-ORDERED-TF-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK-ORDERED-TF:       scalar.ph:
-; CHECK-ORDERED-TF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-ORDERED-TF-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ [[A2]], [[ENTRY]] ]
 ; CHECK-ORDERED-TF-NEXT:    [[BC_MERGE_RDX2:%.*]] = phi float [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ [[A1]], [[ENTRY]] ]
+; CHECK-ORDERED-TF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-ORDERED-TF-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK-ORDERED-TF:       for.body:
 ; CHECK-ORDERED-TF-NEXT:    [[ADD_PHI1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
index 7d058a6ef25db..295c0655a4b4d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
@@ -380,9 +380,9 @@ define void @test_pr57912_pointer_induction(ptr %start) #0 {
 ; CHECK-NEXT:    [[TMP24:%.*]] = mul i64 [[TMP23]], 8
 ; CHECK-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 10000, [[TMP24]]
 ; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 10000, [[N_MOD_VF2]]
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC3]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP26:%.*]] = mul i64 [[TMP25]], 8
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC3]]
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
 ; CHECK-NEXT:    [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
index 25d3b3fe3b837..90ef2da3d1637 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
@@ -16,10 +16,10 @@ define void @induction_i7(ptr %dst) #0 {
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 64, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i7
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP40]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i7
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement  poison, i64 [[TMP40]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT_:%.*]] = shufflevector  [[DOTSPLATINSERT]],  poison,  zeroinitializer
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = trunc  [[DOTSPLAT_]] to 
@@ -82,10 +82,10 @@ define void @induction_i3_zext(ptr %dst) #0 {
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 64, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i3
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP40]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i3
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement  poison, i64 [[TMP40]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT_:%.*]] = shufflevector  [[DOTSPLATINSERT]],  poison,  zeroinitializer
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = trunc  [[DOTSPLAT_]] to 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index 9b37ba588f5d6..05c0bc0761ea4 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -467,9 +467,9 @@ define void @even_load_static_tc(ptr noalias nocapture readonly %A, ptr noalias
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i64 512, [[TMP1]]
-; CHECK-NEXT:    [[IND_END:%.*]] = shl nuw nsw i64 [[N_VEC]], 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = shl nuw nsw i64 [[N_VEC]], 1
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -553,9 +553,9 @@ define void @even_load_dynamic_tc(ptr noalias nocapture readonly %A, ptr noalias
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP6]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP4]], [[TMP9]]
-; CHECK-NEXT:    [[IND_END:%.*]] = shl i64 [[N_VEC]], 1
 ; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = shl i64 [[N_VEC]], 1
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1248,9 +1248,9 @@ define void @PR27626_4(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 {
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP5]], -4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], [[DOTNEG]]
-; CHECK-NEXT:    [[IND_END:%.*]] = shl nuw i64 [[N_VEC]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = shl nuw i64 [[N_VEC]], 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP9:%.*]] = shl <vscale x 4 x i64> [[TMP8]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP11:%.*]] = shl nuw nsw i64 [[TMP6]], 3
@@ -1339,10 +1339,10 @@ define void @PR27626_5(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 {
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP5]], -4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], [[DOTNEG]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw i64 [[N_VEC]], 1
-; CHECK-NEXT:    [[IND_END:%.*]] = or disjoint i64 [[TMP6]], 3
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = shl nuw i64 [[N_VEC]], 1
+; CHECK-NEXT:    [[IND_END:%.*]] = or disjoint i64 [[TMP11]], 3
 ; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = shl <vscale x 4 x i64> [[TMP9]], splat (i64 1)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> [[TMP10]], splat (i64 3)
@@ -1449,9 +1449,9 @@ define void @PR34743(ptr %a, ptr %b, i64 %n) #1 {
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP8]], -4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP1]], [[DOTNEG]]
-; CHECK-NEXT:    [[IND_END:%.*]] = shl i64 [[N_VEC]], 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = shl nuw nsw i64 [[TMP9]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = shl i64 [[N_VEC]], 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-NEXT:    [[TMP12:%.*]] = shl nuw nsw i32 [[TMP11]], 2
 ; CHECK-NEXT:    [[TMP13:%.*]] = add nsw i32 [[TMP12]], -1
@@ -1492,9 +1492,9 @@ define void @PR34743(ptr %a, ptr %b, i64 %n) #1 {
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ], [ [[DOTPRE]], [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[TMP33:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LOAD2:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll
index 64b69be5f5259..322f96f45d191 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll
@@ -19,12 +19,12 @@ define ptr @test(ptr %start.1, ptr %start.2, ptr %end) {
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], [[TMP7]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[N_VEC]], 8
 ; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START_1:%.*]], i64 [[TMP8]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[N_VEC]], 8
 ; CHECK-NEXT:    [[IND_END3:%.*]] = getelementptr i8, ptr [[START_2]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
index d81cfbf08ec93..1f7d0b745f929 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
@@ -11,7 +11,7 @@ target triple = "aarch64-unknown-linux-gnu"
 ; VPLANS-LABEL: Checking a loop in 'simple_memset'
 ; VPLANS:      VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' {
 ; VPLANS-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
-; VPLANS-NEXT: vp<[[TC:%[0-9]+]]> = original trip-count
+; VPLANS:      vp<[[TC:%[0-9]+]]> = original trip-count
 ; VPLANS-EMPTY:
 ; VPLANS-NEXT: ir-bb:
 ; VPLANS-NEXT:  EMIT vp<[[TC]]> = EXPAND SCEV (1 umax %n)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
index e5b9812604f16..75b2df93c9350 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
@@ -215,9 +215,9 @@ define void @copy_stride4(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP2]], [[TMP5]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
 ; CHECK-NEXT:    [[TMP10:%.*]] = sub i64 [[TMP2]], [[TMP9]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
index de1500421a915..603bd98e81497 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
@@ -17,6 +17,8 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT: Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:  vp<[[END1:%.+]]> = DERIVED-IV ir<%start.1> + vp<[[VEC_TC]]> * ir<8>
+; CHECK-NEXT:  vp<[[END2:%.+]]> = DERIVED-IV ir<%start.2> + vp<[[VEC_TC]]> * ir<1>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT:    vector loop: {
@@ -55,11 +57,12 @@ define void @pointer_induction_used_as_vector(ptr noalias %start.1, ptr noalias
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[N_VEC]], 8
 ; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START_1:%.*]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[IND_END2:%.*]] = getelementptr i8, ptr [[START_2:%.*]], i64 [[N_VEC]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[START_2]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
@@ -150,9 +153,9 @@ define void @pointer_induction(ptr noalias %start, i64 %N) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START:%.*]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START:%.*]], i64 [[N_VEC]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[START]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
index 4b096e17a4fa0..881de8dc79823 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
@@ -25,10 +25,10 @@ define void @widen_ptr_phi_unrolled(ptr noalias nocapture %a, ptr noalias nocapt
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -8
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]]
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[N_VEC]], 3
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3
+; CHECK-NEXT:    [[TMP26:%.*]] = shl i64 [[N_VEC]], 3
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[TMP26]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -138,12 +138,12 @@ define void @widen_2ptrs_phi_unrolled(ptr noalias nocapture %dst, ptr noalias no
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -8
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 3
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[N_VEC]], 2
 ; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[SRC:%.*]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = shl i64 [[N_VEC]], 2
 ; CHECK-NEXT:    [[IND_END2:%.*]] = getelementptr i8, ptr [[DST:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 3
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -229,12 +229,12 @@ define i32 @pointer_iv_mixed(ptr noalias %a, ptr noalias %b, i64 %n) #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -2
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX]], [[DOTNEG]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[N_VEC]], 2
 ; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = shl i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[IND_END2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 1
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[A]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll
index 8037a3a0c0f84..c119248c0be43 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll
@@ -45,10 +45,11 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi [[VTC]], ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb:
-; CHECK-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:        IR   %exitcond = icmp eq i64 %iv.next, %N
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -90,10 +91,11 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi [[VTC]], ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb:
-; CHECK-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:        IR   %exitcond = icmp eq i64 %iv.next, %N
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll
index 8ac46fe7687d2..0c246c6ee93e3 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll
@@ -43,10 +43,11 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:  EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb:
-; CHECK-NEXT:   IR   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+; CHECK-NEXT:   IR   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:        IR   %exitcond = icmp eq i64 %indvars.iv.next, 1024
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -88,10 +89,11 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:  EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb:
-; CHECK-NEXT:   IR   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+; CHECK-NEXT:   IR   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:        IR   %exitcond = icmp eq i64 %indvars.iv.next, 1024
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -138,6 +140,7 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:  EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb:
@@ -183,10 +186,11 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:  EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb:
-; CHECK-NEXT:   IR   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+; CHECK-NEXT:   IR   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:        IR   %exitcond = icmp eq i64 %indvars.iv.next, 1024
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -232,6 +236,7 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:  EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb:
@@ -277,10 +282,11 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:  EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb:
-; CHECK-NEXT:   IR   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+; CHECK-NEXT:   IR   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:        IR   %exitcond = icmp eq i64 %indvars.iv.next, 1024
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll
index 648f6e874abbe..a119707bed120 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll
@@ -41,10 +41,11 @@ target triple = "arm64-apple-ios"
 ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb:
-; CHECK-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:        IR   %cmp = icmp ne i64 %iv.next, 1024
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -86,10 +87,11 @@ target triple = "arm64-apple-ios"
 ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb:
-; CHECK-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:        IR   %cmp = icmp ne i64 %iv.next, 1024
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll
index 79ced9a483ef7..2f1af7951dbc2 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll
@@ -108,7 +108,7 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) {
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
 ; CHECK:       [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-NEXT:    [[IND_END27:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
@@ -117,40 +117,40 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) {
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP52]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF24:%.*]] = urem i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_VEC25:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF24]]
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC25]]
+; CHECK-NEXT:    [[TMP56:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC25]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[VEC_EPILOG_RESUME_VAL]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[DOTSPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <2 x i64> [[DOTSPLAT]], 
-; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT:    [[TMP57:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
 ; CHECK-NEXT:    br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
 ; CHECK:       [[VEC_EPILOG_VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX29:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT35:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND30:%.*]] = phi <2 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT31:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI32:%.*]] = phi <2 x i64> [ [[TMP55]], %[[VEC_EPILOG_PH]] ], [ [[TMP56:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP57:%.*]] = add i64 [[INDEX29]], 0
-; CHECK-NEXT:    [[NEXT_GEP33:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP57]]
-; CHECK-NEXT:    [[TMP58:%.*]] = getelementptr i8, ptr [[NEXT_GEP33]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD34:%.*]] = load <2 x i8>, ptr [[TMP58]], align 1
-; CHECK-NEXT:    [[TMP59:%.*]] = zext <2 x i8> [[WIDE_LOAD34]] to <2 x i64>
-; CHECK-NEXT:    [[TMP60:%.*]] = shl <2 x i64> [[VEC_IND30]], splat (i64 1)
-; CHECK-NEXT:    [[TMP61:%.*]] = shl <2 x i64> [[TMP59]], [[TMP60]]
-; CHECK-NEXT:    [[TMP56]] = or <2 x i64> [[TMP61]], [[VEC_PHI32]]
-; CHECK-NEXT:    [[INDEX_NEXT35]] = add nuw i64 [[INDEX29]], 2
-; CHECK-NEXT:    [[VEC_IND_NEXT31]] = add <2 x i64> [[VEC_IND30]], splat (i64 2)
-; CHECK-NEXT:    [[TMP62:%.*]] = icmp eq i64 [[INDEX_NEXT35]], [[N_VEC25]]
-; CHECK-NEXT:    br i1 [[TMP62]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    [[INDEX26:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT32:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND27:%.*]] = phi <2 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT28:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI29:%.*]] = phi <2 x i64> [ [[TMP57]], %[[VEC_EPILOG_PH]] ], [ [[TMP58:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP59:%.*]] = add i64 [[INDEX26]], 0
+; CHECK-NEXT:    [[NEXT_GEP30:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP59]]
+; CHECK-NEXT:    [[TMP60:%.*]] = getelementptr i8, ptr [[NEXT_GEP30]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD32:%.*]] = load <2 x i8>, ptr [[TMP60]], align 1
+; CHECK-NEXT:    [[TMP61:%.*]] = zext <2 x i8> [[WIDE_LOAD32]] to <2 x i64>
+; CHECK-NEXT:    [[TMP62:%.*]] = shl <2 x i64> [[VEC_IND27]], splat (i64 1)
+; CHECK-NEXT:    [[TMP63:%.*]] = shl <2 x i64> [[TMP61]], [[TMP62]]
+; CHECK-NEXT:    [[TMP58]] = or <2 x i64> [[TMP63]], [[VEC_PHI29]]
+; CHECK-NEXT:    [[INDEX_NEXT32]] = add nuw i64 [[INDEX26]], 2
+; CHECK-NEXT:    [[VEC_IND_NEXT28]] = add <2 x i64> [[VEC_IND27]], splat (i64 2)
+; CHECK-NEXT:    [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT32]], [[N_VEC25]]
+; CHECK-NEXT:    br i1 [[TMP54]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       [[VEC_EPILOG_MIDDLE_BLOCK]]:
-; CHECK-NEXT:    [[TMP54:%.*]] = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> [[TMP56]])
-; CHECK-NEXT:    [[CMP_N36:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC25]]
-; CHECK-NEXT:    br i1 [[CMP_N36]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
+; CHECK-NEXT:    [[TMP55:%.*]] = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> [[TMP58]])
+; CHECK-NEXT:    [[CMP_N33:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC25]]
+; CHECK-NEXT:    br i1 [[CMP_N33]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       [[VEC_EPILOG_SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL35:%.*]] = phi i64 [ [[N_VEC25]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL36:%.*]] = phi ptr [ [[IND_END]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[START]], %[[ITER_CHECK]] ], [ [[IND_END27]], %[[VEC_EPILOG_ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX37:%.*]] = phi i64 [ [[TMP54]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[TMP52]], %[[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL34:%.*]] = phi i64 [ [[N_VEC25]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX35:%.*]] = phi i64 [ [[TMP55]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[TMP52]], %[[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL36:%.*]] = phi ptr [ [[TMP56]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[START]], %[[ITER_CHECK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL35]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX37]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL34]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX35]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL36]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[TMP53:%.*]] = load i8, ptr [[PTR_IV]], align 1
 ; CHECK-NEXT:    [[CONV3:%.*]] = zext i8 [[TMP53]] to i64
@@ -164,7 +164,7 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) {
 ; CHECK-NEXT:    [[AND:%.*]] = select i1 [[CMP_I166_I]], i1 [[CMP2]], i1 false
 ; CHECK-NEXT:    br i1 [[AND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP52]], %[[MIDDLE_BLOCK]] ], [ [[TMP54]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP52]], %[[MIDDLE_BLOCK]] ], [ [[TMP55]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    [[RES:%.*]] = icmp eq i64 [[RED_NEXT_LCSSA]], 0
 ; CHECK-NEXT:    ret i1 [[RES]]
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll
index 130ef7542cfb4..f9c1ab4a81810 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll
@@ -100,8 +100,8 @@ define i32 @any_of_reduction_used_in_blend_with_mutliple_phis(ptr %src, i64 %N,
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
 ; CHECK:       [[LOOP_HEADER]]:
 ; CHECK-NEXT:    [[ANY_OF_RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ANY_OF_RED_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
index 11efac951082a..f2318d6057eec 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
@@ -22,9 +22,9 @@ define void @block_with_dead_inst_1(ptr %src, i64 %N) #0 {
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i64 [[TMP6]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP8]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT:    [[TMP13:%.*]] = mul <vscale x 8 x i64> [[TMP11]], splat (i64 3)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP13]]
@@ -116,9 +116,9 @@ define void @block_with_dead_inst_2(ptr %src) #0 {
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 333, [[TMP5]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul <vscale x 4 x i64> [[TMP8]], splat (i64 3)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP10]]
@@ -210,9 +210,9 @@ define void @multiple_blocks_with_dead_insts_3(ptr %src) #0 {
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 333, [[TMP5]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul <vscale x 4 x i64> [[TMP8]], splat (i64 3)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP10]]
@@ -314,9 +314,9 @@ define void @multiple_blocks_with_dead_insts_4(ptr %src, i64 %N) #0 {
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i64 [[TMP6]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP8]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT:    [[TMP13:%.*]] = mul <vscale x 8 x i64> [[TMP11]], splat (i64 3)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP13]]
@@ -420,9 +420,9 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_5(ptr %src) #0 {
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 333, [[TMP5]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul <vscale x 4 x i64> [[TMP8]], splat (i64 3)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP10]]
@@ -534,9 +534,9 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_6(ptr %src, i1 %
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i64 [[TMP6]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP8]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP11:%.*]] = call  @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT:    [[TMP13:%.*]] = mul  [[TMP11]], splat (i64 3)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add  zeroinitializer, [[TMP13]]
@@ -851,9 +851,9 @@ define void @dead_load_in_block(ptr %dst, ptr %src, i8 %N, i64 %x) #0 {
 ; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], [[TMP15]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP18:%.*]] = call  @llvm.stepvector.nxv2i64()
 ; CHECK-NEXT:    [[TMP20:%.*]] = mul  [[TMP18]], splat (i64 3)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add  zeroinitializer, [[TMP20]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
index 16c23cd777b65..450405f193465 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
@@ -29,10 +29,10 @@ define void @dead_load(ptr %p, i16 %start) {
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i64 [[TMP9]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP5]], [[TMP11]]
-; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 [[N_VEC]], 3
-; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[START_EXT]], [[TMP12]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 8
+; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[N_VEC]], 3
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[START_EXT]], [[TMP18]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement  poison, i64 [[START_EXT]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector  [[DOTSPLATINSERT]],  poison,  zeroinitializer
 ; CHECK-NEXT:    [[TMP15:%.*]] = call  @llvm.stepvector.nxv8i64()
@@ -111,9 +111,9 @@ define i8 @dead_live_out_due_to_scalar_epilogue_required(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 252, [[TMP6]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i32 [[N_VEC]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], 4
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i32 [[N_VEC]], 4
 ; CHECK-NEXT:    [[TMP9:%.*]] = call  @llvm.stepvector.nxv4i32()
 ; CHECK-NEXT:    [[TMP11:%.*]] = mul  [[TMP9]], splat (i32 4)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add  zeroinitializer, [[TMP11]]
@@ -334,9 +334,9 @@ define void @test_phi_in_latch_redundant(ptr %dst, i32 %a) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 37, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 37, [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 9
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 9
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement  poison, i32 [[A]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector  [[BROADCAST_SPLATINSERT]],  poison,  zeroinitializer
 ; CHECK-NEXT:    [[TMP10:%.*]] = xor  [[BROADCAST_SPLAT]], splat (i32 -1)
@@ -419,9 +419,9 @@ define void @gather_interleave_group_with_dead_insert_pos(i64 %N, ptr noalias %s
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-NEXT:    [[TMP9:%.*]] = call  @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP11:%.*]] = mul  [[TMP9]], splat (i64 2)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add  zeroinitializer, [[TMP11]]
@@ -433,24 +433,24 @@ define void @gather_interleave_group_with_dead_insert_pos(i64 %N, ptr noalias %s
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi  [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP13]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load , ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP14]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load , ptr [[TMP15]], align 1
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { ,  } @llvm.vector.deinterleave2.nxv8i8( [[WIDE_VEC]])
-; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 0
-; CHECK-NEXT:    [[TMP16:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 1
-; CHECK-NEXT:    [[TMP17:%.*]] = zext  [[TMP16]] to 
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr [[DST]],  [[VEC_IND]]
-; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP17]],  [[TMP18]], i32 4,  splat (i1 true))
+; CHECK-NEXT:    [[TMP16:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 0
+; CHECK-NEXT:    [[TMP17:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 1
+; CHECK-NEXT:    [[TMP18:%.*]] = zext  [[TMP17]] to 
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i32, ptr [[DST]],  [[VEC_IND]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP18]],  [[TMP19]], i32 4,  splat (i1 true))
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add  [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
index dd2e75f1f5e21..58d6fd05241f2 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
@@ -28,8 +28,8 @@ define i64 @pr97452_scalable_vf1_for(ptr %src) #0 {
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[WIDE_LOAD1]], i32 3
 ; CHECK-NEXT:    br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[L:%.*]], %[[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll
index 8131c7bfd752d..e4425a9327385 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll
@@ -63,13 +63,13 @@ define void @skip_free_iv_truncate(i16 %x, ptr %A) #0 {
 ; CHECK-NEXT:    [[TMP47:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP48:%.*]] = select i1 [[TMP47]], i64 [[TMP46]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP5]], [[TMP48]]
+; CHECK-NEXT:    [[TMP51:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP52:%.*]] = mul i64 [[TMP51]], 8
 ; CHECK-NEXT:    [[TMP49:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[X_I64]], [[TMP49]]
 ; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
 ; CHECK-NEXT:    [[TMP50:%.*]] = mul i32 [[DOTCAST]], 3
 ; CHECK-NEXT:    [[IND_END22:%.*]] = add i32 [[X_I32]], [[TMP50]]
-; CHECK-NEXT:    [[TMP51:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP52:%.*]] = mul i64 [[TMP51]], 8
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement  poison, i64 [[X_I64]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector  [[DOTSPLATINSERT]],  poison,  zeroinitializer
 ; CHECK-NEXT:    [[TMP53:%.*]] = call  @llvm.stepvector.nxv8i64()
@@ -92,11 +92,11 @@ define void @skip_free_iv_truncate(i16 %x, ptr %A) #0 {
 ; CHECK-NEXT:    br label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[X_I64]], %[[VECTOR_MEMCHECK]] ], [ [[X_I64]], %[[ENTRY]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL14:%.*]] = phi i32 [ [[IND_END22]], %[[MIDDLE_BLOCK]] ], [ [[X_I32]], %[[VECTOR_MEMCHECK]] ], [ [[X_I32]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL13:%.*]] = phi i32 [ [[IND_END22]], %[[MIDDLE_BLOCK]] ], [ [[X_I32]], %[[VECTOR_MEMCHECK]] ], [ [[X_I32]], %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[IV_CONV:%.*]] = phi i32 [ [[BC_RESUME_VAL14]], %[[SCALAR_PH]] ], [ [[TMP64:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_CONV:%.*]] = phi i32 [ [[BC_RESUME_VAL13]], %[[SCALAR_PH]] ], [ [[TMP64:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[GEP_I64:%.*]] = getelementptr i64, ptr [[A]], i64 [[IV]]
 ; CHECK-NEXT:    [[TMP61:%.*]] = load i64, ptr [[GEP_I64]], align 8
 ; CHECK-NEXT:    [[TMP62:%.*]] = sext i32 [[IV_CONV]] to i64
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
index 2c19aab81251a..dc63072aa795f 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
@@ -38,9 +38,9 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
 ; RV32-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
 ; RV32-NEXT:    [[N_MOD_VF:%.*]] = urem i64 625, [[TMP4]]
 ; RV32-NEXT:    [[N_VEC:%.*]] = sub i64 625, [[N_MOD_VF]]
-; RV32-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 16
 ; RV32-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; RV32-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; RV32-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 16
 ; RV32-NEXT:    [[TMP7:%.*]] = call  @llvm.stepvector.nxv2i64()
 ; RV32-NEXT:    [[TMP9:%.*]] = mul  [[TMP7]], splat (i64 16)
 ; RV32-NEXT:    [[INDUCTION:%.*]] = add  zeroinitializer, [[TMP9]]
@@ -117,9 +117,9 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
 ; RV64-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
 ; RV64-NEXT:    [[N_MOD_VF:%.*]] = urem i64 625, [[TMP4]]
 ; RV64-NEXT:    [[N_VEC:%.*]] = sub i64 625, [[N_MOD_VF]]
-; RV64-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 16
 ; RV64-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; RV64-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; RV64-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 16
 ; RV64-NEXT:    [[TMP7:%.*]] = call  @llvm.stepvector.nxv2i64()
 ; RV64-NEXT:    [[TMP9:%.*]] = mul  [[TMP7]], splat (i64 16)
 ; RV64-NEXT:    [[INDUCTION:%.*]] = add  zeroinitializer, [[TMP9]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index eb60c24393df9..951d833fa941e 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -64,6 +64,8 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  vector.ph:
+; CHECK-NEXT:    vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + vp<[[VEC_TC]]> * ir<-1>
+; CHECK-NEXT:    vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + vp<[[VEC_TC]]> * ir<-1>
 ; CHECK-NEXT:  Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT:   vector loop: {
@@ -92,11 +94,13 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  Successor(s): ir-bb, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  scalar.ph:
+; CHECK-NEXT:    EMIT vp<[[RESUME1:%.+]]> = resume-phi vp<[[END1]]>, ir<%0>
+; CHECK-NEXT:    EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi vp<[[END2]]>, ir<%n>
 ; CHECK-NEXT:  Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb:
-; CHECK-NEXT:    IR   %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-; CHECK-NEXT:    IR   %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
+; CHECK-NEXT:    IR   %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME1]]> from scalar.ph)
+; CHECK-NEXT:    IR   %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] (extra operand: vp<[[RESUME2]]>.1 from scalar.ph)
 ; CHECK:         IR   %indvars.iv.next = add nsw i64 %indvars.iv, -1
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
@@ -181,11 +185,10 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:    IR   %16 = mul i64 %15, 4
 ; CHECK-NEXT:    IR   %n.mod.vf = urem i64 %0, %16
 ; CHECK-NEXT:    IR   %n.vec = sub i64 %0, %n.mod.vf
-; CHECK-NEXT:    IR   %ind.end = sub i64 %0, %n.vec
-; CHECK-NEXT:    IR   %.cast = trunc i64 %n.vec to i32
-; CHECK-NEXT:    IR   %ind.end3 = sub i32 %n, %.cast
 ; CHECK-NEXT:    IR   %17 = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    IR   %18 = mul i64 %17, 4
+; CHECK-NEXT:    vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + ir<[[VEC_TC]]> * ir<-1>
+; CHECK-NEXT:    vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + ir<[[VEC_TC]]> * ir<-1>
 ; CHECK-NEXT:  Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT:   vector loop: {
@@ -217,8 +220,8 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb:
-; CHECK-NEXT:    EMIT vp<[[RESUME_1:%.+]]> = resume-phi ir<%ind.end>, ir<%0>
-; CHECK-NEXT:    EMIT vp<[[RESUME_2:%.+]]>.1 = resume-phi ir<%ind.end3>, ir<%n>
+; CHECK-NEXT:    EMIT vp<[[RESUME_1:%.+]]> = resume-phi vp<[[END1]]>, ir<%0>
+; CHECK-NEXT:    EMIT vp<[[RESUME_2:%.+]]>.1 = resume-phi vp<[[END2]]>, ir<%n>
 ; CHECK-NEXT:  Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb:
@@ -311,6 +314,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  vector.ph:
+; CHECK-NEXT:    vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + vp<[[VEC_TC]]> * ir<-1>
+; CHECK-NEXT:    vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + vp<[[VEC_TC]]> * ir<-1>
 ; CHECK-NEXT:  Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT:   vector loop: {
@@ -339,11 +344,13 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  Successor(s): ir-bb, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  scalar.ph:
+; CHECK-NEXT:    EMIT vp<[[RESUME1:%.+]]> = resume-phi vp<[[END1]]>, ir<%0>
+; CHECK-NEXT:    EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi vp<[[END2]]>, ir<%n>
 ; CHECK-NEXT:  Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb:
-; CHECK-NEXT:    IR   %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-; CHECK-NEXT:    IR   %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
+; CHECK-NEXT:    IR   %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME1]]> from scalar.ph)
+; CHECK-NEXT:    IR   %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] (extra operand: vp<[[RESUME2]]>.1 from scalar.ph)
 ; CHECK:         IR   %indvars.iv.next = add nsw i64 %indvars.iv, -1
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
@@ -428,11 +435,10 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:    IR   %16 = mul i64 %15, 4
 ; CHECK-NEXT:    IR   %n.mod.vf = urem i64 %0, %16
 ; CHECK-NEXT:    IR   %n.vec = sub i64 %0, %n.mod.vf
-; CHECK-NEXT:    IR   %ind.end = sub i64 %0, %n.vec
-; CHECK-NEXT:    IR   %.cast = trunc i64 %n.vec to i32
-; CHECK-NEXT:    IR   %ind.end3 = sub i32 %n, %.cast
 ; CHECK-NEXT:    IR   %17 = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    IR   %18 = mul i64 %17, 4
+; CHECK-NEXT:    vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + ir<[[VEC_TC]]> * ir<-1>
+; CHECK-NEXT:    vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + ir<[[VEC_TC]]> * ir<-1>
 ; CHECK-NEXT:  Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT:   vector loop: {
@@ -464,8 +470,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb:
-; CHECK-NEXT:    EMIT vp<[[RESUME1:%.+]]> = resume-phi ir<%ind.end>, ir<%0>
-; CHECK-NEXT:    EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi ir<%ind.end3>, ir<%n>
+; CHECK-NEXT:    EMIT vp<[[RESUME1:%.+]]> = resume-phi vp<[[END1]]>, ir<%0>
+; CHECK-NEXT:    EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi vp<[[END2]]>, ir<%n>
 ; CHECK-NEXT:  Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
index 8395ffd58db90..30cb33e64eccf 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
@@ -86,9 +86,9 @@ define void @single_constant_stride_int_iv(ptr %p) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 64
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 64
 ; CHECK-NEXT:    [[TMP6:%.*]] = call  @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul  [[TMP6]], splat (i64 64)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add  zeroinitializer, [[TMP8]]
@@ -162,10 +162,10 @@ define void @single_constant_stride_ptr_iv(ptr %p) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[TMP5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[N_VEC]], 8
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[N_VEC]], 8
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[TMP18]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[P]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
@@ -328,7 +328,6 @@ define void @single_stride_int_iv(ptr %p, i64 %stride) {
 ; NOSTRIDED-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
 ; NOSTRIDED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
 ; NOSTRIDED-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; NOSTRIDED-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], [[STRIDE]]
 ; NOSTRIDED-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; NOSTRIDED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
 ; NOSTRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -348,7 +347,7 @@ define void @single_stride_int_iv(ptr %p, i64 %stride) {
 ; NOSTRIDED-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; NOSTRIDED:       scalar.ph:
 ; NOSTRIDED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
-; NOSTRIDED-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
+; NOSTRIDED-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; NOSTRIDED-NEXT:    br label [[LOOP:%.*]]
 ; NOSTRIDED:       loop:
 ; NOSTRIDED-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -606,7 +605,6 @@ define void @double_stride_int_iv(ptr %p, ptr %p2, i64 %stride) {
 ; NOSTRIDED-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
 ; NOSTRIDED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
 ; NOSTRIDED-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; NOSTRIDED-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], [[STRIDE]]
 ; NOSTRIDED-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; NOSTRIDED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
 ; NOSTRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -626,7 +624,7 @@ define void @double_stride_int_iv(ptr %p, ptr %p2, i64 %stride) {
 ; NOSTRIDED-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; NOSTRIDED:       scalar.ph:
 ; NOSTRIDED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
-; NOSTRIDED-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
+; NOSTRIDED-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; NOSTRIDED-NEXT:    br label [[LOOP:%.*]]
 ; NOSTRIDED:       loop:
 ; NOSTRIDED-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -731,12 +729,12 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) {
 ; STRIDED-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
 ; STRIDED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP9]]
 ; STRIDED-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; STRIDED-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; STRIDED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
 ; STRIDED-NEXT:    [[TMP10:%.*]] = mul i64 [[N_VEC]], [[STRIDE]]
 ; STRIDED-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP10]]
 ; STRIDED-NEXT:    [[TMP11:%.*]] = mul i64 [[N_VEC]], [[STRIDE]]
 ; STRIDED-NEXT:    [[IND_END7:%.*]] = getelementptr i8, ptr [[P2]], i64 [[TMP11]]
-; STRIDED-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; STRIDED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
 ; STRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; STRIDED:       vector.body:
 ; STRIDED-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[P]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
index 209c251f6a6a4..a330b6964a660 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
@@ -20,10 +20,10 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; IF-EVL-NEXT:    [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]]
-; IF-EVL-NEXT:    [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; IF-EVL-NEXT:    [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]]
+; IF-EVL-NEXT:    [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -121,10 +121,10 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; IF-EVL-NEXT:    [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]]
-; IF-EVL-NEXT:    [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; IF-EVL-NEXT:    [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]]
+; IF-EVL-NEXT:    [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
index 7efa65ed270ef..a2f85b9ed4ffe 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
@@ -25,9 +25,9 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) {
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP7]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[SPEC_SELECT]], [[N_VEC]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[SPEC_SELECT]], [[N_VEC]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
index e7eb5778ffb93..cd1d734f00eaa 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
@@ -63,11 +63,12 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-OUTLOOP-NEXT: Successor(s): ir-bb, scalar.ph
 ; IF-EVL-OUTLOOP-EMPTY:
 ; IF-EVL-OUTLOOP-NEXT: scalar.ph:
+; IF-EVL-OUTLOOP-NEXT:   EMIT vp<[[IV_RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; IF-EVL-OUTLOOP-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start>
 ; IF-EVL-OUTLOOP-NEXT: Successor(s): ir-bb
 ; IF-EVL-OUTLOOP-EMPTY:
 ; IF-EVL-OUTLOOP-NEXT: ir-bb:
-; IF-EVL-OUTLOOP-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; IF-EVL-OUTLOOP-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] (extra operand: vp<[[IV_RESUME]]> from scalar.ph)
 ; IF-EVL-OUTLOOP-NEXT:   IR   %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ]
 ; IF-EVL-OUTLOOP:        IR   %exitcond.not = icmp eq i64 %iv.next, %n
 ; IF-EVL-OUTLOOP-NEXT: No successors
@@ -113,11 +114,12 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-INLOOP-NEXT: Successor(s): ir-bb, scalar.ph
 ; IF-EVL-INLOOP-EMPTY:
 ; IF-EVL-INLOOP-NEXT: scalar.ph:
+; IF-EVL-INLOOP-NEXT:   EMIT vp<[[IV_RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; IF-EVL-INLOOP-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start>
 ; IF-EVL-INLOOP-NEXT: Successor(s): ir-bb
 ; IF-EVL-INLOOP-EMPTY:
 ; IF-EVL-INLOOP-NEXT: ir-bb:
-; IF-EVL-INLOOP-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; IF-EVL-INLOOP-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] (extra operand: vp<[[IV_RESUME]]> from scalar.ph)
 ; IF-EVL-INLOOP-NEXT:   IR   %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ]
 ; IF-EVL-INLOOP:        IR   %exitcond.not = icmp eq i64 %iv.next, %n
 ; IF-EVL-INLOOP-NEXT: No successors
@@ -159,11 +161,12 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-OUTLOOP-NEXT: Successor(s): ir-bb, scalar.ph
 ; NO-VP-OUTLOOP-EMPTY:
 ; NO-VP-OUTLOOP-NEXT: scalar.ph:
+; NO-VP-OUTLOOP-NEXT:   EMIT vp<[[IV_RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; NO-VP-OUTLOOP-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start>
 ; NO-VP-OUTLOOP-NEXT: Successor(s): ir-bb
 ; NO-VP-OUTLOOP-EMPTY:
 ; NO-VP-OUTLOOP-NEXT: ir-bb:
-; NO-VP-OUTLOOP-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; NO-VP-OUTLOOP-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] (extra operand: vp<[[IV_RESUME]]> from scalar.ph)
 ; NO-VP-OUTLOOP-NEXT:   IR   %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ]
 ; NO-VP-OUTLOOP:        IR   %exitcond.not = icmp eq i64 %iv.next, %n
 ; NO-VP-OUTLOOP-NEXT: No successors
@@ -205,11 +208,12 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-INLOOP-NEXT: Successor(s): ir-bb, scalar.ph
 ; NO-VP-INLOOP-EMPTY:
 ; NO-VP-INLOOP-NEXT: scalar.ph:
+; NO-VP-INLOOP-NEXT:   EMIT vp<[[IV_RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; NO-VP-INLOOP-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start>
 ; NO-VP-INLOOP-NEXT: Successor(s): ir-bb
 ; NO-VP-INLOOP-EMPTY:
 ; NO-VP-INLOOP-NEXT: ir-bb:
-; NO-VP-INLOOP-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; NO-VP-INLOOP-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] (extra operand: vp<[[IV_RESUME]]> from scalar.ph)
 ; NO-VP-INLOOP-NEXT:   IR   %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ]
 ; NO-VP-INLOOP:        IR   %exitcond.not = icmp eq i64 %iv.next, %n
 ; NO-VP-INLOOP-NEXT: No successors
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll
index d0754f1c2bb55..7b0fa644ea001 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll
@@ -66,8 +66,8 @@ define void @func_21() {
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[TMP12]], i32 1
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 6, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 6, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LV:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
index f3190369ae2a2..15bdbea612a70 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
@@ -42,8 +42,8 @@ define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 3, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 3, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[TMP3]], 4
 ; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF2]]
 ; CHECK-NEXT:    [[IND_END4:%.*]] = add i64 3, [[N_VEC3]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
index 6a12be7da192e..5c0aeb526e50c 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -495,8 +495,8 @@ define i1 @any_of_cost(ptr %start, ptr %end) #0 {
 ; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP30]], i1 false, i1 false
 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i1 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ false, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i1 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ false, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[ANY_OF:%.*]] = phi i1 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ANY_OF_NEXT:%.*]], [[LOOP]] ]
@@ -986,8 +986,8 @@ define void @reduction_store(ptr noalias %src, ptr %dst, i1 %x) #2 {
 ; CHECK-NEXT:    store i32 [[TMP10]], ptr [[DST:%.*]], align 4
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 24, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 24, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll
index 5fb7df2c74d93..c14ddca6c913d 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll
@@ -60,8 +60,8 @@ define void @test_pr59459(i64 %iv.start, ptr %arr) {
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[IV_START]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[IV_START]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF3:%.*]] = urem i64 [[TMP3]], 4
 ; CHECK-NEXT:    [[N_VEC4:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF3]]
 ; CHECK-NEXT:    [[IND_END5:%.*]] = add i64 [[IV_START]], [[N_VEC4]]
@@ -171,11 +171,11 @@ define void @test_induction_step_needs_expansion(ptr noalias %j, ptr %k, i64 %l,
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[K:%.*]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 16
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 32
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 32
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 48
 ; CHECK-NEXT:    store <16 x i16> [[TMP4]], ptr [[TMP9]], align 2
 ; CHECK-NEXT:    store <16 x i16> [[TMP5]], ptr [[TMP10]], align 2
-; CHECK-NEXT:    store <16 x i16> [[TMP6]], ptr [[TMP11]], align 2
+; CHECK-NEXT:    store <16 x i16> [[TMP6]], ptr [[TMP21]], align 2
 ; CHECK-NEXT:    store <16 x i16> [[TMP7]], ptr [[TMP12]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <16 x i16> [[STEP_ADD_3]], [[TMP1]]
@@ -191,8 +191,8 @@ define void @test_induction_step_needs_expansion(ptr noalias %j, ptr %k, i64 %l,
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF4:%.*]] = urem i64 [[L]], 8
 ; CHECK-NEXT:    [[N_VEC5:%.*]] = sub i64 [[L]], [[N_MOD_VF4]]
 ; CHECK-NEXT:    [[DOTCAST7:%.*]] = trunc i64 [[N_VEC5]] to i16
diff --git a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
index 0e511cfc9bffe..6fc70802e82cb 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
@@ -48,8 +48,8 @@ define void @firstorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
@@ -154,10 +154,10 @@ define void @thirdorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE45]], [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT8:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT6]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE44]], [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT9:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT7]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
index ab0b45473a623..fc6059d036cd0 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
@@ -61,8 +61,8 @@ define void @fp_iv_loop1(ptr noalias nocapture %A, i32 %N) #0 {
 ; AUTO_VEC-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
 ; AUTO_VEC-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[FOR_BODY]], label [[VEC_EPILOG_PH]]
 ; AUTO_VEC:       vec.epilog.ph:
-; AUTO_VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi float [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 1.000000e+00, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; AUTO_VEC-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; AUTO_VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi float [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 1.000000e+00, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; AUTO_VEC-NEXT:    [[N_VEC3:%.*]] = and i64 [[ZEXT]], 2147483644
 ; AUTO_VEC-NEXT:    [[DOTCAST5:%.*]] = uitofp nneg i64 [[N_VEC3]] to float
 ; AUTO_VEC-NEXT:    [[TMP7:%.*]] = fmul fast float [[DOTCAST5]], 5.000000e-01
@@ -441,8 +441,8 @@ define void @fadd_reassoc_FMF(ptr nocapture %p, i32 %N) {
 ; AUTO_VEC-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
 ; AUTO_VEC-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[FOR_BODY]], label [[VEC_EPILOG_PH]]
 ; AUTO_VEC:       vec.epilog.ph:
-; AUTO_VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi float [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 1.000000e+00, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; AUTO_VEC-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; AUTO_VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi float [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 1.000000e+00, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; AUTO_VEC-NEXT:    [[N_VEC6:%.*]] = and i64 [[TMP0]], 4294967292
 ; AUTO_VEC-NEXT:    [[DOTCAST8:%.*]] = uitofp nneg i64 [[N_VEC6]] to float
 ; AUTO_VEC-NEXT:    [[TMP12:%.*]] = fmul reassoc float [[DOTCAST8]], 4.200000e+01
diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
index b6bccab5c2e4a..8c338d6a746c4 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
@@ -665,16 +665,16 @@ define void @test_gather_not_profitable_pr48429(i32 %d, ptr readonly noalias %pt
 ; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
 ; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; AVX512:       vec.epilog.iter.check:
-; AVX512-NEXT:    [[TMP22:%.*]] = mul i64 [[N_VEC]], 64
-; AVX512-NEXT:    [[IND_END15:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP22]]
 ; AVX512-NEXT:    [[TMP23:%.*]] = mul i64 [[N_VEC]], 4
 ; AVX512-NEXT:    [[IND_END12:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP23]]
+; AVX512-NEXT:    [[TMP38:%.*]] = mul i64 [[N_VEC]], 64
+; AVX512-NEXT:    [[IND_END15:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP38]]
 ; AVX512-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]]
 ; AVX512-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
 ; AVX512-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; AVX512:       vec.epilog.ph:
-; AVX512-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[DEST]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; AVX512-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; AVX512-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[DEST]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; AVX512-NEXT:    [[N_MOD_VF9:%.*]] = urem i64 [[TMP3]], 8
 ; AVX512-NEXT:    [[N_VEC10:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF9]]
 ; AVX512-NEXT:    [[TMP24:%.*]] = mul i64 [[N_VEC10]], 4
@@ -691,12 +691,12 @@ define void @test_gather_not_profitable_pr48429(i32 %d, ptr readonly noalias %pt
 ; AVX512-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP27]]
 ; AVX512-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i64 [[IDXPROM]]
 ; AVX512-NEXT:    [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP29]], i32 0
-; AVX512-NEXT:    [[WIDE_LOAD23:%.*]] = load <8 x float>, ptr [[TMP30]], align 4, !alias.scope [[META17:![0-9]+]]
-; AVX512-NEXT:    call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD23]], <8 x ptr> [[TMP26]], i32 4, <8 x i1> splat (i1 true)), !alias.scope [[META20:![0-9]+]], !noalias [[META22:![0-9]+]]
+; AVX512-NEXT:    [[WIDE_LOAD17:%.*]] = load <8 x float>, ptr [[TMP30]], align 4, !alias.scope [[META17:![0-9]+]]
+; AVX512-NEXT:    call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD17]], <8 x ptr> [[TMP26]], i32 4, <8 x i1> splat (i1 true)), !alias.scope [[META20:![0-9]+]], !noalias [[META22:![0-9]+]]
 ; AVX512-NEXT:    [[TMP31:%.*]] = getelementptr float, ptr [[TMP28]], i32 0
-; AVX512-NEXT:    [[WIDE_LOAD24:%.*]] = load <8 x float>, ptr [[TMP31]], align 4, !alias.scope [[META24:![0-9]+]]
+; AVX512-NEXT:    [[WIDE_LOAD18:%.*]] = load <8 x float>, ptr [[TMP31]], align 4, !alias.scope [[META24:![0-9]+]]
 ; AVX512-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, <8 x ptr> [[TMP26]], i64 1
-; AVX512-NEXT:    call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD24]], <8 x ptr> [[TMP32]], i32 4, <8 x i1> splat (i1 true)), !alias.scope [[META20]], !noalias [[META22]]
+; AVX512-NEXT:    call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD18]], <8 x ptr> [[TMP32]], i32 4, <8 x i1> splat (i1 true)), !alias.scope [[META20]], !noalias [[META22]]
 ; AVX512-NEXT:    [[INDEX_NEXT24]] = add nuw i64 [[INDEX18]], 8
 ; AVX512-NEXT:    [[PTR_IND20]] = getelementptr i8, ptr [[POINTER_PHI19]], i64 512
 ; AVX512-NEXT:    [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT24]], [[N_VEC10]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
index 3b550449006f3..68cbfad91c541 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
@@ -348,10 +348,10 @@ define void @multiple_pointer_ivs_with_scalar_uses_only(ptr %A, ptr %B) #0 {
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ -12, [[MIDDLE_BLOCK]] ], [ 100, [[VECTOR_MEMCHECK]] ], [ 100, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 2048, [[VECTOR_MEMCHECK]] ], [ 2048, [[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[A]], [[VECTOR_MEMCHECK]] ], [ [[A]], [[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL4:%.*]] = phi ptr [ [[IND_END3]], [[MIDDLE_BLOCK]] ], [ [[B]], [[VECTOR_MEMCHECK]] ], [ [[B]], [[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL6:%.*]] = phi ptr [ [[IND_END5]], [[MIDDLE_BLOCK]] ], [ [[B]], [[VECTOR_MEMCHECK]] ], [ [[B]], [[ENTRY]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 2048, [[VECTOR_MEMCHECK]] ], [ 2048, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV_1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[DEC:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
index b2772648b5ee1..f50177e61ef08 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
@@ -56,12 +56,12 @@ define void @uaddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr noca
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
+; CHECK-NEXT:    [[DOTCAST1:%.*]] = trunc nuw i64 [[N_VEC]] to i32
+; CHECK-NEXT:    [[IND_END10:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST1]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = shl nuw nsw i64 [[N_VEC]], 1
-; CHECK-NEXT:    [[IND_END15:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP12]]
+; CHECK-NEXT:    [[IND_END12:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP12]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = shl nuw nsw i64 [[N_VEC]], 1
-; CHECK-NEXT:    [[IND_END12:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP13]]
-; CHECK-NEXT:    [[DOTCAST9:%.*]] = trunc nuw i64 [[N_VEC]] to i32
-; CHECK-NEXT:    [[IND_END10:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST9]]
+; CHECK-NEXT:    [[IND_END15:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP13]]
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 56
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
@@ -181,10 +181,10 @@ define void @fshl(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocaptur
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
-; CHECK-NEXT:    [[IND_END14:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC]]
-; CHECK-NEXT:    [[IND_END11:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[DOTCAST8:%.*]] = trunc nuw i64 [[N_VEC]] to i32
 ; CHECK-NEXT:    [[IND_END9:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST8]]
+; CHECK-NEXT:    [[IND_END11:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[IND_END14:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 120
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
index 597be33ca63f6..9e87cc29be4e8 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
@@ -368,9 +368,9 @@ define void @tail_folded_store_avx512(ptr %start, ptr %end) #3 {
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP3]], 63
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 64
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP3]], 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul i32 [[N_VEC]], -72
 ; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i32 [[TMP4]]
-; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP3]], 1
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <64 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT3]], <64 x i32> poison, <64 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -414,9 +414,9 @@ define void @tail_folded_store_avx512(ptr %start, ptr %end) #3 {
 ; AUTOVF-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP3]], 7
 ; AUTOVF-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 8
 ; AUTOVF-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; AUTOVF-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP3]], 1
 ; AUTOVF-NEXT:    [[TMP4:%.*]] = mul i32 [[N_VEC]], -72
 ; AUTOVF-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i32 [[TMP4]]
-; AUTOVF-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP3]], 1
 ; AUTOVF-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <8 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
 ; AUTOVF-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT3]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; AUTOVF-NEXT:    br label [[VECTOR_BODY:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr109581-unused-blend.ll b/llvm/test/Transforms/LoopVectorize/X86/pr109581-unused-blend.ll
index bb7fe4d4f1e56..270e6bcd9ab1f 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr109581-unused-blend.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr109581-unused-blend.ll
@@ -91,8 +91,8 @@ define i32 @unused_blend_after_unrolling(ptr %p, i32 %a, i1 %c.1, i16 %x, i16 %y
 ; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <4 x i16> [[PREDPHI]], i32 3
 ; CHECK-NEXT:    br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 97, %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 97, %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
 ; CHECK:       [[LOOP_HEADER]]:
 ; CHECK-NEXT:    [[B:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP_LATCH:.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll b/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll
index 41868d62a35a5..d1c0201ccb9a4 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll
@@ -85,9 +85,9 @@ define void @test(ptr %p) {
 ; VEC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
 ; VEC-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VEC:       scalar.ph:
-; VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[VECTOR_SCEVCHECK]] ], [ 1, [[ENTRY:%.*]] ]
+; VEC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 1, [[VECTOR_SCEVCHECK]] ], [ 1, [[ENTRY:%.*]] ]
+; VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[VECTOR_SCEVCHECK]] ], [ 1, [[ENTRY]] ]
 ; VEC-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
-; VEC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 1, [[VECTOR_SCEVCHECK]] ], [ 1, [[ENTRY]] ]
 ; VEC-NEXT:    br label [[FOR_BODY:%.*]]
 ; VEC:       for.body:
 ; VEC-NEXT:    [[SCALAR_RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IDX:%.*]], [[FOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
index 236ed30be4f13..c14c34cade6b6 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
@@ -65,16 +65,16 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP6]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT99:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
-; CHECK-NEXT:    [[IND_END12:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-NEXT:    [[TMP64:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-NEXT:    [[IND_END9:%.*]] = add i64 8, [[TMP64]]
+; CHECK-NEXT:    [[IND_END12:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP6]], [[N_VEC]]
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
+; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 8, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL5:%.*]] = phi i64 [ [[IND_END4]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF6:%.*]] = urem i64 [[TMP6]], 8
 ; CHECK-NEXT:    [[N_VEC7:%.*]] = sub i64 [[TMP6]], [[N_MOD_VF6]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = mul i64 [[N_VEC7]], 2
@@ -112,16 +112,16 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[BC_RESUME_VAL10:%.*]] = phi i64 [ [[IND_END8]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 8, [[ITER_CHECK]] ], [ [[IND_END9]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL13:%.*]] = phi i64 [ [[IND_END11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[IND_END12]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       iter.check27:
+; CHECK:       iter.check23:
 ; CHECK-NEXT:    [[TMP26:%.*]] = add nsw i64 [[TMP3]], -9
 ; CHECK-NEXT:    [[TMP27:%.*]] = lshr i64 [[TMP26]], 1
 ; CHECK-NEXT:    [[TMP28:%.*]] = add nuw i64 [[TMP27]], 1
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK25:%.*]] = icmp ult i64 [[TMP28]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK25]], label [[VEC_EPILOG_SCALAR_PH46:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK29:%.*]]
-; CHECK:       vector.main.loop.iter.check29:
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK25]], label [[VEC_EPILOG_SCALAR_PH41:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK25:%.*]]
+; CHECK:       vector.main.loop.iter.check25:
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK28:%.*]] = icmp ult i64 [[TMP28]], 16
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK28]], label [[VEC_EPILOG_PH47:%.*]], label [[VECTOR_PH30:%.*]]
-; CHECK:       vector.ph30:
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK28]], label [[VEC_EPILOG_PH42:%.*]], label [[VECTOR_PH30:%.*]]
+; CHECK:       vector.ph26:
 ; CHECK-NEXT:    [[N_MOD_VF31:%.*]] = urem i64 [[TMP28]], 16
 ; CHECK-NEXT:    [[N_VEC32:%.*]] = sub i64 [[TMP28]], [[N_MOD_VF31]]
 ; CHECK-NEXT:    [[TMP29:%.*]] = mul i64 [[N_VEC32]], 2
@@ -129,11 +129,11 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[IND_END43:%.*]] = mul i64 [[N_VEC32]], 2
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i1> poison, i1 [[TOBOOL6]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i1> [[BROADCAST_SPLATINSERT]], <16 x i1> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VECTOR_BODY35:%.*]]
-; CHECK:       vector.body35:
-; CHECK-NEXT:    [[INDEX34:%.*]] = phi i64 [ 0, [[VECTOR_PH30]] ], [ [[INDEX_NEXT39:%.*]], [[VECTOR_BODY35]] ]
-; CHECK-NEXT:    [[VEC_IND35:%.*]] = phi <16 x i64> [ , [[VECTOR_PH30]] ], [ [[VEC_IND_NEXT36:%.*]], [[VECTOR_BODY35]] ]
-; CHECK-NEXT:    [[VEC_IND37:%.*]] = phi <16 x i64> [ , [[VECTOR_PH30]] ], [ [[VEC_IND_NEXT38:%.*]], [[VECTOR_BODY35]] ]
+; CHECK-NEXT:    br label [[VECTOR_BODY29:%.*]]
+; CHECK:       vector.body29:
+; CHECK-NEXT:    [[INDEX34:%.*]] = phi i64 [ 0, [[VECTOR_PH30]] ], [ [[INDEX_NEXT39:%.*]], [[VECTOR_BODY29]] ]
+; CHECK-NEXT:    [[VEC_IND35:%.*]] = phi <16 x i64> [ , [[VECTOR_PH30]] ], [ [[VEC_IND_NEXT36:%.*]], [[VECTOR_BODY29]] ]
+; CHECK-NEXT:    [[VEC_IND37:%.*]] = phi <16 x i64> [ , [[VECTOR_PH30]] ], [ [[VEC_IND_NEXT38:%.*]], [[VECTOR_BODY29]] ]
 ; CHECK-NEXT:    [[TMP30:%.*]] = sub nsw <16 x i64> splat (i64 8), [[VEC_IND35]]
 ; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr @d, i64 0, <16 x i64> [[VEC_IND35]]
 ; CHECK-NEXT:    [[TMP32:%.*]] = add nsw <16 x i64> [[TMP30]], [[VEC_IND37]]
@@ -153,21 +153,21 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[VEC_IND_NEXT36]] = add <16 x i64> [[VEC_IND35]], splat (i64 32)
 ; CHECK-NEXT:    [[VEC_IND_NEXT38]] = add <16 x i64> [[VEC_IND37]], splat (i64 32)
 ; CHECK-NEXT:    [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT39]], [[N_VEC32]]
-; CHECK-NEXT:    br i1 [[TMP41]], label [[MIDDLE_BLOCK24:%.*]], label [[VECTOR_BODY35]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:       middle.block24:
+; CHECK-NEXT:    br i1 [[TMP41]], label [[MIDDLE_BLOCK20:%.*]], label [[VECTOR_BODY29]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.block20:
 ; CHECK-NEXT:    [[CMP_N40:%.*]] = icmp eq i64 [[TMP28]], [[N_VEC32]]
-; CHECK-NEXT:    br i1 [[CMP_N40]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK48:%.*]]
-; CHECK:       vec.epilog.iter.check49:
-; CHECK-NEXT:    [[IND_END58:%.*]] = mul i64 [[N_VEC32]], 2
+; CHECK-NEXT:    br i1 [[CMP_N40]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK43:%.*]]
+; CHECK:       vec.epilog.iter.check43:
 ; CHECK-NEXT:    [[TMP42:%.*]] = mul i64 [[N_VEC32]], 2
 ; CHECK-NEXT:    [[IND_END55:%.*]] = add i64 8, [[TMP42]]
+; CHECK-NEXT:    [[IND_END58:%.*]] = mul i64 [[N_VEC32]], 2
 ; CHECK-NEXT:    [[N_VEC_REMAINING49:%.*]] = sub i64 [[TMP28]], [[N_VEC32]]
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK50:%.*]] = icmp ult i64 [[N_VEC_REMAINING49]], 8
-; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK50]], label [[VEC_EPILOG_SCALAR_PH46]], label [[VEC_EPILOG_PH47]]
-; CHECK:       vec.epilog.ph48:
-; CHECK-NEXT:    [[BC_RESUME_VAL42:%.*]] = phi i64 [ [[IND_END41]], [[VEC_EPILOG_ITER_CHECK48]] ], [ 8, [[VECTOR_MAIN_LOOP_ITER_CHECK29]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL44:%.*]] = phi i64 [ [[IND_END43]], [[VEC_EPILOG_ITER_CHECK48]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK29]] ]
-; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL51:%.*]] = phi i64 [ [[N_VEC32]], [[VEC_EPILOG_ITER_CHECK48]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK29]] ]
+; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK50]], label [[VEC_EPILOG_SCALAR_PH41]], label [[VEC_EPILOG_PH42]]
+; CHECK:       vec.epilog.ph42:
+; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL51:%.*]] = phi i64 [ [[N_VEC32]], [[VEC_EPILOG_ITER_CHECK43]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK25]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL42:%.*]] = phi i64 [ [[IND_END41]], [[VEC_EPILOG_ITER_CHECK43]] ], [ 8, [[VECTOR_MAIN_LOOP_ITER_CHECK25]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL44:%.*]] = phi i64 [ [[IND_END43]], [[VEC_EPILOG_ITER_CHECK43]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK25]] ]
 ; CHECK-NEXT:    [[N_MOD_VF52:%.*]] = urem i64 [[TMP28]], 8
 ; CHECK-NEXT:    [[N_VEC53:%.*]] = sub i64 [[TMP28]], [[N_MOD_VF52]]
 ; CHECK-NEXT:    [[TMP43:%.*]] = mul i64 [[N_VEC53]], 2
@@ -181,11 +181,11 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[INDUCTION69:%.*]] = add <8 x i64> [[DOTSPLAT68]], 
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT72:%.*]] = insertelement <8 x i1> poison, i1 [[TOBOOL6]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT73:%.*]] = shufflevector <8 x i1> [[BROADCAST_SPLATINSERT72]], <8 x i1> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY58:%.*]]
-; CHECK:       vec.epilog.vector.body58:
-; CHECK-NEXT:    [[INDEX61:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL51]], [[VEC_EPILOG_PH47]] ], [ [[INDEX_NEXT74:%.*]], [[VEC_EPILOG_VECTOR_BODY58]] ]
-; CHECK-NEXT:    [[VEC_IND65:%.*]] = phi <8 x i64> [ [[INDUCTION64]], [[VEC_EPILOG_PH47]] ], [ [[VEC_IND_NEXT66:%.*]], [[VEC_EPILOG_VECTOR_BODY58]] ]
-; CHECK-NEXT:    [[VEC_IND70:%.*]] = phi <8 x i64> [ [[INDUCTION69]], [[VEC_EPILOG_PH47]] ], [ [[VEC_IND_NEXT71:%.*]], [[VEC_EPILOG_VECTOR_BODY58]] ]
+; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY50:%.*]]
+; CHECK:       vec.epilog.vector.body50:
+; CHECK-NEXT:    [[INDEX61:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL51]], [[VEC_EPILOG_PH42]] ], [ [[INDEX_NEXT74:%.*]], [[VEC_EPILOG_VECTOR_BODY50]] ]
+; CHECK-NEXT:    [[VEC_IND65:%.*]] = phi <8 x i64> [ [[INDUCTION64]], [[VEC_EPILOG_PH42]] ], [ [[VEC_IND_NEXT66:%.*]], [[VEC_EPILOG_VECTOR_BODY50]] ]
+; CHECK-NEXT:    [[VEC_IND70:%.*]] = phi <8 x i64> [ [[INDUCTION69]], [[VEC_EPILOG_PH42]] ], [ [[VEC_IND_NEXT71:%.*]], [[VEC_EPILOG_VECTOR_BODY50]] ]
 ; CHECK-NEXT:    [[TMP44:%.*]] = sub nsw <8 x i64> splat (i64 8), [[VEC_IND65]]
 ; CHECK-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr @d, i64 0, <8 x i64> [[VEC_IND65]]
 ; CHECK-NEXT:    [[TMP46:%.*]] = add nsw <8 x i64> [[TMP44]], [[VEC_IND70]]
@@ -205,17 +205,17 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[VEC_IND_NEXT66]] = add <8 x i64> [[VEC_IND65]], splat (i64 16)
 ; CHECK-NEXT:    [[VEC_IND_NEXT71]] = add <8 x i64> [[VEC_IND70]], splat (i64 16)
 ; CHECK-NEXT:    [[TMP55:%.*]] = icmp eq i64 [[INDEX_NEXT74]], [[N_VEC53]]
-; CHECK-NEXT:    br i1 [[TMP55]], label [[VEC_EPILOG_MIDDLE_BLOCK46:%.*]], label [[VEC_EPILOG_VECTOR_BODY58]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       vec.epilog.middle.block46:
-; CHECK-NEXT:    [[CMP_N75:%.*]] = icmp eq i64 [[TMP28]], [[N_VEC53]]
-; CHECK-NEXT:    br i1 [[CMP_N75]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH46]]
-; CHECK:       vec.epilog.scalar.ph47:
-; CHECK-NEXT:    [[BC_RESUME_VAL56:%.*]] = phi i64 [ [[IND_END54]], [[VEC_EPILOG_MIDDLE_BLOCK46]] ], [ 8, [[ITER_CHECK27]] ], [ [[IND_END55]], [[VEC_EPILOG_ITER_CHECK48]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL59:%.*]] = phi i64 [ [[IND_END57]], [[VEC_EPILOG_MIDDLE_BLOCK46]] ], [ 0, [[ITER_CHECK27]] ], [ [[IND_END58]], [[VEC_EPILOG_ITER_CHECK48]] ]
+; CHECK-NEXT:    br i1 [[TMP55]], label [[VEC_EPILOG_MIDDLE_BLOCK40:%.*]], label [[VEC_EPILOG_VECTOR_BODY50]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       vec.epilog.middle.block40:
+; CHECK-NEXT:    [[CMP_N65:%.*]] = icmp eq i64 [[TMP28]], [[N_VEC53]]
+; CHECK-NEXT:    br i1 [[CMP_N65]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH41]]
+; CHECK:       vec.epilog.scalar.ph41:
+; CHECK-NEXT:    [[BC_RESUME_VAL56:%.*]] = phi i64 [ [[IND_END54]], [[VEC_EPILOG_MIDDLE_BLOCK40]] ], [ 8, [[ITER_CHECK27]] ], [ [[IND_END55]], [[VEC_EPILOG_ITER_CHECK43]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL67:%.*]] = phi i64 [ [[IND_END57]], [[VEC_EPILOG_MIDDLE_BLOCK40]] ], [ 0, [[ITER_CHECK27]] ], [ [[IND_END58]], [[VEC_EPILOG_ITER_CHECK43]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY_US:%.*]]
 ; CHECK:       for.body.us:
-; CHECK-NEXT:    [[INDVARS_IV78:%.*]] = phi i64 [ [[INDVARS_IV_NEXT79:%.*]], [[FOR_COND_CLEANUP4_US_LCSSA_US_US:%.*]] ], [ [[BC_RESUME_VAL56]], [[VEC_EPILOG_SCALAR_PH46]] ]
-; CHECK-NEXT:    [[INDVARS_IV70:%.*]] = phi i64 [ [[INDVARS_IV_NEXT71:%.*]], [[FOR_COND_CLEANUP4_US_LCSSA_US_US]] ], [ [[BC_RESUME_VAL59]], [[VEC_EPILOG_SCALAR_PH46]] ]
+; CHECK-NEXT:    [[INDVARS_IV78:%.*]] = phi i64 [ [[INDVARS_IV_NEXT79:%.*]], [[FOR_COND_CLEANUP4_US_LCSSA_US_US:%.*]] ], [ [[BC_RESUME_VAL56]], [[VEC_EPILOG_SCALAR_PH41]] ]
+; CHECK-NEXT:    [[INDVARS_IV70:%.*]] = phi i64 [ [[INDVARS_IV_NEXT71:%.*]], [[FOR_COND_CLEANUP4_US_LCSSA_US_US]] ], [ [[BC_RESUME_VAL67]], [[VEC_EPILOG_SCALAR_PH41]] ]
 ; CHECK-NEXT:    [[TMP56:%.*]] = sub nsw i64 8, [[INDVARS_IV78]]
 ; CHECK-NEXT:    [[ADD_PTR_US:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr @d, i64 0, i64 [[INDVARS_IV78]]
 ; CHECK-NEXT:    [[TMP57:%.*]] = add nsw i64 [[TMP56]], [[INDVARS_IV70]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
index 55ff26c55b512..c9132bab80f19 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
@@ -142,8 +142,8 @@ define void @example2(i32 %n, i32 %x) optsize {
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT19:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_114]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT20:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT19]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY15:%.*]]
-; CHECK:       vector.body15:
-; CHECK-NEXT:    [[INDEX16:%.*]] = phi i64 [ 0, [[VECTOR_PH9]] ], [ [[INDEX_NEXT29:%.*]], [[PRED_STORE_CONTINUE28:%.*]] ]
+; CHECK:       vector.body14:
+; CHECK-NEXT:    [[INDEX16:%.*]] = phi i64 [ 0, [[VECTOR_PH9]] ], [ [[INDEX_NEXT29:%.*]], [[PRED_STORE_CONTINUE78:%.*]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 [[I_0_LCSSA]], [[INDEX16]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX16]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT18:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT17]], <4 x i64> poison, <4 x i32> zeroinitializer
@@ -151,7 +151,7 @@ define void @example2(i32 %n, i32 %x) optsize {
 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT20]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <4 x i1> [[TMP18]], i64 0
 ; CHECK-NEXT:    br i1 [[TMP19]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
-; CHECK:       pred.store.if21:
+; CHECK:       pred.store.if20:
 ; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4
 ; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [2048 x i32], ptr @c, i64 0, i64 [[OFFSET_IDX]]
@@ -160,10 +160,10 @@ define void @example2(i32 %n, i32 %x) optsize {
 ; CHECK-NEXT:    [[TMP25:%.*]] = and i32 [[TMP23]], [[TMP21]]
 ; CHECK-NEXT:    store i32 [[TMP25]], ptr [[TMP24]], align 4
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE22]]
-; CHECK:       pred.store.continue22:
+; CHECK:       pred.store.continue21:
 ; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i1> [[TMP18]], i64 1
 ; CHECK-NEXT:    br i1 [[TMP26]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
-; CHECK:       pred.store.if23:
+; CHECK:       pred.store.if22:
 ; CHECK-NEXT:    [[TMP27:%.*]] = add i64 [[OFFSET_IDX]], 1
 ; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 [[TMP27]]
 ; CHECK-NEXT:    [[TMP29:%.*]] = load i32, ptr [[TMP28]], align 4
@@ -173,10 +173,10 @@ define void @example2(i32 %n, i32 %x) optsize {
 ; CHECK-NEXT:    [[TMP33:%.*]] = and i32 [[TMP31]], [[TMP29]]
 ; CHECK-NEXT:    store i32 [[TMP33]], ptr [[TMP32]], align 4
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE24]]
-; CHECK:       pred.store.continue24:
+; CHECK:       pred.store.continue23:
 ; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <4 x i1> [[TMP18]], i64 2
 ; CHECK-NEXT:    br i1 [[TMP34]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
-; CHECK:       pred.store.if25:
+; CHECK:       pred.store.if24:
 ; CHECK-NEXT:    [[TMP35:%.*]] = add i64 [[OFFSET_IDX]], 2
 ; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 [[TMP35]]
 ; CHECK-NEXT:    [[TMP37:%.*]] = load i32, ptr [[TMP36]], align 4
@@ -186,10 +186,10 @@ define void @example2(i32 %n, i32 %x) optsize {
 ; CHECK-NEXT:    [[TMP41:%.*]] = and i32 [[TMP39]], [[TMP37]]
 ; CHECK-NEXT:    store i32 [[TMP41]], ptr [[TMP40]], align 4
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE26]]
-; CHECK:       pred.store.continue26:
+; CHECK:       pred.store.continue25:
 ; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <4 x i1> [[TMP18]], i64 3
-; CHECK-NEXT:    br i1 [[TMP42]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28]]
-; CHECK:       pred.store.if27:
+; CHECK-NEXT:    br i1 [[TMP42]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE78]]
+; CHECK:       pred.store.if26:
 ; CHECK-NEXT:    [[TMP43:%.*]] = add i64 [[OFFSET_IDX]], 3
 ; CHECK-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 [[TMP43]]
 ; CHECK-NEXT:    [[TMP45:%.*]] = load i32, ptr [[TMP44]], align 4
@@ -198,8 +198,8 @@ define void @example2(i32 %n, i32 %x) optsize {
 ; CHECK-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 [[TMP43]]
 ; CHECK-NEXT:    [[TMP49:%.*]] = and i32 [[TMP47]], [[TMP45]]
 ; CHECK-NEXT:    store i32 [[TMP49]], ptr [[TMP48]], align 4
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE28]]
-; CHECK:       pred.store.continue28:
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE78]]
+; CHECK:       pred.store.continue27:
 ; CHECK-NEXT:    [[INDEX_NEXT29]] = add nuw i64 [[INDEX16]], 4
 ; CHECK-NEXT:    [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT29]], [[N_VEC12]]
 ; CHECK-NEXT:    br i1 [[TMP50]], label [[MIDDLE_BLOCK7:%.*]], label [[VECTOR_BODY15]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -273,7 +273,7 @@ define void @example3(i32 %n, ptr noalias nocapture %p, ptr noalias nocapture %q
 ; CHECK-NEXT:    [[BROADCAST_SPLAT12:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT11]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE18:%.*]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE16:%.*]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[OFFSET_IDX6:%.*]] = shl i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
@@ -290,38 +290,38 @@ define void @example3(i32 %n, ptr noalias nocapture %p, ptr noalias nocapture %q
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; CHECK:       pred.store.continue:
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP3]], i64 1
-; CHECK-NEXT:    br i1 [[TMP6]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]]
-; CHECK:       pred.store.if13:
+; CHECK-NEXT:    br i1 [[TMP6]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
+; CHECK:       pred.store.if11:
 ; CHECK-NEXT:    [[TMP7:%.*]] = or disjoint i64 [[OFFSET_IDX]], 4
 ; CHECK-NEXT:    [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = or disjoint i64 [[OFFSET_IDX6]], 4
 ; CHECK-NEXT:    [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP8]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[NEXT_GEP8]], align 16
 ; CHECK-NEXT:    store i32 [[TMP9]], ptr [[NEXT_GEP3]], align 16
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE14]]
-; CHECK:       pred.store.continue14:
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE12]]
+; CHECK:       pred.store.continue12:
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP3]], i64 2
-; CHECK-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16:%.*]]
-; CHECK:       pred.store.if15:
+; CHECK-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]]
+; CHECK:       pred.store.if13:
 ; CHECK-NEXT:    [[TMP11:%.*]] = or disjoint i64 [[OFFSET_IDX]], 8
 ; CHECK-NEXT:    [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP11]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = or disjoint i64 [[OFFSET_IDX6]], 8
 ; CHECK-NEXT:    [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP12]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[NEXT_GEP9]], align 16
 ; CHECK-NEXT:    store i32 [[TMP13]], ptr [[NEXT_GEP4]], align 16
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE16]]
-; CHECK:       pred.store.continue16:
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE14]]
+; CHECK:       pred.store.continue14:
 ; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i1> [[TMP3]], i64 3
-; CHECK-NEXT:    br i1 [[TMP14]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18]]
-; CHECK:       pred.store.if17:
+; CHECK-NEXT:    br i1 [[TMP14]], label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16]]
+; CHECK:       pred.store.if15:
 ; CHECK-NEXT:    [[TMP15:%.*]] = or disjoint i64 [[OFFSET_IDX]], 12
 ; CHECK-NEXT:    [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP15]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = or disjoint i64 [[OFFSET_IDX6]], 12
 ; CHECK-NEXT:    [[NEXT_GEP10:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP16]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr [[NEXT_GEP10]], align 16
 ; CHECK-NEXT:    store i32 [[TMP17]], ptr [[NEXT_GEP5]], align 16
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE18]]
-; CHECK:       pred.store.continue18:
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE16]]
+; CHECK:       pred.store.continue16:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -470,8 +470,8 @@ define void @example23c(ptr noalias nocapture %src, ptr noalias nocapture %dst)
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; CHECK:       pred.store.continue:
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP1]], i64 1
-; CHECK-NEXT:    br i1 [[TMP6]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]]
-; CHECK:       pred.store.if10:
+; CHECK-NEXT:    br i1 [[TMP6]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
+; CHECK:       pred.store.if9:
 ; CHECK-NEXT:    [[TMP7:%.*]] = or disjoint i64 [[OFFSET_IDX5]], 4
 ; CHECK-NEXT:    [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = or disjoint i64 [[OFFSET_IDX]], 2
@@ -480,11 +480,11 @@ define void @example23c(ptr noalias nocapture %src, ptr noalias nocapture %dst)
 ; CHECK-NEXT:    [[TMP10:%.*]] = zext i16 [[TMP9]] to i32
 ; CHECK-NEXT:    [[TMP11:%.*]] = shl nuw nsw i32 [[TMP10]], 7
 ; CHECK-NEXT:    store i32 [[TMP11]], ptr [[NEXT_GEP7]], align 4
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE11]]
-; CHECK:       pred.store.continue11:
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE10]]
+; CHECK:       pred.store.continue10:
 ; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i1> [[TMP1]], i64 2
-; CHECK-NEXT:    br i1 [[TMP12]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13:%.*]]
-; CHECK:       pred.store.if12:
+; CHECK-NEXT:    br i1 [[TMP12]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
+; CHECK:       pred.store.if11:
 ; CHECK-NEXT:    [[TMP13:%.*]] = or disjoint i64 [[OFFSET_IDX5]], 8
 ; CHECK-NEXT:    [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP13]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = or disjoint i64 [[OFFSET_IDX]], 4
@@ -493,11 +493,11 @@ define void @example23c(ptr noalias nocapture %src, ptr noalias nocapture %dst)
 ; CHECK-NEXT:    [[TMP16:%.*]] = zext i16 [[TMP15]] to i32
 ; CHECK-NEXT:    [[TMP17:%.*]] = shl nuw nsw i32 [[TMP16]], 7
 ; CHECK-NEXT:    store i32 [[TMP17]], ptr [[NEXT_GEP8]], align 4
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE13]]
-; CHECK:       pred.store.continue13:
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE12]]
+; CHECK:       pred.store.continue12:
 ; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i1> [[TMP1]], i64 3
-; CHECK-NEXT:    br i1 [[TMP18]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15]]
-; CHECK:       pred.store.if14:
+; CHECK-NEXT:    br i1 [[TMP18]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE15]]
+; CHECK:       pred.store.if13:
 ; CHECK-NEXT:    [[TMP19:%.*]] = or disjoint i64 [[OFFSET_IDX5]], 12
 ; CHECK-NEXT:    [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP19]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = or disjoint i64 [[OFFSET_IDX]], 6
@@ -507,7 +507,7 @@ define void @example23c(ptr noalias nocapture %src, ptr noalias nocapture %dst)
 ; CHECK-NEXT:    [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 7
 ; CHECK-NEXT:    store i32 [[TMP23]], ptr [[NEXT_GEP9]], align 4
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE15]]
-; CHECK:       pred.store.continue15:
+; CHECK:       pred.store.continue14:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
 ; CHECK-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/branch-weights.ll b/llvm/test/Transforms/LoopVectorize/branch-weights.ll
index 6e7efe018faf6..e11f77d8aeaec 100644
--- a/llvm/test/Transforms/LoopVectorize/branch-weights.ll
+++ b/llvm/test/Transforms/LoopVectorize/branch-weights.ll
@@ -33,7 +33,7 @@
 ; CHECK:   br i1 {{.+}}, label %vec.epilog.middle.block, label %vec.epilog.vector.body, !prof [[PROF_F0_VEC_EPILOG_VECTOR_BODY:![0-9]+]]
 ;
 ; CHECK: vec.epilog.middle.block:
-; CHECK:   br i1 %cmp.n9, label %exit.loopexit, label %vec.epilog.scalar.ph, !prof [[PROF_F0_MIDDLE_BLOCKS:![0-9]+]]
+; CHECK:   br i1 %cmp.n{{.+}}, label %exit.loopexit, label %vec.epilog.scalar.ph, !prof [[PROF_F0_MIDDLE_BLOCKS:![0-9]+]]
 ;
 ; CHECK: vec.epilog.scalar.ph:
 ; CHECK:   br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
index c159ec868c357..94593a7d9a81d 100644
--- a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
@@ -241,9 +241,9 @@ define i1 @any_of_reduction_i1_epilog(i64 %N, i32 %a) {
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i1 [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ], [ false, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i1 [[BC_MERGE_RDX]], false
 ; CHECK-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF2]]
@@ -275,8 +275,8 @@ define i1 @any_of_reduction_i1_epilog(i64 %N, i32 %a) {
 ; CHECK-NEXT:    br i1 [[CMP_N8]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL4:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL7:%.*]] = phi i32 [ [[IND_END5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[IND_END6]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX17:%.*]] = phi i1 [ [[RDX_SELECT16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ false, [[ITER_CHECK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL7:%.*]] = phi i32 [ [[IND_END5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[IND_END6]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL4]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -426,8 +426,8 @@ define i1 @any_of_reduction_i1_epilog2(ptr %start, ptr %end, i64 %x) {
 ; CHECK-NEXT:    [[CMP_N10:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC8]]
 ; CHECK-NEXT:    br i1 [[CMP_N10]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[START]], [[ITER_CHECK:%.*]] ], [ [[IND_END9]], [[VEC_EPILOG_ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX23:%.*]] = phi i1 [ [[RDX_SELECT22]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ true, [[ITER_CHECK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX23:%.*]] = phi i1 [ [[RDX_SELECT22]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ true, [[ITER_CHECK:%.*]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[START]], [[ITER_CHECK]] ], [ [[IND_END9]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[RED:%.*]] = phi i1 [ [[BC_MERGE_RDX23]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[SELECT:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
index fe6121ca3d004..32d32a64049ac 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
@@ -45,12 +45,13 @@ define void @test_chained_first_order_recurrences_1(ptr %ptr) {
 ; CHECK-NEXT:  scalar.ph
 ; CHECK-NEXT:    EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<22>
 ; CHECK-NEXT:    EMIT vp<[[RESUME_2_P:%.*]]>.1 = resume-phi vp<[[RESUME_2]]>.1, ir<33>
+; CHECK-NEXT:    EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb:
 ; CHECK-NEXT:    IR   %for.1 = phi i16 [ 22, %entry ], [ %for.1.next, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
 ; CHECK-NEXT:    IR   %for.2 = phi i16 [ 33, %entry ], [ %for.1, %loop ] (extra operand: vp<[[RESUME_2_P]]>.1 from scalar.ph)
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %exitcond.not = icmp eq i64 %iv.next, 1000
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
@@ -125,13 +126,14 @@ define void @test_chained_first_order_recurrences_3(ptr %ptr) {
 ; CHECK-NEXT:    EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<22>
 ; CHECK-NEXT:    EMIT vp<[[RESUME_2_P:%.*]]>.1 = resume-phi vp<[[RESUME_2]]>.1, ir<33>
 ; CHECK-NEXT:    EMIT vp<[[RESUME_3_P:%.*]]>.2 = resume-phi vp<[[RESUME_3]]>.2, ir<33>
+; CHECK-NEXT:    EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb:
 ; CHECK-NEXT:    IR   %for.1 = phi i16 [ 22, %entry ], [ %for.1.next, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
 ; CHECK-NEXT:    IR   %for.2 = phi i16 [ 33, %entry ], [ %for.1, %loop ] (extra operand: vp<[[RESUME_2_P]]>.1 from scalar.ph)
 ; CHECK-NEXT:    IR   %for.3 = phi i16 [ 33, %entry ], [ %for.2, %loop ] (extra operand: vp<[[RESUME_3_P]]>.2 from scalar.ph)
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %exitcond.not = icmp eq i64 %iv.next, 1000
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -205,12 +207,13 @@ define i32 @test_chained_first_order_recurrences_4(ptr %base, i64 %x) {
 ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:   EMIT vp<[[RESUME_X:%.+]]> = resume-phi vp<[[EXT_X]]>, ir<0>
 ; CHECK-NEXT:   EMIT vp<[[RESUME_Y:%.+]]>.1 = resume-phi vp<[[EXT_Y]]>.1, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb:
-; CHECK-NEXT:   IR   %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK-NEXT:   IR   %for.x = phi i64 [ %for.x.next, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_X]]> from scalar.ph)
 ; CHECK-NEXT:   IR   %for.y = phi i32 [ %for.x.prev, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_Y]]>.1 from scalar.ph)
 ; CHECK:     No successors
@@ -282,12 +285,13 @@ define i32 @test_chained_first_order_recurrences_5_hoist_to_load(ptr %base) {
 ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:   EMIT vp<[[RESUME_X:%.+]]> = resume-phi vp<[[EXT_X]]>, ir<0>
 ; CHECK-NEXT:   EMIT vp<[[RESUME_Y:%.+]]>.1 = resume-phi vp<[[EXT_Y]]>.1, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb:
-; CHECK-NEXT:   IR   %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK-NEXT:   IR   %for.x = phi i64 [ %for.x.next, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_X]]> from scalar.ph)
 ; CHECK-NEXT:   IR   %for.y = phi i32 [ %for.x.prev, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_Y]]>.1 from scalar.ph)
 ; CHECK:     No successors
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll
index 7aedb218e1352..fc71f8a934047 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll
@@ -661,10 +661,10 @@ define double @test_resinking_required(ptr %p, ptr noalias %a, ptr noalias %b) {
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT9:%.*]] = extractelement <4 x double> [[TMP4]], i32 3
 ; CHECK-NEXT:    br i1 true, label %End, label %scalar.ph
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    %bc.resume.val = phi i64 [ 0, %middle.block ], [ 0, %Entry ]
 ; CHECK-NEXT:    phi double [ [[TMP0]], %middle.block ], [ 0.000000e+00, %Entry ]
 ; CHECK-NEXT:    phi double [ [[TMP3]], %middle.block ], [ 0.000000e+00, %Entry ]
 ; CHECK-NEXT:    phi double [ [[VECTOR_RECUR_EXTRACT9]], %middle.block ], [ 0.000000e+00, %Entry ]
+; CHECK-NEXT:    %bc.resume.val = phi i64 [ 0, %middle.block ], [ 0, %Entry ]
 ; CHECK:      End:
 ; CHECK-NEXT:    = phi double [ {{.+}}, %Loop ], [ [[TMP0]], %middle.block ]
 ; CHECK-NEXT:    = phi double [ {{.+}}, %Loop ], [ [[TMP3]], %middle.block ]
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
index fe16e8ce6f97b..253ecaca75be8 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
@@ -41,8 +41,8 @@ define void @can_sink_after_store(i32 %x, ptr %ptr, i64 %tc) local_unnamed_addr
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1997, [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1997, [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR:%.*]]
 ; CHECK:       for:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[PRE_NEXT:%.*]], [[FOR]] ]
@@ -121,8 +121,8 @@ define void @sink_sdiv(i32 %x, ptr %ptr, i64 %tc) local_unnamed_addr #0 {
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1997, [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1997, [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR:%.*]]
 ; CHECK:       for:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[PRE_NEXT:%.*]], [[FOR]] ]
@@ -202,8 +202,8 @@ define void @can_sink_with_additional_user(i32 %x, ptr %ptr, i64 %tc) {
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1997, [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1997, [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR:%.*]]
 ; CHECK:       for:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[PRE_NEXT:%.*]], [[FOR]] ]
@@ -387,9 +387,9 @@ define void @instruction_with_2_FOR_operands(ptr noalias %A, ptr noalias %B, ptr
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 false, label [[BB74:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[BB:%.*]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[BB]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[BB:%.*]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT4:%.*]] = phi float [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 1.000000e+00, [[BB]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[BB]] ]
 ; CHECK-NEXT:    br label [[BB13:%.*]]
 ; CHECK:       bb13:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi float [ [[TMP60:%.*]], [[BB13]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
@@ -463,9 +463,9 @@ define void @instruction_with_2_FOR_operands_and_multiple_other_uses(ptr noalias
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[BB:%.*]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[BB]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[BB:%.*]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT4:%.*]] = phi float [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[BB]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[BB]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi float [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], [[LOOP]] ]
@@ -554,9 +554,9 @@ define void @instruction_with_2_FOR_operands_and_multiple_other_uses_chain(ptr n
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[BB:%.*]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[BB]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[BB:%.*]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT4:%.*]] = phi float [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[BB]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[BB]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi float [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], [[LOOP]] ]
@@ -852,8 +852,8 @@ define void @sink_dominance(ptr %ptr, i32 %N) {
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[UMAX1]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ]
@@ -935,8 +935,8 @@ define void @sink_dominance_2(ptr %ptr, i32 %N) {
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[UMAX1]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ]
@@ -1057,9 +1057,9 @@ define void @test_for_sink_instruction_after_same_incoming_1(ptr %ptr) {
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x double> [[WIDE_LOAD]], i32 3
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 1.000000e+01, [[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 1.000000e+01, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT3:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT2]], [[MIDDLE_BLOCK]] ], [ 2.000000e+01, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi double [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], [[LOOP]] ]
@@ -1125,9 +1125,9 @@ define void @test_for_sink_instruction_after_same_incoming_2(ptr %ptr) {
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x double> [[WIDE_LOAD]], i32 3
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 2.000000e+01, [[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 2.000000e+01, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT3:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT2]], [[MIDDLE_BLOCK]] ], [ 1.000000e+01, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi double [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
index a1278907a7290..0b2e7fe484390 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
@@ -82,11 +82,12 @@ define void @sink_replicate_region_1(i32 %x, ptr %ptr, ptr noalias %dst) optsize
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
 ; CHECK-NEXT:   EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0>
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VEC_TC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb:
 ; CHECK-NEXT:   IR   %0 = phi i32 [ 0, %entry ], [ %conv, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
-; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:        IR   %ec = icmp eq i32 %iv.next, 20001
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -172,11 +173,12 @@ define void @sink_replicate_region_2(i32 %x, i8 %y, ptr %ptr) optsize {
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
 ; CHECK-NEXT:   EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0>
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VEC_TC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb:
 ; CHECK-NEXT:   IR   %recur = phi i32 [ 0, %entry ], [ %recur.next, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
-; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:        IR   %ec = icmp eq i32 %iv.next, 20001
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -245,12 +247,13 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize {
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
 ; CHECK-NEXT:   EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0>
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VEC_TC]]>, ir<0>
 ; CHECK-NEXT:   EMIT vp<[[RESUME_RED:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<1234>
 ; CHECK-NEXT: Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb:
 ; CHECK-NEXT:   IR   %recur = phi i32 [ 0, %entry ], [ %recur.next, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
-; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK-NEXT:   IR   %and.red = phi i32 [ 1234, %entry ], [ %and.red.next, %loop ]
 ; CHECK:        IR   %ec = icmp eq i32 %iv.next, 20001
 ; CHECK-NEXT: No successors
@@ -361,11 +364,12 @@ define void @sink_replicate_region_4_requires_split_at_end_of_block(i32 %x, ptr
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
 ; CHECK-NEXT:   EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0>
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VEC_TC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb:
 ; CHECK-NEXT:   IR   %0 = phi i32 [ 0, %entry ], [ %conv, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
-; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:        IR   %ec = icmp eq i32 %iv.next, 20001
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -458,11 +462,12 @@ define void @sink_replicate_region_after_replicate_region(ptr %ptr, ptr noalias
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
 ; CHECK-NEXT:   EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0>
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VEC_TC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb:
 ; CHECK-NEXT:   IR   %recur = phi i32 [ 0, %entry ], [ %recur.next, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
-; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:        IR   %C = icmp sgt i32 %iv.next, %recur.next
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -503,6 +508,7 @@ define void @need_new_block_after_sinking_pr56146(i32 %x, ptr %src, ptr noalias
 ; CHECK-NEXT: Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   vp<[[END:%.+]]> = DERIVED-IV ir<2> + vp<[[VEC_TC]]> * ir<1>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  vector loop: {
@@ -546,11 +552,12 @@ define void @need_new_block_after_sinking_pr56146(i32 %x, ptr %src, ptr noalias
 ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[END]]>, ir<2>
 ; CHECK-NEXT:   EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb:
-; CHECK-NEXT:   IR   %iv = phi i64 [ 2, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ 2, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK-NEXT:   IR   %.pn = phi i32 [ 0, %entry ], [ %l, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
 ; CHECK:        IR   %ec = icmp ugt i64 %iv, 3
 ; CHECK-NEXT: No successors
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
index 2d50f8219549a..7f562a4f2c445 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -55,8 +55,8 @@ define void @recurrence_1(ptr readonly noalias %a, ptr noalias %b, i32 %n) {
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[PRE_LOAD]], [[FOR_PREHEADER]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; UNROLL-NO-IC:       scalar.body:
 ; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP17:%.*]], [[SCALAR_BODY]] ]
@@ -111,8 +111,8 @@ define void @recurrence_1(ptr readonly noalias %a, ptr noalias %b, i32 %n) {
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ [[PRE_LOAD]], [[FOR_PREHEADER]] ]
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; UNROLL-NO-VF:       scalar.body:
 ; UNROLL-NO-VF-NEXT:    [[TMP16:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP17:%.*]], [[SCALAR_BODY]] ]
@@ -165,8 +165,8 @@ define void @recurrence_1(ptr readonly noalias %a, ptr noalias %b, i32 %n) {
 ; SINK-AFTER-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
-; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[PRE_LOAD]], [[FOR_PREHEADER]] ]
+; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; SINK-AFTER-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; SINK-AFTER:       scalar.body:
 ; SINK-AFTER-NEXT:    [[TMP12:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP13:%.*]], [[SCALAR_BODY]] ]
@@ -265,8 +265,8 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[FOR_PREHEADER]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ poison, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; UNROLL-NO-IC:       for.cond.cleanup.loopexit:
@@ -334,8 +334,8 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[FOR_PREHEADER]] ]
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-VF-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ], [ poison, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; UNROLL-NO-VF:       for.cond.cleanup.loopexit:
@@ -398,8 +398,8 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
 ; SINK-AFTER-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
-; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[FOR_PREHEADER]] ]
+; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; SINK-AFTER-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ poison, [[FOR_PREHEADER]] ]
 ; SINK-AFTER-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; SINK-AFTER:       for.cond.cleanup.loopexit:
@@ -525,8 +525,8 @@ define void @recurrence_3(ptr readonly noalias %a, ptr noalias %b, i32 %n, float
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[TMP0]], [[FOR_PREHEADER]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; UNROLL-NO-IC:       scalar.body:
 ; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP22:%.*]], [[SCALAR_BODY]] ]
@@ -598,8 +598,8 @@ define void @recurrence_3(ptr readonly noalias %a, ptr noalias %b, i32 %n, float
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ [[TMP0]], [[FOR_PREHEADER]] ]
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; UNROLL-NO-VF:       scalar.body:
 ; UNROLL-NO-VF-NEXT:    [[TMP20:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP21:%.*]], [[SCALAR_BODY]] ]
@@ -669,8 +669,8 @@ define void @recurrence_3(ptr readonly noalias %a, ptr noalias %b, i32 %n, float
 ; SINK-AFTER-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
-; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_PREHEADER]] ]
 ; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[TMP0]], [[FOR_PREHEADER]] ]
+; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_PREHEADER]] ]
 ; SINK-AFTER-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; SINK-AFTER:       scalar.body:
 ; SINK-AFTER-NEXT:    [[TMP14:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP15:%.*]], [[SCALAR_BODY]] ]
@@ -912,8 +912,8 @@ define i32 @PR27246() {
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
 ; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[FOR_COND1:%.*]]
 ; UNROLL-NO-IC:       for.cond.cleanup:
 ; UNROLL-NO-IC-NEXT:    [[E_1_LCSSA_LCSSA:%.*]] = phi i32 [ [[E_1_LCSSA]], [[FOR_COND_CLEANUP3]] ]
@@ -956,8 +956,8 @@ define i32 @PR27246() {
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
 ; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ]
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[FOR_COND1:%.*]]
 ; UNROLL-NO-VF:       for.cond.cleanup:
 ; UNROLL-NO-VF-NEXT:    [[E_1_LCSSA_LCSSA:%.*]] = phi i32 [ [[E_1_LCSSA]], [[FOR_COND_CLEANUP3]] ]
@@ -1005,8 +1005,8 @@ define i32 @PR27246() {
 ; SINK-AFTER-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
-; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
 ; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ]
+; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
 ; SINK-AFTER-NEXT:    br label [[FOR_COND1:%.*]]
 ; SINK-AFTER:       for.cond.cleanup:
 ; SINK-AFTER-NEXT:    [[E_1_LCSSA_LCSSA:%.*]] = phi i32 [ [[E_1_LCSSA]], [[FOR_COND_CLEANUP3]] ]
@@ -1780,8 +1780,8 @@ define void @sink_after(ptr noalias %a, ptr noalias %b, i64 %n) {
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-IC:       for.body:
 ; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ]
@@ -1836,8 +1836,8 @@ define void @sink_after(ptr noalias %a, ptr noalias %b, i64 %n) {
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY]] ]
+; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-VF:       for.body:
 ; UNROLL-NO-VF-NEXT:    [[TMP17:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ]
@@ -1888,8 +1888,8 @@ define void @sink_after(ptr noalias %a, ptr noalias %b, i64 %n) {
 ; SINK-AFTER-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
-; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY]] ]
+; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; SINK-AFTER-NEXT:    br label [[FOR_BODY:%.*]]
 ; SINK-AFTER:       for.body:
 ; SINK-AFTER-NEXT:    [[TMP11:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP12:%.*]], [[FOR_BODY]] ]
@@ -2016,8 +2016,8 @@ define void @PR34711(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %n) {
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP30]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP30]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-IC:       for.body:
 ; UNROLL-NO-IC-NEXT:    [[TMP47:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP48:%.*]], [[FOR_BODY]] ]
@@ -2076,8 +2076,8 @@ define void @PR34711(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %n) {
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY]] ]
+; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-VF:       for.body:
 ; UNROLL-NO-VF-NEXT:    [[TMP17:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ]
@@ -2143,8 +2143,8 @@ define void @PR34711(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %n) {
 ; SINK-AFTER-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
-; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY]] ]
+; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; SINK-AFTER-NEXT:    br label [[FOR_BODY:%.*]]
 ; SINK-AFTER:       for.body:
 ; SINK-AFTER-NEXT:    [[TMP25:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP26:%.*]], [[FOR_BODY]] ]
@@ -2240,8 +2240,8 @@ define void @sink_after_with_multiple_users(ptr noalias %a, ptr noalias %b, i64
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-IC:       for.body:
 ; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP20:%.*]], [[FOR_BODY]] ]
@@ -2299,8 +2299,8 @@ define void @sink_after_with_multiple_users(ptr noalias %a, ptr noalias %b, i64
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY]] ]
+; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-VF:       for.body:
 ; UNROLL-NO-VF-NEXT:    [[TMP19:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP20:%.*]], [[FOR_BODY]] ]
@@ -2353,8 +2353,8 @@ define void @sink_after_with_multiple_users(ptr noalias %a, ptr noalias %b, i64
 ; SINK-AFTER-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
-; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY]] ]
+; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; SINK-AFTER-NEXT:    br label [[FOR_BODY:%.*]]
 ; SINK-AFTER:       for.body:
 ; SINK-AFTER-NEXT:    [[TMP12:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ]
@@ -2685,8 +2685,8 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; UNROLL-NO-IC-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP1]], 7
 ; UNROLL-NO-IC-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 8
 ; UNROLL-NO-IC-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; UNROLL-NO-IC-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1
+; UNROLL-NO-IC-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT5]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -2816,8 +2816,8 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; UNROLL-NO-VF-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1
 ; UNROLL-NO-VF-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 2
 ; UNROLL-NO-VF-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; UNROLL-NO-VF-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1
+; UNROLL-NO-VF-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-VF:       vector.body:
 ; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE4:%.*]] ]
@@ -2881,8 +2881,8 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; SINK-AFTER-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP1]], 3
 ; SINK-AFTER-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
 ; SINK-AFTER-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; SINK-AFTER-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1
+; SINK-AFTER-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
 ; SINK-AFTER-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; SINK-AFTER-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -2990,8 +2990,8 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) {
 ; UNROLL-NO-IC-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP1]], 7
 ; UNROLL-NO-IC-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 8
 ; UNROLL-NO-IC-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; UNROLL-NO-IC-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1
+; UNROLL-NO-IC-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -3189,8 +3189,8 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) {
 ; UNROLL-NO-VF-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1
 ; UNROLL-NO-VF-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 2
 ; UNROLL-NO-VF-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; UNROLL-NO-VF-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1
+; UNROLL-NO-VF-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-VF:       vector.body:
 ; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
@@ -3273,8 +3273,8 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) {
 ; SINK-AFTER-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP1]], 3
 ; SINK-AFTER-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
 ; SINK-AFTER-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; SINK-AFTER-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1
+; SINK-AFTER-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
 ; SINK-AFTER-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; SINK-AFTER-NEXT:    br label [[VECTOR_BODY:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll
index 28c1c2afbe081..2175eab9752cf 100644
--- a/llvm/test/Transforms/LoopVectorize/induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction.ll
@@ -2773,8 +2773,8 @@ define i32 @i8_loop() nounwind readnone ssp uwtable {
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[TMP0]])
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ]
@@ -2844,8 +2844,8 @@ define i32 @i8_loop() nounwind readnone ssp uwtable {
 ; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[BIN_RDX]])
 ; UNROLL-NO-IC-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[LOOP:%.*]]
 ; UNROLL-NO-IC:       loop:
 ; UNROLL-NO-IC-NEXT:    [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ]
@@ -2910,8 +2910,8 @@ define i32 @i16_loop() nounwind readnone ssp uwtable {
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[TMP0]])
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ]
@@ -2981,8 +2981,8 @@ define i32 @i16_loop() nounwind readnone ssp uwtable {
 ; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[BIN_RDX]])
 ; UNROLL-NO-IC-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[LOOP:%.*]]
 ; UNROLL-NO-IC:       loop:
 ; UNROLL-NO-IC-NEXT:    [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ]
@@ -3051,8 +3051,8 @@ define i32 @max_i32_backedgetaken() nounwind readnone ssp uwtable {
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[TMP0]])
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ]
@@ -3122,8 +3122,8 @@ define i32 @max_i32_backedgetaken() nounwind readnone ssp uwtable {
 ; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[BIN_RDX]])
 ; UNROLL-NO-IC-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[LOOP:%.*]]
 ; UNROLL-NO-IC:       loop:
 ; UNROLL-NO-IC-NEXT:    [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ]
@@ -5561,9 +5561,9 @@ define i64 @trunc_with_first_order_recurrence() {
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[VEC_IND2]], i32 1
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 113, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 113, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ 113, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 42, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       exit:
@@ -5781,9 +5781,9 @@ define i64 @trunc_with_first_order_recurrence() {
 ; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[STEP_ADD7]], i32 1
 ; UNROLL-NO-IC-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 113, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 113, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ 113, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
-; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 42, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[LOOP:%.*]]
 ; UNROLL-NO-IC:       exit:
@@ -6236,9 +6236,9 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IV_2_CONV:%.*]], [[LOOP]] ]
@@ -6307,9 +6307,9 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; IND-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; IND-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; IND:       scalar.ph:
-; IND-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; IND-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; IND-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; IND-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
-; IND-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; IND-NEXT:    br label [[LOOP:%.*]]
 ; IND:       loop:
 ; IND-NEXT:    [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IV_2_CONV:%.*]], [[LOOP]] ]
@@ -6382,9 +6382,9 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; UNROLL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; UNROLL-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL:       scalar.ph:
-; UNROLL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
-; UNROLL-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NEXT:    br label [[LOOP:%.*]]
 ; UNROLL:       loop:
 ; UNROLL-NEXT:    [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IV_2_CONV:%.*]], [[LOOP]] ]
@@ -6463,9 +6463,9 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
-; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[LOOP:%.*]]
 ; UNROLL-NO-IC:       loop:
 ; UNROLL-NO-IC-NEXT:    [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IV_2_CONV:%.*]], [[LOOP]] ]
@@ -6538,9 +6538,9 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; INTERLEAVE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; INTERLEAVE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; INTERLEAVE:       scalar.ph:
-; INTERLEAVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; INTERLEAVE-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; INTERLEAVE-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
-; INTERLEAVE-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; INTERLEAVE-NEXT:    br label [[LOOP:%.*]]
 ; INTERLEAVE:       loop:
 ; INTERLEAVE-NEXT:    [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IV_2_CONV:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
index bb17580ac4d11..5bc832fbd6842 100644
--- a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
@@ -14,6 +14,7 @@
 ; DBG-NEXT: Successor(s): vector.ph
 ; DBG-EMPTY:
 ; DBG-NEXT: vector.ph:
+; DBG-NEXT:   vp<[[END:%.+]]> = DERIVED-IV ir<%start> + vp<[[VEC_TC]]> * ir<1>
 ; DBG-NEXT: Successor(s): vector loop
 ; DBG-EMPTY:
 ; DBG-NEXT:  vector loop: {
@@ -76,6 +77,7 @@ declare i32 @llvm.smin.i32(i32, i32)
 ; DBG-NEXT: Successor(s): vector.ph
 ; DBG-EMPTY:
 ; DBG-NEXT: vector.ph:
+; DBG-NEXT:   vp<[[END:%.+]]> = DERIVED-IV ir + vp<[[VEC_TC]]> * ir
 ; DBG-NEXT: Successor(s): vector loop
 ; DBG-EMPTY:
 ; DBG-NEXT:  vector loop: {
@@ -116,11 +118,13 @@ declare i32 @llvm.smin.i32(i32, i32)
 ; DBG-NEXT: Successor(s): ir-bb, scalar.ph
 ; DBG-EMPTY:
 ; DBG-NEXT: scalar.ph:
+; DBG-NEXT:  EMIT vp<[[RESUME1:%.+]]> = resume-phi vp<[[VEC_TC]]>, ir<0>
+; DBG-NEXT:  EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi vp<[[END]]>, ir
 ; DBG-NEXT: Successor(s): ir-bb
 ; DBG-EMPTY:
 ; DBG-NEXT: ir-bb:
-; DBG-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
-; DBG-NEXT:   IR   %d = phi i1 [ false, %entry ], [ %d.next, %loop.latch ]
+; DBG-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] (extra operand: vp<[[RESUME1]]> from scalar.ph)
+; DBG-NEXT:   IR   %d = phi i1 [ false, %entry ], [ %d.next, %loop.latch ] (extra operand: vp<[[RESUME2]]>.1 from scalar.ph)
 ; DBG-NEXT:   IR   %d.next = xor i1 %d, true
 ; DBG-NEXT: No successors
 ; DBG-EMPTY:
@@ -222,11 +226,12 @@ exit:
 ; DBG-NEXT: Successor(s): ir-bb, scalar.ph
 ; DBG-EMPTY:
 ; DBG-NEXT: scalar.ph:
+; DBG-NEXT:  EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; DBG-NEXT:  EMIT vp<[[RESUME_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0>
 ; DBG-NEXT: Successor(s): ir-bb
 ; DBG-EMPTY:
 ; DBG-NEXT: ir-bb:
-; DBG-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; DBG-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; DBG-NEXT:   IR   %for = phi i32 [ 0, %entry ], [ %iv.trunc, %loop ] (extra operand: vp<[[RESUME_P]]> from scalar.ph)
 ; DBG:        IR   %ec = icmp slt i32 %iv.next.trunc, %n
 ; DBG-NEXT: No successors
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
index 0e7a68cd47ad8..abd91d33157e6 100644
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
@@ -1508,9 +1508,9 @@ define void @PR34743(ptr %a, ptr %b, i64 %n) {
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ], [ [[DOTPRE]], [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[TMP16:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LOAD2:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
index 88be9fa1c8689..aef25a05ea124 100644
--- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
@@ -401,8 +401,8 @@ define i32 @multiple_uniform_stores(ptr nocapture %var1, ptr nocapture readonly
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP8]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_INC8_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[TMP4]], [[VECTOR_MEMCHECK]] ], [ [[TMP4]], [[FOR_BODY3_LR_PH]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[ARRAYIDX5_PROMOTED]], [[VECTOR_MEMCHECK]] ], [ [[ARRAYIDX5_PROMOTED]], [[FOR_BODY3_LR_PH]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[TMP4]], [[VECTOR_MEMCHECK]] ], [ [[TMP4]], [[FOR_BODY3_LR_PH]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY3:%.*]]
 ; CHECK:       for.body3:
 ; CHECK-NEXT:    [[TMP20:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP22:%.*]], [[FOR_BODY3]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-nested-loop.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-nested-loop.ll
index 07ee5892dc28e..681ffe946d17d 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-nested-loop.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-nested-loop.ll
@@ -40,8 +40,8 @@ define i64 @select_iv_def_from_outer_loop(ptr %a, i64 %start, i64 %n) {
 ; CHECK-VF4IC1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-VF4IC1-NEXT:    br i1 [[CMP_N]], label %[[OUTER_LOOP_EXIT]], label %[[SCALAR_PH]]
 ; CHECK-VF4IC1:       [[SCALAR_PH]]:
-; CHECK-VF4IC1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_LOOP]] ]
 ; CHECK-VF4IC1-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_OUTER]], %[[OUTER_LOOP]] ]
+; CHECK-VF4IC1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_LOOP]] ]
 ; CHECK-VF4IC1-NEXT:    br label %[[INNER_LOOP:.*]]
 ; CHECK-VF4IC1:       [[INNER_LOOP]]:
 ; CHECK-VF4IC1-NEXT:    [[RDX_INNER:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT:%.*]], %[[INNER_LOOP]] ]
@@ -114,8 +114,8 @@ define i64 @select_iv_def_from_outer_loop(ptr %a, i64 %start, i64 %n) {
 ; CHECK-VF4IC4-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-VF4IC4-NEXT:    br i1 [[CMP_N]], label %[[OUTER_LOOP_EXIT]], label %[[SCALAR_PH]]
 ; CHECK-VF4IC4:       [[SCALAR_PH]]:
-; CHECK-VF4IC4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_LOOP]] ]
 ; CHECK-VF4IC4-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_OUTER]], %[[OUTER_LOOP]] ]
+; CHECK-VF4IC4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_LOOP]] ]
 ; CHECK-VF4IC4-NEXT:    br label %[[INNER_LOOP:.*]]
 ; CHECK-VF4IC4:       [[INNER_LOOP]]:
 ; CHECK-VF4IC4-NEXT:    [[RDX_INNER:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT:%.*]], %[[INNER_LOOP]] ]
@@ -189,8 +189,8 @@ define i64 @select_iv_def_from_outer_loop(ptr %a, i64 %start, i64 %n) {
 ; CHECK-VF1IC4-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-VF1IC4-NEXT:    br i1 [[CMP_N]], label %[[OUTER_LOOP_EXIT]], label %[[SCALAR_PH]]
 ; CHECK-VF1IC4:       [[SCALAR_PH]]:
-; CHECK-VF1IC4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_LOOP]] ]
 ; CHECK-VF1IC4-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_OUTER]], %[[OUTER_LOOP]] ]
+; CHECK-VF1IC4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_LOOP]] ]
 ; CHECK-VF1IC4-NEXT:    br label %[[INNER_LOOP:.*]]
 ; CHECK-VF1IC4:       [[INNER_LOOP]]:
 ; CHECK-VF1IC4-NEXT:    [[RDX_INNER:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT:%.*]], %[[INNER_LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll
index 70199fa1e0797..1bfb34165e52e 100644
--- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll
@@ -500,8 +500,8 @@ define void @induction_resume_value_requires_non_trivial_scev_expansion(ptr %dst
 ; CHECK-NEXT:    [[IND_END5:%.*]] = mul i8 84, [[INDUCTION_IV]]
 ; CHECK-NEXT:    br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 84, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[IND_END4:%.*]] = mul i8 84, [[INDUCTION_IV]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT8:%.*]] = insertelement <4 x i8> poison, i8 [[BC_RESUME_VAL]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT9:%.*]] = shufflevector <4 x i8> [[DOTSPLATINSERT8]], <4 x i8> poison, <4 x i32> zeroinitializer
@@ -590,8 +590,8 @@ define void @induction_resume_value_requires_non_trivial_scev_expansion(ptr %dst
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[IND_END5:%.*]] = mul i8 84, [[INDUCTION_IV]]
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK-PROFITABLE-BY-DEFAULT:       vec.epilog.ph:
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 84, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[IND_END4:%.*]] = mul i8 84, [[INDUCTION_IV]]
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[DOTSPLATINSERT8:%.*]] = insertelement <2 x i8> poison, i8 [[BC_RESUME_VAL]], i64 0
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[DOTSPLAT9:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT8]], <2 x i8> poison, <2 x i32> zeroinitializer
diff --git a/llvm/test/Transforms/LoopVectorize/pr59319-loop-access-info-invalidation.ll b/llvm/test/Transforms/LoopVectorize/pr59319-loop-access-info-invalidation.ll
index 3dafe8270dc3f..a4b229d0a96b2 100644
--- a/llvm/test/Transforms/LoopVectorize/pr59319-loop-access-info-invalidation.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr59319-loop-access-info-invalidation.ll
@@ -52,13 +52,13 @@ define void @reduced(ptr %0, ptr %1, i64 %iv, ptr %2, i64 %iv76, i64 %iv93) {
 ; CHECK:       vector.ph7:
 ; CHECK-NEXT:    [[N_MOD_VF8:%.*]] = urem i64 [[TMP3]], 4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF8]]
-; CHECK-NEXT:    br label [[VECTOR_BODY11:%.*]]
-; CHECK:       vector.body9:
-; CHECK-NEXT:    [[INDEX12:%.*]] = phi i64 [ 0, [[VECTOR_PH7]] ], [ [[INDEX_NEXT13:%.*]], [[VECTOR_BODY11]] ]
+; CHECK-NEXT:    br label [[VECTOR_BODY10:%.*]]
+; CHECK:       vector.body10:
+; CHECK-NEXT:    [[INDEX12:%.*]] = phi i64 [ 0, [[VECTOR_PH7]] ], [ [[INDEX_NEXT13:%.*]], [[VECTOR_BODY10]] ]
 ; CHECK-NEXT:    store i32 0, ptr [[TMP1]], align 4, !alias.scope !4, !noalias !7
 ; CHECK-NEXT:    [[INDEX_NEXT13]] = add nuw i64 [[INDEX12]], 4
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK4:%.*]], label [[VECTOR_BODY11]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK4:%.*]], label [[VECTOR_BODY10]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       middle.block4:
 ; CHECK-NEXT:    [[CMP_N10:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N10]], label [[LOOP_3_LR_PH:%.*]], label [[SCALAR_PH5]]
@@ -69,8 +69,8 @@ define void @reduced(ptr %0, ptr %1, i64 %iv, ptr %2, i64 %iv76, i64 %iv93) {
 ; CHECK-NEXT:    [[IDXPROM_I_I61:%.*]] = and i64 [[IV761_LCSSA]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I62:%.*]] = getelementptr i32, ptr [[TMP0]], i64 [[IDXPROM_I_I61]]
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK22:%.*]] = icmp ult i64 [[TMP3]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK22]], label [[SCALAR_PH21:%.*]], label [[VECTOR_MEMCHECK14:%.*]]
-; CHECK:       vector.memcheck14:
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK22]], label [[SCALAR_PH22:%.*]], label [[VECTOR_MEMCHECK15:%.*]]
+; CHECK:       vector.memcheck15:
 ; CHECK-NEXT:    [[SCEVGEP15:%.*]] = getelementptr i8, ptr [[TMP1]], i64 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = shl nuw nsw i64 [[IDXPROM_I_I61]], 2
 ; CHECK-NEXT:    [[TMP13:%.*]] = add nuw nsw i64 [[TMP12]], 4
@@ -78,22 +78,22 @@ define void @reduced(ptr %0, ptr %1, i64 %iv, ptr %2, i64 %iv76, i64 %iv93) {
 ; CHECK-NEXT:    [[BOUND017:%.*]] = icmp ult ptr [[TMP1]], [[SCEVGEP16]]
 ; CHECK-NEXT:    [[BOUND118:%.*]] = icmp ult ptr [[ARRAYIDX_I_I62]], [[SCEVGEP15]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT19:%.*]] = and i1 [[BOUND017]], [[BOUND118]]
-; CHECK-NEXT:    br i1 [[FOUND_CONFLICT19]], label [[SCALAR_PH21]], label [[VECTOR_PH23:%.*]]
-; CHECK:       vector.ph23:
+; CHECK-NEXT:    br i1 [[FOUND_CONFLICT19]], label [[SCALAR_PH22]], label [[VECTOR_PH24:%.*]]
+; CHECK:       vector.ph24:
 ; CHECK-NEXT:    [[N_MOD_VF24:%.*]] = urem i64 [[TMP3]], 4
 ; CHECK-NEXT:    [[N_VEC25:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF24]]
-; CHECK-NEXT:    br label [[VECTOR_BODY26:%.*]]
-; CHECK:       vector.body26:
-; CHECK-NEXT:    [[INDEX29:%.*]] = phi i64 [ 0, [[VECTOR_PH23]] ], [ [[INDEX_NEXT29:%.*]], [[VECTOR_BODY26]] ]
+; CHECK-NEXT:    br label [[VECTOR_BODY27:%.*]]
+; CHECK:       vector.body27:
+; CHECK-NEXT:    [[INDEX29:%.*]] = phi i64 [ 0, [[VECTOR_PH24]] ], [ [[INDEX_NEXT29:%.*]], [[VECTOR_BODY27]] ]
 ; CHECK-NEXT:    store i32 0, ptr [[TMP1]], align 4, !alias.scope !10, !noalias !13
 ; CHECK-NEXT:    [[INDEX_NEXT29]] = add nuw i64 [[INDEX29]], 4
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT29]], [[N_VEC25]]
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK20:%.*]], label [[VECTOR_BODY26]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK:       middle.block20:
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK21:%.*]], label [[VECTOR_BODY27]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK:       middle.block21:
 ; CHECK-NEXT:    [[CMP_N27:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC25]]
-; CHECK-NEXT:    br i1 [[CMP_N27]], label [[LOOP_CLEANUP:%.*]], label [[SCALAR_PH21]]
-; CHECK:       scalar.ph21:
-; CHECK-NEXT:    [[BC_RESUME_VAL26:%.*]] = phi i64 [ [[N_VEC25]], [[MIDDLE_BLOCK20]] ], [ 0, [[VECTOR_MEMCHECK14]] ], [ 0, [[LOOP_3_LR_PH]] ]
+; CHECK-NEXT:    br i1 [[CMP_N27]], label [[LOOP_CLEANUP:%.*]], label [[SCALAR_PH22]]
+; CHECK:       scalar.ph22:
+; CHECK-NEXT:    [[BC_RESUME_VAL26:%.*]] = phi i64 [ [[N_VEC25]], [[MIDDLE_BLOCK21]] ], [ 0, [[VECTOR_MEMCHECK15]] ], [ 0, [[LOOP_3_LR_PH]] ]
 ; CHECK-NEXT:    br label [[LOOP_3:%.*]]
 ; CHECK:       loop.2:
 ; CHECK-NEXT:    [[IV846:%.*]] = phi i64 [ [[IV_NEXT85:%.*]], [[LOOP_2]] ], [ [[BC_RESUME_VAL13]], [[SCALAR_PH5]] ]
@@ -105,7 +105,7 @@ define void @reduced(ptr %0, ptr %1, i64 %iv, ptr %2, i64 %iv76, i64 %iv93) {
 ; CHECK-NEXT:    [[EXITCOND92_NOT:%.*]] = icmp eq i64 [[IV846]], [[IV]]
 ; CHECK-NEXT:    br i1 [[EXITCOND92_NOT]], label [[LOOP_3_LR_PH]], label [[LOOP_2]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK:       loop.3:
-; CHECK-NEXT:    [[IV932:%.*]] = phi i64 [ [[BC_RESUME_VAL26]], [[SCALAR_PH21]] ], [ [[IV_NEXT94:%.*]], [[LOOP_3]] ]
+; CHECK-NEXT:    [[IV932:%.*]] = phi i64 [ [[BC_RESUME_VAL26]], [[SCALAR_PH22]] ], [ [[IV_NEXT94:%.*]], [[LOOP_3]] ]
 ; CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX_I_I62]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX_I_I653:%.*]] = getelementptr i32, ptr [[TMP2:%.*]], i64 [[IV93:%.*]]
 ; CHECK-NEXT:    store i32 0, ptr [[TMP1]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/pr66616.ll b/llvm/test/Transforms/LoopVectorize/pr66616.ll
index 50e18070a5c3f..24b9441749ee4 100644
--- a/llvm/test/Transforms/LoopVectorize/pr66616.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr66616.ll
@@ -46,7 +46,7 @@ define void @pr66616(ptr %ptr) {
 ; CHECK-NEXT:    [[IND_END:%.*]] = add i32 [[ADD3_LCSSA]], [[DOTCAST]]
 ; CHECK-NEXT:    [[IND_END5:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[N_VEC]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY7:%.*]]
-; CHECK:       vector.body5:
+; CHECK:       vector.body4:
 ; CHECK-NEXT:    [[INDEX8:%.*]] = phi i64 [ 0, [[VECTOR_PH3]] ], [ [[INDEX_NEXT9:%.*]], [[VECTOR_BODY7]] ]
 ; CHECK-NEXT:    [[INDEX_NEXT9]] = add nuw i64 [[INDEX8]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-align.ll b/llvm/test/Transforms/LoopVectorize/reduction-align.ll
index 69e3e07ed3139..3216c9233ea21 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-align.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-align.ll
@@ -27,7 +27,7 @@ define void @fn(ptr %hbuf, ptr %ref, i32 %height) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[REF]], align 1, !alias.scope !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[REF]], align 1, !alias.scope [[META0:![0-9]+]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP1]] = add <4 x i16> [[BROADCAST_SPLAT]], [[VEC_PHI]]
@@ -36,12 +36,12 @@ define void @fn(ptr %hbuf, ptr %ref, i32 %height) {
 ; CHECK-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[TMP1]])
-; CHECK-NEXT:    store i16 [[TMP3]], ptr [[HBUF]], align 1
+; CHECK-NEXT:    store i16 [[TMP3]], ptr [[HBUF]], align 1, !alias.scope [[META6:![0-9]+]], !noalias [[META0]]
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[HEIGHT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[TMP4:%.*]] = phi i16 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
@@ -51,7 +51,7 @@ define void @fn(ptr %hbuf, ptr %ref, i32 %height) {
 ; CHECK-NEXT:    store i16 [[ADD]], ptr [[HBUF]], align 1
 ; CHECK-NEXT:    [[INC]] = add i32 [[I]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[HEIGHT]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       for.end.loopexit:
 ; CHECK-NEXT:    br label [[FOR_END]]
 ; CHECK:       for.end:
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll
index 240421341626a..ad2f9c608f204 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll
@@ -733,8 +733,8 @@ define i32 @cond-uncond(ptr noalias %src1, ptr noalias %src2, ptr noalias %cond,
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP29]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[RDX1:%.*]] = phi i32 [ [[ADD2:%.*]], [[IF_END:%.*]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
@@ -897,8 +897,8 @@ define float @cond_cond(ptr noalias %src1, ptr noalias %src2, ptr noalias %cond,
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP49]], [[MIDDLE_BLOCK]] ], [ 2.000000e+00, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[RDX1:%.*]] = phi float [ [[RES:%.*]], [[FOR_INC:%.*]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
@@ -1039,8 +1039,8 @@ define i32 @uncond_cond(ptr noalias %src1, ptr noalias %src2, ptr noalias %cond,
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP29]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[RDX:%.*]] = phi i32 [ [[RES:%.*]], [[FOR_INC:%.*]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
@@ -1168,8 +1168,8 @@ define i32 @uncond_cond_uncond(ptr noalias %src1, ptr noalias %src2, ptr noalias
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP30]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[RDX:%.*]] = phi i32 [ [[ADD3:%.*]], [[IF_END:%.*]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll b/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll
index 94fce866785db..f136b0e2e0b31 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll
@@ -185,10 +185,10 @@ define void @add_unique_ind32(ptr noalias nocapture %a, i64 %n) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
-; CHECK-NEXT:    [[IND_END:%.*]] = shl i32 [[DOTCAST]], 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 2
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT:    [[IND_END:%.*]] = shl i32 [[DOTCAST]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = call  @llvm.stepvector.nxv4i32()
 ; CHECK-NEXT:    [[TMP7:%.*]] = shl  [[TMP6]], splat (i32 1)
 ; CHECK-NEXT:    [[TMP8:%.*]] = trunc i64 [[TMP5]] to i32
@@ -262,11 +262,11 @@ define void @add_unique_indf32(ptr noalias nocapture %a, i64 %n) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 2
 ; CHECK-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
 ; CHECK-NEXT:    [[TMP4:%.*]] = fmul float [[DOTCAST]], 2.000000e+00
 ; CHECK-NEXT:    [[IND_END:%.*]] = fadd float [[TMP4]], 0.000000e+00
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 2
 ; CHECK-NEXT:    [[TMP7:%.*]] = call  @llvm.stepvector.nxv4i32()
 ; CHECK-NEXT:    [[TMP8:%.*]] = uitofp  [[TMP7]] to 
 ; CHECK-NEXT:    [[TMP9:%.*]] = fmul  [[TMP8]], splat (float 2.000000e+00)
diff --git a/llvm/test/Transforms/LoopVectorize/select-reduction.ll b/llvm/test/Transforms/LoopVectorize/select-reduction.ll
index 5e28192997695..836115f381382 100644
--- a/llvm/test/Transforms/LoopVectorize/select-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-reduction.ll
@@ -17,8 +17,8 @@ define i32 @test(i64 %N, i32 %x) {
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[EXTRA_ITER]], 3
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = sub i64 [[EXTRA_ITER]], [[N_VEC]]
 ; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[EXTRA_ITER]], 1
+; CHECK-NEXT:    [[IND_END:%.*]] = sub i64 [[EXTRA_ITER]], [[N_VEC]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -39,8 +39,8 @@ define i32 @test(i64 %N, i32 %x) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP3]])
 ; CHECK-NEXT:    br i1 true, label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[EXTRA_ITER]], [[LOOP_PREHEADER]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[EXTRA_ITER]], [[LOOP_PREHEADER]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[NEXT:%.*]] = phi i32 [ [[SEL:%.*]], [[LOOP]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/store-reduction-results-in-tail-folded-loop.ll b/llvm/test/Transforms/LoopVectorize/store-reduction-results-in-tail-folded-loop.ll
index 40b007eff8ff8..57bc7b8337249 100644
--- a/llvm/test/Transforms/LoopVectorize/store-reduction-results-in-tail-folded-loop.ll
+++ b/llvm/test/Transforms/LoopVectorize/store-reduction-results-in-tail-folded-loop.ll
@@ -19,8 +19,8 @@ define void @pr75298_store_reduction_value_in_folded_loop(i64 %iv.start) optsize
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[IV_START]], [[N_VEC]]
 ; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[IV_START]], [[N_VEC]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll b/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll
index 1a8f29e672f1a..b427b43cdb133 100644
--- a/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll
@@ -47,10 +47,11 @@ define i64 @multi_exiting_to_different_exits_live_in_exit_values() {
 ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb:
-; CHECK-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:      No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb:
@@ -129,10 +130,11 @@ define i64 @multi_exiting_to_same_exit_live_in_exit_values() {
 ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb:
-; CHECK-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:      No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb:
@@ -204,10 +206,11 @@ define i64 @multi_exiting_to_same_exit_live_in_exit_values_2() {
 ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb:
-; CHECK-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:      No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll b/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll
index 95855e84c46e7..be1eb78cab607 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll
@@ -35,10 +35,11 @@ define void @iv_no_binary_op_in_descriptor(i1 %c, ptr %dst) {
 ; CHECK-NEXT:  Successor(s): ir-bb, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  scalar.ph:
+; CHECK-NEXT:    EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VEC_TC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb:
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next.p, %loop.latch ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next.p, %loop.latch ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:         IR   %iv.next = add i64 %iv, 1
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll
index 484e1ea8de0d2..dd3b50b3e060c 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll
@@ -14,7 +14,7 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) {
 ; CHECK-NEXT: ir-bb:
 ; CHECK-NEXT:   IR %n.mod.vf = urem i64 %0, 2
 ; CHECK-NEXT:   IR %n.vec = sub i64 %0, %n.mod.vf
-; CHECK-NEXT:   IR %ind.end = getelementptr i8, ptr %start, i64 %n.vec
+; CHECK-NEXT:   vp<[[END:%.+]]> = DERIVED-IV ir<%start> + ir<%n.vec> * ir<1>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  vector loop: {
@@ -94,7 +94,7 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) {
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb:
-; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi ir<%ind.end>, ir<%start>
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[END]]>, ir<%start>
 ; CHECK-NEXT: Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
index f07d1af47af02..d70c874499cb7 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
@@ -19,6 +19,8 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
 ; CHECK-NEXT: Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   vp<[[END1:%.+]]> = DERIVED-IV ir<%and> + vp<[[VTC]]> * ir<-1>
+; CHECK-NEXT:   vp<[[END2:%.+]]> = DERIVED-IV ir<%A> + vp<[[VTC]]> * ir<1>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  vector loop: {
@@ -43,11 +45,13 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
 ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME1:%.+]]> = resume-phi vp<[[END1]]>, ir<%and>
+; CHECK-NEXT:   EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi vp<[[END2]]>, ir<%A>
 ; CHECK-NEXT: Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb:
-; CHECK-NEXT:   IR   %iv = phi i64 [ %and, %entry ], [ %iv.next, %loop ]
-; CHECK-NEXT:   IR   %p.src = phi ptr [ %A, %entry ], [ %p.src.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ %and, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME1]]> from scalar.ph)
+; CHECK-NEXT:   IR   %p.src = phi ptr [ %A, %entry ], [ %p.src.next, %loop ] (extra operand: vp<[[RESUME2]]>.1 from scalar.ph)
 ; CHECK:        IR   %cmp = icmp eq i64 %iv.next, 0
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -69,8 +73,8 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
 ; CHECK-NEXT: ir-bb:
 ; CHECK-NEXT:  IR   %n.mod.vf = urem i64 %and, 16
 ; CHECK-NEXT:  IR   %n.vec = sub i64 %and, %n.mod.vf
-; CHECK-NEXT:  IR   %ind.end = sub i64 %and, %n.vec
-; CHECK-NEXT:  IR   %ind.end1 = getelementptr i8, ptr %A, i64 %n.vec
+; CHECK-NEXT:  vp<[[END1:%.+]]> = DERIVED-IV ir<%and> + ir<[[VTC]]> * ir<-1>
+; CHECK-NEXT:  vp<[[END2:%.+]]> = DERIVED-IV ir<%A> + ir<[[VTC]]> * ir<1>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  vector loop: {
@@ -103,8 +107,8 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb:
-; CHECK-NEXT:   EMIT vp<[[RESUME1:%.+]]> = resume-phi ir<%ind.end>, ir<%and>
-; CHECK-NEXT:   EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi ir<%ind.end1>, ir<%A>
+; CHECK-NEXT:   EMIT vp<[[RESUME1:%.+]]> = resume-phi vp<[[END1]]>, ir<%and>
+; CHECK-NEXT:   EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi vp<[[END2]]>, ir<%A>
 ; CHECK-NEXT: Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
index cc2bd4e127447..5c09ce22cc8fb 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
@@ -42,10 +42,11 @@ define void @print_call_and_memory(i64 %n, ptr noalias %y, ptr noalias %x) nounw
 ; CHECK-NEXT:  Successor(s): ir-bb, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  scalar.ph
+; CHECK-NEXT:    EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb:
-; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %for.body.preheader ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %for.body.preheader ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %exitcond = icmp eq i64 %iv.next, %n
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
@@ -112,10 +113,11 @@ define void @print_widen_gep_and_select(i64 %n, ptr noalias %y, ptr noalias %x,
 ; CHECK-NEXT:  Successor(s): ir-bb, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  scalar.ph
+; CHECK-NEXT:    EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb:
-; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %for.body.preheader ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %for.body.preheader ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %exitcond = icmp eq i64 %iv.next, %n
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
@@ -180,11 +182,12 @@ define float @print_reduction(i64 %n, ptr noalias %y) {
 ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00>
 ; CHECK-NEXT:  Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb:
-; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %exitcond = icmp eq i64 %iv.next, %n
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -246,11 +249,12 @@ define void @print_reduction_with_invariant_store(i64 %n, ptr noalias %y, ptr no
 ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00>
 ; CHECK-NEXT:  Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb:
-; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK-NEXT:    IR   %red = phi float [ %red.next, %for.body ], [ 0.000000e+00, %entry ]
 ; CHECK:         IR   %exitcond = icmp eq i64 %iv.next, %n
 ; CHECK-NEXT: No successors
@@ -332,10 +336,11 @@ define void @print_replicate_predicated_phi(i64 %n, ptr %x) {
 ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb:
-; CHECK-NEXT:    IR   %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+; CHECK-NEXT:    IR   %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK-NEXT:    IR   %cmp = icmp ult i64 %i, 5
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -381,6 +386,7 @@ define void @print_interleave_groups(i32 %C, i32 %D) {
 ; CHECK-NEXT: Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   vp<[[IV_END:%.+]]> = DERIVED-IV ir<0> + vp<[[VTC]]> * ir<4>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  vector loop: {
@@ -412,10 +418,11 @@ define void @print_interleave_groups(i32 %C, i32 %D) {
 ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[IV_END]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb:
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %cmp = icmp slt i64 %iv.next, 1024
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -494,12 +501,13 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) {
 ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00>
 ; CHECK-NEXT:  Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb:
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
-; CHECK-NEXT:    IR   %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %for.body ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
+; CHECK-NEXT:    IR   %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %for.body ] (extra operand: vp<[[RED_RESUME]]> from scalar.ph)
 ; CHECK:         IR   %exitcond.not = icmp eq i64 %iv.next, %n
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -588,10 +596,11 @@ define void @debug_loc_vpinstruction(ptr nocapture %asd, ptr nocapture %bsd) !db
 ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb:
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %if.end ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %if.end ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %cmp1 = icmp slt i32 %lsd, 100
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -648,6 +657,7 @@ define void @print_expand_scev(i64 %y, ptr %ptr) {
 ; CHECK-NEXT: Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   vp<[[IV_END:%.+]]> = DERIVED-IV ir<0> + vp<[[VTC]]> * vp<[[EXP_SCEV]]>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  vector loop: {
@@ -671,10 +681,11 @@ define void @print_expand_scev(i64 %y, ptr %ptr) {
 ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[IV_END]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb:
-; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %iv.next = add i64 %iv, %inc
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -738,10 +749,11 @@ define i32 @print_exit_value(ptr %ptr, i32 %off) {
 ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb:
-; CHECK-NEXT:    IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:    IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %ec = icmp eq i32 %iv.next, 1000
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -805,6 +817,7 @@ define void @print_fast_math_flags(i64 %n, ptr noalias %y, ptr noalias %x, ptr %
 ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb:
@@ -873,10 +886,11 @@ define void @print_exact_flags(i64 %n, ptr noalias %x) {
 ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb:
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %exitcond = icmp eq i64 %iv.next, %n
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -961,10 +975,11 @@ define void @print_call_flags(ptr readonly %src, ptr noalias %dest, i64 %n) {
 ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb:
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.loop ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %ifcond = fcmp oeq float %ld.value, 5.0
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -1038,10 +1053,11 @@ define void @print_disjoint_flags(i64 %n, ptr noalias %x) {
 ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb:
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %exitcond = icmp eq i64 %iv.next, %n
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -1152,11 +1168,12 @@ define i16 @print_first_order_recurrence_and_result(ptr %ptr) {
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
 ; CHECK-NEXT:   EMIT vp<[[RESUME_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<22>
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb:
 ; CHECK-NEXT:    IR   %for.1 = phi i16 [ 22, %entry ], [ %for.1.next, %loop ] (extra operand: vp<[[RESUME_P]]> from scalar.ph)
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %exitcond.not = icmp eq i64 %iv.next, 1000
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll
index a939b1e923a91..b6391e0457697 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll
@@ -16,6 +16,7 @@ define void @sink_with_sideeffects(i1 %c, ptr %ptr) {
 ; CHECK-NEXT: Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   vp<[[END:%.+]]> = DERIVED-IV ir<0> + vp<[[VEC_TC]]> * ir<-1>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  vector loop: {
@@ -53,11 +54,13 @@ define void @sink_with_sideeffects(i1 %c, ptr %ptr) {
 ; CHECK-NEXT:  Successor(s): ir-bb, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  scalar.ph:
+; CHECK-NEXT:    EMIT vp<[[RESUME1:%.+]]> = resume-phi vp<[[VEC_TC]]>, ir<0>
+; CHECK-NEXT:    EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi vp<[[END]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb:
-; CHECK-NEXT:    IR   %tmp0 = phi i64 [ %tmp6, %for.inc ], [ 0, %entry ]
-; CHECK-NEXT:    IR   %tmp1 = phi i64 [ %tmp7, %for.inc ], [ 0, %entry ]
+; CHECK-NEXT:    IR   %tmp0 = phi i64 [ %tmp6, %for.inc ], [ 0, %entry ] (extra operand: vp<[[RESUME1]]> from scalar.ph)
+; CHECK-NEXT:    IR   %tmp1 = phi i64 [ %tmp7, %for.inc ], [ 0, %entry ] (extra operand: vp<[[RESUME2]]>.1 from scalar.ph)
 ; CHECK:         IR   %tmp5 = trunc i32 %tmp4 to i8
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
index 534345152cb56..aa05bb153966e 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
@@ -262,6 +262,7 @@ define void @uniform_gep(i64 %k, ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT: Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   vp<[[END:%.+]]> = DERIVED-IV ir<21> + vp<[[VEC_TC]]> * ir<1>
 ; CHECK-NEXT:   CLONE ir<%gep.A.uniform> = getelementptr inbounds ir<%A>, ir<0>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
@@ -1046,6 +1047,7 @@ define void @merge_with_dead_gep_between_regions(i32 %n, ptr noalias %src, ptr n
 ; CHECK-NEXT: Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   vp<[[END:%.+]]> = DERIVED-IV ir<%n> + vp<[[VEC_TC]]> * ir<-1>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  vector loop: {
@@ -1086,10 +1088,11 @@ define void @merge_with_dead_gep_between_regions(i32 %n, ptr noalias %src, ptr n
 ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[END]]>, ir<%n>
 ; CHECK-NEXT: Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb:
-; CHECK-NEXT:   IR   %iv = phi i32 [ %n, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i32 [ %n, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK-NEXT:   IR   %iv.next = add nsw i32 %iv, -1
 ; CHECK-NEXT:   IR   %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv
 ; CHECK-NEXT:   IR   %l = load i32, ptr %gep.src, align 16
@@ -1134,6 +1137,7 @@ define void @ptr_induction_remove_dead_recipe(ptr %start, ptr %end) {
 ; CHECK-NEXT: Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   vp<[[END:%.+]]> = DERIVED-IV ir<%start> + vp<[[VEC_TC]]> * ir<-1>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  vector loop: {
@@ -1177,10 +1181,11 @@ define void @ptr_induction_remove_dead_recipe(ptr %start, ptr %end) {
 ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[END]]>, ir<%start>
 ; CHECK-NEXT: Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb:
-; CHECK-NEXT:   IR   %ptr.iv = phi ptr [ %start, %entry ], [ %ptr.iv.next, %loop.latch ]
+; CHECK-NEXT:   IR   %ptr.iv = phi ptr [ %start, %entry ], [ %ptr.iv.next, %loop.latch ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK-NEXT:   IR   %ptr.iv.next = getelementptr inbounds i8, ptr %ptr.iv, i64 -1
 ; CHECK-NEXT:   IR   %l = load i8, ptr %ptr.iv.next, align 1
 ; CHECK-NEXT:   IR   %c.1 = icmp eq i8 %l, 0
diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
index ef651010d6a90..66a0771c8d373 100644
--- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
+++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
@@ -15,7 +15,7 @@ define void @arm_mult_q15(ptr %pSrcA, ptr %pSrcB, ptr noalias %pDst, i32 %blockS
 ; CHECK-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
 ; CHECK:       while.body.preheader:
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[BLOCKSIZE]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[WHILE_BODY_PREHEADER18:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[WHILE_BODY_PREHEADER15:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[BLOCKSIZE]], -8
 ; CHECK-NEXT:    [[IND_END:%.*]] = and i32 [[BLOCKSIZE]], 7
@@ -48,18 +48,18 @@ define void @arm_mult_q15(ptr %pSrcA, ptr %pSrcB, ptr noalias %pDst, i32 %blockS
 ; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[BLOCKSIZE]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END]], label [[WHILE_BODY_PREHEADER18]]
-; CHECK:       while.body.preheader18:
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END]], label [[WHILE_BODY_PREHEADER15]]
+; CHECK:       while.body.preheader15:
 ; CHECK-NEXT:    [[BLKCNT_06_PH:%.*]] = phi i32 [ [[BLOCKSIZE]], [[WHILE_BODY_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    [[PSRCA_ADDR_05_PH:%.*]] = phi ptr [ [[PSRCA]], [[WHILE_BODY_PREHEADER]] ], [ [[IND_END7]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    [[PDST_ADDR_04_PH:%.*]] = phi ptr [ [[PDST]], [[WHILE_BODY_PREHEADER]] ], [ [[IND_END9]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    [[PSRCB_ADDR_03_PH:%.*]] = phi ptr [ [[PSRCB]], [[WHILE_BODY_PREHEADER]] ], [ [[IND_END11]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
 ; CHECK:       while.body:
-; CHECK-NEXT:    [[BLKCNT_06:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BLKCNT_06_PH]], [[WHILE_BODY_PREHEADER18]] ]
-; CHECK-NEXT:    [[PSRCA_ADDR_05:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[PSRCA_ADDR_05_PH]], [[WHILE_BODY_PREHEADER18]] ]
-; CHECK-NEXT:    [[PDST_ADDR_04:%.*]] = phi ptr [ [[INCDEC_PTR4:%.*]], [[WHILE_BODY]] ], [ [[PDST_ADDR_04_PH]], [[WHILE_BODY_PREHEADER18]] ]
-; CHECK-NEXT:    [[PSRCB_ADDR_03:%.*]] = phi ptr [ [[INCDEC_PTR1:%.*]], [[WHILE_BODY]] ], [ [[PSRCB_ADDR_03_PH]], [[WHILE_BODY_PREHEADER18]] ]
+; CHECK-NEXT:    [[BLKCNT_06:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BLKCNT_06_PH]], [[WHILE_BODY_PREHEADER15]] ]
+; CHECK-NEXT:    [[PSRCA_ADDR_05:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[PSRCA_ADDR_05_PH]], [[WHILE_BODY_PREHEADER15]] ]
+; CHECK-NEXT:    [[PDST_ADDR_04:%.*]] = phi ptr [ [[INCDEC_PTR4:%.*]], [[WHILE_BODY]] ], [ [[PDST_ADDR_04_PH]], [[WHILE_BODY_PREHEADER15]] ]
+; CHECK-NEXT:    [[PSRCB_ADDR_03:%.*]] = phi ptr [ [[INCDEC_PTR1:%.*]], [[WHILE_BODY]] ], [ [[PSRCB_ADDR_03_PH]], [[WHILE_BODY_PREHEADER15]] ]
 ; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds nuw i8, ptr [[PSRCA_ADDR_05]], i32 2
 ; CHECK-NEXT:    [[TMP10:%.*]] = load i16, ptr [[PSRCA_ADDR_05]], align 2
 ; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP10]] to i32

From d2e71c92b86895781172c3320e6ce828b1df5127 Mon Sep 17 00:00:00 2001
From: Lukas Bergdoll <lukas.bergdoll@gmail.com>
Date: Sun, 29 Dec 2024 20:55:44 +0100
Subject: [PATCH 159/567] [libc] Improve qsort (#120450)

---
 libc/src/stdlib/heap_sort.h                   |  12 +-
 libc/src/stdlib/qsort.cpp                     |  10 +-
 libc/src/stdlib/qsort_data.h                  | 171 +++++++++------
 libc/src/stdlib/qsort_pivot.h                 |  85 ++++++++
 libc/src/stdlib/qsort_r.cpp                   |  11 +-
 libc/src/stdlib/qsort_util.h                  |  47 +++-
 libc/src/stdlib/quick_sort.h                  | 203 +++++++++++++-----
 libc/test/src/stdlib/CMakeLists.txt           |  18 +-
 libc/test/src/stdlib/SortingTest.h            | 199 +++++++++--------
 libc/test/src/stdlib/heap_sort_test.cpp       |  18 +-
 libc/test/src/stdlib/qsort_r_test.cpp         |   4 +-
 libc/test/src/stdlib/qsort_test.cpp           |  17 --
 libc/test/src/stdlib/quick_sort_test.cpp      |  19 +-
 .../libc/test/src/stdlib/BUILD.bazel          |  16 +-
 14 files changed, 539 insertions(+), 291 deletions(-)
 create mode 100644 libc/src/stdlib/qsort_pivot.h
 delete mode 100644 libc/test/src/stdlib/qsort_test.cpp

diff --git a/libc/src/stdlib/heap_sort.h b/libc/src/stdlib/heap_sort.h
index ccb9ec5f82149..b9699776df89c 100644
--- a/libc/src/stdlib/heap_sort.h
+++ b/libc/src/stdlib/heap_sort.h
@@ -18,11 +18,12 @@ namespace internal {
 // A simple in-place heapsort implementation.
 // Follow the implementation in https://en.wikipedia.org/wiki/Heapsort.
 
-LIBC_INLINE void heap_sort(const Array &array) {
-  size_t end = array.size();
+template <typename A, typename F>
+LIBC_INLINE void heap_sort(const A &array, const F &is_less) {
+  size_t end = array.len();
   size_t start = end / 2;
 
-  auto left_child = [](size_t i) -> size_t { return 2 * i + 1; };
+  const auto left_child = [](size_t i) -> size_t { return 2 * i + 1; };
 
   while (end > 1) {
     if (start > 0) {
@@ -40,12 +41,11 @@ LIBC_INLINE void heap_sort(const Array &array) {
     while (left_child(root) < end) {
       size_t child = left_child(root);
       // If there are two children, set child to the greater.
-      if (child + 1 < end &&
-          array.elem_compare(child, array.get(child + 1)) < 0)
+      if ((child + 1 < end) && is_less(array.get(child), array.get(child + 1)))
         ++child;
 
       // If the root is less than the greater child
-      if (array.elem_compare(root, array.get(child)) >= 0)
+      if (!is_less(array.get(root), array.get(child)))
         break;
 
       // Swap the root with the greater child and continue sifting down.
diff --git a/libc/src/stdlib/qsort.cpp b/libc/src/stdlib/qsort.cpp
index 65a63c239f5c0..0bf5fc7980527 100644
--- a/libc/src/stdlib/qsort.cpp
+++ b/libc/src/stdlib/qsort.cpp
@@ -18,14 +18,12 @@ namespace LIBC_NAMESPACE_DECL {
 LLVM_LIBC_FUNCTION(void, qsort,
                    (void *array, size_t array_size, size_t elem_size,
                     int (*compare)(const void *, const void *))) {
-  if (array == nullptr || array_size == 0 || elem_size == 0)
-    return;
-  internal::Comparator c(compare);
 
-  auto arr = internal::Array(reinterpret_cast<uint8_t *>(array), array_size,
-                             elem_size, c);
+  const auto is_less = [compare](const void *a, const void *b) -> bool {
+    return compare(a, b) < 0;
+  };
 
-  internal::sort(arr);
+  internal::unstable_sort(array, array_size, elem_size, is_less);
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdlib/qsort_data.h b/libc/src/stdlib/qsort_data.h
index c529d55ca46ff..aa6d9bbc123de 100644
--- a/libc/src/stdlib/qsort_data.h
+++ b/libc/src/stdlib/qsort_data.h
@@ -17,91 +17,122 @@
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 
-using Compare = int(const void *, const void *);
-using CompareWithState = int(const void *, const void *, void *);
-
-enum class CompType { COMPARE, COMPARE_WITH_STATE };
-
-struct Comparator {
-  union {
-    Compare *comp_func;
-    CompareWithState *comp_func_r;
-  };
-  const CompType comp_type;
-
-  void *arg;
-
-  Comparator(Compare *func)
-      : comp_func(func), comp_type(CompType::COMPARE), arg(nullptr) {}
-
-  Comparator(CompareWithState *func, void *arg_val)
-      : comp_func_r(func), comp_type(CompType::COMPARE_WITH_STATE),
-        arg(arg_val) {}
-
-#if defined(__clang__)
-  // Recent upstream changes to -fsanitize=function find more instances of
-  // function type mismatches. One case is with the comparator passed to this
-  // class. Libraries will tend to pass comparators that take pointers to
-  // varying types while this comparator expects to accept const void pointers.
-  // Ideally those tools would pass a function that strictly accepts const
-  // void*s to avoid UB, or would use qsort_r to pass their own comparator.
-  [[clang::no_sanitize("function")]]
-#endif
-  int comp_vals(const void *a, const void *b) const {
-    if (comp_type == CompType::COMPARE) {
-      return comp_func(a, b);
-    } else {
-      return comp_func_r(a, b, arg);
+class ArrayGenericSize {
+  cpp::byte *array_base;
+  size_t array_len;
+  size_t elem_size;
+
+  LIBC_INLINE cpp::byte *get_internal(size_t i) const {
+    return array_base + (i * elem_size);
+  }
+
+public:
+  LIBC_INLINE ArrayGenericSize(void *a, size_t s, size_t e)
+      : array_base(reinterpret_cast<cpp::byte *>(a)), array_len(s),
+        elem_size(e) {}
+
+  static constexpr bool has_fixed_size() { return false; }
+
+  LIBC_INLINE void *get(size_t i) const { return get_internal(i); }
+
+  LIBC_INLINE void swap(size_t i, size_t j) const {
+    // It's possible to use 8 byte blocks with `uint64_t`, but that
+    // generates more machine code as the remainder loop gets
+    // unrolled, plus 4 byte operations are more likely to be
+    // efficient on a wider variety of hardware. On x86 LLVM tends
+    // to unroll the block loop again into 2 16 byte swaps per
+    // iteration which is another reason that 4 byte blocks yields
+    // good performance even for big types.
+    using block_t = uint32_t;
+    constexpr size_t BLOCK_SIZE = sizeof(block_t);
+
+    alignas(block_t) cpp::byte tmp_block[BLOCK_SIZE];
+
+    cpp::byte *elem_i = get_internal(i);
+    cpp::byte *elem_j = get_internal(j);
+
+    const size_t elem_size_rem = elem_size % BLOCK_SIZE;
+    const cpp::byte *elem_i_block_end = elem_i + (elem_size - elem_size_rem);
+
+    while (elem_i != elem_i_block_end) {
+      __builtin_memcpy(tmp_block, elem_i, BLOCK_SIZE);
+      __builtin_memcpy(elem_i, elem_j, BLOCK_SIZE);
+      __builtin_memcpy(elem_j, tmp_block, BLOCK_SIZE);
+
+      elem_i += BLOCK_SIZE;
+      elem_j += BLOCK_SIZE;
+    }
+
+    for (size_t n = 0; n < elem_size_rem; ++n) {
+      cpp::byte tmp = elem_i[n];
+      elem_i[n] = elem_j[n];
+      elem_j[n] = tmp;
     }
   }
+
+  LIBC_INLINE size_t len() const { return array_len; }
+
+  // Make an Array starting at index |i| and length |s|.
+  LIBC_INLINE ArrayGenericSize make_array(size_t i, size_t s) const {
+    return ArrayGenericSize(get_internal(i), s, elem_size);
+  }
+
+  // Reset this Array to point at a different interval of the same
+  // items starting at index |i|.
+  LIBC_INLINE void reset_bounds(size_t i, size_t s) {
+    array_base = get_internal(i);
+    array_len = s;
+  }
 };
 
-class Array {
-  uint8_t *array;
-  size_t array_size;
-  size_t elem_size;
-  Comparator compare;
+// Having a specialized Array type for sorting that knows at
+// compile-time what the size of the element is, allows for much more
+// efficient swapping and for cheaper offset calculations.
+template <size_t ELEM_SIZE> class ArrayFixedSize {
+  cpp::byte *array_base;
+  size_t array_len;
 
-public:
-  Array(uint8_t *a, size_t s, size_t e, Comparator c)
-      : array(a), array_size(s), elem_size(e), compare(c) {}
-
-  uint8_t *get(size_t i) const { return array + i * elem_size; }
-
-  void swap(size_t i, size_t j) const {
-    uint8_t *elem_i = get(i);
-    uint8_t *elem_j = get(j);
-    for (size_t b = 0; b < elem_size; ++b) {
-      uint8_t temp = elem_i[b];
-      elem_i[b] = elem_j[b];
-      elem_j[b] = temp;
-    }
+  LIBC_INLINE cpp::byte *get_internal(size_t i) const {
+    return array_base + (i * ELEM_SIZE);
   }
 
-  int elem_compare(size_t i, const uint8_t *other) const {
-    // An element must compare equal to itself so we don't need to consult the
-    // user provided comparator.
-    if (get(i) == other)
-      return 0;
-    return compare.comp_vals(get(i), other);
+public:
+  LIBC_INLINE ArrayFixedSize(void *a, size_t s)
+      : array_base(reinterpret_cast<cpp::byte *>(a)), array_len(s) {}
+
+  // Beware this function is used a heuristic for cheap to swap types, so
+  // instantiating `ArrayFixedSize` with `ELEM_SIZE > 100` is probably a bad
+  // idea perf wise.
+  static constexpr bool has_fixed_size() { return true; }
+
+  LIBC_INLINE void *get(size_t i) const { return get_internal(i); }
+
+  LIBC_INLINE void swap(size_t i, size_t j) const {
+    alignas(32) cpp::byte tmp[ELEM_SIZE];
+
+    cpp::byte *elem_i = get_internal(i);
+    cpp::byte *elem_j = get_internal(j);
+
+    __builtin_memcpy(tmp, elem_i, ELEM_SIZE);
+    __builtin_memmove(elem_i, elem_j, ELEM_SIZE);
+    __builtin_memcpy(elem_j, tmp, ELEM_SIZE);
   }
 
-  size_t size() const { return array_size; }
+  LIBC_INLINE size_t len() const { return array_len; }
 
-  // Make an Array starting at index |i| and size |s|.
-  LIBC_INLINE Array make_array(size_t i, size_t s) const {
-    return Array(get(i), s, elem_size, compare);
+  // Make an Array starting at index |i| and length |s|.
+  LIBC_INLINE ArrayFixedSize make_array(size_t i, size_t s) const {
+    return ArrayFixedSize(get_internal(i), s);
   }
 
-  // Reset this Array to point at a different interval of the same items.
-  LIBC_INLINE void reset_bounds(uint8_t *a, size_t s) {
-    array = a;
-    array_size = s;
+  // Reset this Array to point at a different interval of the same
+  // items starting at index |i|.
+  LIBC_INLINE void reset_bounds(size_t i, size_t s) {
+    array_base = get_internal(i);
+    array_len = s;
   }
 };
 
-using SortingRoutine = void(const Array &);
-
 } // namespace internal
 } // namespace LIBC_NAMESPACE_DECL
 
diff --git a/libc/src/stdlib/qsort_pivot.h b/libc/src/stdlib/qsort_pivot.h
new file mode 100644
index 0000000000000..b7e1b4294f6d6
--- /dev/null
+++ b/libc/src/stdlib/qsort_pivot.h
@@ -0,0 +1,85 @@
+//===-- Implementation header for qsort utilities ---------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDLIB_QSORT_PIVOT_H
+#define LLVM_LIBC_SRC_STDLIB_QSORT_PIVOT_H
+
+#include <stddef.h>
+
+namespace LIBC_NAMESPACE_DECL {
+namespace internal {
+
+// Recursively select a pseudomedian if above this threshold.
+constexpr size_t PSEUDO_MEDIAN_REC_THRESHOLD = 64;
+
+// Selects a pivot from `array`. Algorithm taken from glidesort by Orson Peters.
+//
+// This chooses a pivot by sampling an adaptive amount of points, approximating
+// the quality of a median of sqrt(n) elements.
+template <typename A, typename F>
+size_t choose_pivot(const A &array, const F &is_less) {
+  const size_t len = array.len();
+
+  if (len < 8) {
+    return 0;
+  }
+
+  const size_t len_div_8 = len / 8;
+
+  const size_t a = 0;             // [0, floor(n/8))
+  const size_t b = len_div_8 * 4; // [4*floor(n/8), 5*floor(n/8))
+  const size_t c = len_div_8 * 7; // [7*floor(n/8), 8*floor(n/8))
+
+  if (len < PSEUDO_MEDIAN_REC_THRESHOLD)
+    return median3(array, a, b, c, is_less);
+  else
+    return median3_rec(array, a, b, c, len_div_8, is_less);
+}
+
+// Calculates an approximate median of 3 elements from sections a, b, c, or
+// recursively from an approximation of each, if they're large enough. By
+// dividing the size of each section by 8 when recursing we have logarithmic
+// recursion depth and overall sample from f(n) = 3*f(n/8) -> f(n) =
+// O(n^(log(3)/log(8))) ~= O(n^0.528) elements.
+template <typename A, typename F>
+size_t median3_rec(const A &array, size_t a, size_t b, size_t c, size_t n,
+                   const F &is_less) {
+  if (n * 8 >= PSEUDO_MEDIAN_REC_THRESHOLD) {
+    const size_t n8 = n / 8;
+    a = median3_rec(array, a, a + (n8 * 4), a + (n8 * 7), n8, is_less);
+    b = median3_rec(array, b, b + (n8 * 4), b + (n8 * 7), n8, is_less);
+    c = median3_rec(array, c, c + (n8 * 4), c + (n8 * 7), n8, is_less);
+  }
+  return median3(array, a, b, c, is_less);
+}
+
+/// Calculates the median of 3 elements.
+template <typename A, typename F>
+size_t median3(const A &array, size_t a, size_t b, size_t c, const F &is_less) {
+  const void *a_ptr = array.get(a);
+  const void *b_ptr = array.get(b);
+  const void *c_ptr = array.get(c);
+
+  const bool x = is_less(a_ptr, b_ptr);
+  const bool y = is_less(a_ptr, c_ptr);
+  if (x == y) {
+    // If x=y=0 then b, c <= a. In this case we want to return max(b, c).
+    // If x=y=1 then a < b, c. In this case we want to return min(b, c).
+    // By toggling the outcome of b < c using XOR x we get this behavior.
+    const bool z = is_less(b_ptr, c_ptr);
+    return z ^ x ? c : b;
+  } else {
+    // Either c <= a < b or b <= a < c, thus a is our median.
+    return a;
+  }
+}
+
+} // namespace internal
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_STDLIB_QSORT_PIVOT_H
diff --git a/libc/src/stdlib/qsort_r.cpp b/libc/src/stdlib/qsort_r.cpp
index bf61a40e84734..4e60998b6a6df 100644
--- a/libc/src/stdlib/qsort_r.cpp
+++ b/libc/src/stdlib/qsort_r.cpp
@@ -19,13 +19,12 @@ LLVM_LIBC_FUNCTION(void, qsort_r,
                    (void *array, size_t array_size, size_t elem_size,
                     int (*compare)(const void *, const void *, void *),
                     void *arg)) {
-  if (array == nullptr || array_size == 0 || elem_size == 0)
-    return;
-  internal::Comparator c(compare, arg);
-  auto arr = internal::Array(reinterpret_cast<uint8_t *>(array), array_size,
-                             elem_size, c);
 
-  internal::sort(arr);
+  const auto is_less = [compare, arg](const void *a, const void *b) -> bool {
+    return compare(a, b, arg) < 0;
+  };
+
+  internal::unstable_sort(array, array_size, elem_size, is_less);
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdlib/qsort_util.h b/libc/src/stdlib/qsort_util.h
index d42adde06d976..7882b829d3274 100644
--- a/libc/src/stdlib/qsort_util.h
+++ b/libc/src/stdlib/qsort_util.h
@@ -27,11 +27,48 @@
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 
-#if LIBC_QSORT_IMPL == LIBC_QSORT_QUICK_SORT
-constexpr auto sort = quick_sort;
-#elif LIBC_QSORT_IMPL == LIBC_QSORT_HEAP_SORT
-constexpr auto sort = heap_sort;
-#endif
+template <bool USE_QUICKSORT, typename F>
+LIBC_INLINE void unstable_sort_impl(void *array, size_t array_len,
+                                    size_t elem_size, const F &is_less) {
+  if (array == nullptr || array_len == 0 || elem_size == 0)
+    return;
+
+  if constexpr (USE_QUICKSORT) {
+    switch (elem_size) {
+    case 4: {
+      auto arr_fixed_size = internal::ArrayFixedSize<4>(array, array_len);
+      quick_sort(arr_fixed_size, is_less);
+      return;
+    }
+    case 8: {
+      auto arr_fixed_size = internal::ArrayFixedSize<8>(array, array_len);
+      quick_sort(arr_fixed_size, is_less);
+      return;
+    }
+    case 16: {
+      auto arr_fixed_size = internal::ArrayFixedSize<16>(array, array_len);
+      quick_sort(arr_fixed_size, is_less);
+      return;
+    }
+    default:
+      auto arr_generic_size =
+          internal::ArrayGenericSize(array, array_len, elem_size);
+      quick_sort(arr_generic_size, is_less);
+      return;
+    }
+  } else {
+    auto arr_generic_size =
+        internal::ArrayGenericSize(array, array_len, elem_size);
+    heap_sort(arr_generic_size, is_less);
+  }
+}
+
+template <typename F>
+LIBC_INLINE void unstable_sort(void *array, size_t array_len, size_t elem_size,
+                               const F &is_less) {
+#define USE_QUICK_SORT ((LIBC_QSORT_IMPL) == (LIBC_QSORT_QUICK_SORT))
+  unstable_sort_impl<USE_QUICK_SORT, F>(array, array_len, elem_size, is_less);
+}
 
 } // namespace internal
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdlib/quick_sort.h b/libc/src/stdlib/quick_sort.h
index 82b90a7d511d9..9ab2830250018 100644
--- a/libc/src/stdlib/quick_sort.h
+++ b/libc/src/stdlib/quick_sort.h
@@ -9,84 +9,175 @@
 #ifndef LLVM_LIBC_SRC_STDLIB_QUICK_SORT_H
 #define LLVM_LIBC_SRC_STDLIB_QUICK_SORT_H
 
-#include "src/__support/macros/attributes.h"
+#include "src/__support/CPP/bit.h"
+#include "src/__support/CPP/cstddef.h"
 #include "src/__support/macros/config.h"
-#include "src/stdlib/qsort_data.h"
+#include "src/stdlib/qsort_pivot.h"
 
 #include <stdint.h>
 
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 
-// A simple quicksort implementation using the Hoare partition scheme.
-LIBC_INLINE size_t partition(const Array &array) {
-  const size_t array_size = array.size();
-  size_t pivot_index = array_size / 2;
-  uint8_t *pivot = array.get(pivot_index);
-  size_t i = 0;
-  size_t j = array_size - 1;
+// Branchless Lomuto partition based on the implementation by Lukas
+// Bergdoll and Orson Peters
+// https://github.com/Voultapher/sort-research-rs/blob/main/writeup/lomcyc_partition/text.md.
+// Simplified to avoid having to stack allocate.
+template <typename A, typename F>
+LIBC_INLINE size_t partition_lomuto_branchless(const A &array,
+                                               const void *pivot,
+                                               const F &is_less) {
+  const size_t array_len = array.len();
+
+  size_t left = 0;
+  size_t right = 0;
+
+  while (right < array_len) {
+    const bool right_is_lt = is_less(array.get(right), pivot);
+    array.swap(left, right);
+    left += static_cast(right_is_lt);
+    right += 1;
+  }
+
+  return left;
+}
+
+// Optimized for large types that are expensive to move. Not optimized
+// for integers. It's possible to use a cyclic permutation here for
+// large types as done in ipnsort but the advantages of this are limited
+// as `is_less` is a small wrapper around a call to a function pointer
+// and won't incur much binary-size overhead. The other reason to use
+// cyclic permutation is to have more efficient swapping, but we don't
+// know the element size so this isn't applicable here either.
+template <typename A, typename F>
+LIBC_INLINE size_t partition_hoare_branchy(const A &array, const void *pivot,
+                                           const F &is_less) {
+  const size_t array_len = array.len();
+
+  size_t left = 0;
+  size_t right = array_len;
 
   while (true) {
-    int compare_i, compare_j;
-
-    while ((compare_i = array.elem_compare(i, pivot)) < 0)
-      ++i;
-    while ((compare_j = array.elem_compare(j, pivot)) > 0)
-      --j;
-
-    // At some point i will crossover j so we will definitely break out of
-    // this while loop.
-    if (i >= j)
-      return j + 1;
-
-    array.swap(i, j);
-
-    // The pivot itself might have got swapped so we will update the pivot.
-    if (i == pivot_index) {
-      pivot = array.get(j);
-      pivot_index = j;
-    } else if (j == pivot_index) {
-      pivot = array.get(i);
-      pivot_index = i;
+    while (left < right && is_less(array.get(left), pivot))
+      ++left;
+
+    while (true) {
+      --right;
+      if (left >= right || is_less(array.get(right), pivot)) {
+        break;
+      }
     }
 
-    if (compare_i == 0 && compare_j == 0) {
-      // If we do not move the pointers, we will end up with an
-      // infinite loop as i and j will be stuck without advancing.
-      ++i;
-      --j;
-    }
+    if (left >= right)
+      break;
+
+    array.swap(left, right);
+    ++left;
+  }
+
+  return left;
+}
+
+template <typename A, typename F>
+LIBC_INLINE size_t partition(const A &array, size_t pivot_index,
+                             const F &is_less) {
+  // Place the pivot at the beginning of the array.
+  if (pivot_index != 0) {
+    array.swap(0, pivot_index);
   }
+
+  const A array_without_pivot = array.make_array(1, array.len() - 1);
+  const void *pivot = array.get(0);
+
+  size_t num_lt;
+  if constexpr (A::has_fixed_size()) {
+    // Branchless Lomuto avoid branch misprediction penalties, but
+    // it also swaps more often which is only faster if the swap is a fast
+    // constant operation.
+    num_lt = partition_lomuto_branchless(array_without_pivot, pivot, is_less);
+  } else {
+    num_lt = partition_hoare_branchy(array_without_pivot, pivot, is_less);
+  }
+
+  // Place the pivot between the two partitions.
+  array.swap(0, num_lt);
+
+  return num_lt;
 }
 
-LIBC_INLINE void quick_sort(Array array) {
+template <typename A, typename F>
+LIBC_INLINE void quick_sort_impl(A &array, const void *ancestor_pivot,
+                                 size_t limit, const F &is_less) {
   while (true) {
-    const size_t array_size = array.size();
-    if (array_size <= 1)
+    const size_t array_len = array.len();
+    if (array_len <= 1)
       return;
-    size_t split_index = partition(array);
-    if (array_size == 2)
-      // The partition operation sorts the two element array.
+
+    // If too many bad pivot choices were made, simply fall back to
+    // heapsort in order to guarantee `O(N x log(N))` worst-case.
+    if (limit == 0) {
+      heap_sort(array, is_less);
       return;
+    }
 
-    // Make Arrays describing the two sublists that still need sorting.
-    Array left = array.make_array(0, split_index);
-    Array right = array.make_array(split_index, array.size() - split_index);
-
-    // Recurse to sort the smaller of the two, and then loop round within this
-    // function to sort the larger. This way, recursive call depth is bounded
-    // by log2 of the total array size, because every recursive call is sorting
-    // a list at most half the length of the one in its caller.
-    if (left.size() < right.size()) {
-      quick_sort(left);
-      array.reset_bounds(right.get(0), right.size());
-    } else {
-      quick_sort(right);
-      array.reset_bounds(left.get(0), left.size());
+    limit -= 1;
+
+    const size_t pivot_index = choose_pivot(array, is_less);
+
+    // If the chosen pivot is equal to the predecessor, then it's the smallest
+    // element in the slice. Partition the slice into elements equal to and
+    // elements greater than the pivot. This case is usually hit when the slice
+    // contains many duplicate elements.
+    if (ancestor_pivot) {
+      if (!is_less(ancestor_pivot, array.get(pivot_index))) {
+        const size_t num_lt =
+            partition(array, pivot_index,
+                      [is_less](const void *a, const void *b) -> bool {
+                        return !is_less(b, a);
+                      });
+
+        // Continue sorting elements greater than the pivot. We know that
+        // `num_lt` contains elements equal to the pivot.
+        array.reset_bounds(num_lt + 1, array.len() - (num_lt + 1));
+        ancestor_pivot = nullptr;
+        continue;
+      }
     }
+
+    size_t split_index = partition(array, pivot_index, is_less);
+
+    if (array_len == 2)
+      // The partition operation sorts the two element array.
+      return;
+
+    // Split the array into `left`, `pivot`, and `right`.
+    A left = array.make_array(0, split_index);
+    const void *pivot = array.get(split_index);
+    const size_t right_start = split_index + 1;
+    A right = array.make_array(right_start, array.len() - right_start);
+
+    // Recurse into the left side. We have a fixed recursion limit,
+    // testing shows no real benefit for recursing into the shorter
+    // side.
+    quick_sort_impl(left, ancestor_pivot, limit, is_less);
+
+    // Continue with the right side.
+    array = right;
+    ancestor_pivot = pivot;
   }
 }
 
+constexpr size_t ilog2(size_t n) { return cpp::bit_width(n) - 1; }
+
+template <typename A, typename F>
+LIBC_INLINE void quick_sort(A &array, const F &is_less) {
+  const void *ancestor_pivot = nullptr;
+  // Limit the number of imbalanced partitions to `2 * floor(log2(len))`.
+  // The binary OR by one is used to eliminate the zero-check in the logarithm.
+  const size_t limit = 2 * ilog2((array.len() | 1));
+  quick_sort_impl(array, ancestor_pivot, limit, is_less);
+}
+
 } // namespace internal
 } // namespace LIBC_NAMESPACE_DECL
 
diff --git a/libc/test/src/stdlib/CMakeLists.txt b/libc/test/src/stdlib/CMakeLists.txt
index 4ca2043ab4c9b..8cc0428632ba3 100644
--- a/libc/test/src/stdlib/CMakeLists.txt
+++ b/libc/test/src/stdlib/CMakeLists.txt
@@ -300,18 +300,6 @@ add_libc_test(
     libc.src.stdlib.bsearch
 )
 
-add_libc_test(
-  quick_sort_test
-  SUITE
-    libc-stdlib-tests
-  SRCS
-    quick_sort_test.cpp
-  HDRS
-    SortingTest.h
-  DEPENDS
-    libc.src.stdlib.qsort_util
-)
-
 add_libc_test(
   heap_sort_test
   SUITE
@@ -321,15 +309,15 @@ add_libc_test(
   HDRS
     SortingTest.h
   DEPENDS
-    libc.src.stdlib.qsort_util
+    libc.src.stdlib.qsort
 )
 
 add_libc_test(
-  qsort_test
+  quick_sort_test
   SUITE
     libc-stdlib-tests
   SRCS
-    qsort_test.cpp
+    quick_sort_test.cpp
   HDRS
     SortingTest.h
   DEPENDS
diff --git a/libc/test/src/stdlib/SortingTest.h b/libc/test/src/stdlib/SortingTest.h
index d34584e5addf0..034c0e4f1fd01 100644
--- a/libc/test/src/stdlib/SortingTest.h
+++ b/libc/test/src/stdlib/SortingTest.h
@@ -7,19 +7,19 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/macros/config.h"
-#include "src/stdlib/qsort_data.h"
+#include "src/stdlib/qsort.h"
 #include "test/UnitTest/Test.h"
 
 class SortingTest : public LIBC_NAMESPACE::testing::Test {
 
-  using Array = LIBC_NAMESPACE::internal::Array;
-  using Comparator = LIBC_NAMESPACE::internal::Comparator;
-  using SortingRoutine = LIBC_NAMESPACE::internal::SortingRoutine;
+  using SortingRoutine = void (*)(void *array, size_t array_len,
+                                  size_t elem_size,
+                                  int (*compare)(const void *, const void *));
 
-public:
   static int int_compare(const void *l, const void *r) {
     int li = *reinterpret_cast<const int *>(l);
     int ri = *reinterpret_cast<const int *>(r);
+
     if (li == ri)
       return 0;
     else if (li > ri)
@@ -28,16 +28,19 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
       return -1;
   }
 
+  static void int_sort(SortingRoutine sort_func, int *array, size_t array_len) {
+    sort_func(reinterpret_cast(array), array_len, sizeof(int),
+              int_compare);
+  }
+
+public:
   void test_sorted_array(SortingRoutine sort_func) {
     int array[25] = {10,   23,   33,   35,   55,   70,    71,   100,  110,
                      123,  133,  135,  155,  170,  171,   1100, 1110, 1123,
                      1133, 1135, 1155, 1170, 1171, 11100, 12310};
-    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
-
-    auto arr = Array(reinterpret_cast(array), ARRAY_SIZE,
-                     sizeof(int), Comparator(int_compare));
+    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
 
-    sort_func(arr);
+    int_sort(sort_func, array, ARRAY_LEN);
 
     ASSERT_LE(array[0], 10);
     ASSERT_LE(array[1], 23);
@@ -69,14 +72,11 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
   void test_reversed_sorted_array(SortingRoutine sort_func) {
     int array[] = {25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13,
                    12, 11, 10, 9,  8,  7,  6,  5,  4,  3,  2,  1};
-    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
+    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
 
-    auto arr = Array(reinterpret_cast(array), ARRAY_SIZE,
-                     sizeof(int), Comparator(int_compare));
+    int_sort(sort_func, array, ARRAY_LEN);
 
-    sort_func(arr);
-
-    for (int i = 0; i < int(ARRAY_SIZE - 1); ++i)
+    for (int i = 0; i < int(ARRAY_LEN - 1); ++i)
       ASSERT_EQ(array[i], i + 1);
   }
 
@@ -84,14 +84,11 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
     int array[] = {100, 100, 100, 100, 100, 100, 100, 100, 100,
                    100, 100, 100, 100, 100, 100, 100, 100, 100,
                    100, 100, 100, 100, 100, 100, 100};
-    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
-
-    auto arr = Array(reinterpret_cast(array), ARRAY_SIZE,
-                     sizeof(int), Comparator(int_compare));
+    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
 
-    sort_func(arr);
+    int_sort(sort_func, array, ARRAY_LEN);
 
-    for (size_t i = 0; i < ARRAY_SIZE; ++i)
+    for (size_t i = 0; i < ARRAY_LEN; ++i)
       ASSERT_EQ(array[i], 100);
   }
 
@@ -99,12 +96,9 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
     int array[25] = {10,  23,  8,    35,   55,   45,  40,  100, 110,
                      123, 90,  80,   70,   60,   171, 11,  1,   -1,
                      -5,  -10, 1155, 1170, 1171, 12,  -100};
-    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
-
-    auto arr = Array(reinterpret_cast(array), ARRAY_SIZE,
-                     sizeof(int), Comparator(int_compare));
+    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
 
-    sort_func(arr);
+    int_sort(sort_func, array, ARRAY_LEN);
 
     ASSERT_EQ(array[0], -100);
     ASSERT_EQ(array[1], -10);
@@ -135,12 +129,9 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
 
   void test_unsorted_array_2(SortingRoutine sort_func) {
     int array[7] = {10, 40, 45, 55, 35, 23, 60};
-    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
+    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
 
-    auto arr = Array(reinterpret_cast(array), ARRAY_SIZE,
-                     sizeof(int), Comparator(int_compare));
-
-    sort_func(arr);
+    int_sort(sort_func, array, ARRAY_LEN);
 
     ASSERT_EQ(array[0], 10);
     ASSERT_EQ(array[1], 23);
@@ -153,12 +144,9 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
 
   void test_unsorted_array_duplicated_1(SortingRoutine sort_func) {
     int array[6] = {10, 10, 20, 20, 5, 5};
-    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
-
-    auto arr = Array(reinterpret_cast(array), ARRAY_SIZE,
-                     sizeof(int), Comparator(int_compare));
+    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
 
-    sort_func(arr);
+    int_sort(sort_func, array, ARRAY_LEN);
 
     ASSERT_EQ(array[0], 5);
     ASSERT_EQ(array[1], 5);
@@ -170,12 +158,9 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
 
   void test_unsorted_array_duplicated_2(SortingRoutine sort_func) {
     int array[10] = {20, 10, 10, 10, 10, 20, 21, 21, 21, 21};
-    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
-
-    auto arr = Array(reinterpret_cast(array), ARRAY_SIZE,
-                     sizeof(int), Comparator(int_compare));
+    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
 
-    sort_func(arr);
+    int_sort(sort_func, array, ARRAY_LEN);
 
     ASSERT_EQ(array[0], 10);
     ASSERT_EQ(array[1], 10);
@@ -191,12 +176,9 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
 
   void test_unsorted_array_duplicated_3(SortingRoutine sort_func) {
     int array[10] = {20, 30, 30, 30, 30, 20, 21, 21, 21, 21};
-    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
+    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
 
-    auto arr = Array(reinterpret_cast(array), ARRAY_SIZE,
-                     sizeof(int), Comparator(int_compare));
-
-    sort_func(arr);
+    int_sort(sort_func, array, ARRAY_LEN);
 
     ASSERT_EQ(array[0], 20);
     ASSERT_EQ(array[1], 20);
@@ -213,12 +195,9 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
   void test_unsorted_three_element_1(SortingRoutine sort_func) {
     int array[3] = {14999024, 0, 3};
 
-    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
-
-    auto arr = Array(reinterpret_cast(array), ARRAY_SIZE,
-                     sizeof(int), Comparator(int_compare));
+    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
 
-    sort_func(arr);
+    int_sort(sort_func, array, ARRAY_LEN);
 
     ASSERT_EQ(array[0], 0);
     ASSERT_EQ(array[1], 3);
@@ -228,12 +207,9 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
   void test_unsorted_three_element_2(SortingRoutine sort_func) {
     int array[3] = {3, 14999024, 0};
 
-    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
-
-    auto arr = Array(reinterpret_cast(array), ARRAY_SIZE,
-                     sizeof(int), Comparator(int_compare));
+    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
 
-    sort_func(arr);
+    int_sort(sort_func, array, ARRAY_LEN);
 
     ASSERT_EQ(array[0], 0);
     ASSERT_EQ(array[1], 3);
@@ -243,12 +219,9 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
   void test_unsorted_three_element_3(SortingRoutine sort_func) {
     int array[3] = {3, 0, 14999024};
 
-    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
+    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
 
-    auto arr = Array(reinterpret_cast(array), ARRAY_SIZE,
-                     sizeof(int), Comparator(int_compare));
-
-    sort_func(arr);
+    int_sort(sort_func, array, ARRAY_LEN);
 
     ASSERT_EQ(array[0], 0);
     ASSERT_EQ(array[1], 3);
@@ -258,12 +231,9 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
   void test_same_three_element(SortingRoutine sort_func) {
     int array[3] = {12345, 12345, 12345};
 
-    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
-
-    auto arr = Array(reinterpret_cast(array), ARRAY_SIZE,
-                     sizeof(int), Comparator(int_compare));
+    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
 
-    sort_func(arr);
+    int_sort(sort_func, array, ARRAY_LEN);
 
     ASSERT_EQ(array[0], 12345);
     ASSERT_EQ(array[1], 12345);
@@ -273,12 +243,9 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
   void test_unsorted_two_element_1(SortingRoutine sort_func) {
     int array[] = {14999024, 0};
 
-    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
-
-    auto arr = Array(reinterpret_cast(array), ARRAY_SIZE,
-                     sizeof(int), Comparator(int_compare));
+    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
 
-    sort_func(arr);
+    int_sort(sort_func, array, ARRAY_LEN);
 
     ASSERT_EQ(array[0], 0);
     ASSERT_EQ(array[1], 14999024);
@@ -287,12 +254,9 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
   void test_unsorted_two_element_2(SortingRoutine sort_func) {
     int array[] = {0, 14999024};
 
-    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
+    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
 
-    auto arr = Array(reinterpret_cast(array), ARRAY_SIZE,
-                     sizeof(int), Comparator(int_compare));
-
-    sort_func(arr);
+    int_sort(sort_func, array, ARRAY_LEN);
 
     ASSERT_EQ(array[0], 0);
     ASSERT_EQ(array[1], 14999024);
@@ -301,12 +265,9 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
   void test_same_two_element(SortingRoutine sort_func) {
     int array[] = {12345, 12345};
 
-    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
-
-    auto arr = Array(reinterpret_cast(array), ARRAY_SIZE,
-                     sizeof(int), Comparator(int_compare));
+    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
 
-    sort_func(arr);
+    int_sort(sort_func, array, ARRAY_LEN);
 
     ASSERT_EQ(array[0], 12345);
     ASSERT_EQ(array[1], 12345);
@@ -315,15 +276,76 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
   void test_single_element(SortingRoutine sort_func) {
     int array[] = {12345};
 
-    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
-
-    auto arr = Array(reinterpret_cast(array), ARRAY_SIZE,
-                     sizeof(int), Comparator(int_compare));
+    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
 
-    sort_func(arr);
+    int_sort(sort_func, array, ARRAY_LEN);
 
     ASSERT_EQ(array[0], 12345);
   }
+
+  void test_different_elem_size(SortingRoutine sort_func) {
+    // Random order of values [0,50) to avoid only testing pre-sorted handling.
+    // Long enough to reach interesting code.
+    constexpr uint8_t ARRAY_INITIAL_VALS[] = {
+        42, 13, 8,  4,  17, 28, 20, 32, 22, 29, 7,  2,  46, 37, 26, 49, 24,
+        38, 10, 18, 40, 36, 47, 15, 11, 48, 44, 33, 1,  5,  16, 35, 39, 41,
+        14, 23, 3,  9,  6,  27, 21, 25, 31, 45, 12, 43, 34, 30, 19, 0};
+
+    constexpr size_t ARRAY_LEN = sizeof(ARRAY_INITIAL_VALS);
+    constexpr size_t MAX_ELEM_SIZE = 150;
+    constexpr size_t BUF_SIZE = ARRAY_LEN * MAX_ELEM_SIZE;
+
+    static_assert(ARRAY_LEN < 256); // so we can encode the values.
+
+    // Minimum alignment to test implementation for bugs related to assuming
+    // incorrect association between alignment and element size.
+    alignas(1) uint8_t buf[BUF_SIZE];
+
+    const auto fill_buf = [&buf](size_t elem_size) {
+      for (size_t i = 0; i < BUF_SIZE; ++i) {
+        buf[i] = 0;
+      }
+
+      for (size_t elem_i = 0, buf_i = 0; elem_i < ARRAY_LEN; ++elem_i) {
+        const uint8_t elem_val = ARRAY_INITIAL_VALS[elem_i];
+        for (size_t elem_byte_i = 0; elem_byte_i < elem_size; ++elem_byte_i) {
+          buf[buf_i] = elem_val;
+          buf_i += 1;
+        }
+      }
+    };
+
+    for (size_t elem_size = 0; elem_size <= MAX_ELEM_SIZE; ++elem_size) {
+      // Fill all bytes with data to ensure mistakes in elem swap are noticed.
+      fill_buf(elem_size);
+
+      sort_func(reinterpret_cast(buf), ARRAY_LEN, elem_size,
+                [](const void *a, const void *b) -> int {
+                  const uint8_t a_val = *reinterpret_cast(a);
+                  const uint8_t b_val = *reinterpret_cast(b);
+
+                  if (a_val < b_val) {
+                    return -1;
+                  } else if (a_val > b_val) {
+                    return 1;
+                  } else {
+                    return 0;
+                  }
+                });
+
+      for (size_t elem_i = 0, buf_i = 0; elem_i < ARRAY_LEN; ++elem_i) {
+        const uint8_t expected_elem_val = static_cast(elem_i);
+
+        for (size_t elem_byte_i = 0; elem_byte_i < elem_size; ++elem_byte_i) {
+          const uint8_t buf_val = buf[buf_i];
+          // Check that every byte in the element has the expected value.
+          ASSERT_EQ(buf_val, expected_elem_val)
+              << "elem_size: " << elem_size << " buf_i: " << buf_i << '\n';
+          buf_i += 1;
+        }
+      }
+    }
+  }
 };
 
 #define LIST_SORTING_TESTS(Name, Func)                                         \
@@ -374,4 +396,7 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
   TEST_F(LlvmLibc##Name##Test, SingleElementArray) {                           \
     test_single_element(Func);                                                 \
   }                                                                            \
+  TEST_F(LlvmLibc##Name##Test, DifferentElemSizeArray) {                       \
+    test_different_elem_size(Func);                                            \
+  }                                                                            \
   static_assert(true)
diff --git a/libc/test/src/stdlib/heap_sort_test.cpp b/libc/test/src/stdlib/heap_sort_test.cpp
index d70e3dc2272be..18d4244506ec2 100644
--- a/libc/test/src/stdlib/heap_sort_test.cpp
+++ b/libc/test/src/stdlib/heap_sort_test.cpp
@@ -7,10 +7,20 @@
 //===----------------------------------------------------------------------===//
 
 #include "SortingTest.h"
-#include "src/stdlib/heap_sort.h"
+#include "src/stdlib/qsort_util.h"
 
-void sort(const LIBC_NAMESPACE::internal::Array &array) {
-  LIBC_NAMESPACE::internal::heap_sort(array);
+void heap_sort(void *array, size_t array_size, size_t elem_size,
+               int (*compare)(const void *, const void *)) {
+
+  constexpr bool USE_QUICKSORT = false;
+
+  const auto is_less = [compare](const void *a,
+                                 const void *b) noexcept -> bool {
+    return compare(a, b) < 0;
+  };
+
+  LIBC_NAMESPACE::internal::unstable_sort_impl(
+      array, array_size, elem_size, is_less);
 }
 
-LIST_SORTING_TESTS(HeapSort, sort);
+LIST_SORTING_TESTS(HeapSort, heap_sort);
diff --git a/libc/test/src/stdlib/qsort_r_test.cpp b/libc/test/src/stdlib/qsort_r_test.cpp
index 6893fdc7b74c8..f18923618ed5e 100644
--- a/libc/test/src/stdlib/qsort_r_test.cpp
+++ b/libc/test/src/stdlib/qsort_r_test.cpp
@@ -62,9 +62,9 @@ TEST(LlvmLibcQsortRTest, SortedArray) {
   ASSERT_LE(array[23], 11100);
   ASSERT_LE(array[24], 12310);
 
-  // This is a sorted list, but there still have to have been at least N
+  // This is a sorted list, but there still have to have been at least N - 1
   // comparisons made.
-  ASSERT_GE(count, ARRAY_SIZE);
+  ASSERT_GE(count, ARRAY_SIZE - 1);
 }
 
 TEST(LlvmLibcQsortRTest, ReverseSortedArray) {
diff --git a/libc/test/src/stdlib/qsort_test.cpp b/libc/test/src/stdlib/qsort_test.cpp
deleted file mode 100644
index 1e921a86fd1fd..0000000000000
--- a/libc/test/src/stdlib/qsort_test.cpp
+++ /dev/null
@@ -1,17 +0,0 @@
-//===-- Unittests for qsort -----------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "SortingTest.h"
-#include "src/stdlib/qsort.h"
-
-void sort(const LIBC_NAMESPACE::internal::Array &array) {
-  LIBC_NAMESPACE::qsort(reinterpret_cast(array.get(0)), array.size(),
-                        sizeof(int), SortingTest::int_compare);
-}
-
-LIST_SORTING_TESTS(Qsort, sort);
diff --git a/libc/test/src/stdlib/quick_sort_test.cpp b/libc/test/src/stdlib/quick_sort_test.cpp
index d6bf77ebfd40d..2832c855370bc 100644
--- a/libc/test/src/stdlib/quick_sort_test.cpp
+++ b/libc/test/src/stdlib/quick_sort_test.cpp
@@ -1,4 +1,4 @@
-//===-- Unittests for quick sort ------------------------------------------===//
+//===-- Unittests for qsort -----------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,10 +7,19 @@
 //===----------------------------------------------------------------------===//
 
 #include "SortingTest.h"
-#include "src/stdlib/quick_sort.h"
+#include "src/stdlib/qsort_util.h"
 
-void sort(const LIBC_NAMESPACE::internal::Array &array) {
-  LIBC_NAMESPACE::internal::quick_sort(array);
+void quick_sort(void *array, size_t array_size, size_t elem_size,
+                int (*compare)(const void *, const void *)) {
+  constexpr bool USE_QUICKSORT = true;
+
+  const auto is_less = [compare](const void *a,
+                                 const void *b) noexcept -> bool {
+    return compare(a, b) < 0;
+  };
+
+  LIBC_NAMESPACE::internal::unstable_sort_impl(
+      array, array_size, elem_size, is_less);
 }
 
-LIST_SORTING_TESTS(QuickSort, sort);
+LIST_SORTING_TESTS(Qsort, quick_sort);
diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel
index e4b4b075705e8..c0f1546912662 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel
@@ -120,31 +120,23 @@ libc_support_library(
     ],
 )
 
-libc_test(
-    name = "qsort_test",
-    srcs = ["qsort_test.cpp"],
-    libc_function_deps = ["//libc:qsort"],
-    deps = [
-        ":qsort_test_helper",
-        "//libc:types_size_t",
-    ],
-)
-
 libc_test(
     name = "quick_sort_test",
     srcs = ["quick_sort_test.cpp"],
+    libc_function_deps = ["//libc:qsort"],
     deps = [
         ":qsort_test_helper",
-        "//libc:qsort_util",
+        "//libc:types_size_t",
     ],
 )
 
 libc_test(
     name = "heap_sort_test",
     srcs = ["heap_sort_test.cpp"],
+    libc_function_deps = ["//libc:qsort"],
     deps = [
         ":qsort_test_helper",
-        "//libc:qsort_util",
+        "//libc:types_size_t",
     ],
 )
 

From 6a769638b840385fa691f514e3764cc899f64570 Mon Sep 17 00:00:00 2001
From: Craig Topper 
Date: Sun, 29 Dec 2024 11:58:38 -0800
Subject: [PATCH 160/567] [RISCV] Remove XTHeadba pattern that is covered by
 DAG combine.

Mul by 200 is converted to multiple RISCVISD::SHL_ADD in ExpandMul
---
 llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td
index 37b29eda2dc10..f6c4386f40e5f 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td
@@ -553,10 +553,6 @@ def : Pat<(add (XLenVT GPR:$r), CSImm12MulBy4:$i),
           (TH_ADDSL GPR:$r, (XLenVT (ADDI (XLenVT X0), CSImm12MulBy4:$i)), 2)>;
 def : Pat<(add (XLenVT GPR:$r), CSImm12MulBy8:$i),
           (TH_ADDSL GPR:$r, (XLenVT (ADDI (XLenVT X0), CSImm12MulBy8:$i)), 3)>;
-
-def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 200)),
-          (SLLI (XLenVT (TH_ADDSL (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 2)),
-                                  (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 2)), 2)), 3)>;
 } // Predicates = [HasVendorXTHeadBa]
 
 let Predicates = [HasVendorXTHeadBb] in {

From c557ce9f27feccdbda3588555fcb3303d1f81935 Mon Sep 17 00:00:00 2001
From: Craig Topper 
Date: Sun, 29 Dec 2024 12:25:45 -0800
Subject: [PATCH 161/567] [RISCV] Use add_like_non_imm12 in XTheadba patterns
 to match Zba.

---
 llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td | 12 +--
 llvm/test/CodeGen/RISCV/rv64xtheadba.ll       | 74 +++++++++++++++++--
 2 files changed, 72 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td
index f6c4386f40e5f..9df889a50c3d8 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td
@@ -536,22 +536,22 @@ multiclass VPatTernaryVMAQA_VV_VX;
 def : Pat<(XLenVT (riscv_shl_add GPR:$rs1, uimm2:$uimm2, GPR:$rs2)),
           (TH_ADDSL GPR:$rs2, GPR:$rs1, uimm2:$uimm2)>;
 
 // Reuse complex patterns from StdExtZba
-def : Pat<(add_non_imm12 sh1add_op:$rs1, (XLenVT GPR:$rs2)),
+def : Pat<(add_like_non_imm12 sh1add_op:$rs1, (XLenVT GPR:$rs2)),
           (TH_ADDSL GPR:$rs2, sh1add_op:$rs1, 1)>;
-def : Pat<(add_non_imm12 sh2add_op:$rs1, (XLenVT GPR:$rs2)),
+def : Pat<(add_like_non_imm12 sh2add_op:$rs1, (XLenVT GPR:$rs2)),
           (TH_ADDSL GPR:$rs2, sh2add_op:$rs1, 2)>;
-def : Pat<(add_non_imm12 sh3add_op:$rs1, (XLenVT GPR:$rs2)),
+def : Pat<(add_like_non_imm12 sh3add_op:$rs1, (XLenVT GPR:$rs2)),
           (TH_ADDSL GPR:$rs2, sh3add_op:$rs1, 3)>;
 
-def : Pat<(add (XLenVT GPR:$r), CSImm12MulBy4:$i),
+def : Pat<(add_like (XLenVT GPR:$r), CSImm12MulBy4:$i),
           (TH_ADDSL GPR:$r, (XLenVT (ADDI (XLenVT X0), CSImm12MulBy4:$i)), 2)>;
-def : Pat<(add (XLenVT GPR:$r), CSImm12MulBy8:$i),
+def : Pat<(add_like (XLenVT GPR:$r), CSImm12MulBy8:$i),
           (TH_ADDSL GPR:$r, (XLenVT (ADDI (XLenVT X0), CSImm12MulBy8:$i)), 3)>;
 } // Predicates = [HasVendorXTHeadBa]
 
diff --git a/llvm/test/CodeGen/RISCV/rv64xtheadba.ll b/llvm/test/CodeGen/RISCV/rv64xtheadba.ll
index 1da76c1673d6a..2272c17bcef03 100644
--- a/llvm/test/CodeGen/RISCV/rv64xtheadba.ll
+++ b/llvm/test/CodeGen/RISCV/rv64xtheadba.ll
@@ -109,6 +109,25 @@ define i64 @addmul6(i64 %a, i64 %b) {
   ret i64 %d
 }
 
+define i64 @disjointormul6(i64 %a, i64 %b) {
+; RV64I-LABEL: disjointormul6:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a2, a0, 1
+; RV64I-NEXT:    slli a0, a0, 3
+; RV64I-NEXT:    sub a0, a0, a2
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: disjointormul6:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 1
+; RV64XTHEADBA-NEXT:    th.addsl a0, a1, a0, 1
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 6
+  %d = or disjoint i64 %c, %b
+  ret i64 %d
+}
+
 define i64 @addmul10(i64 %a, i64 %b) {
 ; RV64I-LABEL: addmul10:
 ; RV64I:       # %bb.0:
@@ -423,8 +442,8 @@ define i64 @add255mul180(i64 %a) {
 ; RV64XTHEADBA:       # %bb.0:
 ; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
 ; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 3
-; RV64XTHEADBA-NEXT:    li a1, 255
-; RV64XTHEADBA-NEXT:    th.addsl a0, a1, a0, 2
+; RV64XTHEADBA-NEXT:    slli a0, a0, 2
+; RV64XTHEADBA-NEXT:    addi a0, a0, 255
 ; RV64XTHEADBA-NEXT:    ret
   %c = mul i64 %a, 180
   %d = add i64 %c, 255
@@ -642,6 +661,39 @@ define i64 @mul288(i64 %a) {
   ret i64 %c
 }
 
+define i64 @sh1add_imm(i64 %0) {
+; CHECK-LABEL: sh1add_imm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    addi a0, a0, 5
+; CHECK-NEXT:    ret
+  %a = shl i64 %0, 1
+  %b = add i64 %a, 5
+  ret i64 %b
+}
+
+define i64 @sh2add_imm(i64 %0) {
+; CHECK-LABEL: sh2add_imm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    addi a0, a0, -6
+; CHECK-NEXT:    ret
+  %a = shl i64 %0, 2
+  %b = add i64 %a, -6
+  ret i64 %b
+}
+
+define i64 @sh3add_imm(i64 %0) {
+; CHECK-LABEL: sh3add_imm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    addi a0, a0, 7
+; CHECK-NEXT:    ret
+  %a = shl i64 %0, 3
+  %b = add i64 %a, 7
+  ret i64 %b
+}
+
 define i64 @mul258(i64 %a) {
 ; RV64I-LABEL: mul258:
 ; RV64I:       # %bb.0:
@@ -983,12 +1035,18 @@ define i64 @add4104(i64 %a) {
 }
 
 define i64 @add4104_2(i64 %a) {
-; CHECK-LABEL: add4104_2:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, 1
-; CHECK-NEXT:    addiw a1, a1, 8
-; CHECK-NEXT:    or a0, a0, a1
-; CHECK-NEXT:    ret
+; RV64I-LABEL: add4104_2:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lui a1, 1
+; RV64I-NEXT:    addiw a1, a1, 8
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: add4104_2:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    li a1, 1026
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 2
+; RV64XTHEADBA-NEXT:    ret
   %c = or disjoint i64 %a, 4104
   ret i64 %c
 }

From 82ffdf317ee52e8679d58361dfe39930600839d1 Mon Sep 17 00:00:00 2001
From: Craig Topper 
Date: Sun, 29 Dec 2024 12:44:30 -0800
Subject: [PATCH 162/567] [RISCV] Swap rs1 and rs2 names in TH_ADDSL patterns.
 NFC

The names should match the operand order of the final instruction.
I assume these patterns were somewhat copied from Zba where rs1 is
shifted, but for th.addsl, rs2 is shifted.
---
 llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td
index 9df889a50c3d8..942ced8c64815 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td
@@ -538,16 +538,16 @@ multiclass VPatTernaryVMAQA_VV_VX;
-def : Pat<(XLenVT (riscv_shl_add GPR:$rs1, uimm2:$uimm2, GPR:$rs2)),
-          (TH_ADDSL GPR:$rs2, GPR:$rs1, uimm2:$uimm2)>;
+def : Pat<(XLenVT (riscv_shl_add GPR:$rs2, uimm2:$uimm2, GPR:$rs1)),
+          (TH_ADDSL GPR:$rs1, GPR:$rs2, uimm2:$uimm2)>;
 
 // Reuse complex patterns from StdExtZba
-def : Pat<(add_like_non_imm12 sh1add_op:$rs1, (XLenVT GPR:$rs2)),
-          (TH_ADDSL GPR:$rs2, sh1add_op:$rs1, 1)>;
-def : Pat<(add_like_non_imm12 sh2add_op:$rs1, (XLenVT GPR:$rs2)),
-          (TH_ADDSL GPR:$rs2, sh2add_op:$rs1, 2)>;
-def : Pat<(add_like_non_imm12 sh3add_op:$rs1, (XLenVT GPR:$rs2)),
-          (TH_ADDSL GPR:$rs2, sh3add_op:$rs1, 3)>;
+def : Pat<(add_like_non_imm12 sh1add_op:$rs2, (XLenVT GPR:$rs1)),
+          (TH_ADDSL GPR:$rs1, sh1add_op:$rs2, 1)>;
+def : Pat<(add_like_non_imm12 sh2add_op:$rs2, (XLenVT GPR:$rs1)),
+          (TH_ADDSL GPR:$rs1, sh2add_op:$rs2, 2)>;
+def : Pat<(add_like_non_imm12 sh3add_op:$rs2, (XLenVT GPR:$rs1)),
+          (TH_ADDSL GPR:$rs1, sh3add_op:$rs2, 3)>;
 
 def : Pat<(add_like (XLenVT GPR:$r), CSImm12MulBy4:$i),
           (TH_ADDSL GPR:$r, (XLenVT (ADDI (XLenVT X0), CSImm12MulBy4:$i)), 2)>;

From 0b96f1cf6877c21fee42445fa33f26a52bf3dfe8 Mon Sep 17 00:00:00 2001
From: Schrodinger ZHU Yifan 
Date: Sun, 29 Dec 2024 16:03:53 -0500
Subject: [PATCH 163/567] Revert "[libc] Improve qsort" (#121303)

Reverts llvm/llvm-project#120450
---
 libc/src/stdlib/heap_sort.h                   |  12 +-
 libc/src/stdlib/qsort.cpp                     |  10 +-
 libc/src/stdlib/qsort_data.h                  | 171 ++++++---------
 libc/src/stdlib/qsort_pivot.h                 |  85 --------
 libc/src/stdlib/qsort_r.cpp                   |  11 +-
 libc/src/stdlib/qsort_util.h                  |  47 +---
 libc/src/stdlib/quick_sort.h                  | 203 +++++-------------
 libc/test/src/stdlib/CMakeLists.txt           |  18 +-
 libc/test/src/stdlib/SortingTest.h            | 199 ++++++++---------
 libc/test/src/stdlib/heap_sort_test.cpp       |  18 +-
 libc/test/src/stdlib/qsort_r_test.cpp         |   4 +-
 libc/test/src/stdlib/qsort_test.cpp           |  17 ++
 libc/test/src/stdlib/quick_sort_test.cpp      |  19 +-
 .../libc/test/src/stdlib/BUILD.bazel          |  16 +-
 14 files changed, 291 insertions(+), 539 deletions(-)
 delete mode 100644 libc/src/stdlib/qsort_pivot.h
 create mode 100644 libc/test/src/stdlib/qsort_test.cpp

diff --git a/libc/src/stdlib/heap_sort.h b/libc/src/stdlib/heap_sort.h
index b9699776df89c..ccb9ec5f82149 100644
--- a/libc/src/stdlib/heap_sort.h
+++ b/libc/src/stdlib/heap_sort.h
@@ -18,12 +18,11 @@ namespace internal {
 // A simple in-place heapsort implementation.
 // Follow the implementation in https://en.wikipedia.org/wiki/Heapsort.
 
-template 
-LIBC_INLINE void heap_sort(const A &array, const F &is_less) {
-  size_t end = array.len();
+LIBC_INLINE void heap_sort(const Array &array) {
+  size_t end = array.size();
   size_t start = end / 2;
 
-  const auto left_child = [](size_t i) -> size_t { return 2 * i + 1; };
+  auto left_child = [](size_t i) -> size_t { return 2 * i + 1; };
 
   while (end > 1) {
     if (start > 0) {
@@ -41,11 +40,12 @@ LIBC_INLINE void heap_sort(const A &array, const F &is_less) {
     while (left_child(root) < end) {
       size_t child = left_child(root);
       // If there are two children, set child to the greater.
-      if ((child + 1 < end) && is_less(array.get(child), array.get(child + 1)))
+      if (child + 1 < end &&
+          array.elem_compare(child, array.get(child + 1)) < 0)
         ++child;
 
       // If the root is less than the greater child
-      if (!is_less(array.get(root), array.get(child)))
+      if (array.elem_compare(root, array.get(child)) >= 0)
         break;
 
       // Swap the root with the greater child and continue sifting down.
diff --git a/libc/src/stdlib/qsort.cpp b/libc/src/stdlib/qsort.cpp
index 0bf5fc7980527..65a63c239f5c0 100644
--- a/libc/src/stdlib/qsort.cpp
+++ b/libc/src/stdlib/qsort.cpp
@@ -18,12 +18,14 @@ namespace LIBC_NAMESPACE_DECL {
 LLVM_LIBC_FUNCTION(void, qsort,
                    (void *array, size_t array_size, size_t elem_size,
                     int (*compare)(const void *, const void *))) {
+  if (array == nullptr || array_size == 0 || elem_size == 0)
+    return;
+  internal::Comparator c(compare);
 
-  const auto is_less = [compare](const void *a, const void *b) -> bool {
-    return compare(a, b) < 0;
-  };
+  auto arr = internal::Array(reinterpret_cast(array), array_size,
+                             elem_size, c);
 
-  internal::unstable_sort(array, array_size, elem_size, is_less);
+  internal::sort(arr);
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdlib/qsort_data.h b/libc/src/stdlib/qsort_data.h
index aa6d9bbc123de..c529d55ca46ff 100644
--- a/libc/src/stdlib/qsort_data.h
+++ b/libc/src/stdlib/qsort_data.h
@@ -17,122 +17,91 @@
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 
-class ArrayGenericSize {
-  cpp::byte *array_base;
-  size_t array_len;
-  size_t elem_size;
-
-  LIBC_INLINE cpp::byte *get_internal(size_t i) const {
-    return array_base + (i * elem_size);
-  }
-
-public:
-  LIBC_INLINE ArrayGenericSize(void *a, size_t s, size_t e)
-      : array_base(reinterpret_cast(a)), array_len(s),
-        elem_size(e) {}
-
-  static constexpr bool has_fixed_size() { return false; }
-
-  LIBC_INLINE void *get(size_t i) const { return get_internal(i); }
-
-  LIBC_INLINE void swap(size_t i, size_t j) const {
-    // It's possible to use 8 byte blocks with `uint64_t`, but that
-    // generates more machine code as the remainder loop gets
-    // unrolled, plus 4 byte operations are more likely to be
-    // efficient on a wider variety of hardware. On x86 LLVM tends
-    // to unroll the block loop again into 2 16 byte swaps per
-    // iteration which is another reason that 4 byte blocks yields
-    // good performance even for big types.
-    using block_t = uint32_t;
-    constexpr size_t BLOCK_SIZE = sizeof(block_t);
-
-    alignas(block_t) cpp::byte tmp_block[BLOCK_SIZE];
-
-    cpp::byte *elem_i = get_internal(i);
-    cpp::byte *elem_j = get_internal(j);
-
-    const size_t elem_size_rem = elem_size % BLOCK_SIZE;
-    const cpp::byte *elem_i_block_end = elem_i + (elem_size - elem_size_rem);
-
-    while (elem_i != elem_i_block_end) {
-      __builtin_memcpy(tmp_block, elem_i, BLOCK_SIZE);
-      __builtin_memcpy(elem_i, elem_j, BLOCK_SIZE);
-      __builtin_memcpy(elem_j, tmp_block, BLOCK_SIZE);
-
-      elem_i += BLOCK_SIZE;
-      elem_j += BLOCK_SIZE;
-    }
-
-    for (size_t n = 0; n < elem_size_rem; ++n) {
-      cpp::byte tmp = elem_i[n];
-      elem_i[n] = elem_j[n];
-      elem_j[n] = tmp;
+using Compare = int(const void *, const void *);
+using CompareWithState = int(const void *, const void *, void *);
+
+enum class CompType { COMPARE, COMPARE_WITH_STATE };
+
+struct Comparator {
+  union {
+    Compare *comp_func;
+    CompareWithState *comp_func_r;
+  };
+  const CompType comp_type;
+
+  void *arg;
+
+  Comparator(Compare *func)
+      : comp_func(func), comp_type(CompType::COMPARE), arg(nullptr) {}
+
+  Comparator(CompareWithState *func, void *arg_val)
+      : comp_func_r(func), comp_type(CompType::COMPARE_WITH_STATE),
+        arg(arg_val) {}
+
+#if defined(__clang__)
+  // Recent upstream changes to -fsanitize=function find more instances of
+  // function type mismatches. One case is with the comparator passed to this
+  // class. Libraries will tend to pass comparators that take pointers to
+  // varying types while this comparator expects to accept const void pointers.
+  // Ideally those tools would pass a function that strictly accepts const
+  // void*s to avoid UB, or would use qsort_r to pass their own comparator.
+  [[clang::no_sanitize("function")]]
+#endif
+  int comp_vals(const void *a, const void *b) const {
+    if (comp_type == CompType::COMPARE) {
+      return comp_func(a, b);
+    } else {
+      return comp_func_r(a, b, arg);
     }
   }
-
-  LIBC_INLINE size_t len() const { return array_len; }
-
-  // Make an Array starting at index |i| and length |s|.
-  LIBC_INLINE ArrayGenericSize make_array(size_t i, size_t s) const {
-    return ArrayGenericSize(get_internal(i), s, elem_size);
-  }
-
-  // Reset this Array to point at a different interval of the same
-  // items starting at index |i|.
-  LIBC_INLINE void reset_bounds(size_t i, size_t s) {
-    array_base = get_internal(i);
-    array_len = s;
-  }
 };
 
-// Having a specialized Array type for sorting that knows at
-// compile-time what the size of the element is, allows for much more
-// efficient swapping and for cheaper offset calculations.
-template  class ArrayFixedSize {
-  cpp::byte *array_base;
-  size_t array_len;
-
-  LIBC_INLINE cpp::byte *get_internal(size_t i) const {
-    return array_base + (i * ELEM_SIZE);
-  }
+class Array {
+  uint8_t *array;
+  size_t array_size;
+  size_t elem_size;
+  Comparator compare;
 
 public:
-  LIBC_INLINE ArrayFixedSize(void *a, size_t s)
-      : array_base(reinterpret_cast(a)), array_len(s) {}
-
-  // Beware this function is used a heuristic for cheap to swap types, so
-  // instantiating `ArrayFixedSize` with `ELEM_SIZE > 100` is probably a bad
-  // idea perf wise.
-  static constexpr bool has_fixed_size() { return true; }
-
-  LIBC_INLINE void *get(size_t i) const { return get_internal(i); }
-
-  LIBC_INLINE void swap(size_t i, size_t j) const {
-    alignas(32) cpp::byte tmp[ELEM_SIZE];
-
-    cpp::byte *elem_i = get_internal(i);
-    cpp::byte *elem_j = get_internal(j);
+  Array(uint8_t *a, size_t s, size_t e, Comparator c)
+      : array(a), array_size(s), elem_size(e), compare(c) {}
+
+  uint8_t *get(size_t i) const { return array + i * elem_size; }
+
+  void swap(size_t i, size_t j) const {
+    uint8_t *elem_i = get(i);
+    uint8_t *elem_j = get(j);
+    for (size_t b = 0; b < elem_size; ++b) {
+      uint8_t temp = elem_i[b];
+      elem_i[b] = elem_j[b];
+      elem_j[b] = temp;
+    }
+  }
 
-    __builtin_memcpy(tmp, elem_i, ELEM_SIZE);
-    __builtin_memmove(elem_i, elem_j, ELEM_SIZE);
-    __builtin_memcpy(elem_j, tmp, ELEM_SIZE);
+  int elem_compare(size_t i, const uint8_t *other) const {
+    // An element must compare equal to itself, so we don't need to consult
+    // the user-provided comparator.
+    if (get(i) == other)
+      return 0;
+    return compare.comp_vals(get(i), other);
   }
 
-  LIBC_INLINE size_t len() const { return array_len; }
+  size_t size() const { return array_size; }
 
-  // Make an Array starting at index |i| and length |s|.
-  LIBC_INLINE ArrayFixedSize make_array(size_t i, size_t s) const {
-    return ArrayFixedSize(get_internal(i), s);
+  // Make an Array starting at index |i| and size |s|.
+  LIBC_INLINE Array make_array(size_t i, size_t s) const {
+    return Array(get(i), s, elem_size, compare);
   }
 
-  // Reset this Array to point at a different interval of the same
-  // items starting at index |i|.
-  LIBC_INLINE void reset_bounds(size_t i, size_t s) {
-    array_base = get_internal(i);
-    array_len = s;
+  // Reset this Array to point at a different interval of the same items.
+  LIBC_INLINE void reset_bounds(uint8_t *a, size_t s) {
+    array = a;
+    array_size = s;
   }
 };
 
+using SortingRoutine = void(const Array &);
+
 } // namespace internal
 } // namespace LIBC_NAMESPACE_DECL
 
diff --git a/libc/src/stdlib/qsort_pivot.h b/libc/src/stdlib/qsort_pivot.h
deleted file mode 100644
index b7e1b4294f6d6..0000000000000
--- a/libc/src/stdlib/qsort_pivot.h
+++ /dev/null
@@ -1,85 +0,0 @@
-//===-- Implementation header for qsort utilities ---------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC_STDLIB_QSORT_PIVOT_H
-#define LLVM_LIBC_SRC_STDLIB_QSORT_PIVOT_H
-
-#include 
-
-namespace LIBC_NAMESPACE_DECL {
-namespace internal {
-
-// Recursively select a pseudomedian if above this threshold.
-constexpr size_t PSEUDO_MEDIAN_REC_THRESHOLD = 64;
-
-// Selects a pivot from `array`. Algorithm taken from glidesort by Orson Peters.
-//
-// This chooses a pivot by sampling an adaptive amount of points, approximating
-// the quality of a median of sqrt(n) elements.
-template 
-size_t choose_pivot(const A &array, const F &is_less) {
-  const size_t len = array.len();
-
-  if (len < 8) {
-    return 0;
-  }
-
-  const size_t len_div_8 = len / 8;
-
-  const size_t a = 0;             // [0, floor(n/8))
-  const size_t b = len_div_8 * 4; // [4*floor(n/8), 5*floor(n/8))
-  const size_t c = len_div_8 * 7; // [7*floor(n/8), 8*floor(n/8))
-
-  if (len < PSEUDO_MEDIAN_REC_THRESHOLD)
-    return median3(array, a, b, c, is_less);
-  else
-    return median3_rec(array, a, b, c, len_div_8, is_less);
-}
-
-// Calculates an approximate median of 3 elements from sections a, b, c, or
-// recursively from an approximation of each, if they're large enough. By
-// dividing the size of each section by 8 when recursing we have logarithmic
-// recursion depth and overall sample from f(n) = 3*f(n/8) -> f(n) =
-// O(n^(log(3)/log(8))) ~= O(n^0.528) elements.
-template 
-size_t median3_rec(const A &array, size_t a, size_t b, size_t c, size_t n,
-                   const F &is_less) {
-  if (n * 8 >= PSEUDO_MEDIAN_REC_THRESHOLD) {
-    const size_t n8 = n / 8;
-    a = median3_rec(array, a, a + (n8 * 4), a + (n8 * 7), n8, is_less);
-    b = median3_rec(array, b, b + (n8 * 4), b + (n8 * 7), n8, is_less);
-    c = median3_rec(array, c, c + (n8 * 4), c + (n8 * 7), n8, is_less);
-  }
-  return median3(array, a, b, c, is_less);
-}
-
-/// Calculates the median of 3 elements.
-template 
-size_t median3(const A &array, size_t a, size_t b, size_t c, const F &is_less) {
-  const void *a_ptr = array.get(a);
-  const void *b_ptr = array.get(b);
-  const void *c_ptr = array.get(c);
-
-  const bool x = is_less(a_ptr, b_ptr);
-  const bool y = is_less(a_ptr, c_ptr);
-  if (x == y) {
-    // If x=y=0 then b, c <= a. In this case we want to return max(b, c).
-    // If x=y=1 then a < b, c. In this case we want to return min(b, c).
-    // By toggling the outcome of b < c using XOR x we get this behavior.
-    const bool z = is_less(b_ptr, c_ptr);
-    return z ^ x ? c : b;
-  } else {
-    // Either c <= a < b or b <= a < c, thus a is our median.
-    return a;
-  }
-}
-
-} // namespace internal
-} // namespace LIBC_NAMESPACE_DECL
-
-#endif // LLVM_LIBC_SRC_STDLIB_QSORT_PIVOT_H
diff --git a/libc/src/stdlib/qsort_r.cpp b/libc/src/stdlib/qsort_r.cpp
index 4e60998b6a6df..bf61a40e84734 100644
--- a/libc/src/stdlib/qsort_r.cpp
+++ b/libc/src/stdlib/qsort_r.cpp
@@ -19,12 +19,13 @@ LLVM_LIBC_FUNCTION(void, qsort_r,
                    (void *array, size_t array_size, size_t elem_size,
                     int (*compare)(const void *, const void *, void *),
                     void *arg)) {
+  if (array == nullptr || array_size == 0 || elem_size == 0)
+    return;
+  internal::Comparator c(compare, arg);
+  auto arr = internal::Array(reinterpret_cast(array), array_size,
+                             elem_size, c);
 
-  const auto is_less = [compare, arg](const void *a, const void *b) -> bool {
-    return compare(a, b, arg) < 0;
-  };
-
-  internal::unstable_sort(array, array_size, elem_size, is_less);
+  internal::sort(arr);
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdlib/qsort_util.h b/libc/src/stdlib/qsort_util.h
index 7882b829d3274..d42adde06d976 100644
--- a/libc/src/stdlib/qsort_util.h
+++ b/libc/src/stdlib/qsort_util.h
@@ -27,48 +27,11 @@
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 
-template 
-LIBC_INLINE void unstable_sort_impl(void *array, size_t array_len,
-                                    size_t elem_size, const F &is_less) {
-  if (array == nullptr || array_len == 0 || elem_size == 0)
-    return;
-
-  if constexpr (USE_QUICKSORT) {
-    switch (elem_size) {
-    case 4: {
-      auto arr_fixed_size = internal::ArrayFixedSize<4>(array, array_len);
-      quick_sort(arr_fixed_size, is_less);
-      return;
-    }
-    case 8: {
-      auto arr_fixed_size = internal::ArrayFixedSize<8>(array, array_len);
-      quick_sort(arr_fixed_size, is_less);
-      return;
-    }
-    case 16: {
-      auto arr_fixed_size = internal::ArrayFixedSize<16>(array, array_len);
-      quick_sort(arr_fixed_size, is_less);
-      return;
-    }
-    default:
-      auto arr_generic_size =
-          internal::ArrayGenericSize(array, array_len, elem_size);
-      quick_sort(arr_generic_size, is_less);
-      return;
-    }
-  } else {
-    auto arr_generic_size =
-        internal::ArrayGenericSize(array, array_len, elem_size);
-    heap_sort(arr_generic_size, is_less);
-  }
-}
-
-template 
-LIBC_INLINE void unstable_sort(void *array, size_t array_len, size_t elem_size,
-                               const F &is_less) {
-#define USE_QUICK_SORT ((LIBC_QSORT_IMPL) == (LIBC_QSORT_QUICK_SORT))
-  unstable_sort_impl(array, array_len, elem_size, is_less);
-}
+#if LIBC_QSORT_IMPL == LIBC_QSORT_QUICK_SORT
+constexpr auto sort = quick_sort;
+#elif LIBC_QSORT_IMPL == LIBC_QSORT_HEAP_SORT
+constexpr auto sort = heap_sort;
+#endif
 
 } // namespace internal
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdlib/quick_sort.h b/libc/src/stdlib/quick_sort.h
index 9ab2830250018..82b90a7d511d9 100644
--- a/libc/src/stdlib/quick_sort.h
+++ b/libc/src/stdlib/quick_sort.h
@@ -9,175 +9,84 @@
 #ifndef LLVM_LIBC_SRC_STDLIB_QUICK_SORT_H
 #define LLVM_LIBC_SRC_STDLIB_QUICK_SORT_H
 
-#include "src/__support/CPP/bit.h"
-#include "src/__support/CPP/cstddef.h"
+#include "src/__support/macros/attributes.h"
 #include "src/__support/macros/config.h"
-#include "src/stdlib/qsort_pivot.h"
+#include "src/stdlib/qsort_data.h"
 
 #include 
 
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 
-// Branchless Lomuto partition based on the implementation by Lukas
-// Bergdoll and Orson Peters
-// https://github.com/Voultapher/sort-research-rs/blob/main/writeup/lomcyc_partition/text.md.
-// Simplified to avoid having to stack allocate.
-template 
-LIBC_INLINE size_t partition_lomuto_branchless(const A &array,
-                                               const void *pivot,
-                                               const F &is_less) {
-  const size_t array_len = array.len();
-
-  size_t left = 0;
-  size_t right = 0;
-
-  while (right < array_len) {
-    const bool right_is_lt = is_less(array.get(right), pivot);
-    array.swap(left, right);
-    left += static_cast(right_is_lt);
-    right += 1;
-  }
-
-  return left;
-}
-
-// Optimized for large types that are expensive to move. Not optimized
-// for integers. It's possible to use a cyclic permutation here for
-// large types as done in ipnsort but the advantages of this are limited
-// as `is_less` is a small wrapper around a call to a function pointer
-// and won't incur much binary-size overhead. The other reason to use
-// cyclic permutation is to have more efficient swapping, but we don't
-// know the element size so this isn't applicable here either.
-template 
-LIBC_INLINE size_t partition_hoare_branchy(const A &array, const void *pivot,
-                                           const F &is_less) {
-  const size_t array_len = array.len();
-
-  size_t left = 0;
-  size_t right = array_len;
+// A simple quicksort implementation using the Hoare partition scheme.
+LIBC_INLINE size_t partition(const Array &array) {
+  const size_t array_size = array.size();
+  size_t pivot_index = array_size / 2;
+  uint8_t *pivot = array.get(pivot_index);
+  size_t i = 0;
+  size_t j = array_size - 1;
 
   while (true) {
-    while (left < right && is_less(array.get(left), pivot))
-      ++left;
-
-    while (true) {
-      --right;
-      if (left >= right || is_less(array.get(right), pivot)) {
-        break;
-      }
+    int compare_i, compare_j;
+
+    while ((compare_i = array.elem_compare(i, pivot)) < 0)
+      ++i;
+    while ((compare_j = array.elem_compare(j, pivot)) > 0)
+      --j;
+
+    // At some point i will cross over j, so we are guaranteed to break out
+    // of this while loop.
+    if (i >= j)
+      return j + 1;
+
+    array.swap(i, j);
+
+    // The pivot itself might have been swapped, so update our pivot pointer.
+    if (i == pivot_index) {
+      pivot = array.get(j);
+      pivot_index = j;
+    } else if (j == pivot_index) {
+      pivot = array.get(i);
+      pivot_index = i;
     }
 
-    if (left >= right)
-      break;
-
-    array.swap(left, right);
-    ++left;
-  }
-
-  return left;
-}
-
-template 
-LIBC_INLINE size_t partition(const A &array, size_t pivot_index,
-                             const F &is_less) {
-  // Place the pivot at the beginning of the array.
-  if (pivot_index != 0) {
-    array.swap(0, pivot_index);
-  }
-
-  const A array_without_pivot = array.make_array(1, array.len() - 1);
-  const void *pivot = array.get(0);
-
-  size_t num_lt;
-  if constexpr (A::has_fixed_size()) {
-    // Branchless Lomuto avoid branch misprediction penalties, but
-    // it also swaps more often which is only faster if the swap is a fast
-    // constant operation.
-    num_lt = partition_lomuto_branchless(array_without_pivot, pivot, is_less);
-  } else {
-    num_lt = partition_hoare_branchy(array_without_pivot, pivot, is_less);
+    if (compare_i == 0 && compare_j == 0) {
+      // If we do not move the pointers, we will end up with an
+      // infinite loop as i and j will be stuck without advancing.
+      ++i;
+      --j;
+    }
   }
-
-  // Place the pivot between the two partitions.
-  array.swap(0, num_lt);
-
-  return num_lt;
 }
 
-template 
-LIBC_INLINE void quick_sort_impl(A &array, const void *ancestor_pivot,
-                                 size_t limit, const F &is_less) {
+LIBC_INLINE void quick_sort(Array array) {
   while (true) {
-    const size_t array_len = array.len();
-    if (array_len <= 1)
+    const size_t array_size = array.size();
+    if (array_size <= 1)
       return;
-
-    // If too many bad pivot choices were made, simply fall back to
-    // heapsort in order to guarantee `O(N x log(N))` worst-case.
-    if (limit == 0) {
-      heap_sort(array, is_less);
-      return;
-    }
-
-    limit -= 1;
-
-    const size_t pivot_index = choose_pivot(array, is_less);
-
-    // If the chosen pivot is equal to the predecessor, then it's the smallest
-    // element in the slice. Partition the slice into elements equal to and
-    // elements greater than the pivot. This case is usually hit when the slice
-    // contains many duplicate elements.
-    if (ancestor_pivot) {
-      if (!is_less(ancestor_pivot, array.get(pivot_index))) {
-        const size_t num_lt =
-            partition(array, pivot_index,
-                      [is_less](const void *a, const void *b) -> bool {
-                        return !is_less(b, a);
-                      });
-
-        // Continue sorting elements greater than the pivot. We know that
-        // `num_lt` cont
-        array.reset_bounds(num_lt + 1, array.len() - (num_lt + 1));
-        ancestor_pivot = nullptr;
-        continue;
-      }
-    }
-
-    size_t split_index = partition(array, pivot_index, is_less);
-
-    if (array_len == 2)
+    size_t split_index = partition(array);
+    if (array_size == 2)
       // The partition operation sorts the two element array.
       return;
 
-    // Split the array into `left`, `pivot`, and `right`.
-    A left = array.make_array(0, split_index);
-    const void *pivot = array.get(split_index);
-    const size_t right_start = split_index + 1;
-    A right = array.make_array(right_start, array.len() - right_start);
-
-    // Recurse into the left side. We have a fixed recursion limit,
-    // testing shows no real benefit for recursing into the shorter
-    // side.
-    quick_sort_impl(left, ancestor_pivot, limit, is_less);
-
-    // Continue with the right side.
-    array = right;
-    ancestor_pivot = pivot;
+    // Make Arrays describing the two sublists that still need sorting.
+    Array left = array.make_array(0, split_index);
+    Array right = array.make_array(split_index, array.size() - split_index);
+
+    // Recurse to sort the smaller of the two, and then loop round within this
+    // function to sort the larger. This way, recursive call depth is bounded
+    // by log2 of the total array size, because every recursive call is sorting
+    // a list at most half the length of the one in its caller.
+    if (left.size() < right.size()) {
+      quick_sort(left);
+      array.reset_bounds(right.get(0), right.size());
+    } else {
+      quick_sort(right);
+      array.reset_bounds(left.get(0), left.size());
+    }
   }
 }
 
-constexpr size_t ilog2(size_t n) { return cpp::bit_width(n) - 1; }
-
-template 
-LIBC_INLINE void quick_sort(A &array, const F &is_less) {
-  const void *ancestor_pivot = nullptr;
-  // Limit the number of imbalanced partitions to `2 * floor(log2(len))`.
-  // The binary OR by one is used to eliminate the zero-check in the logarithm.
-  const size_t limit = 2 * ilog2((array.len() | 1));
-  quick_sort_impl(array, ancestor_pivot, limit, is_less);
-}
-
 } // namespace internal
 } // namespace LIBC_NAMESPACE_DECL
 
diff --git a/libc/test/src/stdlib/CMakeLists.txt b/libc/test/src/stdlib/CMakeLists.txt
index 8cc0428632ba3..4ca2043ab4c9b 100644
--- a/libc/test/src/stdlib/CMakeLists.txt
+++ b/libc/test/src/stdlib/CMakeLists.txt
@@ -300,6 +300,18 @@ add_libc_test(
     libc.src.stdlib.bsearch
 )
 
+add_libc_test(
+  quick_sort_test
+  SUITE
+    libc-stdlib-tests
+  SRCS
+    quick_sort_test.cpp
+  HDRS
+    SortingTest.h
+  DEPENDS
+    libc.src.stdlib.qsort_util
+)
+
 add_libc_test(
   heap_sort_test
   SUITE
@@ -309,15 +321,15 @@ add_libc_test(
   HDRS
     SortingTest.h
   DEPENDS
-    libc.src.stdlib.qsort
+    libc.src.stdlib.qsort_util
 )
 
 add_libc_test(
-  quick_sort_test
+  qsort_test
   SUITE
     libc-stdlib-tests
   SRCS
-    quick_sort_test.cpp
+    qsort_test.cpp
   HDRS
     SortingTest.h
   DEPENDS
diff --git a/libc/test/src/stdlib/SortingTest.h b/libc/test/src/stdlib/SortingTest.h
index 034c0e4f1fd01..d34584e5addf0 100644
--- a/libc/test/src/stdlib/SortingTest.h
+++ b/libc/test/src/stdlib/SortingTest.h
@@ -7,19 +7,19 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/macros/config.h"
-#include "src/stdlib/qsort.h"
+#include "src/stdlib/qsort_data.h"
 #include "test/UnitTest/Test.h"
 
 class SortingTest : public LIBC_NAMESPACE::testing::Test {
 
-  using SortingRoutine = void (*)(void *array, size_t array_len,
-                                  size_t elem_size,
-                                  int (*compare)(const void *, const void *));
+  using Array = LIBC_NAMESPACE::internal::Array;
+  using Comparator = LIBC_NAMESPACE::internal::Comparator;
+  using SortingRoutine = LIBC_NAMESPACE::internal::SortingRoutine;
 
+public:
   static int int_compare(const void *l, const void *r) {
     int li = *reinterpret_cast(l);
     int ri = *reinterpret_cast(r);
-
     if (li == ri)
       return 0;
     else if (li > ri)
@@ -28,19 +28,16 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
       return -1;
   }
 
-  static void int_sort(SortingRoutine sort_func, int *array, size_t array_len) {
-    sort_func(reinterpret_cast(array), array_len, sizeof(int),
-              int_compare);
-  }
-
-public:
   void test_sorted_array(SortingRoutine sort_func) {
     int array[25] = {10,   23,   33,   35,   55,   70,    71,   100,  110,
                      123,  133,  135,  155,  170,  171,   1100, 1110, 1123,
                      1133, 1135, 1155, 1170, 1171, 11100, 12310};
-    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
+    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
+
+    auto arr = Array(reinterpret_cast(array), ARRAY_SIZE,
+                     sizeof(int), Comparator(int_compare));
 
-    int_sort(sort_func, array, ARRAY_LEN);
+    sort_func(arr);
 
     ASSERT_LE(array[0], 10);
     ASSERT_LE(array[1], 23);
@@ -72,11 +69,14 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
   void test_reversed_sorted_array(SortingRoutine sort_func) {
     int array[] = {25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13,
                    12, 11, 10, 9,  8,  7,  6,  5,  4,  3,  2,  1};
-    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
+    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
 
-    int_sort(sort_func, array, ARRAY_LEN);
+    auto arr = Array(reinterpret_cast(array), ARRAY_SIZE,
+                     sizeof(int), Comparator(int_compare));
 
-    for (int i = 0; i < int(ARRAY_LEN - 1); ++i)
+    sort_func(arr);
+
+    for (int i = 0; i < int(ARRAY_SIZE - 1); ++i)
       ASSERT_EQ(array[i], i + 1);
   }
 
@@ -84,11 +84,14 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
     int array[] = {100, 100, 100, 100, 100, 100, 100, 100, 100,
                    100, 100, 100, 100, 100, 100, 100, 100, 100,
                    100, 100, 100, 100, 100, 100, 100};
-    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
+    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
+
+    auto arr = Array(reinterpret_cast(array), ARRAY_SIZE,
+                     sizeof(int), Comparator(int_compare));
 
-    int_sort(sort_func, array, ARRAY_LEN);
+    sort_func(arr);
 
-    for (size_t i = 0; i < ARRAY_LEN; ++i)
+    for (size_t i = 0; i < ARRAY_SIZE; ++i)
       ASSERT_EQ(array[i], 100);
   }
 
@@ -96,9 +99,12 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
     int array[25] = {10,  23,  8,    35,   55,   45,  40,  100, 110,
                      123, 90,  80,   70,   60,   171, 11,  1,   -1,
                      -5,  -10, 1155, 1170, 1171, 12,  -100};
-    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
+    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
+
+    auto arr = Array(reinterpret_cast(array), ARRAY_SIZE,
+                     sizeof(int), Comparator(int_compare));
 
-    int_sort(sort_func, array, ARRAY_LEN);
+    sort_func(arr);
 
     ASSERT_EQ(array[0], -100);
     ASSERT_EQ(array[1], -10);
@@ -129,9 +135,12 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
 
   void test_unsorted_array_2(SortingRoutine sort_func) {
     int array[7] = {10, 40, 45, 55, 35, 23, 60};
-    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
+    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
 
-    int_sort(sort_func, array, ARRAY_LEN);
+    auto arr = Array(reinterpret_cast(array), ARRAY_SIZE,
+                     sizeof(int), Comparator(int_compare));
+
+    sort_func(arr);
 
     ASSERT_EQ(array[0], 10);
     ASSERT_EQ(array[1], 23);
@@ -144,9 +153,12 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
 
   void test_unsorted_array_duplicated_1(SortingRoutine sort_func) {
     int array[6] = {10, 10, 20, 20, 5, 5};
-    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
+    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
+
+    auto arr = Array(reinterpret_cast(array), ARRAY_SIZE,
+                     sizeof(int), Comparator(int_compare));
 
-    int_sort(sort_func, array, ARRAY_LEN);
+    sort_func(arr);
 
     ASSERT_EQ(array[0], 5);
     ASSERT_EQ(array[1], 5);
@@ -158,9 +170,12 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
 
   void test_unsorted_array_duplicated_2(SortingRoutine sort_func) {
     int array[10] = {20, 10, 10, 10, 10, 20, 21, 21, 21, 21};
-    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
+    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
+
+    auto arr = Array(reinterpret_cast(array), ARRAY_SIZE,
+                     sizeof(int), Comparator(int_compare));
 
-    int_sort(sort_func, array, ARRAY_LEN);
+    sort_func(arr);
 
     ASSERT_EQ(array[0], 10);
     ASSERT_EQ(array[1], 10);
@@ -176,9 +191,12 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
 
   void test_unsorted_array_duplicated_3(SortingRoutine sort_func) {
     int array[10] = {20, 30, 30, 30, 30, 20, 21, 21, 21, 21};
-    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
+    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
 
-    int_sort(sort_func, array, ARRAY_LEN);
+    auto arr = Array(reinterpret_cast(array), ARRAY_SIZE,
+                     sizeof(int), Comparator(int_compare));
+
+    sort_func(arr);
 
     ASSERT_EQ(array[0], 20);
     ASSERT_EQ(array[1], 20);
@@ -195,9 +213,12 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
   void test_unsorted_three_element_1(SortingRoutine sort_func) {
     int array[3] = {14999024, 0, 3};
 
-    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
+    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
+
+    auto arr = Array(reinterpret_cast(array), ARRAY_SIZE,
+                     sizeof(int), Comparator(int_compare));
 
-    int_sort(sort_func, array, ARRAY_LEN);
+    sort_func(arr);
 
     ASSERT_EQ(array[0], 0);
     ASSERT_EQ(array[1], 3);
@@ -207,9 +228,12 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
   void test_unsorted_three_element_2(SortingRoutine sort_func) {
     int array[3] = {3, 14999024, 0};
 
-    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
+    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
+
+    auto arr = Array(reinterpret_cast(array), ARRAY_SIZE,
+                     sizeof(int), Comparator(int_compare));
 
-    int_sort(sort_func, array, ARRAY_LEN);
+    sort_func(arr);
 
     ASSERT_EQ(array[0], 0);
     ASSERT_EQ(array[1], 3);
@@ -219,9 +243,12 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
   void test_unsorted_three_element_3(SortingRoutine sort_func) {
     int array[3] = {3, 0, 14999024};
 
-    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
+    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
 
-    int_sort(sort_func, array, ARRAY_LEN);
+    auto arr = Array(reinterpret_cast(array), ARRAY_SIZE,
+                     sizeof(int), Comparator(int_compare));
+
+    sort_func(arr);
 
     ASSERT_EQ(array[0], 0);
     ASSERT_EQ(array[1], 3);
@@ -231,9 +258,12 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
   void test_same_three_element(SortingRoutine sort_func) {
     int array[3] = {12345, 12345, 12345};
 
-    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
+    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
+
+    auto arr = Array(reinterpret_cast(array), ARRAY_SIZE,
+                     sizeof(int), Comparator(int_compare));
 
-    int_sort(sort_func, array, ARRAY_LEN);
+    sort_func(arr);
 
     ASSERT_EQ(array[0], 12345);
     ASSERT_EQ(array[1], 12345);
@@ -243,9 +273,12 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
   void test_unsorted_two_element_1(SortingRoutine sort_func) {
     int array[] = {14999024, 0};
 
-    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
+    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
+
+    auto arr = Array(reinterpret_cast(array), ARRAY_SIZE,
+                     sizeof(int), Comparator(int_compare));
 
-    int_sort(sort_func, array, ARRAY_LEN);
+    sort_func(arr);
 
     ASSERT_EQ(array[0], 0);
     ASSERT_EQ(array[1], 14999024);
@@ -254,9 +287,12 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
   void test_unsorted_two_element_2(SortingRoutine sort_func) {
     int array[] = {0, 14999024};
 
-    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
+    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
 
-    int_sort(sort_func, array, ARRAY_LEN);
+    auto arr = Array(reinterpret_cast(array), ARRAY_SIZE,
+                     sizeof(int), Comparator(int_compare));
+
+    sort_func(arr);
 
     ASSERT_EQ(array[0], 0);
     ASSERT_EQ(array[1], 14999024);
@@ -265,9 +301,12 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
   void test_same_two_element(SortingRoutine sort_func) {
     int array[] = {12345, 12345};
 
-    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
+    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
+
+    auto arr = Array(reinterpret_cast(array), ARRAY_SIZE,
+                     sizeof(int), Comparator(int_compare));
 
-    int_sort(sort_func, array, ARRAY_LEN);
+    sort_func(arr);
 
     ASSERT_EQ(array[0], 12345);
     ASSERT_EQ(array[1], 12345);
@@ -276,75 +315,14 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
   void test_single_element(SortingRoutine sort_func) {
     int array[] = {12345};
 
-    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
+    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
 
-    int_sort(sort_func, array, ARRAY_LEN);
+    auto arr = Array(reinterpret_cast(array), ARRAY_SIZE,
+                     sizeof(int), Comparator(int_compare));
 
-    ASSERT_EQ(array[0], 12345);
-  }
+    sort_func(arr);
 
-  void test_different_elem_size(SortingRoutine sort_func) {
-    // Random order of values [0,50) to avoid only testing pre-sorted handling.
-    // Long enough to reach interesting code.
-    constexpr uint8_t ARRAY_INITIAL_VALS[] = {
-        42, 13, 8,  4,  17, 28, 20, 32, 22, 29, 7,  2,  46, 37, 26, 49, 24,
-        38, 10, 18, 40, 36, 47, 15, 11, 48, 44, 33, 1,  5,  16, 35, 39, 41,
-        14, 23, 3,  9,  6,  27, 21, 25, 31, 45, 12, 43, 34, 30, 19, 0};
-
-    constexpr size_t ARRAY_LEN = sizeof(ARRAY_INITIAL_VALS);
-    constexpr size_t MAX_ELEM_SIZE = 150;
-    constexpr size_t BUF_SIZE = ARRAY_LEN * MAX_ELEM_SIZE;
-
-    static_assert(ARRAY_LEN < 256); // so we can encode the values.
-
-    // Minimum alignment to test implementation for bugs related to assuming
-    // incorrect association between alignment and element size.
-    alignas(1) uint8_t buf[BUF_SIZE];
-
-    const auto fill_buf = [&buf](size_t elem_size) {
-      for (size_t i = 0; i < BUF_SIZE; ++i) {
-        buf[i] = 0;
-      }
-
-      for (size_t elem_i = 0, buf_i = 0; elem_i < ARRAY_LEN; ++elem_i) {
-        const uint8_t elem_val = ARRAY_INITIAL_VALS[elem_i];
-        for (size_t elem_byte_i = 0; elem_byte_i < elem_size; ++elem_byte_i) {
-          buf[buf_i] = elem_val;
-          buf_i += 1;
-        }
-      }
-    };
-
-    for (size_t elem_size = 0; elem_size <= MAX_ELEM_SIZE; ++elem_size) {
-      // Fill all bytes with data to ensure mistakes in elem swap are noticed.
-      fill_buf(elem_size);
-
-      sort_func(reinterpret_cast(buf), ARRAY_LEN, elem_size,
-                [](const void *a, const void *b) -> int {
-                  const uint8_t a_val = *reinterpret_cast(a);
-                  const uint8_t b_val = *reinterpret_cast(b);
-
-                  if (a_val < b_val) {
-                    return -1;
-                  } else if (a_val > b_val) {
-                    return 1;
-                  } else {
-                    return 0;
-                  }
-                });
-
-      for (size_t elem_i = 0, buf_i = 0; elem_i < ARRAY_LEN; ++elem_i) {
-        const uint8_t expected_elem_val = static_cast(elem_i);
-
-        for (size_t elem_byte_i = 0; elem_byte_i < elem_size; ++elem_byte_i) {
-          const uint8_t buf_val = buf[buf_i];
-          // Check that every byte in the element has the expected value.
-          ASSERT_EQ(buf_val, expected_elem_val)
-              << "elem_size: " << elem_size << " buf_i: " << buf_i << '\n';
-          buf_i += 1;
-        }
-      }
-    }
+    ASSERT_EQ(array[0], 12345);
   }
 };
 
@@ -396,7 +374,4 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test {
   TEST_F(LlvmLibc##Name##Test, SingleElementArray) {                           \
     test_single_element(Func);                                                 \
   }                                                                            \
-  TEST_F(LlvmLibc##Name##Test, DifferentElemSizeArray) {                       \
-    test_different_elem_size(Func);                                            \
-  }                                                                            \
   static_assert(true)
diff --git a/libc/test/src/stdlib/heap_sort_test.cpp b/libc/test/src/stdlib/heap_sort_test.cpp
index 18d4244506ec2..d70e3dc2272be 100644
--- a/libc/test/src/stdlib/heap_sort_test.cpp
+++ b/libc/test/src/stdlib/heap_sort_test.cpp
@@ -7,20 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "SortingTest.h"
-#include "src/stdlib/qsort_util.h"
+#include "src/stdlib/heap_sort.h"
 
-void heap_sort(void *array, size_t array_size, size_t elem_size,
-               int (*compare)(const void *, const void *)) {
-
-  constexpr bool USE_QUICKSORT = false;
-
-  const auto is_less = [compare](const void *a,
-                                 const void *b) noexcept -> bool {
-    return compare(a, b) < 0;
-  };
-
-  LIBC_NAMESPACE::internal::unstable_sort_impl(
-      array, array_size, elem_size, is_less);
+void sort(const LIBC_NAMESPACE::internal::Array &array) {
+  LIBC_NAMESPACE::internal::heap_sort(array);
 }
 
-LIST_SORTING_TESTS(HeapSort, heap_sort);
+LIST_SORTING_TESTS(HeapSort, sort);
diff --git a/libc/test/src/stdlib/qsort_r_test.cpp b/libc/test/src/stdlib/qsort_r_test.cpp
index f18923618ed5e..6893fdc7b74c8 100644
--- a/libc/test/src/stdlib/qsort_r_test.cpp
+++ b/libc/test/src/stdlib/qsort_r_test.cpp
@@ -62,9 +62,9 @@ TEST(LlvmLibcQsortRTest, SortedArray) {
   ASSERT_LE(array[23], 11100);
   ASSERT_LE(array[24], 12310);
 
-  // This is a sorted list, but there still have to have been at least N - 1
+  // This is a sorted list, but there still have to have been at least N
   // comparisons made.
-  ASSERT_GE(count, ARRAY_SIZE - 1);
+  ASSERT_GE(count, ARRAY_SIZE);
 }
 
 TEST(LlvmLibcQsortRTest, ReverseSortedArray) {
diff --git a/libc/test/src/stdlib/qsort_test.cpp b/libc/test/src/stdlib/qsort_test.cpp
new file mode 100644
index 0000000000000..1e921a86fd1fd
--- /dev/null
+++ b/libc/test/src/stdlib/qsort_test.cpp
@@ -0,0 +1,17 @@
+//===-- Unittests for qsort -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SortingTest.h"
+#include "src/stdlib/qsort.h"
+
+void sort(const LIBC_NAMESPACE::internal::Array &array) {
+  LIBC_NAMESPACE::qsort(reinterpret_cast(array.get(0)), array.size(),
+                        sizeof(int), SortingTest::int_compare);
+}
+
+LIST_SORTING_TESTS(Qsort, sort);
diff --git a/libc/test/src/stdlib/quick_sort_test.cpp b/libc/test/src/stdlib/quick_sort_test.cpp
index 2832c855370bc..d6bf77ebfd40d 100644
--- a/libc/test/src/stdlib/quick_sort_test.cpp
+++ b/libc/test/src/stdlib/quick_sort_test.cpp
@@ -1,4 +1,4 @@
-//===-- Unittests for qsort -----------------------------------------------===//
+//===-- Unittests for quick sort ------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,19 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "SortingTest.h"
-#include "src/stdlib/qsort_util.h"
+#include "src/stdlib/quick_sort.h"
 
-void quick_sort(void *array, size_t array_size, size_t elem_size,
-                int (*compare)(const void *, const void *)) {
-  constexpr bool USE_QUICKSORT = true;
-
-  const auto is_less = [compare](const void *a,
-                                 const void *b) noexcept -> bool {
-    return compare(a, b) < 0;
-  };
-
-  LIBC_NAMESPACE::internal::unstable_sort_impl(
-      array, array_size, elem_size, is_less);
+void sort(const LIBC_NAMESPACE::internal::Array &array) {
+  LIBC_NAMESPACE::internal::quick_sort(array);
 }
 
-LIST_SORTING_TESTS(Qsort, quick_sort);
+LIST_SORTING_TESTS(QuickSort, sort);
diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel
index c0f1546912662..e4b4b075705e8 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel
@@ -121,8 +121,8 @@ libc_support_library(
 )
 
 libc_test(
-    name = "quick_sort_test",
-    srcs = ["quick_sort_test.cpp"],
+    name = "qsort_test",
+    srcs = ["qsort_test.cpp"],
     libc_function_deps = ["//libc:qsort"],
     deps = [
         ":qsort_test_helper",
@@ -130,13 +130,21 @@ libc_test(
     ],
 )
 
+libc_test(
+    name = "quick_sort_test",
+    srcs = ["quick_sort_test.cpp"],
+    deps = [
+        ":qsort_test_helper",
+        "//libc:qsort_util",
+    ],
+)
+
 libc_test(
     name = "heap_sort_test",
     srcs = ["heap_sort_test.cpp"],
-    libc_function_deps = ["//libc:qsort"],
     deps = [
         ":qsort_test_helper",
-        "//libc:types_size_t",
+        "//libc:qsort_util",
     ],
 )
 

From 6230f1ba945a1bc795a34cd438c6df3b987f359f Mon Sep 17 00:00:00 2001
From: Amr Hesham 
Date: Mon, 30 Dec 2024 01:27:10 +0100
Subject: [PATCH 164/567] [Clang][ASTMatcher] Add `dependentNameType` AST
 matcher (#121263)

Fixes: https://github.com/llvm/llvm-project/issues/121240
---
 clang/docs/LibASTMatchersReference.html            |  9 +++++++++
 clang/docs/ReleaseNotes.rst                        |  2 ++
 clang/include/clang/ASTMatchers/ASTMatchers.h      | 10 ++++++++++
 clang/lib/ASTMatchers/ASTMatchersInternal.cpp      |  1 +
 clang/lib/ASTMatchers/Dynamic/Registry.cpp         |  1 +
 clang/unittests/AST/ASTImporterTest.cpp            |  3 ---
 .../unittests/ASTMatchers/ASTMatchersNodeTest.cpp  | 14 ++++++++++++++
 7 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/clang/docs/LibASTMatchersReference.html b/clang/docs/LibASTMatchersReference.html
index ddc99020604c9..8564f2650d205 100644
--- a/clang/docs/LibASTMatchersReference.html
+++ b/clang/docs/LibASTMatchersReference.html
@@ -2536,6 +2536,15 @@ 

Node Matchers

matches "decltype(i + j)"
+Matcher<Type>dependentNameTypeMatcher<DependentNameType>... +
Matches a dependent name type.
+
+Example matches T::type
+
+  template  struct declToImport {
+    typedef typename T::type dependent_name;
+  };
+
Matcher<Type>deducedTemplateSpecializationTypeMatcher<DeducedTemplateSpecializationType>...
Matches C++17 deduced template specialization types, e.g. deduced class
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 983c1da20ed4c..210ccc16eeb4f 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -1110,6 +1110,8 @@ AST Matchers
 
 - Add ``dependentScopeDeclRefExpr`` matcher to match expressions that refer to dependent scope declarations.
 
+- Add ``dependentNameType`` matcher to match a dependent name type.
+
 clang-format
 ------------
 
diff --git a/clang/include/clang/ASTMatchers/ASTMatchers.h b/clang/include/clang/ASTMatchers/ASTMatchers.h
index 22e2546ab81e0..9a046714068a5 100644
--- a/clang/include/clang/ASTMatchers/ASTMatchers.h
+++ b/clang/include/clang/ASTMatchers/ASTMatchers.h
@@ -7711,6 +7711,16 @@ AST_MATCHER_P(DecayedType, hasDecayedType, internal::Matcher,
   return InnerType.matches(Node.getDecayedType(), Finder, Builder);
 }
 
+/// Matches a dependent name type
+///
+/// Example matches  T::type
+/// \code
+///  template  struct declToImport {
+///    typedef typename T::type dependent_name;
+///  };
+/// \endcode
+extern const AstTypeMatcher dependentNameType;
+
 /// Matches declarations whose declaration context, interpreted as a
 /// Decl, matches \c InnerMatcher.
 ///
diff --git a/clang/lib/ASTMatchers/ASTMatchersInternal.cpp b/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
index 8c744eebbdfb5..a47633bf4bae2 100644
--- a/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
+++ b/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
@@ -1108,6 +1108,7 @@ const AstTypeMatcher substTemplateTypeParmType;
 const AstTypeMatcher templateTypeParmType;
 const AstTypeMatcher injectedClassNameType;
 const AstTypeMatcher decayedType;
+const AstTypeMatcher dependentNameType;
 AST_TYPELOC_TRAVERSE_MATCHER_DEF(hasElementType,
                                  AST_POLYMORPHIC_SUPPORTED_TYPES(ArrayType,
                                                                  ComplexType));
diff --git a/clang/lib/ASTMatchers/Dynamic/Registry.cpp b/clang/lib/ASTMatchers/Dynamic/Registry.cpp
index 685d626d2978b..bfdee412c5328 100644
--- a/clang/lib/ASTMatchers/Dynamic/Registry.cpp
+++ b/clang/lib/ASTMatchers/Dynamic/Registry.cpp
@@ -222,6 +222,7 @@ RegistryMaps::RegistryMaps() {
   REGISTER_MATCHER(decompositionDecl);
   REGISTER_MATCHER(declCountIs);
   REGISTER_MATCHER(declRefExpr);
+  REGISTER_MATCHER(dependentNameType);
   REGISTER_MATCHER(dependentScopeDeclRefExpr);
   REGISTER_MATCHER(declStmt);
   REGISTER_MATCHER(declaratorDecl);
diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp
index ec062a5cc953b..ee1d896f1ca6d 100644
--- a/clang/unittests/AST/ASTImporterTest.cpp
+++ b/clang/unittests/AST/ASTImporterTest.cpp
@@ -3196,9 +3196,6 @@ TEST_P(ImportExpr, DependentScopeDeclRefExpr) {
                  has(callExpr(has(dependentScopeDeclRefExpr())))))))));
 }
 
-const internal::VariadicDynCastAllOfMatcher
-    dependentNameType;
-
 TEST_P(ImportExpr, DependentNameType) {
   MatchVerifier Verifier;
   testImport("template  struct declToImport {"
diff --git a/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp b/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp
index a3baad367a27b..b8521e2f95768 100644
--- a/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp
+++ b/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp
@@ -1912,6 +1912,20 @@ TEST_P(ASTMatchersTest, DeducedTemplateSpecializationType) {
               deducedTemplateSpecializationType()));
 }
 
+TEST_P(ASTMatchersTest, DependentNameType) {
+  if (!GetParam().isCXX()) {
+    return;
+  }
+
+  EXPECT_TRUE(matches(
+      R"(
+        template  struct declToImport {
+          typedef typename T::type dependent_name;
+        };
+      )",
+      dependentNameType()));
+}
+
 TEST_P(ASTMatchersTest, RecordType) {
   EXPECT_TRUE(matches("struct S {}; struct S s;",
                       recordType(hasDeclaration(recordDecl(hasName("S"))))));

From 9efa7d7af3decfc5223963b19e1b85cacac48084 Mon Sep 17 00:00:00 2001
From: Fangrui Song 
Date: Sun, 29 Dec 2024 18:58:30 -0800
Subject: [PATCH 165/567] Remove -print-lsr-output in favor of
 --stop-after=loop-reduce

Pull Request: https://github.com/llvm/llvm-project/pull/121305
---
 llvm/include/llvm/Passes/CodeGenPassBuilder.h               | 3 ---
 llvm/include/llvm/Target/CGPassBuilderOption.h              | 1 -
 llvm/lib/CodeGen/TargetPassConfig.cpp                       | 6 ------
 llvm/test/CodeGen/PowerPC/lsr-postinc-pos.ll                | 2 +-
 .../test/Transforms/LoopStrengthReduce/AArch64/lsr-reuse.ll | 2 +-
 5 files changed, 2 insertions(+), 12 deletions(-)

diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
index d2e9e8185a2b9..a84164bed46ce 100644
--- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h
+++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
@@ -668,9 +668,6 @@ void CodeGenPassBuilder::addIRPasses(
   if (getOptLevel() != CodeGenOptLevel::None && !Opt.DisableLSR) {
     addPass(createFunctionToLoopPassAdaptor(LoopStrengthReducePass(),
                                             /*UseMemorySSA=*/true));
-    // FIXME: use -stop-after so we could remove PrintLSR
-    if (Opt.PrintLSR)
-      addPass(PrintFunctionPass(dbgs(), "\n\n*** Code after LSR ***\n"));
   }
 
   if (getOptLevel() != CodeGenOptLevel::None) {
diff --git a/llvm/include/llvm/Target/CGPassBuilderOption.h b/llvm/include/llvm/Target/CGPassBuilderOption.h
index 29bdb9c1746d3..d3d19c8a7dc9f 100644
--- a/llvm/include/llvm/Target/CGPassBuilderOption.h
+++ b/llvm/include/llvm/Target/CGPassBuilderOption.h
@@ -39,7 +39,6 @@ struct CGPassBuilderOption {
 
   bool DisableLSR = false;
   bool DisableCGP = false;
-  bool PrintLSR = false;
   bool DisableMergeICmps = false;
   bool DisablePartialLibcallInlining = false;
   bool DisableConstantHoisting = false;
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index d407e9f0871d4..5c055896130a1 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -113,8 +113,6 @@ static cl::opt EnableImplicitNullChecks(
 static cl::opt DisableMergeICmps("disable-mergeicmps",
     cl::desc("Disable MergeICmps Pass"),
     cl::init(false), cl::Hidden);
-static cl::opt PrintLSR("print-lsr-output", cl::Hidden,
-    cl::desc("Print LLVM IR produced by the loop-reduce pass"));
 static cl::opt
     PrintISelInput("print-isel-input", cl::Hidden,
                    cl::desc("Print LLVM IR input to isel pass"));
@@ -503,7 +501,6 @@ CGPassBuilderOption llvm::getCGPassBuilderOption() {
   SET_BOOLEAN_OPTION(DisableCGP)
   SET_BOOLEAN_OPTION(DisablePartialLibcallInlining)
   SET_BOOLEAN_OPTION(DisableSelectOptimize)
-  SET_BOOLEAN_OPTION(PrintLSR)
   SET_BOOLEAN_OPTION(PrintISelInput)
   SET_BOOLEAN_OPTION(DebugifyAndStripAll)
   SET_BOOLEAN_OPTION(DebugifyCheckAndStripAll)
@@ -836,9 +833,6 @@ void TargetPassConfig::addIRPasses() {
       addPass(createLoopStrengthReducePass());
       if (EnableLoopTermFold)
         addPass(createLoopTermFoldPass());
-      if (PrintLSR)
-        addPass(createPrintFunctionPass(dbgs(),
-                                        "\n\n*** Code after LSR ***\n"));
     }
 
     // The MergeICmpsPass tries to create memcmp calls by grouping sequences of
diff --git a/llvm/test/CodeGen/PowerPC/lsr-postinc-pos.ll b/llvm/test/CodeGen/PowerPC/lsr-postinc-pos.ll
index 83763f5ef76aa..2c5ad949dcc5e 100644
--- a/llvm/test/CodeGen/PowerPC/lsr-postinc-pos.ll
+++ b/llvm/test/CodeGen/PowerPC/lsr-postinc-pos.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -print-lsr-output 2>&1 | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -stop-after=loop-reduce | FileCheck %s
 
 ; The icmp is a post-inc use, and the increment is in %bb11, but the
 ; scevgep needs to be inserted in %bb so that it is dominated by %t.
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/lsr-reuse.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/lsr-reuse.ll
index 64e8a6be998ea..7a3817d3a13c7 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/lsr-reuse.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/lsr-reuse.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-unknown-unknown -print-lsr-output < %s 2>&1 | FileCheck %s
+; RUN: llc -mtriple=aarch64 -stop-after=loop-reduce < %s | FileCheck %s
 
 declare void @foo(i64)
 

From 49331ab0b9a41f925153fe03cd7e0d4a33b7c3d4 Mon Sep 17 00:00:00 2001
From: Shilei Tian 
Date: Sun, 29 Dec 2024 22:28:14 -0500
Subject: [PATCH 166/567] [NFC][Clang] Auto generate check lines for
 `clang/test/CodeGenCXX/matrix-vector-bit-int.cpp`

---
 .../test/CodeGenCXX/matrix-vector-bit-int.cpp | 145 ++++++++++--------
 1 file changed, 79 insertions(+), 66 deletions(-)

diff --git a/clang/test/CodeGenCXX/matrix-vector-bit-int.cpp b/clang/test/CodeGenCXX/matrix-vector-bit-int.cpp
index 040615f417085..ffbce9ff8d6f4 100644
--- a/clang/test/CodeGenCXX/matrix-vector-bit-int.cpp
+++ b/clang/test/CodeGenCXX/matrix-vector-bit-int.cpp
@@ -1,3 +1,4 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
 // RUN: %clang_cc1 -fenable-matrix %s -emit-llvm -triple x86_64-unknown-linux -disable-llvm-passes -o - -std=c++11 | FileCheck %s
 
 using i8x3 = _BitInt(8) __attribute__((ext_vector_type(3)));
@@ -7,92 +8,104 @@ using i32x3x3 = _BitInt(32) __attribute__((matrix_type(3, 3)));
 using i512x3 = _BitInt(512) __attribute__((ext_vector_type(3)));
 using i512x3x3 = _BitInt(512) __attribute__((matrix_type(3, 3)));
 
-// CHECK-LABEL: define dso_local i32 @_Z2v1Dv3_DB8_(i32 %a.coerce)
+// CHECK-LABEL: define dso_local i32 @_Z2v1Dv3_DB8_(
+// CHECK-SAME: i32 [[A_COERCE:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <3 x i8>, align 4
+// CHECK-NEXT:    [[A:%.*]] = alloca <3 x i8>, align 4
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <3 x i8>, align 4
+// CHECK-NEXT:    store i32 [[A_COERCE]], ptr [[A]], align 4
+// CHECK-NEXT:    [[LOADVEC4:%.*]] = load <4 x i8>, ptr [[A]], align 4
+// CHECK-NEXT:    [[A1:%.*]] = shufflevector <4 x i8> [[LOADVEC4]], <4 x i8> poison, <3 x i32> 
+// CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x i8> [[A1]], <3 x i8> poison, <4 x i32> 
+// CHECK-NEXT:    store <4 x i8> [[EXTRACTVEC]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[LOADVEC42:%.*]] = load <4 x i8>, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[EXTRACTVEC3:%.*]] = shufflevector <4 x i8> [[LOADVEC42]], <4 x i8> poison, <3 x i32> 
+// CHECK-NEXT:    [[LOADVEC44:%.*]] = load <4 x i8>, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[EXTRACTVEC5:%.*]] = shufflevector <4 x i8> [[LOADVEC44]], <4 x i8> poison, <3 x i32> 
+// CHECK-NEXT:    [[ADD:%.*]] = add <3 x i8> [[EXTRACTVEC3]], [[EXTRACTVEC5]]
+// CHECK-NEXT:    store <3 x i8> [[ADD]], ptr [[RETVAL]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[RETVAL]], align 4
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
 i8x3 v1(i8x3 a) {
-  // CHECK-NEXT: entry:
-  // CHECK-NEXT:   %retval = alloca <3 x i8>, align 4
-  // CHECK-NEXT:   %a = alloca <3 x i8>, align 4
-  // CHECK-NEXT:   %a.addr = alloca <3 x i8>, align 4
-  // CHECK-NEXT:   store i32 %a.coerce, ptr %a, align 4
-  // CHECK-NEXT:   %loadVec4 = load <4 x i8>, ptr %a, align 4
-  // CHECK-NEXT:   %a1 = shufflevector <4 x i8> %loadVec4, <4 x i8> poison, <3 x i32> 
-  // CHECK-NEXT:   %extractVec = shufflevector <3 x i8> %a1, <3 x i8> poison, <4 x i32> 
-  // CHECK-NEXT:   store <4 x i8> %extractVec, ptr %a.addr, align 4
-  // CHECK-NEXT:   %loadVec42 = load <4 x i8>, ptr %a.addr, align 4
-  // CHECK-NEXT:   %extractVec3 = shufflevector <4 x i8> %loadVec42, <4 x i8> poison, <3 x i32> 
-  // CHECK-NEXT:   %loadVec44 = load <4 x i8>, ptr %a.addr, align 4
-  // CHECK-NEXT:   %extractVec5 = shufflevector <4 x i8> %loadVec44, <4 x i8> poison, <3 x i32> 
-  // CHECK-NEXT:   %add = add <3 x i8> %extractVec3, %extractVec5
-  // CHECK-NEXT:   store <3 x i8> %add, ptr %retval, align 4
-  // CHECK-NEXT:   %0 = load i32, ptr %retval, align 4
-  // CHECK-NEXT:   ret i32 %0
   return a + a;
 }
 
-// CHECK-LABEL: define dso_local noundef <3 x i32> @_Z2v2Dv3_DB32_(<3 x i32> noundef %a)
+// CHECK-LABEL: define dso_local noundef <3 x i32> @_Z2v2Dv3_DB32_(
+// CHECK-SAME: <3 x i32> noundef [[A:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <3 x i32>, align 16
+// CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x i32> [[A]], <3 x i32> poison, <4 x i32> 
+// CHECK-NEXT:    store <4 x i32> [[EXTRACTVEC]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[LOADVEC4:%.*]] = load <4 x i32>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[EXTRACTVEC1:%.*]] = shufflevector <4 x i32> [[LOADVEC4]], <4 x i32> poison, <3 x i32> 
+// CHECK-NEXT:    [[LOADVEC42:%.*]] = load <4 x i32>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[EXTRACTVEC3:%.*]] = shufflevector <4 x i32> [[LOADVEC42]], <4 x i32> poison, <3 x i32> 
+// CHECK-NEXT:    [[ADD:%.*]] = add <3 x i32> [[EXTRACTVEC1]], [[EXTRACTVEC3]]
+// CHECK-NEXT:    ret <3 x i32> [[ADD]]
+//
 i32x3 v2(i32x3 a) {
-  // CHECK-NEXT: entry:
-  // CHECK-NEXT:   %a.addr = alloca <3 x i32>, align 16
-  // CHECK-NEXT:   %extractVec = shufflevector <3 x i32> %a, <3 x i32> poison, <4 x i32> 
-  // CHECK-NEXT:   store <4 x i32> %extractVec, ptr %a.addr, align 16
-  // CHECK-NEXT:   %loadVec4 = load <4 x i32>, ptr %a.addr, align 16
-  // CHECK-NEXT:   %extractVec1 = shufflevector <4 x i32> %loadVec4, <4 x i32> poison, <3 x i32> 
-  // CHECK-NEXT:   %loadVec42 = load <4 x i32>, ptr %a.addr, align 16
-  // CHECK-NEXT:   %extractVec3 = shufflevector <4 x i32> %loadVec42, <4 x i32> poison, <3 x i32> 
-  // CHECK-NEXT:   %add = add <3 x i32> %extractVec1, %extractVec3
-  // CHECK-NEXT:   ret <3 x i32> %add
   return a + a;
 }
 
-// CHECK-LABEL: define dso_local noundef <3 x i512> @_Z2v3Dv3_DB512_(ptr noundef byval(<3 x i512>) align 256 %0)
+// CHECK-LABEL: define dso_local noundef <3 x i512> @_Z2v3Dv3_DB512_(
+// CHECK-SAME: ptr noundef byval(<3 x i512>) align 256 [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <3 x i512>, align 256
+// CHECK-NEXT:    [[LOADVEC4:%.*]] = load <4 x i512>, ptr [[TMP0]], align 256
+// CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x i512> [[LOADVEC4]], <4 x i512> poison, <3 x i32> 
+// CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x i512> [[A]], <3 x i512> poison, <4 x i32> 
+// CHECK-NEXT:    store <4 x i512> [[EXTRACTVEC]], ptr [[A_ADDR]], align 256
+// CHECK-NEXT:    [[LOADVEC41:%.*]] = load <4 x i512>, ptr [[A_ADDR]], align 256
+// CHECK-NEXT:    [[EXTRACTVEC2:%.*]] = shufflevector <4 x i512> [[LOADVEC41]], <4 x i512> poison, <3 x i32> 
+// CHECK-NEXT:    [[LOADVEC43:%.*]] = load <4 x i512>, ptr [[A_ADDR]], align 256
+// CHECK-NEXT:    [[EXTRACTVEC4:%.*]] = shufflevector <4 x i512> [[LOADVEC43]], <4 x i512> poison, <3 x i32> 
+// CHECK-NEXT:    [[ADD:%.*]] = add <3 x i512> [[EXTRACTVEC2]], [[EXTRACTVEC4]]
+// CHECK-NEXT:    ret <3 x i512> [[ADD]]
+//
 i512x3 v3(i512x3 a) {
-  // CHECK-NEXT: entry:
-  // CHECK-NEXT:   %a.addr = alloca <3 x i512>, align 256
-  // CHECK-NEXT:   %loadVec4 = load <4 x i512>, ptr %0, align 256
-  // CHECK-NEXT:   %a = shufflevector <4 x i512> %loadVec4, <4 x i512> poison, <3 x i32> 
-  // CHECK-NEXT:   %extractVec = shufflevector <3 x i512> %a, <3 x i512> poison, <4 x i32> 
-  // CHECK-NEXT:   store <4 x i512> %extractVec, ptr %a.addr, align 256
-  // CHECK-NEXT:   %loadVec41 = load <4 x i512>, ptr %a.addr, align 256
-  // CHECK-NEXT:   %extractVec2 = shufflevector <4 x i512> %loadVec41, <4 x i512> poison, <3 x i32> 
-  // CHECK-NEXT:   %loadVec43 = load <4 x i512>, ptr %a.addr, align 256
-  // CHECK-NEXT:   %extractVec4 = shufflevector <4 x i512> %loadVec43, <4 x i512> poison, <3 x i32> 
-  // CHECK-NEXT:   %add = add <3 x i512> %extractVec2, %extractVec4
-  // CHECK-NEXT:   ret <3 x i512> %add
   return a + a;
 }
 
-// CHECK-LABEL: define dso_local noundef <9 x i8> @_Z2m1u11matrix_typeILm3ELm3EDB8_E(<9 x i8> noundef %a)
+// CHECK-LABEL: define dso_local noundef <9 x i8> @_Z2m1u11matrix_typeILm3ELm3EDB8_E(
+// CHECK-SAME: <9 x i8> noundef [[A:%.*]]) #[[ATTR3:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca [9 x i8], align 1
+// CHECK-NEXT:    store <9 x i8> [[A]], ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    [[TMP0:%.*]] = load <9 x i8>, ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load <9 x i8>, ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = add <9 x i8> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    ret <9 x i8> [[TMP2]]
+//
 i8x3x3 m1(i8x3x3 a) {
-  // CHECK-NEXT: entry:
-  // CHECK-NEXT:   %a.addr = alloca [9 x i8], align 1
-  // CHECK-NEXT:   store <9 x i8> %a, ptr %a.addr, align 1
-  // CHECK-NEXT:   %0 = load <9 x i8>, ptr %a.addr, align 1
-  // CHECK-NEXT:   %1 = load <9 x i8>, ptr %a.addr, align 1
-  // CHECK-NEXT:   %2 = add <9 x i8> %0, %1
-  // CHECK-NEXT:   ret <9 x i8> %2
   return a + a;
 }
 
-// CHECK-LABEL: define dso_local noundef <9 x i32> @_Z2m2u11matrix_typeILm3ELm3EDB32_E(<9 x i32> noundef %a)
+// CHECK-LABEL: define dso_local noundef <9 x i32> @_Z2m2u11matrix_typeILm3ELm3EDB32_E(
+// CHECK-SAME: <9 x i32> noundef [[A:%.*]]) #[[ATTR4:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca [9 x i32], align 4
+// CHECK-NEXT:    store <9 x i32> [[A]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <9 x i32>, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <9 x i32>, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = add <9 x i32> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    ret <9 x i32> [[TMP2]]
+//
 i32x3x3 m2(i32x3x3 a) {
-  // CHECK-NEXT: entry:
-  // CHECK-NEXT:   %a.addr = alloca [9 x i32], align 4
-  // CHECK-NEXT:   store <9 x i32> %a, ptr %a.addr, align 4
-  // CHECK-NEXT:   %0 = load <9 x i32>, ptr %a.addr, align 4
-  // CHECK-NEXT:   %1 = load <9 x i32>, ptr %a.addr, align 4
-  // CHECK-NEXT:   %2 = add <9 x i32> %0, %1
-  // CHECK-NEXT:   ret <9 x i32> %2
   return a + a;
 }
 
-// CHECK-LABEL: define dso_local noundef <9 x i512> @_Z2m3u11matrix_typeILm3ELm3EDB512_E(<9 x i512> noundef %a)
+// CHECK-LABEL: define dso_local noundef <9 x i512> @_Z2m3u11matrix_typeILm3ELm3EDB512_E(
+// CHECK-SAME: <9 x i512> noundef [[A:%.*]]) #[[ATTR5:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca [9 x i512], align 8
+// CHECK-NEXT:    store <9 x i512> [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load <9 x i512>, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load <9 x i512>, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = add <9 x i512> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    ret <9 x i512> [[TMP2]]
+//
 i512x3x3 m3(i512x3x3 a) {
-  // CHECK-NEXT: entry:
-  // CHECK-NEXT:   %a.addr = alloca [9 x i512], align 8
-  // CHECK-NEXT:   store <9 x i512> %a, ptr %a.addr, align 8
-  // CHECK-NEXT:   %0 = load <9 x i512>, ptr %a.addr, align 8
-  // CHECK-NEXT:   %1 = load <9 x i512>, ptr %a.addr, align 8
-  // CHECK-NEXT:   %2 = add <9 x i512> %0, %1
-  // CHECK-NEXT:   ret <9 x i512> %2
   return a + a;
 }

From b53866fec80d8676705409140b8ed2147ec44fad Mon Sep 17 00:00:00 2001
From: ZhaoQi 
Date: Mon, 30 Dec 2024 16:01:46 +0800
Subject: [PATCH 167/567] [LoongArch] Modify expanding code sequence for
 PseudoLA_TLS_LE (#119696)

Before this commit, PseudoLA_TLS_LE for the normal/medium code model expands
normally to:
```
  lu12i.w $rd, %le_hi20(sym)
  ori $rd, $rd, %le_lo12(sym)
```

This commit changes the result to:
```
  lu12i.w $rd, %le_hi20_r(sym)
  add.w/d $rd, $rd, $tp, %le_add_r(sym)
  addi.w/d $rd, $rd, %le_lo12_r(sym)
```

This aims to be optimized by linker relaxation in the future.

This commit makes no change to PseudoLA_TLS_LE in the large code model.
---
 .../LoongArch/LoongArchExpandPseudoInsts.cpp  | 39 +++++++++++++------
 .../LoongArch/LoongArchISelLowering.cpp       | 12 +++++-
 .../Target/LoongArch/LoongArchInstrInfo.cpp   |  5 ++-
 .../Target/LoongArch/LoongArchMCInstLower.cpp |  9 +++++
 .../MCTargetDesc/LoongArchBaseInfo.h          |  3 ++
 .../LoongArch/machinelicm-address-pseudos.ll  | 13 ++++---
 .../CodeGen/LoongArch/mir-target-flags.ll     |  5 ++-
 llvm/test/CodeGen/LoongArch/tls-models.ll     | 36 ++++++++---------
 8 files changed, 82 insertions(+), 40 deletions(-)

diff --git a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
index 30742c79653b5..0218934ea3344 100644
--- a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
@@ -352,11 +352,13 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSLE(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
     MachineBasicBlock::iterator &NextMBBI) {
   // Code Sequence:
+  // lu12i.w $rd, %le_hi20_r(sym)
+  // add.w/d $rd, $rd, $tp, %le_add_r(sym)
+  // addi.w/d $rd, $rd, %le_lo12_r(sym)
+  //
+  // Code Sequence while using the large code model:
   // lu12i.w $rd, %le_hi20(sym)
   // ori $rd, $rd, %le_lo12(sym)
-  //
-  // And additionally if generating code using the large code model:
-  //
   // lu32i.d $rd, %le64_lo20(sym)
   // lu52i.d $rd, $rd, %le64_hi12(sym)
   MachineFunction *MF = MBB.getParent();
@@ -366,20 +368,35 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSLE(
   bool Large = MF->getTarget().getCodeModel() == CodeModel::Large;
   Register DestReg = MI.getOperand(0).getReg();
   Register Parts01 =
-      Large ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass)
-            : DestReg;
+      MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass);
   Register Part1 =
       MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass);
   MachineOperand &Symbol = MI.getOperand(1);
 
-  BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU12I_W), Part1)
-      .addDisp(Symbol, 0, LoongArchII::MO_LE_HI);
+  if (!Large) {
+    BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU12I_W), Part1)
+        .addDisp(Symbol, 0, LoongArchII::MO_LE_HI_R);
 
-  BuildMI(MBB, MBBI, DL, TII->get(LoongArch::ORI), Parts01)
-      .addReg(Part1, RegState::Kill)
-      .addDisp(Symbol, 0, LoongArchII::MO_LE_LO);
+    const auto &STI = MF->getSubtarget();
+    unsigned AddOp = STI.is64Bit() ? LoongArch::PseudoAddTPRel_D
+                                   : LoongArch::PseudoAddTPRel_W;
+    BuildMI(MBB, MBBI, DL, TII->get(AddOp), Parts01)
+        .addReg(Part1, RegState::Kill)
+        .addReg(LoongArch::R2)
+        .addDisp(Symbol, 0, LoongArchII::MO_LE_ADD_R);
+
+    unsigned AddiOp = STI.is64Bit() ? LoongArch::ADDI_D : LoongArch::ADDI_W;
+    BuildMI(MBB, MBBI, DL, TII->get(AddiOp), DestReg)
+        .addReg(Parts01, RegState::Kill)
+        .addDisp(Symbol, 0, LoongArchII::MO_LE_LO_R);
+  } else {
+    BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU12I_W), Part1)
+        .addDisp(Symbol, 0, LoongArchII::MO_LE_HI);
+
+    BuildMI(MBB, MBBI, DL, TII->get(LoongArch::ORI), Parts01)
+        .addReg(Part1, RegState::Kill)
+        .addDisp(Symbol, 0, LoongArchII::MO_LE_LO);
 
-  if (Large) {
     Register Parts012 =
         MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass);
 
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 7f67def73ca2b..96e6f71344a78 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -1866,9 +1866,17 @@ SDValue LoongArchTargetLowering::getStaticTLSAddr(GlobalAddressSDNode *N,
   // PseudoLA_*_LARGE nodes.
   SDValue Tmp = DAG.getConstant(0, DL, Ty);
   SDValue Addr = DAG.getTargetGlobalAddress(N->getGlobal(), DL, Ty, 0, 0);
-  SDValue Offset = Large
+
+  // Only IE needs an extra argument for large code model.
+  SDValue Offset = Opc == LoongArch::PseudoLA_TLS_IE_LARGE
                        ? SDValue(DAG.getMachineNode(Opc, DL, Ty, Tmp, Addr), 0)
                        : SDValue(DAG.getMachineNode(Opc, DL, Ty, Addr), 0);
+
+  // If it is LE for normal/medium code model, the add tp operation will occur
+  // during the pseudo-instruction expansion.
+  if (Opc == LoongArch::PseudoLA_TLS_LE && !Large)
+    return Offset;
+
   if (UseGOT) {
     // Mark the load instruction as invariant to enable hoisting in MachineLICM.
     MachineFunction &MF = DAG.getMachineFunction();
@@ -1989,7 +1997,7 @@ LoongArchTargetLowering::lowerGlobalTLSAddress(SDValue Op,
     //
     // This node doesn't need an extra argument for the large code model.
     return getStaticTLSAddr(N, DAG, LoongArch::PseudoLA_TLS_LE,
-                            /*UseGOT=*/false);
+                            /*UseGOT=*/false, Large);
   }
 
   return getTLSDescAddr(N, DAG,
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
index 363cacf726c9c..7d0e4f9d58a16 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
@@ -651,7 +651,10 @@ LoongArchInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
       {MO_DESC_LD, "loongarch-desc-ld"},
       {MO_DESC_CALL, "loongarch-desc-call"},
       {MO_LD_PC_HI, "loongarch-ld-pc-hi"},
-      {MO_GD_PC_HI, "loongarch-gd-pc-hi"}};
+      {MO_GD_PC_HI, "loongarch-gd-pc-hi"},
+      {MO_LE_HI_R, "loongarch-le-hi-r"},
+      {MO_LE_ADD_R, "loongarch-le-add-r"},
+      {MO_LE_LO_R, "loongarch-le-lo-r"}};
   return ArrayRef(TargetFlags);
 }
 
diff --git a/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp b/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp
index 2bacc1273343e..d1de0609f24ce 100644
--- a/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp
@@ -114,6 +114,15 @@ static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym,
   case LoongArchII::MO_DESC_CALL:
     Kind = LoongArchMCExpr::VK_LoongArch_TLS_DESC_CALL;
     break;
+  case LoongArchII::MO_LE_HI_R:
+    Kind = LoongArchMCExpr::VK_LoongArch_TLS_LE_HI20_R;
+    break;
+  case LoongArchII::MO_LE_ADD_R:
+    Kind = LoongArchMCExpr::VK_LoongArch_TLS_LE_ADD_R;
+    break;
+  case LoongArchII::MO_LE_LO_R:
+    Kind = LoongArchMCExpr::VK_LoongArch_TLS_LE_LO12_R;
+    break;
     // TODO: Handle more target-flags.
   }
 
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h
index bd63c5edeabca..23699043b9926 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h
@@ -54,6 +54,9 @@ enum {
   MO_DESC64_PC_LO,
   MO_DESC_LD,
   MO_DESC_CALL,
+  MO_LE_HI_R,
+  MO_LE_ADD_R,
+  MO_LE_LO_R,
   // TODO: Add more flags.
 };
 
diff --git a/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll b/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll
index fc0c7ad1686ee..e0a93e3051bf8 100644
--- a/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll
+++ b/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll
@@ -315,9 +315,9 @@ define void @test_la_tls_le(i32 signext %n) {
 ; LA32-LABEL: test_la_tls_le:
 ; LA32:       # %bb.0: # %entry
 ; LA32-NEXT:    move $a1, $zero
-; LA32-NEXT:    lu12i.w $a2, %le_hi20(le)
-; LA32-NEXT:    ori $a2, $a2, %le_lo12(le)
-; LA32-NEXT:    add.w $a2, $a2, $tp
+; LA32-NEXT:    lu12i.w $a2, %le_hi20_r(le)
+; LA32-NEXT:    add.w $a2, $a2, $tp, %le_add_r(le)
+; LA32-NEXT:    addi.w $a2, $a2, %le_lo12_r(le)
 ; LA32-NEXT:    .p2align 4, , 16
 ; LA32-NEXT:  .LBB4_1: # %loop
 ; LA32-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -330,12 +330,13 @@ define void @test_la_tls_le(i32 signext %n) {
 ; LA64-LABEL: test_la_tls_le:
 ; LA64:       # %bb.0: # %entry
 ; LA64-NEXT:    move $a1, $zero
-; LA64-NEXT:    lu12i.w $a2, %le_hi20(le)
-; LA64-NEXT:    ori $a2, $a2, %le_lo12(le)
+; LA64-NEXT:    lu12i.w $a2, %le_hi20_r(le)
+; LA64-NEXT:    add.d $a2, $a2, $tp, %le_add_r(le)
+; LA64-NEXT:    addi.d $a2, $a2, %le_lo12_r(le)
 ; LA64-NEXT:    .p2align 4, , 16
 ; LA64-NEXT:  .LBB4_1: # %loop
 ; LA64-NEXT:    # =>This Inner Loop Header: Depth=1
-; LA64-NEXT:    ldx.w $zero, $a2, $tp
+; LA64-NEXT:    ld.w $zero, $a2, 0
 ; LA64-NEXT:    addi.w $a1, $a1, 1
 ; LA64-NEXT:    blt $a1, $a0, .LBB4_1
 ; LA64-NEXT:  # %bb.2: # %ret
diff --git a/llvm/test/CodeGen/LoongArch/mir-target-flags.ll b/llvm/test/CodeGen/LoongArch/mir-target-flags.ll
index f530e3ef237c1..3bc8a8d309586 100644
--- a/llvm/test/CodeGen/LoongArch/mir-target-flags.ll
+++ b/llvm/test/CodeGen/LoongArch/mir-target-flags.ll
@@ -28,8 +28,9 @@ define void @caller() nounwind {
 ; CHECK-NEXT: target-flags(loongarch-got-pc-lo) @t_ld
 ; CHECK:      target-flags(loongarch-ie-pc-hi) @t_ie
 ; CHECK-NEXT: target-flags(loongarch-ie-pc-lo) @t_ie
-; CHECK:      target-flags(loongarch-le-hi) @t_le
-; CHECK-NEXT: target-flags(loongarch-le-lo) @t_le
+; CHECK:      target-flags(loongarch-le-hi-r) @t_le
+; CHECK-NEXT: target-flags(loongarch-le-add-r) @t_le
+; CHECK-NEXT: target-flags(loongarch-le-lo-r) @t_le
 ; CHECK:      target-flags(loongarch-call-plt) @callee1
 ; CHECK:      target-flags(loongarch-call) @callee2
   %a = load volatile i32, ptr @g_e
diff --git a/llvm/test/CodeGen/LoongArch/tls-models.ll b/llvm/test/CodeGen/LoongArch/tls-models.ll
index dbd7bf6a81269..e3a8ace3bc7e3 100644
--- a/llvm/test/CodeGen/LoongArch/tls-models.ll
+++ b/llvm/test/CodeGen/LoongArch/tls-models.ll
@@ -330,16 +330,16 @@ entry:
 define ptr @f4() nounwind {
 ; LA32PIC-LABEL: f4:
 ; LA32PIC:       # %bb.0: # %entry
-; LA32PIC-NEXT:    lu12i.w $a0, %le_hi20(le)
-; LA32PIC-NEXT:    ori $a0, $a0, %le_lo12(le)
-; LA32PIC-NEXT:    add.w $a0, $a0, $tp
+; LA32PIC-NEXT:    lu12i.w $a0, %le_hi20_r(le)
+; LA32PIC-NEXT:    add.w $a0, $a0, $tp, %le_add_r(le)
+; LA32PIC-NEXT:    addi.w $a0, $a0, %le_lo12_r(le)
 ; LA32PIC-NEXT:    ret
 ;
 ; LA64PIC-LABEL: f4:
 ; LA64PIC:       # %bb.0: # %entry
-; LA64PIC-NEXT:    lu12i.w $a0, %le_hi20(le)
-; LA64PIC-NEXT:    ori $a0, $a0, %le_lo12(le)
-; LA64PIC-NEXT:    add.d $a0, $a0, $tp
+; LA64PIC-NEXT:    lu12i.w $a0, %le_hi20_r(le)
+; LA64PIC-NEXT:    add.d $a0, $a0, $tp, %le_add_r(le)
+; LA64PIC-NEXT:    addi.d $a0, $a0, %le_lo12_r(le)
 ; LA64PIC-NEXT:    ret
 ;
 ; LA64LARGEPIC-LABEL: f4:
@@ -353,16 +353,16 @@ define ptr @f4() nounwind {
 ;
 ; LA32NOPIC-LABEL: f4:
 ; LA32NOPIC:       # %bb.0: # %entry
-; LA32NOPIC-NEXT:    lu12i.w $a0, %le_hi20(le)
-; LA32NOPIC-NEXT:    ori $a0, $a0, %le_lo12(le)
-; LA32NOPIC-NEXT:    add.w $a0, $a0, $tp
+; LA32NOPIC-NEXT:    lu12i.w $a0, %le_hi20_r(le)
+; LA32NOPIC-NEXT:    add.w $a0, $a0, $tp, %le_add_r(le)
+; LA32NOPIC-NEXT:    addi.w $a0, $a0, %le_lo12_r(le)
 ; LA32NOPIC-NEXT:    ret
 ;
 ; LA64NOPIC-LABEL: f4:
 ; LA64NOPIC:       # %bb.0: # %entry
-; LA64NOPIC-NEXT:    lu12i.w $a0, %le_hi20(le)
-; LA64NOPIC-NEXT:    ori $a0, $a0, %le_lo12(le)
-; LA64NOPIC-NEXT:    add.d $a0, $a0, $tp
+; LA64NOPIC-NEXT:    lu12i.w $a0, %le_hi20_r(le)
+; LA64NOPIC-NEXT:    add.d $a0, $a0, $tp, %le_add_r(le)
+; LA64NOPIC-NEXT:    addi.d $a0, $a0, %le_lo12_r(le)
 ; LA64NOPIC-NEXT:    ret
 ;
 ; LA64LARGENOPIC-LABEL: f4:
@@ -376,16 +376,16 @@ define ptr @f4() nounwind {
 ;
 ; LA32DESC-LABEL: f4:
 ; LA32DESC:       # %bb.0: # %entry
-; LA32DESC-NEXT:    lu12i.w $a0, %le_hi20(le)
-; LA32DESC-NEXT:    ori $a0, $a0, %le_lo12(le)
-; LA32DESC-NEXT:    add.w $a0, $a0, $tp
+; LA32DESC-NEXT:    lu12i.w $a0, %le_hi20_r(le)
+; LA32DESC-NEXT:    add.w $a0, $a0, $tp, %le_add_r(le)
+; LA32DESC-NEXT:    addi.w $a0, $a0, %le_lo12_r(le)
 ; LA32DESC-NEXT:    ret
 ;
 ; LA64DESC-LABEL: f4:
 ; LA64DESC:       # %bb.0: # %entry
-; LA64DESC-NEXT:    lu12i.w $a0, %le_hi20(le)
-; LA64DESC-NEXT:    ori $a0, $a0, %le_lo12(le)
-; LA64DESC-NEXT:    add.d $a0, $a0, $tp
+; LA64DESC-NEXT:    lu12i.w $a0, %le_hi20_r(le)
+; LA64DESC-NEXT:    add.d $a0, $a0, $tp, %le_add_r(le)
+; LA64DESC-NEXT:    addi.d $a0, $a0, %le_lo12_r(le)
 ; LA64DESC-NEXT:    ret
 ;
 ; DESC64-LABEL: f4:

From 91c5de7fb8f95132043ed08056e58238383cfcb9 Mon Sep 17 00:00:00 2001
From: Haojian Wu 
Date: Mon, 30 Dec 2024 09:04:34 +0100
Subject: [PATCH 168/567] [bazel] Fix the broken bazel build for clang-tidy
 after e45e091b90896023584b303539bd8ae16d8932b3

---
 .../clang-tools-extra/clang-tidy/BUILD.bazel                    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-tidy/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-tidy/BUILD.bazel
index d8afbe37e8467..fa77152711334 100644
--- a/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-tidy/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-tidy/BUILD.bazel
@@ -265,6 +265,7 @@ clang_tidy_library(
 clang_tidy_library(
     name = "cppcoreguidelines",
     deps = [
+        ":bugprone",
         ":lib",
         ":misc",
         ":modernize",
@@ -278,7 +279,6 @@ clang_tidy_library(
 clang_tidy_library(
     name = "bugprone",
     deps = [
-        ":cppcoreguidelines",
         ":lib",
         ":utils",
         "//clang:analysis",

From 486ec4bd7466cda444a7da6386a1bbb2db89a33f Mon Sep 17 00:00:00 2001
From: Galen Elias 
Date: Mon, 30 Dec 2024 01:28:03 -0800
Subject: [PATCH 169/567] [clang-format] Add
 `AllowShortNamespacesOnASingleLine` option (#105597)

This fixes #101363 which is a resurrection of a previously opened but
never completed review: https://reviews.llvm.org/D11851

The feature is to allow code like the following not to be broken across
multiple lines:

```
namespace foo { class bar; }
namespace foo { namespace bar { class baz; } }
```

Code like this is commonly used for forward declarations, which are
ideally kept compact. This is also apparently the format that
include-what-you-use will insert for forward declarations.

Also, fix an off-by-one error in `CompactNamespaces` code. For nested
namespaces with 3 or more namespaces, it was incorrectly compacting
lines which were 1 or two spaces over the `ColumnLimit`, leading to
incorrect formatting results.
---
 clang/docs/ClangFormatStyleOptions.rst      |   5 +
 clang/docs/ReleaseNotes.rst                 |   1 +
 clang/include/clang/Format/Format.h         |   6 ++
 clang/lib/Format/Format.cpp                 |   3 +
 clang/lib/Format/UnwrappedLineFormatter.cpp |  93 +++++++++++++++-
 clang/unittests/Format/ConfigParseTest.cpp  |   1 +
 clang/unittests/Format/FormatTest.cpp       | 113 ++++++++++++++++++++
 7 files changed, 221 insertions(+), 1 deletion(-)

diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst
index 4be448171699c..c175436a2817a 100644
--- a/clang/docs/ClangFormatStyleOptions.rst
+++ b/clang/docs/ClangFormatStyleOptions.rst
@@ -2088,6 +2088,11 @@ the configuration (without a prefix: ``Auto``).
   If ``true``, ``while (true) continue;`` can be put on a single
   line.
 
+.. _AllowShortNamespacesOnASingleLine:
+
+**AllowShortNamespacesOnASingleLine** (``Boolean``) :versionbadge:`clang-format 20` :ref:`¶ `
+  If ``true``, ``namespace a { class b; }`` can be put on a single line.
+
 .. _AlwaysBreakAfterDefinitionReturnType:
 
 **AlwaysBreakAfterDefinitionReturnType** (``DefinitionReturnTypeBreakingStyle``) :versionbadge:`clang-format 3.7` :ref:`¶ `
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 210ccc16eeb4f..b7da12bcf6581 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -1123,6 +1123,7 @@ clang-format
   ``Never``, and ``true`` to ``Always``.
 - Adds ``RemoveEmptyLinesInUnwrappedLines`` option.
 - Adds ``KeepFormFeed`` option and set it to ``true`` for ``GNU`` style.
+- Adds ``AllowShortNamespacesOnASingleLine`` option.
 
 libclang
 --------
diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h
index 6383934afa2c4..eefaabf9392fd 100644
--- a/clang/include/clang/Format/Format.h
+++ b/clang/include/clang/Format/Format.h
@@ -988,6 +988,10 @@ struct FormatStyle {
   /// \version 3.7
   bool AllowShortLoopsOnASingleLine;
 
+  /// If ``true``, ``namespace a { class b; }`` can be put on a single line.
+  /// \version 20
+  bool AllowShortNamespacesOnASingleLine;
+
   /// Different ways to break after the function definition return type.
   /// This option is **deprecated** and is retained for backwards compatibility.
   enum DefinitionReturnTypeBreakingStyle : int8_t {
@@ -5168,6 +5172,8 @@ struct FormatStyle {
                R.AllowShortIfStatementsOnASingleLine &&
            AllowShortLambdasOnASingleLine == R.AllowShortLambdasOnASingleLine &&
            AllowShortLoopsOnASingleLine == R.AllowShortLoopsOnASingleLine &&
+           AllowShortNamespacesOnASingleLine ==
+               R.AllowShortNamespacesOnASingleLine &&
            AlwaysBreakBeforeMultilineStrings ==
                R.AlwaysBreakBeforeMultilineStrings &&
            AttributeMacros == R.AttributeMacros &&
diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp
index 95129a8fe9240..8f44e9f00212c 100644
--- a/clang/lib/Format/Format.cpp
+++ b/clang/lib/Format/Format.cpp
@@ -975,6 +975,8 @@ template <> struct MappingTraits {
                    Style.AllowShortLambdasOnASingleLine);
     IO.mapOptional("AllowShortLoopsOnASingleLine",
                    Style.AllowShortLoopsOnASingleLine);
+    IO.mapOptional("AllowShortNamespacesOnASingleLine",
+                   Style.AllowShortNamespacesOnASingleLine);
     IO.mapOptional("AlwaysBreakAfterDefinitionReturnType",
                    Style.AlwaysBreakAfterDefinitionReturnType);
     IO.mapOptional("AlwaysBreakBeforeMultilineStrings",
@@ -1480,6 +1482,7 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) {
   LLVMStyle.AllowShortIfStatementsOnASingleLine = FormatStyle::SIS_Never;
   LLVMStyle.AllowShortLambdasOnASingleLine = FormatStyle::SLS_All;
   LLVMStyle.AllowShortLoopsOnASingleLine = false;
+  LLVMStyle.AllowShortNamespacesOnASingleLine = false;
   LLVMStyle.AlwaysBreakAfterDefinitionReturnType = FormatStyle::DRTBS_None;
   LLVMStyle.AlwaysBreakBeforeMultilineStrings = false;
   LLVMStyle.AttributeMacros.push_back("__capability");
diff --git a/clang/lib/Format/UnwrappedLineFormatter.cpp b/clang/lib/Format/UnwrappedLineFormatter.cpp
index 1804c1437fd41..803c600cec44d 100644
--- a/clang/lib/Format/UnwrappedLineFormatter.cpp
+++ b/clang/lib/Format/UnwrappedLineFormatter.cpp
@@ -361,9 +361,18 @@ class LineJoiner {
     const auto *FirstNonComment = TheLine->getFirstNonComment();
     if (!FirstNonComment)
       return 0;
+
     // FIXME: There are probably cases where we should use FirstNonComment
     // instead of TheLine->First.
 
+    if (Style.AllowShortNamespacesOnASingleLine &&
+        TheLine->First->is(tok::kw_namespace) &&
+        TheLine->Last->is(tok::l_brace)) {
+      const auto result = tryMergeNamespace(I, E, Limit);
+      if (result > 0)
+        return result;
+    }
+
     if (Style.CompactNamespaces) {
       if (const auto *NSToken = TheLine->First->getNamespaceToken()) {
         int J = 1;
@@ -373,7 +382,7 @@ class LineJoiner {
              ClosingLineIndex == I[J]->MatchingClosingBlockLineIndex &&
              I[J]->Last->TotalLength < Limit;
              ++J, --ClosingLineIndex) {
-          Limit -= I[J]->Last->TotalLength;
+          Limit -= I[J]->Last->TotalLength + 1;
 
           // Reduce indent level for bodies of namespaces which were compacted,
           // but only if their content was indented in the first place.
@@ -420,6 +429,7 @@ class LineJoiner {
         TheLine->First != LastNonComment) {
       return MergeShortFunctions ? tryMergeSimpleBlock(I, E, Limit) : 0;
     }
+
     // Try to merge a control statement block with left brace unwrapped.
     if (TheLine->Last->is(tok::l_brace) && FirstNonComment != TheLine->Last &&
         FirstNonComment->isOneOf(tok::kw_if, tok::kw_while, tok::kw_for,
@@ -616,6 +626,72 @@ class LineJoiner {
     return 1;
   }
 
+  unsigned tryMergeNamespace(SmallVectorImpl::const_iterator I,
+                             SmallVectorImpl::const_iterator E,
+                             unsigned Limit) {
+    if (Limit == 0)
+      return 0;
+
+    assert(I[1]);
+    const auto &L1 = *I[1];
+    if (L1.InPPDirective != (*I)->InPPDirective ||
+        (L1.InPPDirective && L1.First->HasUnescapedNewline)) {
+      return 0;
+    }
+
+    if (std::distance(I, E) <= 2)
+      return 0;
+
+    assert(I[2]);
+    const auto &L2 = *I[2];
+    if (L2.Type == LT_Invalid)
+      return 0;
+
+    Limit = limitConsideringMacros(I + 1, E, Limit);
+
+    if (!nextTwoLinesFitInto(I, Limit))
+      return 0;
+
+    // Check if it's a namespace inside a namespace, and call recursively if so.
+    // '3' is the combined size of the whitespace and closing brace for " _inner_ }".
+    if (L1.First->is(tok::kw_namespace)) {
+      if (L1.Last->is(tok::comment) || !Style.CompactNamespaces)
+        return 0;
+
+      assert(Limit >= L1.Last->TotalLength + 3);
+      const auto InnerLimit = Limit - L1.Last->TotalLength - 3;
+      const auto MergedLines = tryMergeNamespace(I + 1, E, InnerLimit);
+      if (MergedLines == 0)
+        return 0;
+      const auto N = MergedLines + 2;
+      // Check if there is even a line after the inner result.
+      if (std::distance(I, E) <= N)
+        return 0;
+      // Check that the line after the inner result starts with a closing brace
+      // which we are permitted to merge into one line.
+      if (I[N]->First->is(tok::r_brace) && !I[N]->First->MustBreakBefore &&
+          I[MergedLines + 1]->Last->isNot(tok::comment) &&
+          nextNLinesFitInto(I, I + N + 1, Limit)) {
+        return N;
+      }
+      return 0;
+    }
+
+    // There's no inner namespace, so we are considering to merge at most one
+    // line.
+
+    // The line which is in the namespace should end with semicolon.
+    if (L1.Last->isNot(tok::semi))
+      return 0;
+
+    // Last, check that the third line starts with a closing brace.
+    if (L2.First->isNot(tok::r_brace) || L2.First->MustBreakBefore)
+      return 0;
+
+    // If so, merge all three lines.
+    return 2;
+  }
+
   unsigned tryMergeSimpleControlStatement(
       SmallVectorImpl::const_iterator I,
       SmallVectorImpl::const_iterator E, unsigned Limit) {
@@ -916,6 +992,21 @@ class LineJoiner {
     return 1 + I[1]->Last->TotalLength + 1 + I[2]->Last->TotalLength <= Limit;
   }
 
+  bool nextNLinesFitInto(SmallVectorImpl::const_iterator I,
+                         SmallVectorImpl::const_iterator E,
+                         unsigned Limit) {
+    unsigned JoinedLength = 0;
+    for (const auto *J = I + 1; J != E; ++J) {
+      if ((*J)->First->MustBreakBefore)
+        return false;
+
+      JoinedLength += 1 + (*J)->Last->TotalLength;
+      if (JoinedLength > Limit)
+        return false;
+    }
+    return true;
+  }
+
   bool containsMustBreak(const AnnotatedLine *Line) {
     assert(Line->First);
     // Ignore the first token, because in this situation, it applies more to the
diff --git a/clang/unittests/Format/ConfigParseTest.cpp b/clang/unittests/Format/ConfigParseTest.cpp
index 7fc7492271668..b249bf073aa45 100644
--- a/clang/unittests/Format/ConfigParseTest.cpp
+++ b/clang/unittests/Format/ConfigParseTest.cpp
@@ -159,6 +159,7 @@ TEST(ConfigParseTest, ParsesConfigurationBools) {
   CHECK_PARSE_BOOL(AllowShortCompoundRequirementOnASingleLine);
   CHECK_PARSE_BOOL(AllowShortEnumsOnASingleLine);
   CHECK_PARSE_BOOL(AllowShortLoopsOnASingleLine);
+  CHECK_PARSE_BOOL(AllowShortNamespacesOnASingleLine);
   CHECK_PARSE_BOOL(BinPackArguments);
   CHECK_PARSE_BOOL(BreakAdjacentStringLiterals);
   CHECK_PARSE_BOOL(BreakAfterJavaFieldAnnotations);
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index 47465a18e9a41..22b6f7e1b62e2 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -4476,6 +4476,7 @@ TEST_F(FormatTest, FormatsCompactNamespaces) {
                "int k; } // namespace out",
                Style);
 
+  Style.ColumnLimit = 41;
   verifyFormat("namespace A { namespace B { namespace C {\n"
                "}}} // namespace A::B::C",
                "namespace A { namespace B {\n"
@@ -4504,6 +4505,12 @@ TEST_F(FormatTest, FormatsCompactNamespaces) {
                "} // namespace bbbbbb\n"
                "} // namespace aaaaaa",
                Style);
+
+  verifyFormat("namespace a { namespace b {\n"
+               "namespace c {\n"
+               "}}} // namespace a::b::c",
+               Style);
+
   Style.ColumnLimit = 80;
 
   // Extra semicolon after 'inner' closing brace prevents merging
@@ -28314,6 +28321,112 @@ TEST_F(FormatTest, KeepFormFeed) {
                Style);
 }
 
+TEST_F(FormatTest, ShortNamespacesOption) {
+  auto Style = getLLVMStyle();
+  Style.AllowShortNamespacesOnASingleLine = true;
+  Style.CompactNamespaces = true;
+  Style.FixNamespaceComments = false;
+
+  // Basic functionality.
+  verifyFormat("namespace foo { class bar; }", Style);
+  verifyFormat("namespace foo::bar { class baz; }", Style);
+  verifyFormat("namespace { class bar; }", Style);
+  verifyFormat("namespace foo {\n"
+               "class bar;\n"
+               "class baz;\n"
+               "}",
+               Style);
+
+  // Trailing comments prevent merging.
+  verifyFormat("namespace foo { namespace baz {\n"
+               "class qux;\n"
+               "} // comment\n"
+               "}",
+               Style);
+
+  // Make sure code doesn't walk too far on unbalanced code.
+  verifyFormat("namespace foo {", Style);
+  verifyFormat("namespace foo {\n"
+               "class baz;",
+               Style);
+  verifyFormat("namespace foo {\n"
+               "namespace bar { class baz; }",
+               Style);
+
+  // Nested namespaces.
+  verifyFormat("namespace foo { namespace bar { class baz; } }", Style);
+
+  // Without CompactNamespaces, we won't merge consecutive namespace
+  // declarations.
+  Style.CompactNamespaces = false;
+  verifyFormat("namespace foo {\n"
+               "namespace bar { class baz; }\n"
+               "}",
+               Style);
+
+  verifyFormat("namespace foo {\n"
+               "namespace bar { class baz; }\n"
+               "namespace qux { class quux; }\n"
+               "}",
+               Style);
+
+  Style.CompactNamespaces = true;
+
+  // Varying inner content.
+  verifyFormat("namespace foo {\n"
+               "int f() { return 5; }\n"
+               "}",
+               Style);
+  verifyFormat("namespace foo { template  struct bar; }", Style);
+  verifyFormat("namespace foo { constexpr int num = 42; }", Style);
+
+  // Validate nested namespace wrapping scenarios around the ColumnLimit.
+  Style.ColumnLimit = 64;
+
+  // Validate just under the ColumnLimit.
+  verifyFormat(
+      "namespace foo { namespace bar { namespace baz { class qux; } } }",
+      Style);
+
+  // Validate just over the ColumnLimit.
+  verifyFormat("namespace foo { namespace baar { namespace baaz {\n"
+               "class quux;\n"
+               "}}}",
+               Style);
+
+  verifyFormat(
+      "namespace foo { namespace bar { namespace baz { namespace qux {\n"
+      "class quux;\n"
+      "}}}}",
+      Style);
+
+  // Validate that the ColumnLimit logic accounts for trailing content as well.
+  verifyFormat("namespace foo { namespace bar { class qux; } } // extra",
+               Style);
+
+  verifyFormat("namespace foo { namespace bar { namespace baz {\n"
+               "class qux;\n"
+               "}}} // extra",
+               Style);
+
+  // FIXME: Ideally AllowShortNamespacesOnASingleLine would disable the trailing
+  // namespace comment from 'FixNamespaceComments', as it's not really necessary
+  // in this scenario, but the two options work at very different layers of the
+  // formatter, so I'm not sure how to make them interact.
+  //
+  // As it stands, the trailing comment will be added and likely make the line
+  // too long to fit within the ColumnLimit, reducing how likely the line
+  // will still fit on a single line. The recommendation for now is to use the
+  // concatenated namespace syntax instead. e.g. 'namespace foo::bar'
+  Style.FixNamespaceComments = true;
+  verifyFormat(
+      "namespace foo { namespace bar { namespace baz {\n"
+      "class qux;\n"
+      "}}} // namespace foo::bar::baz",
+      "namespace foo { namespace bar { namespace baz { class qux; } } }",
+      Style);
+}
+
 } // namespace
 } // namespace test
 } // namespace format

From 998bdae7f5ce8c5aa31b376592c9693fc95f02e3 Mon Sep 17 00:00:00 2001
From: Jonas Hahnfeld 
Date: Mon, 30 Dec 2024 10:32:54 +0100
Subject: [PATCH 170/567] [MLGO] Only configure tests with LLVM_INCLUDE_TESTS
 (#121293)

This allows downstream customers to remove all test directories and save
quite some space when only building with LLVM_INCLUDE_TESTS=OFF.
---
 llvm/utils/mlgo-utils/CMakeLists.txt | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/llvm/utils/mlgo-utils/CMakeLists.txt b/llvm/utils/mlgo-utils/CMakeLists.txt
index 2f3920644b737..d9b2bdc9bf60a 100644
--- a/llvm/utils/mlgo-utils/CMakeLists.txt
+++ b/llvm/utils/mlgo-utils/CMakeLists.txt
@@ -1,9 +1,11 @@
-configure_lit_site_cfg(
-  "${CMAKE_CURRENT_SOURCE_DIR}/tests/lit.site.cfg.in"
-  "${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg"
-)
+if(LLVM_INCLUDE_TESTS)
+  configure_lit_site_cfg(
+    "${CMAKE_CURRENT_SOURCE_DIR}/tests/lit.site.cfg.in"
+    "${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg"
+  )
 
-add_lit_testsuite(check-mlgo-utils "Running mlgo-utils tests"
-  ${CMAKE_CURRENT_BINARY_DIR}
-  DEPENDS "FileCheck" "not" "count" "split-file" "yaml2obj" "llvm-objcopy"
-)
+  add_lit_testsuite(check-mlgo-utils "Running mlgo-utils tests"
+    ${CMAKE_CURRENT_BINARY_DIR}
+    DEPENDS "FileCheck" "not" "count" "split-file" "yaml2obj" "llvm-objcopy"
+  )
+endif()

From 39e93eee76ab86c9892540e2d4ad881c41ad54d4 Mon Sep 17 00:00:00 2001
From: Luke Hutton 
Date: Mon, 30 Dec 2024 11:05:37 +0000
Subject: [PATCH 171/567] [TOSA] Handle dialect check more efficiently
 (#120960)

---
 mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp
index 6fd671051362c..8588c878bfe4f 100644
--- a/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp
+++ b/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp
@@ -542,9 +542,13 @@ bool TosaValidation::isValidElementType(Type type) {
 
 void TosaValidation::runOnOperation() {
   configLevelAndProfile();
+
+  TosaDialect *tosaDialect = getContext().getLoadedDialect();
+  if (!tosaDialect)
+    return;
+
   getOperation().walk([&](Operation *op) {
-    if (!op->getDialect() ||
-        op->getDialect()->getNamespace() != TosaDialect::getDialectNamespace())
+    if (op->getDialect() != tosaDialect)
       return;
 
     for (Value operand : op->getOperands()) {

From 60d20603e43a53b1d495d199ea020c3a56a6866f Mon Sep 17 00:00:00 2001
From: Maksim Levental 
Date: Mon, 30 Dec 2024 03:21:07 -0800
Subject: [PATCH 172/567] [mlir][xegpu] DCE decl in TD (#121249)

---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 2aaa7fd4221ab..4841f94de75f4 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -164,11 +164,9 @@ def XeGPU_SGMapAttr : XeGPUAttr<"SGMap", "sg_map"> {
   }];
   let parameters = (ins
     ArrayRefParameter<"uint32_t">:$wi_layout,
-    ArrayRefParameter<"uint32_t">:$wi_data);
+    ArrayRefParameter<"uint32_t">:$wi_data
+  );
 
-  let builders = [
-    AttrBuilder<(ins)>
-  ];
 
   let hasCustomAssemblyFormat = 1;
   let genVerifyDecl = 1;

From 16d19aaedf347f452c22c7254934753b19803d5d Mon Sep 17 00:00:00 2001
From: Florian Hahn 
Date: Mon, 30 Dec 2024 12:08:12 +0000
Subject: [PATCH 173/567] [VPlan] Manage created blocks directly in VPlan.
 (NFC) (#120918)

This patch changes the way blocks are managed by VPlan. Previously all
blocks reachable from entry would be cleaned up when a VPlan is
destroyed. With this patch, each VPlan keeps track of blocks created for
it in a list and this list is then used to delete all blocks in the list
when the VPlan is destroyed. To do so, block creation is funneled
through helpers in directly in VPlan.

The main advantage of doing so is it simplifies CFG transformations, as
those do not have to take care of deleting any blocks, just adjusting
the CFG. This helps to simplify
https://github.com/llvm/llvm-project/pull/108378 and
https://github.com/llvm/llvm-project/pull/106748.

This also simplifies handling of 'immutable' blocks a VPlan holds
references to, which at the moment only include the scalar header block.

PR: https://github.com/llvm/llvm-project/pull/120918
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |   8 +-
 llvm/lib/Transforms/Vectorize/VPlan.cpp       | 116 ++++++++++--------
 llvm/lib/Transforms/Vectorize/VPlan.h         |  88 +++++++------
 .../Transforms/Vectorize/VPlanHCFGBuilder.cpp |  10 +-
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  28 +++--
 .../Transforms/Vectorize/VPDomTreeTest.cpp    |  46 +++----
 .../Transforms/Vectorize/VPlanTest.cpp        |  98 +++++++--------
 .../Vectorize/VPlanVerifierTest.cpp           |  30 ++---
 8 files changed, 228 insertions(+), 196 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1975df3cacbca..f38db39db9cff 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2454,7 +2454,7 @@ static void introduceCheckBlockInVPlan(VPlan &Plan, BasicBlock *CheckIRBB) {
     assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors");
     assert(PreVectorPH->getSuccessors()[0] == ScalarPH &&
            "Unexpected successor");
-    VPIRBasicBlock *CheckVPIRBB = VPIRBasicBlock::fromBasicBlock(CheckIRBB);
+    VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB);
     VPBlockUtils::insertOnEdge(PreVectorPH, VectorPH, CheckVPIRBB);
     PreVectorPH = CheckVPIRBB;
   }
@@ -8084,11 +8084,11 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
 
   // A new entry block has been created for the epilogue VPlan. Hook it in, as
   // otherwise we would try to modify the entry to the main vector loop.
-  VPIRBasicBlock *NewEntry = VPIRBasicBlock::fromBasicBlock(Insert);
+  VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(Insert);
   VPBasicBlock *OldEntry = Plan.getEntry();
   VPBlockUtils::reassociateBlocks(OldEntry, NewEntry);
   Plan.setEntry(NewEntry);
-  delete OldEntry;
+  // OldEntry is now dead and will be cleaned up when the plan gets destroyed.
 
   introduceCheckBlockInVPlan(Plan, Insert);
   return Insert;
@@ -9289,7 +9289,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
         VPBB->appendRecipe(Recipe);
     }
 
-    VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
+    VPBlockUtils::insertBlockAfter(Plan->createVPBasicBlock(""), VPBB);
     VPBB = cast(VPBB->getSingleSuccessor());
   }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 9a082921d4f7f..82a42b29c6a7d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -205,11 +205,6 @@ VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() {
   return Parent->getEnclosingBlockWithPredecessors();
 }
 
-void VPBlockBase::deleteCFG(VPBlockBase *Entry) {
-  for (VPBlockBase *Block : to_vector(vp_depth_first_shallow(Entry)))
-    delete Block;
-}
-
 VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
   iterator It = begin();
   while (It != end() && It->isPhi())
@@ -474,6 +469,13 @@ void VPIRBasicBlock::execute(VPTransformState *State) {
   connectToPredecessors(State->CFG);
 }
 
+VPIRBasicBlock *VPIRBasicBlock::clone() {
+  auto *NewBlock = getPlan()->createEmptyVPIRBasicBlock(IRBB);
+  for (VPRecipeBase &R : Recipes)
+    NewBlock->appendRecipe(R.clone());
+  return NewBlock;
+}
+
 void VPBasicBlock::execute(VPTransformState *State) {
   bool Replica = bool(State->Lane);
   BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible.
@@ -513,14 +515,11 @@ void VPBasicBlock::execute(VPTransformState *State) {
   executeRecipes(State, NewBB);
 }
 
-void VPBasicBlock::dropAllReferences(VPValue *NewValue) {
-  for (VPRecipeBase &R : Recipes) {
-    for (auto *Def : R.definedValues())
-      Def->replaceAllUsesWith(NewValue);
-
-    for (unsigned I = 0, E = R.getNumOperands(); I != E; I++)
-      R.setOperand(I, NewValue);
-  }
+VPBasicBlock *VPBasicBlock::clone() {
+  auto *NewBlock = getPlan()->createVPBasicBlock(getName());
+  for (VPRecipeBase &R : *this)
+    NewBlock->appendRecipe(R.clone());
+  return NewBlock;
 }
 
 void VPBasicBlock::executeRecipes(VPTransformState *State, BasicBlock *BB) {
@@ -541,7 +540,7 @@ VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) {
 
   SmallVector<VPBlockBase *, 2> Succs(successors());
   // Create new empty block after the block to split.
-  auto *SplitBlock = new VPBasicBlock(getName() + ".split");
+  auto *SplitBlock = getPlan()->createVPBasicBlock(getName() + ".split");
   VPBlockUtils::insertBlockAfter(SplitBlock, this);
 
   // Finally, move the recipes starting at SplitAt to new block.
@@ -701,20 +700,13 @@ static std::pair cloneFrom(VPBlockBase *Entry) {
 
 VPRegionBlock *VPRegionBlock::clone() {
   const auto &[NewEntry, NewExiting] = cloneFrom(getEntry());
-  auto *NewRegion =
-      new VPRegionBlock(NewEntry, NewExiting, getName(), isReplicator());
+  auto *NewRegion = getPlan()->createVPRegionBlock(NewEntry, NewExiting,
+                                                   getName(), isReplicator());
   for (VPBlockBase *Block : vp_depth_first_shallow(NewEntry))
     Block->setParent(NewRegion);
   return NewRegion;
 }
 
-void VPRegionBlock::dropAllReferences(VPValue *NewValue) {
-  for (VPBlockBase *Block : vp_depth_first_shallow(Entry))
-    // Drop all references in VPBasicBlocks and replace all uses with
-    // DummyValue.
-    Block->dropAllReferences(NewValue);
-}
-
 void VPRegionBlock::execute(VPTransformState *State) {
   ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
       RPOT(Entry);
@@ -822,17 +814,26 @@ void VPRegionBlock::print(raw_ostream &O, const Twine &Indent,
 #endif
 
 VPlan::VPlan(Loop *L) {
-  setEntry(VPIRBasicBlock::fromBasicBlock(L->getLoopPreheader()));
-  ScalarHeader = VPIRBasicBlock::fromBasicBlock(L->getHeader());
+  setEntry(createVPIRBasicBlock(L->getLoopPreheader()));
+  ScalarHeader = createVPIRBasicBlock(L->getHeader());
 }
 
 VPlan::~VPlan() {
-  if (Entry) {
-    VPValue DummyValue;
-    for (VPBlockBase *Block : vp_depth_first_shallow(Entry))
-      Block->dropAllReferences(&DummyValue);
-
-    VPBlockBase::deleteCFG(Entry);
+  VPValue DummyValue;
+
+  for (auto *VPB : CreatedBlocks) {
+    if (auto *VPBB = dyn_cast<VPBasicBlock>(VPB)) {
+      // Replace all operands of recipes and all VPValues defined in VPBB with
+      // DummyValue so the block can be deleted.
+      for (VPRecipeBase &R : *VPBB) {
+        for (auto *Def : R.definedValues())
+          Def->replaceAllUsesWith(&DummyValue);
+
+        for (unsigned I = 0, E = R.getNumOperands(); I != E; I++)
+          R.setOperand(I, &DummyValue);
+      }
+    }
+    delete VPB;
   }
   for (VPValue *VPV : VPLiveInsToFree)
     delete VPV;
@@ -840,14 +841,6 @@ VPlan::~VPlan() {
     delete BackedgeTakenCount;
 }
 
-VPIRBasicBlock *VPIRBasicBlock::fromBasicBlock(BasicBlock *IRBB) {
-  auto *VPIRBB = new VPIRBasicBlock(IRBB);
-  for (Instruction &I :
-       make_range(IRBB->begin(), IRBB->getTerminator()->getIterator()))
-    VPIRBB->appendRecipe(new VPIRInstruction(I));
-  return VPIRBB;
-}
-
 VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
                                    PredicatedScalarEvolution &PSE,
                                    bool RequiresScalarEpilogueCheck,
@@ -861,7 +854,7 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
   // an epilogue vector loop, the original entry block here will be replaced by
   // a new VPIRBasicBlock wrapping the entry to the epilogue vector loop after
   // generating code for the main vector loop.
-  VPBasicBlock *VecPreheader = new VPBasicBlock("vector.ph");
+  VPBasicBlock *VecPreheader = Plan->createVPBasicBlock("vector.ph");
   VPBlockUtils::connectBlocks(Plan->getEntry(), VecPreheader);
 
   // Create SCEV and VPValue for the trip count.
@@ -878,17 +871,17 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
 
   // Create VPRegionBlock, with empty header and latch blocks, to be filled
   // during processing later.
-  VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
-  VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
+  VPBasicBlock *HeaderVPBB = Plan->createVPBasicBlock("vector.body");
+  VPBasicBlock *LatchVPBB = Plan->createVPBasicBlock("vector.latch");
   VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
-  auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop",
-                                      false /*isReplicator*/);
+  auto *TopRegion = Plan->createVPRegionBlock(
+      HeaderVPBB, LatchVPBB, "vector loop", false /*isReplicator*/);
 
   VPBlockUtils::insertBlockAfter(TopRegion, VecPreheader);
-  VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block");
+  VPBasicBlock *MiddleVPBB = Plan->createVPBasicBlock("middle.block");
   VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
 
-  VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
+  VPBasicBlock *ScalarPH = Plan->createVPBasicBlock("scalar.ph");
   VPBlockUtils::connectBlocks(ScalarPH, ScalarHeader);
   if (!RequiresScalarEpilogueCheck) {
     VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
@@ -904,7 +897,7 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
   //    we unconditionally branch to the scalar preheader.  Do nothing.
   // 3) Otherwise, construct a runtime check.
   BasicBlock *IRExitBlock = TheLoop->getUniqueLatchExitBlock();
-  auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock);
+  auto *VPExitBlock = Plan->createVPIRBasicBlock(IRExitBlock);
   // The connection order corresponds to the operands of the conditional branch.
   VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB);
   VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
@@ -960,15 +953,14 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
 /// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
 /// successors of VPBB, if any, are rewired to the new VPIRBasicBlock.
 static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
-  VPIRBasicBlock *IRVPBB = VPIRBasicBlock::fromBasicBlock(IRBB);
+  VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB);
   for (auto &R : make_early_inc_range(*VPBB)) {
     assert(!R.isPhi() && "Tried to move phi recipe to end of block");
     R.moveBefore(*IRVPBB, IRVPBB->end());
   }
 
   VPBlockUtils::reassociateBlocks(VPBB, IRVPBB);
-
-  delete VPBB;
+  // VPBB is now dead and will be cleaned up when the plan gets destroyed.
 }
 
 /// Generate the code inside the preheader and body of the vectorized loop.
@@ -1217,6 +1209,7 @@ static void remapOperands(VPBlockBase *Entry, VPBlockBase *NewEntry,
 }
 
 VPlan *VPlan::duplicate() {
+  unsigned NumBlocksBeforeCloning = CreatedBlocks.size();
   // Clone blocks.
   const auto &[NewEntry, __] = cloneFrom(Entry);
 
@@ -1257,9 +1250,32 @@ VPlan *VPlan::duplicate() {
   assert(Old2NewVPValues.contains(TripCount) &&
          "TripCount must have been added to Old2NewVPValues");
   NewPlan->TripCount = Old2NewVPValues[TripCount];
+
+  // Transfer all cloned blocks (the second half of all current blocks) from
+  // current to new VPlan.
+  unsigned NumBlocksAfterCloning = CreatedBlocks.size();
+  for (unsigned I :
+       seq<unsigned>(NumBlocksBeforeCloning, NumBlocksAfterCloning))
+    NewPlan->CreatedBlocks.push_back(this->CreatedBlocks[I]);
+  CreatedBlocks.truncate(NumBlocksBeforeCloning);
+
   return NewPlan;
 }
 
+VPIRBasicBlock *VPlan::createEmptyVPIRBasicBlock(BasicBlock *IRBB) {
+  auto *VPIRBB = new VPIRBasicBlock(IRBB);
+  CreatedBlocks.push_back(VPIRBB);
+  return VPIRBB;
+}
+
+VPIRBasicBlock *VPlan::createVPIRBasicBlock(BasicBlock *IRBB) {
+  auto *VPIRBB = createEmptyVPIRBasicBlock(IRBB);
+  for (Instruction &I :
+       make_range(IRBB->begin(), IRBB->getTerminator()->getIterator()))
+    VPIRBB->appendRecipe(new VPIRInstruction(I));
+  return VPIRBB;
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 
 Twine VPlanPrinter::getUID(const VPBlockBase *Block) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 404202b7f3130..199e0dd7a6bec 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -636,9 +636,6 @@ class VPBlockBase {
   /// Return the cost of the block.
   virtual InstructionCost cost(ElementCount VF, VPCostContext &Ctx) = 0;
 
-  /// Delete all blocks reachable from a given VPBlockBase, inclusive.
-  static void deleteCFG(VPBlockBase *Entry);
-
   /// Return true if it is legal to hoist instructions into this block.
   bool isLegalToHoistInto() {
     // There are currently no constraints that prevent an instruction to be
@@ -646,10 +643,6 @@ class VPBlockBase {
     return true;
   }
 
-  /// Replace all operands of VPUsers in the block with \p NewValue and also
-  /// replaces all uses of VPValues defined in the block with NewValue.
-  virtual void dropAllReferences(VPValue *NewValue) = 0;
-
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   void printAsOperand(raw_ostream &OS, bool PrintType = false) const {
     OS << getName();
@@ -3556,8 +3549,6 @@ class VPBasicBlock : public VPBlockBase {
     return make_range(begin(), getFirstNonPhi());
   }
 
-  void dropAllReferences(VPValue *NewValue) override;
-
   /// Split current block at \p SplitAt by inserting a new block between the
   /// current block and its successors and moving all recipes starting at
   /// SplitAt to the new block. Returns the new block.
@@ -3587,12 +3578,7 @@ class VPBasicBlock : public VPBlockBase {
 
   /// Clone the current block and it's recipes, without updating the operands of
   /// the cloned recipes.
-  VPBasicBlock *clone() override {
-    auto *NewBlock = new VPBasicBlock(getName());
-    for (VPRecipeBase &R : *this)
-      NewBlock->appendRecipe(R.clone());
-    return NewBlock;
-  }
+  VPBasicBlock *clone() override;
 
 protected:
   /// Execute the recipes in the IR basic block \p BB.
@@ -3628,20 +3614,11 @@ class VPIRBasicBlock : public VPBasicBlock {
     return V->getVPBlockID() == VPBlockBase::VPIRBasicBlockSC;
   }
 
-  /// Create a VPIRBasicBlock from \p IRBB containing VPIRInstructions for all
-  /// instructions in \p IRBB, except its terminator which is managed in VPlan.
-  static VPIRBasicBlock *fromBasicBlock(BasicBlock *IRBB);
-
   /// The method which generates the output IR instructions that correspond to
   /// this VPBasicBlock, thereby "executing" the VPlan.
   void execute(VPTransformState *State) override;
 
-  VPIRBasicBlock *clone() override {
-    auto *NewBlock = new VPIRBasicBlock(IRBB);
-    for (VPRecipeBase &R : Recipes)
-      NewBlock->appendRecipe(R.clone());
-    return NewBlock;
-  }
+  VPIRBasicBlock *clone() override;
 
   BasicBlock *getIRBasicBlock() const { return IRBB; }
 };
@@ -3680,13 +3657,7 @@ class VPRegionBlock : public VPBlockBase {
       : VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exiting(nullptr),
         IsReplicator(IsReplicator) {}
 
-  ~VPRegionBlock() override {
-    if (Entry) {
-      VPValue DummyValue;
-      Entry->dropAllReferences(&DummyValue);
-      deleteCFG(Entry);
-    }
-  }
+  ~VPRegionBlock() override {}
 
   /// Method to support type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const VPBlockBase *V) {
@@ -3734,8 +3705,6 @@ class VPRegionBlock : public VPBlockBase {
   // Return the cost of this region.
   InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override;
 
-  void dropAllReferences(VPValue *NewValue) override;
-
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print this VPRegionBlock to \p O (recursively), prefixing all lines with
   /// \p Indent. \p SlotTracker is used to print unnamed VPValue's using
@@ -3812,6 +3781,10 @@ class VPlan {
   /// been modeled in VPlan directly.
   DenseMap<const SCEV *, VPValue *> SCEVToExpansion;
 
+  /// Blocks allocated and owned by the VPlan. They will be deleted once the
+  /// VPlan is destroyed.
+  SmallVector<VPBlockBase *> CreatedBlocks;
+
   /// Construct a VPlan with \p Entry to the plan and with \p ScalarHeader
   /// wrapping the original header of the scalar loop.
   VPlan(VPBasicBlock *Entry, VPIRBasicBlock *ScalarHeader)
@@ -3830,8 +3803,8 @@ class VPlan {
   /// Construct a VPlan with a new VPBasicBlock as entry, a VPIRBasicBlock
   /// wrapping \p ScalarHeaderBB and a trip count of \p TC.
   VPlan(BasicBlock *ScalarHeaderBB, VPValue *TC) {
-    setEntry(new VPBasicBlock("preheader"));
-    ScalarHeader = VPIRBasicBlock::fromBasicBlock(ScalarHeaderBB);
+    setEntry(createVPBasicBlock("preheader"));
+    ScalarHeader = createVPIRBasicBlock(ScalarHeaderBB);
     TripCount = TC;
   }
 
@@ -4029,6 +4002,49 @@ class VPlan {
   /// Clone the current VPlan, update all VPValues of the new VPlan and cloned
   /// recipes to refer to the clones, and return it.
   VPlan *duplicate();
+
+  /// Create a new VPBasicBlock with \p Name and containing \p Recipe if
+  /// present. The returned block is owned by the VPlan and deleted once the
+  /// VPlan is destroyed.
+  VPBasicBlock *createVPBasicBlock(const Twine &Name,
+                                   VPRecipeBase *Recipe = nullptr) {
+    auto *VPB = new VPBasicBlock(Name, Recipe);
+    CreatedBlocks.push_back(VPB);
+    return VPB;
+  }
+
+  /// Create a new VPRegionBlock with \p Entry, \p Exiting and \p Name. If \p
+  /// IsReplicator is true, the region is a replicate region. The returned block
+  /// is owned by the VPlan and deleted once the VPlan is destroyed.
+  VPRegionBlock *createVPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting,
+                                     const std::string &Name = "",
+                                     bool IsReplicator = false) {
+    auto *VPB = new VPRegionBlock(Entry, Exiting, Name, IsReplicator);
+    CreatedBlocks.push_back(VPB);
+    return VPB;
+  }
+
+  /// Create a new VPRegionBlock with \p Name and entry and exiting blocks set
+  /// to nullptr. If \p IsReplicator is true, the region is a replicate region.
+  /// The returned block is owned by the VPlan and deleted once the VPlan is
+  /// destroyed.
+  VPRegionBlock *createVPRegionBlock(const std::string &Name = "",
+                                     bool IsReplicator = false) {
+    auto *VPB = new VPRegionBlock(Name, IsReplicator);
+    CreatedBlocks.push_back(VPB);
+    return VPB;
+  }
+
+  /// Create a VPIRBasicBlock wrapping \p IRBB, but do not create
+  /// VPIRInstructions wrapping the instructions in \p IRBB. The returned
+  /// block is owned by the VPlan and deleted once the VPlan is destroyed.
+  VPIRBasicBlock *createEmptyVPIRBasicBlock(BasicBlock *IRBB);
+
+  /// Create a VPIRBasicBlock from \p IRBB containing VPIRInstructions for all
+  /// instructions in \p IRBB, except its terminator which is managed by the
+  /// successors of the block in VPlan. The returned block is owned by the VPlan
+  /// and deleted once the VPlan is destroyed.
+  VPIRBasicBlock *createVPIRBasicBlock(BasicBlock *IRBB);
 };
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
index 6e633739fcc3d..76ed578424dfe 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
@@ -182,7 +182,7 @@ VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) {
   // Create new VPBB.
   StringRef Name = isHeaderBB(BB, TheLoop) ? "vector.body" : BB->getName();
   LLVM_DEBUG(dbgs() << "Creating VPBasicBlock for " << Name << "\n");
-  VPBasicBlock *VPBB = new VPBasicBlock(Name);
+  VPBasicBlock *VPBB = Plan.createVPBasicBlock(Name);
   BB2VPBB[BB] = VPBB;
 
   // Get or create a region for the loop containing BB.
@@ -204,7 +204,7 @@ VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) {
   if (LoopOfBB == TheLoop) {
     RegionOfVPBB = Plan.getVectorLoopRegion();
   } else {
-    RegionOfVPBB = new VPRegionBlock(Name.str(), false /*isReplicator*/);
+    RegionOfVPBB = Plan.createVPRegionBlock(Name.str(), false /*isReplicator*/);
     RegionOfVPBB->setParent(Loop2Region[LoopOfBB->getParentLoop()]);
   }
   RegionOfVPBB->setEntry(VPBB);
@@ -357,12 +357,10 @@ void PlainCFGBuilder::buildPlainCFG() {
   BB2VPBB[TheLoop->getHeader()] = VectorHeaderVPBB;
   VectorHeaderVPBB->clearSuccessors();
   VectorLatchVPBB->clearPredecessors();
-  if (TheLoop->getHeader() != TheLoop->getLoopLatch()) {
+  if (TheLoop->getHeader() != TheLoop->getLoopLatch())
     BB2VPBB[TheLoop->getLoopLatch()] = VectorLatchVPBB;
-  } else {
+  else
     TheRegion->setExiting(VectorHeaderVPBB);
-    delete VectorLatchVPBB;
-  }
 
   // 1. Scan the body of the loop in a topological order to visit each basic
   // block after having visited its predecessor basic blocks. Create a VPBB for
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 0b809c2b34df9..1f5acf996a772 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -217,7 +217,7 @@ static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
 // is connected to a successor replicate region with the same predicate by a
 // single, empty VPBasicBlock.
 static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
-  SetVector<VPRegionBlock *> DeletedRegions;
+  SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
 
   // Collect replicate regions followed by an empty block, followed by another
   // replicate region with matching masks to process front. This is to avoid
@@ -248,7 +248,7 @@ static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
 
   // Move recipes from Region1 to its successor region, if both are triangles.
   for (VPRegionBlock *Region1 : WorkList) {
-    if (DeletedRegions.contains(Region1))
+    if (TransformedRegions.contains(Region1))
       continue;
     auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
     auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
@@ -294,12 +294,10 @@ static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
       VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
     }
     VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
-    DeletedRegions.insert(Region1);
+    TransformedRegions.insert(Region1);
   }
 
-  for (VPRegionBlock *ToDelete : DeletedRegions)
-    delete ToDelete;
-  return !DeletedRegions.empty();
+  return !TransformedRegions.empty();
 }
 
 static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
@@ -310,7 +308,8 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
   assert(Instr->getParent() && "Predicated instruction not in any basic block");
   auto *BlockInMask = PredRecipe->getMask();
   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
-  auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
+  auto *Entry =
+      Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
 
   // Replace predicated replicate recipe with a replicate recipe without a
   // mask but in the replicate region.
@@ -318,7 +317,8 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
       PredRecipe->getUnderlyingInstr(),
       make_range(PredRecipe->op_begin(), std::prev(PredRecipe->op_end())),
       PredRecipe->isUniform());
-  auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
+  auto *Pred =
+      Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
 
   VPPredInstPHIRecipe *PHIRecipe = nullptr;
   if (PredRecipe->getNumUsers() != 0) {
@@ -328,8 +328,10 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
     PHIRecipe->setOperand(0, RecipeWithoutMask);
   }
   PredRecipe->eraseFromParent();
-  auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
-  VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true);
+  auto *Exiting =
+      Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
+  VPRegionBlock *Region =
+      Plan.createVPRegionBlock(Entry, Exiting, RegionName, true);
 
   // Note: first set Entry as region entry and then connect successors starting
   // from it in order, to propagate the "parent" of each VPBasicBlock.
@@ -396,7 +398,7 @@ static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
       VPBlockUtils::disconnectBlocks(VPBB, Succ);
       VPBlockUtils::connectBlocks(PredVPBB, Succ);
     }
-    delete VPBB;
+    // VPBB is now dead and will be cleaned up when the plan gets destroyed.
   }
   return !WorkList.empty();
 }
@@ -1898,7 +1900,7 @@ void VPlanTransforms::handleUncountableEarlyExit(
   if (OrigLoop->getUniqueExitBlock()) {
     VPEarlyExitBlock = cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0]);
   } else {
-    VPEarlyExitBlock = VPIRBasicBlock::fromBasicBlock(
+    VPEarlyExitBlock = Plan.createVPIRBasicBlock(
         !OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
   }
 
@@ -1908,7 +1910,7 @@ void VPlanTransforms::handleUncountableEarlyExit(
   IsEarlyExitTaken =
       Builder.createNaryOp(VPInstruction::AnyOf, {EarlyExitTakenCond});
 
-  VPBasicBlock *NewMiddle = new VPBasicBlock("middle.split");
+  VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split");
   VPBlockUtils::insertOnEdge(LoopRegion, MiddleVPBB, NewMiddle);
   VPBlockUtils::connectBlocks(NewMiddle, VPEarlyExitBlock);
   NewMiddle->swapSuccessors();
diff --git a/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp b/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp
index 6aa34a5fa431b..55b68f5866dee 100644
--- a/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp
@@ -29,11 +29,11 @@ TEST_F(VPDominatorTreeTest, DominanceNoRegionsTest) {
   //  }
   VPlan &Plan = getPlan();
   VPBasicBlock *VPBB0 = Plan.getEntry();
-  VPBasicBlock *VPBB1 = new VPBasicBlock("VPBB1");
-  VPBasicBlock *VPBB2 = new VPBasicBlock("VPBB2");
-  VPBasicBlock *VPBB3 = new VPBasicBlock("VPBB3");
-  VPBasicBlock *VPBB4 = new VPBasicBlock("VPBB4");
-  VPRegionBlock *R1 = new VPRegionBlock(VPBB1, VPBB4);
+  VPBasicBlock *VPBB1 = Plan.createVPBasicBlock("VPBB1");
+  VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("VPBB2");
+  VPBasicBlock *VPBB3 = Plan.createVPBasicBlock("VPBB3");
+  VPBasicBlock *VPBB4 = Plan.createVPBasicBlock("VPBB4");
+  VPRegionBlock *R1 = Plan.createVPRegionBlock(VPBB1, VPBB4);
   VPBB2->setParent(R1);
   VPBB3->setParent(R1);
 
@@ -96,11 +96,11 @@ TEST_F(VPDominatorTreeTest, DominanceRegionsTest) {
     //
     VPlan &Plan = getPlan();
     VPBasicBlock *VPBB0 = Plan.getEntry();
-    VPBasicBlock *R1BB1 = new VPBasicBlock();
-    VPBasicBlock *R1BB2 = new VPBasicBlock();
-    VPBasicBlock *R1BB3 = new VPBasicBlock();
-    VPBasicBlock *R1BB4 = new VPBasicBlock();
-    VPRegionBlock *R1 = new VPRegionBlock(R1BB1, R1BB4, "R1");
+    VPBasicBlock *R1BB1 = Plan.createVPBasicBlock("");
+    VPBasicBlock *R1BB2 = Plan.createVPBasicBlock("");
+    VPBasicBlock *R1BB3 = Plan.createVPBasicBlock("");
+    VPBasicBlock *R1BB4 = Plan.createVPBasicBlock("");
+    VPRegionBlock *R1 = Plan.createVPRegionBlock(R1BB1, R1BB4, "R1");
     R1BB2->setParent(R1);
     R1BB3->setParent(R1);
     VPBlockUtils::connectBlocks(VPBB0, R1);
@@ -111,9 +111,9 @@ TEST_F(VPDominatorTreeTest, DominanceRegionsTest) {
     // Cycle.
     VPBlockUtils::connectBlocks(R1BB3, R1BB3);
 
-    VPBasicBlock *R2BB1 = new VPBasicBlock();
-    VPBasicBlock *R2BB2 = new VPBasicBlock();
-    VPRegionBlock *R2 = new VPRegionBlock(R2BB1, R2BB2, "R2");
+    VPBasicBlock *R2BB1 = Plan.createVPBasicBlock("");
+    VPBasicBlock *R2BB2 = Plan.createVPBasicBlock("");
+    VPRegionBlock *R2 = Plan.createVPRegionBlock(R2BB1, R2BB2, "R2");
     VPBlockUtils::connectBlocks(R2BB1, R2BB2);
     VPBlockUtils::connectBlocks(R1, R2);
 
@@ -170,15 +170,15 @@ TEST_F(VPDominatorTreeTest, DominanceRegionsTest) {
     //  VPBB2
     //
     VPlan &Plan = getPlan();
-    VPBasicBlock *R1BB1 = new VPBasicBlock("R1BB1");
-    VPBasicBlock *R1BB2 = new VPBasicBlock("R1BB2");
-    VPBasicBlock *R1BB3 = new VPBasicBlock("R1BB3");
-    VPRegionBlock *R1 = new VPRegionBlock(R1BB1, R1BB3, "R1");
-
-    VPBasicBlock *R2BB1 = new VPBasicBlock("R2BB1");
-    VPBasicBlock *R2BB2 = new VPBasicBlock("R2BB2");
-    VPBasicBlock *R2BB3 = new VPBasicBlock("R2BB3");
-    VPRegionBlock *R2 = new VPRegionBlock(R2BB1, R2BB3, "R2");
+    VPBasicBlock *R1BB1 = Plan.createVPBasicBlock("R1BB1");
+    VPBasicBlock *R1BB2 = Plan.createVPBasicBlock("R1BB2");
+    VPBasicBlock *R1BB3 = Plan.createVPBasicBlock("R1BB3");
+    VPRegionBlock *R1 = Plan.createVPRegionBlock(R1BB1, R1BB3, "R1");
+
+    VPBasicBlock *R2BB1 = Plan.createVPBasicBlock("R2BB1");
+    VPBasicBlock *R2BB2 = Plan.createVPBasicBlock("R2BB2");
+    VPBasicBlock *R2BB3 = Plan.createVPBasicBlock("R2BB3");
+    VPRegionBlock *R2 = Plan.createVPRegionBlock(R2BB1, R2BB3, "R2");
     R2BB2->setParent(R2);
     VPBlockUtils::connectBlocks(R2BB1, R2BB2);
     VPBlockUtils::connectBlocks(R2BB2, R2BB1);
@@ -193,7 +193,7 @@ TEST_F(VPDominatorTreeTest, DominanceRegionsTest) {
 
     VPBasicBlock *VPBB1 = Plan.getEntry();
     VPBlockUtils::connectBlocks(VPBB1, R1);
-    VPBasicBlock *VPBB2 = new VPBasicBlock("VPBB2");
+    VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("VPBB2");
     VPBlockUtils::connectBlocks(R1, VPBB2);
 
     VPBlockUtils::connectBlocks(VPBB2, Plan.getScalarHeader());
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
index 2ab55f64a2073..5bcc2b8eb2e22 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
@@ -245,9 +245,9 @@ TEST_F(VPBasicBlockTest, getPlan) {
   {
     VPlan &Plan = getPlan();
     VPBasicBlock *VPBB1 = Plan.getEntry();
-    VPBasicBlock *VPBB2 = new VPBasicBlock();
-    VPBasicBlock *VPBB3 = new VPBasicBlock();
-    VPBasicBlock *VPBB4 = new VPBasicBlock();
+    VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
+    VPBasicBlock *VPBB3 = Plan.createVPBasicBlock("");
+    VPBasicBlock *VPBB4 = Plan.createVPBasicBlock("");
 
     //     VPBB1
     //     /   \
@@ -270,9 +270,9 @@ TEST_F(VPBasicBlockTest, getPlan) {
     VPlan &Plan = getPlan();
     VPBasicBlock *VPBB1 = Plan.getEntry();
     // VPBasicBlock is the entry into the VPlan, followed by a region.
-    VPBasicBlock *R1BB1 = new VPBasicBlock();
-    VPBasicBlock *R1BB2 = new VPBasicBlock();
-    VPRegionBlock *R1 = new VPRegionBlock(R1BB1, R1BB2, "R1");
+    VPBasicBlock *R1BB1 = Plan.createVPBasicBlock("");
+    VPBasicBlock *R1BB2 = Plan.createVPBasicBlock("");
+    VPRegionBlock *R1 = Plan.createVPRegionBlock(R1BB1, R1BB2, "R1");
     VPBlockUtils::connectBlocks(R1BB1, R1BB2);
 
     VPBlockUtils::connectBlocks(VPBB1, R1);
@@ -287,21 +287,21 @@ TEST_F(VPBasicBlockTest, getPlan) {
 
   {
     VPlan &Plan = getPlan();
-    VPBasicBlock *R1BB1 = new VPBasicBlock();
-    VPBasicBlock *R1BB2 = new VPBasicBlock();
-    VPRegionBlock *R1 = new VPRegionBlock(R1BB1, R1BB2, "R1");
+    VPBasicBlock *R1BB1 = Plan.createVPBasicBlock("");
+    VPBasicBlock *R1BB2 = Plan.createVPBasicBlock("");
+    VPRegionBlock *R1 = Plan.createVPRegionBlock(R1BB1, R1BB2, "R1");
     VPBlockUtils::connectBlocks(R1BB1, R1BB2);
 
-    VPBasicBlock *R2BB1 = new VPBasicBlock();
-    VPBasicBlock *R2BB2 = new VPBasicBlock();
-    VPRegionBlock *R2 = new VPRegionBlock(R2BB1, R2BB2, "R2");
+    VPBasicBlock *R2BB1 = Plan.createVPBasicBlock("");
+    VPBasicBlock *R2BB2 = Plan.createVPBasicBlock("");
+    VPRegionBlock *R2 = Plan.createVPRegionBlock(R2BB1, R2BB2, "R2");
     VPBlockUtils::connectBlocks(R2BB1, R2BB2);
 
     VPBasicBlock *VPBB1 = Plan.getEntry();
     VPBlockUtils::connectBlocks(VPBB1, R1);
     VPBlockUtils::connectBlocks(VPBB1, R2);
 
-    VPBasicBlock *VPBB2 = new VPBasicBlock();
+    VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
     VPBlockUtils::connectBlocks(R1, VPBB2);
     VPBlockUtils::connectBlocks(R2, VPBB2);
 
@@ -329,9 +329,9 @@ TEST_F(VPBasicBlockTest, TraversingIteratorTest) {
     //
     VPlan &Plan = getPlan();
     VPBasicBlock *VPBB1 = Plan.getEntry();
-    VPBasicBlock *VPBB2 = new VPBasicBlock();
-    VPBasicBlock *VPBB3 = new VPBasicBlock();
-    VPBasicBlock *VPBB4 = new VPBasicBlock();
+    VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
+    VPBasicBlock *VPBB3 = Plan.createVPBasicBlock("");
+    VPBasicBlock *VPBB4 = Plan.createVPBasicBlock("");
 
     VPBlockUtils::connectBlocks(VPBB1, VPBB2);
     VPBlockUtils::connectBlocks(VPBB1, VPBB3);
@@ -368,11 +368,11 @@ TEST_F(VPBasicBlockTest, TraversingIteratorTest) {
     //
     VPlan &Plan = getPlan();
     VPBasicBlock *VPBB0 = Plan.getEntry();
-    VPBasicBlock *R1BB1 = new VPBasicBlock();
-    VPBasicBlock *R1BB2 = new VPBasicBlock();
-    VPBasicBlock *R1BB3 = new VPBasicBlock();
-    VPBasicBlock *R1BB4 = new VPBasicBlock();
-    VPRegionBlock *R1 = new VPRegionBlock(R1BB1, R1BB4, "R1");
+    VPBasicBlock *R1BB1 = Plan.createVPBasicBlock("");
+    VPBasicBlock *R1BB2 = Plan.createVPBasicBlock("");
+    VPBasicBlock *R1BB3 = Plan.createVPBasicBlock("");
+    VPBasicBlock *R1BB4 = Plan.createVPBasicBlock("");
+    VPRegionBlock *R1 = Plan.createVPRegionBlock(R1BB1, R1BB4, "R1");
     R1BB2->setParent(R1);
     R1BB3->setParent(R1);
     VPBlockUtils::connectBlocks(VPBB0, R1);
@@ -383,9 +383,9 @@ TEST_F(VPBasicBlockTest, TraversingIteratorTest) {
     // Cycle.
     VPBlockUtils::connectBlocks(R1BB3, R1BB3);
 
-    VPBasicBlock *R2BB1 = new VPBasicBlock();
-    VPBasicBlock *R2BB2 = new VPBasicBlock();
-    VPRegionBlock *R2 = new VPRegionBlock(R2BB1, R2BB2, "R2");
+    VPBasicBlock *R2BB1 = Plan.createVPBasicBlock("");
+    VPBasicBlock *R2BB2 = Plan.createVPBasicBlock("");
+    VPRegionBlock *R2 = Plan.createVPRegionBlock(R2BB1, R2BB2, "R2");
     VPBlockUtils::connectBlocks(R2BB1, R2BB2);
     VPBlockUtils::connectBlocks(R1, R2);
 
@@ -467,15 +467,15 @@ TEST_F(VPBasicBlockTest, TraversingIteratorTest) {
     //  VPBB2
     //
     VPlan &Plan = getPlan();
-    VPBasicBlock *R1BB1 = new VPBasicBlock("R1BB1");
-    VPBasicBlock *R1BB2 = new VPBasicBlock("R1BB2");
-    VPBasicBlock *R1BB3 = new VPBasicBlock("R1BB3");
-    VPRegionBlock *R1 = new VPRegionBlock(R1BB1, R1BB3, "R1");
-
-    VPBasicBlock *R2BB1 = new VPBasicBlock("R2BB1");
-    VPBasicBlock *R2BB2 = new VPBasicBlock("R2BB2");
-    VPBasicBlock *R2BB3 = new VPBasicBlock("R2BB3");
-    VPRegionBlock *R2 = new VPRegionBlock(R2BB1, R2BB3, "R2");
+    VPBasicBlock *R1BB1 = Plan.createVPBasicBlock("R1BB1");
+    VPBasicBlock *R1BB2 = Plan.createVPBasicBlock("R1BB2");
+    VPBasicBlock *R1BB3 = Plan.createVPBasicBlock("R1BB3");
+    VPRegionBlock *R1 = Plan.createVPRegionBlock(R1BB1, R1BB3, "R1");
+
+    VPBasicBlock *R2BB1 = Plan.createVPBasicBlock("R2BB1");
+    VPBasicBlock *R2BB2 = Plan.createVPBasicBlock("R2BB2");
+    VPBasicBlock *R2BB3 = Plan.createVPBasicBlock("R2BB3");
+    VPRegionBlock *R2 = Plan.createVPRegionBlock(R2BB1, R2BB3, "R2");
     R2BB2->setParent(R2);
     VPBlockUtils::connectBlocks(R2BB1, R2BB2);
     VPBlockUtils::connectBlocks(R2BB2, R2BB1);
@@ -490,7 +490,7 @@ TEST_F(VPBasicBlockTest, TraversingIteratorTest) {
 
     VPBasicBlock *VPBB1 = Plan.getEntry();
     VPBlockUtils::connectBlocks(VPBB1, R1);
-    VPBasicBlock *VPBB2 = new VPBasicBlock("VPBB2");
+    VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("VPBB2");
     VPBlockUtils::connectBlocks(R1, VPBB2);
 
     // Depth-first.
@@ -538,12 +538,12 @@ TEST_F(VPBasicBlockTest, TraversingIteratorTest) {
     //   }
     //
     VPlan &Plan = getPlan();
-    VPBasicBlock *R2BB1 = new VPBasicBlock("R2BB1");
-    VPBasicBlock *R2BB2 = new VPBasicBlock("R2BB2");
-    VPRegionBlock *R2 = new VPRegionBlock(R2BB1, R2BB2, "R2");
+    VPBasicBlock *R2BB1 = Plan.createVPBasicBlock("R2BB1");
+    VPBasicBlock *R2BB2 = Plan.createVPBasicBlock("R2BB2");
+    VPRegionBlock *R2 = Plan.createVPRegionBlock(R2BB1, R2BB2, "R2");
     VPBlockUtils::connectBlocks(R2BB1, R2BB2);
 
-    VPRegionBlock *R1 = new VPRegionBlock(R2, R2, "R1");
+    VPRegionBlock *R1 = Plan.createVPRegionBlock(R2, R2, "R1");
     R2->setParent(R1);
 
     VPBasicBlock *VPBB1 = Plan.getEntry();
@@ -592,19 +592,19 @@ TEST_F(VPBasicBlockTest, TraversingIteratorTest) {
     //  VPBB2
     //
     VPlan &Plan = getPlan();
-    VPBasicBlock *R3BB1 = new VPBasicBlock("R3BB1");
-    VPRegionBlock *R3 = new VPRegionBlock(R3BB1, R3BB1, "R3");
+    VPBasicBlock *R3BB1 = Plan.createVPBasicBlock("R3BB1");
+    VPRegionBlock *R3 = Plan.createVPRegionBlock(R3BB1, R3BB1, "R3");
 
-    VPBasicBlock *R2BB1 = new VPBasicBlock("R2BB1");
-    VPRegionBlock *R2 = new VPRegionBlock(R2BB1, R3, "R2");
+    VPBasicBlock *R2BB1 = Plan.createVPBasicBlock("R2BB1");
+    VPRegionBlock *R2 = Plan.createVPRegionBlock(R2BB1, R3, "R2");
     R3->setParent(R2);
     VPBlockUtils::connectBlocks(R2BB1, R3);
 
-    VPRegionBlock *R1 = new VPRegionBlock(R2, R2, "R1");
+    VPRegionBlock *R1 = Plan.createVPRegionBlock(R2, R2, "R1");
     R2->setParent(R1);
 
     VPBasicBlock *VPBB1 = Plan.getEntry();
-    VPBasicBlock *VPBB2 = new VPBasicBlock("VPBB2");
+    VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("VPBB2");
     VPBlockUtils::connectBlocks(VPBB1, R1);
     VPBlockUtils::connectBlocks(R1, VPBB2);
 
@@ -674,7 +674,7 @@ TEST_F(VPBasicBlockTest, print) {
   VPInstruction *I2 = new VPInstruction(Instruction::Sub, {I1});
   VPInstruction *I3 = new VPInstruction(Instruction::Br, {I1, I2});
 
-  VPBasicBlock *VPBB1 = new VPBasicBlock();
+  VPBasicBlock *VPBB1 = Plan.createVPBasicBlock("");
   VPBB1->appendRecipe(I1);
   VPBB1->appendRecipe(I2);
   VPBB1->appendRecipe(I3);
@@ -682,7 +682,7 @@ TEST_F(VPBasicBlockTest, print) {
 
   VPInstruction *I4 = new VPInstruction(Instruction::Mul, {I2, I1});
   VPInstruction *I5 = new VPInstruction(Instruction::Ret, {I4});
-  VPBasicBlock *VPBB2 = new VPBasicBlock();
+  VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
   VPBB2->appendRecipe(I4);
   VPBB2->appendRecipe(I5);
   VPBB2->setName("bb2");
@@ -783,7 +783,7 @@ TEST_F(VPBasicBlockTest, printPlanWithVFsAndUFs) {
   VPBB0->appendRecipe(TC);
 
   VPInstruction *I1 = new VPInstruction(Instruction::Add, {});
-  VPBasicBlock *VPBB1 = new VPBasicBlock();
+  VPBasicBlock *VPBB1 = Plan.createVPBasicBlock("");
   VPBB1->appendRecipe(I1);
   VPBB1->setName("bb1");
 
@@ -1238,7 +1238,7 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
 TEST_F(VPRecipeTest, dumpRecipeInPlan) {
   VPlan &Plan = getPlan();
   VPBasicBlock *VPBB0 = Plan.getEntry();
-  VPBasicBlock *VPBB1 = new VPBasicBlock();
+  VPBasicBlock *VPBB1 = Plan.createVPBasicBlock("");
   VPBlockUtils::connectBlocks(VPBB1, Plan.getScalarHeader());
   VPBlockUtils::connectBlocks(VPBB0, VPBB1);
 
@@ -1307,7 +1307,7 @@ TEST_F(VPRecipeTest, dumpRecipeInPlan) {
 TEST_F(VPRecipeTest, dumpRecipeUnnamedVPValuesInPlan) {
   VPlan &Plan = getPlan();
   VPBasicBlock *VPBB0 = Plan.getEntry();
-  VPBasicBlock *VPBB1 = new VPBasicBlock();
+  VPBasicBlock *VPBB1 = Plan.createVPBasicBlock("");
   VPBlockUtils::connectBlocks(VPBB1, Plan.getScalarHeader());
   VPBlockUtils::connectBlocks(VPBB0, VPBB1);
 
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp
index 174249a7e85e3..f098ba0bce497 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp
@@ -27,8 +27,8 @@ TEST_F(VPVerifierTest, VPInstructionUseBeforeDefSameBB) {
   VPBB1->appendRecipe(UseI);
   VPBB1->appendRecipe(DefI);
 
-  VPBasicBlock *VPBB2 = new VPBasicBlock();
-  VPRegionBlock *R1 = new VPRegionBlock(VPBB2, VPBB2, "R1");
+  VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
+  VPRegionBlock *R1 = Plan.createVPRegionBlock(VPBB2, VPBB2, "R1");
   VPBlockUtils::connectBlocks(VPBB1, R1);
   VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader());
 
@@ -51,14 +51,14 @@ TEST_F(VPVerifierTest, VPInstructionUseBeforeDefDifferentBB) {
       new VPInstruction(VPInstruction::BranchOnCond, {CanIV});
 
   VPBasicBlock *VPBB1 = Plan.getEntry();
-  VPBasicBlock *VPBB2 = new VPBasicBlock();
+  VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
 
   VPBB1->appendRecipe(UseI);
   VPBB2->appendRecipe(CanIV);
   VPBB2->appendRecipe(DefI);
   VPBB2->appendRecipe(BranchOnCond);
 
-  VPRegionBlock *R1 = new VPRegionBlock(VPBB2, VPBB2, "R1");
+  VPRegionBlock *R1 = Plan.createVPRegionBlock(VPBB2, VPBB2, "R1");
   VPBlockUtils::connectBlocks(VPBB1, R1);
   VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader());
 
@@ -85,9 +85,9 @@ TEST_F(VPVerifierTest, VPBlendUseBeforeDefDifferentBB) {
 
   VPlan &Plan = getPlan();
   VPBasicBlock *VPBB1 = Plan.getEntry();
-  VPBasicBlock *VPBB2 = new VPBasicBlock();
-  VPBasicBlock *VPBB3 = new VPBasicBlock();
-  VPBasicBlock *VPBB4 = new VPBasicBlock();
+  VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
+  VPBasicBlock *VPBB3 = Plan.createVPBasicBlock("");
+  VPBasicBlock *VPBB4 = Plan.createVPBasicBlock("");
 
   VPBB1->appendRecipe(I1);
   VPBB2->appendRecipe(CanIV);
@@ -97,7 +97,7 @@ TEST_F(VPVerifierTest, VPBlendUseBeforeDefDifferentBB) {
 
   VPBlockUtils::connectBlocks(VPBB2, VPBB3);
   VPBlockUtils::connectBlocks(VPBB3, VPBB4);
-  VPRegionBlock *R1 = new VPRegionBlock(VPBB2, VPBB4, "R1");
+  VPRegionBlock *R1 = Plan.createVPRegionBlock(VPBB2, VPBB4, "R1");
   VPBlockUtils::connectBlocks(VPBB1, R1);
   VPBB3->setParent(R1);
 
@@ -125,14 +125,14 @@ TEST_F(VPVerifierTest, DuplicateSuccessorsOutsideRegion) {
 
   VPlan &Plan = getPlan();
   VPBasicBlock *VPBB1 = Plan.getEntry();
-  VPBasicBlock *VPBB2 = new VPBasicBlock();
+  VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
 
   VPBB1->appendRecipe(I1);
   VPBB1->appendRecipe(BranchOnCond2);
   VPBB2->appendRecipe(CanIV);
   VPBB2->appendRecipe(BranchOnCond);
 
-  VPRegionBlock *R1 = new VPRegionBlock(VPBB2, VPBB2, "R1");
+  VPRegionBlock *R1 = Plan.createVPRegionBlock(VPBB2, VPBB2, "R1");
   VPBlockUtils::connectBlocks(VPBB1, R1);
   VPBlockUtils::connectBlocks(VPBB1, R1);
 
@@ -158,8 +158,8 @@ TEST_F(VPVerifierTest, DuplicateSuccessorsInsideRegion) {
 
   VPlan &Plan = getPlan();
   VPBasicBlock *VPBB1 = Plan.getEntry();
-  VPBasicBlock *VPBB2 = new VPBasicBlock();
-  VPBasicBlock *VPBB3 = new VPBasicBlock();
+  VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
+  VPBasicBlock *VPBB3 = Plan.createVPBasicBlock("");
 
   VPBB1->appendRecipe(I1);
   VPBB2->appendRecipe(CanIV);
@@ -168,7 +168,7 @@ TEST_F(VPVerifierTest, DuplicateSuccessorsInsideRegion) {
 
   VPBlockUtils::connectBlocks(VPBB2, VPBB3);
   VPBlockUtils::connectBlocks(VPBB2, VPBB3);
-  VPRegionBlock *R1 = new VPRegionBlock(VPBB2, VPBB3, "R1");
+  VPRegionBlock *R1 = Plan.createVPRegionBlock(VPBB2, VPBB3, "R1");
   VPBlockUtils::connectBlocks(VPBB1, R1);
   VPBB3->setParent(R1);
 
@@ -187,7 +187,7 @@ TEST_F(VPVerifierTest, DuplicateSuccessorsInsideRegion) {
 TEST_F(VPVerifierTest, BlockOutsideRegionWithParent) {
   VPlan &Plan = getPlan();
   VPBasicBlock *VPBB1 = Plan.getEntry();
-  VPBasicBlock *VPBB2 = new VPBasicBlock();
+  VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
 
   VPInstruction *DefI = new VPInstruction(Instruction::Add, {});
   VPInstruction *BranchOnCond =
@@ -196,7 +196,7 @@ TEST_F(VPVerifierTest, BlockOutsideRegionWithParent) {
   VPBB1->appendRecipe(DefI);
   VPBB2->appendRecipe(BranchOnCond);
 
-  VPRegionBlock *R1 = new VPRegionBlock(VPBB2, VPBB2, "R1");
+  VPRegionBlock *R1 = Plan.createVPRegionBlock(VPBB2, VPBB2, "R1");
   VPBlockUtils::connectBlocks(VPBB1, R1);
 
   VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader());

From 50a457d9e89142e89a9b66de5c23c515b50d1f96 Mon Sep 17 00:00:00 2001
From: Michael Maitland 
Date: Mon, 30 Dec 2024 09:00:27 -0500
Subject: [PATCH 174/567] [RISCV][VLOPT] Add getOperandInfo for saturating
 signed multiply (#120351)

These instructions are covered by the existing tests. We don't add them to
isSupported because of VXSAT. This decision was made in #120358.
---
 llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
index 4e3212c70ee9b..85ea5a23e8f29 100644
--- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
@@ -415,6 +415,11 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   case RISCV::VASUBU_VX:
   case RISCV::VASUB_VV:
   case RISCV::VASUB_VX:
+  // Vector Single-Width Fractional Multiply with Rounding and Saturation
+  // EEW=SEW. EMUL=LMUL. The instruction produces 2*SEW product internally but
+  // saturates to fit into SEW bits.
+  case RISCV::VSMUL_VV:
+  case RISCV::VSMUL_VX:
   // Vector Single-Width Scaling Shift Instructions
   // EEW=SEW. EMUL=LMUL.
   case RISCV::VSSRL_VI:

From ff936ce62bda2f9148575caae527cc4c6ab282a5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?=
 
Date: Mon, 30 Dec 2024 15:07:28 +0100
Subject: [PATCH 175/567] [Clang][Driver][AMDGPU] Add missing space in missing
 device-libs error message (#121335)

Before/After:
> cannot find ROCm device **libraryfor** ABI version 6
> cannot find ROCm device **library for** ABI version 6
---
 clang/include/clang/Basic/DiagnosticDriverKinds.td | 2 +-
 clang/test/Driver/hip-device-libs.hip              | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
index 5155b23d151c0..42c39ac6606c7 100644
--- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -67,7 +67,7 @@ def err_drv_no_cuda_libdevice : Error<
   "libdevice">;
 
 def err_drv_no_rocm_device_lib : Error<
-  "cannot find ROCm device library%select{| for %1|for ABI version %1}0; provide its path via "
+  "cannot find ROCm device library%select{| for %1| for ABI version %1}0; provide its path via "
   "'--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build "
   "without ROCm device library">;
 def err_drv_no_hip_runtime : Error<
diff --git a/clang/test/Driver/hip-device-libs.hip b/clang/test/Driver/hip-device-libs.hip
index 6f1d31508e330..317fd79242697 100644
--- a/clang/test/Driver/hip-device-libs.hip
+++ b/clang/test/Driver/hip-device-libs.hip
@@ -253,5 +253,5 @@
 // NOABI4-NOT: error:
 // NOABI4-NOT: "-mlink-builtin-bitcode" "{{.*}}oclc_abi_version_400.bc"
 // NOABI4-NOT: "-mlink-builtin-bitcode" "{{.*}}oclc_abi_version_500.bc"
-// NOABI5: error: cannot find ROCm device libraryfor ABI version 5; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library
-// NOABI6: error: cannot find ROCm device libraryfor ABI version 6; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library
+// NOABI5: error: cannot find ROCm device library for ABI version 5; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library
+// NOABI6: error: cannot find ROCm device library for ABI version 6; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library

From 42dfaa15a60cea6cd75a2efb419aa2c206d2a127 Mon Sep 17 00:00:00 2001
From: Congcong Cai 
Date: Mon, 30 Dec 2024 22:35:46 +0800
Subject: [PATCH 176/567] [clang-tidy] add deprecation warning for
 non-whitelisted global options (#121057)

We plan to deprecate `StrictMode` and `IgnoreMacros` global options
after 2 major versions and support local options only for them.
This patch introduces the deprecation warning.
---
 .../clang-tidy/ClangTidyCheck.cpp             | 32 ++++++++++++-------
 clang-tools-extra/docs/ReleaseNotes.rst       |  3 +-
 .../checkers/modernize/use-std-format-fmt.cpp |  2 +-
 .../deprecation-global-option.cpp             |  3 ++
 4 files changed, 27 insertions(+), 13 deletions(-)
 create mode 100644 clang-tools-extra/test/clang-tidy/infrastructure/deprecation-global-option.cpp

diff --git a/clang-tools-extra/clang-tidy/ClangTidyCheck.cpp b/clang-tools-extra/clang-tidy/ClangTidyCheck.cpp
index 6028bb2258136..4aa9fe228ee79 100644
--- a/clang-tools-extra/clang-tidy/ClangTidyCheck.cpp
+++ b/clang-tools-extra/clang-tidy/ClangTidyCheck.cpp
@@ -7,11 +7,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "ClangTidyCheck.h"
-#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/Support/Error.h"
+#include "llvm/ADT/StringSet.h"
 #include "llvm/Support/YAMLParser.h"
 #include 
+#include 
 
 namespace clang::tidy {
 
@@ -62,16 +62,29 @@ ClangTidyCheck::OptionsView::get(StringRef LocalName) const {
   return std::nullopt;
 }
 
+static const llvm::StringSet<> DeprecatedGlobalOptions{
+    "StrictMode",
+    "IgnoreMacros",
+};
+
 static ClangTidyOptions::OptionMap::const_iterator
 findPriorityOption(const ClangTidyOptions::OptionMap &Options,
                    StringRef NamePrefix, StringRef LocalName,
-                   llvm::StringSet<> *Collector) {
+                   ClangTidyContext *Context) {
+  llvm::StringSet<> *Collector = Context->getOptionsCollector();
   if (Collector) {
     Collector->insert((NamePrefix + LocalName).str());
     Collector->insert(LocalName);
   }
   auto IterLocal = Options.find((NamePrefix + LocalName).str());
   auto IterGlobal = Options.find(LocalName);
+  // FIXME: temporary solution for deprecation warnings, should be removed
+  // after 22.x. Warns when the configuration depends on deprecated global options.
+  if (IterLocal == Options.end() && IterGlobal != Options.end() &&
+      DeprecatedGlobalOptions.contains(LocalName))
+    Context->configurationDiag(
+        "global option '%0' is deprecated, please use '%1%0' instead.")
+        << LocalName << NamePrefix;
   if (IterLocal == Options.end())
     return IterGlobal;
   if (IterGlobal == Options.end())
@@ -83,8 +96,7 @@ findPriorityOption(const ClangTidyOptions::OptionMap &Options,
 
 std::optional
 ClangTidyCheck::OptionsView::getLocalOrGlobal(StringRef LocalName) const {
-  auto Iter = findPriorityOption(CheckOptions, NamePrefix, LocalName,
-                                 Context->getOptionsCollector());
+  auto Iter = findPriorityOption(CheckOptions, NamePrefix, LocalName, Context);
   if (Iter != CheckOptions.end())
     return StringRef(Iter->getValue().Value);
   return std::nullopt;
@@ -117,8 +129,7 @@ ClangTidyCheck::OptionsView::get(StringRef LocalName) const {
 template <>
 std::optional
 ClangTidyCheck::OptionsView::getLocalOrGlobal(StringRef LocalName) const {
-  auto Iter = findPriorityOption(CheckOptions, NamePrefix, LocalName,
-                                 Context->getOptionsCollector());
+  auto Iter = findPriorityOption(CheckOptions, NamePrefix, LocalName, Context);
   if (Iter != CheckOptions.end()) {
     if (auto Result = getAsBool(Iter->getValue().Value, Iter->getKey()))
       return Result;
@@ -157,10 +168,9 @@ std::optional ClangTidyCheck::OptionsView::getEnumInt(
     bool IgnoreCase) const {
   if (!CheckGlobal && Context->getOptionsCollector())
     Context->getOptionsCollector()->insert((NamePrefix + LocalName).str());
-  auto Iter = CheckGlobal
-                  ? findPriorityOption(CheckOptions, NamePrefix, LocalName,
-                                       Context->getOptionsCollector())
-                  : CheckOptions.find((NamePrefix + LocalName).str());
+  auto Iter = CheckGlobal ? findPriorityOption(CheckOptions, NamePrefix,
+                                               LocalName, Context)
+                          : CheckOptions.find((NamePrefix + LocalName).str());
   if (Iter == CheckOptions.end())
     return std::nullopt;
 
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 3cab440155250..1fd9b6077be5f 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -119,7 +119,8 @@ Improvements to clang-tidy
 
 - Removed :program:`clang-tidy`'s global options for most of checks. All options
   are changed to local options except `IncludeStyle`, `StrictMode` and
-  `IgnoreMacros`.
+  `IgnoreMacros`. Global scoped `StrictMode` and `IgnoreMacros` are deprecated
+  and will be removed in further releases.
 
 .. csv-table::
   :header: "Check", "Options removed from global option"
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-format-fmt.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-format-fmt.cpp
index 1eaf18ac11996..71c8af190467c 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-format-fmt.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-format-fmt.cpp
@@ -1,6 +1,6 @@
 // RUN: %check_clang_tidy %s modernize-use-std-format %t -- \
 // RUN:   -config="{CheckOptions: { \
-// RUN:              StrictMode: true, \
+// RUN:              modernize-use-std-format.StrictMode: true, \
 // RUN:              modernize-use-std-format.StrFormatLikeFunctions: 'fmt::sprintf', \
 // RUN:              modernize-use-std-format.ReplacementFormatFunction: 'fmt::format', \
 // RUN:              modernize-use-std-format.FormatHeader: '' \
diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/deprecation-global-option.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/deprecation-global-option.cpp
new file mode 100644
index 0000000000000..4c9854d221832
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/infrastructure/deprecation-global-option.cpp
@@ -0,0 +1,3 @@
+// RUN: clang-tidy %s --config="{CheckOptions:{StrictMode: true}}" -checks="-*,modernize-use-std-format" | FileCheck %s 
+
+// CHECK: warning: global option 'StrictMode' is deprecated, please use 'modernize-use-std-format.StrictMode' instead. [clang-tidy-config]

From 79af7bdd4e415aa8a94263a4507b51862fba882f Mon Sep 17 00:00:00 2001
From: Longsheng Mou 
Date: Mon, 30 Dec 2024 23:12:55 +0800
Subject: [PATCH 177/567] [mlir][tosa] Add `AllElementTypesMatch` trait for
 `tosa.transpose` (#120964)

This PR adds `AllElementTypesMatch` trait for `tosa.transpose` to ensure
output tensor of same type as the input tensor. Fixes #119364.
---
 mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td       | 3 ++-
 mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp | 4 ----
 mlir/test/Dialect/Tosa/constant-op-fold.mlir       | 9 ---------
 mlir/test/Dialect/Tosa/invalid.mlir                | 9 +++++++++
 4 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
index e3c725801d162..8ae5d3ab417b6 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
@@ -1698,7 +1698,8 @@ def Tosa_TileOp : Tosa_InferShapedTypeOp<"tile"> {
 // Operator: transpose
 //===----------------------------------------------------------------------===//
 def Tosa_TransposeOp : Tosa_InferShapedTypeOp<"transpose",
-                [DeclareOpInterfaceMethods]> {
+                [DeclareOpInterfaceMethods,
+                 AllElementTypesMatch<["input1", "output"]>]> {
   let summary = "Transpose operator";
 
   let description = [{
diff --git a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
index 39d0ee122b163..f51c3dbce6eef 100644
--- a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
+++ b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
@@ -1002,10 +1002,6 @@ OpFoldResult TransposeOp::fold(FoldAdaptor adaptor) {
       return input.reshape(resultTy);
   }
 
-  // Transpose does not change the input type.
-  if (getInput1().getType() != getType())
-    return {};
-
   // Transpose is not the identity transpose.
   SmallVector perms;
   if (getConstantPerms(perms).failed())
diff --git a/mlir/test/Dialect/Tosa/constant-op-fold.mlir b/mlir/test/Dialect/Tosa/constant-op-fold.mlir
index 2902c4a62009e..8198903b78ac0 100644
--- a/mlir/test/Dialect/Tosa/constant-op-fold.mlir
+++ b/mlir/test/Dialect/Tosa/constant-op-fold.mlir
@@ -117,15 +117,6 @@ func.func @transpose_nofold_multi_users() -> (tensor<3x2xf32>, tensor<2x3xf32>)
   return %1, %input : tensor<3x2xf32>, tensor<2x3xf32>
 }
 
-// CHECK-LABEL: @transpose_nofold_quantized_types
-func.func @transpose_nofold_quantized_types() -> tensor<1x1x2x2x!quant.uniform:f32:3, {1.000000e-01,1.000000e-01}>> {
-  %perms = "tosa.const"() {value = dense<[1, 2, 3, 0]> : tensor<4xi32>} : () -> tensor<4xi32>
-  %input = "tosa.const"() {value = dense<-127> : tensor<2x1x1x2xi8>} : () -> tensor<2x1x1x2xi8>
-  // CHECK: tosa.transpose
-  %0 = tosa.transpose %input, %perms : (tensor<2x1x1x2xi8>, tensor<4xi32>) -> tensor<1x1x2x2x!quant.uniform:f32:3, {1.000000e-01,1.000000e-01}>>
-  return %0: tensor<1x1x2x2x!quant.uniform:f32:3, {1.000000e-01,1.000000e-01}>>
-}
-
 // CHECK-LABEL: @transpose_nofold_dense_resource
 func.func @transpose_nofold_dense_resource() -> tensor<2x2xf32> {
   %0 = "tosa.const"() <{value = dense_resource : tensor<2x2xf32>}> : () -> tensor<2x2xf32>
diff --git a/mlir/test/Dialect/Tosa/invalid.mlir b/mlir/test/Dialect/Tosa/invalid.mlir
index cca50b25d14d6..b796a6343e5ed 100644
--- a/mlir/test/Dialect/Tosa/invalid.mlir
+++ b/mlir/test/Dialect/Tosa/invalid.mlir
@@ -206,6 +206,15 @@ func.func @test_transpose_invalid_permutation_types_dynamic_dim_ok(%arg0: tensor
 
 // -----
 
+func.func @test_transpose_element_type_mismatch(%arg0: tensor<2x3xi32>) -> tensor<3x2xf32> {
+  %perms = "tosa.const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32>
+  // expected-error@+1 {{'tosa.transpose' op failed to verify that all of {input1, output} have same element type}}
+  %1 = tosa.transpose %arg0, %perms : (tensor<2x3xi32>, tensor<2xi32>) -> tensor<3x2xf32>
+  return %1 : tensor<3x2xf32>
+}
+
+// -----
+
 func.func @test_fully_connected_non_const(%arg0: tensor<13x21x3xf32>, %arg1: tensor<2x3xf32>) -> tensor<273x2xf32> {
   %0 = "tosa.const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32>
   %1 = tosa.reshape %arg0 {new_shape = array} : (tensor<13x21x3xf32>) -> tensor<273x3xf32>

From b3a7ab6f1f6954bfb1da0683aa5d03a2837c7065 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim 
Date: Mon, 30 Dec 2024 16:08:35 +0000
Subject: [PATCH 178/567] [DAG] Don't allow implicit truncation in
 extract_element(bitcast(scalar_to_vector(X))) -> trunc(srl(X,C)) fold

Limits #117900 to only fold when scalar_to_vector doesn't perform implicit truncation, as the scaled shift calculation doesn't currently account for this - this can be addressed in a future update.

Fixes #121306
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  5 ++++-
 .../CodeGen/PowerPC/scalar_vector_test_5.ll   | 19 +++++++++++++++----
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 6cbfef2d238bb..6b2501591c81a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -23088,8 +23088,11 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
     if (ExtractIndex == BCTruncElt && BCSrc.getValueType().isScalarInteger())
       return DAG.getAnyExtOrTrunc(BCSrc, DL, ScalarVT);
 
+    // TODO: Add support for SCALAR_TO_VECTOR implicit truncation.
     if (LegalTypes && BCSrc.getValueType().isInteger() &&
-        BCSrc.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+        BCSrc.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+        BCSrc.getScalarValueSizeInBits() ==
+            BCSrc.getOperand(0).getScalarValueSizeInBits()) {
       // ext_elt (bitcast (scalar_to_vec i64 X to v2i64) to v4i32), TruncElt -->
       // trunc i64 X to i32
       SDValue X = BCSrc.getOperand(0);
diff --git a/llvm/test/CodeGen/PowerPC/scalar_vector_test_5.ll b/llvm/test/CodeGen/PowerPC/scalar_vector_test_5.ll
index b6799c8a88e0c..f62f70ca7ac1c 100644
--- a/llvm/test/CodeGen/PowerPC/scalar_vector_test_5.ll
+++ b/llvm/test/CodeGen/PowerPC/scalar_vector_test_5.ll
@@ -11,24 +11,35 @@
 define i8 @scalar_to_vector_half(ptr nocapture readonly %ad) {
 ; P9LE-LABEL: scalar_to_vector_half:
 ; P9LE:       # %bb.0: # %entry
-; P9LE-NEXT:    lhz r3, 0(r3)
+; P9LE-NEXT:    lxsihzx v2, 0, r3
+; P9LE-NEXT:    li r3, 0
+; P9LE-NEXT:    vsplth v2, v2, 3
+; P9LE-NEXT:    vextubrx r3, r3, v2
 ; P9LE-NEXT:    blr
 ;
 ; P9BE-LABEL: scalar_to_vector_half:
 ; P9BE:       # %bb.0: # %entry
-; P9BE-NEXT:    lhz r3, 0(r3)
-; P9BE-NEXT:    srwi r3, r3, 24
+; P9BE-NEXT:    lxsihzx v2, 0, r3
+; P9BE-NEXT:    li r3, 0
+; P9BE-NEXT:    vsplth v2, v2, 3
+; P9BE-NEXT:    vextublx r3, r3, v2
 ; P9BE-NEXT:    blr
 ;
 ; P8LE-LABEL: scalar_to_vector_half:
 ; P8LE:       # %bb.0: # %entry
 ; P8LE-NEXT:    lhz r3, 0(r3)
+; P8LE-NEXT:    mtfprd f0, r3
+; P8LE-NEXT:    mffprd r3, f0
+; P8LE-NEXT:    clrldi r3, r3, 56
 ; P8LE-NEXT:    blr
 ;
 ; P8BE-LABEL: scalar_to_vector_half:
 ; P8BE:       # %bb.0: # %entry
 ; P8BE-NEXT:    lhz r3, 0(r3)
-; P8BE-NEXT:    srwi r3, r3, 24
+; P8BE-NEXT:    sldi r3, r3, 48
+; P8BE-NEXT:    mtfprd f0, r3
+; P8BE-NEXT:    mffprd r3, f0
+; P8BE-NEXT:    rldicl r3, r3, 8, 56
 ; P8BE-NEXT:    blr
 entry:
     %0 = load <2 x i8>, ptr %ad, align 1

From c7d237085bf9102ecf0c9105d8cc7fd94b752a3a Mon Sep 17 00:00:00 2001
From: "Oleksandr \"Alex\" Zinenko" 
Date: Mon, 30 Dec 2024 08:27:53 -0800
Subject: [PATCH 179/567] [mlir] add a simple pygments lexer (#120942)

This enables syntax highlighting of MLIR using the Pygments package in
Python, which is in turn usable from LaTeX via the minted package.
---
 mlir/utils/pygments/README.md     | 45 +++++++++++++++++++++++++++++++
 mlir/utils/pygments/mlir_lexer.py | 38 ++++++++++++++++++++++++++
 2 files changed, 83 insertions(+)
 create mode 100644 mlir/utils/pygments/README.md
 create mode 100644 mlir/utils/pygments/mlir_lexer.py

diff --git a/mlir/utils/pygments/README.md b/mlir/utils/pygments/README.md
new file mode 100644
index 0000000000000..838faceb01b0f
--- /dev/null
+++ b/mlir/utils/pygments/README.md
@@ -0,0 +1,45 @@
+## Pygments Lexer for MLIR
+
+This file contains a simple Pygments lexer configuration for MLIR, derived from
+the version used in the original CGO paper. Pygments allows for advanced
+configurable syntax highlighting of any code. This lexer is known to be
+incomplete and support mostly core IR with a subset of built-in types.
+Additions and customizations are welcome.
+
+### Standalone Usage
+
+Install Pygments, e.g., by running `pip install Pygments` or a Python package
+manager of your choosing. Use the standalone `pygmentize` command by
+instructing it to load the custom lexer:
+
+```
+pygmentize -l /path/to/mlir_lexer.py:MlirLexer -x myfile.mlir
+```
+
+This will produce highlighted output in the terminal. Other output formats are
+available, see Pygments [documentation](https://pygments.org/docs/) for more
+information.
+
+### LaTeX Usage
+
+First, make sure your distribution includes the `minted` package and list in
+the preamble.
+
+```latex
+\usepackage{minted}
+```
+
+Place the `mlir_lexer.py` in a place where the `latex` binary can find it,
+typically in the working directory next to the main `.tex` file. Note that you
+will have to invoke `latex` with the `-shell-escape` flag. See the `minted` 
+package [documentation](https://ctan.org/pkg/minted?lang=en) for more
+information.
+
+Leverage the custom lexer facility of `minted` to use this lexer in your
+document as:
+
+```latex
+\begin{minted}{mlir_lexer.py:MlirLexer -x}
+   ... your code here ...
+\end{minted}
+```
diff --git a/mlir/utils/pygments/mlir_lexer.py b/mlir/utils/pygments/mlir_lexer.py
new file mode 100644
index 0000000000000..179a058e9110c
--- /dev/null
+++ b/mlir/utils/pygments/mlir_lexer.py
@@ -0,0 +1,38 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+from pygments.lexer import RegexLexer
+from pygments.token import *
+
+
+class MlirLexer(RegexLexer):
+    name = "MLIR"
+    aliases = ["mlir"]
+    filenames = ["*.mlir"]
+
+    tokens = {
+        "root": [
+            (r"%[a-zA-Z0-9_]+", Name.Variable),
+            (r"@[a-zA-Z_][a-zA-Z0-9_]+", Name.Function),
+            (r"\^[a-zA-Z0-9_]+", Name.Label),
+            (r"#[a-zA-Z0-9_]+", Name.Constant),
+            (r"![a-zA-Z0-9_]+", Keyword.Type),
+            (r"[a-zA-Z_][a-zA-Z0-9_]*\.", Name.Entity),
+            (r"memref[^.]", Keyword.Type),
+            (r"index", Keyword.Type),
+            (r"i[0-9]+", Keyword.Type),
+            (r"f[0-9]+", Keyword.Type),
+            (r"[0-9]+", Number.Integer),
+            (r"[0-9]*\.[0-9]*", Number.Float),
+            (r'"[^"]*"', String.Double),
+            (r"affine_map", Keyword.Reserved),
+            # TODO: this should be within affine maps only
+            (r"\+-\*\/", Operator),
+            (r"floordiv", Operator.Word),
+            (r"ceildiv", Operator.Word),
+            (r"mod", Operator.Word),
+            (r"()\[\]<>,{}", Punctuation),
+            (r"\/\/.*\n", Comment.Single),
+        ]
+    }

From a74f825a7acec4962bb4c172da7ed0028f7b4d44 Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu 
Date: Mon, 30 Dec 2024 09:23:51 -0800
Subject: [PATCH 180/567] [MIPatternMatch] Add m_DeferredReg/Type (#121218)

This pattern does the same thing as m_SpecificReg/Type except the value
it matches against originated from an earlier pattern in the same
mi_match expression.

This patch also changes how commutative patterns are handled: in order
to support m_DefferedReg/Type, we always have to run the LHS-pattern
before the RHS one.
---
 .../llvm/CodeGen/GlobalISel/MIPatternMatch.h  | 52 +++++++++++++++++--
 .../CodeGen/GlobalISel/PatternMatchTest.cpp   | 30 +++++++++++
 2 files changed, 78 insertions(+), 4 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
index 47417f53b6e40..78a92c86b91e4 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
@@ -372,6 +372,36 @@ inline bind_ty m_Type(LLT &Ty) { return Ty; }
 inline bind_ty m_Pred(CmpInst::Predicate &P) { return P; }
 inline operand_type_match m_Pred() { return operand_type_match(); }
 
+template  struct deferred_helper {
+  static bool match(const MachineRegisterInfo &MRI, BindTy &VR, BindTy &V) {
+    return VR == V;
+  }
+};
+
+template <> struct deferred_helper {
+  static bool match(const MachineRegisterInfo &MRI, LLT VT, Register R) {
+    return VT == MRI.getType(R);
+  }
+};
+
+template  struct deferred_ty {
+  Class &VR;
+
+  deferred_ty(Class &V) : VR(V) {}
+
+  template  bool match(const MachineRegisterInfo &MRI, ITy &&V) {
+    return deferred_helper::match(MRI, VR, V);
+  }
+};
+
+/// Similar to m_SpecificReg/Type, but the specific value to match originated
+/// from an earlier sub-pattern in the same mi_match expression. For example,
+/// we cannot match `(add X, X)` with `m_GAdd(m_Reg(X), m_SpecificReg(X))`
+/// because `X` is not initialized at the time it's passed to `m_SpecificReg`.
+/// Instead, we can use `m_GAdd(m_Reg(X), m_DeferredReg(X))`.
+inline deferred_ty m_DeferredReg(Register &R) { return R; }
+inline deferred_ty m_DeferredType(LLT &Ty) { return Ty; }
+
 struct ImplicitDefMatch {
   bool match(const MachineRegisterInfo &MRI, Register Reg) {
     MachineInstr *TmpMI;
@@ -401,8 +431,13 @@ struct BinaryOp_match {
       if (TmpMI->getOpcode() == Opcode && TmpMI->getNumOperands() == 3) {
         return (L.match(MRI, TmpMI->getOperand(1).getReg()) &&
                 R.match(MRI, TmpMI->getOperand(2).getReg())) ||
-               (Commutable && (R.match(MRI, TmpMI->getOperand(1).getReg()) &&
-                               L.match(MRI, TmpMI->getOperand(2).getReg())));
+               // NOTE: When trying the alternative operand ordering
+               // with a commutative operation, it is imperative to always run
+               // the LHS sub-pattern (i.e. `L`) before the RHS sub-pattern
+               // (i.e. `R`). Otherwise, m_DeferredReg/Type will not work as
+               // expected.
+               (Commutable && (L.match(MRI, TmpMI->getOperand(2).getReg()) &&
+                               R.match(MRI, TmpMI->getOperand(1).getReg())));
       }
     }
     return false;
@@ -426,8 +461,13 @@ struct BinaryOpc_match {
           TmpMI->getNumOperands() == 3) {
         return (L.match(MRI, TmpMI->getOperand(1).getReg()) &&
                 R.match(MRI, TmpMI->getOperand(2).getReg())) ||
-               (Commutable && (R.match(MRI, TmpMI->getOperand(1).getReg()) &&
-                               L.match(MRI, TmpMI->getOperand(2).getReg())));
+               // NOTE: When trying the alternative operand ordering
+               // with a commutative operation, it is imperative to always run
+               // the LHS sub-pattern (i.e. `L`) before the RHS sub-pattern
+               // (i.e. `R`). Otherwise, m_DeferredReg/Type will not work as
+               // expected.
+               (Commutable && (L.match(MRI, TmpMI->getOperand(2).getReg()) &&
+                               R.match(MRI, TmpMI->getOperand(1).getReg())));
       }
     }
     return false;
@@ -674,6 +714,10 @@ struct CompareOp_match {
     Register RHS = TmpMI->getOperand(3).getReg();
     if (L.match(MRI, LHS) && R.match(MRI, RHS))
       return true;
+    // NOTE: When trying the alternative operand ordering
+    // with a commutative operation, it is imperative to always run
+    // the LHS sub-pattern (i.e. `L`) before the RHS sub-pattern
+    // (i.e. `R`). Otherwise, m_DeferredReg/Type will not work as expected.
     if (Commutable && L.match(MRI, RHS) && R.match(MRI, LHS) &&
         P.match(MRI, CmpInst::getSwappedPredicate(TmpPred)))
       return true;
diff --git a/llvm/unittests/CodeGen/GlobalISel/PatternMatchTest.cpp b/llvm/unittests/CodeGen/GlobalISel/PatternMatchTest.cpp
index fc76d4055722e..40cd055c1c3f8 100644
--- a/llvm/unittests/CodeGen/GlobalISel/PatternMatchTest.cpp
+++ b/llvm/unittests/CodeGen/GlobalISel/PatternMatchTest.cpp
@@ -920,6 +920,36 @@ TEST_F(AArch64GISelMITest, MatchSpecificReg) {
   EXPECT_TRUE(mi_match(Add.getReg(0), *MRI, m_GAdd(m_SpecificReg(Reg), m_Reg())));
 }
 
+TEST_F(AArch64GISelMITest, DeferredMatching) {
+  setUp();
+  if (!TM)
+    GTEST_SKIP();
+  auto s64 = LLT::scalar(64);
+  auto s32 = LLT::scalar(32);
+
+  auto Cst1 = B.buildConstant(s64, 42);
+  auto Cst2 = B.buildConstant(s64, 314);
+  auto Add = B.buildAdd(s64, Cst1, Cst2);
+  auto Sub = B.buildSub(s64, Add, Cst1);
+
+  auto TruncAdd = B.buildTrunc(s32, Add);
+  auto TruncSub = B.buildTrunc(s32, Sub);
+  auto NarrowAdd = B.buildAdd(s32, TruncAdd, TruncSub);
+
+  Register X;
+  EXPECT_TRUE(mi_match(Sub.getReg(0), *MRI,
+                       m_GSub(m_GAdd(m_Reg(X), m_Reg()), m_DeferredReg(X))));
+  LLT Ty;
+  EXPECT_TRUE(
+      mi_match(NarrowAdd.getReg(0), *MRI,
+               m_GAdd(m_GTrunc(m_Type(Ty)), m_GTrunc(m_DeferredType(Ty)))));
+
+  // Test commutative.
+  auto Add2 = B.buildAdd(s64, Sub, Cst1);
+  EXPECT_TRUE(mi_match(Add2.getReg(0), *MRI,
+                       m_GAdd(m_Reg(X), m_GSub(m_Reg(), m_DeferredReg(X)))));
+}
+
 } // namespace
 
 int main(int argc, char **argv) {

From c2be48a6ce87eeaa37e2a0ab531e6e5812dbfa55 Mon Sep 17 00:00:00 2001
From: Florian Hahn 
Date: Mon, 30 Dec 2024 17:36:48 +0000
Subject: [PATCH 181/567] [LV] Add additional tests with induction users.

Adds test coverage of post-inc IV users with different opcodes.
---
 .../LoopVectorize/iv_outside_user.ll          | 276 ++++++++++++++++++
 1 file changed, 276 insertions(+)

diff --git a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll
index 10b6d1f7653da..fee10cf013bac 100644
--- a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll
@@ -502,3 +502,279 @@ exit:
   %iv.2.lcssa = phi i32 [ %iv.2, %loop ]
   ret i32 %iv.2.lcssa
 }
+
+define i32 @postinc_sub(i32 %k)  {
+; CHECK-LABEL: define i32 @postinc_sub(
+; CHECK-SAME: i32 [[K:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[K]], 2
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[K]], 2
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[K]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP0:%.*]] = sub i32 [[K]], [[N_VEC]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[K]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[K]], %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INC_PHI:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[INC]] = sub nsw i32 [[INC_PHI]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[INC]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label %[[FOR_END]], label %[[FOR_BODY]], {{!llvm.loop ![0-9]+}}
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], %[[FOR_BODY]] ], [ [[TMP0]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[INC_LCSSA]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %inc.phi = phi i32 [ %k, %entry ], [ %inc, %for.body ]
+  %inc = sub nsw i32 %inc.phi, 1
+  %cmp = icmp eq i32 %inc, 0
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+  ret i32 %inc
+}
+
+define i32 @postinc_swapped_ops(i32 %k)  {
+; CHECK-LABEL: define i32 @postinc_swapped_ops(
+; CHECK-SAME: i32 [[K:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[K]], 2
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[K]], 2
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[K]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP0]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[K]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INC_PHI:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[INC]] = add nsw i32 1, [[INC_PHI]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[INC]], [[K]]
+; CHECK-NEXT:    br i1 [[CMP]], label %[[FOR_END]], label %[[FOR_BODY]], {{!llvm.loop ![0-9]+}}
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], %[[FOR_BODY]] ], [ [[N_VEC]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[INC_LCSSA]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %inc.phi = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %inc = add nsw i32 1, %inc.phi
+  %cmp = icmp eq i32 %inc, %k
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+  ret i32 %inc
+}
+
+define i32 @postinc_not_iv_backedge_value(i32 %k)  {
+; VEC-LABEL: define i32 @postinc_not_iv_backedge_value(
+; VEC-SAME: i32 [[K:%.*]]) {
+; VEC-NEXT:  [[ENTRY:.*]]:
+; VEC-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[K]], 2
+; VEC-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VEC:       [[VECTOR_PH]]:
+; VEC-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[K]], 2
+; VEC-NEXT:    [[N_VEC:%.*]] = sub i32 [[K]], [[N_MOD_VF]]
+; VEC-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VEC:       [[VECTOR_BODY]]:
+; VEC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT:    [[TMP0:%.*]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
+; VEC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; VEC-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
+; VEC-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; VEC-NEXT:    br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[MIDDLE_BLOCK]]:
+; VEC-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
+; VEC-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[K]], [[N_VEC]]
+; VEC-NEXT:    br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; VEC:       [[SCALAR_PH]]:
+; VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VEC-NEXT:    br label %[[FOR_BODY:.*]]
+; VEC:       [[FOR_BODY]]:
+; VEC-NEXT:    [[INC_PHI:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; VEC-NEXT:    [[INC]] = add nsw i32 [[INC_PHI]], 1
+; VEC-NEXT:    [[INC_2:%.*]] = add i32 [[INC_PHI]], 2
+; VEC-NEXT:    [[CMP:%.*]] = icmp eq i32 [[INC]], [[K]]
+; VEC-NEXT:    br i1 [[CMP]], label %[[FOR_END]], label %[[FOR_BODY]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[FOR_END]]:
+; VEC-NEXT:    [[INC_2_LCSSA:%.*]] = phi i32 [ [[INC_2]], %[[FOR_BODY]] ], [ [[TMP2]], %[[MIDDLE_BLOCK]] ]
+; VEC-NEXT:    ret i32 [[INC_2_LCSSA]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %inc.phi = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %inc = add nsw i32 %inc.phi, 1
+  %inc.2 = add i32 %inc.phi, 2
+  %cmp = icmp eq i32 %inc, %k
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+  ret i32 %inc.2
+}
+
+define float @fp_postinc_use_fadd(float %init, ptr noalias nocapture %A, i64 %N, float %fpinc) {
+; VEC-LABEL: define float @fp_postinc_use_fadd(
+; VEC-SAME: float [[INIT:%.*]], ptr noalias nocapture [[A:%.*]], i64 [[N:%.*]], float [[FPINC:%.*]]) {
+; VEC-NEXT:  [[ENTRY:.*]]:
+; VEC-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; VEC-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VEC:       [[VECTOR_PH]]:
+; VEC-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; VEC-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VEC-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
+; VEC-NEXT:    [[TMP0:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]]
+; VEC-NEXT:    [[TMP1:%.*]] = fadd fast float [[INIT]], [[TMP0]]
+; VEC-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[INIT]], i64 0
+; VEC-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer
+; VEC-NEXT:    [[DOTSPLATINSERT1:%.*]] = insertelement <2 x float> poison, float [[FPINC]], i64 0
+; VEC-NEXT:    [[DOTSPLAT2:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT1]], <2 x float> poison, <2 x i32> zeroinitializer
+; VEC-NEXT:    [[TMP2:%.*]] = fmul fast <2 x float> , [[DOTSPLAT2]]
+; VEC-NEXT:    [[INDUCTION:%.*]] = fadd fast <2 x float> [[DOTSPLAT]], [[TMP2]]
+; VEC-NEXT:    [[TMP3:%.*]] = fmul fast float [[FPINC]], 2.000000e+00
+; VEC-NEXT:    [[DOTSPLATINSERT3:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i64 0
+; VEC-NEXT:    [[DOTSPLAT4:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT3]], <2 x float> poison, <2 x i32> zeroinitializer
+; VEC-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VEC:       [[VECTOR_BODY]]:
+; VEC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT:    [[VEC_IND:%.*]] = phi <2 x float> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; VEC-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]]
+; VEC-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
+; VEC-NEXT:    store <2 x float> [[VEC_IND]], ptr [[TMP6]], align 4
+; VEC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VEC-NEXT:    [[VEC_IND_NEXT]] = fadd fast <2 x float> [[VEC_IND]], [[DOTSPLAT4]]
+; VEC-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VEC-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[MIDDLE_BLOCK]]:
+; VEC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VEC-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VEC:       [[SCALAR_PH]]:
+; VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VEC-NEXT:    [[BC_RESUME_VAL5:%.*]] = phi float [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[INIT]], %[[ENTRY]] ]
+; VEC-NEXT:    br label %[[LOOP:.*]]
+; VEC:       [[LOOP]]:
+; VEC-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VEC-NEXT:    [[FP_IV:%.*]] = phi float [ [[BC_RESUME_VAL5]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ]
+; VEC-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; VEC-NEXT:    store float [[FP_IV]], ptr [[GEP_A]], align 4
+; VEC-NEXT:    [[ADD]] = fadd fast float [[FP_IV]], [[FPINC]]
+; VEC-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VEC-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VEC-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[EXIT]]:
+; VEC-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[LOOP]] ], [ [[TMP1]], %[[MIDDLE_BLOCK]] ]
+; VEC-NEXT:    ret float [[ADD_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %fp.iv = phi float [ %init, %entry ], [ %add, %loop ]
+  %gep.A = getelementptr inbounds float, ptr %A, i64 %iv
+  store float %fp.iv, ptr %gep.A, align 4
+  %add = fadd fast float %fp.iv, %fpinc
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %N
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret float %add
+}
+
+define float @fp_postinc_use_fsub(float %init, ptr noalias nocapture %A, i64 %N, float %fpinc) {
+; VEC-LABEL: define float @fp_postinc_use_fsub(
+; VEC-SAME: float [[INIT:%.*]], ptr noalias nocapture [[A:%.*]], i64 [[N:%.*]], float [[FPINC:%.*]]) {
+; VEC-NEXT:  [[ENTRY:.*]]:
+; VEC-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; VEC-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VEC:       [[VECTOR_PH]]:
+; VEC-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; VEC-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VEC-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
+; VEC-NEXT:    [[TMP0:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]]
+; VEC-NEXT:    [[TMP1:%.*]] = fsub fast float [[INIT]], [[TMP0]]
+; VEC-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[INIT]], i64 0
+; VEC-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer
+; VEC-NEXT:    [[DOTSPLATINSERT1:%.*]] = insertelement <2 x float> poison, float [[FPINC]], i64 0
+; VEC-NEXT:    [[DOTSPLAT2:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT1]], <2 x float> poison, <2 x i32> zeroinitializer
+; VEC-NEXT:    [[TMP2:%.*]] = fmul fast <2 x float> , [[DOTSPLAT2]]
+; VEC-NEXT:    [[INDUCTION:%.*]] = fsub fast <2 x float> [[DOTSPLAT]], [[TMP2]]
+; VEC-NEXT:    [[TMP3:%.*]] = fmul fast float [[FPINC]], 2.000000e+00
+; VEC-NEXT:    [[DOTSPLATINSERT3:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i64 0
+; VEC-NEXT:    [[DOTSPLAT4:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT3]], <2 x float> poison, <2 x i32> zeroinitializer
+; VEC-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VEC:       [[VECTOR_BODY]]:
+; VEC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT:    [[VEC_IND:%.*]] = phi <2 x float> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; VEC-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]]
+; VEC-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
+; VEC-NEXT:    store <2 x float> [[VEC_IND]], ptr [[TMP6]], align 4
+; VEC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VEC-NEXT:    [[VEC_IND_NEXT]] = fsub fast <2 x float> [[VEC_IND]], [[DOTSPLAT4]]
+; VEC-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VEC-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[MIDDLE_BLOCK]]:
+; VEC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VEC-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VEC:       [[SCALAR_PH]]:
+; VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VEC-NEXT:    [[BC_RESUME_VAL5:%.*]] = phi float [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[INIT]], %[[ENTRY]] ]
+; VEC-NEXT:    br label %[[LOOP:.*]]
+; VEC:       [[LOOP]]:
+; VEC-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VEC-NEXT:    [[FP_IV:%.*]] = phi float [ [[BC_RESUME_VAL5]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ]
+; VEC-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; VEC-NEXT:    store float [[FP_IV]], ptr [[GEP_A]], align 4
+; VEC-NEXT:    [[ADD]] = fsub fast float [[FP_IV]], [[FPINC]]
+; VEC-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VEC-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VEC-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[EXIT]]:
+; VEC-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[LOOP]] ], [ [[TMP1]], %[[MIDDLE_BLOCK]] ]
+; VEC-NEXT:    ret float [[ADD_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %fp.iv = phi float [ %init, %entry ], [ %add, %loop ]
+  %gep.A = getelementptr inbounds float, ptr %A, i64 %iv
+  store float %fp.iv, ptr %gep.A, align 4
+  %add = fsub fast float %fp.iv, %fpinc
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %N
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret float %add
+}

From a77d119f76c45561f86528b27cf391c61d1ad69f Mon Sep 17 00:00:00 2001
From: Alex MacLean 
Date: Mon, 30 Dec 2024 09:51:31 -0800
Subject: [PATCH 182/567] [NVPTX] Remove redundant types from TableGen patterns
 (NFC) (#120986)

These types in the output dag of a Pat do not impact the generated
matcher code at all. Removing them makes for more concise and readable
code.
---
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td  | 610 +++++++++++------------
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 342 ++++++-------
 2 files changed, 476 insertions(+), 476 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 711cd67eceed9..c3e72d6ce3a3f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -733,12 +733,12 @@ def fpround_oneuse : PatFrag<(ops node:$a), (fpround node:$a), [{
 
 def : Pat<(v2bf16 (build_vector (bf16 (fpround_oneuse f32:$lo)),
                                 (bf16 (fpround_oneuse f32:$hi)))),
-          (CVT_bf16x2_f32 Float32Regs:$hi, Float32Regs:$lo, CvtRN)>,
+          (CVT_bf16x2_f32 $hi, $lo, CvtRN)>,
       Requires<[hasPTX<70>, hasSM<80>, hasBF16Math]>;
 
 def : Pat<(v2f16 (build_vector (f16 (fpround_oneuse f32:$lo)),
                                (f16 (fpround_oneuse f32:$hi)))),
-          (CVT_f16x2_f32 Float32Regs:$hi, Float32Regs:$lo, CvtRN)>,
+          (CVT_f16x2_f32 $hi, $lo, CvtRN)>,
       Requires<[hasPTX<70>, hasSM<80>, useFP16Math]>;
 
 //-----------------------------------
@@ -813,7 +813,7 @@ defm SELP_f64 : SELP_PATTERN<"f64", f64, Float64Regs, f64imm, fpimm>;
 
 foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
 def : Pat<(vt (select i1:$p, vt:$a, vt:$b)),
-          (SELP_b32rr Int32Regs:$a, Int32Regs:$b, Int1Regs:$p)>;
+          (SELP_b32rr $a, $b, $p)>;
 }
 
 //-----------------------------------
@@ -952,29 +952,29 @@ def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;
 
 // Matchers for signed, unsigned mul.wide ISD nodes.
 def : Pat<(i32 (mul_wide_signed i16:$a, i16:$b)),
-          (MULWIDES32 i16:$a, i16:$b)>,
+          (MULWIDES32 $a, $b)>,
       Requires<[doMulWide]>;
 def : Pat<(i32 (mul_wide_signed i16:$a, imm:$b)),
-          (MULWIDES32Imm Int16Regs:$a, imm:$b)>,
+          (MULWIDES32Imm $a, imm:$b)>,
       Requires<[doMulWide]>;
 def : Pat<(i32 (mul_wide_unsigned i16:$a, i16:$b)),
-          (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
+          (MULWIDEU32 $a, $b)>,
       Requires<[doMulWide]>;
 def : Pat<(i32 (mul_wide_unsigned i16:$a, imm:$b)),
-          (MULWIDEU32Imm Int16Regs:$a, imm:$b)>,
+          (MULWIDEU32Imm $a, imm:$b)>,
       Requires<[doMulWide]>;
 
 def : Pat<(i64 (mul_wide_signed i32:$a, i32:$b)),
-          (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
+          (MULWIDES64 $a, $b)>,
       Requires<[doMulWide]>;
 def : Pat<(i64 (mul_wide_signed i32:$a, imm:$b)),
-          (MULWIDES64Imm Int32Regs:$a, imm:$b)>,
+          (MULWIDES64Imm $a, imm:$b)>,
       Requires<[doMulWide]>;
 def : Pat<(i64 (mul_wide_unsigned i32:$a, i32:$b)),
-          (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
+          (MULWIDEU64 $a, $b)>,
       Requires<[doMulWide]>;
 def : Pat<(i64 (mul_wide_unsigned i32:$a, imm:$b)),
-          (MULWIDEU64Imm Int32Regs:$a, imm:$b)>,
+          (MULWIDEU64Imm $a, imm:$b)>,
       Requires<[doMulWide]>;
 
 // Predicates used for converting some patterns to mul.wide.
@@ -1024,46 +1024,46 @@ def SHL2MUL16 : SDNodeXForm,
+          (MULWIDES64Imm $a, (SHL2MUL32 $b))>,
       Requires<[doMulWide]>;
 def : Pat<(shl (zext i32:$a), (i32 IntConst_0_30:$b)),
-          (MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
+          (MULWIDEU64Imm $a, (SHL2MUL32 $b))>,
       Requires<[doMulWide]>;
 
 def : Pat<(shl (sext i16:$a), (i16 IntConst_0_14:$b)),
-          (MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
+          (MULWIDES32Imm $a, (SHL2MUL16 $b))>,
       Requires<[doMulWide]>;
 def : Pat<(shl (zext i16:$a), (i16 IntConst_0_14:$b)),
-          (MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
+          (MULWIDEU32Imm $a, (SHL2MUL16 $b))>,
       Requires<[doMulWide]>;
 
 // Convert "sign/zero-extend then multiply" to mul.wide.
 def : Pat<(mul (sext i32:$a), (sext i32:$b)),
-          (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
+          (MULWIDES64 $a, $b)>,
       Requires<[doMulWide]>;
 def : Pat<(mul (sext i32:$a), (i64 SInt32Const:$b)),
-          (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>,
+          (MULWIDES64Imm64 $a, (i64 SInt32Const:$b))>,
       Requires<[doMulWide]>;
 
 def : Pat<(mul (zext i32:$a), (zext i32:$b)),
-          (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
+          (MULWIDEU64 $a, $b)>,
       Requires<[doMulWide]>;
 def : Pat<(mul (zext i32:$a), (i64 UInt32Const:$b)),
-          (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>,
+          (MULWIDEU64Imm64 $a, (i64 UInt32Const:$b))>,
       Requires<[doMulWide]>;
 
 def : Pat<(mul (sext i16:$a), (sext i16:$b)),
-          (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
+          (MULWIDES32 $a, $b)>,
       Requires<[doMulWide]>;
 def : Pat<(mul (sext i16:$a), (i32 SInt16Const:$b)),
-          (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>,
+          (MULWIDES32Imm32 $a, (i32 SInt16Const:$b))>,
       Requires<[doMulWide]>;
 
 def : Pat<(mul (zext i16:$a), (zext i16:$b)),
-          (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
+          (MULWIDEU32 $a, $b)>,
       Requires<[doMulWide]>;
 def : Pat<(mul (zext i16:$a), (i32 UInt16Const:$b)),
-          (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>,
+          (MULWIDEU32Imm32 $a, (i32 UInt16Const:$b))>,
       Requires<[doMulWide]>;
 
 //
@@ -1242,7 +1242,7 @@ def FDIV64ri :
 // fdiv will be converted to rcp
 // fneg (fdiv 1.0, X) => fneg (rcp.rn X)
 def : Pat<(fdiv DoubleConstNeg1:$a, f64:$b),
-          (FNEGf64 (FDIV641r (NegDoubleConst node:$a), Float64Regs:$b))>;
+          (FNEGf64 (FDIV641r (NegDoubleConst node:$a), $b))>;
 
 //
 // F32 Approximate reciprocal
@@ -1436,83 +1436,83 @@ def COSF:  NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
 
 // frem - f32 FTZ
 def : Pat<(frem f32:$x, f32:$y),
-          (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32
-            (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRZI_FTZ),
-             Float32Regs:$y))>,
+          (FSUBf32rr_ftz $x, (FMULf32rr_ftz (CVT_f32_f32
+            (FDIV32rr_prec_ftz $x, $y), CvtRZI_FTZ),
+             $y))>,
           Requires<[doF32FTZ, allowUnsafeFPMath]>;
 def : Pat<(frem f32:$x, fpimm:$y),
-          (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32
-            (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRZI_FTZ),
+          (FSUBf32rr_ftz $x, (FMULf32ri_ftz (CVT_f32_f32
+            (FDIV32ri_prec_ftz $x, fpimm:$y), CvtRZI_FTZ),
              fpimm:$y))>,
           Requires<[doF32FTZ, allowUnsafeFPMath]>;
 
-def : Pat<(frem f32:$x, Float32Regs:$y),
-          (SELP_f32rr Float32Regs:$x,
-            (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32
-              (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRZI_FTZ),
-              Float32Regs:$y)),
-            (TESTINF_f32r Float32Regs:$y))>,
+def : Pat<(frem f32:$x, f32:$y),
+          (SELP_f32rr $x,
+            (FSUBf32rr_ftz $x, (FMULf32rr_ftz (CVT_f32_f32
+              (FDIV32rr_prec_ftz $x, $y), CvtRZI_FTZ),
+              $y)),
+            (TESTINF_f32r $y))>,
           Requires<[doF32FTZ, noUnsafeFPMath]>;
 def : Pat<(frem f32:$x, fpimm:$y),
-          (SELP_f32rr Float32Regs:$x,
-            (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32
-              (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRZI_FTZ),
+          (SELP_f32rr $x,
+            (FSUBf32rr_ftz $x, (FMULf32ri_ftz (CVT_f32_f32
+              (FDIV32ri_prec_ftz $x, fpimm:$y), CvtRZI_FTZ),
               fpimm:$y)),
             (TESTINF_f32i fpimm:$y))>,
           Requires<[doF32FTZ, noUnsafeFPMath]>;
 
 // frem - f32
 def : Pat<(frem f32:$x, f32:$y),
-          (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32
-            (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRZI),
-             Float32Regs:$y))>,
+          (FSUBf32rr $x, (FMULf32rr (CVT_f32_f32
+            (FDIV32rr_prec $x, $y), CvtRZI),
+             $y))>,
           Requires<[allowUnsafeFPMath]>;
 def : Pat<(frem f32:$x, fpimm:$y),
-          (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32
-            (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRZI),
+          (FSUBf32rr $x, (FMULf32ri (CVT_f32_f32
+            (FDIV32ri_prec $x, fpimm:$y), CvtRZI),
              fpimm:$y))>,
           Requires<[allowUnsafeFPMath]>;
 
 def : Pat<(frem f32:$x, f32:$y),
-          (SELP_f32rr Float32Regs:$x,
-            (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32
-              (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRZI),
-              Float32Regs:$y)),
+          (SELP_f32rr $x,
+            (FSUBf32rr $x, (FMULf32rr (CVT_f32_f32
+              (FDIV32rr_prec $x, $y), CvtRZI),
+              $y)),
             (TESTINF_f32r Float32Regs:$y))>,
           Requires<[noUnsafeFPMath]>;
 def : Pat<(frem f32:$x, fpimm:$y),
-          (SELP_f32rr Float32Regs:$x,
-            (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32
-              (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRZI),
+          (SELP_f32rr $x,
+            (FSUBf32rr $x, (FMULf32ri (CVT_f32_f32
+              (FDIV32ri_prec $x, fpimm:$y), CvtRZI),
               fpimm:$y)),
             (TESTINF_f32i fpimm:$y))>,
           Requires<[noUnsafeFPMath]>;
 
 // frem - f64
 def : Pat<(frem f64:$x, f64:$y),
-          (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64
-            (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRZI),
-             Float64Regs:$y))>,
+          (FSUBf64rr $x, (FMULf64rr (CVT_f64_f64
+            (FDIV64rr $x, $y), CvtRZI),
+             $y))>,
           Requires<[allowUnsafeFPMath]>;
 def : Pat<(frem f64:$x, fpimm:$y),
-          (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64
-            (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRZI),
+          (FSUBf64rr $x, (FMULf64ri (CVT_f64_f64
+            (FDIV64ri $x, fpimm:$y), CvtRZI),
              fpimm:$y))>,
           Requires<[allowUnsafeFPMath]>;
 
 def : Pat<(frem f64:$x, f64:$y),
-          (SELP_f64rr Float64Regs:$x,
-            (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64
-              (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRZI),
-               Float64Regs:$y)),
+          (SELP_f64rr $x,
+            (FSUBf64rr $x, (FMULf64rr (CVT_f64_f64
+              (FDIV64rr $x, $y), CvtRZI),
+               $y)),
             (TESTINF_f64r Float64Regs:$y))>,
           Requires<[noUnsafeFPMath]>;
 def : Pat<(frem f64:$x, fpimm:$y),
-          (SELP_f64rr Float64Regs:$x,
-            (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64
-              (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRZI),
+          (SELP_f64rr $x,
+            (FSUBf64rr $x, (FMULf64ri (CVT_f64_f64
+              (FDIV64ri $x, fpimm:$y), CvtRZI),
               fpimm:$y)),
-            (TESTINF_f64r Float64Regs:$y))>,
+            (TESTINF_f64r $y))>,
           Requires<[noUnsafeFPMath]>;
 
 //-----------------------------------
@@ -1561,32 +1561,32 @@ defm AND : BITWISE<"and", and>;
 defm XOR : BITWISE<"xor", xor>;
 
 // PTX does not support mul on predicates, convert to and instructions
-def : Pat<(mul i1:$a, i1:$b), (ANDb1rr Int1Regs:$a, Int1Regs:$b)>;
-def : Pat<(mul i1:$a, imm:$b), (ANDb1ri Int1Regs:$a, imm:$b)>;
+def : Pat<(mul i1:$a, i1:$b), (ANDb1rr $a, $b)>;
+def : Pat<(mul i1:$a, imm:$b), (ANDb1ri $a, imm:$b)>;
 
 // These transformations were once reliably performed by instcombine, but thanks
 // to poison semantics they are no longer safe for LLVM IR, perform them here
 // instead.
-def : Pat<(select i1:$a, i1:$b, 0), (ANDb1rr Int1Regs:$a, Int1Regs:$b)>;
-def : Pat<(select i1:$a, 1, i1:$b), (ORb1rr Int1Regs:$a, Int1Regs:$b)>;
+def : Pat<(select i1:$a, i1:$b, 0), (ANDb1rr $a, $b)>;
+def : Pat<(select i1:$a, 1, i1:$b), (ORb1rr $a, $b)>;
 
 // Lower logical v2i16/v4i8 ops as bitwise ops on b32.
 foreach vt = [v2i16, v4i8] in {
   def: Pat<(or vt:$a, vt:$b),
-           (ORb32rr Int32Regs:$a, Int32Regs:$b)>;
+           (ORb32rr $a, $b)>;
   def: Pat<(xor vt:$a, vt:$b),
-           (XORb32rr Int32Regs:$a, Int32Regs:$b)>;
+           (XORb32rr $a, $b)>;
   def: Pat<(and vt:$a, vt:$b),
-           (ANDb32rr Int32Regs:$a, Int32Regs:$b)>;
+           (ANDb32rr $a, $b)>;
 
   // The constants get legalized into a bitcast from i32, so that's what we need
   // to match here.
   def: Pat<(or vt:$a, (vt (bitconvert (i32 imm:$b)))),
-           (ORb32ri Int32Regs:$a, imm:$b)>;
+           (ORb32ri $a, imm:$b)>;
   def: Pat<(xor vt:$a, (vt (bitconvert (i32 imm:$b)))),
-           (XORb32ri Int32Regs:$a, imm:$b)>;
+           (XORb32ri $a, imm:$b)>;
   def: Pat<(and vt:$a, (vt (bitconvert (i32 imm:$b)))),
-           (ANDb32ri Int32Regs:$a, imm:$b)>;
+           (ANDb32ri $a, imm:$b)>;
 }
 
 def NOT1  : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src),
@@ -1770,34 +1770,34 @@ let hasSideEffects = false in {
 
 // byte extraction + signed/unsigned extension to i32.
 def : Pat<(i32 (sext_inreg (bfe i32:$s, i32:$o, 8), i8)),
-          (BFE_S32rri Int32Regs:$s, Int32Regs:$o, 8)>;
+          (BFE_S32rri $s, $o, 8)>;
 def : Pat<(i32 (sext_inreg (bfe i32:$s, imm:$o, 8), i8)),
-          (BFE_S32rii Int32Regs:$s, imm:$o, 8)>;
+          (BFE_S32rii $s, imm:$o, 8)>;
 def : Pat<(i32 (and (bfe i32:$s, i32:$o, 8), 255)),
-          (BFE_U32rri Int32Regs:$s, Int32Regs:$o, 8)>;
+          (BFE_U32rri $s, $o, 8)>;
 def : Pat<(i32 (and (bfe i32:$s, imm:$o, 8), 255)),
-          (BFE_U32rii Int32Regs:$s, imm:$o, 8)>;
+          (BFE_U32rii $s, imm:$o, 8)>;
 
 // byte extraction + signed extension to i16
 def : Pat<(i16 (sext_inreg (trunc (bfe i32:$s, imm:$o, 8)), i8)),
-          (CVT_s8_s32 (BFE_S32rii i32:$s, imm:$o, 8), CvtNONE)>;
+          (CVT_s8_s32 (BFE_S32rii $s, imm:$o, 8), CvtNONE)>;
 
 
 // Byte extraction via shift/trunc/sext
 def : Pat<(i16 (sext_inreg (trunc i32:$s), i8)),
-          (CVT_s8_s32 Int32Regs:$s, CvtNONE)>;
+          (CVT_s8_s32 $s, CvtNONE)>;
 def : Pat<(i16 (sext_inreg (trunc (srl i32:$s,  (i32 imm:$o))), i8)),
-          (CVT_s8_s32 (BFE_S32rii Int32Regs:$s, imm:$o, 8), CvtNONE)>;
+          (CVT_s8_s32 (BFE_S32rii $s, imm:$o, 8), CvtNONE)>;
 def : Pat<(sext_inreg (srl i32:$s,  (i32 imm:$o)), i8),
-          (BFE_S32rii Int32Regs:$s, imm:$o, 8)>;
+          (BFE_S32rii $s, imm:$o, 8)>;
 def : Pat<(i16 (sra (i16 (trunc i32:$s)), (i32 8))),
-          (CVT_s8_s32 (BFE_S32rii Int32Regs:$s, 8, 8), CvtNONE)>;
+          (CVT_s8_s32 (BFE_S32rii $s, 8, 8), CvtNONE)>;
 def : Pat<(sext_inreg (srl i64:$s,  (i32 imm:$o)), i8),
-          (BFE_S64rii Int64Regs:$s, imm:$o, 8)>;
+          (BFE_S64rii $s, imm:$o, 8)>;
 def : Pat<(i16 (sext_inreg (trunc i64:$s), i8)),
-          (CVT_s8_s64 Int64Regs:$s, CvtNONE)>;
+          (CVT_s8_s64 $s, CvtNONE)>;
 def : Pat<(i16 (sext_inreg (trunc (srl i64:$s,  (i32 imm:$o))), i8)),
-          (CVT_s8_s64 (BFE_S64rii Int64Regs:$s, imm:$o, 8), CvtNONE)>;
+          (CVT_s8_s64 (BFE_S64rii $s, imm:$o, 8), CvtNONE)>;
 
 //-----------------------------------
 // Comparison instructions (setp, set)
@@ -2032,47 +2032,47 @@ multiclass ISET_FORMAT {
   // i16 -> pred
   def : Pat<(i1 (OpNode i16:$a, i16:$b)),
-            (setp_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
+            (setp_16rr $a, $b, Mode)>;
   def : Pat<(i1 (OpNode i16:$a, imm:$b)),
-            (setp_16ri Int16Regs:$a, imm:$b, Mode)>;
+            (setp_16ri $a, imm:$b, Mode)>;
   def : Pat<(i1 (OpNode imm:$a, i16:$b)),
-            (setp_16ir imm:$a, Int16Regs:$b, Mode)>;
+            (setp_16ir imm:$a, $b, Mode)>;
   // i32 -> pred
   def : Pat<(i1 (OpNode i32:$a, i32:$b)),
-            (setp_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
+            (setp_32rr $a, $b, Mode)>;
   def : Pat<(i1 (OpNode i32:$a, imm:$b)),
-            (setp_32ri Int32Regs:$a, imm:$b, Mode)>;
+            (setp_32ri $a, imm:$b, Mode)>;
   def : Pat<(i1 (OpNode imm:$a, i32:$b)),
-            (setp_32ir imm:$a, Int32Regs:$b, Mode)>;
+            (setp_32ir imm:$a, $b, Mode)>;
   // i64 -> pred
   def : Pat<(i1 (OpNode i64:$a, i64:$b)),
-            (setp_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
+            (setp_64rr $a, $b, Mode)>;
   def : Pat<(i1 (OpNode i64:$a, imm:$b)),
-            (setp_64ri Int64Regs:$a, imm:$b, Mode)>;
+            (setp_64ri $a, imm:$b, Mode)>;
   def : Pat<(i1 (OpNode imm:$a, i64:$b)),
-            (setp_64ir imm:$a, Int64Regs:$b, Mode)>;
+            (setp_64ir imm:$a, $b, Mode)>;
 
   // i16 -> i32
   def : Pat<(i32 (OpNode i16:$a, i16:$b)),
-            (set_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
+            (set_16rr $a, $b, Mode)>;
   def : Pat<(i32 (OpNode i16:$a, imm:$b)),
-            (set_16ri Int16Regs:$a, imm:$b, Mode)>;
+            (set_16ri $a, imm:$b, Mode)>;
   def : Pat<(i32 (OpNode imm:$a, i16:$b)),
-            (set_16ir imm:$a, Int16Regs:$b, Mode)>;
+            (set_16ir imm:$a, $b, Mode)>;
   // i32 -> i32
   def : Pat<(i32 (OpNode i32:$a, i32:$b)),
-            (set_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
+            (set_32rr $a, $b, Mode)>;
   def : Pat<(i32 (OpNode i32:$a, imm:$b)),
-            (set_32ri Int32Regs:$a, imm:$b, Mode)>;
+            (set_32ri $a, imm:$b, Mode)>;
   def : Pat<(i32 (OpNode imm:$a, i32:$b)),
-            (set_32ir imm:$a, Int32Regs:$b, Mode)>;
+            (set_32ir imm:$a, $b, Mode)>;
   // i64 -> i32
   def : Pat<(i32 (OpNode i64:$a, Int64Regs:$b)),
-            (set_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
+            (set_64rr $a, $b, Mode)>;
   def : Pat<(i32 (OpNode i64:$a, imm:$b)),
-            (set_64ri Int64Regs:$a, imm:$b, Mode)>;
+            (set_64ri $a, imm:$b, Mode)>;
   def : Pat<(i32 (OpNode imm:$a, i64:$b)),
-            (set_64ir imm:$a, Int64Regs:$b, Mode)>;
+            (set_64ir imm:$a, $b, Mode)>;
 }
 
 multiclass ISET_FORMAT_SIGNED
@@ -2179,94 +2179,94 @@ def: Pat<(setne (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)),
 
 // i1 compare -> i32
 def : Pat<(i32 (setne i1:$a, i1:$b)),
-          (SELP_u32ii -1, 0, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
+          (SELP_u32ii -1, 0, (XORb1rr $a, $b))>;
 def : Pat<(i32 (setne i1:$a, i1:$b)),
-          (SELP_u32ii 0, -1, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
+          (SELP_u32ii 0, -1, (XORb1rr $a, $b))>;
 
 
 
 multiclass FSET_FORMAT {
   // f16 -> pred
   def : Pat<(i1 (OpNode f16:$a, f16:$b)),
-            (SETP_f16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
+            (SETP_f16rr $a, $b, ModeFTZ)>,
         Requires<[useFP16Math,doF32FTZ]>;
   def : Pat<(i1 (OpNode f16:$a, f16:$b)),
-            (SETP_f16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
+            (SETP_f16rr $a, $b, Mode)>,
         Requires<[useFP16Math]>;
 
   // bf16 -> pred
   def : Pat<(i1 (OpNode bf16:$a, bf16:$b)),
-            (SETP_bf16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
+            (SETP_bf16rr $a, $b, ModeFTZ)>,
         Requires<[hasBF16Math,doF32FTZ]>;
   def : Pat<(i1 (OpNode bf16:$a, bf16:$b)),
-            (SETP_bf16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
+            (SETP_bf16rr $a, $b, Mode)>,
         Requires<[hasBF16Math]>;
 
   // f32 -> pred
   def : Pat<(i1 (OpNode f32:$a, f32:$b)),
-            (SETP_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
+            (SETP_f32rr $a, $b, ModeFTZ)>,
         Requires<[doF32FTZ]>;
   def : Pat<(i1 (OpNode f32:$a, f32:$b)),
-            (SETP_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
+            (SETP_f32rr $a, $b, Mode)>;
   def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
-            (SETP_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
+            (SETP_f32ri $a, fpimm:$b, ModeFTZ)>,
         Requires<[doF32FTZ]>;
   def : Pat<(i1 (OpNode f32:$a, fpimm:$b)),
-            (SETP_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
+            (SETP_f32ri $a, fpimm:$b, Mode)>;
   def : Pat<(i1 (OpNode fpimm:$a, f32:$b)),
-            (SETP_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
+            (SETP_f32ir fpimm:$a, $b, ModeFTZ)>,
         Requires<[doF32FTZ]>;
   def : Pat<(i1 (OpNode fpimm:$a, f32:$b)),
-            (SETP_f32ir fpimm:$a, Float32Regs:$b, Mode)>;
+            (SETP_f32ir fpimm:$a, $b, Mode)>;
 
   // f64 -> pred
   def : Pat<(i1 (OpNode f64:$a, f64:$b)),
-            (SETP_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
+            (SETP_f64rr $a, $b, Mode)>;
   def : Pat<(i1 (OpNode f64:$a, fpimm:$b)),
-            (SETP_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
+            (SETP_f64ri $a, fpimm:$b, Mode)>;
   def : Pat<(i1 (OpNode fpimm:$a, f64:$b)),
-            (SETP_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
+            (SETP_f64ir fpimm:$a, $b, Mode)>;
 
   // f16 -> i32
   def : Pat<(i32 (OpNode f16:$a, f16:$b)),
-            (SET_f16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
+            (SET_f16rr $a, $b, ModeFTZ)>,
         Requires<[useFP16Math, doF32FTZ]>;
   def : Pat<(i32 (OpNode f16:$a, f16:$b)),
-            (SET_f16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
+            (SET_f16rr $a, $b, Mode)>,
         Requires<[useFP16Math]>;
 
   // bf16 -> i32
   def : Pat<(i32 (OpNode bf16:$a, bf16:$b)),
-            (SET_bf16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
+            (SET_bf16rr $a, $b, ModeFTZ)>,
         Requires<[hasBF16Math, doF32FTZ]>;
   def : Pat<(i32 (OpNode bf16:$a, bf16:$b)),
-            (SET_bf16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
+            (SET_bf16rr $a, $b, Mode)>,
         Requires<[hasBF16Math]>;
 
   // f32 -> i32
   def : Pat<(i32 (OpNode f32:$a, f32:$b)),
-            (SET_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
+            (SET_f32rr $a, $b, ModeFTZ)>,
         Requires<[doF32FTZ]>;
   def : Pat<(i32 (OpNode f32:$a, f32:$b)),
-            (SET_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
+            (SET_f32rr $a, $b, Mode)>;
   def : Pat<(i32 (OpNode f32:$a, fpimm:$b)),
-            (SET_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
+            (SET_f32ri $a, fpimm:$b, ModeFTZ)>,
         Requires<[doF32FTZ]>;
   def : Pat<(i32 (OpNode f32:$a, fpimm:$b)),
-            (SET_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
+            (SET_f32ri $a, fpimm:$b, Mode)>;
   def : Pat<(i32 (OpNode fpimm:$a, f32:$b)),
-            (SET_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
+            (SET_f32ir fpimm:$a, $b, ModeFTZ)>,
         Requires<[doF32FTZ]>;
   def : Pat<(i32 (OpNode fpimm:$a, f32:$b)),
-            (SET_f32ir fpimm:$a, Float32Regs:$b, Mode)>;
+            (SET_f32ir fpimm:$a, $b, Mode)>;
 
   // f64 -> i32
   def : Pat<(i32 (OpNode f64:$a, f64:$b)),
-            (SET_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
+            (SET_f64rr $a, $b, Mode)>;
   def : Pat<(i32 (OpNode f64:$a, fpimm:$b)),
-            (SET_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
+            (SET_f64ri $a, fpimm:$b, Mode)>;
   def : Pat<(i32 (OpNode fpimm:$a, f64:$b)),
-            (SET_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
+            (SET_f64ir fpimm:$a, $b, Mode)>;
 }
 
 defm FSetOGT : FSET_FORMAT;
@@ -2722,11 +2722,11 @@ def ProxyRegF32   : ProxyRegInst<"f32",  f32, Float32Regs>;
 def ProxyRegF64   : ProxyRegInst<"f64",  f64, Float64Regs>;
 
 foreach vt = [f16, bf16] in {
-  def: Pat<(vt (ProxyReg  vt:$src)), (ProxyRegI16 Int16Regs:$src)>;
+  def: Pat<(vt (ProxyReg  vt:$src)), (ProxyRegI16 $src)>;
 }
 
 foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
-  def: Pat<(vt (ProxyReg  vt:$src)), (ProxyRegI32 Int32Regs:$src)>;
+  def: Pat<(vt (ProxyReg  vt:$src)), (ProxyRegI32 $src)>;
 }
 
 //
@@ -3029,9 +3029,9 @@ def BITCONVERT_64_F2I : F_BITCONVERT<"64", f64, i64>;
 
 foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
 def: Pat<(vt (bitconvert (f32 Float32Regs:$a))),
-         (BITCONVERT_32_F2I Float32Regs:$a)>;
+         (BITCONVERT_32_F2I $a)>;
 def: Pat<(f32 (bitconvert vt:$a)),
-         (BITCONVERT_32_I2F Int32Regs:$a)>;
+         (BITCONVERT_32_I2F $a)>;
 }
 foreach vt = [f16, bf16] in {
   def: Pat<(vt (bitconvert i16:$a)),
@@ -3056,280 +3056,280 @@ foreach ta = [v2f16, v2bf16, v2i16, v4i8, i32] in {
 
 // sint -> f16
 def : Pat<(f16 (sint_to_fp i1:$a)),
-          (CVT_f16_s32 (SELP_s32ii -1, 0, Int1Regs:$a), CvtRN)>;
+          (CVT_f16_s32 (SELP_s32ii -1, 0, $a), CvtRN)>;
 def : Pat<(f16 (sint_to_fp Int16Regs:$a)),
-          (CVT_f16_s16 i16:$a, CvtRN)>;
+          (CVT_f16_s16 $a, CvtRN)>;
 def : Pat<(f16 (sint_to_fp i32:$a)),
-          (CVT_f16_s32 i32:$a, CvtRN)>;
+          (CVT_f16_s32 $a, CvtRN)>;
 def : Pat<(f16 (sint_to_fp i64:$a)),
-          (CVT_f16_s64 i64:$a, CvtRN)>;
+          (CVT_f16_s64 $a, CvtRN)>;
 
 // uint -> f16
 def : Pat<(f16 (uint_to_fp i1:$a)),
-          (CVT_f16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+          (CVT_f16_u32 (SELP_u32ii 1, 0, $a), CvtRN)>;
 def : Pat<(f16 (uint_to_fp Int16Regs:$a)),
-          (CVT_f16_u16 i16:$a, CvtRN)>;
+          (CVT_f16_u16 $a, CvtRN)>;
 def : Pat<(f16 (uint_to_fp i32:$a)),
-          (CVT_f16_u32 i32:$a, CvtRN)>;
+          (CVT_f16_u32 $a, CvtRN)>;
 def : Pat<(f16 (uint_to_fp i64:$a)),
-          (CVT_f16_u64 i64:$a, CvtRN)>;
+          (CVT_f16_u64 $a, CvtRN)>;
 
 // sint -> bf16
 def : Pat<(bf16 (sint_to_fp i1:$a)),
-          (CVT_bf16_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
+          (CVT_bf16_s32 (SELP_u32ii 1, 0, $a), CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
 def : Pat<(bf16 (sint_to_fp i16:$a)),
-          (CVT_bf16_s16 i16:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
+          (CVT_bf16_s16 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
 def : Pat<(bf16 (sint_to_fp i32:$a)),
-          (CVT_bf16_s32 i32:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
+          (CVT_bf16_s32 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
 def : Pat<(bf16 (sint_to_fp i64:$a)),
-          (CVT_bf16_s64 i64:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
+          (CVT_bf16_s64 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
 
 // uint -> bf16
 def : Pat<(bf16 (uint_to_fp i1:$a)),
-          (CVT_bf16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
+          (CVT_bf16_u32 (SELP_u32ii 1, 0, $a), CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
 def : Pat<(bf16 (uint_to_fp i16:$a)),
-          (CVT_bf16_u16 i16:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
+          (CVT_bf16_u16 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
 def : Pat<(bf16 (uint_to_fp i32:$a)),
-          (CVT_bf16_u32 i32:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
+          (CVT_bf16_u32 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
 def : Pat<(bf16 (uint_to_fp i64:$a)),
-          (CVT_bf16_u64 i64:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
+          (CVT_bf16_u64 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
 
 // sint -> f32
 def : Pat<(f32 (sint_to_fp i1:$a)),
-          (CVT_f32_s32 (SELP_s32ii -1, 0, Int1Regs:$a), CvtRN)>;
+          (CVT_f32_s32 (SELP_s32ii -1, 0, $a), CvtRN)>;
 def : Pat<(f32 (sint_to_fp i16:$a)),
-          (CVT_f32_s16 i16:$a, CvtRN)>;
+          (CVT_f32_s16 $a, CvtRN)>;
 def : Pat<(f32 (sint_to_fp i32:$a)),
-          (CVT_f32_s32 i32:$a, CvtRN)>;
+          (CVT_f32_s32 $a, CvtRN)>;
 def : Pat<(f32 (sint_to_fp i64:$a)),
-          (CVT_f32_s64 i64:$a, CvtRN)>;
+          (CVT_f32_s64 $a, CvtRN)>;
 
 // uint -> f32
 def : Pat<(f32 (uint_to_fp i1:$a)),
-          (CVT_f32_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+          (CVT_f32_u32 (SELP_u32ii 1, 0, $a), CvtRN)>;
 def : Pat<(f32 (uint_to_fp i16:$a)),
-          (CVT_f32_u16 Int16Regs:$a, CvtRN)>;
+          (CVT_f32_u16 $a, CvtRN)>;
 def : Pat<(f32 (uint_to_fp i32:$a)),
-          (CVT_f32_u32 i32:$a, CvtRN)>;
+          (CVT_f32_u32 $a, CvtRN)>;
 def : Pat<(f32 (uint_to_fp i64:$a)),
-          (CVT_f32_u64 i64:$a, CvtRN)>;
+          (CVT_f32_u64 $a, CvtRN)>;
 
 // sint -> f64
 def : Pat<(f64 (sint_to_fp i1:$a)),
-          (CVT_f64_s32 (SELP_s32ii -1, 0, Int1Regs:$a), CvtRN)>;
+          (CVT_f64_s32 (SELP_s32ii -1, 0, $a), CvtRN)>;
 def : Pat<(f64 (sint_to_fp i16:$a)),
-          (CVT_f64_s16 Int16Regs:$a, CvtRN)>;
+          (CVT_f64_s16 $a, CvtRN)>;
 def : Pat<(f64 (sint_to_fp i32:$a)),
-          (CVT_f64_s32 i32:$a, CvtRN)>;
+          (CVT_f64_s32 $a, CvtRN)>;
 def : Pat<(f64 (sint_to_fp i64:$a)),
-          (CVT_f64_s64 i64:$a, CvtRN)>;
+          (CVT_f64_s64 $a, CvtRN)>;
 
 // uint -> f64
 def : Pat<(f64 (uint_to_fp i1:$a)),
-          (CVT_f64_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+          (CVT_f64_u32 (SELP_u32ii 1, 0, $a), CvtRN)>;
 def : Pat<(f64 (uint_to_fp i16:$a)),
-          (CVT_f64_u16 Int16Regs:$a, CvtRN)>;
+          (CVT_f64_u16 $a, CvtRN)>;
 def : Pat<(f64 (uint_to_fp i32:$a)),
-          (CVT_f64_u32 i32:$a, CvtRN)>;
+          (CVT_f64_u32 $a, CvtRN)>;
 def : Pat<(f64 (uint_to_fp i64:$a)),
-          (CVT_f64_u64 i64:$a, CvtRN)>;
+          (CVT_f64_u64 $a, CvtRN)>;
 
 
 // f16 -> sint
 def : Pat<(i1 (fp_to_sint f16:$a)),
-          (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
+          (SETP_b16ri $a, 0, CmpEQ)>;
 def : Pat<(i16 (fp_to_sint f16:$a)),
-          (CVT_s16_f16 Int16Regs:$a, CvtRZI)>;
+          (CVT_s16_f16 $a, CvtRZI)>;
 def : Pat<(i32 (fp_to_sint f16:$a)),
-          (CVT_s32_f16 Int16Regs:$a, CvtRZI)>;
+          (CVT_s32_f16 $a, CvtRZI)>;
 def : Pat<(i64 (fp_to_sint f16:$a)),
-          (CVT_s64_f16 Int16Regs:$a, CvtRZI)>;
+          (CVT_s64_f16 $a, CvtRZI)>;
 
 // f16 -> uint
 def : Pat<(i1 (fp_to_uint f16:$a)),
-          (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
+          (SETP_b16ri $a, 0, CmpEQ)>;
 def : Pat<(i16 (fp_to_uint f16:$a)),
-          (CVT_u16_f16 Int16Regs:$a, CvtRZI)>;
+          (CVT_u16_f16 $a, CvtRZI)>;
 def : Pat<(i32 (fp_to_uint f16:$a)),
-          (CVT_u32_f16 Int16Regs:$a, CvtRZI)>;
+          (CVT_u32_f16 $a, CvtRZI)>;
 def : Pat<(i64 (fp_to_uint f16:$a)),
-          (CVT_u64_f16 Int16Regs:$a, CvtRZI)>;
+          (CVT_u64_f16 $a, CvtRZI)>;
 
 // bf16 -> sint
 def : Pat<(i1 (fp_to_sint bf16:$a)),
-          (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
+          (SETP_b16ri $a, 0, CmpEQ)>;
 def : Pat<(i16 (fp_to_sint bf16:$a)),
-          (CVT_s16_bf16 Int16Regs:$a, CvtRZI)>;
+          (CVT_s16_bf16 $a, CvtRZI)>;
 def : Pat<(i32 (fp_to_sint bf16:$a)),
-          (CVT_s32_bf16 Int16Regs:$a, CvtRZI)>;
+          (CVT_s32_bf16 $a, CvtRZI)>;
 def : Pat<(i64 (fp_to_sint bf16:$a)),
-          (CVT_s64_bf16 Int16Regs:$a, CvtRZI)>;
+          (CVT_s64_bf16 $a, CvtRZI)>;
 
 // bf16 -> uint
 def : Pat<(i1 (fp_to_uint bf16:$a)),
-          (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
+          (SETP_b16ri $a, 0, CmpEQ)>;
 def : Pat<(i16 (fp_to_uint bf16:$a)),
-          (CVT_u16_bf16 Int16Regs:$a, CvtRZI)>;
+          (CVT_u16_bf16 $a, CvtRZI)>;
 def : Pat<(i32 (fp_to_uint bf16:$a)),
-          (CVT_u32_bf16 Int16Regs:$a, CvtRZI)>;
+          (CVT_u32_bf16 $a, CvtRZI)>;
 def : Pat<(i64 (fp_to_uint bf16:$a)),
-          (CVT_u64_bf16 Int16Regs:$a, CvtRZI)>;
+          (CVT_u64_bf16 $a, CvtRZI)>;
 // f32 -> sint
 def : Pat<(i1 (fp_to_sint f32:$a)),
-          (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
+          (SETP_b32ri (BITCONVERT_32_F2I $a), 0, CmpEQ)>;
 def : Pat<(i16 (fp_to_sint f32:$a)),
-          (CVT_s16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+          (CVT_s16_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
 def : Pat<(i16 (fp_to_sint f32:$a)),
-          (CVT_s16_f32 Float32Regs:$a, CvtRZI)>;
+          (CVT_s16_f32 $a, CvtRZI)>;
 def : Pat<(i32 (fp_to_sint f32:$a)),
-          (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+          (CVT_s32_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
 def : Pat<(i32 (fp_to_sint f32:$a)),
-          (CVT_s32_f32 Float32Regs:$a, CvtRZI)>;
+          (CVT_s32_f32 $a, CvtRZI)>;
 def : Pat<(i64 (fp_to_sint f32:$a)),
-          (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+          (CVT_s64_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
 def : Pat<(i64 (fp_to_sint f32:$a)),
-          (CVT_s64_f32 Float32Regs:$a, CvtRZI)>;
+          (CVT_s64_f32 $a, CvtRZI)>;
 
 // f32 -> uint
 def : Pat<(i1 (fp_to_uint f32:$a)),
-          (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
+          (SETP_b32ri (BITCONVERT_32_F2I $a), 0, CmpEQ)>;
 def : Pat<(i16 (fp_to_uint f32:$a)),
-          (CVT_u16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+          (CVT_u16_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
 def : Pat<(i16 (fp_to_uint f32:$a)),
-          (CVT_u16_f32 Float32Regs:$a, CvtRZI)>;
+          (CVT_u16_f32 $a, CvtRZI)>;
 def : Pat<(i32 (fp_to_uint f32:$a)),
-          (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+          (CVT_u32_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
 def : Pat<(i32 (fp_to_uint f32:$a)),
-          (CVT_u32_f32 Float32Regs:$a, CvtRZI)>;
+          (CVT_u32_f32 $a, CvtRZI)>;
 def : Pat<(i64 (fp_to_uint f32:$a)),
-          (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+          (CVT_u64_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
 def : Pat<(i64 (fp_to_uint f32:$a)),
-          (CVT_u64_f32 Float32Regs:$a, CvtRZI)>;
+          (CVT_u64_f32 $a, CvtRZI)>;
 
 // f64 -> sint
 def : Pat<(i1 (fp_to_sint f64:$a)),
-          (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
+          (SETP_b64ri (BITCONVERT_64_F2I $a), 0, CmpEQ)>;
 def : Pat<(i16 (fp_to_sint f64:$a)),
-          (CVT_s16_f64 Float64Regs:$a, CvtRZI)>;
+          (CVT_s16_f64 $a, CvtRZI)>;
 def : Pat<(i32 (fp_to_sint f64:$a)),
-          (CVT_s32_f64 Float64Regs:$a, CvtRZI)>;
+          (CVT_s32_f64 $a, CvtRZI)>;
 def : Pat<(i64 (fp_to_sint f64:$a)),
-          (CVT_s64_f64 Float64Regs:$a, CvtRZI)>;
+          (CVT_s64_f64 $a, CvtRZI)>;
 
 // f64 -> uint
 def : Pat<(i1 (fp_to_uint f64:$a)),
-          (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
+          (SETP_b64ri (BITCONVERT_64_F2I $a), 0, CmpEQ)>;
 def : Pat<(i16 (fp_to_uint f64:$a)),
-          (CVT_u16_f64 Float64Regs:$a, CvtRZI)>;
+          (CVT_u16_f64 $a, CvtRZI)>;
 def : Pat<(i32 (fp_to_uint f64:$a)),
-          (CVT_u32_f64 Float64Regs:$a, CvtRZI)>;
+          (CVT_u32_f64 $a, CvtRZI)>;
 def : Pat<(i64 (fp_to_uint f64:$a)),
-          (CVT_u64_f64 Float64Regs:$a, CvtRZI)>;
+          (CVT_u64_f64 $a, CvtRZI)>;
 
 // sext i1
 def : Pat<(i16 (sext i1:$a)),
-          (SELP_s16ii -1, 0, Int1Regs:$a)>;
+          (SELP_s16ii -1, 0, $a)>;
 def : Pat<(i32 (sext i1:$a)),
-          (SELP_s32ii -1, 0, Int1Regs:$a)>;
+          (SELP_s32ii -1, 0, $a)>;
 def : Pat<(i64 (sext i1:$a)),
-          (SELP_s64ii -1, 0, Int1Regs:$a)>;
+          (SELP_s64ii -1, 0, $a)>;
 
 // zext i1
 def : Pat<(i16 (zext i1:$a)),
-          (SELP_u16ii 1, 0, Int1Regs:$a)>;
+          (SELP_u16ii 1, 0, $a)>;
 def : Pat<(i32 (zext i1:$a)),
-          (SELP_u32ii 1, 0, Int1Regs:$a)>;
+          (SELP_u32ii 1, 0, $a)>;
 def : Pat<(i64 (zext i1:$a)),
-          (SELP_u64ii 1, 0, Int1Regs:$a)>;
+          (SELP_u64ii 1, 0, $a)>;
 
 // anyext i1
 def : Pat<(i16 (anyext i1:$a)),
-          (SELP_u16ii -1, 0, Int1Regs:$a)>;
+          (SELP_u16ii -1, 0, $a)>;
 def : Pat<(i32 (anyext i1:$a)),
-          (SELP_u32ii -1, 0, Int1Regs:$a)>;
+          (SELP_u32ii -1, 0, $a)>;
 def : Pat<(i64 (anyext i1:$a)),
-          (SELP_u64ii -1, 0, Int1Regs:$a)>;
+          (SELP_u64ii -1, 0, $a)>;
 
 // sext i16
 def : Pat<(i32 (sext i16:$a)),
-          (CVT_s32_s16 Int16Regs:$a, CvtNONE)>;
+          (CVT_s32_s16 $a, CvtNONE)>;
 def : Pat<(i64 (sext i16:$a)),
-          (CVT_s64_s16 Int16Regs:$a, CvtNONE)>;
+          (CVT_s64_s16 $a, CvtNONE)>;
 
 // zext i16
 def : Pat<(i32 (zext i16:$a)),
-          (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
+          (CVT_u32_u16 $a, CvtNONE)>;
 def : Pat<(i64 (zext i16:$a)),
-          (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
+          (CVT_u64_u16 $a, CvtNONE)>;
 
 // anyext i16
 def : Pat<(i32 (anyext i16:$a)),
-          (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
+          (CVT_u32_u16 $a, CvtNONE)>;
 def : Pat<(i64 (anyext i16:$a)),
-          (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
+          (CVT_u64_u16 $a, CvtNONE)>;
 
 // sext i32
 def : Pat<(i64 (sext i32:$a)),
-          (CVT_s64_s32 Int32Regs:$a, CvtNONE)>;
+          (CVT_s64_s32 $a, CvtNONE)>;
 
 // zext i32
 def : Pat<(i64 (zext i32:$a)),
-          (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
+          (CVT_u64_u32 $a, CvtNONE)>;
 
 // anyext i32
 def : Pat<(i64 (anyext i32:$a)),
-          (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
+          (CVT_u64_u32 $a, CvtNONE)>;
 
 
 // truncate i64
 def : Pat<(i32 (trunc i64:$a)),
-          (CVT_u32_u64 Int64Regs:$a, CvtNONE)>;
+          (CVT_u32_u64 $a, CvtNONE)>;
 def : Pat<(i16 (trunc i64:$a)),
-          (CVT_u16_u64 Int64Regs:$a, CvtNONE)>;
+          (CVT_u16_u64 $a, CvtNONE)>;
 def : Pat<(i1 (trunc i64:$a)),
-          (SETP_b64ri (ANDb64ri Int64Regs:$a, 1), 1, CmpEQ)>;
+          (SETP_b64ri (ANDb64ri $a, 1), 1, CmpEQ)>;
 
 // truncate i32
 def : Pat<(i16 (trunc i32:$a)),
-          (CVT_u16_u32 Int32Regs:$a, CvtNONE)>;
+          (CVT_u16_u32 $a, CvtNONE)>;
 def : Pat<(i1 (trunc i32:$a)),
-          (SETP_b32ri (ANDb32ri Int32Regs:$a, 1), 1, CmpEQ)>;
+          (SETP_b32ri (ANDb32ri $a, 1), 1, CmpEQ)>;
 
 // truncate i16
 def : Pat<(i1 (trunc i16:$a)),
-          (SETP_b16ri (ANDb16ri Int16Regs:$a, 1), 1, CmpEQ)>;
+          (SETP_b16ri (ANDb16ri $a, 1), 1, CmpEQ)>;
 
 // sext_inreg
-def : Pat<(sext_inreg i16:$a, i8), (CVT_INREG_s16_s8 Int16Regs:$a)>;
-def : Pat<(sext_inreg i32:$a, i8), (CVT_INREG_s32_s8 Int32Regs:$a)>;
-def : Pat<(sext_inreg i32:$a, i16), (CVT_INREG_s32_s16 Int32Regs:$a)>;
-def : Pat<(sext_inreg i64:$a, i8), (CVT_INREG_s64_s8 Int64Regs:$a)>;
-def : Pat<(sext_inreg i64:$a, i16), (CVT_INREG_s64_s16 Int64Regs:$a)>;
-def : Pat<(sext_inreg i64:$a, i32), (CVT_INREG_s64_s32 Int64Regs:$a)>;
+def : Pat<(sext_inreg i16:$a, i8), (CVT_INREG_s16_s8 $a)>;
+def : Pat<(sext_inreg i32:$a, i8), (CVT_INREG_s32_s8 $a)>;
+def : Pat<(sext_inreg i32:$a, i16), (CVT_INREG_s32_s16 $a)>;
+def : Pat<(sext_inreg i64:$a, i8), (CVT_INREG_s64_s8 $a)>;
+def : Pat<(sext_inreg i64:$a, i16), (CVT_INREG_s64_s16 $a)>;
+def : Pat<(sext_inreg i64:$a, i32), (CVT_INREG_s64_s32 $a)>;
 
 
 // Select instructions with 32-bit predicates
 def : Pat<(select i32:$pred, i16:$a, i16:$b),
-          (SELP_b16rr Int16Regs:$a, Int16Regs:$b,
-          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+          (SELP_b16rr $a, $b,
+          (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>;
 def : Pat<(select i32:$pred, i32:$a, i32:$b),
-          (SELP_b32rr Int32Regs:$a, Int32Regs:$b,
-          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+          (SELP_b32rr $a, $b,
+          (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>;
 def : Pat<(select i32:$pred, i64:$a, i64:$b),
-          (SELP_b64rr Int64Regs:$a, Int64Regs:$b,
-          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+          (SELP_b64rr $a, $b,
+          (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>;
 def : Pat<(select i32:$pred, f16:$a, f16:$b),
-          (SELP_f16rr Int16Regs:$a, Int16Regs:$b,
-          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+          (SELP_f16rr $a, $b,
+          (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>;
 def : Pat<(select i32:$pred, bf16:$a, bf16:$b),
-          (SELP_bf16rr Int16Regs:$a, Int16Regs:$b,
-          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+          (SELP_bf16rr $a, $b,
+          (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>;
 def : Pat<(select i32:$pred, f32:$a, f32:$b),
-          (SELP_f32rr Float32Regs:$a, Float32Regs:$b,
-          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+          (SELP_f32rr $a, $b,
+          (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>;
 def : Pat<(select i32:$pred, f64:$a, f64:$b),
-          (SELP_f64rr Float64Regs:$a, Float64Regs:$b,
-          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+          (SELP_f64rr $a, $b,
+          (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>;
 
 
 let hasSideEffects = false in {
@@ -3391,32 +3391,32 @@ let hasSideEffects = false in {
 // Using partial vectorized move produces better SASS code for extraction of
 // upper/lower parts of an integer.
 def : Pat<(i16 (trunc (srl i32:$s, (i32 16)))),
-          (I32toI16H Int32Regs:$s)>;
+          (I32toI16H $s)>;
 def : Pat<(i16 (trunc (sra i32:$s, (i32 16)))),
-          (I32toI16H Int32Regs:$s)>;
+          (I32toI16H $s)>;
 def : Pat<(i32 (trunc (srl i64:$s, (i32 32)))),
-          (I64toI32H Int64Regs:$s)>;
+          (I64toI32H $s)>;
 def : Pat<(i32 (trunc (sra i64:$s, (i32 32)))),
-          (I64toI32H Int64Regs:$s)>;
+          (I64toI32H $s)>;
 
 def: Pat<(i32 (sext (extractelt v2i16:$src, 0))),
-         (CVT_INREG_s32_s16 Int32Regs:$src)>;
+         (CVT_INREG_s32_s16 $src)>;
 
 foreach vt = [v2f16, v2bf16, v2i16] in {
 def : Pat<(extractelt vt:$src, 0),
-          (I32toI16L Int32Regs:$src)>;
+          (I32toI16L $src)>;
 def : Pat<(extractelt vt:$src, 1),
-          (I32toI16H Int32Regs:$src)>;
+          (I32toI16H $src)>;
 }
 def : Pat<(v2f16 (build_vector f16:$a, f16:$b)),
-          (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;
+          (V2I16toI32 $a, $b)>;
 def : Pat<(v2bf16 (build_vector bf16:$a, bf16:$b)),
-          (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;
+          (V2I16toI32 $a, $b)>;
 def : Pat<(v2i16 (build_vector i16:$a, i16:$b)),
-          (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;
+          (V2I16toI32 $a, $b)>;
 
 def: Pat<(v2i16 (scalar_to_vector i16:$a)),
-         (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
+         (CVT_u32_u16 $a, CvtNONE)>;
 
 //
 // Funnel-Shift
@@ -3455,13 +3455,13 @@ let hasSideEffects = false in {
 }
 
 def : Pat<(i32 (int_nvvm_fshl_clamp i32:$hi, i32:$lo, i32:$amt)),
-          (SHF_L_CLAMP_r Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt)>;
+          (SHF_L_CLAMP_r $lo, $hi, $amt)>;
 def : Pat<(i32 (int_nvvm_fshl_clamp i32:$hi, i32:$lo, (i32 imm:$amt))),
-          (SHF_L_CLAMP_i Int32Regs:$lo, Int32Regs:$hi, imm:$amt)>;
+          (SHF_L_CLAMP_i $lo, $hi, imm:$amt)>;
 def : Pat<(i32 (int_nvvm_fshr_clamp i32:$hi, i32:$lo, i32:$amt)),
-          (SHF_R_CLAMP_r Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt)>;
+          (SHF_R_CLAMP_r $lo, $hi, $amt)>;
 def : Pat<(i32 (int_nvvm_fshr_clamp i32:$hi, i32:$lo, (i32 imm:$amt))),
-          (SHF_R_CLAMP_i Int32Regs:$lo, Int32Regs:$hi, imm:$amt)>;
+          (SHF_R_CLAMP_i $lo, $hi, imm:$amt)>;
 
 // Count leading zeros
 let hasSideEffects = false in {
@@ -3472,14 +3472,14 @@ let hasSideEffects = false in {
 }
 
 // 32-bit has a direct PTX instruction
-def : Pat<(i32 (ctlz i32:$a)), (CLZr32 i32:$a)>;
+def : Pat<(i32 (ctlz i32:$a)), (CLZr32 $a)>;
 
 // The return type of the ctlz ISD node is the same as its input, but the PTX
 // ctz instruction always returns a 32-bit value.  For ctlz.i64, convert the
 // ptx value to 64 bits to match the ISD node's semantics, unless we know we're
 // truncating back down to 32 bits.
-def : Pat<(i64 (ctlz i64:$a)), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
-def : Pat<(i32 (trunc (i64 (ctlz i64:$a)))), (CLZr64 Int64Regs:$a)>;
+def : Pat<(i64 (ctlz i64:$a)), (CVT_u64_u32 (CLZr64 $a), CvtNONE)>;
+def : Pat<(i32 (trunc (i64 (ctlz i64:$a)))), (CLZr64 $a)>;
 
 // For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the
 // result back to 16-bits if necessary.  We also need to subtract 16 because
@@ -3497,9 +3497,9 @@ def : Pat<(i32 (trunc (i64 (ctlz i64:$a)))), (CLZr64 Int64Regs:$a)>;
 // "mov b32reg, {b16imm, b16reg}", so we don't do this optimization.
 def : Pat<(i16 (ctlz i16:$a)),
           (SUBi16ri (CVT_u16_u32
-           (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE), 16)>;
+           (CLZr32 (CVT_u32_u16 $a, CvtNONE)), CvtNONE), 16)>;
 def : Pat<(i32 (zext (i16 (ctlz i16:$a)))),
-          (SUBi32ri (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), 16)>;
+          (SUBi32ri (CLZr32 (CVT_u32_u16 $a, CvtNONE)), 16)>;
 
 // Population count
 let hasSideEffects = false in {
@@ -3510,67 +3510,67 @@ let hasSideEffects = false in {
 }
 
 // 32-bit has a direct PTX instruction
-def : Pat<(i32 (ctpop i32:$a)), (POPCr32 Int32Regs:$a)>;
+def : Pat<(i32 (ctpop i32:$a)), (POPCr32 $a)>;
 
 // For 64-bit, the result in PTX is actually 32-bit so we zero-extend to 64-bit
 // to match the LLVM semantics.  Just as with ctlz.i64, we provide a second
 // pattern that avoids the type conversion if we're truncating the result to
 // i32 anyway.
-def : Pat<(ctpop i64:$a), (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>;
-def : Pat<(i32 (trunc (i64 (ctpop i64:$a)))), (POPCr64 Int64Regs:$a)>;
+def : Pat<(ctpop i64:$a), (CVT_u64_u32 (POPCr64 $a), CvtNONE)>;
+def : Pat<(i32 (trunc (i64 (ctpop i64:$a)))), (POPCr64 $a)>;
 
 // For 16-bit, we zero-extend to 32-bit, then trunc the result back to 16-bits.
 // If we know that we're storing into an i32, we can avoid the final trunc.
 def : Pat<(ctpop i16:$a),
-          (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>;
+          (CVT_u16_u32 (POPCr32 (CVT_u32_u16 $a, CvtNONE)), CvtNONE)>;
 def : Pat<(i32 (zext (i16 (ctpop i16:$a)))),
-          (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE))>;
+          (POPCr32 (CVT_u32_u16 $a, CvtNONE))>;
 
 // fpround f32 -> f16
 def : Pat<(f16 (fpround f32:$a)),
-          (CVT_f16_f32 Float32Regs:$a, CvtRN)>;
+          (CVT_f16_f32 $a, CvtRN)>;
 
 // fpround f32 -> bf16
 def : Pat<(bf16 (fpround f32:$a)),
-          (CVT_bf16_f32 Float32Regs:$a, CvtRN)>, Requires<[hasPTX<70>, hasSM<80>]>;
+          (CVT_bf16_f32 $a, CvtRN)>, Requires<[hasPTX<70>, hasSM<80>]>;
 
 // fpround f64 -> f16
 def : Pat<(f16 (fpround f64:$a)),
-          (CVT_f16_f64 Float64Regs:$a, CvtRN)>;
+          (CVT_f16_f64 $a, CvtRN)>;
 
 // fpround f64 -> bf16
 def : Pat<(bf16 (fpround f64:$a)),
-          (CVT_bf16_f64 Float64Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
+          (CVT_bf16_f64 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
 // fpround f64 -> f32
 def : Pat<(f32 (fpround f64:$a)),
-          (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
+          (CVT_f32_f64 $a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
 def : Pat<(f32 (fpround f64:$a)),
-          (CVT_f32_f64 Float64Regs:$a, CvtRN)>;
+          (CVT_f32_f64 $a, CvtRN)>;
 
 // fpextend f16 -> f32
 def : Pat<(f32 (fpextend f16:$a)),
-          (CVT_f32_f16 Int16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
+          (CVT_f32_f16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
 def : Pat<(f32 (fpextend f16:$a)),
-          (CVT_f32_f16 Int16Regs:$a, CvtNONE)>;
+          (CVT_f32_f16 $a, CvtNONE)>;
 // fpextend bf16 -> f32
 def : Pat<(f32 (fpextend bf16:$a)),
-          (CVT_f32_bf16 Int16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
+          (CVT_f32_bf16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
 def : Pat<(f32 (fpextend bf16:$a)),
-          (CVT_f32_bf16 Int16Regs:$a, CvtNONE)>, Requires<[hasPTX<71>, hasSM<80>]>;
+          (CVT_f32_bf16 $a, CvtNONE)>, Requires<[hasPTX<71>, hasSM<80>]>;
 
 // fpextend f16 -> f64
 def : Pat<(f64 (fpextend f16:$a)),
-          (CVT_f64_f16 Int16Regs:$a, CvtNONE)>;
+          (CVT_f64_f16 $a, CvtNONE)>;
 
 // fpextend bf16 -> f64
 def : Pat<(f64 (fpextend bf16:$a)),
-          (CVT_f64_bf16 Int16Regs:$a, CvtNONE)>, Requires<[hasPTX<78>, hasSM<90>]>;
+          (CVT_f64_bf16 $a, CvtNONE)>, Requires<[hasPTX<78>, hasSM<90>]>;
 
 // fpextend f32 -> f64
 def : Pat<(f64 (fpextend f32:$a)),
-          (CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
+          (CVT_f64_f32 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
 def : Pat<(f64 (fpextend f32:$a)),
-          (CVT_f64_f32 Float32Regs:$a, CvtNONE)>;
+          (CVT_f64_f32 $a, CvtNONE)>;
 
 def retglue : SDNode<"NVPTXISD::RET_GLUE", SDTNone,
                      [SDNPHasChain, SDNPOptInGlue]>;
@@ -3579,15 +3579,15 @@ def retglue : SDNode<"NVPTXISD::RET_GLUE", SDTNone,
 
 multiclass CVT_ROUND {
   def : Pat<(OpNode f16:$a),
-            (CVT_f16_f16 Int16Regs:$a, Mode)>;
+            (CVT_f16_f16 $a, Mode)>;
   def : Pat<(OpNode bf16:$a),
-            (CVT_bf16_bf16 Int16Regs:$a, Mode)>;
+            (CVT_bf16_bf16 $a, Mode)>;
   def : Pat<(OpNode f32:$a),
-            (CVT_f32_f32 Float32Regs:$a, ModeFTZ)>, Requires<[doF32FTZ]>;
+            (CVT_f32_f32 $a, ModeFTZ)>, Requires<[doF32FTZ]>;
   def : Pat<(OpNode f32:$a),
-            (CVT_f32_f32 Float32Regs:$a, Mode)>, Requires<[doNoF32FTZ]>;
+            (CVT_f32_f32 $a, Mode)>, Requires<[doNoF32FTZ]>;
   def : Pat<(OpNode f64:$a),
-            (CVT_f64_f64 Float64Regs:$a, Mode)>;
+            (CVT_f64_f64 $a, Mode)>;
 }
 
 defm : CVT_ROUND;
@@ -3624,7 +3624,7 @@ let isTerminator=1 in {
 }
 
 def : Pat<(brcond i32:$a, bb:$target),
-          (CBranch (SETP_u32ri Int32Regs:$a, 0, CmpNE), bb:$target)>;
+          (CBranch (SETP_u32ri $a, 0, CmpNE), bb:$target)>;
 
 // SelectionDAGBuilder::visitSWitchCase() will invert the condition of a
 // conditional branch if the target block is the next block so that the code
@@ -3632,7 +3632,7 @@ def : Pat<(brcond i32:$a, bb:$target),
 // condition, 1', which will be translated to (setne condition, -1).  Since ptx
 // supports '@!pred bra target', we should use it.
 def : Pat<(brcond (i1 (setne i1:$a, -1)), bb:$target),
-          (CBranchOther i1:$a, bb:$target)>;
+          (CBranchOther $a, bb:$target)>;
 
 // Call
 def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
@@ -3830,17 +3830,17 @@ include "NVPTXIntrinsics.td"
 
 def : Pat <
   (i32 (bswap i32:$a)),
-  (INT_NVVM_PRMT Int32Regs:$a, (i32 0), (i32 0x0123))>;
+  (INT_NVVM_PRMT $a, (i32 0), (i32 0x0123))>;
 
 def : Pat <
   (v2i16 (bswap v2i16:$a)),
-  (INT_NVVM_PRMT Int32Regs:$a, (i32 0), (i32 0x2301))>;
+  (INT_NVVM_PRMT $a, (i32 0), (i32 0x2301))>;
 
 def : Pat <
   (i64 (bswap i64:$a)),
   (V2I32toI64
-    (INT_NVVM_PRMT (I64toI32H Int64Regs:$a), (i32 0), (i32 0x0123)),
-    (INT_NVVM_PRMT (I64toI32L Int64Regs:$a), (i32 0), (i32 0x0123)))>;
+    (INT_NVVM_PRMT (I64toI32H $a), (i32 0), (i32 0x0123)),
+    (INT_NVVM_PRMT (I64toI32L $a), (i32 0), (i32 0x0123)))>;
 
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -3910,18 +3910,18 @@ def FMARELU_BF16X2 : NVPTXInst_rrr,
+  (FMARELU_F16_FTZ $a, $b, $c)>,
   Requires<[doF32FTZ]>;
 def : Pat<(v2f16 (NVPTX_fmaxnum_nsz (NVPTX_fma_oneuse_and_nnan v2f16:$a, v2f16:$b, v2f16:$c), fpimm_positive_zero_v2f16)),
-  (FMARELU_F16X2_FTZ Int32Regs:$a, Int32Regs:$b, Int32Regs:$c)>,
+  (FMARELU_F16X2_FTZ $a, $b, $c)>,
   Requires<[doF32FTZ]>;
 
 // NO FTZ
 def : Pat<(f16 (NVPTX_fmaxnum_nsz (NVPTX_fma_oneuse_and_nnan f16:$a, f16:$b, f16:$c), fpimm_any_zero)),
-  (FMARELU_F16 Int16Regs:$a, Int16Regs:$b, Int16Regs:$c)>;
+  (FMARELU_F16 $a, $b, $c)>;
 def : Pat<(bf16 (NVPTX_fmaxnum_nsz (NVPTX_fma_oneuse_and_nnan bf16:$a, bf16:$b, bf16:$c), fpimm_any_zero)),
-  (FMARELU_BF16 Int16Regs:$a, Int16Regs:$b, Int16Regs:$c)>;
+  (FMARELU_BF16 $a, $b, $c)>;
 def : Pat<(v2f16 (NVPTX_fmaxnum_nsz (NVPTX_fma_oneuse_and_nnan v2f16:$a, v2f16:$b, v2f16:$c), fpimm_positive_zero_v2f16)),
-  (FMARELU_F16X2 Int32Regs:$a, Int32Regs:$b, Int32Regs:$c)>;
+  (FMARELU_F16X2 $a, $b, $c)>;
 def : Pat<(v2bf16 (NVPTX_fmaxnum_nsz (NVPTX_fma_oneuse_and_nnan v2bf16:$a, v2bf16:$b, v2bf16:$c), fpimm_positive_zero_v2bf16)),
-  (FMARELU_BF16X2 Int32Regs:$a, Int32Regs:$b, Int32Regs:$c)>;
+  (FMARELU_BF16X2 $a, $b, $c)>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 0773c1bbc5781..8ede1ec4f20dc 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -824,29 +824,29 @@ def MBARRIER_PENDING_COUNT :
 
 def : Pat<(int_nvvm_fmin_f immFloat1,
             (int_nvvm_fmax_f immFloat0, f32:$a)),
-          (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
+          (CVT_f32_f32 $a, CvtSAT)>;
 def : Pat<(int_nvvm_fmin_f immFloat1,
             (int_nvvm_fmax_f f32:$a, immFloat0)),
-          (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
+          (CVT_f32_f32 $a, CvtSAT)>;
 def : Pat<(int_nvvm_fmin_f
             (int_nvvm_fmax_f immFloat0, f32:$a), immFloat1),
-          (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
+          (CVT_f32_f32 $a, CvtSAT)>;
 def : Pat<(int_nvvm_fmin_f
             (int_nvvm_fmax_f f32:$a, immFloat0), immFloat1),
-          (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
+          (CVT_f32_f32 $a, CvtSAT)>;
 
 def : Pat<(int_nvvm_fmin_d immDouble1,
             (int_nvvm_fmax_d immDouble0, f64:$a)),
-          (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
+          (CVT_f64_f64 $a, CvtSAT)>;
 def : Pat<(int_nvvm_fmin_d immDouble1,
             (int_nvvm_fmax_d f64:$a, immDouble0)),
-          (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
+          (CVT_f64_f64 $a, CvtSAT)>;
 def : Pat<(int_nvvm_fmin_d
             (int_nvvm_fmax_d immDouble0, f64:$a), immDouble1),
-          (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
+          (CVT_f64_f64 $a, CvtSAT)>;
 def : Pat<(int_nvvm_fmin_d
             (int_nvvm_fmax_d f64:$a, immDouble0), immDouble1),
-          (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
+          (CVT_f64_f64 $a, CvtSAT)>;
 
 
 // We need a full string for OpcStr here because we need to deal with case like
@@ -1125,16 +1125,16 @@ def INT_NVVM_DIV_RP_D : F_MATH_2<"div.rp.f64 \t$dst, $src0, $src1;",
   Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rp_d>;
 
 def : Pat<(int_nvvm_div_full f32:$a, f32:$b),
-          (FDIV32rr Float32Regs:$a, Float32Regs:$b)>;
+          (FDIV32rr $a, $b)>;
 
 def : Pat<(int_nvvm_div_full f32:$a, fpimm:$b),
-          (FDIV32ri Float32Regs:$a, f32imm:$b)>;
+          (FDIV32ri $a, f32imm:$b)>;
 
 def : Pat<(int_nvvm_div_full_ftz f32:$a, f32:$b),
-          (FDIV32rr_ftz Float32Regs:$a, Float32Regs:$b)>;
+          (FDIV32rr_ftz $a, $b)>;
 
 def : Pat<(int_nvvm_div_full_ftz f32:$a, fpimm:$b),
-          (FDIV32ri_ftz Float32Regs:$a, f32imm:$b)>;
+          (FDIV32ri_ftz $a, f32imm:$b)>;
 
 //
 // Sad
@@ -1158,18 +1158,18 @@ def INT_NVVM_SAD_ULL : F_MATH_3<"sad.u64 \t$dst, $src0, $src1, $src2;",
 //
 
 def : Pat<(int_nvvm_floor_ftz_f f32:$a),
-          (CVT_f32_f32 Float32Regs:$a, CvtRMI_FTZ)>;
+          (CVT_f32_f32 $a, CvtRMI_FTZ)>;
 def : Pat<(int_nvvm_floor_f f32:$a),
-          (CVT_f32_f32 Float32Regs:$a, CvtRMI)>;
+          (CVT_f32_f32 $a, CvtRMI)>;
 def : Pat<(int_nvvm_floor_d f64:$a),
-          (CVT_f64_f64 Float64Regs:$a, CvtRMI)>;
+          (CVT_f64_f64 $a, CvtRMI)>;
 
 def : Pat<(int_nvvm_ceil_ftz_f f32:$a),
-          (CVT_f32_f32 Float32Regs:$a, CvtRPI_FTZ)>;
+          (CVT_f32_f32 $a, CvtRPI_FTZ)>;
 def : Pat<(int_nvvm_ceil_f f32:$a),
-          (CVT_f32_f32 Float32Regs:$a, CvtRPI)>;
+          (CVT_f32_f32 $a, CvtRPI)>;
 def : Pat<(int_nvvm_ceil_d f64:$a),
-          (CVT_f64_f64 Float64Regs:$a, CvtRPI)>;
+          (CVT_f64_f64 $a, CvtRPI)>;
 
 //
 // Abs
@@ -1217,33 +1217,33 @@ def INT_NVVM_NEG_BF16X2 : F_MATH_1<"neg.bf16x2 \t$dst, $src0;", Int32Regs,
 //
 
 def : Pat<(int_nvvm_round_ftz_f f32:$a),
-          (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>;
+          (CVT_f32_f32 $a, CvtRNI_FTZ)>;
 def : Pat<(int_nvvm_round_f f32:$a),
-          (CVT_f32_f32 Float32Regs:$a, CvtRNI)>;
+          (CVT_f32_f32 $a, CvtRNI)>;
 def : Pat<(int_nvvm_round_d f64:$a),
-          (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
+          (CVT_f64_f64 $a, CvtRNI)>;
 
 //
 // Trunc
 //
 
 def : Pat<(int_nvvm_trunc_ftz_f f32:$a),
-          (CVT_f32_f32 Float32Regs:$a, CvtRZI_FTZ)>;
+          (CVT_f32_f32 $a, CvtRZI_FTZ)>;
 def : Pat<(int_nvvm_trunc_f f32:$a),
-          (CVT_f32_f32 Float32Regs:$a, CvtRZI)>;
+          (CVT_f32_f32 $a, CvtRZI)>;
 def : Pat<(int_nvvm_trunc_d f64:$a),
-          (CVT_f64_f64 Float64Regs:$a, CvtRZI)>;
+          (CVT_f64_f64 $a, CvtRZI)>;
 
 //
 // Saturate
 //
 
 def : Pat<(int_nvvm_saturate_ftz_f f32:$a),
-          (CVT_f32_f32 Float32Regs:$a, CvtSAT_FTZ)>;
+          (CVT_f32_f32 $a, CvtSAT_FTZ)>;
 def : Pat<(int_nvvm_saturate_f f32:$a),
-          (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
+          (CVT_f32_f32 $a, CvtSAT)>;
 def : Pat<(int_nvvm_saturate_d f64:$a),
-          (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
+          (CVT_f64_f64 $a, CvtSAT)>;
 
 //
 // Exp2  Log2
@@ -1430,13 +1430,13 @@ def INT_NVVM_SQRT_RP_D : F_MATH_1<"sqrt.rp.f64 \t$dst, $src0;", Float64Regs,
 
 // nvvm_sqrt intrinsic
 def : Pat<(int_nvvm_sqrt_f f32:$a),
-          (INT_NVVM_SQRT_RN_FTZ_F Float32Regs:$a)>, Requires<[doF32FTZ, do_SQRTF32_RN]>;
+          (INT_NVVM_SQRT_RN_FTZ_F $a)>, Requires<[doF32FTZ, do_SQRTF32_RN]>;
 def : Pat<(int_nvvm_sqrt_f f32:$a),
-          (INT_NVVM_SQRT_RN_F Float32Regs:$a)>, Requires<[do_SQRTF32_RN]>;
+          (INT_NVVM_SQRT_RN_F $a)>, Requires<[do_SQRTF32_RN]>;
 def : Pat<(int_nvvm_sqrt_f f32:$a),
-          (INT_NVVM_SQRT_APPROX_FTZ_F Float32Regs:$a)>, Requires<[doF32FTZ]>;
+          (INT_NVVM_SQRT_APPROX_FTZ_F $a)>, Requires<[doF32FTZ]>;
 def : Pat<(int_nvvm_sqrt_f f32:$a),
-          (INT_NVVM_SQRT_APPROX_F Float32Regs:$a)>;
+          (INT_NVVM_SQRT_APPROX_F $a)>;
 
 //
 // Rsqrt
@@ -1456,24 +1456,24 @@ def INT_NVVM_RSQRT_APPROX_D : F_MATH_1<"rsqrt.approx.f64 \t$dst, $src0;",
 
 // 1.0f / sqrt_approx -> rsqrt_approx
 def: Pat<(fdiv FloatConst1, (int_nvvm_sqrt_approx_f f32:$a)),
-         (INT_NVVM_RSQRT_APPROX_F Float32Regs:$a)>,
+         (INT_NVVM_RSQRT_APPROX_F $a)>,
          Requires<[doRsqrtOpt]>;
 def: Pat<(fdiv FloatConst1, (int_nvvm_sqrt_approx_ftz_f f32:$a)),
-         (INT_NVVM_RSQRT_APPROX_FTZ_F Float32Regs:$a)>,
+         (INT_NVVM_RSQRT_APPROX_FTZ_F $a)>,
          Requires<[doRsqrtOpt]>;
 // same for int_nvvm_sqrt_f when non-precision sqrt is requested
 def: Pat<(fdiv FloatConst1, (int_nvvm_sqrt_f f32:$a)),
-         (INT_NVVM_RSQRT_APPROX_F Float32Regs:$a)>,
+         (INT_NVVM_RSQRT_APPROX_F $a)>,
          Requires<[doRsqrtOpt, do_SQRTF32_APPROX, doNoF32FTZ]>;
 def: Pat<(fdiv FloatConst1, (int_nvvm_sqrt_f f32:$a)),
-         (INT_NVVM_RSQRT_APPROX_FTZ_F Float32Regs:$a)>,
+         (INT_NVVM_RSQRT_APPROX_FTZ_F $a)>,
          Requires<[doRsqrtOpt, do_SQRTF32_APPROX, doF32FTZ]>;
 
 def: Pat<(fdiv FloatConst1, (fsqrt f32:$a)),
-         (INT_NVVM_RSQRT_APPROX_F Float32Regs:$a)>,
+         (INT_NVVM_RSQRT_APPROX_F $a)>,
          Requires<[doRsqrtOpt, do_SQRTF32_APPROX, doNoF32FTZ]>;
 def: Pat<(fdiv FloatConst1, (fsqrt f32:$a)),
-         (INT_NVVM_RSQRT_APPROX_FTZ_F Float32Regs:$a)>,
+         (INT_NVVM_RSQRT_APPROX_FTZ_F $a)>,
          Requires<[doRsqrtOpt, do_SQRTF32_APPROX, doF32FTZ]>;
 //
 // Add
@@ -1529,136 +1529,136 @@ foreach t = [I32RT, I64RT] in {
 //
 
 def : Pat<(int_nvvm_d2f_rn_ftz f64:$a),
-          (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>;
+          (CVT_f32_f64 $a, CvtRN_FTZ)>;
 def : Pat<(int_nvvm_d2f_rn f64:$a),
-          (CVT_f32_f64 Float64Regs:$a, CvtRN)>;
+          (CVT_f32_f64 $a, CvtRN)>;
 def : Pat<(int_nvvm_d2f_rz_ftz f64:$a),
-          (CVT_f32_f64 Float64Regs:$a, CvtRZ_FTZ)>;
+          (CVT_f32_f64 $a, CvtRZ_FTZ)>;
 def : Pat<(int_nvvm_d2f_rz f64:$a),
-          (CVT_f32_f64 Float64Regs:$a, CvtRZ)>;
+          (CVT_f32_f64 $a, CvtRZ)>;
 def : Pat<(int_nvvm_d2f_rm_ftz f64:$a),
-          (CVT_f32_f64 Float64Regs:$a, CvtRM_FTZ)>;
+          (CVT_f32_f64 $a, CvtRM_FTZ)>;
 def : Pat<(int_nvvm_d2f_rm f64:$a),
-          (CVT_f32_f64 Float64Regs:$a, CvtRM)>;
+          (CVT_f32_f64 $a, CvtRM)>;
 def : Pat<(int_nvvm_d2f_rp_ftz f64:$a),
-          (CVT_f32_f64 Float64Regs:$a, CvtRP_FTZ)>;
+          (CVT_f32_f64 $a, CvtRP_FTZ)>;
 def : Pat<(int_nvvm_d2f_rp f64:$a),
-          (CVT_f32_f64 Float64Regs:$a, CvtRP)>;
+          (CVT_f32_f64 $a, CvtRP)>;
 
 def : Pat<(int_nvvm_d2i_rn f64:$a),
-          (CVT_s32_f64 Float64Regs:$a, CvtRNI)>;
+          (CVT_s32_f64 $a, CvtRNI)>;
 def : Pat<(int_nvvm_d2i_rz f64:$a),
-          (CVT_s32_f64 Float64Regs:$a, CvtRZI)>;
+          (CVT_s32_f64 $a, CvtRZI)>;
 def : Pat<(int_nvvm_d2i_rm f64:$a),
-          (CVT_s32_f64 Float64Regs:$a, CvtRMI)>;
+          (CVT_s32_f64 $a, CvtRMI)>;
 def : Pat<(int_nvvm_d2i_rp f64:$a),
-          (CVT_s32_f64 Float64Regs:$a, CvtRPI)>;
+          (CVT_s32_f64 $a, CvtRPI)>;
 
 def : Pat<(int_nvvm_d2ui_rn f64:$a),
-          (CVT_u32_f64 Float64Regs:$a, CvtRNI)>;
+          (CVT_u32_f64 $a, CvtRNI)>;
 def : Pat<(int_nvvm_d2ui_rz f64:$a),
-          (CVT_u32_f64 Float64Regs:$a, CvtRZI)>;
+          (CVT_u32_f64 $a, CvtRZI)>;
 def : Pat<(int_nvvm_d2ui_rm f64:$a),
-          (CVT_u32_f64 Float64Regs:$a, CvtRMI)>;
+          (CVT_u32_f64 $a, CvtRMI)>;
 def : Pat<(int_nvvm_d2ui_rp f64:$a),
-          (CVT_u32_f64 Float64Regs:$a, CvtRPI)>;
+          (CVT_u32_f64 $a, CvtRPI)>;
 
 def : Pat<(int_nvvm_i2d_rn i32:$a),
-          (CVT_f64_s32 Int32Regs:$a, CvtRN)>;
+          (CVT_f64_s32 $a, CvtRN)>;
 def : Pat<(int_nvvm_i2d_rz i32:$a),
-          (CVT_f64_s32 Int32Regs:$a, CvtRZ)>;
+          (CVT_f64_s32 $a, CvtRZ)>;
 def : Pat<(int_nvvm_i2d_rm i32:$a),
-          (CVT_f64_s32 Int32Regs:$a, CvtRM)>;
+          (CVT_f64_s32 $a, CvtRM)>;
 def : Pat<(int_nvvm_i2d_rp i32:$a),
-          (CVT_f64_s32 Int32Regs:$a, CvtRP)>;
+          (CVT_f64_s32 $a, CvtRP)>;
 
 def : Pat<(int_nvvm_ui2d_rn i32:$a),
-          (CVT_f64_u32 Int32Regs:$a, CvtRN)>;
+          (CVT_f64_u32 $a, CvtRN)>;
 def : Pat<(int_nvvm_ui2d_rz i32:$a),
-          (CVT_f64_u32 Int32Regs:$a, CvtRZ)>;
+          (CVT_f64_u32 $a, CvtRZ)>;
 def : Pat<(int_nvvm_ui2d_rm i32:$a),
-          (CVT_f64_u32 Int32Regs:$a, CvtRM)>;
+          (CVT_f64_u32 $a, CvtRM)>;
 def : Pat<(int_nvvm_ui2d_rp i32:$a),
-          (CVT_f64_u32 Int32Regs:$a, CvtRP)>;
+          (CVT_f64_u32 $a, CvtRP)>;
 
 def : Pat<(int_nvvm_f2i_rn_ftz f32:$a),
-          (CVT_s32_f32 Float32Regs:$a, CvtRNI_FTZ)>;
+          (CVT_s32_f32 $a, CvtRNI_FTZ)>;
 def : Pat<(int_nvvm_f2i_rn f32:$a),
-          (CVT_s32_f32 Float32Regs:$a, CvtRNI)>;
+          (CVT_s32_f32 $a, CvtRNI)>;
 def : Pat<(int_nvvm_f2i_rz_ftz f32:$a),
-          (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>;
+          (CVT_s32_f32 $a, CvtRZI_FTZ)>;
 def : Pat<(int_nvvm_f2i_rz f32:$a),
-          (CVT_s32_f32 Float32Regs:$a, CvtRZI)>;
+          (CVT_s32_f32 $a, CvtRZI)>;
 def : Pat<(int_nvvm_f2i_rm_ftz f32:$a),
-          (CVT_s32_f32 Float32Regs:$a, CvtRMI_FTZ)>;
+          (CVT_s32_f32 $a, CvtRMI_FTZ)>;
 def : Pat<(int_nvvm_f2i_rm f32:$a),
-          (CVT_s32_f32 Float32Regs:$a, CvtRMI)>;
+          (CVT_s32_f32 $a, CvtRMI)>;
 def : Pat<(int_nvvm_f2i_rp_ftz f32:$a),
-          (CVT_s32_f32 Float32Regs:$a, CvtRPI_FTZ)>;
+          (CVT_s32_f32 $a, CvtRPI_FTZ)>;
 def : Pat<(int_nvvm_f2i_rp f32:$a),
-          (CVT_s32_f32 Float32Regs:$a, CvtRPI)>;
+          (CVT_s32_f32 $a, CvtRPI)>;
 
 def : Pat<(int_nvvm_f2ui_rn_ftz f32:$a),
-          (CVT_u32_f32 Float32Regs:$a, CvtRNI_FTZ)>;
+          (CVT_u32_f32 $a, CvtRNI_FTZ)>;
 def : Pat<(int_nvvm_f2ui_rn f32:$a),
-          (CVT_u32_f32 Float32Regs:$a, CvtRNI)>;
+          (CVT_u32_f32 $a, CvtRNI)>;
 def : Pat<(int_nvvm_f2ui_rz_ftz f32:$a),
-          (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>;
+          (CVT_u32_f32 $a, CvtRZI_FTZ)>;
 def : Pat<(int_nvvm_f2ui_rz f32:$a),
-          (CVT_u32_f32 Float32Regs:$a, CvtRZI)>;
+          (CVT_u32_f32 $a, CvtRZI)>;
 def : Pat<(int_nvvm_f2ui_rm_ftz f32:$a),
-          (CVT_u32_f32 Float32Regs:$a, CvtRMI_FTZ)>;
+          (CVT_u32_f32 $a, CvtRMI_FTZ)>;
 def : Pat<(int_nvvm_f2ui_rm f32:$a),
-          (CVT_u32_f32 Float32Regs:$a, CvtRMI)>;
+          (CVT_u32_f32 $a, CvtRMI)>;
 def : Pat<(int_nvvm_f2ui_rp_ftz f32:$a),
-          (CVT_u32_f32 Float32Regs:$a, CvtRPI_FTZ)>;
+          (CVT_u32_f32 $a, CvtRPI_FTZ)>;
 def : Pat<(int_nvvm_f2ui_rp f32:$a),
-          (CVT_u32_f32 Float32Regs:$a, CvtRPI)>;
+          (CVT_u32_f32 $a, CvtRPI)>;
 
 def : Pat<(int_nvvm_i2f_rn i32:$a),
-          (CVT_f32_s32 Int32Regs:$a, CvtRN)>;
+          (CVT_f32_s32 $a, CvtRN)>;
 def : Pat<(int_nvvm_i2f_rz i32:$a),
-          (CVT_f32_s32 Int32Regs:$a, CvtRZ)>;
+          (CVT_f32_s32 $a, CvtRZ)>;
 def : Pat<(int_nvvm_i2f_rm i32:$a),
-          (CVT_f32_s32 Int32Regs:$a, CvtRM)>;
+          (CVT_f32_s32 $a, CvtRM)>;
 def : Pat<(int_nvvm_i2f_rp i32:$a),
-          (CVT_f32_s32 Int32Regs:$a, CvtRP)>;
+          (CVT_f32_s32 $a, CvtRP)>;
 
 def : Pat<(int_nvvm_ui2f_rn i32:$a),
-          (CVT_f32_u32 Int32Regs:$a, CvtRN)>;
+          (CVT_f32_u32 $a, CvtRN)>;
 def : Pat<(int_nvvm_ui2f_rz i32:$a),
-          (CVT_f32_u32 Int32Regs:$a, CvtRZ)>;
+          (CVT_f32_u32 $a, CvtRZ)>;
 def : Pat<(int_nvvm_ui2f_rm i32:$a),
-          (CVT_f32_u32 Int32Regs:$a, CvtRM)>;
+          (CVT_f32_u32 $a, CvtRM)>;
 def : Pat<(int_nvvm_ui2f_rp i32:$a),
-          (CVT_f32_u32 Int32Regs:$a, CvtRP)>;
+          (CVT_f32_u32 $a, CvtRP)>;
 
 def : Pat<(int_nvvm_ff2bf16x2_rn f32:$a, f32:$b),
-          (CVT_bf16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRN)>;
+          (CVT_bf16x2_f32 $a, $b, CvtRN)>;
 def : Pat<(int_nvvm_ff2bf16x2_rn_relu f32:$a, f32:$b),
-          (CVT_bf16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRN_RELU)>;
+          (CVT_bf16x2_f32 $a, $b, CvtRN_RELU)>;
 def : Pat<(int_nvvm_ff2bf16x2_rz f32:$a, f32:$b),
-          (CVT_bf16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRZ)>;
+          (CVT_bf16x2_f32 $a, $b, CvtRZ)>;
 def : Pat<(int_nvvm_ff2bf16x2_rz_relu f32:$a, f32:$b),
-          (CVT_bf16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRZ_RELU)>;
+          (CVT_bf16x2_f32 $a, $b, CvtRZ_RELU)>;
 
 def : Pat<(int_nvvm_ff2f16x2_rn f32:$a, f32:$b),
-          (CVT_f16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRN)>;
+          (CVT_f16x2_f32 $a, $b, CvtRN)>;
 def : Pat<(int_nvvm_ff2f16x2_rn_relu f32:$a, f32:$b),
-          (CVT_f16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRN_RELU)>;
+          (CVT_f16x2_f32 $a, $b, CvtRN_RELU)>;
 def : Pat<(int_nvvm_ff2f16x2_rz f32:$a, f32:$b),
-          (CVT_f16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRZ)>;
+          (CVT_f16x2_f32 $a, $b, CvtRZ)>;
 def : Pat<(int_nvvm_ff2f16x2_rz_relu f32:$a, f32:$b),
-          (CVT_f16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRZ_RELU)>;
+          (CVT_f16x2_f32 $a, $b, CvtRZ_RELU)>;
 
 def : Pat<(int_nvvm_f2bf16_rn f32:$a),
-          (CVT_bf16_f32 Float32Regs:$a, CvtRN)>;
+          (CVT_bf16_f32 $a, CvtRN)>;
 def : Pat<(int_nvvm_f2bf16_rn_relu f32:$a),
-          (CVT_bf16_f32 Float32Regs:$a, CvtRN_RELU)>;
+          (CVT_bf16_f32 $a, CvtRN_RELU)>;
 def : Pat<(int_nvvm_f2bf16_rz f32:$a),
-          (CVT_bf16_f32 Float32Regs:$a, CvtRZ)>;
+          (CVT_bf16_f32 $a, CvtRZ)>;
 def : Pat<(int_nvvm_f2bf16_rz_relu f32:$a),
-          (CVT_bf16_f32 Float32Regs:$a, CvtRZ_RELU)>;
+          (CVT_bf16_f32 $a, CvtRZ_RELU)>;
 
 def CVT_tf32_f32 :
    NVPTXInst<(outs Int32Regs:$dest), (ins Float32Regs:$a),
@@ -1682,125 +1682,125 @@ def INT_NVVM_D2I_HI : F_MATH_1<
   Int32Regs, Float64Regs, int_nvvm_d2i_hi>;
 
 def : Pat<(int_nvvm_f2ll_rn_ftz f32:$a),
-          (CVT_s64_f32 Float32Regs:$a, CvtRNI_FTZ)>;
+          (CVT_s64_f32 $a, CvtRNI_FTZ)>;
 def : Pat<(int_nvvm_f2ll_rn f32:$a),
-          (CVT_s64_f32 Float32Regs:$a, CvtRNI)>;
+          (CVT_s64_f32 $a, CvtRNI)>;
 def : Pat<(int_nvvm_f2ll_rz_ftz f32:$a),
-          (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>;
+          (CVT_s64_f32 $a, CvtRZI_FTZ)>;
 def : Pat<(int_nvvm_f2ll_rz f32:$a),
-          (CVT_s64_f32 Float32Regs:$a, CvtRZI)>;
+          (CVT_s64_f32 $a, CvtRZI)>;
 def : Pat<(int_nvvm_f2ll_rm_ftz f32:$a),
-          (CVT_s64_f32 Float32Regs:$a, CvtRMI_FTZ)>;
+          (CVT_s64_f32 $a, CvtRMI_FTZ)>;
 def : Pat<(int_nvvm_f2ll_rm f32:$a),
-          (CVT_s64_f32 Float32Regs:$a, CvtRMI)>;
+          (CVT_s64_f32 $a, CvtRMI)>;
 def : Pat<(int_nvvm_f2ll_rp_ftz f32:$a),
-          (CVT_s64_f32 Float32Regs:$a, CvtRPI_FTZ)>;
+          (CVT_s64_f32 $a, CvtRPI_FTZ)>;
 def : Pat<(int_nvvm_f2ll_rp f32:$a),
-          (CVT_s64_f32 Float32Regs:$a, CvtRPI)>;
+          (CVT_s64_f32 $a, CvtRPI)>;
 
 def : Pat<(int_nvvm_f2ull_rn_ftz f32:$a),
-          (CVT_u64_f32 Float32Regs:$a, CvtRNI_FTZ)>;
+          (CVT_u64_f32 $a, CvtRNI_FTZ)>;
 def : Pat<(int_nvvm_f2ull_rn f32:$a),
-          (CVT_u64_f32 Float32Regs:$a, CvtRNI)>;
+          (CVT_u64_f32 $a, CvtRNI)>;
 def : Pat<(int_nvvm_f2ull_rz_ftz f32:$a),
-          (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>;
+          (CVT_u64_f32 $a, CvtRZI_FTZ)>;
 def : Pat<(int_nvvm_f2ull_rz f32:$a),
-          (CVT_u64_f32 Float32Regs:$a, CvtRZI)>;
+          (CVT_u64_f32 $a, CvtRZI)>;
 def : Pat<(int_nvvm_f2ull_rm_ftz f32:$a),
-          (CVT_u64_f32 Float32Regs:$a, CvtRMI_FTZ)>;
+          (CVT_u64_f32 $a, CvtRMI_FTZ)>;
 def : Pat<(int_nvvm_f2ull_rm f32:$a),
-          (CVT_u64_f32 Float32Regs:$a, CvtRMI)>;
+          (CVT_u64_f32 $a, CvtRMI)>;
 def : Pat<(int_nvvm_f2ull_rp_ftz f32:$a),
-          (CVT_u64_f32 Float32Regs:$a, CvtRPI_FTZ)>;
+          (CVT_u64_f32 $a, CvtRPI_FTZ)>;
 def : Pat<(int_nvvm_f2ull_rp f32:$a),
-          (CVT_u64_f32 Float32Regs:$a, CvtRPI)>;
+          (CVT_u64_f32 $a, CvtRPI)>;
 
 def : Pat<(int_nvvm_d2ll_rn f64:$a),
-          (CVT_s64_f64 Float64Regs:$a, CvtRNI)>;
+          (CVT_s64_f64 $a, CvtRNI)>;
 def : Pat<(int_nvvm_d2ll_rz f64:$a),
-          (CVT_s64_f64 Float64Regs:$a, CvtRZI)>;
+          (CVT_s64_f64 $a, CvtRZI)>;
 def : Pat<(int_nvvm_d2ll_rm f64:$a),
-          (CVT_s64_f64 Float64Regs:$a, CvtRMI)>;
+          (CVT_s64_f64 $a, CvtRMI)>;
 def : Pat<(int_nvvm_d2ll_rp f64:$a),
-          (CVT_s64_f64 Float64Regs:$a, CvtRPI)>;
+          (CVT_s64_f64 $a, CvtRPI)>;
 
 def : Pat<(int_nvvm_d2ull_rn f64:$a),
-          (CVT_u64_f64 Float64Regs:$a, CvtRNI)>;
+          (CVT_u64_f64 $a, CvtRNI)>;
 def : Pat<(int_nvvm_d2ull_rz f64:$a),
-          (CVT_u64_f64 Float64Regs:$a, CvtRZI)>;
+          (CVT_u64_f64 $a, CvtRZI)>;
 def : Pat<(int_nvvm_d2ull_rm f64:$a),
-          (CVT_u64_f64 Float64Regs:$a, CvtRMI)>;
+          (CVT_u64_f64 $a, CvtRMI)>;
 def : Pat<(int_nvvm_d2ull_rp f64:$a),
-          (CVT_u64_f64 Float64Regs:$a, CvtRPI)>;
+          (CVT_u64_f64 $a, CvtRPI)>;
 
 def : Pat<(int_nvvm_ll2f_rn i64:$a),
-          (CVT_f32_s64 Int64Regs:$a, CvtRN)>;
+          (CVT_f32_s64 $a, CvtRN)>;
 def : Pat<(int_nvvm_ll2f_rz i64:$a),
-          (CVT_f32_s64 Int64Regs:$a, CvtRZ)>;
+          (CVT_f32_s64 $a, CvtRZ)>;
 def : Pat<(int_nvvm_ll2f_rm i64:$a),
-          (CVT_f32_s64 Int64Regs:$a, CvtRM)>;
+          (CVT_f32_s64 $a, CvtRM)>;
 def : Pat<(int_nvvm_ll2f_rp i64:$a),
-          (CVT_f32_s64 Int64Regs:$a, CvtRP)>;
+          (CVT_f32_s64 $a, CvtRP)>;
 
 def : Pat<(int_nvvm_ull2f_rn i64:$a),
-          (CVT_f32_u64 Int64Regs:$a, CvtRN)>;
+          (CVT_f32_u64 $a, CvtRN)>;
 def : Pat<(int_nvvm_ull2f_rz i64:$a),
-          (CVT_f32_u64 Int64Regs:$a, CvtRZ)>;
+          (CVT_f32_u64 $a, CvtRZ)>;
 def : Pat<(int_nvvm_ull2f_rm i64:$a),
-          (CVT_f32_u64 Int64Regs:$a, CvtRM)>;
+          (CVT_f32_u64 $a, CvtRM)>;
 def : Pat<(int_nvvm_ull2f_rp i64:$a),
-          (CVT_f32_u64 Int64Regs:$a, CvtRP)>;
+          (CVT_f32_u64 $a, CvtRP)>;
 
 def : Pat<(int_nvvm_ll2d_rn i64:$a),
-          (CVT_f64_s64 Int64Regs:$a, CvtRN)>;
+          (CVT_f64_s64 $a, CvtRN)>;
 def : Pat<(int_nvvm_ll2d_rz i64:$a),
-          (CVT_f64_s64 Int64Regs:$a, CvtRZ)>;
+          (CVT_f64_s64 $a, CvtRZ)>;
 def : Pat<(int_nvvm_ll2d_rm i64:$a),
-          (CVT_f64_s64 Int64Regs:$a, CvtRM)>;
+          (CVT_f64_s64 $a, CvtRM)>;
 def : Pat<(int_nvvm_ll2d_rp i64:$a),
-          (CVT_f64_s64 Int64Regs:$a, CvtRP)>;
+          (CVT_f64_s64 $a, CvtRP)>;
 
 def : Pat<(int_nvvm_ull2d_rn i64:$a),
-          (CVT_f64_u64 Int64Regs:$a, CvtRN)>;
+          (CVT_f64_u64 $a, CvtRN)>;
 def : Pat<(int_nvvm_ull2d_rz i64:$a),
-          (CVT_f64_u64 Int64Regs:$a, CvtRZ)>;
+          (CVT_f64_u64 $a, CvtRZ)>;
 def : Pat<(int_nvvm_ull2d_rm i64:$a),
-          (CVT_f64_u64 Int64Regs:$a, CvtRM)>;
+          (CVT_f64_u64 $a, CvtRM)>;
 def : Pat<(int_nvvm_ull2d_rp i64:$a),
-          (CVT_f64_u64 Int64Regs:$a, CvtRP)>;
+          (CVT_f64_u64 $a, CvtRP)>;
 
 
 def : Pat<(int_nvvm_f2h_rn_ftz f32:$a),
-          (CVT_f16_f32 Float32Regs:$a, CvtRN_FTZ)>;
+          (CVT_f16_f32 $a, CvtRN_FTZ)>;
 def : Pat<(int_nvvm_f2h_rn f32:$a),
-          (CVT_f16_f32 Float32Regs:$a, CvtRN)>;
+          (CVT_f16_f32 $a, CvtRN)>;
 
 def : Pat<(int_nvvm_ff_to_e4m3x2_rn f32:$a, f32:$b),
-          (CVT_e4m3x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRN)>;
+          (CVT_e4m3x2_f32 $a, $b, CvtRN)>;
 def : Pat<(int_nvvm_ff_to_e4m3x2_rn_relu f32:$a, f32:$b),
-          (CVT_e4m3x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRN_RELU)>;
+          (CVT_e4m3x2_f32 $a, $b, CvtRN_RELU)>;
 def : Pat<(int_nvvm_ff_to_e5m2x2_rn f32:$a, f32:$b),
-          (CVT_e5m2x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRN)>;
+          (CVT_e5m2x2_f32 $a, $b, CvtRN)>;
 def : Pat<(int_nvvm_ff_to_e5m2x2_rn_relu f32:$a, f32:$b),
-          (CVT_e5m2x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRN_RELU)>;
+          (CVT_e5m2x2_f32 $a, $b, CvtRN_RELU)>;
 
 def : Pat<(int_nvvm_f16x2_to_e4m3x2_rn Int32Regs:$a),
-          (CVT_e4m3x2_f16x2 Int32Regs:$a, CvtRN)>;
+          (CVT_e4m3x2_f16x2 $a, CvtRN)>;
 def : Pat<(int_nvvm_f16x2_to_e4m3x2_rn_relu Int32Regs:$a),
-          (CVT_e4m3x2_f16x2 Int32Regs:$a, CvtRN_RELU)>;
+          (CVT_e4m3x2_f16x2 $a, CvtRN_RELU)>;
 def : Pat<(int_nvvm_f16x2_to_e5m2x2_rn Int32Regs:$a),
-          (CVT_e5m2x2_f16x2 Int32Regs:$a, CvtRN)>;
+          (CVT_e5m2x2_f16x2 $a, CvtRN)>;
 def : Pat<(int_nvvm_f16x2_to_e5m2x2_rn_relu Int32Regs:$a),
-          (CVT_e5m2x2_f16x2 Int32Regs:$a, CvtRN_RELU)>;
+          (CVT_e5m2x2_f16x2 $a, CvtRN_RELU)>;
 
 def : Pat<(int_nvvm_e4m3x2_to_f16x2_rn Int16Regs:$a),
-          (CVT_f16x2_e4m3x2 Int16Regs:$a, CvtRN)>;
+          (CVT_f16x2_e4m3x2 $a, CvtRN)>;
 def : Pat<(int_nvvm_e4m3x2_to_f16x2_rn_relu Int16Regs:$a),
-          (CVT_f16x2_e4m3x2 Int16Regs:$a, CvtRN_RELU)>;
+          (CVT_f16x2_e4m3x2 $a, CvtRN_RELU)>;
 def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn Int16Regs:$a),
-          (CVT_f16x2_e5m2x2 Int16Regs:$a, CvtRN)>;
+          (CVT_f16x2_e5m2x2 $a, CvtRN)>;
 def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn_relu Int16Regs:$a),
-          (CVT_f16x2_e5m2x2 Int16Regs:$a, CvtRN_RELU)>;
+          (CVT_f16x2_e5m2x2 $a, CvtRN_RELU)>;
 
 //
 // FNS
@@ -1823,9 +1823,9 @@ def INT_FNS_rii : INT_FNS_MBO<(ins Int32Regs:$mask,    i32imm:$base,    i32imm:$
 def INT_FNS_irr : INT_FNS_MBO<(ins    i32imm:$mask, Int32Regs:$base, Int32Regs:$offset),
                      (int_nvvm_fns       imm:$mask, i32:$base, i32:$offset)>;
 def INT_FNS_iri : INT_FNS_MBO<(ins    i32imm:$mask, Int32Regs:$base,    i32imm:$offset),
-                     (int_nvvm_fns       imm:$mask, Int32Regs:$base,       imm:$offset)>;
+                     (int_nvvm_fns       imm:$mask, i32:$base,       imm:$offset)>;
 def INT_FNS_iir : INT_FNS_MBO<(ins    i32imm:$mask,    i32imm:$base, Int32Regs:$offset),
-                     (int_nvvm_fns       imm:$mask,       imm:$base, Int32Regs:$offset)>;
+                     (int_nvvm_fns       imm:$mask,       imm:$base, i32:$offset)>;
 def INT_FNS_iii : INT_FNS_MBO<(ins    i32imm:$mask,    i32imm:$base,    i32imm:$offset),
                      (int_nvvm_fns       imm:$mask,       imm:$base,       imm:$offset)>;
 
@@ -2796,10 +2796,10 @@ defm cvta_to_const  : G_TO_NG<"const">;
 defm cvta_param : NG_TO_G<"param">;
 
 def : Pat<(int_nvvm_ptr_param_to_gen i32:$src),
-          (cvta_param Int32Regs:$src)>;
+          (cvta_param $src)>;
 
 def : Pat<(int_nvvm_ptr_param_to_gen i64:$src),
-          (cvta_param_64 Int64Regs:$src)>;
+          (cvta_param_64 $src)>;
 
 // nvvm.ptr.gen.to.param
 def : Pat<(int_nvvm_ptr_gen_to_param i32:$src),
@@ -2933,8 +2933,8 @@ def : Pat<(int_nvvm_read_ptx_sreg_envreg31), (MOV_SPECIAL ENVREG31)>;
 
 
 def : Pat<(int_nvvm_swap_lo_hi_b64 i64:$src),
-          (V2I32toI64 (I64toI32H Int64Regs:$src),
-                      (I64toI32L Int64Regs:$src))> ;
+          (V2I32toI64 (I64toI32H $src),
+                      (I64toI32L $src))> ;
 
 //-----------------------------------
 // Texture Intrinsics
@@ -5040,21 +5040,21 @@ def TXQ_NUM_MIPMAP_LEVELS_I
 }
 
 def : Pat<(int_nvvm_txq_channel_order i64:$a),
-          (TXQ_CHANNEL_ORDER_R i64:$a)>;
+          (TXQ_CHANNEL_ORDER_R $a)>;
 def : Pat<(int_nvvm_txq_channel_data_type i64:$a),
-          (TXQ_CHANNEL_DATA_TYPE_R i64:$a)>;
+          (TXQ_CHANNEL_DATA_TYPE_R $a)>;
 def : Pat<(int_nvvm_txq_width i64:$a),
-          (TXQ_WIDTH_R i64:$a)>;
+          (TXQ_WIDTH_R $a)>;
 def : Pat<(int_nvvm_txq_height i64:$a),
-          (TXQ_HEIGHT_R i64:$a)>;
+          (TXQ_HEIGHT_R $a)>;
 def : Pat<(int_nvvm_txq_depth i64:$a),
-          (TXQ_DEPTH_R i64:$a)>;
+          (TXQ_DEPTH_R $a)>;
 def : Pat<(int_nvvm_txq_array_size i64:$a),
-          (TXQ_ARRAY_SIZE_R i64:$a)>;
+          (TXQ_ARRAY_SIZE_R $a)>;
 def : Pat<(int_nvvm_txq_num_samples i64:$a),
-          (TXQ_NUM_SAMPLES_R i64:$a)>;
+          (TXQ_NUM_SAMPLES_R $a)>;
 def : Pat<(int_nvvm_txq_num_mipmap_levels i64:$a),
-          (TXQ_NUM_MIPMAP_LEVELS_R i64:$a)>;
+          (TXQ_NUM_MIPMAP_LEVELS_R $a)>;
 
 
 //-----------------------------------
@@ -5113,17 +5113,17 @@ def SUQ_ARRAY_SIZE_I
 }
 
 def : Pat<(int_nvvm_suq_channel_order i64:$a),
-          (SUQ_CHANNEL_ORDER_R Int64Regs:$a)>;
+          (SUQ_CHANNEL_ORDER_R $a)>;
 def : Pat<(int_nvvm_suq_channel_data_type i64:$a),
-          (SUQ_CHANNEL_DATA_TYPE_R Int64Regs:$a)>;
+          (SUQ_CHANNEL_DATA_TYPE_R $a)>;
 def : Pat<(int_nvvm_suq_width i64:$a),
-          (SUQ_WIDTH_R Int64Regs:$a)>;
+          (SUQ_WIDTH_R $a)>;
 def : Pat<(int_nvvm_suq_height i64:$a),
-          (SUQ_HEIGHT_R Int64Regs:$a)>;
+          (SUQ_HEIGHT_R $a)>;
 def : Pat<(int_nvvm_suq_depth i64:$a),
-          (SUQ_DEPTH_R Int64Regs:$a)>;
+          (SUQ_DEPTH_R $a)>;
 def : Pat<(int_nvvm_suq_array_size i64:$a),
-          (SUQ_ARRAY_SIZE_R Int64Regs:$a)>;
+          (SUQ_ARRAY_SIZE_R $a)>;
 
 
 //===- Handle Query -------------------------------------------------------===//

From af83093933ca73bc82c33130f8bda9f1ae54aae2 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim 
Date: Mon, 23 Dec 2024 17:32:04 +0000
Subject: [PATCH 183/567] [VectorCombine] eraseInstruction - ensure we
 reattempt to fold other users of an erased instruction's operands

As we're reducing the use count of the operands, it's more likely that they will now fold, as they were previously being prevented by an m_OneUse check, or the cost of retaining the extra instruction had been too high.

This is necessary for some upcoming patches, although the only change so far is instruction ordering as it allows some SSE folds of 256/512-bit with 128-bit subvectors to occur earlier in foldShuffleToIdentity as the subvector concats are free.

Pulled out of #120984
---
 .../Transforms/Vectorize/VectorCombine.cpp    | 11 +++-
 .../VectorCombine/X86/concat-boolmasks.ll     | 64 ++++++++++++++-----
 2 files changed, 58 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 2460ccc61d84d..4a0f014be0e75 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -139,10 +139,17 @@ class VectorCombine {
 
   void eraseInstruction(Instruction &I) {
     LLVM_DEBUG(dbgs() << "VC: Erasing: " << I << '\n');
-    for (Value *Op : I.operands())
-      Worklist.pushValue(Op);
+    SmallVector Ops(I.operands());
     Worklist.remove(&I);
     I.eraseFromParent();
+
+    // Push remaining users and then the operand itself - allows further folds
+    // that were hindered by OneUse limits.
+    for (Value *Op : Ops)
+      if (auto *OpI = dyn_cast(Op)) {
+        Worklist.pushUsersToWorkList(*OpI);
+        Worklist.pushValue(OpI);
+      }
   }
 };
 } // namespace
diff --git a/llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll b/llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll
index 057d9af314ba3..c3639baf8b650 100644
--- a/llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll
@@ -80,13 +80,29 @@ define i64 @movmsk_i64_v8i32_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
 }
 
 define i64 @movmsk_i64_v64i8_v16i8(<16 x i8> %v0, <16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) {
-; CHECK-LABEL: @movmsk_i64_v64i8_v16i8(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> 
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[V3:%.*]], <16 x i8> [[V2:%.*]], <32 x i32> 
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP2]], <32 x i8> [[TMP1]], <64 x i32> 
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt <64 x i8> [[TMP3]], zeroinitializer
-; CHECK-NEXT:    [[OR:%.*]] = bitcast <64 x i1> [[TMP4]] to i64
-; CHECK-NEXT:    ret i64 [[OR]]
+; SSE-LABEL: @movmsk_i64_v64i8_v16i8(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[V3:%.*]], <16 x i8> [[V2:%.*]], <32 x i32> 
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> 
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> [[TMP2]], <64 x i32> 
+; SSE-NEXT:    [[TMP4:%.*]] = icmp slt <64 x i8> [[TMP3]], zeroinitializer
+; SSE-NEXT:    [[OR:%.*]] = bitcast <64 x i1> [[TMP4]] to i64
+; SSE-NEXT:    ret i64 [[OR]]
+;
+; AVX2-LABEL: @movmsk_i64_v64i8_v16i8(
+; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> 
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[V3:%.*]], <16 x i8> [[V2:%.*]], <32 x i32> 
+; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP2]], <32 x i8> [[TMP1]], <64 x i32> 
+; AVX2-NEXT:    [[TMP4:%.*]] = icmp slt <64 x i8> [[TMP3]], zeroinitializer
+; AVX2-NEXT:    [[OR:%.*]] = bitcast <64 x i1> [[TMP4]] to i64
+; AVX2-NEXT:    ret i64 [[OR]]
+;
+; AVX512-LABEL: @movmsk_i64_v64i8_v16i8(
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> 
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[V3:%.*]], <16 x i8> [[V2:%.*]], <32 x i32> 
+; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP2]], <32 x i8> [[TMP1]], <64 x i32> 
+; AVX512-NEXT:    [[TMP4:%.*]] = icmp slt <64 x i8> [[TMP3]], zeroinitializer
+; AVX512-NEXT:    [[OR:%.*]] = bitcast <64 x i1> [[TMP4]] to i64
+; AVX512-NEXT:    ret i64 [[OR]]
 ;
   %c0 = icmp slt <16 x i8> %v0, zeroinitializer
   %c1 = icmp slt <16 x i8> %v1, zeroinitializer
@@ -110,14 +126,32 @@ define i64 @movmsk_i64_v64i8_v16i8(<16 x i8> %v0, <16 x i8> %v1, <16 x i8> %v2,
 }
 
 define i64 @movmsk_i64_v32i32_v4i32(<4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
-; CHECK-LABEL: @movmsk_i64_v32i32_v4i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> 
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V3:%.*]], <4 x i32> [[V2:%.*]], <8 x i32> 
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP1]], <16 x i32> 
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt <16 x i32> [[TMP3]], zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
-; CHECK-NEXT:    [[OR:%.*]] = zext i16 [[TMP5]] to i64
-; CHECK-NEXT:    ret i64 [[OR]]
+; SSE-LABEL: @movmsk_i64_v32i32_v4i32(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V3:%.*]], <4 x i32> [[V2:%.*]], <8 x i32> 
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> 
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <16 x i32> 
+; SSE-NEXT:    [[TMP4:%.*]] = icmp slt <16 x i32> [[TMP3]], zeroinitializer
+; SSE-NEXT:    [[TMP5:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
+; SSE-NEXT:    [[OR:%.*]] = zext i16 [[TMP5]] to i64
+; SSE-NEXT:    ret i64 [[OR]]
+;
+; AVX2-LABEL: @movmsk_i64_v32i32_v4i32(
+; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> 
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V3:%.*]], <4 x i32> [[V2:%.*]], <8 x i32> 
+; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP1]], <16 x i32> 
+; AVX2-NEXT:    [[TMP4:%.*]] = icmp slt <16 x i32> [[TMP3]], zeroinitializer
+; AVX2-NEXT:    [[TMP5:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
+; AVX2-NEXT:    [[OR:%.*]] = zext i16 [[TMP5]] to i64
+; AVX2-NEXT:    ret i64 [[OR]]
+;
+; AVX512-LABEL: @movmsk_i64_v32i32_v4i32(
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> 
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V3:%.*]], <4 x i32> [[V2:%.*]], <8 x i32> 
+; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP1]], <16 x i32> 
+; AVX512-NEXT:    [[TMP4:%.*]] = icmp slt <16 x i32> [[TMP3]], zeroinitializer
+; AVX512-NEXT:    [[TMP5:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
+; AVX512-NEXT:    [[OR:%.*]] = zext i16 [[TMP5]] to i64
+; AVX512-NEXT:    ret i64 [[OR]]
 ;
   %c0 = icmp slt <4 x i32> %v0, zeroinitializer
   %c1 = icmp slt <4 x i32> %v1, zeroinitializer

From 65a2eb0b1589590ae78cc1e5f05cd004b3b3bec5 Mon Sep 17 00:00:00 2001
From: Stefan Schulze Frielinghaus 
Date: Mon, 30 Dec 2024 19:24:55 +0100
Subject: [PATCH 184/567] [sanitizer] Fix type in some Min() calls (#119248)

This is a follow-up to 6dec33834d1fd89f16e271dde9607c1de9554144 and
#116957 and #119114.
---
 .../sanitizer_common_interceptors.inc              | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
index 47436a6cd20f0..24a8a2d4dc55b 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
@@ -520,14 +520,14 @@ INTERCEPTOR(int, strncmp, const char *s1, const char *s2, usize size) {
   void *ctx;
   COMMON_INTERCEPTOR_ENTER(ctx, strncmp, s1, s2, size);
   unsigned char c1 = 0, c2 = 0;
-  uptr i;
+  usize i;
   for (i = 0; i < size; i++) {
     c1 = (unsigned char)s1[i];
     c2 = (unsigned char)s2[i];
     if (c1 != c2 || c1 == '\0') break;
   }
-  uptr i1 = i;
-  uptr i2 = i;
+  usize i1 = i;
+  usize i2 = i;
   if (common_flags()->strict_string_checks) {
     for (; i1 < size && s1[i1]; i1++) {}
     for (; i2 < size && s2[i2]; i2++) {}
@@ -583,14 +583,14 @@ INTERCEPTOR(int, strncasecmp, const char *s1, const char *s2, SIZE_T size) {
   void *ctx;
   COMMON_INTERCEPTOR_ENTER(ctx, strncasecmp, s1, s2, size);
   unsigned char c1 = 0, c2 = 0;
-  uptr i;
+  usize i;
   for (i = 0; i < size; i++) {
     c1 = (unsigned char)s1[i];
     c2 = (unsigned char)s2[i];
     if (CharCaseCmp(c1, c2) != 0 || c1 == '\0') break;
   }
-  uptr i1 = i;
-  uptr i2 = i;
+  usize i1 = i;
+  usize i2 = i;
   if (common_flags()->strict_string_checks) {
     for (; i1 < size && s1[i1]; i1++) {}
     for (; i2 < size && s2[i2]; i2++) {}
@@ -851,7 +851,7 @@ int MemcmpInterceptorCommon(void *ctx,
       unsigned char c1 = 0, c2 = 0;
       const unsigned char *s1 = (const unsigned char*)a1;
       const unsigned char *s2 = (const unsigned char*)a2;
-      uptr i;
+      usize i;
       for (i = 0; i < size; i++) {
         c1 = s1[i];
         c2 = s2[i];

From accd4a4ad5ec7a8682dc701fd7072610d40cc436 Mon Sep 17 00:00:00 2001
From: Jacob Lalonde 
Date: Mon, 30 Dec 2024 10:48:16 -0800
Subject: [PATCH 185/567] [LLDB][Minidump] Make workaround for the Dynamic
 loader issue (#120166)

In #119598 my recent TLS feature seems to break crashpad symbols. I have
a few ideas on how this is happening, but for now, as a mitigation, I'm
checking if the Minidump was LLDB-generated, and if so leveraging the
dynamic loader.
---
 .../ObjectFile/Minidump/MinidumpFileBuilder.cpp  | 15 ++++++++++++---
 .../ObjectFile/Minidump/MinidumpFileBuilder.h    |  1 +
 .../Plugins/Process/minidump/MinidumpParser.cpp  |  6 ++++++
 .../Plugins/Process/minidump/MinidumpParser.h    |  1 +
 .../Plugins/Process/minidump/ProcessMinidump.cpp | 16 ++++++++++++++++
 .../Plugins/Process/minidump/ProcessMinidump.h   |  7 +++++--
 .../llvm/BinaryFormat/MinidumpConstants.def      |  4 ++++
 7 files changed, 45 insertions(+), 5 deletions(-)

diff --git a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp
index bcac5edbc1a79..c5013ea5e3be4 100644
--- a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp
+++ b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp
@@ -58,8 +58,8 @@ Status MinidumpFileBuilder::AddHeaderAndCalculateDirectories() {
   // First set the offset on the file, and on the bytes saved
   m_saved_data_size = HEADER_SIZE;
   // We know we will have at least Misc, SystemInfo, Modules, and ThreadList
-  // (corresponding memory list for stacks) And an additional memory list for
-  // non-stacks.
+  // (corresponding memory list for stacks), an additional memory list for
+  // non-stacks, and a stream to mark this minidump was generated by LLDB.
   lldb_private::Target &target = m_process_sp->GetTarget();
   m_expected_directories = 6;
   // Check if OS is linux and reserve directory space for all linux specific
@@ -90,7 +90,10 @@ Status MinidumpFileBuilder::AddHeaderAndCalculateDirectories() {
         "sections. Written / Expected (%" PRIx64 " / %" PRIx64 ")",
         new_offset, m_saved_data_size);
 
-  return error;
+  if (error.Fail())
+    return error;
+
+  return AddLLDBGeneratedStream();
 }
 
 Status MinidumpFileBuilder::AddDirectory(StreamType type,
@@ -126,6 +129,12 @@ Status MinidumpFileBuilder::AddDirectory(StreamType type,
   return error;
 }
 
+Status MinidumpFileBuilder::AddLLDBGeneratedStream() {
+  Status error;
+  StreamType type = StreamType::LLDBGenerated;
+  return AddDirectory(type, 0);
+}
+
 Status MinidumpFileBuilder::AddSystemInfo() {
   Status error;
   const llvm::Triple &target_triple =
diff --git a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h
index 58b284608bd53..48293ee1bf5e5 100644
--- a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h
+++ b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h
@@ -120,6 +120,7 @@ class MinidumpFileBuilder {
   void DeleteFile() noexcept;
 
 private:
+  lldb_private::Status AddLLDBGeneratedStream();
   // Add data to the end of the buffer, if the buffer exceeds the flush level,
   // trigger a flush.
   lldb_private::Status AddData(const void *data, uint64_t size);
diff --git a/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp b/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp
index afc095ddbb2f9..94c0a5f11e435 100644
--- a/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp
+++ b/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp
@@ -49,6 +49,11 @@ llvm::ArrayRef MinidumpParser::GetStream(StreamType stream_type) {
   return m_file->getRawStream(stream_type).value_or(llvm::ArrayRef());
 }
 
+std::optional>
+MinidumpParser::GetRawStream(StreamType stream_type) {
+  return m_file->getRawStream(stream_type);
+}
+
 UUID MinidumpParser::GetModuleUUID(const minidump::Module *module) {
   auto cv_record =
       GetData().slice(module->CvRecord.RVA, module->CvRecord.DataSize);
@@ -651,6 +656,7 @@ MinidumpParser::GetStreamTypeAsString(StreamType stream_type) {
     ENUM_TO_CSTR(FacebookAbortReason);
     ENUM_TO_CSTR(FacebookThreadName);
     ENUM_TO_CSTR(FacebookLogcat);
+    ENUM_TO_CSTR(LLDBGenerated);
   }
   return "unknown stream type";
 }
diff --git a/lldb/source/Plugins/Process/minidump/MinidumpParser.h b/lldb/source/Plugins/Process/minidump/MinidumpParser.h
index f0b6e6027c52f..2c5e6f19ff9a1 100644
--- a/lldb/source/Plugins/Process/minidump/MinidumpParser.h
+++ b/lldb/source/Plugins/Process/minidump/MinidumpParser.h
@@ -59,6 +59,7 @@ class MinidumpParser {
   llvm::ArrayRef GetData();
 
   llvm::ArrayRef GetStream(StreamType stream_type);
+  std::optional> GetRawStream(StreamType stream_type);
 
   UUID GetModuleUUID(const minidump::Module *module);
 
diff --git a/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp b/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp
index 5b0df72130c16..05b3bb9f54f9c 100644
--- a/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp
+++ b/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp
@@ -354,6 +354,22 @@ DataExtractor ProcessMinidump::GetAuxvData() {
                        GetAddressByteSize(), GetAddressByteSize());
 }
 
+bool ProcessMinidump::IsLLDBMinidump() {
+  std::optional> lldb_generated_section =
+      m_minidump_parser->GetRawStream(StreamType::LLDBGenerated);
+  return lldb_generated_section.has_value();
+}
+
+DynamicLoader *ProcessMinidump::GetDynamicLoader() {
+  // This is a workaround for the dynamic loader not playing nice in issue
+  // #119598. The specific reason we use the dynamic loader is to get the TLS
+  // info sections, which we can assume are not being written to the minidump
+  // unless it's an LLDB generate minidump.
+  if (IsLLDBMinidump())
+    return PostMortemProcess::GetDynamicLoader();
+  return nullptr;
+}
+
 void ProcessMinidump::BuildMemoryRegions() {
   if (m_memory_regions)
     return;
diff --git a/lldb/source/Plugins/Process/minidump/ProcessMinidump.h b/lldb/source/Plugins/Process/minidump/ProcessMinidump.h
index 3d235670a33ab..ad8d0ed7a4832 100644
--- a/lldb/source/Plugins/Process/minidump/ProcessMinidump.h
+++ b/lldb/source/Plugins/Process/minidump/ProcessMinidump.h
@@ -53,6 +53,8 @@ class ProcessMinidump : public PostMortemProcess {
 
   Status DoLoadCore() override;
 
+  DynamicLoader *GetDynamicLoader() override;
+
   // Returns AUXV structure found in the core file
   lldb_private::DataExtractor GetAuxvData() override;
 
@@ -74,8 +76,8 @@ class ProcessMinidump : public PostMortemProcess {
 
   ArchSpec GetArchitecture();
 
-  Status GetMemoryRegions(
-      lldb_private::MemoryRegionInfos ®ion_list) override;
+  Status
+  GetMemoryRegions(lldb_private::MemoryRegionInfos ®ion_list) override;
 
   bool GetProcessInfo(ProcessInstanceInfo &info) override;
 
@@ -113,6 +115,7 @@ class ProcessMinidump : public PostMortemProcess {
   std::optional m_memory_regions;
 
   void BuildMemoryRegions();
+  bool IsLLDBMinidump();
 };
 
 } // namespace minidump
diff --git a/llvm/include/llvm/BinaryFormat/MinidumpConstants.def b/llvm/include/llvm/BinaryFormat/MinidumpConstants.def
index 5226da3e84126..722a70ff67a9d 100644
--- a/llvm/include/llvm/BinaryFormat/MinidumpConstants.def
+++ b/llvm/include/llvm/BinaryFormat/MinidumpConstants.def
@@ -85,6 +85,10 @@ HANDLE_MDMP_STREAM_TYPE(0xFACECCCC, FacebookAppStateLog)
 HANDLE_MDMP_STREAM_TYPE(0xFACEDEAD, FacebookAbortReason)
 HANDLE_MDMP_STREAM_TYPE(0xFACEE000, FacebookThreadName)
 
+// LLDB specific stream types
+// Ascii for 'LLDB'
+HANDLE_MDMP_STREAM_TYPE(0x4C4C4442, LLDBGenerated)
+
 HANDLE_MDMP_ARCH(0x0000, X86)       // PROCESSOR_ARCHITECTURE_INTEL
 HANDLE_MDMP_ARCH(0x0001, MIPS)      // PROCESSOR_ARCHITECTURE_MIPS
 HANDLE_MDMP_ARCH(0x0002, Alpha)     // PROCESSOR_ARCHITECTURE_ALPHA

From d9111f19d2ea53d8ce105b3d09425394ccf37969 Mon Sep 17 00:00:00 2001
From: Amir Bishara <139038766+amirBish@users.noreply.github.com>
Date: Mon, 30 Dec 2024 21:18:38 +0200
Subject: [PATCH 186/567] [mlir][bufferization]-Refactor
 findValueInReverseUseDefChain to accept opOperand (#121304)

Edit the `findValueInReverseUseDefChain` method to accept `OpOperand`
instead of the `Value` type, This change will make sure that the
populated `visitedOpOperands` argument is fully accurate and contains
the opOperand we have started the reverse chain from.
---
 .../IR/BufferizableOpInterface.h              |  6 +--
 .../Transforms/OneShotAnalysis.h              |  6 +--
 .../IR/BufferizableOpInterface.cpp            | 27 ++++++-----
 .../Transforms/EmptyTensorElimination.cpp     |  8 ++--
 .../Transforms/OneShotAnalysis.cpp            | 46 +++++++++++--------
 .../Transforms/ConvertToDestinationStyle.cpp  |  2 +-
 .../Transforms/EliminateEmptyTensors.cpp      |  2 +-
 ...ot-bufferize-empty-tensor-elimination.mlir | 11 +++++
 8 files changed, 64 insertions(+), 44 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
index 983f7a29cb220..d1a102e2a6e4e 100644
--- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
+++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
@@ -456,7 +456,7 @@ class AnalysisState {
   /// read by themselves (e.g., ExtractSliceOp).
   bool isValueRead(Value value) const;
 
-  /// Starting from `value`, follow the use-def chain in reverse, always
+  /// Starting from `opOperand`, follow the use-def chain in reverse, always
   /// selecting the aliasing OpOperands. Find and return Values for which
   /// `condition` evaluates to true. OpOperands of such matching Values are not
   /// traversed any further, the visited aliasing opOperands will be preserved
@@ -484,7 +484,7 @@ class AnalysisState {
   /// Additional stopping conditions for the traversal can be specified in
   /// `config`.
   SetVector findValueInReverseUseDefChain(
-      Value value, llvm::function_ref condition,
+      OpOperand *opOperand, llvm::function_ref condition,
       TraversalConfig config = TraversalConfig(),
       llvm::DenseSet *visitedOpOperands = nullptr) const;
 
@@ -520,7 +520,7 @@ class AnalysisState {
   ///
   /// Note: OpResults of unknown ops are handled conservatively and assumed to
   /// be definitions.
-  SetVector findDefinitions(Value value) const;
+  SetVector findDefinitions(OpOperand *opOperand) const;
 
   /// Return `true` if the given OpResult has been decided to bufferize inplace.
   virtual bool isInPlace(OpOperand &opOperand) const;
diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h b/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h
index d50a3042aeeac..bd23a19f74728 100644
--- a/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h
+++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h
@@ -127,9 +127,9 @@ class OneShotAnalysisState : public AnalysisState {
   /// Return true if the buffer of the given tensor value is writable.
   bool isWritable(Value value) const;
 
-  /// Find the definitions of the given tensor value or retrieve them from the
-  /// cache.
-  const SetVector &findDefinitionsCached(Value value);
+  /// Find the definitions of the given operand's value or
+  /// retrieve them from the cache.
+  const SetVector &findDefinitionsCached(OpOperand *opOperand);
 
   /// Reset cached data structures.
   void resetCache() override;
diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
index 349841f06959c..1eb27e44810b0 100644
--- a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
+++ b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
@@ -480,18 +480,21 @@ bool AnalysisState::isValueRead(Value value) const {
   return false;
 }
 
-// Starting from `value`, follow the use-def chain in reverse, always selecting
-// the aliasing OpOperands. Find and return Values for which `condition`
-// evaluates to true. OpOperands of such matching Values are not traversed any
-// further, the visited aliasing opOperands will be preserved through
-// `visitedOpOperands`.
+// Starting from `opOperand`, follow the use-def chain in reverse, always
+// selecting the aliasing OpOperands. Find and return Values for which
+// `condition` evaluates to true. Uses of such matching Values are not
+// traversed any further, the visited aliasing opOperands will be preserved
+// through `visitedOpOperands`.
 llvm::SetVector AnalysisState::findValueInReverseUseDefChain(
-    Value value, llvm::function_ref condition,
+    OpOperand *opOperand, llvm::function_ref condition,
     TraversalConfig config,
     llvm::DenseSet *visitedOpOperands) const {
   llvm::DenseSet visited;
   llvm::SetVector result, workingSet;
-  workingSet.insert(value);
+  workingSet.insert(opOperand->get());
+
+  if (visitedOpOperands)
+    visitedOpOperands->insert(opOperand);
 
   while (!workingSet.empty()) {
     Value value = workingSet.pop_back_val();
@@ -563,12 +566,14 @@ llvm::SetVector AnalysisState::findValueInReverseUseDefChain(
   return result;
 }
 
-// Find the values that define the contents of the given value.
-llvm::SetVector AnalysisState::findDefinitions(Value value) const {
+// Find the values that define the contents of the given operand's value.
+llvm::SetVector
+AnalysisState::findDefinitions(OpOperand *opOperand) const {
   TraversalConfig config;
   config.alwaysIncludeLeaves = false;
   return findValueInReverseUseDefChain(
-      value, [&](Value v) { return this->bufferizesToMemoryWrite(v); }, config);
+      opOperand, [&](Value v) { return this->bufferizesToMemoryWrite(v); },
+      config);
 }
 
 AnalysisState::AnalysisState(const BufferizationOptions &options)
@@ -892,7 +897,7 @@ bool bufferization::detail::defaultResultBufferizesToMemoryWrite(
   config.alwaysIncludeLeaves = false;
   for (AliasingOpOperand alias : opOperands) {
     if (!state
-             .findValueInReverseUseDefChain(alias.opOperand->get(),
+             .findValueInReverseUseDefChain(alias.opOperand,
                                             isMemoryWriteInsideOp, config)
              .empty())
       return true;
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/EmptyTensorElimination.cpp b/mlir/lib/Dialect/Bufferization/Transforms/EmptyTensorElimination.cpp
index 98c3d8d0adc6d..2c4e362101f8f 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/EmptyTensorElimination.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/EmptyTensorElimination.cpp
@@ -143,7 +143,7 @@ LogicalResult mlir::bufferization::eliminateEmptyTensors(
     // %3 = tensor.insert_slice %2 into ...
     config.followSameTypeOrCastsOnly = true;
     SetVector emptyTensors = state.findValueInReverseUseDefChain(
-        source.get(), /*condition=*/
+        &source, /*condition=*/
         [&](Value val) { return val.getDefiningOp(); }, config,
         &visitedOpOperands);
 
@@ -155,10 +155,8 @@ LogicalResult mlir::bufferization::eliminateEmptyTensors(
           visitedOpOperands, [&emptyTensorOp](OpOperand *opOperand) {
             return llvm::count(emptyTensorOp->getUses(), *opOperand);
           });
-      // This could be achieved when a use of `emptyTensorOp` is being
-      // consumed by `SubsetInsertionOpInterface`'s source directly.
-      if (iter == visitedOpOperands.end())
-        continue;
+
+      assert(iter != visitedOpOperands.end() && "could not find use");
       OpOperand *useToBeReplaced = *iter;
       Operation *user = useToBeReplaced->getOwner();
       auto replacement = subsetsExtractionFn(rewriter, op, emptyTensorOp, user);
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp
index d1e6acef324fb..fc1b221b4f036 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp
@@ -196,7 +196,12 @@ void OneShotAnalysisState::gatherUndefinedTensorUses(Operation *op) {
 
       // If there is no preceding definition, the tensor contents are
       // undefined.
-      if (findDefinitionsCached(opResult).empty())
+      if (opResult.getUses().empty())
+        continue;
+      // It does not really matter which use to take to search about
+      // the value's definitions.
+      OpOperand *opOperand = &(*opResult.getUses().begin());
+      if (findDefinitionsCached(opOperand).empty())
         for (OpOperand &use : opResult.getUses())
           undefinedTensorUses.insert(&use);
     }
@@ -464,7 +469,8 @@ static void annotateConflict(OpOperand *uRead, OpOperand *uConflictingWrite,
 /// indexing. I.e., the tensor types do not change along the use-def chain,
 /// apart from static <-> dynamic dim casts.
 static bool hasEquivalentValueInReverseUseDefChain(AnalysisState &state,
-                                                   Value start, Value other) {
+                                                   OpOperand *start,
+                                                   Value other) {
   TraversalConfig config;
   config.followEquivalentOnly = true;
   config.alwaysIncludeLeaves = false;
@@ -475,9 +481,10 @@ static bool hasEquivalentValueInReverseUseDefChain(AnalysisState &state,
               .empty();
 }
 
-/// Return "true" if `value` is originating from a subset that is equivalent to
-/// the subset that `subsetOp` inserts into.
-static bool matchesInsertDestination(const AnalysisState &state, Value value,
+/// Return "true" if the given operand's value is originating from a subset
+/// that is equivalent to the subset that `subsetOp` inserts into.
+static bool matchesInsertDestination(const AnalysisState &state,
+                                     OpOperand *opOperand,
                                      SubsetInsertionOpInterface subsetOp) {
   auto matchingSubset = [&](Value val) {
     if (auto opResult = dyn_cast(val))
@@ -490,7 +497,7 @@ static bool matchesInsertDestination(const AnalysisState &state, Value value,
   // There may be multiple leaves at which the reverse SSA use-def chain lookup
   // terminates. All of them must be equivalent subsets.
   SetVector backwardSlice =
-      state.findValueInReverseUseDefChain(value, matchingSubset);
+      state.findValueInReverseUseDefChain(opOperand, matchingSubset);
   return static_cast(llvm::all_of(backwardSlice, matchingSubset));
 }
 
@@ -516,7 +523,7 @@ static bool areNonConflictingSubsets(OpOperand *uRead,
     //     {inplace= [true] }
 
     if (uRead == &subsetOp.getDestinationOperand() &&
-        matchesInsertDestination(state, uConflictingWrite->get(), subsetOp))
+        matchesInsertDestination(state, uConflictingWrite, subsetOp))
       // Case 1: The main insight is that InsertSliceOp reads only part of
       // the destination tensor. The overwritten area is not read. If
       // uConflictingWrite writes into exactly the memory location that is
@@ -533,7 +540,7 @@ static bool areNonConflictingSubsets(OpOperand *uRead,
 
     if (uRead == &subsetOp.getSourceOperand() &&
         uConflictingWrite == &subsetOp.getDestinationOperand() &&
-        matchesInsertDestination(state, uRead->get(), subsetOp))
+        matchesInsertDestination(state, uRead, subsetOp))
       // Case 2: The read of the source tensor and the write to the dest
       // tensor via an InsertSliceOp is not a conflict if the read is
       // reading exactly that part of an equivalent tensor that the
@@ -567,8 +574,7 @@ static bool areNonConflictingSubsets(OpOperand *uRead,
     if (uConflictingWrite == &subsetOp.getDestinationOperand() &&
         state.areEquivalentBufferizedValues(
             uRead->get(), subsetOp.getSourceOperand().get()) &&
-        matchesInsertDestination(state, subsetOp.getSourceOperand().get(),
-                                 subsetOp))
+        matchesInsertDestination(state, &subsetOp.getSourceOperand(), subsetOp))
       return true;
 
   return false;
@@ -600,9 +606,9 @@ hasReadAfterWriteInterference(const DenseSet &usesRead,
       // even though that op just bufferizes to an allocation but does define
       // the contents of the buffer.
       SetVector definitionsOrLeaves =
-          state.findValueInReverseUseDefChain(
-              uConflictingWrite->get(),
-              [&](Value v) { return state.bufferizesToMemoryWrite(v); });
+          state.findValueInReverseUseDefChain(uConflictingWrite, [&](Value v) {
+            return state.bufferizesToMemoryWrite(v);
+          });
       assert(!definitionsOrLeaves.empty() &&
              "expected at least one definition or leaf");
 
@@ -641,8 +647,7 @@ hasReadAfterWriteInterference(const DenseSet &usesRead,
     // In the above example, if uRead is the OpOperand of reading_op, the
     // definition is %0. Note that operations that create an alias but do not
     // bufferize to a memory write (such as ExtractSliceOp) are skipped.
-    const SetVector &definitions =
-        state.findDefinitionsCached(uRead->get());
+    const SetVector &definitions = state.findDefinitionsCached(uRead);
     if (definitions.empty()) {
       // Fast path: No conflict if there are no definitions.
       LLVM_DEBUG(llvm::dbgs()
@@ -714,9 +719,9 @@ hasReadAfterWriteInterference(const DenseSet &usesRead,
             if (bufferizableOp.bufferizesToElementwiseAccess(
                     state, {uRead, uConflictingWrite})) {
               if (hasEquivalentValueInReverseUseDefChain(
-                      state, uRead->get(), uConflictingWrite->get()) ||
+                      state, uRead, uConflictingWrite->get()) ||
                   hasEquivalentValueInReverseUseDefChain(
-                      state, uConflictingWrite->get(), uRead->get())) {
+                      state, uConflictingWrite, uRead->get())) {
                 LLVM_DEBUG(
                     llvm::dbgs()
                     << "  no conflict: op bufferizes to element-wise access\n");
@@ -965,11 +970,12 @@ wouldCreateWriteToNonWritableBuffer(OpOperand &operand,
 // Bufferization analyses.
 //===----------------------------------------------------------------------===//
 
-// Find the values that define the contents of the given value.
+// Find the values that define the contents of the given operand's value.
 const llvm::SetVector &
-OneShotAnalysisState::findDefinitionsCached(Value value) {
+OneShotAnalysisState::findDefinitionsCached(OpOperand *opOperand) {
+  Value value = opOperand->get();
   if (!cachedDefinitions.count(value))
-    cachedDefinitions[value] = findDefinitions(value);
+    cachedDefinitions[value] = findDefinitions(opOperand);
   return cachedDefinitions[value];
 }
 
diff --git a/mlir/lib/Dialect/Linalg/Transforms/ConvertToDestinationStyle.cpp b/mlir/lib/Dialect/Linalg/Transforms/ConvertToDestinationStyle.cpp
index 6801b68a85381..6c1087730ebba 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/ConvertToDestinationStyle.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/ConvertToDestinationStyle.cpp
@@ -553,7 +553,7 @@ Value linalg::bufferizeToAllocation(
     Value alloc = createAllocationForTensor(
         rewriter, op->getLoc(), operand->get(), options, memorySpace);
     allocs.push_back(alloc);
-    if (!state.findDefinitions(operand->get()).empty()) {
+    if (!state.findDefinitions(operand).empty()) {
       // Initialize buffer with a copy of the operand data. Not needed if the
       // tensor is uninitialized.
       createMemcpy(rewriter, op->getLoc(), operand->get(), alloc, options);
diff --git a/mlir/lib/Dialect/Linalg/Transforms/EliminateEmptyTensors.cpp b/mlir/lib/Dialect/Linalg/Transforms/EliminateEmptyTensors.cpp
index 4776883ed95c5..b710bde87f9f3 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/EliminateEmptyTensors.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/EliminateEmptyTensors.cpp
@@ -59,7 +59,7 @@ LogicalResult linalg::linalgOpAnchoredEmptyTensorEliminationStep(
       config.followEquivalentOnly = true;
       config.alwaysIncludeLeaves = false;
       SetVector emptyTensors = state.findValueInReverseUseDefChain(
-          in->get(), /*condition=*/
+          in, /*condition=*/
           [&](Value val) {
             return val.getDefiningOp() &&
                    val.getType() == in->get().getType();
diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir
index 26434774730e1..820fb3dfa5e5e 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir
@@ -465,3 +465,14 @@ func.func @mutli_use_of_the_same_tensor_empty_creates_non_existent_read(%arg1: t
       : tensor<5x6x64xf32> into tensor<5x6x128xf32>
   return %inserted_slice_1, %res_2 : tensor<5x6x128xf32>, tensor<5x6x64xf32>
 }
+
+// -----
+
+// CHECK-LABEL:   func.func @direct_use_of_tensor_empty
+func.func @direct_use_of_tensor_empty(%arg0: tensor<5x6x128xf32>) -> tensor<5x6x128xf32> {
+  // CHECK-NOT: memref.alloc
+  %empty_1 = tensor.empty() : tensor<5x6x64xf32>
+  %inserted_slice_1 = tensor.insert_slice %empty_1 into %arg0[0, 0, 0][5, 6, 64][1, 1, 1]
+      : tensor<5x6x64xf32> into tensor<5x6x128xf32>
+  return %inserted_slice_1 : tensor<5x6x128xf32>
+}

From fb365ac86c8032e65586b0eb055a7f1646304f89 Mon Sep 17 00:00:00 2001
From: Maksim Levental 
Date: Mon, 30 Dec 2024 11:51:11 -0800
Subject: [PATCH 187/567] [mlir][linalg] DCE unimplemented extra decl (#121272)

---
 mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
index 37eec6e07963b..fff4048ee125e 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
@@ -472,9 +472,6 @@ def TransposeOp : LinalgStructuredBase_Op<"transpose", [
       getRegionBuilder() {
       return regionBuilder;
     }
-
-    static void createRegion(::mlir::OpBuilder &opBuilder,
-                             ::mlir::OperationState & odsState);
   }];
 
   let hasFolder = 1;

From 9a88edeb49b3edefbb9933fa5b71d7d421fc99bd Mon Sep 17 00:00:00 2001
From: Petr Hosek 
Date: Mon, 30 Dec 2024 11:51:44 -0800
Subject: [PATCH 188/567] [TYSan][CMake] CMake build fixes (#121224)

TYSan CMake build follows patterns used by other sanitizers, but there's
also a number of issues, like referring to undefined variables, which
breaks the build in some cases (such as cross-compiling). This change
addresses the issues.
---
 compiler-rt/lib/tysan/CMakeLists.txt | 29 +++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/compiler-rt/lib/tysan/CMakeLists.txt b/compiler-rt/lib/tysan/CMakeLists.txt
index 859b67928f004..7d13ae3963919 100644
--- a/compiler-rt/lib/tysan/CMakeLists.txt
+++ b/compiler-rt/lib/tysan/CMakeLists.txt
@@ -3,11 +3,25 @@ include_directories(..)
 # Runtime library sources and build flags.
 set(TYSAN_SOURCES
   tysan.cpp
-  tysan_interceptors.cpp)
+  tysan_interceptors.cpp
+  )
+
+SET(TYSAN_HEADERS
+  tysan.h
+  tysan_flags.inc
+  tysan_platform.h
+  )
+
 set(TYSAN_COMMON_CFLAGS ${SANITIZER_COMMON_CFLAGS})
 append_rtti_flag(OFF TYSAN_COMMON_CFLAGS)
 # Prevent clang from generating libc calls.
 append_list_if(COMPILER_RT_HAS_FFREESTANDING_FLAG -ffreestanding TYSAN_COMMON_CFLAGS)
+set(TYSAN_DYNAMIC_CFLAGS ${TYSAN_COMMON_CFLAGS})
+
+set(TYSAN_COMMON_DEFINITIONS "")
+set(TYSAN_DYNAMIC_DEFINITIONS ${TYSAN_COMMON_DEFINITIONS} TYSAN_DYNAMIC=1)
+
+# Compile TYSan sources into an object library.
 
 add_compiler_rt_object_libraries(RTTysan_dynamic
   OS ${SANITIZER_COMMON_SUPPORTED_OS}
@@ -47,17 +61,18 @@ if(APPLE)
     DEFS ${TYSAN_COMMON_DEFINITIONS}
     PARENT_TARGET tysan)
 else()
+  set(TYSAN_CFLAGS ${TYSAN_COMMON_CFLAGS})
+  append_list_if(COMPILER_RT_HAS_FPIE_FLAG -fPIE TYSAN_CFLAGS)
+
   foreach(arch ${TYSAN_SUPPORTED_ARCH})
-    set(TYSAN_CFLAGS ${TYSAN_COMMON_CFLAGS})
-    append_list_if(COMPILER_RT_HAS_FPIE_FLAG -fPIE TYSAN_CFLAGS)
     add_compiler_rt_runtime(clang_rt.tysan
       STATIC
       ARCHS ${arch}
       SOURCES ${TYSAN_SOURCES}
-              $
-              $
-              $
-              $
+      OBJECT_LIBS RTInterception
+                  RTSanitizerCommon
+                  RTSanitizerCommonLibc
+                  RTSanitizerCommonSymbolizer
       CFLAGS ${TYSAN_CFLAGS}
       PARENT_TARGET tysan)
   endforeach()

From 5373ed0e79c3eaf8a11a0fd747aa80556f203211 Mon Sep 17 00:00:00 2001
From: Muhammad Omair Javaid 
Date: Tue, 31 Dec 2024 00:43:16 +0500
Subject: [PATCH 189/567] [fuzzer][test] Disable noasan-strcmp test for AArch64

This patch disables the `noasan-strcmp.test` for AArch64 Linux as it
consistently fails on the buildbot machine while passing on other
AArch64 Linux systems.

We have seen similar issues with noasan-strncmp.test in the past, which had
random failures on certain machines/environments.

Following buildbot is failing in both check stage1 and stage2:
https://lab.llvm.org/buildbot/#/builders/121/builds/711
---
 compiler-rt/test/fuzzer/noasan-strcmp.test | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiler-rt/test/fuzzer/noasan-strcmp.test b/compiler-rt/test/fuzzer/noasan-strcmp.test
index 0d82d6b2846f8..f73af35f5d898 100644
--- a/compiler-rt/test/fuzzer/noasan-strcmp.test
+++ b/compiler-rt/test/fuzzer/noasan-strcmp.test
@@ -1,4 +1,4 @@
-UNSUPPORTED: darwin, target={{.*(freebsd|windows).*}}
+UNSUPPORTED: darwin, target={{.*(freebsd|windows).*}}, target=aarch64{{.*}}
 
 RUN: %cpp_compiler -fno-sanitize=address %S/StrcmpTest.cpp -o %t-NoAsanStrcmpTest
 RUN: not %run %t-NoAsanStrcmpTest -seed=1 -runs=2000000   2>&1 | FileCheck %s

From b20b6e9ea9e9e1f66feb2663b999f66a8af3a5dc Mon Sep 17 00:00:00 2001
From: Florian Hahn 
Date: Mon, 30 Dec 2024 19:57:26 +0000
Subject: [PATCH 190/567] [LV] Check IR generated for both interleaving and
 vectorizing in test.

Currently the tests would in some cases only check the vectorized
IR, but not the interleaved IR, if they are different.
---
 .../LoopVectorize/iv_outside_user.ll          | 203 +++++++++++++++++-
 1 file changed, 202 insertions(+), 1 deletion(-)

diff --git a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll
index fee10cf013bac..e9f67036faf2b 100644
--- a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --replace-value-regex "!llvm.loop ![0-9]+" --version 5
 ; RUN: opt -S -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=2 < %s | FileCheck --check-prefixes=CHECK,VEC %s
-; RUN: opt -S -passes=loop-vectorize -force-vector-interleave=2 -force-vector-width=1 < %s | FileCheck --check-prefixes=CHECK %s
+; RUN: opt -S -passes=loop-vectorize -force-vector-interleave=2 -force-vector-width=1 < %s | FileCheck --check-prefixes=CHECK,INTERLEAVE %s
 
 define i32 @postinc(i32 %k)  {
 ; CHECK-LABEL: define i32 @postinc(
@@ -430,6 +430,39 @@ define i64 @iv_scalar_steps_and_outside_users(ptr %ptr) {
 ; VEC-NEXT:    [[IV_LCSSA:%.*]] = phi i64 [ [[IV]], %[[LOOP]] ], [ 1001, %[[MIDDLE_BLOCK]] ]
 ; VEC-NEXT:    ret i64 [[IV_LCSSA]]
 ;
+; INTERLEAVE-LABEL: define i64 @iv_scalar_steps_and_outside_users(
+; INTERLEAVE-SAME: ptr [[PTR:%.*]]) {
+; INTERLEAVE-NEXT:  [[ENTRY:.*]]:
+; INTERLEAVE-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; INTERLEAVE:       [[VECTOR_PH]]:
+; INTERLEAVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; INTERLEAVE:       [[VECTOR_BODY]]:
+; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; INTERLEAVE-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; INTERLEAVE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[TMP0]]
+; INTERLEAVE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[TMP1]]
+; INTERLEAVE-NEXT:    store i64 [[TMP0]], ptr [[TMP2]], align 4
+; INTERLEAVE-NEXT:    store i64 [[TMP1]], ptr [[TMP3]], align 4
+; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; INTERLEAVE-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1002
+; INTERLEAVE-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[MIDDLE_BLOCK]]:
+; INTERLEAVE-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; INTERLEAVE:       [[SCALAR_PH]]:
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1002, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT:    br label %[[LOOP:.*]]
+; INTERLEAVE:       [[LOOP]]:
+; INTERLEAVE-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; INTERLEAVE-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], 1
+; INTERLEAVE-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[IV]]
+; INTERLEAVE-NEXT:    store i64 [[IV]], ptr [[GEP_PTR]], align 4
+; INTERLEAVE-NEXT:    [[EXITCOND:%.*]] = icmp ugt i64 [[IV]], 1000
+; INTERLEAVE-NEXT:    br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[EXIT]]:
+; INTERLEAVE-NEXT:    [[IV_LCSSA:%.*]] = phi i64 [ [[IV]], %[[LOOP]] ], [ 1001, %[[MIDDLE_BLOCK]] ]
+; INTERLEAVE-NEXT:    ret i64 [[IV_LCSSA]]
+;
 entry:
   br label %loop
 
@@ -485,6 +518,42 @@ define i32 @iv_2_dead_in_loop_only_used_outside(ptr %ptr) {
 ; VEC-NEXT:    [[IV_2_LCSSA:%.*]] = phi i32 [ [[IV_2]], %[[LOOP]] ], [ 2002, %[[MIDDLE_BLOCK]] ]
 ; VEC-NEXT:    ret i32 [[IV_2_LCSSA]]
 ;
+; INTERLEAVE-LABEL: define i32 @iv_2_dead_in_loop_only_used_outside(
+; INTERLEAVE-SAME: ptr [[PTR:%.*]]) {
+; INTERLEAVE-NEXT:  [[ENTRY:.*]]:
+; INTERLEAVE-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; INTERLEAVE:       [[VECTOR_PH]]:
+; INTERLEAVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; INTERLEAVE:       [[VECTOR_BODY]]:
+; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; INTERLEAVE-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; INTERLEAVE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[TMP0]]
+; INTERLEAVE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[TMP1]]
+; INTERLEAVE-NEXT:    store i64 [[TMP0]], ptr [[TMP2]], align 4
+; INTERLEAVE-NEXT:    store i64 [[TMP1]], ptr [[TMP3]], align 4
+; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; INTERLEAVE-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1002
+; INTERLEAVE-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[MIDDLE_BLOCK]]:
+; INTERLEAVE-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; INTERLEAVE:       [[SCALAR_PH]]:
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1002, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ 2004, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT:    br label %[[LOOP:.*]]
+; INTERLEAVE:       [[LOOP]]:
+; INTERLEAVE-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; INTERLEAVE-NEXT:    [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ]
+; INTERLEAVE-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], 1
+; INTERLEAVE-NEXT:    [[IV_2_NEXT]] = add nuw i32 [[IV_2]], 2
+; INTERLEAVE-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[IV]]
+; INTERLEAVE-NEXT:    store i64 [[IV]], ptr [[GEP_PTR]], align 4
+; INTERLEAVE-NEXT:    [[EXITCOND:%.*]] = icmp ugt i64 [[IV]], 1000
+; INTERLEAVE-NEXT:    br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[EXIT]]:
+; INTERLEAVE-NEXT:    [[IV_2_LCSSA:%.*]] = phi i32 [ [[IV_2]], %[[LOOP]] ], [ 2002, %[[MIDDLE_BLOCK]] ]
+; INTERLEAVE-NEXT:    ret i32 [[IV_2_LCSSA]]
+;
 entry:
   br label %loop
 
@@ -625,6 +694,38 @@ define i32 @postinc_not_iv_backedge_value(i32 %k)  {
 ; VEC-NEXT:    [[INC_2_LCSSA:%.*]] = phi i32 [ [[INC_2]], %[[FOR_BODY]] ], [ [[TMP2]], %[[MIDDLE_BLOCK]] ]
 ; VEC-NEXT:    ret i32 [[INC_2_LCSSA]]
 ;
+; INTERLEAVE-LABEL: define i32 @postinc_not_iv_backedge_value(
+; INTERLEAVE-SAME: i32 [[K:%.*]]) {
+; INTERLEAVE-NEXT:  [[ENTRY:.*]]:
+; INTERLEAVE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[K]], 2
+; INTERLEAVE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; INTERLEAVE:       [[VECTOR_PH]]:
+; INTERLEAVE-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[K]], 2
+; INTERLEAVE-NEXT:    [[N_VEC:%.*]] = sub i32 [[K]], [[N_MOD_VF]]
+; INTERLEAVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; INTERLEAVE:       [[VECTOR_BODY]]:
+; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 1
+; INTERLEAVE-NEXT:    [[TMP1:%.*]] = add i32 [[TMP0]], 2
+; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; INTERLEAVE-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; INTERLEAVE-NEXT:    br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[MIDDLE_BLOCK]]:
+; INTERLEAVE-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[K]], [[N_VEC]]
+; INTERLEAVE-NEXT:    br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; INTERLEAVE:       [[SCALAR_PH]]:
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT:    br label %[[FOR_BODY:.*]]
+; INTERLEAVE:       [[FOR_BODY]]:
+; INTERLEAVE-NEXT:    [[INC_PHI:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[INC]] = add nsw i32 [[INC_PHI]], 1
+; INTERLEAVE-NEXT:    [[INC_2:%.*]] = add i32 [[INC_PHI]], 2
+; INTERLEAVE-NEXT:    [[CMP:%.*]] = icmp eq i32 [[INC]], [[K]]
+; INTERLEAVE-NEXT:    br i1 [[CMP]], label %[[FOR_END]], label %[[FOR_BODY]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[FOR_END]]:
+; INTERLEAVE-NEXT:    [[INC_2_LCSSA:%.*]] = phi i32 [ [[INC_2]], %[[FOR_BODY]] ], [ [[TMP1]], %[[MIDDLE_BLOCK]] ]
+; INTERLEAVE-NEXT:    ret i32 [[INC_2_LCSSA]]
+;
 entry:
   br label %for.body
 
@@ -692,6 +793,56 @@ define float @fp_postinc_use_fadd(float %init, ptr noalias nocapture %A, i64 %N,
 ; VEC-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[LOOP]] ], [ [[TMP1]], %[[MIDDLE_BLOCK]] ]
 ; VEC-NEXT:    ret float [[ADD_LCSSA]]
 ;
+; INTERLEAVE-LABEL: define float @fp_postinc_use_fadd(
+; INTERLEAVE-SAME: float [[INIT:%.*]], ptr noalias nocapture [[A:%.*]], i64 [[N:%.*]], float [[FPINC:%.*]]) {
+; INTERLEAVE-NEXT:  [[ENTRY:.*]]:
+; INTERLEAVE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; INTERLEAVE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; INTERLEAVE:       [[VECTOR_PH]]:
+; INTERLEAVE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; INTERLEAVE-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; INTERLEAVE-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
+; INTERLEAVE-NEXT:    [[TMP0:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]]
+; INTERLEAVE-NEXT:    [[TMP1:%.*]] = fadd fast float [[INIT]], [[TMP0]]
+; INTERLEAVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; INTERLEAVE:       [[VECTOR_BODY]]:
+; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; INTERLEAVE-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 1
+; INTERLEAVE-NEXT:    [[DOTCAST1:%.*]] = sitofp i64 [[INDEX]] to float
+; INTERLEAVE-NEXT:    [[TMP4:%.*]] = fmul fast float [[FPINC]], [[DOTCAST1]]
+; INTERLEAVE-NEXT:    [[OFFSET_IDX:%.*]] = fadd fast float [[INIT]], [[TMP4]]
+; INTERLEAVE-NEXT:    [[TMP5:%.*]] = fmul fast float 0.000000e+00, [[FPINC]]
+; INTERLEAVE-NEXT:    [[TMP6:%.*]] = fadd fast float [[OFFSET_IDX]], [[TMP5]]
+; INTERLEAVE-NEXT:    [[TMP7:%.*]] = fmul fast float 1.000000e+00, [[FPINC]]
+; INTERLEAVE-NEXT:    [[TMP8:%.*]] = fadd fast float [[OFFSET_IDX]], [[TMP7]]
+; INTERLEAVE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP2]]
+; INTERLEAVE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP3]]
+; INTERLEAVE-NEXT:    store float [[TMP6]], ptr [[TMP9]], align 4
+; INTERLEAVE-NEXT:    store float [[TMP8]], ptr [[TMP10]], align 4
+; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; INTERLEAVE-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; INTERLEAVE-NEXT:    br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[MIDDLE_BLOCK]]:
+; INTERLEAVE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; INTERLEAVE-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; INTERLEAVE:       [[SCALAR_PH]]:
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi float [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[INIT]], %[[ENTRY]] ]
+; INTERLEAVE-NEXT:    br label %[[LOOP:.*]]
+; INTERLEAVE:       [[LOOP]]:
+; INTERLEAVE-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; INTERLEAVE-NEXT:    [[FP_IV:%.*]] = phi float [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ]
+; INTERLEAVE-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; INTERLEAVE-NEXT:    store float [[FP_IV]], ptr [[GEP_A]], align 4
+; INTERLEAVE-NEXT:    [[ADD]] = fadd fast float [[FP_IV]], [[FPINC]]
+; INTERLEAVE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; INTERLEAVE-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; INTERLEAVE-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[EXIT]]:
+; INTERLEAVE-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[LOOP]] ], [ [[TMP1]], %[[MIDDLE_BLOCK]] ]
+; INTERLEAVE-NEXT:    ret float [[ADD_LCSSA]]
+;
 entry:
   br label %loop
 
@@ -762,6 +913,56 @@ define float @fp_postinc_use_fsub(float %init, ptr noalias nocapture %A, i64 %N,
 ; VEC-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[LOOP]] ], [ [[TMP1]], %[[MIDDLE_BLOCK]] ]
 ; VEC-NEXT:    ret float [[ADD_LCSSA]]
 ;
+; INTERLEAVE-LABEL: define float @fp_postinc_use_fsub(
+; INTERLEAVE-SAME: float [[INIT:%.*]], ptr noalias nocapture [[A:%.*]], i64 [[N:%.*]], float [[FPINC:%.*]]) {
+; INTERLEAVE-NEXT:  [[ENTRY:.*]]:
+; INTERLEAVE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; INTERLEAVE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; INTERLEAVE:       [[VECTOR_PH]]:
+; INTERLEAVE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; INTERLEAVE-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; INTERLEAVE-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
+; INTERLEAVE-NEXT:    [[TMP0:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]]
+; INTERLEAVE-NEXT:    [[TMP1:%.*]] = fsub fast float [[INIT]], [[TMP0]]
+; INTERLEAVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; INTERLEAVE:       [[VECTOR_BODY]]:
+; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; INTERLEAVE-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 1
+; INTERLEAVE-NEXT:    [[DOTCAST1:%.*]] = sitofp i64 [[INDEX]] to float
+; INTERLEAVE-NEXT:    [[TMP4:%.*]] = fmul fast float [[FPINC]], [[DOTCAST1]]
+; INTERLEAVE-NEXT:    [[OFFSET_IDX:%.*]] = fsub fast float [[INIT]], [[TMP4]]
+; INTERLEAVE-NEXT:    [[TMP5:%.*]] = fmul fast float 0.000000e+00, [[FPINC]]
+; INTERLEAVE-NEXT:    [[TMP6:%.*]] = fsub fast float [[OFFSET_IDX]], [[TMP5]]
+; INTERLEAVE-NEXT:    [[TMP7:%.*]] = fmul fast float 1.000000e+00, [[FPINC]]
+; INTERLEAVE-NEXT:    [[TMP8:%.*]] = fsub fast float [[OFFSET_IDX]], [[TMP7]]
+; INTERLEAVE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP2]]
+; INTERLEAVE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP3]]
+; INTERLEAVE-NEXT:    store float [[TMP6]], ptr [[TMP9]], align 4
+; INTERLEAVE-NEXT:    store float [[TMP8]], ptr [[TMP10]], align 4
+; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; INTERLEAVE-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; INTERLEAVE-NEXT:    br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[MIDDLE_BLOCK]]:
+; INTERLEAVE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; INTERLEAVE-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; INTERLEAVE:       [[SCALAR_PH]]:
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi float [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[INIT]], %[[ENTRY]] ]
+; INTERLEAVE-NEXT:    br label %[[LOOP:.*]]
+; INTERLEAVE:       [[LOOP]]:
+; INTERLEAVE-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; INTERLEAVE-NEXT:    [[FP_IV:%.*]] = phi float [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ]
+; INTERLEAVE-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; INTERLEAVE-NEXT:    store float [[FP_IV]], ptr [[GEP_A]], align 4
+; INTERLEAVE-NEXT:    [[ADD]] = fsub fast float [[FP_IV]], [[FPINC]]
+; INTERLEAVE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; INTERLEAVE-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; INTERLEAVE-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[EXIT]]:
+; INTERLEAVE-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[LOOP]] ], [ [[TMP1]], %[[MIDDLE_BLOCK]] ]
+; INTERLEAVE-NEXT:    ret float [[ADD_LCSSA]]
+;
 entry:
   br label %loop
 

From 08f77241c0d90737f1818b948978876a0822be32 Mon Sep 17 00:00:00 2001
From: Congcong Cai 
Date: Tue, 31 Dec 2024 05:11:13 +0800
Subject: [PATCH 191/567] [clang-tidy][doc] mention smart ptr in
 bugprone-unhandled-self-assignment.WarnOnlyIfThisHasSuspiciousField option
 (#121316)

---
 .../checks/bugprone/unhandled-self-assignment.rst           | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unhandled-self-assignment.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unhandled-self-assignment.rst
index dee139861c8cf..d3cdd5a12fdca 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unhandled-self-assignment.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unhandled-self-assignment.rst
@@ -120,5 +120,7 @@ temporary object into ``this`` (needs a move assignment operator):
 
 .. option:: WarnOnlyIfThisHasSuspiciousField
 
-  When `true`, the check will warn only if the container class of the copy assignment operator
-  has any suspicious fields (pointer or C array). This option is set to `true` by default.
+  When `true`, the check will warn only if the container class of the copy
+  assignment operator has any suspicious fields (pointer, C array and C++ smart
+  pointer).
+  This option is set to `true` by default.

From d5a96eb125eaf661f7f9aad4cd184973fe750528 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim 
Date: Mon, 30 Dec 2024 21:20:56 +0000
Subject: [PATCH 192/567] Revert af83093933ca73bc82c33130f8bda9f1ae54aae2
 "[VectorCombine] eraseInstruction - ensure we reattempt to fold other users
 of an erased instruction's operands"

Reports of hung builds, but I don't have time to investigate at the moment.
---
 .../Transforms/Vectorize/VectorCombine.cpp    | 11 +---
 .../VectorCombine/X86/concat-boolmasks.ll     | 64 +++++--------------
 2 files changed, 17 insertions(+), 58 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 4a0f014be0e75..2460ccc61d84d 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -139,17 +139,10 @@ class VectorCombine {
 
   void eraseInstruction(Instruction &I) {
     LLVM_DEBUG(dbgs() << "VC: Erasing: " << I << '\n');
-    SmallVector Ops(I.operands());
+    for (Value *Op : I.operands())
+      Worklist.pushValue(Op);
     Worklist.remove(&I);
     I.eraseFromParent();
-
-    // Push remaining users and then the operand itself - allows further folds
-    // that were hindered by OneUse limits.
-    for (Value *Op : Ops)
-      if (auto *OpI = dyn_cast(Op)) {
-        Worklist.pushUsersToWorkList(*OpI);
-        Worklist.pushValue(OpI);
-      }
   }
 };
 } // namespace
diff --git a/llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll b/llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll
index c3639baf8b650..057d9af314ba3 100644
--- a/llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll
@@ -80,29 +80,13 @@ define i64 @movmsk_i64_v8i32_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
 }
 
 define i64 @movmsk_i64_v64i8_v16i8(<16 x i8> %v0, <16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) {
-; SSE-LABEL: @movmsk_i64_v64i8_v16i8(
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[V3:%.*]], <16 x i8> [[V2:%.*]], <32 x i32> 
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> 
-; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> [[TMP2]], <64 x i32> 
-; SSE-NEXT:    [[TMP4:%.*]] = icmp slt <64 x i8> [[TMP3]], zeroinitializer
-; SSE-NEXT:    [[OR:%.*]] = bitcast <64 x i1> [[TMP4]] to i64
-; SSE-NEXT:    ret i64 [[OR]]
-;
-; AVX2-LABEL: @movmsk_i64_v64i8_v16i8(
-; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> 
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[V3:%.*]], <16 x i8> [[V2:%.*]], <32 x i32> 
-; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP2]], <32 x i8> [[TMP1]], <64 x i32> 
-; AVX2-NEXT:    [[TMP4:%.*]] = icmp slt <64 x i8> [[TMP3]], zeroinitializer
-; AVX2-NEXT:    [[OR:%.*]] = bitcast <64 x i1> [[TMP4]] to i64
-; AVX2-NEXT:    ret i64 [[OR]]
-;
-; AVX512-LABEL: @movmsk_i64_v64i8_v16i8(
-; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> 
-; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[V3:%.*]], <16 x i8> [[V2:%.*]], <32 x i32> 
-; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP2]], <32 x i8> [[TMP1]], <64 x i32> 
-; AVX512-NEXT:    [[TMP4:%.*]] = icmp slt <64 x i8> [[TMP3]], zeroinitializer
-; AVX512-NEXT:    [[OR:%.*]] = bitcast <64 x i1> [[TMP4]] to i64
-; AVX512-NEXT:    ret i64 [[OR]]
+; CHECK-LABEL: @movmsk_i64_v64i8_v16i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> 
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[V3:%.*]], <16 x i8> [[V2:%.*]], <32 x i32> 
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP2]], <32 x i8> [[TMP1]], <64 x i32> 
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt <64 x i8> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[OR:%.*]] = bitcast <64 x i1> [[TMP4]] to i64
+; CHECK-NEXT:    ret i64 [[OR]]
 ;
   %c0 = icmp slt <16 x i8> %v0, zeroinitializer
   %c1 = icmp slt <16 x i8> %v1, zeroinitializer
@@ -126,32 +110,14 @@ define i64 @movmsk_i64_v64i8_v16i8(<16 x i8> %v0, <16 x i8> %v1, <16 x i8> %v2,
 }
 
 define i64 @movmsk_i64_v32i32_v4i32(<4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
-; SSE-LABEL: @movmsk_i64_v32i32_v4i32(
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V3:%.*]], <4 x i32> [[V2:%.*]], <8 x i32> 
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> 
-; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <16 x i32> 
-; SSE-NEXT:    [[TMP4:%.*]] = icmp slt <16 x i32> [[TMP3]], zeroinitializer
-; SSE-NEXT:    [[TMP5:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
-; SSE-NEXT:    [[OR:%.*]] = zext i16 [[TMP5]] to i64
-; SSE-NEXT:    ret i64 [[OR]]
-;
-; AVX2-LABEL: @movmsk_i64_v32i32_v4i32(
-; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> 
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V3:%.*]], <4 x i32> [[V2:%.*]], <8 x i32> 
-; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP1]], <16 x i32> 
-; AVX2-NEXT:    [[TMP4:%.*]] = icmp slt <16 x i32> [[TMP3]], zeroinitializer
-; AVX2-NEXT:    [[TMP5:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
-; AVX2-NEXT:    [[OR:%.*]] = zext i16 [[TMP5]] to i64
-; AVX2-NEXT:    ret i64 [[OR]]
-;
-; AVX512-LABEL: @movmsk_i64_v32i32_v4i32(
-; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> 
-; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V3:%.*]], <4 x i32> [[V2:%.*]], <8 x i32> 
-; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP1]], <16 x i32> 
-; AVX512-NEXT:    [[TMP4:%.*]] = icmp slt <16 x i32> [[TMP3]], zeroinitializer
-; AVX512-NEXT:    [[TMP5:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
-; AVX512-NEXT:    [[OR:%.*]] = zext i16 [[TMP5]] to i64
-; AVX512-NEXT:    ret i64 [[OR]]
+; CHECK-LABEL: @movmsk_i64_v32i32_v4i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> 
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V3:%.*]], <4 x i32> [[V2:%.*]], <8 x i32> 
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP1]], <16 x i32> 
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt <16 x i32> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
+; CHECK-NEXT:    [[OR:%.*]] = zext i16 [[TMP5]] to i64
+; CHECK-NEXT:    ret i64 [[OR]]
 ;
   %c0 = icmp slt <4 x i32> %v0, zeroinitializer
   %c1 = icmp slt <4 x i32> %v1, zeroinitializer

From 332d2647ff128af166f5b0f235c723888c3cd793 Mon Sep 17 00:00:00 2001
From: Muhammad Omair Javaid 
Date: Tue, 31 Dec 2024 01:19:55 +0500
Subject: [PATCH 193/567] Revert "[LV]: Teach LV to recursively (de)interleave.
  (#89018)"

This reverts commit ccfe0de0e1e37ed369c9bf89dd0188ba0afb2e9a.

This breaks LLVM build on AArch64 SVE Linux buildbots
https://lab.llvm.org/buildbot/#/builders/143/builds/4462
https://lab.llvm.org/buildbot/#/builders/17/builds/4902
https://lab.llvm.org/buildbot/#/builders/4/builds/4399
https://lab.llvm.org/buildbot/#/builders/41/builds/4299
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |   14 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |   79 +-
 .../AArch64/sve-interleaved-accesses.ll       |  260 +---
 .../sve-interleaved-masked-accesses.ll        |  252 ----
 .../RISCV/interleaved-accesses.ll             | 1318 ++++++++---------
 .../AArch64/sve-interleave-vectorization.ll   |  135 --
 6 files changed, 671 insertions(+), 1387 deletions(-)
 delete mode 100644 llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f38db39db9cff..a80f4b67f96e2 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3489,10 +3489,10 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
   if (hasIrregularType(ScalarTy, DL))
     return false;
 
-  // For scalable vectors, the only interleave factor currently supported
-  // must be power of 2 since we require the (de)interleave2 intrinsics
-  // instead of shufflevectors.
-  if (VF.isScalable() && !isPowerOf2_32(InterleaveFactor))
+  // We currently only know how to emit interleave/deinterleave with
+  // Factor=2 for scalable vectors. This is purely an implementation
+  // limit.
+  if (VF.isScalable() && InterleaveFactor != 2)
     return false;
 
   // If the group involves a non-integral pointer, we may not be able to
@@ -9193,9 +9193,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
                      CM.getWideningDecision(IG->getInsertPos(), VF) ==
                          LoopVectorizationCostModel::CM_Interleave);
       // For scalable vectors, the only interleave factor currently supported
-      // must be power of 2 since we require the (de)interleave2 intrinsics
-      // instead of shufflevectors.
-      assert((!Result || !VF.isScalable() || isPowerOf2_32(IG->getFactor())) &&
+      // is 2 since we require the (de)interleave2 intrinsics instead of
+      // shufflevectors.
+      assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
              "Unsupported interleave factor for scalable vectors");
       return Result;
     };
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 7038e52a643c4..edba3de0719eb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2789,21 +2789,10 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef Vals,
   // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
   // must use intrinsics to interleave.
   if (VecTy->isScalableTy()) {
-    assert(isPowerOf2_32(Factor) && "Unsupported interleave factor for "
-                                    "scalable vectors, must be power of 2");
-    SmallVector InterleavingValues(Vals);
-    // When interleaving, the number of values will be shrunk until we have the
-    // single final interleaved value.
-    auto *InterleaveTy = cast(InterleavingValues[0]->getType());
-    for (unsigned Midpoint = Factor / 2; Midpoint > 0; Midpoint /= 2) {
-      InterleaveTy = VectorType::getDoubleElementsVectorType(InterleaveTy);
-      for (unsigned I = 0; I < Midpoint; ++I)
-        InterleavingValues[I] = Builder.CreateIntrinsic(
-            InterleaveTy, Intrinsic::vector_interleave2,
-            {InterleavingValues[I], InterleavingValues[Midpoint + I]},
-            /*FMFSource=*/nullptr, Name);
-    }
-    return InterleavingValues[0];
+    VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
+    return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2,
+                                   Vals,
+                                   /*FMFSource=*/nullptr, Name);
   }
 
   // Fixed length. Start by concatenating all vectors into a wide vector.
@@ -2889,11 +2878,15 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
                           &InterleaveFactor](Value *MaskForGaps) -> Value * {
     if (State.VF.isScalable()) {
       assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
-      assert(isPowerOf2_32(InterleaveFactor) &&
+      assert(InterleaveFactor == 2 &&
              "Unsupported deinterleave factor for scalable vectors");
       auto *ResBlockInMask = State.get(BlockInMask);
-      SmallVector Ops(InterleaveFactor, ResBlockInMask);
-      return interleaveVectors(State.Builder, Ops, "interleaved.mask");
+      SmallVector Ops = {ResBlockInMask, ResBlockInMask};
+      auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(),
+                                     State.VF.getKnownMinValue() * 2, true);
+      return State.Builder.CreateIntrinsic(
+          MaskTy, Intrinsic::vector_interleave2, Ops,
+          /*FMFSource=*/nullptr, "interleaved.mask");
     }
 
     if (!BlockInMask)
@@ -2933,48 +2926,22 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
     ArrayRef VPDefs = definedValues();
     const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
     if (VecTy->isScalableTy()) {
-      assert(isPowerOf2_32(InterleaveFactor) &&
+      assert(InterleaveFactor == 2 &&
              "Unsupported deinterleave factor for scalable vectors");
 
-      // Scalable vectors cannot use arbitrary shufflevectors (only splats),
-      // so must use intrinsics to deinterleave.
-      SmallVector DeinterleavedValues(InterleaveFactor);
-      DeinterleavedValues[0] = NewLoad;
-      // For the case of InterleaveFactor > 2, we will have to do recursive
-      // deinterleaving, because the current available deinterleave intrinsic
-      // supports only Factor of 2, otherwise it will bailout after first
-      // iteration.
-      // When deinterleaving, the number of values will double until we
-      // have "InterleaveFactor".
-      for (unsigned NumVectors = 1; NumVectors < InterleaveFactor;
-           NumVectors *= 2) {
-        // Deinterleave the elements within the vector
-        SmallVector TempDeinterleavedValues(NumVectors);
-        for (unsigned I = 0; I < NumVectors; ++I) {
-          auto *DiTy = DeinterleavedValues[I]->getType();
-          TempDeinterleavedValues[I] = State.Builder.CreateIntrinsic(
-              Intrinsic::vector_deinterleave2, DiTy, DeinterleavedValues[I],
-              /*FMFSource=*/nullptr, "strided.vec");
-        }
-        // Extract the deinterleaved values:
-        for (unsigned I = 0; I < 2; ++I)
-          for (unsigned J = 0; J < NumVectors; ++J)
-            DeinterleavedValues[NumVectors * I + J] =
-                State.Builder.CreateExtractValue(TempDeinterleavedValues[J], I);
-      }
-
-#ifndef NDEBUG
-      for (Value *Val : DeinterleavedValues)
-        assert(Val && "NULL Deinterleaved Value");
-#endif
-      for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
+        // Scalable vectors cannot use arbitrary shufflevectors (only splats),
+        // so must use intrinsics to deinterleave.
+      Value *DI = State.Builder.CreateIntrinsic(
+          Intrinsic::vector_deinterleave2, VecTy, NewLoad,
+          /*FMFSource=*/nullptr, "strided.vec");
+      unsigned J = 0;
+      for (unsigned I = 0; I < InterleaveFactor; ++I) {
         Instruction *Member = Group->getMember(I);
-        Value *StridedVec = DeinterleavedValues[I];
-        if (!Member) {
-          // This value is not needed as it's not used
-          static_cast(StridedVec)->eraseFromParent();
+
+        if (!Member)
           continue;
-        }
+
+        Value *StridedVec = State.Builder.CreateExtractValue(DI, I);
         // If this member has different type, cast the result type.
         if (Member->getType() != ScalarTy) {
           VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index 05c0bc0761ea4..bf95622733461 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -396,8 +396,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load , ptr [[TMP9]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { ,  } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 0
-; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 1
 ; CHECK-NEXT:    [[REVERSE:%.*]] = call  @llvm.vector.reverse.nxv4i32( [[TMP10]])
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 1
 ; CHECK-NEXT:    [[REVERSE1:%.*]] = call  @llvm.vector.reverse.nxv4i32( [[TMP11]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = add nsw  [[REVERSE]], [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = sub nsw  [[REVERSE1]], [[VEC_IND]]
@@ -1548,263 +1548,5 @@ end:
   ret void
 }
 
-; Check vectorization on an interleaved load/store groups of factor 4
-
-; for (int i = 0; i < 1024; ++i) {
-;   dst[i].x = a[i].x + b[i].x;
-;   dst[i].y = a[i].y - b[i].y;
-;   dst[i].z = a[i].z << b[i].z;
-;   dst[i].t = a[i].t >> b[i].t;
-; }
-%struct.xyzt = type { i32, i32, i32, i32 }
-
-define void @interleave_deinterleave(ptr writeonly noalias %dst, ptr readonly %a, ptr readonly %b) {
-; CHECK-LABEL: @interleave_deinterleave(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 2
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], 1024
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 2
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 2
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load , ptr [[TMP6]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { ,  } @llvm.vector.deinterleave2.nxv16i32( [[WIDE_VEC]])
-; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 1
-; CHECK-NEXT:    [[STRIDED_VEC6:%.*]] = call { ,  } @llvm.vector.deinterleave2.nxv8i32( [[TMP7]])
-; CHECK-NEXT:    [[STRIDED_VEC7:%.*]] = call { ,  } @llvm.vector.deinterleave2.nxv8i32( [[TMP8]])
-; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { ,  } [[STRIDED_VEC6]], 0
-; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { ,  } [[STRIDED_VEC7]], 0
-; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { ,  } [[STRIDED_VEC6]], 1
-; CHECK-NEXT:    [[TMP12:%.*]] = extractvalue { ,  } [[STRIDED_VEC7]], 1
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[WIDE_VEC8:%.*]] = load , ptr [[TMP13]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC9:%.*]] = call { ,  } @llvm.vector.deinterleave2.nxv16i32( [[WIDE_VEC8]])
-; CHECK-NEXT:    [[TMP14:%.*]] = extractvalue { ,  } [[STRIDED_VEC9]], 0
-; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { ,  } [[STRIDED_VEC9]], 1
-; CHECK-NEXT:    [[STRIDED_VEC10:%.*]] = call { ,  } @llvm.vector.deinterleave2.nxv8i32( [[TMP14]])
-; CHECK-NEXT:    [[STRIDED_VEC11:%.*]] = call { ,  } @llvm.vector.deinterleave2.nxv8i32( [[TMP15]])
-; CHECK-NEXT:    [[TMP16:%.*]] = extractvalue { ,  } [[STRIDED_VEC10]], 0
-; CHECK-NEXT:    [[TMP17:%.*]] = extractvalue { ,  } [[STRIDED_VEC11]], 0
-; CHECK-NEXT:    [[TMP18:%.*]] = extractvalue { ,  } [[STRIDED_VEC10]], 1
-; CHECK-NEXT:    [[TMP19:%.*]] = extractvalue { ,  } [[STRIDED_VEC11]], 1
-; CHECK-NEXT:    [[TMP20:%.*]] = add nsw  [[TMP16]], [[TMP9]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP22:%.*]] = sub nsw  [[TMP10]], [[TMP17]]
-; CHECK-NEXT:    [[TMP23:%.*]] = shl  [[TMP11]], [[TMP18]]
-; CHECK-NEXT:    [[TMP24:%.*]] = ashr  [[TMP12]], [[TMP19]]
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call  @llvm.vector.interleave2.nxv8i32( [[TMP20]],  [[TMP23]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC12:%.*]] = call  @llvm.vector.interleave2.nxv8i32( [[TMP22]],  [[TMP24]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC13:%.*]] = call  @llvm.vector.interleave2.nxv16i32( [[INTERLEAVED_VEC]],  [[INTERLEAVED_VEC12]])
-; CHECK-NEXT:    store  [[INTERLEAVED_VEC13]], ptr [[TMP21]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP27]], [[TMP26]]
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[DST]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4
-; CHECK-NEXT:    [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 4
-; CHECK-NEXT:    [[TMP28:%.*]] = load i32, ptr [[Y]], align 4
-; CHECK-NEXT:    [[Y11:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 4
-; CHECK-NEXT:    [[TMP29:%.*]] = load i32, ptr [[Y11]], align 4
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP28]], [[TMP29]]
-; CHECK-NEXT:    [[Y14:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 4
-; CHECK-NEXT:    store i32 [[SUB]], ptr [[Y14]], align 4
-; CHECK-NEXT:    [[Z:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 8
-; CHECK-NEXT:    [[TMP30:%.*]] = load i32, ptr [[Z]], align 4
-; CHECK-NEXT:    [[Z19:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 8
-; CHECK-NEXT:    [[TMP31:%.*]] = load i32, ptr [[Z19]], align 4
-; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[TMP30]], [[TMP31]]
-; CHECK-NEXT:    [[Z22:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 8
-; CHECK-NEXT:    store i32 [[SHL]], ptr [[Z22]], align 4
-; CHECK-NEXT:    [[T:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 12
-; CHECK-NEXT:    [[TMP32:%.*]] = load i32, ptr [[T]], align 4
-; CHECK-NEXT:    [[T27:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 12
-; CHECK-NEXT:    [[TMP33:%.*]] = load i32, ptr [[T27]], align 4
-; CHECK-NEXT:    [[SHR:%.*]] = ashr i32 [[TMP32]], [[TMP33]]
-; CHECK-NEXT:    [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 12
-; CHECK-NEXT:    store i32 [[SHR]], ptr [[T30]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
-entry:
-  br label %for.body
-
-for.body:
-  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds %struct.xyzt, ptr %a, i64 %indvars.iv
-  %0 = load i32, ptr %arrayidx, align 4
-  %arrayidx2 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %indvars.iv
-  %1 = load i32, ptr %arrayidx2, align 4
-  %add = add nsw i32 %1, %0
-  %arrayidx5 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %indvars.iv
-  store i32 %add, ptr %arrayidx5, align 4
-  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i64 4
-  %2 = load i32, ptr %y, align 4
-  %y11 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 4
-  %3 = load i32, ptr %y11, align 4
-  %sub = sub nsw i32 %2, %3
-  %y14 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 4
-  store i32 %sub, ptr %y14, align 4
-  %z = getelementptr inbounds nuw i8, ptr %arrayidx, i64 8
-  %4 = load i32, ptr %z, align 4
-  %z19 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 8
-  %5 = load i32, ptr %z19, align 4
-  %shl = shl i32 %4, %5
-  %z22 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 8
-  store i32 %shl, ptr %z22, align 4
-  %t = getelementptr inbounds nuw i8, ptr %arrayidx, i64 12
-  %6 = load i32, ptr %t, align 4
-  %t27 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 12
-  %7 = load i32, ptr %t27, align 4
-  %shr = ashr i32 %6, %7
-  %t30 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 12
-  store i32 %shr, ptr %t30, align 4
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, 1024
-  br i1 %exitcond.not, label %for.end, label %for.body
-
-for.end:
-  ret void
-}
-
-; Check vectorization on a reverse interleaved load/store groups of factor 4
-
-; for (int i = 1023; i >= 0; i--) {
-;   int a = A[i].x + i;
-;   int b = A[i].y - i;
-;   int c = A[i].z * i;
-;   int d = A[i].t << i;
-;   B[i].x = a;
-;   B[i].y = b;
-;   B[i].z = c;
-;   B[i].t = d;
-; }
-
-define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A, ptr noalias nocapture %B) #1{
-; CHECK-LABEL: @interleave_deinterleave_reverse(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = call  @llvm.stepvector.nxv4i32()
-; CHECK-NEXT:    [[INDUCTION:%.*]] = sub  splat (i32 1023), [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
-; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw i32 0, [[TMP3]]
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement  poison, i32 [[TMP4]], i64 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector  [[DOTSPLATINSERT]],  poison,  zeroinitializer
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi  [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP7:%.*]] = shl nuw nsw i32 [[TMP6]], 4
-; CHECK-NEXT:    [[TMP8:%.*]] = sub nsw i32 4, [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 [[TMP9]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load , ptr [[TMP10]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { ,  } @llvm.vector.deinterleave2.nxv16i32( [[WIDE_VEC]])
-; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 1
-; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = call { ,  } @llvm.vector.deinterleave2.nxv8i32( [[TMP11]])
-; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = call { ,  } @llvm.vector.deinterleave2.nxv8i32( [[TMP12]])
-; CHECK-NEXT:    [[TMP13:%.*]] = extractvalue { ,  } [[STRIDED_VEC1]], 0
-; CHECK-NEXT:    [[TMP14:%.*]] = extractvalue { ,  } [[STRIDED_VEC2]], 0
-; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { ,  } [[STRIDED_VEC1]], 1
-; CHECK-NEXT:    [[TMP16:%.*]] = extractvalue { ,  } [[STRIDED_VEC2]], 1
-; CHECK-NEXT:    [[REVERSE:%.*]] = call  @llvm.vector.reverse.nxv4i32( [[TMP13]])
-; CHECK-NEXT:    [[REVERSE3:%.*]] = call  @llvm.vector.reverse.nxv4i32( [[TMP14]])
-; CHECK-NEXT:    [[REVERSE4:%.*]] = call  @llvm.vector.reverse.nxv4i32( [[TMP15]])
-; CHECK-NEXT:    [[REVERSE5:%.*]] = call  @llvm.vector.reverse.nxv4i32( [[TMP16]])
-; CHECK-NEXT:    [[TMP17:%.*]] = add nsw  [[REVERSE]], [[VEC_IND]]
-; CHECK-NEXT:    [[TMP18:%.*]] = sub nsw  [[REVERSE3]], [[VEC_IND]]
-; CHECK-NEXT:    [[TMP19:%.*]] = mul nsw  [[REVERSE4]], [[VEC_IND]]
-; CHECK-NEXT:    [[TMP20:%.*]] = shl nuw nsw  [[REVERSE5]], [[VEC_IND]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT:    [[TMP22:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 4
-; CHECK-NEXT:    [[TMP24:%.*]] = sub nsw i32 4, [[TMP23]]
-; CHECK-NEXT:    [[TMP25:%.*]] = sext i32 [[TMP24]] to i64
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP25]]
-; CHECK-NEXT:    [[REVERSE6:%.*]] = call  @llvm.vector.reverse.nxv4i32( [[TMP17]])
-; CHECK-NEXT:    [[REVERSE7:%.*]] = call  @llvm.vector.reverse.nxv4i32( [[TMP18]])
-; CHECK-NEXT:    [[REVERSE8:%.*]] = call  @llvm.vector.reverse.nxv4i32( [[TMP19]])
-; CHECK-NEXT:    [[REVERSE9:%.*]] = call  @llvm.vector.reverse.nxv4i32( [[TMP20]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call  @llvm.vector.interleave2.nxv8i32( [[REVERSE6]],  [[REVERSE8]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC10:%.*]] = call  @llvm.vector.interleave2.nxv8i32( [[REVERSE7]],  [[REVERSE9]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC11:%.*]] = call  @llvm.vector.interleave2.nxv16i32( [[INTERLEAVED_VEC]],  [[INTERLEAVED_VEC10]])
-; CHECK-NEXT:    store  [[INTERLEAVED_VEC11]], ptr [[TMP26]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add  [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
-; CHECK:       for.body:
-; CHECK-NEXT:    br i1 poison, label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP44:![0-9]+]]
-;
-entry:
-  br label %for.body
-for.cond.cleanup:                                 ; preds = %for.body
-  ret void
-for.body:                                         ; preds = %for.body, %entry
-  %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ]
-  %x = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 0
-  %load1 = load i32, ptr %x, align 4
-  %trunc = trunc i64 %indvars.iv to i32
-  %add = add nsw i32 %load1, %trunc
-  %y = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 1
-  %load2 = load i32, ptr %y, align 4
-  %sub = sub nsw i32 %load2, %trunc
-  %z = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 2
-  %load3 = load i32, ptr %z, align 4
-  %mul = mul nsw i32 %load3, %trunc
-  %t = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 3
-  %load4 = load i32, ptr %t, align 4
-  %shl = shl nuw nsw i32 %load4, %trunc
-  %x5 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 0
-  store i32 %add, ptr %x5, align 4
-  %y8 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 1
-  store i32 %sub, ptr %y8, align 4
-  %z5 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 2
-  store i32 %mul, ptr %z5, align 4
-  %t8 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 3
-  store i32 %shl, ptr %t8, align 4
-  %indvars.iv.next = add nsw i64 %indvars.iv, -1
-  %cmp = icmp sgt i64 %indvars.iv, 0
-  br i1 %cmp, label %for.body, label %for.cond.cleanup
-
-}
 attributes #1 = { "target-features"="+sve" vscale_range(1, 16) }
 attributes #0 = { "unsafe-fp-math"="true" "target-features"="+sve" vscale_range(1, 16) }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
index d4392bebdf37b..1a281fe7c6f7f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
@@ -529,255 +529,3 @@ for.inc:
 for.end:
   ret void
 }
-
-; Expected to contain interleave2/deinterleave2 instructions
-;
-; void masked_strided_factor4(const unsigned char* restrict p,
-;                            unsigned char* restrict q,
-;                            unsigned char guard) {
-; for(ix=0; ix < 1024; ++ix) {
-;     if (ix > guard) {
-;         char left1 = p[4*ix];
-;         char right1 = p[4*ix + 1];
-;         char left2 = p[4*ix + 2];
-;         char right2 = p[4*ix + 3];
-;         char max1 = max(left1, right1);
-;         char max2 = max(left2, right2);
-;         q[4*ix] = max1;
-;         q[4*ix + 1] = 0 - max1;
-;         q[4*ix + 2] = max2;
-;         q[4*ix + 3] = 0 - max2;
-;     }
-; }
-;}
-define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr #0 {
-; SCALAR_TAIL_FOLDING-LABEL: define dso_local void @masked_strided_factor4
-; SCALAR_TAIL_FOLDING-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; SCALAR_TAIL_FOLDING-NEXT:  entry:
-; SCALAR_TAIL_FOLDING-NEXT:    [[CONV:%.*]] = zext i8 [[GUARD]] to i32
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 4
-; SCALAR_TAIL_FOLDING-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1024
-; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; SCALAR_TAIL_FOLDING:       vector.ph:
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 4
-; SCALAR_TAIL_FOLDING-NEXT:    [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = shl i32 [[TMP4]], 4
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = call  @llvm.stepvector.nxv16i32()
-; SCALAR_TAIL_FOLDING-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement  poison, i32 [[TMP5]], i64 0
-; SCALAR_TAIL_FOLDING-NEXT:    [[DOTSPLAT:%.*]] = shufflevector  [[DOTSPLATINSERT]],  poison,  zeroinitializer
-; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement  poison, i32 [[CONV]], i64 0
-; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector  [[BROADCAST_SPLATINSERT]],  poison,  zeroinitializer
-; SCALAR_TAIL_FOLDING-NEXT:    br label [[VECTOR_BODY:%.*]]
-; SCALAR_TAIL_FOLDING:       vector.body:
-; SCALAR_TAIL_FOLDING-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALAR_TAIL_FOLDING-NEXT:    [[VEC_IND:%.*]] = phi  [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP7:%.*]] = icmp ugt  [[VEC_IND]], [[BROADCAST_SPLAT]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP8:%.*]] = shl i32 [[INDEX]], 2
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK:%.*]] = call  @llvm.vector.interleave2.nxv32i1( [[TMP7]],  [[TMP7]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK1:%.*]] = call  @llvm.vector.interleave2.nxv32i1( [[TMP7]],  [[TMP7]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK2:%.*]] = call  @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK]],  [[INTERLEAVED_MASK1]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call  @llvm.masked.load.nxv64i8.p0(ptr [[TMP10]], i32 1,  [[INTERLEAVED_MASK2]],  poison)
-; SCALAR_TAIL_FOLDING-NEXT:    [[STRIDED_VEC:%.*]] = call { ,  } @llvm.vector.deinterleave2.nxv64i8( [[WIDE_MASKED_VEC]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 0
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 1
-; SCALAR_TAIL_FOLDING-NEXT:    [[STRIDED_VEC3:%.*]] = call { ,  } @llvm.vector.deinterleave2.nxv32i8( [[TMP11]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[STRIDED_VEC4:%.*]] = call { ,  } @llvm.vector.deinterleave2.nxv32i8( [[TMP12]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = extractvalue { ,  } [[STRIDED_VEC3]], 0
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = extractvalue { ,  } [[STRIDED_VEC4]], 0
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = extractvalue { ,  } [[STRIDED_VEC3]], 1
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = extractvalue { ,  } [[STRIDED_VEC4]], 1
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP17:%.*]] = call  @llvm.smax.nxv16i8( [[TMP13]],  [[TMP14]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP18:%.*]] = sub  zeroinitializer, [[TMP17]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP19:%.*]] = call  @llvm.smax.nxv16i8( [[TMP15]],  [[TMP16]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP20:%.*]] = sub  zeroinitializer, [[TMP19]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP21:%.*]] = sext i32 [[TMP8]] to i64
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP21]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC:%.*]] = call  @llvm.vector.interleave2.nxv32i8( [[TMP17]],  [[TMP19]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC5:%.*]] = call  @llvm.vector.interleave2.nxv32i8( [[TMP18]],  [[TMP20]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC6:%.*]] = call  @llvm.vector.interleave2.nxv64i8( [[INTERLEAVED_VEC]],  [[INTERLEAVED_VEC5]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK7:%.*]] = call  @llvm.vector.interleave2.nxv32i1( [[TMP7]],  [[TMP7]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK8:%.*]] = call  @llvm.vector.interleave2.nxv32i1( [[TMP7]],  [[TMP7]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK9:%.*]] = call  @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK7]],  [[INTERLEAVED_MASK8]])
-; SCALAR_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv64i8.p0( [[INTERLEAVED_VEC6]], ptr [[TMP22]], i32 1,  [[INTERLEAVED_MASK9]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add  [[VEC_IND]], [[DOTSPLAT]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP23:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; SCALAR_TAIL_FOLDING:       middle.block:
-; SCALAR_TAIL_FOLDING-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
-; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; SCALAR_TAIL_FOLDING:       scalar.ph:
-; SCALAR_TAIL_FOLDING-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; SCALAR_TAIL_FOLDING-NEXT:    br label [[FOR_BODY:%.*]]
-; SCALAR_TAIL_FOLDING:       for.body:
-; SCALAR_TAIL_FOLDING-NEXT:    [[IX_024:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ]
-; SCALAR_TAIL_FOLDING-NEXT:    [[CMP1:%.*]] = icmp samesign ugt i32 [[IX_024]], [[CONV]]
-; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
-; SCALAR_TAIL_FOLDING:       if.then:
-; SCALAR_TAIL_FOLDING-NEXT:    [[IDX0:%.*]] = shl nuw nsw i32 [[IX_024]], 2
-; SCALAR_TAIL_FOLDING-NEXT:    [[IDX1:%.*]] = or disjoint i32 [[IDX0]], 1
-; SCALAR_TAIL_FOLDING-NEXT:    [[IDX2:%.*]] = or disjoint i32 [[IDX0]], 2
-; SCALAR_TAIL_FOLDING-NEXT:    [[IDX3:%.*]] = or disjoint i32 [[IDX0]], 3
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP24:%.*]] = zext nneg i32 [[IDX0]] to i64
-; SCALAR_TAIL_FOLDING-NEXT:    [[ARRAY1IDX0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP24]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP25:%.*]] = load i8, ptr [[ARRAY1IDX0]], align 1
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP26:%.*]] = zext nneg i32 [[IDX1]] to i64
-; SCALAR_TAIL_FOLDING-NEXT:    [[ARRAY1IDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP26]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP27:%.*]] = load i8, ptr [[ARRAY1IDX1]], align 1
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP28:%.*]] = zext nneg i32 [[IDX2]] to i64
-; SCALAR_TAIL_FOLDING-NEXT:    [[ARRAY1IDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP28]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP29:%.*]] = load i8, ptr [[ARRAY1IDX2]], align 1
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP30:%.*]] = zext nneg i32 [[IDX3]] to i64
-; SCALAR_TAIL_FOLDING-NEXT:    [[ARRAY1IDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP30]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP31:%.*]] = load i8, ptr [[ARRAY1IDX3]], align 1
-; SCALAR_TAIL_FOLDING-NEXT:    [[SPEC_SELECT_I1:%.*]] = call i8 @llvm.smax.i8(i8 [[TMP25]], i8 [[TMP27]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[SUB1:%.*]] = sub i8 0, [[SPEC_SELECT_I1]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[SPEC_SELECT_I2:%.*]] = call i8 @llvm.smax.i8(i8 [[TMP29]], i8 [[TMP31]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[SUB2:%.*]] = sub i8 0, [[SPEC_SELECT_I2]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP32:%.*]] = zext nneg i32 [[IDX0]] to i64
-; SCALAR_TAIL_FOLDING-NEXT:    [[ARRAY3IDX0:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP32]]
-; SCALAR_TAIL_FOLDING-NEXT:    store i8 [[SPEC_SELECT_I1]], ptr [[ARRAY3IDX0]], align 1
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP33:%.*]] = zext nneg i32 [[IDX1]] to i64
-; SCALAR_TAIL_FOLDING-NEXT:    [[ARRAY3IDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP33]]
-; SCALAR_TAIL_FOLDING-NEXT:    store i8 [[SUB1]], ptr [[ARRAY3IDX1]], align 1
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP34:%.*]] = zext nneg i32 [[IDX2]] to i64
-; SCALAR_TAIL_FOLDING-NEXT:    [[ARRAY3IDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP34]]
-; SCALAR_TAIL_FOLDING-NEXT:    store i8 [[SPEC_SELECT_I2]], ptr [[ARRAY3IDX2]], align 1
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP35:%.*]] = zext nneg i32 [[IDX3]] to i64
-; SCALAR_TAIL_FOLDING-NEXT:    [[ARRAY3IDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP35]]
-; SCALAR_TAIL_FOLDING-NEXT:    store i8 [[SUB2]], ptr [[ARRAY3IDX3]], align 1
-; SCALAR_TAIL_FOLDING-NEXT:    br label [[FOR_INC]]
-; SCALAR_TAIL_FOLDING:       for.inc:
-; SCALAR_TAIL_FOLDING-NEXT:    [[INC]] = add nuw nsw i32 [[IX_024]], 1
-; SCALAR_TAIL_FOLDING-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1024
-; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; SCALAR_TAIL_FOLDING:       for.end:
-; SCALAR_TAIL_FOLDING-NEXT:    ret void
-;
-; PREDICATED_TAIL_FOLDING-LABEL: define dso_local void @masked_strided_factor4
-; PREDICATED_TAIL_FOLDING-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; PREDICATED_TAIL_FOLDING-NEXT:  entry:
-; PREDICATED_TAIL_FOLDING-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; PREDICATED_TAIL_FOLDING:       vector.ph:
-; PREDICATED_TAIL_FOLDING-NEXT:    [[CONV:%.*]] = zext i8 [[GUARD]] to i32
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 4
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 4
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP3]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call  @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = call  @llvm.stepvector.nxv16i32()
-; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement  poison, i32 [[TMP1]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLAT:%.*]] = shufflevector  [[DOTSPLATINSERT]],  poison,  zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement  poison, i32 [[CONV]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector  [[BROADCAST_SPLATINSERT]],  poison,  zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT:    br label [[VECTOR_BODY:%.*]]
-; PREDICATED_TAIL_FOLDING:       vector.body:
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi  [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND:%.*]] = phi  [ [[TMP5]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = icmp ugt  [[VEC_IND]], [[BROADCAST_SPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP7:%.*]] = select  [[ACTIVE_LANE_MASK]],  [[TMP6]],  zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP8:%.*]] = shl i32 [[INDEX]], 2
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK:%.*]] = call  @llvm.vector.interleave2.nxv32i1( [[TMP7]],  [[TMP7]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK1:%.*]] = call  @llvm.vector.interleave2.nxv32i1( [[TMP7]],  [[TMP7]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK2:%.*]] = call  @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK]],  [[INTERLEAVED_MASK1]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call  @llvm.masked.load.nxv64i8.p0(ptr [[TMP10]], i32 1,  [[INTERLEAVED_MASK2]],  poison)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[STRIDED_VEC:%.*]] = call { ,  } @llvm.vector.deinterleave2.nxv64i8( [[WIDE_MASKED_VEC]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 0
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 1
-; PREDICATED_TAIL_FOLDING-NEXT:    [[STRIDED_VEC3:%.*]] = call { ,  } @llvm.vector.deinterleave2.nxv32i8( [[TMP11]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[STRIDED_VEC4:%.*]] = call { ,  } @llvm.vector.deinterleave2.nxv32i8( [[TMP12]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = extractvalue { ,  } [[STRIDED_VEC3]], 0
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = extractvalue { ,  } [[STRIDED_VEC4]], 0
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = extractvalue { ,  } [[STRIDED_VEC3]], 1
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = extractvalue { ,  } [[STRIDED_VEC4]], 1
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP17:%.*]] = call  @llvm.smax.nxv16i8( [[TMP13]],  [[TMP14]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP18:%.*]] = sub  zeroinitializer, [[TMP17]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP19:%.*]] = call  @llvm.smax.nxv16i8( [[TMP15]],  [[TMP16]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP20:%.*]] = sub  zeroinitializer, [[TMP19]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP21:%.*]] = sext i32 [[TMP8]] to i64
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP21]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC:%.*]] = call  @llvm.vector.interleave2.nxv32i8( [[TMP17]],  [[TMP19]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC5:%.*]] = call  @llvm.vector.interleave2.nxv32i8( [[TMP18]],  [[TMP20]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC6:%.*]] = call  @llvm.vector.interleave2.nxv64i8( [[INTERLEAVED_VEC]],  [[INTERLEAVED_VEC5]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK7:%.*]] = call  @llvm.vector.interleave2.nxv32i1( [[TMP7]],  [[TMP7]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK8:%.*]] = call  @llvm.vector.interleave2.nxv32i1( [[TMP7]],  [[TMP7]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK9:%.*]] = call  @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK7]],  [[INTERLEAVED_MASK8]])
-; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv64i8.p0( [[INTERLEAVED_VEC6]], ptr [[TMP22]], i32 1,  [[INTERLEAVED_MASK9]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call  @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add  [[VEC_IND]], [[DOTSPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP23:%.*]] = extractelement  [[ACTIVE_LANE_MASK_NEXT]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT:    br i1 [[TMP23]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP8:![0-9]+]]
-; PREDICATED_TAIL_FOLDING:       middle.block:
-; PREDICATED_TAIL_FOLDING-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; PREDICATED_TAIL_FOLDING:       scalar.ph:
-; PREDICATED_TAIL_FOLDING-NEXT:    br label [[FOR_BODY:%.*]]
-; PREDICATED_TAIL_FOLDING:       for.body:
-; PREDICATED_TAIL_FOLDING-NEXT:    br i1 poison, label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
-; PREDICATED_TAIL_FOLDING:       if.then:
-; PREDICATED_TAIL_FOLDING-NEXT:    br label [[FOR_INC]]
-; PREDICATED_TAIL_FOLDING:       for.inc:
-; PREDICATED_TAIL_FOLDING-NEXT:    br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; PREDICATED_TAIL_FOLDING:       for.end:
-; PREDICATED_TAIL_FOLDING-NEXT:    ret void
-;
-entry:
-  %conv = zext i8 %guard to i32
-  br label %for.body
-
-for.body:
-  %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
-  %cmp1 = icmp ugt i32 %ix.024, %conv
-  br i1 %cmp1, label %if.then, label %for.inc
-
-if.then:
-  %idx0 = shl nuw nsw i32 %ix.024, 2
-  %idx1 = add i32 %idx0, 1
-  %idx2 = add i32 %idx0, 2
-  %idx3 = add i32 %idx0, 3
-
-  %array1idx0 = getelementptr inbounds i8, ptr %p, i32 %idx0
-  %0 = load i8, ptr %array1idx0, align 1
-  %array1idx1 = getelementptr inbounds i8, ptr %p, i32 %idx1
-  %1 = load i8, ptr %array1idx1, align 1
-  %array1idx2 = getelementptr inbounds i8, ptr %p, i32 %idx2
-  %2 = load i8, ptr %array1idx2, align 1
-  %array1idx3 = getelementptr inbounds i8, ptr %p, i32 %idx3
-  %3 = load i8, ptr %array1idx3, align 1
-
-  %cmp.i1 = icmp slt i8 %0, %1
-  %spec.select.i1 = select i1 %cmp.i1, i8 %1, i8 %0
-  %sub1 = sub i8 0, %spec.select.i1
-  %cmp.i2 = icmp slt i8 %2, %3
-  %spec.select.i2 = select i1 %cmp.i2, i8 %3, i8 %2
-  %sub2 = sub i8 0, %spec.select.i2
-
-  %array3idx0 = getelementptr inbounds i8, ptr %q, i32 %idx0
-  store i8 %spec.select.i1, ptr %array3idx0, align 1
-  %array3idx1 = getelementptr inbounds i8, ptr %q, i32 %idx1
-  store i8 %sub1, ptr %array3idx1, align 1
-  %array3idx2 = getelementptr inbounds i8, ptr %q, i32 %idx2
-  store i8 %spec.select.i2, ptr %array3idx2, align 1
-  %array3idx3 = getelementptr inbounds i8, ptr %q, i32 %idx3
-  store i8 %sub2, ptr %array3idx3, align 1
-
-  br label %for.inc
-
-for.inc:
-  %inc = add nuw nsw i32 %ix.024, 1
-  %exitcond = icmp eq i32 %inc, 1024
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
-  ret void
-}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
index b1ff589fe51bf..bda4839dead51 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
@@ -9,7 +9,7 @@ define void @load_store_factor2_i32(ptr %p) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
@@ -17,88 +17,88 @@ define void @load_store_factor2_i32(ptr %p) {
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[I]], 0
-; CHECK-NEXT:    [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1
-; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 4
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { ,  } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]])
-; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 0
-; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 1
-; CHECK-NEXT:    [[TMP11:%.*]] = add  [[TMP9]], splat (i32 1)
-; CHECK-NEXT:    [[TMP12:%.*]] = add  [[TMP10]], splat (i32 2)
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call  @llvm.vector.interleave2.nxv8i32( [[TMP11]],  [[TMP12]])
-; CHECK-NEXT:    store  [[INTERLEAVED_VEC]], ptr [[Q0]], align 4
-; CHECK-NEXT:    [[NEXTI]] = add nuw i64 [[I]], [[TMP5]]
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 1
+; CHECK-NEXT:    [[TMP12:%.*]] = add  [[TMP10]], splat (i32 1)
+; CHECK-NEXT:    [[TMP15:%.*]] = add  [[TMP11]], splat (i32 2)
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call  @llvm.vector.interleave2.nxv8i32( [[TMP12]],  [[TMP15]])
+; CHECK-NEXT:    store  [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP1:%.*]]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; CHECK-NEXT:    [[OFFSET2:%.*]] = shl i64 [[I1]], 1
-; CHECK-NEXT:    [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]]
-; CHECK-NEXT:    [[X0:%.*]] = load i32, ptr [[Q2]], align 4
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
+; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
+; CHECK-NEXT:    [[X0:%.*]] = load i32, ptr [[Q0]], align 4
 ; CHECK-NEXT:    [[Y0:%.*]] = add i32 [[X0]], 1
-; CHECK-NEXT:    store i32 [[Y0]], ptr [[Q2]], align 4
-; CHECK-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1
+; CHECK-NEXT:    store i32 [[Y0]], ptr [[Q0]], align 4
+; CHECK-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; CHECK-NEXT:    [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
 ; CHECK-NEXT:    [[X1:%.*]] = load i32, ptr [[Q1]], align 4
 ; CHECK-NEXT:    [[Y1:%.*]] = add i32 [[X1]], 2
 ; CHECK-NEXT:    store i32 [[Y1]], ptr [[Q1]], align 4
-; CHECK-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
 ; FIXED-LABEL: @load_store_factor2_i32(
 ; FIXED-NEXT:  entry:
-; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; FIXED:       vector.ph:
-; FIXED-NEXT:    br label [[LOOP:%.*]]
+; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FIXED:       vector.body:
-; FIXED-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; FIXED-NEXT:    [[TMP0:%.*]] = add i64 [[I]], 0
-; FIXED-NEXT:    [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1
-; FIXED-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]]
-; FIXED-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[Q0]], align 4
+; FIXED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FIXED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; FIXED-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 1
+; FIXED-NEXT:    [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]]
+; FIXED-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP2]], align 4
 ; FIXED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> 
 ; FIXED-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> 
-; FIXED-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1)
-; FIXED-NEXT:    [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2)
-; FIXED-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> 
-; FIXED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <16 x i32> 
-; FIXED-NEXT:    store <16 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4
-; FIXED-NEXT:    [[NEXTI]] = add nuw i64 [[I]], 8
-; FIXED-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; FIXED-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
+; FIXED-NEXT:    [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1)
+; FIXED-NEXT:    [[TMP7:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2)
+; FIXED-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP7]], <16 x i32> 
+; FIXED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[TMP9]], <16 x i32> poison, <16 x i32> 
+; FIXED-NEXT:    store <16 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4
+; FIXED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; FIXED-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; FIXED-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; FIXED:       middle.block:
 ; FIXED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; FIXED:       scalar.ph:
-; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; FIXED-NEXT:    br label [[LOOP1:%.*]]
+; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT:    br label [[LOOP:%.*]]
 ; FIXED:       loop:
-; FIXED-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; FIXED-NEXT:    [[OFFSET2:%.*]] = shl i64 [[I1]], 1
-; FIXED-NEXT:    [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]]
-; FIXED-NEXT:    [[X0:%.*]] = load i32, ptr [[Q2]], align 4
+; FIXED-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; FIXED-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
+; FIXED-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
+; FIXED-NEXT:    [[X0:%.*]] = load i32, ptr [[Q0]], align 4
 ; FIXED-NEXT:    [[Y0:%.*]] = add i32 [[X0]], 1
-; FIXED-NEXT:    store i32 [[Y0]], ptr [[Q2]], align 4
-; FIXED-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1
+; FIXED-NEXT:    store i32 [[Y0]], ptr [[Q0]], align 4
+; FIXED-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; FIXED-NEXT:    [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
 ; FIXED-NEXT:    [[X1:%.*]] = load i32, ptr [[Q1]], align 4
 ; FIXED-NEXT:    [[Y1:%.*]] = add i32 [[X1]], 2
 ; FIXED-NEXT:    store i32 [[Y1]], ptr [[Q1]], align 4
-; FIXED-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; FIXED-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; FIXED-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP3:![0-9]+]]
+; FIXED-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; FIXED-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; FIXED-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; FIXED:       exit:
 ; FIXED-NEXT:    ret void
 ;
@@ -107,7 +107,7 @@ define void @load_store_factor2_i32(ptr %p) {
 ; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
 ; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALABLE:       vector.ph:
 ; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
@@ -115,44 +115,44 @@ define void @load_store_factor2_i32(ptr %p) {
 ; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; SCALABLE-NEXT:    br label [[LOOP:%.*]]
+; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
-; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; SCALABLE-NEXT:    [[TMP6:%.*]] = add i64 [[I]], 0
-; SCALABLE-NEXT:    [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1
-; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]]
-; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 4
+; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; SCALABLE-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 1
+; SCALABLE-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]]
+; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 4
 ; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = call { ,  } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]])
-; SCALABLE-NEXT:    [[TMP9:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 0
-; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 1
-; SCALABLE-NEXT:    [[TMP11:%.*]] = add  [[TMP9]], splat (i32 1)
-; SCALABLE-NEXT:    [[TMP12:%.*]] = add  [[TMP10]], splat (i32 2)
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = call  @llvm.vector.interleave2.nxv8i32( [[TMP11]],  [[TMP12]])
-; SCALABLE-NEXT:    store  [[INTERLEAVED_VEC]], ptr [[Q0]], align 4
-; SCALABLE-NEXT:    [[NEXTI]] = add nuw i64 [[I]], [[TMP5]]
-; SCALABLE-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]]
-; SCALABLE-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
+; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 0
+; SCALABLE-NEXT:    [[TMP11:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 1
+; SCALABLE-NEXT:    [[TMP12:%.*]] = add  [[TMP10]], splat (i32 1)
+; SCALABLE-NEXT:    [[TMP15:%.*]] = add  [[TMP11]], splat (i32 2)
+; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = call  @llvm.vector.interleave2.nxv8i32( [[TMP12]],  [[TMP15]])
+; SCALABLE-NEXT:    store  [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; SCALABLE-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; SCALABLE:       middle.block:
 ; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; SCALABLE:       scalar.ph:
-; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; SCALABLE-NEXT:    br label [[LOOP1:%.*]]
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT:    br label [[LOOP:%.*]]
 ; SCALABLE:       loop:
-; SCALABLE-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; SCALABLE-NEXT:    [[OFFSET2:%.*]] = shl i64 [[I1]], 1
-; SCALABLE-NEXT:    [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]]
-; SCALABLE-NEXT:    [[X0:%.*]] = load i32, ptr [[Q2]], align 4
+; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; SCALABLE-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
+; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
+; SCALABLE-NEXT:    [[X0:%.*]] = load i32, ptr [[Q0]], align 4
 ; SCALABLE-NEXT:    [[Y0:%.*]] = add i32 [[X0]], 1
-; SCALABLE-NEXT:    store i32 [[Y0]], ptr [[Q2]], align 4
-; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1
+; SCALABLE-NEXT:    store i32 [[Y0]], ptr [[Q0]], align 4
+; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; SCALABLE-NEXT:    [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
 ; SCALABLE-NEXT:    [[X1:%.*]] = load i32, ptr [[Q1]], align 4
 ; SCALABLE-NEXT:    [[Y1:%.*]] = add i32 [[X1]], 2
 ; SCALABLE-NEXT:    store i32 [[Y1]], ptr [[Q1]], align 4
-; SCALABLE-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP3:![0-9]+]]
+; SCALABLE-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; SCALABLE:       exit:
 ; SCALABLE-NEXT:    ret void
 ;
@@ -186,7 +186,7 @@ define void @load_store_factor2_i64(ptr %p) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
@@ -194,88 +194,88 @@ define void @load_store_factor2_i64(ptr %p) {
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
-; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[I]], 0
-; CHECK-NEXT:    [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1
-; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 8
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { ,  } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_VEC]])
-; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 0
-; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 1
-; CHECK-NEXT:    [[TMP11:%.*]] = add  [[TMP9]], splat (i64 1)
-; CHECK-NEXT:    [[TMP12:%.*]] = add  [[TMP10]], splat (i64 2)
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call  @llvm.vector.interleave2.nxv4i64( [[TMP11]],  [[TMP12]])
-; CHECK-NEXT:    store  [[INTERLEAVED_VEC]], ptr [[Q0]], align 8
-; CHECK-NEXT:    [[NEXTI]] = add nuw i64 [[I]], [[TMP5]]
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 1
+; CHECK-NEXT:    [[TMP12:%.*]] = add  [[TMP10]], splat (i64 1)
+; CHECK-NEXT:    [[TMP15:%.*]] = add  [[TMP11]], splat (i64 2)
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call  @llvm.vector.interleave2.nxv4i64( [[TMP12]],  [[TMP15]])
+; CHECK-NEXT:    store  [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP1:%.*]]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; CHECK-NEXT:    [[OFFSET2:%.*]] = shl i64 [[I1]], 1
-; CHECK-NEXT:    [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]]
-; CHECK-NEXT:    [[X0:%.*]] = load i64, ptr [[Q2]], align 8
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
+; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
+; CHECK-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
 ; CHECK-NEXT:    [[Y0:%.*]] = add i64 [[X0]], 1
-; CHECK-NEXT:    store i64 [[Y0]], ptr [[Q2]], align 8
-; CHECK-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1
+; CHECK-NEXT:    store i64 [[Y0]], ptr [[Q0]], align 8
+; CHECK-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; CHECK-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
 ; CHECK-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
 ; CHECK-NEXT:    [[Y1:%.*]] = add i64 [[X1]], 2
 ; CHECK-NEXT:    store i64 [[Y1]], ptr [[Q1]], align 8
-; CHECK-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
 ; FIXED-LABEL: @load_store_factor2_i64(
 ; FIXED-NEXT:  entry:
-; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; FIXED:       vector.ph:
-; FIXED-NEXT:    br label [[LOOP:%.*]]
+; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FIXED:       vector.body:
-; FIXED-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; FIXED-NEXT:    [[TMP0:%.*]] = add i64 [[I]], 0
-; FIXED-NEXT:    [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1
-; FIXED-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
-; FIXED-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[Q0]], align 8
+; FIXED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FIXED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; FIXED-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 1
+; FIXED-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]]
+; FIXED-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP2]], align 8
 ; FIXED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> 
 ; FIXED-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> 
-; FIXED-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1)
-; FIXED-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2)
-; FIXED-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> 
-; FIXED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <8 x i32> 
-; FIXED-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8
-; FIXED-NEXT:    [[NEXTI]] = add nuw i64 [[I]], 4
-; FIXED-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; FIXED-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; FIXED-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1)
+; FIXED-NEXT:    [[TMP7:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2)
+; FIXED-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP7]], <8 x i32> 
+; FIXED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> poison, <8 x i32> 
+; FIXED-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8
+; FIXED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; FIXED-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; FIXED-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; FIXED:       middle.block:
 ; FIXED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; FIXED:       scalar.ph:
-; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; FIXED-NEXT:    br label [[LOOP1:%.*]]
+; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT:    br label [[LOOP:%.*]]
 ; FIXED:       loop:
-; FIXED-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; FIXED-NEXT:    [[OFFSET2:%.*]] = shl i64 [[I1]], 1
-; FIXED-NEXT:    [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]]
-; FIXED-NEXT:    [[X0:%.*]] = load i64, ptr [[Q2]], align 8
+; FIXED-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; FIXED-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
+; FIXED-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
+; FIXED-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
 ; FIXED-NEXT:    [[Y0:%.*]] = add i64 [[X0]], 1
-; FIXED-NEXT:    store i64 [[Y0]], ptr [[Q2]], align 8
-; FIXED-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1
+; FIXED-NEXT:    store i64 [[Y0]], ptr [[Q0]], align 8
+; FIXED-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; FIXED-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
 ; FIXED-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
 ; FIXED-NEXT:    [[Y1:%.*]] = add i64 [[X1]], 2
 ; FIXED-NEXT:    store i64 [[Y1]], ptr [[Q1]], align 8
-; FIXED-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; FIXED-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; FIXED-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP5:![0-9]+]]
+; FIXED-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; FIXED-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; FIXED-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
 ; FIXED:       exit:
 ; FIXED-NEXT:    ret void
 ;
@@ -284,7 +284,7 @@ define void @load_store_factor2_i64(ptr %p) {
 ; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
 ; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALABLE:       vector.ph:
 ; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
@@ -292,44 +292,44 @@ define void @load_store_factor2_i64(ptr %p) {
 ; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
-; SCALABLE-NEXT:    br label [[LOOP:%.*]]
+; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
-; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; SCALABLE-NEXT:    [[TMP6:%.*]] = add i64 [[I]], 0
-; SCALABLE-NEXT:    [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1
-; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
-; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8
+; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; SCALABLE-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 1
+; SCALABLE-NEXT:    [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]]
+; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 8
 ; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = call { ,  } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_VEC]])
-; SCALABLE-NEXT:    [[TMP9:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 0
-; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 1
-; SCALABLE-NEXT:    [[TMP11:%.*]] = add  [[TMP9]], splat (i64 1)
-; SCALABLE-NEXT:    [[TMP12:%.*]] = add  [[TMP10]], splat (i64 2)
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = call  @llvm.vector.interleave2.nxv4i64( [[TMP11]],  [[TMP12]])
-; SCALABLE-NEXT:    store  [[INTERLEAVED_VEC]], ptr [[Q0]], align 8
-; SCALABLE-NEXT:    [[NEXTI]] = add nuw i64 [[I]], [[TMP5]]
-; SCALABLE-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]]
-; SCALABLE-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 0
+; SCALABLE-NEXT:    [[TMP11:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 1
+; SCALABLE-NEXT:    [[TMP12:%.*]] = add  [[TMP10]], splat (i64 1)
+; SCALABLE-NEXT:    [[TMP15:%.*]] = add  [[TMP11]], splat (i64 2)
+; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = call  @llvm.vector.interleave2.nxv4i64( [[TMP12]],  [[TMP15]])
+; SCALABLE-NEXT:    store  [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; SCALABLE-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; SCALABLE:       middle.block:
 ; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; SCALABLE:       scalar.ph:
-; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; SCALABLE-NEXT:    br label [[LOOP1:%.*]]
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT:    br label [[LOOP:%.*]]
 ; SCALABLE:       loop:
-; SCALABLE-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; SCALABLE-NEXT:    [[OFFSET2:%.*]] = shl i64 [[I1]], 1
-; SCALABLE-NEXT:    [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]]
-; SCALABLE-NEXT:    [[X0:%.*]] = load i64, ptr [[Q2]], align 8
+; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; SCALABLE-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
+; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
+; SCALABLE-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
 ; SCALABLE-NEXT:    [[Y0:%.*]] = add i64 [[X0]], 1
-; SCALABLE-NEXT:    store i64 [[Y0]], ptr [[Q2]], align 8
-; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1
+; SCALABLE-NEXT:    store i64 [[Y0]], ptr [[Q0]], align 8
+; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; SCALABLE-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
 ; SCALABLE-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
 ; SCALABLE-NEXT:    [[Y1:%.*]] = add i64 [[X1]], 2
 ; SCALABLE-NEXT:    store i64 [[Y1]], ptr [[Q1]], align 8
-; SCALABLE-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP5:![0-9]+]]
+; SCALABLE-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
 ; SCALABLE:       exit:
 ; SCALABLE-NEXT:    ret void
 ;
@@ -360,42 +360,42 @@ exit:
 define void @load_store_factor3_i32(ptr %p) {
 ; CHECK-LABEL: @load_store_factor3_i32(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[I]], 0
-; CHECK-NEXT:    [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3
-; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[Q0]], align 4
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[TMP2]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> 
 ; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> 
 ; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> 
-; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1)
-; CHECK-NEXT:    [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2)
-; CHECK-NEXT:    [[TMP5:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3)
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> 
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> 
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <24 x i32> 
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP8]], <24 x i32> poison, <24 x i32> 
-; CHECK-NEXT:    store <24 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4
-; CHECK-NEXT:    [[NEXTI]] = add nuw i64 [[I]], 8
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1)
+; CHECK-NEXT:    [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2)
+; CHECK-NEXT:    [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3)
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP6]], <16 x i32> 
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <16 x i32> 
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <24 x i32> 
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP13]], <24 x i32> poison, <24 x i32> 
+; CHECK-NEXT:    store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP1:%.*]]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; CHECK-NEXT:    [[OFFSET3:%.*]] = mul i64 [[I1]], 3
-; CHECK-NEXT:    [[Q3:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET3]]
-; CHECK-NEXT:    [[X0:%.*]] = load i32, ptr [[Q3]], align 4
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[OFFSET0:%.*]] = mul i64 [[I]], 3
+; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
+; CHECK-NEXT:    [[X0:%.*]] = load i32, ptr [[Q0]], align 4
 ; CHECK-NEXT:    [[Y0:%.*]] = add i32 [[X0]], 1
-; CHECK-NEXT:    store i32 [[Y0]], ptr [[Q3]], align 4
-; CHECK-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1
+; CHECK-NEXT:    store i32 [[Y0]], ptr [[Q0]], align 4
+; CHECK-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; CHECK-NEXT:    [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
 ; CHECK-NEXT:    [[X1:%.*]] = load i32, ptr [[Q1]], align 4
 ; CHECK-NEXT:    [[Y1:%.*]] = add i32 [[X1]], 2
@@ -405,50 +405,50 @@ define void @load_store_factor3_i32(ptr %p) {
 ; CHECK-NEXT:    [[X2:%.*]] = load i32, ptr [[Q2]], align 4
 ; CHECK-NEXT:    [[Y2:%.*]] = add i32 [[X2]], 3
 ; CHECK-NEXT:    store i32 [[Y2]], ptr [[Q2]], align 4
-; CHECK-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
 ; FIXED-LABEL: @load_store_factor3_i32(
 ; FIXED-NEXT:  entry:
-; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; FIXED:       vector.ph:
-; FIXED-NEXT:    br label [[LOOP:%.*]]
+; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FIXED:       vector.body:
-; FIXED-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; FIXED-NEXT:    [[TMP0:%.*]] = add i64 [[I]], 0
-; FIXED-NEXT:    [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3
-; FIXED-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]]
-; FIXED-NEXT:    [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[Q0]], align 4
+; FIXED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FIXED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; FIXED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 3
+; FIXED-NEXT:    [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]]
+; FIXED-NEXT:    [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[TMP2]], align 4
 ; FIXED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> 
 ; FIXED-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> 
 ; FIXED-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> 
-; FIXED-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1)
-; FIXED-NEXT:    [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2)
-; FIXED-NEXT:    [[TMP5:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3)
-; FIXED-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> 
-; FIXED-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> 
-; FIXED-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <24 x i32> 
-; FIXED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP8]], <24 x i32> poison, <24 x i32> 
-; FIXED-NEXT:    store <24 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4
-; FIXED-NEXT:    [[NEXTI]] = add nuw i64 [[I]], 8
-; FIXED-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; FIXED-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]]
+; FIXED-NEXT:    [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1)
+; FIXED-NEXT:    [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2)
+; FIXED-NEXT:    [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3)
+; FIXED-NEXT:    [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP6]], <16 x i32> 
+; FIXED-NEXT:    [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <16 x i32> 
+; FIXED-NEXT:    [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <24 x i32> 
+; FIXED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP13]], <24 x i32> poison, <24 x i32> 
+; FIXED-NEXT:    store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4
+; FIXED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; FIXED-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; FIXED-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; FIXED:       middle.block:
 ; FIXED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; FIXED:       scalar.ph:
-; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; FIXED-NEXT:    br label [[LOOP1:%.*]]
+; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT:    br label [[LOOP:%.*]]
 ; FIXED:       loop:
-; FIXED-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; FIXED-NEXT:    [[OFFSET3:%.*]] = mul i64 [[I1]], 3
-; FIXED-NEXT:    [[Q3:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET3]]
-; FIXED-NEXT:    [[X0:%.*]] = load i32, ptr [[Q3]], align 4
+; FIXED-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; FIXED-NEXT:    [[OFFSET0:%.*]] = mul i64 [[I]], 3
+; FIXED-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
+; FIXED-NEXT:    [[X0:%.*]] = load i32, ptr [[Q0]], align 4
 ; FIXED-NEXT:    [[Y0:%.*]] = add i32 [[X0]], 1
-; FIXED-NEXT:    store i32 [[Y0]], ptr [[Q3]], align 4
-; FIXED-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1
+; FIXED-NEXT:    store i32 [[Y0]], ptr [[Q0]], align 4
+; FIXED-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; FIXED-NEXT:    [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
 ; FIXED-NEXT:    [[X1:%.*]] = load i32, ptr [[Q1]], align 4
 ; FIXED-NEXT:    [[Y1:%.*]] = add i32 [[X1]], 2
@@ -458,50 +458,50 @@ define void @load_store_factor3_i32(ptr %p) {
 ; FIXED-NEXT:    [[X2:%.*]] = load i32, ptr [[Q2]], align 4
 ; FIXED-NEXT:    [[Y2:%.*]] = add i32 [[X2]], 3
 ; FIXED-NEXT:    store i32 [[Y2]], ptr [[Q2]], align 4
-; FIXED-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; FIXED-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; FIXED-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP7:![0-9]+]]
+; FIXED-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; FIXED-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; FIXED-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
 ; FIXED:       exit:
 ; FIXED-NEXT:    ret void
 ;
 ; SCALABLE-LABEL: @load_store_factor3_i32(
 ; SCALABLE-NEXT:  entry:
-; SCALABLE-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; SCALABLE-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALABLE:       vector.ph:
-; SCALABLE-NEXT:    br label [[LOOP:%.*]]
+; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
-; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; SCALABLE-NEXT:    [[TMP0:%.*]] = add i64 [[I]], 0
-; SCALABLE-NEXT:    [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3
-; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]]
-; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[Q0]], align 4
+; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 3
+; SCALABLE-NEXT:    [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]]
+; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[TMP2]], align 4
 ; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> 
 ; SCALABLE-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> 
 ; SCALABLE-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> 
-; SCALABLE-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1)
-; SCALABLE-NEXT:    [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2)
-; SCALABLE-NEXT:    [[TMP5:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3)
-; SCALABLE-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> 
-; SCALABLE-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> 
-; SCALABLE-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <24 x i32> 
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP8]], <24 x i32> poison, <24 x i32> 
-; SCALABLE-NEXT:    store <24 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4
-; SCALABLE-NEXT:    [[NEXTI]] = add nuw i64 [[I]], 8
-; SCALABLE-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; SCALABLE-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]]
+; SCALABLE-NEXT:    [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1)
+; SCALABLE-NEXT:    [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2)
+; SCALABLE-NEXT:    [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3)
+; SCALABLE-NEXT:    [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP6]], <16 x i32> 
+; SCALABLE-NEXT:    [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <16 x i32> 
+; SCALABLE-NEXT:    [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <24 x i32> 
+; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP13]], <24 x i32> poison, <24 x i32> 
+; SCALABLE-NEXT:    store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; SCALABLE-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; SCALABLE-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; SCALABLE:       middle.block:
 ; SCALABLE-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; SCALABLE:       scalar.ph:
-; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; SCALABLE-NEXT:    br label [[LOOP1:%.*]]
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT:    br label [[LOOP:%.*]]
 ; SCALABLE:       loop:
-; SCALABLE-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; SCALABLE-NEXT:    [[OFFSET3:%.*]] = mul i64 [[I1]], 3
-; SCALABLE-NEXT:    [[Q3:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET3]]
-; SCALABLE-NEXT:    [[X0:%.*]] = load i32, ptr [[Q3]], align 4
+; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; SCALABLE-NEXT:    [[OFFSET0:%.*]] = mul i64 [[I]], 3
+; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
+; SCALABLE-NEXT:    [[X0:%.*]] = load i32, ptr [[Q0]], align 4
 ; SCALABLE-NEXT:    [[Y0:%.*]] = add i32 [[X0]], 1
-; SCALABLE-NEXT:    store i32 [[Y0]], ptr [[Q3]], align 4
-; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1
+; SCALABLE-NEXT:    store i32 [[Y0]], ptr [[Q0]], align 4
+; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; SCALABLE-NEXT:    [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
 ; SCALABLE-NEXT:    [[X1:%.*]] = load i32, ptr [[Q1]], align 4
 ; SCALABLE-NEXT:    [[Y1:%.*]] = add i32 [[X1]], 2
@@ -511,9 +511,9 @@ define void @load_store_factor3_i32(ptr %p) {
 ; SCALABLE-NEXT:    [[X2:%.*]] = load i32, ptr [[Q2]], align 4
 ; SCALABLE-NEXT:    [[Y2:%.*]] = add i32 [[X2]], 3
 ; SCALABLE-NEXT:    store i32 [[Y2]], ptr [[Q2]], align 4
-; SCALABLE-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP7:![0-9]+]]
+; SCALABLE-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
 ; SCALABLE:       exit:
 ; SCALABLE-NEXT:    ret void
 ;
@@ -550,42 +550,42 @@ exit:
 define void @load_store_factor3_i64(ptr %p) {
 ; CHECK-LABEL: @load_store_factor3_i64(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[I]], 0
-; CHECK-NEXT:    [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3
-; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[Q0]], align 8
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP2]], align 8
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> 
 ; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> 
 ; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> 
-; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1)
-; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2)
-; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3)
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> 
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> 
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <12 x i32> 
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP8]], <12 x i64> poison, <12 x i32> 
-; CHECK-NEXT:    store <12 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8
-; CHECK-NEXT:    [[NEXTI]] = add nuw i64 [[I]], 4
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1)
+; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2)
+; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3)
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP6]], <8 x i32> 
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <8 x i32> 
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> 
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> 
+; CHECK-NEXT:    store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP1:%.*]]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; CHECK-NEXT:    [[OFFSET3:%.*]] = mul i64 [[I1]], 3
-; CHECK-NEXT:    [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]]
-; CHECK-NEXT:    [[X0:%.*]] = load i64, ptr [[Q3]], align 8
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[OFFSET0:%.*]] = mul i64 [[I]], 3
+; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
+; CHECK-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
 ; CHECK-NEXT:    [[Y0:%.*]] = add i64 [[X0]], 1
-; CHECK-NEXT:    store i64 [[Y0]], ptr [[Q3]], align 8
-; CHECK-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1
+; CHECK-NEXT:    store i64 [[Y0]], ptr [[Q0]], align 8
+; CHECK-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; CHECK-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
 ; CHECK-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
 ; CHECK-NEXT:    [[Y1:%.*]] = add i64 [[X1]], 2
@@ -595,50 +595,50 @@ define void @load_store_factor3_i64(ptr %p) {
 ; CHECK-NEXT:    [[X2:%.*]] = load i64, ptr [[Q2]], align 8
 ; CHECK-NEXT:    [[Y2:%.*]] = add i64 [[X2]], 3
 ; CHECK-NEXT:    store i64 [[Y2]], ptr [[Q2]], align 8
-; CHECK-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
 ; FIXED-LABEL: @load_store_factor3_i64(
 ; FIXED-NEXT:  entry:
-; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; FIXED:       vector.ph:
-; FIXED-NEXT:    br label [[LOOP:%.*]]
+; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FIXED:       vector.body:
-; FIXED-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; FIXED-NEXT:    [[TMP0:%.*]] = add i64 [[I]], 0
-; FIXED-NEXT:    [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3
-; FIXED-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
-; FIXED-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[Q0]], align 8
+; FIXED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FIXED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; FIXED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 3
+; FIXED-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]]
+; FIXED-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP2]], align 8
 ; FIXED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> 
 ; FIXED-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> 
 ; FIXED-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> 
-; FIXED-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1)
-; FIXED-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2)
-; FIXED-NEXT:    [[TMP5:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3)
-; FIXED-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> 
-; FIXED-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> 
-; FIXED-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <12 x i32> 
-; FIXED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP8]], <12 x i64> poison, <12 x i32> 
-; FIXED-NEXT:    store <12 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8
-; FIXED-NEXT:    [[NEXTI]] = add nuw i64 [[I]], 4
-; FIXED-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; FIXED-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
+; FIXED-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1)
+; FIXED-NEXT:    [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2)
+; FIXED-NEXT:    [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3)
+; FIXED-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP6]], <8 x i32> 
+; FIXED-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <8 x i32> 
+; FIXED-NEXT:    [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> 
+; FIXED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> 
+; FIXED-NEXT:    store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8
+; FIXED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; FIXED-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; FIXED-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; FIXED:       middle.block:
 ; FIXED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; FIXED:       scalar.ph:
-; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; FIXED-NEXT:    br label [[LOOP1:%.*]]
+; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT:    br label [[LOOP:%.*]]
 ; FIXED:       loop:
-; FIXED-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; FIXED-NEXT:    [[OFFSET3:%.*]] = mul i64 [[I1]], 3
-; FIXED-NEXT:    [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]]
-; FIXED-NEXT:    [[X0:%.*]] = load i64, ptr [[Q3]], align 8
+; FIXED-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; FIXED-NEXT:    [[OFFSET0:%.*]] = mul i64 [[I]], 3
+; FIXED-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
+; FIXED-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
 ; FIXED-NEXT:    [[Y0:%.*]] = add i64 [[X0]], 1
-; FIXED-NEXT:    store i64 [[Y0]], ptr [[Q3]], align 8
-; FIXED-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1
+; FIXED-NEXT:    store i64 [[Y0]], ptr [[Q0]], align 8
+; FIXED-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; FIXED-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
 ; FIXED-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
 ; FIXED-NEXT:    [[Y1:%.*]] = add i64 [[X1]], 2
@@ -648,50 +648,50 @@ define void @load_store_factor3_i64(ptr %p) {
 ; FIXED-NEXT:    [[X2:%.*]] = load i64, ptr [[Q2]], align 8
 ; FIXED-NEXT:    [[Y2:%.*]] = add i64 [[X2]], 3
 ; FIXED-NEXT:    store i64 [[Y2]], ptr [[Q2]], align 8
-; FIXED-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; FIXED-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; FIXED-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP9:![0-9]+]]
+; FIXED-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; FIXED-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; FIXED-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
 ; FIXED:       exit:
 ; FIXED-NEXT:    ret void
 ;
 ; SCALABLE-LABEL: @load_store_factor3_i64(
 ; SCALABLE-NEXT:  entry:
-; SCALABLE-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; SCALABLE-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALABLE:       vector.ph:
-; SCALABLE-NEXT:    br label [[LOOP:%.*]]
+; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
-; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; SCALABLE-NEXT:    [[TMP0:%.*]] = add i64 [[I]], 0
-; SCALABLE-NEXT:    [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3
-; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
-; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[Q0]], align 8
+; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 3
+; SCALABLE-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]]
+; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP2]], align 8
 ; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> 
 ; SCALABLE-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> 
 ; SCALABLE-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> 
-; SCALABLE-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1)
-; SCALABLE-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2)
-; SCALABLE-NEXT:    [[TMP5:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3)
-; SCALABLE-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> 
-; SCALABLE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> 
-; SCALABLE-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <12 x i32> 
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP8]], <12 x i64> poison, <12 x i32> 
-; SCALABLE-NEXT:    store <12 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8
-; SCALABLE-NEXT:    [[NEXTI]] = add nuw i64 [[I]], 4
-; SCALABLE-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; SCALABLE-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
+; SCALABLE-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1)
+; SCALABLE-NEXT:    [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2)
+; SCALABLE-NEXT:    [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3)
+; SCALABLE-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP6]], <8 x i32> 
+; SCALABLE-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <8 x i32> 
+; SCALABLE-NEXT:    [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> 
+; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> 
+; SCALABLE-NEXT:    store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; SCALABLE-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; SCALABLE-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; SCALABLE:       middle.block:
 ; SCALABLE-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; SCALABLE:       scalar.ph:
-; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; SCALABLE-NEXT:    br label [[LOOP1:%.*]]
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT:    br label [[LOOP:%.*]]
 ; SCALABLE:       loop:
-; SCALABLE-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; SCALABLE-NEXT:    [[OFFSET3:%.*]] = mul i64 [[I1]], 3
-; SCALABLE-NEXT:    [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]]
-; SCALABLE-NEXT:    [[X0:%.*]] = load i64, ptr [[Q3]], align 8
+; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; SCALABLE-NEXT:    [[OFFSET0:%.*]] = mul i64 [[I]], 3
+; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
+; SCALABLE-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
 ; SCALABLE-NEXT:    [[Y0:%.*]] = add i64 [[X0]], 1
-; SCALABLE-NEXT:    store i64 [[Y0]], ptr [[Q3]], align 8
-; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1
+; SCALABLE-NEXT:    store i64 [[Y0]], ptr [[Q0]], align 8
+; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; SCALABLE-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
 ; SCALABLE-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
 ; SCALABLE-NEXT:    [[Y1:%.*]] = add i64 [[X1]], 2
@@ -701,9 +701,9 @@ define void @load_store_factor3_i64(ptr %p) {
 ; SCALABLE-NEXT:    [[X2:%.*]] = load i64, ptr [[Q2]], align 8
 ; SCALABLE-NEXT:    [[Y2:%.*]] = add i64 [[X2]], 3
 ; SCALABLE-NEXT:    store i64 [[Y2]], ptr [[Q2]], align 8
-; SCALABLE-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP9:![0-9]+]]
+; SCALABLE-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
 ; SCALABLE:       exit:
 ; SCALABLE-NEXT:    ret void
 ;
@@ -740,75 +740,56 @@ exit:
 define void @load_store_factor8(ptr %p) {
 ; CHECK-LABEL: @load_store_factor8(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[I]], 0
-; CHECK-NEXT:    [[OFFSET0:%.*]] = shl i64 [[TMP3]], 3
-; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { ,  } @llvm.vector.deinterleave2.nxv8i64( [[WIDE_VEC]])
-; CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 0
-; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 1
-; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = call { ,  } @llvm.vector.deinterleave2.nxv4i64( [[TMP6]])
-; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = call { ,  } @llvm.vector.deinterleave2.nxv4i64( [[TMP7]])
-; CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { ,  } [[STRIDED_VEC1]], 0
-; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { ,  } [[STRIDED_VEC2]], 0
-; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { ,  } [[STRIDED_VEC1]], 1
-; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { ,  } [[STRIDED_VEC2]], 1
-; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = call { ,  } @llvm.vector.deinterleave2.nxv2i64( [[TMP8]])
-; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = call { ,  } @llvm.vector.deinterleave2.nxv2i64( [[TMP9]])
-; CHECK-NEXT:    [[STRIDED_VEC5:%.*]] = call { ,  } @llvm.vector.deinterleave2.nxv2i64( [[TMP10]])
-; CHECK-NEXT:    [[STRIDED_VEC6:%.*]] = call { ,  } @llvm.vector.deinterleave2.nxv2i64( [[TMP11]])
-; CHECK-NEXT:    [[TMP12:%.*]] = extractvalue { ,  } [[STRIDED_VEC3]], 0
-; CHECK-NEXT:    [[TMP13:%.*]] = extractvalue { ,  } [[STRIDED_VEC4]], 0
-; CHECK-NEXT:    [[TMP14:%.*]] = extractvalue { ,  } [[STRIDED_VEC5]], 0
-; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { ,  } [[STRIDED_VEC6]], 0
-; CHECK-NEXT:    [[TMP16:%.*]] = extractvalue { ,  } [[STRIDED_VEC3]], 1
-; CHECK-NEXT:    [[TMP17:%.*]] = extractvalue { ,  } [[STRIDED_VEC4]], 1
-; CHECK-NEXT:    [[TMP18:%.*]] = extractvalue { ,  } [[STRIDED_VEC5]], 1
-; CHECK-NEXT:    [[TMP19:%.*]] = extractvalue { ,  } [[STRIDED_VEC6]], 1
-; CHECK-NEXT:    [[TMP20:%.*]] = add  [[TMP12]], splat (i64 1)
-; CHECK-NEXT:    [[TMP21:%.*]] = add  [[TMP13]], splat (i64 2)
-; CHECK-NEXT:    [[TMP22:%.*]] = add  [[TMP14]], splat (i64 3)
-; CHECK-NEXT:    [[TMP23:%.*]] = add  [[TMP15]], splat (i64 4)
-; CHECK-NEXT:    [[TMP24:%.*]] = add  [[TMP16]], splat (i64 5)
-; CHECK-NEXT:    [[TMP25:%.*]] = add  [[TMP17]], splat (i64 6)
-; CHECK-NEXT:    [[TMP26:%.*]] = add  [[TMP18]], splat (i64 7)
-; CHECK-NEXT:    [[TMP27:%.*]] = add  [[TMP19]], splat (i64 8)
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call  @llvm.vector.interleave2.nxv2i64( [[TMP20]],  [[TMP24]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC7:%.*]] = call  @llvm.vector.interleave2.nxv2i64( [[TMP21]],  [[TMP25]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC8:%.*]] = call  @llvm.vector.interleave2.nxv2i64( [[TMP22]],  [[TMP26]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC9:%.*]] = call  @llvm.vector.interleave2.nxv2i64( [[TMP23]],  [[TMP27]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC10:%.*]] = call  @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC]],  [[INTERLEAVED_VEC8]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC11:%.*]] = call  @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC7]],  [[INTERLEAVED_VEC9]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC12:%.*]] = call  @llvm.vector.interleave2.nxv8i64( [[INTERLEAVED_VEC10]],  [[INTERLEAVED_VEC11]])
-; CHECK-NEXT:    store  [[INTERLEAVED_VEC12]], ptr [[Q0]], align 8
-; CHECK-NEXT:    [[NEXTI]] = add nuw i64 [[I]], [[TMP2]]
-; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> 
+; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> 
+; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> 
+; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> 
+; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> 
+; CHECK-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> 
+; CHECK-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> 
+; CHECK-NEXT:    [[STRIDED_VEC7:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> 
+; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1)
+; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2)
+; CHECK-NEXT:    [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3)
+; CHECK-NEXT:    [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4)
+; CHECK-NEXT:    [[TMP12:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5)
+; CHECK-NEXT:    [[TMP14:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6)
+; CHECK-NEXT:    [[TMP16:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7)
+; CHECK-NEXT:    [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8)
+; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> 
+; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <4 x i32> 
+; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> 
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP16]], <2 x i64> [[TMP19]], <4 x i32> 
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x i64> [[TMP21]], <4 x i64> [[TMP22]], <8 x i32> 
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> [[TMP24]], <8 x i32> 
+; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <16 x i32> 
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP27]], <16 x i64> poison, <16 x i32> 
+; CHECK-NEXT:    store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP1:%.*]]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; CHECK-NEXT:    [[OFFSET8:%.*]] = shl i64 [[I1]], 3
-; CHECK-NEXT:    [[Q8:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET8]]
-; CHECK-NEXT:    [[X0:%.*]] = load i64, ptr [[Q8]], align 8
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 3
+; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
+; CHECK-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
 ; CHECK-NEXT:    [[Y0:%.*]] = add i64 [[X0]], 1
-; CHECK-NEXT:    store i64 [[Y0]], ptr [[Q8]], align 8
-; CHECK-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET8]], 1
+; CHECK-NEXT:    store i64 [[Y0]], ptr [[Q0]], align 8
+; CHECK-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; CHECK-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
 ; CHECK-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
 ; CHECK-NEXT:    [[Y1:%.*]] = add i64 [[X1]], 2
@@ -843,23 +824,23 @@ define void @load_store_factor8(ptr %p) {
 ; CHECK-NEXT:    [[X7:%.*]] = load i64, ptr [[Q7]], align 8
 ; CHECK-NEXT:    [[Y7:%.*]] = add i64 [[X7]], 8
 ; CHECK-NEXT:    store i64 [[Y7]], ptr [[Q7]], align 8
-; CHECK-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
 ; FIXED-LABEL: @load_store_factor8(
 ; FIXED-NEXT:  entry:
-; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; FIXED:       vector.ph:
-; FIXED-NEXT:    br label [[LOOP:%.*]]
+; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FIXED:       vector.body:
-; FIXED-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; FIXED-NEXT:    [[TMP0:%.*]] = add i64 [[I]], 0
-; FIXED-NEXT:    [[OFFSET0:%.*]] = shl i64 [[TMP0]], 3
-; FIXED-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
-; FIXED-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[Q0]], align 8
+; FIXED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FIXED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; FIXED-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 3
+; FIXED-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]]
+; FIXED-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP2]], align 8
 ; FIXED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> 
 ; FIXED-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> 
 ; FIXED-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> 
@@ -868,39 +849,39 @@ define void @load_store_factor8(ptr %p) {
 ; FIXED-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> 
 ; FIXED-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> 
 ; FIXED-NEXT:    [[STRIDED_VEC7:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> 
-; FIXED-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1)
-; FIXED-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2)
-; FIXED-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3)
-; FIXED-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4)
-; FIXED-NEXT:    [[TMP7:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5)
-; FIXED-NEXT:    [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6)
-; FIXED-NEXT:    [[TMP9:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7)
-; FIXED-NEXT:    [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8)
-; FIXED-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <4 x i32> 
-; FIXED-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> 
-; FIXED-NEXT:    [[TMP13:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> [[TMP8]], <4 x i32> 
-; FIXED-NEXT:    [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <4 x i32> 
-; FIXED-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> [[TMP12]], <8 x i32> 
-; FIXED-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i64> [[TMP13]], <4 x i64> [[TMP14]], <8 x i32> 
-; FIXED-NEXT:    [[TMP17:%.*]] = shufflevector <8 x i64> [[TMP15]], <8 x i64> [[TMP16]], <16 x i32> 
-; FIXED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP17]], <16 x i64> poison, <16 x i32> 
-; FIXED-NEXT:    store <16 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8
-; FIXED-NEXT:    [[NEXTI]] = add nuw i64 [[I]], 2
-; FIXED-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; FIXED-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
+; FIXED-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1)
+; FIXED-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2)
+; FIXED-NEXT:    [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3)
+; FIXED-NEXT:    [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4)
+; FIXED-NEXT:    [[TMP12:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5)
+; FIXED-NEXT:    [[TMP14:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6)
+; FIXED-NEXT:    [[TMP16:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7)
+; FIXED-NEXT:    [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8)
+; FIXED-NEXT:    [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> 
+; FIXED-NEXT:    [[TMP22:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <4 x i32> 
+; FIXED-NEXT:    [[TMP23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> 
+; FIXED-NEXT:    [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP16]], <2 x i64> [[TMP19]], <4 x i32> 
+; FIXED-NEXT:    [[TMP25:%.*]] = shufflevector <4 x i64> [[TMP21]], <4 x i64> [[TMP22]], <8 x i32> 
+; FIXED-NEXT:    [[TMP26:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> [[TMP24]], <8 x i32> 
+; FIXED-NEXT:    [[TMP27:%.*]] = shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <16 x i32> 
+; FIXED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP27]], <16 x i64> poison, <16 x i32> 
+; FIXED-NEXT:    store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8
+; FIXED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; FIXED-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; FIXED-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; FIXED:       middle.block:
 ; FIXED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; FIXED:       scalar.ph:
-; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; FIXED-NEXT:    br label [[LOOP1:%.*]]
+; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT:    br label [[LOOP:%.*]]
 ; FIXED:       loop:
-; FIXED-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; FIXED-NEXT:    [[OFFSET8:%.*]] = shl i64 [[I1]], 3
-; FIXED-NEXT:    [[Q8:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET8]]
-; FIXED-NEXT:    [[X0:%.*]] = load i64, ptr [[Q8]], align 8
+; FIXED-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; FIXED-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 3
+; FIXED-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
+; FIXED-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
 ; FIXED-NEXT:    [[Y0:%.*]] = add i64 [[X0]], 1
-; FIXED-NEXT:    store i64 [[Y0]], ptr [[Q8]], align 8
-; FIXED-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET8]], 1
+; FIXED-NEXT:    store i64 [[Y0]], ptr [[Q0]], align 8
+; FIXED-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; FIXED-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
 ; FIXED-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
 ; FIXED-NEXT:    [[Y1:%.*]] = add i64 [[X1]], 2
@@ -935,83 +916,64 @@ define void @load_store_factor8(ptr %p) {
 ; FIXED-NEXT:    [[X7:%.*]] = load i64, ptr [[Q7]], align 8
 ; FIXED-NEXT:    [[Y7:%.*]] = add i64 [[X7]], 8
 ; FIXED-NEXT:    store i64 [[Y7]], ptr [[Q7]], align 8
-; FIXED-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; FIXED-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; FIXED-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP11:![0-9]+]]
+; FIXED-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; FIXED-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; FIXED-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
 ; FIXED:       exit:
 ; FIXED-NEXT:    ret void
 ;
 ; SCALABLE-LABEL: @load_store_factor8(
 ; SCALABLE-NEXT:  entry:
-; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]]
-; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; SCALABLE-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALABLE:       vector.ph:
-; SCALABLE-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]]
-; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    br label [[LOOP:%.*]]
+; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
-; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; SCALABLE-NEXT:    [[TMP3:%.*]] = add i64 [[I]], 0
-; SCALABLE-NEXT:    [[OFFSET0:%.*]] = shl i64 [[TMP3]], 3
-; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
-; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8
-; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = call { ,  } @llvm.vector.deinterleave2.nxv8i64( [[WIDE_VEC]])
-; SCALABLE-NEXT:    [[TMP6:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 0
-; SCALABLE-NEXT:    [[TMP7:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 1
-; SCALABLE-NEXT:    [[STRIDED_VEC1:%.*]] = call { ,  } @llvm.vector.deinterleave2.nxv4i64( [[TMP6]])
-; SCALABLE-NEXT:    [[STRIDED_VEC2:%.*]] = call { ,  } @llvm.vector.deinterleave2.nxv4i64( [[TMP7]])
-; SCALABLE-NEXT:    [[TMP8:%.*]] = extractvalue { ,  } [[STRIDED_VEC1]], 0
-; SCALABLE-NEXT:    [[TMP9:%.*]] = extractvalue { ,  } [[STRIDED_VEC2]], 0
-; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { ,  } [[STRIDED_VEC1]], 1
-; SCALABLE-NEXT:    [[TMP11:%.*]] = extractvalue { ,  } [[STRIDED_VEC2]], 1
-; SCALABLE-NEXT:    [[STRIDED_VEC3:%.*]] = call { ,  } @llvm.vector.deinterleave2.nxv2i64( [[TMP8]])
-; SCALABLE-NEXT:    [[STRIDED_VEC4:%.*]] = call { ,  } @llvm.vector.deinterleave2.nxv2i64( [[TMP9]])
-; SCALABLE-NEXT:    [[STRIDED_VEC5:%.*]] = call { ,  } @llvm.vector.deinterleave2.nxv2i64( [[TMP10]])
-; SCALABLE-NEXT:    [[STRIDED_VEC6:%.*]] = call { ,  } @llvm.vector.deinterleave2.nxv2i64( [[TMP11]])
-; SCALABLE-NEXT:    [[TMP12:%.*]] = extractvalue { ,  } [[STRIDED_VEC3]], 0
-; SCALABLE-NEXT:    [[TMP13:%.*]] = extractvalue { ,  } [[STRIDED_VEC4]], 0
-; SCALABLE-NEXT:    [[TMP14:%.*]] = extractvalue { ,  } [[STRIDED_VEC5]], 0
-; SCALABLE-NEXT:    [[TMP15:%.*]] = extractvalue { ,  } [[STRIDED_VEC6]], 0
-; SCALABLE-NEXT:    [[TMP16:%.*]] = extractvalue { ,  } [[STRIDED_VEC3]], 1
-; SCALABLE-NEXT:    [[TMP17:%.*]] = extractvalue { ,  } [[STRIDED_VEC4]], 1
-; SCALABLE-NEXT:    [[TMP18:%.*]] = extractvalue { ,  } [[STRIDED_VEC5]], 1
-; SCALABLE-NEXT:    [[TMP19:%.*]] = extractvalue { ,  } [[STRIDED_VEC6]], 1
-; SCALABLE-NEXT:    [[TMP20:%.*]] = add  [[TMP12]], splat (i64 1)
-; SCALABLE-NEXT:    [[TMP21:%.*]] = add  [[TMP13]], splat (i64 2)
-; SCALABLE-NEXT:    [[TMP22:%.*]] = add  [[TMP14]], splat (i64 3)
-; SCALABLE-NEXT:    [[TMP23:%.*]] = add  [[TMP15]], splat (i64 4)
-; SCALABLE-NEXT:    [[TMP24:%.*]] = add  [[TMP16]], splat (i64 5)
-; SCALABLE-NEXT:    [[TMP25:%.*]] = add  [[TMP17]], splat (i64 6)
-; SCALABLE-NEXT:    [[TMP26:%.*]] = add  [[TMP18]], splat (i64 7)
-; SCALABLE-NEXT:    [[TMP27:%.*]] = add  [[TMP19]], splat (i64 8)
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = call  @llvm.vector.interleave2.nxv2i64( [[TMP20]],  [[TMP24]])
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC7:%.*]] = call  @llvm.vector.interleave2.nxv2i64( [[TMP21]],  [[TMP25]])
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC8:%.*]] = call  @llvm.vector.interleave2.nxv2i64( [[TMP22]],  [[TMP26]])
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC9:%.*]] = call  @llvm.vector.interleave2.nxv2i64( [[TMP23]],  [[TMP27]])
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC10:%.*]] = call  @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC]],  [[INTERLEAVED_VEC8]])
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC11:%.*]] = call  @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC7]],  [[INTERLEAVED_VEC9]])
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC12:%.*]] = call  @llvm.vector.interleave2.nxv8i64( [[INTERLEAVED_VEC10]],  [[INTERLEAVED_VEC11]])
-; SCALABLE-NEXT:    store  [[INTERLEAVED_VEC12]], ptr [[Q0]], align 8
-; SCALABLE-NEXT:    [[NEXTI]] = add nuw i64 [[I]], [[TMP2]]
-; SCALABLE-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]]
-; SCALABLE-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
+; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; SCALABLE-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 3
+; SCALABLE-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]]
+; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP2]], align 8
+; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> 
+; SCALABLE-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> 
+; SCALABLE-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> 
+; SCALABLE-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> 
+; SCALABLE-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> 
+; SCALABLE-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> 
+; SCALABLE-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> 
+; SCALABLE-NEXT:    [[STRIDED_VEC7:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> 
+; SCALABLE-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1)
+; SCALABLE-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2)
+; SCALABLE-NEXT:    [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3)
+; SCALABLE-NEXT:    [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4)
+; SCALABLE-NEXT:    [[TMP12:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5)
+; SCALABLE-NEXT:    [[TMP14:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6)
+; SCALABLE-NEXT:    [[TMP16:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7)
+; SCALABLE-NEXT:    [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8)
+; SCALABLE-NEXT:    [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> 
+; SCALABLE-NEXT:    [[TMP22:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <4 x i32> 
+; SCALABLE-NEXT:    [[TMP23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> 
+; SCALABLE-NEXT:    [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP16]], <2 x i64> [[TMP19]], <4 x i32> 
+; SCALABLE-NEXT:    [[TMP25:%.*]] = shufflevector <4 x i64> [[TMP21]], <4 x i64> [[TMP22]], <8 x i32> 
+; SCALABLE-NEXT:    [[TMP26:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> [[TMP24]], <8 x i32> 
+; SCALABLE-NEXT:    [[TMP27:%.*]] = shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <16 x i32> 
+; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP27]], <16 x i64> poison, <16 x i32> 
+; SCALABLE-NEXT:    store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; SCALABLE-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; SCALABLE-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; SCALABLE:       middle.block:
-; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; SCALABLE-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; SCALABLE:       scalar.ph:
-; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; SCALABLE-NEXT:    br label [[LOOP1:%.*]]
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT:    br label [[LOOP:%.*]]
 ; SCALABLE:       loop:
-; SCALABLE-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; SCALABLE-NEXT:    [[OFFSET8:%.*]] = shl i64 [[I1]], 3
-; SCALABLE-NEXT:    [[Q8:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET8]]
-; SCALABLE-NEXT:    [[X0:%.*]] = load i64, ptr [[Q8]], align 8
+; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; SCALABLE-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 3
+; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
+; SCALABLE-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
 ; SCALABLE-NEXT:    [[Y0:%.*]] = add i64 [[X0]], 1
-; SCALABLE-NEXT:    store i64 [[Y0]], ptr [[Q8]], align 8
-; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET8]], 1
+; SCALABLE-NEXT:    store i64 [[Y0]], ptr [[Q0]], align 8
+; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; SCALABLE-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
 ; SCALABLE-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
 ; SCALABLE-NEXT:    [[Y1:%.*]] = add i64 [[X1]], 2
@@ -1046,9 +1008,9 @@ define void @load_store_factor8(ptr %p) {
 ; SCALABLE-NEXT:    [[X7:%.*]] = load i64, ptr [[Q7]], align 8
 ; SCALABLE-NEXT:    [[Y7:%.*]] = add i64 [[X7]], 8
 ; SCALABLE-NEXT:    store i64 [[Y7]], ptr [[Q7]], align 8
-; SCALABLE-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP11:![0-9]+]]
+; SCALABLE-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
 ; SCALABLE:       exit:
 ; SCALABLE-NEXT:    ret void
 ;
@@ -1118,7 +1080,7 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
@@ -1126,94 +1088,94 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) {
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[I]], 0
-; CHECK-NEXT:    [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1
-; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 4
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { ,  } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]])
-; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 0
-; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 1
-; CHECK-NEXT:    [[TMP11:%.*]] = add  [[TMP9]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0
-; CHECK-NEXT:    store  [[TMP11]], ptr [[TMP13]], align 4
-; CHECK-NEXT:    [[NEXTI]] = add nuw i64 [[I]], [[TMP5]]
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 1
+; CHECK-NEXT:    [[TMP12:%.*]] = add  [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0
+; CHECK-NEXT:    store  [[TMP12]], ptr [[TMP14]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP1:%.*]]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; CHECK-NEXT:    [[OFFSET2:%.*]] = shl i64 [[I1]], 1
-; CHECK-NEXT:    [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]]
-; CHECK-NEXT:    [[X0:%.*]] = load i32, ptr [[Q2]], align 4
-; CHECK-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
+; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
+; CHECK-NEXT:    [[X0:%.*]] = load i32, ptr [[Q0]], align 4
+; CHECK-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; CHECK-NEXT:    [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
 ; CHECK-NEXT:    [[X1:%.*]] = load i32, ptr [[Q1]], align 4
 ; CHECK-NEXT:    [[RES:%.*]] = add i32 [[X0]], [[X1]]
-; CHECK-NEXT:    [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I1]]
+; CHECK-NEXT:    [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I]]
 ; CHECK-NEXT:    store i32 [[RES]], ptr [[DST]], align 4
-; CHECK-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
 ; FIXED-LABEL: @combine_load_factor2_i32(
 ; FIXED-NEXT:  entry:
-; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; FIXED:       vector.ph:
-; FIXED-NEXT:    br label [[LOOP:%.*]]
+; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FIXED:       vector.body:
-; FIXED-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; FIXED-NEXT:    [[TMP0:%.*]] = add i64 [[I]], 0
-; FIXED-NEXT:    [[TMP1:%.*]] = add i64 [[I]], 8
-; FIXED-NEXT:    [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1
+; FIXED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FIXED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; FIXED-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 8
+; FIXED-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP0]], 1
 ; FIXED-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP1]], 1
-; FIXED-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]]
+; FIXED-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP2]]
 ; FIXED-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP3]]
-; FIXED-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[Q0]], align 4
+; FIXED-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP4]], align 4
 ; FIXED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> 
-; FIXED-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> 
-; FIXED-NEXT:    [[WIDE_VEC2:%.*]] = load <16 x i32>, ptr [[TMP5]], align 4
-; FIXED-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> 
-; FIXED-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> 
-; FIXED-NEXT:    [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC1]]
-; FIXED-NEXT:    [[TMP7:%.*]] = add <8 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC4]]
-; FIXED-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP0]]
-; FIXED-NEXT:    [[TMP9:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0
-; FIXED-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[TMP8]], i32 8
-; FIXED-NEXT:    store <8 x i32> [[TMP6]], ptr [[TMP9]], align 4
-; FIXED-NEXT:    store <8 x i32> [[TMP7]], ptr [[TMP10]], align 4
-; FIXED-NEXT:    [[NEXTI]] = add nuw i64 [[I]], 16
-; FIXED-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; FIXED-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]]
+; FIXED-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> 
+; FIXED-NEXT:    [[WIDE_VEC1:%.*]] = load <16 x i32>, ptr [[TMP5]], align 4
+; FIXED-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> 
+; FIXED-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> 
+; FIXED-NEXT:    [[TMP8:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC3]]
+; FIXED-NEXT:    [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], [[STRIDED_VEC4]]
+; FIXED-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP0]]
+; FIXED-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0
+; FIXED-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[TMP10]], i32 8
+; FIXED-NEXT:    store <8 x i32> [[TMP8]], ptr [[TMP12]], align 4
+; FIXED-NEXT:    store <8 x i32> [[TMP9]], ptr [[TMP13]], align 4
+; FIXED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; FIXED-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; FIXED-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; FIXED:       middle.block:
 ; FIXED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; FIXED:       scalar.ph:
-; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; FIXED-NEXT:    br label [[LOOP1:%.*]]
+; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT:    br label [[LOOP:%.*]]
 ; FIXED:       loop:
-; FIXED-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; FIXED-NEXT:    [[OFFSET2:%.*]] = shl i64 [[I1]], 1
-; FIXED-NEXT:    [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]]
-; FIXED-NEXT:    [[X0:%.*]] = load i32, ptr [[Q2]], align 4
-; FIXED-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1
+; FIXED-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; FIXED-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
+; FIXED-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
+; FIXED-NEXT:    [[X0:%.*]] = load i32, ptr [[Q0]], align 4
+; FIXED-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; FIXED-NEXT:    [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
 ; FIXED-NEXT:    [[X1:%.*]] = load i32, ptr [[Q1]], align 4
 ; FIXED-NEXT:    [[RES:%.*]] = add i32 [[X0]], [[X1]]
-; FIXED-NEXT:    [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I1]]
+; FIXED-NEXT:    [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I]]
 ; FIXED-NEXT:    store i32 [[RES]], ptr [[DST]], align 4
-; FIXED-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; FIXED-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; FIXED-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP13:![0-9]+]]
+; FIXED-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; FIXED-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; FIXED-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
 ; FIXED:       exit:
 ; FIXED-NEXT:    ret void
 ;
@@ -1222,7 +1184,7 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) {
 ; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
 ; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALABLE:       vector.ph:
 ; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
@@ -1230,43 +1192,43 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) {
 ; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; SCALABLE-NEXT:    br label [[LOOP:%.*]]
+; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
-; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; SCALABLE-NEXT:    [[TMP6:%.*]] = add i64 [[I]], 0
-; SCALABLE-NEXT:    [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1
-; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]]
-; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 4
+; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; SCALABLE-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 1
+; SCALABLE-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]]
+; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 4
 ; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = call { ,  } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]])
-; SCALABLE-NEXT:    [[TMP9:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 0
-; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 1
-; SCALABLE-NEXT:    [[TMP11:%.*]] = add  [[TMP9]], [[TMP10]]
-; SCALABLE-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]]
-; SCALABLE-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0
-; SCALABLE-NEXT:    store  [[TMP11]], ptr [[TMP13]], align 4
-; SCALABLE-NEXT:    [[NEXTI]] = add nuw i64 [[I]], [[TMP5]]
-; SCALABLE-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]]
-; SCALABLE-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]]
+; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 0
+; SCALABLE-NEXT:    [[TMP11:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 1
+; SCALABLE-NEXT:    [[TMP12:%.*]] = add  [[TMP10]], [[TMP11]]
+; SCALABLE-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]]
+; SCALABLE-NEXT:    [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0
+; SCALABLE-NEXT:    store  [[TMP12]], ptr [[TMP14]], align 4
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; SCALABLE-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; SCALABLE:       middle.block:
 ; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; SCALABLE:       scalar.ph:
-; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; SCALABLE-NEXT:    br label [[LOOP1:%.*]]
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT:    br label [[LOOP:%.*]]
 ; SCALABLE:       loop:
-; SCALABLE-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; SCALABLE-NEXT:    [[OFFSET2:%.*]] = shl i64 [[I1]], 1
-; SCALABLE-NEXT:    [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]]
-; SCALABLE-NEXT:    [[X0:%.*]] = load i32, ptr [[Q2]], align 4
-; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1
+; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; SCALABLE-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
+; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
+; SCALABLE-NEXT:    [[X0:%.*]] = load i32, ptr [[Q0]], align 4
+; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; SCALABLE-NEXT:    [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
 ; SCALABLE-NEXT:    [[X1:%.*]] = load i32, ptr [[Q1]], align 4
 ; SCALABLE-NEXT:    [[RES:%.*]] = add i32 [[X0]], [[X1]]
-; SCALABLE-NEXT:    [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I1]]
+; SCALABLE-NEXT:    [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I]]
 ; SCALABLE-NEXT:    store i32 [[RES]], ptr [[DST]], align 4
-; SCALABLE-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP13:![0-9]+]]
+; SCALABLE-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
 ; SCALABLE:       exit:
 ; SCALABLE-NEXT:    ret void
 ;
@@ -1301,7 +1263,7 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
@@ -1309,94 +1271,94 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) {
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
-; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[I]], 0
-; CHECK-NEXT:    [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1
-; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 8
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { ,  } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_VEC]])
-; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 0
-; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 1
-; CHECK-NEXT:    [[TMP11:%.*]] = add  [[TMP9]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0
-; CHECK-NEXT:    store  [[TMP11]], ptr [[TMP13]], align 8
-; CHECK-NEXT:    [[NEXTI]] = add nuw i64 [[I]], [[TMP5]]
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { ,  } [[STRIDED_VEC]], 1
+; CHECK-NEXT:    [[TMP12:%.*]] = add  [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i64, ptr [[TMP13]], i32 0
+; CHECK-NEXT:    store  [[TMP12]], ptr [[TMP14]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP1:%.*]]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; CHECK-NEXT:    [[OFFSET2:%.*]] = shl i64 [[I1]], 1
-; CHECK-NEXT:    [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]]
-; CHECK-NEXT:    [[X0:%.*]] = load i64, ptr [[Q2]], align 8
-; CHECK-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
+; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
+; CHECK-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
+; CHECK-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; CHECK-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
 ; CHECK-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
 ; CHECK-NEXT:    [[RES:%.*]] = add i64 [[X0]], [[X1]]
-; CHECK-NEXT:    [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I1]]
+; CHECK-NEXT:    [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]]
 ; CHECK-NEXT:    store i64 [[RES]], ptr [[DST]], align 8
-; CHECK-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
 ; FIXED-LABEL: @combine_load_factor2_i64(
 ; FIXED-NEXT:  entry:
-; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; FIXED:       vector.ph:
-; FIXED-NEXT:    br label [[LOOP:%.*]]
+; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FIXED:       vector.body:
-; FIXED-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; FIXED-NEXT:    [[TMP0:%.*]] = add i64 [[I]], 0
-; FIXED-NEXT:    [[TMP1:%.*]] = add i64 [[I]], 4
-; FIXED-NEXT:    [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1
+; FIXED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FIXED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; FIXED-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
+; FIXED-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP0]], 1
 ; FIXED-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP1]], 1
-; FIXED-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
+; FIXED-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP2]]
 ; FIXED-NEXT:    [[TMP5:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]]
-; FIXED-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[Q0]], align 8
+; FIXED-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8
 ; FIXED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; FIXED-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; FIXED-NEXT:    [[WIDE_VEC2:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8
-; FIXED-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <8 x i64> [[WIDE_VEC2]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; FIXED-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <8 x i64> [[WIDE_VEC2]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; FIXED-NEXT:    [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC]], [[STRIDED_VEC1]]
-; FIXED-NEXT:    [[TMP7:%.*]] = add <4 x i64> [[STRIDED_VEC3]], [[STRIDED_VEC4]]
-; FIXED-NEXT:    [[TMP8:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP0]]
-; FIXED-NEXT:    [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i32 0
-; FIXED-NEXT:    [[TMP10:%.*]] = getelementptr i64, ptr [[TMP8]], i32 4
-; FIXED-NEXT:    store <4 x i64> [[TMP6]], ptr [[TMP9]], align 8
-; FIXED-NEXT:    store <4 x i64> [[TMP7]], ptr [[TMP10]], align 8
-; FIXED-NEXT:    [[NEXTI]] = add nuw i64 [[I]], 8
-; FIXED-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; FIXED-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]]
+; FIXED-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; FIXED-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8
+; FIXED-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; FIXED-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; FIXED-NEXT:    [[TMP8:%.*]] = add <4 x i64> [[STRIDED_VEC]], [[STRIDED_VEC3]]
+; FIXED-NEXT:    [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], [[STRIDED_VEC4]]
+; FIXED-NEXT:    [[TMP10:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP0]]
+; FIXED-NEXT:    [[TMP12:%.*]] = getelementptr i64, ptr [[TMP10]], i32 0
+; FIXED-NEXT:    [[TMP13:%.*]] = getelementptr i64, ptr [[TMP10]], i32 4
+; FIXED-NEXT:    store <4 x i64> [[TMP8]], ptr [[TMP12]], align 8
+; FIXED-NEXT:    store <4 x i64> [[TMP9]], ptr [[TMP13]], align 8
+; FIXED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; FIXED-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; FIXED-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; FIXED:       middle.block:
 ; FIXED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; FIXED:       scalar.ph:
-; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; FIXED-NEXT:    br label [[LOOP1:%.*]]
+; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT:    br label [[LOOP:%.*]]
 ; FIXED:       loop:
-; FIXED-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; FIXED-NEXT:    [[OFFSET2:%.*]] = shl i64 [[I1]], 1
-; FIXED-NEXT:    [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]]
-; FIXED-NEXT:    [[X0:%.*]] = load i64, ptr [[Q2]], align 8
-; FIXED-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1
+; FIXED-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; FIXED-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
+; FIXED-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
+; FIXED-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
+; FIXED-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; FIXED-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
 ; FIXED-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
 ; FIXED-NEXT:    [[RES:%.*]] = add i64 [[X0]], [[X1]]
-; FIXED-NEXT:    [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I1]]
+; FIXED-NEXT:    [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]]
 ; FIXED-NEXT:    store i64 [[RES]], ptr [[DST]], align 8
-; FIXED-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; FIXED-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; FIXED-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP15:![0-9]+]]
+; FIXED-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; FIXED-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; FIXED-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
 ; FIXED:       exit:
 ; FIXED-NEXT:    ret void
 ;
@@ -1405,7 +1367,7 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) {
 ; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
 ; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALABLE:       vector.ph:
 ; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
@@ -1413,43 +1375,43 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) {
 ; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
-; SCALABLE-NEXT:    br label [[LOOP:%.*]]
+; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
-; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; SCALABLE-NEXT:    [[TMP6:%.*]] = add i64 [[I]], 0
-; SCALABLE-NEXT:    [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1
-; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
-; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 4 x i64>, ptr [[Q0]], align 8
+; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; SCALABLE-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 1
+; SCALABLE-NEXT:    [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]]
+; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 4 x i64>, ptr [[TMP8]], align 8
 ; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[WIDE_VEC]])
-; SCALABLE-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
-; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
-; SCALABLE-NEXT:    [[TMP11:%.*]] = add <vscale x 2 x i64> [[TMP9]], [[TMP10]]
-; SCALABLE-NEXT:    [[TMP12:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]]
-; SCALABLE-NEXT:    [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0
-; SCALABLE-NEXT:    store <vscale x 2 x i64> [[TMP11]], ptr [[TMP13]], align 8
-; SCALABLE-NEXT:    [[NEXTI]] = add nuw i64 [[I]], [[TMP5]]
-; SCALABLE-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]]
-; SCALABLE-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]]
+; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
+; SCALABLE-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
+; SCALABLE-NEXT:    [[TMP12:%.*]] = add <vscale x 2 x i64> [[TMP10]], [[TMP11]]
+; SCALABLE-NEXT:    [[TMP13:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]]
+; SCALABLE-NEXT:    [[TMP14:%.*]] = getelementptr i64, ptr [[TMP13]], i32 0
+; SCALABLE-NEXT:    store <vscale x 2 x i64> [[TMP12]], ptr [[TMP14]], align 8
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; SCALABLE-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; SCALABLE:       middle.block:
 ; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; SCALABLE:       scalar.ph:
-; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; SCALABLE-NEXT:    br label [[LOOP1:%.*]]
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT:    br label [[LOOP:%.*]]
 ; SCALABLE:       loop:
-; SCALABLE-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; SCALABLE-NEXT:    [[OFFSET2:%.*]] = shl i64 [[I1]], 1
-; SCALABLE-NEXT:    [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]]
-; SCALABLE-NEXT:    [[X0:%.*]] = load i64, ptr [[Q2]], align 8
-; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1
+; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; SCALABLE-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
+; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
+; SCALABLE-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
+; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; SCALABLE-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
 ; SCALABLE-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
 ; SCALABLE-NEXT:    [[RES:%.*]] = add i64 [[X0]], [[X1]]
-; SCALABLE-NEXT:    [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I1]]
+; SCALABLE-NEXT:    [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]]
 ; SCALABLE-NEXT:    store i64 [[RES]], ptr [[DST]], align 8
-; SCALABLE-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP15:![0-9]+]]
+; SCALABLE-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
 ; SCALABLE:       exit:
 ; SCALABLE-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll
deleted file mode 100644
index b400b27df0839..0000000000000
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll
+++ /dev/null
@@ -1,135 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=loop-vectorize,interleaved-access -mattr=+sve -S -o - %s | FileCheck %s
-
-target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64"
-
-%struct.xyzt = type { i32, i32, i32, i32 }
-; for (int i = 0; i < 1024; ++i) {
-;   dst[i].x = a[i].x + b[i].x;
-;   dst[i].y = a[i].y - b[i].y;
-;   dst[i].z = a[i].z << b[i].z;
-;   dst[i].t = a[i].t >> b[i].t;
-; }
-
-define void @interleave_deinterleave(ptr noalias %dst, ptr %a, ptr %b) {
-; CHECK-LABEL: @interleave_deinterleave(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[TMP6]]
-; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1> splat (i1 true), ptr [[TMP7]])
-; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
-; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
-; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 2
-; CHECK-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 3
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[TMP6]]
-; CHECK-NEXT:    [[LDN9:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1> splat (i1 true), ptr [[TMP13]])
-; CHECK-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN9]], 0
-; CHECK-NEXT:    [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN9]], 1
-; CHECK-NEXT:    [[TMP18:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN9]], 2
-; CHECK-NEXT:    [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN9]], 3
-; CHECK-NEXT:    [[TMP20:%.*]] = add nsw <vscale x 4 x i32> [[TMP16]], [[TMP9]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST:%.*]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP22:%.*]] = sub nsw <vscale x 4 x i32> [[TMP10]], [[TMP17]]
-; CHECK-NEXT:    [[TMP23:%.*]] = shl <vscale x 4 x i32> [[TMP11]], [[TMP18]]
-; CHECK-NEXT:    [[TMP24:%.*]] = ashr <vscale x 4 x i32> [[TMP12]], [[TMP19]]
-; CHECK-NEXT:    call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> [[TMP20]], <vscale x 4 x i32> [[TMP22]], <vscale x 4 x i32> [[TMP23]], <vscale x 4 x i32> [[TMP24]], <vscale x 4 x i1> splat (i1 true), ptr [[TMP21]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP31:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP32:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP32]], [[TMP31]]
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4
-; CHECK-NEXT:    [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 4
-; CHECK-NEXT:    [[TMP33:%.*]] = load i32, ptr [[Y]], align 4
-; CHECK-NEXT:    [[Y11:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 4
-; CHECK-NEXT:    [[TMP26:%.*]] = load i32, ptr [[Y11]], align 4
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP33]], [[TMP26]]
-; CHECK-NEXT:    [[Y14:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 4
-; CHECK-NEXT:    store i32 [[SUB]], ptr [[Y14]], align 4
-; CHECK-NEXT:    [[Z:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 8
-; CHECK-NEXT:    [[TMP27:%.*]] = load i32, ptr [[Z]], align 4
-; CHECK-NEXT:    [[Z19:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 8
-; CHECK-NEXT:    [[TMP28:%.*]] = load i32, ptr [[Z19]], align 4
-; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[TMP27]], [[TMP28]]
-; CHECK-NEXT:    [[Z22:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 8
-; CHECK-NEXT:    store i32 [[SHL]], ptr [[Z22]], align 4
-; CHECK-NEXT:    [[T:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 12
-; CHECK-NEXT:    [[TMP29:%.*]] = load i32, ptr [[T]], align 4
-; CHECK-NEXT:    [[T27:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 12
-; CHECK-NEXT:    [[TMP30:%.*]] = load i32, ptr [[T27]], align 4
-; CHECK-NEXT:    [[SHR:%.*]] = ashr i32 [[TMP29]], [[TMP30]]
-; CHECK-NEXT:    [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 12
-; CHECK-NEXT:    store i32 [[SHR]], ptr [[T30]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
-entry:
-  br label %for.body
-
-for.body:
-  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds %struct.xyzt, ptr %a, i64 %indvars.iv
-  %0 = load i32, ptr %arrayidx, align 4
-  %arrayidx2 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %indvars.iv
-  %1 = load i32, ptr %arrayidx2, align 4
-  %add = add nsw i32 %1, %0
-  %arrayidx5 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %indvars.iv
-  store i32 %add, ptr %arrayidx5, align 4
-  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i64 4
-  %2 = load i32, ptr %y, align 4
-  %y11 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 4
-  %3 = load i32, ptr %y11, align 4
-  %sub = sub nsw i32 %2, %3
-  %y14 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 4
-  store i32 %sub, ptr %y14, align 4
-  %z = getelementptr inbounds nuw i8, ptr %arrayidx, i64 8
-  %4 = load i32, ptr %z, align 4
-  %z19 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 8
-  %5 = load i32, ptr %z19, align 4
-  %shl = shl i32 %4, %5
-  %z22 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 8
-  store i32 %shl, ptr %z22, align 4
-  %t = getelementptr inbounds nuw i8, ptr %arrayidx, i64 12
-  %6 = load i32, ptr %t, align 4
-  %t27 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 12
-  %7 = load i32, ptr %t27, align 4
-  %shr = ashr i32 %6, %7
-  %t30 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 12
-  store i32 %shr, ptr %t30, align 4
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, 1024
-  br i1 %exitcond.not, label %for.end, label %for.body
-
-for.end:
-  ret void
-}

From 28ae2ff2a44c0a5d09ce5e88d8d8f3309b6f127f Mon Sep 17 00:00:00 2001
From: B I Mohammed Abbas 
Date: Tue, 31 Dec 2024 03:44:43 +0530
Subject: [PATCH 194/567] Add truncxfhf2 with tests to compiler-rt (#120372)

Fixes #105181
---
 compiler-rt/lib/builtins/CMakeLists.txt       |  1 +
 compiler-rt/lib/builtins/truncxfhf2.c         | 15 ++++
 .../test/builtins/Unit/truncxfhf2_test.c      | 74 +++++++++++++++++++
 3 files changed, 90 insertions(+)
 create mode 100644 compiler-rt/lib/builtins/truncxfhf2.c
 create mode 100644 compiler-rt/test/builtins/Unit/truncxfhf2_test.c

diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt
index 3a868c11e7288..0581688c05466 100644
--- a/compiler-rt/lib/builtins/CMakeLists.txt
+++ b/compiler-rt/lib/builtins/CMakeLists.txt
@@ -310,6 +310,7 @@ set(x86_80_BIT_SOURCES
   mulxc3.c
   powixf2.c
   trunctfxf2.c
+  truncxfhf2.c
 )
 
 if (NOT MSVC)
diff --git a/compiler-rt/lib/builtins/truncxfhf2.c b/compiler-rt/lib/builtins/truncxfhf2.c
new file mode 100644
index 0000000000000..0f0639865dbfd
--- /dev/null
+++ b/compiler-rt/lib/builtins/truncxfhf2.c
@@ -0,0 +1,15 @@
+//===-- lib/truncxfhf2.c - long double -> half conversion ---------*- C -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define SRC_SINGLE
+#define DST_HALF
+#include "fp_trunc_impl.inc"
+
+COMPILER_RT_ABI dst_t __truncxfhf2(xf_float a) {
+  return __truncXfYf2__((float)a);
+}
diff --git a/compiler-rt/test/builtins/Unit/truncxfhf2_test.c b/compiler-rt/test/builtins/Unit/truncxfhf2_test.c
new file mode 100644
index 0000000000000..9038a91a5b4c1
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/truncxfhf2_test.c
@@ -0,0 +1,74 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_truncxfhf2
+
+#include <stdio.h>
+
+#include "fp_test.h"
+
+#if HAS_80_BIT_LONG_DOUBLE
+
+TYPE_FP16 __truncxfhf2(xf_float f);
+
+int test_truncxfhf2(uint16_t inputHi, uint64_t inputLo, uint16_t e) {
+  xf_float a = F80FromRep80(inputHi, inputLo);
+  TYPE_FP16 x = __truncxfhf2(a);
+  int ret = compareResultH(x, e);
+  if (ret) {
+    printf("error in test__truncxfhf2(%Lf) = %#.4x, "
+           "expected %#.4x\n",
+           a, toRep16(x), e);
+  }
+  return ret;
+}
+
+int main() {
+  // Small positive value
+  if (test_truncxfhf2(UINT16_C(0x3ffb), UINT64_C(0xccc0000000000000),
+                      UINT16_C(0x2e66)))
+    return 1;
+
+  // Small negative value
+  if (test_truncxfhf2(UINT16_C(0xbffb), UINT64_C(0xccc0000000000000),
+                      UINT16_C(0xae66)))
+    return 1;
+
+  // Zero
+  if (test_truncxfhf2(UINT16_C(0x0), UINT64_C(0x0), UINT16_C(0)))
+    return 1;
+
+  // Smallest positive non-zero value
+  if (test_truncxfhf2(UINT16_C(0x3fef), UINT64_C(0x8000000000000000),
+                      UINT16_C(0x0100)))
+    return 1;
+
+  // Smallest negative non-zero value
+  if (test_truncxfhf2(UINT16_C(0xbfef), UINT64_C(0x8000000000000000),
+                      UINT16_C(0x8100)))
+    return 1;
+
+  // Positive infinity
+  if (test_truncxfhf2(UINT16_C(0x7fff), UINT64_C(0x8000000000000000),
+                      UINT16_C(0x7c00U)))
+    return 1;
+
+  // Negative infinity
+  if (test_truncxfhf2(UINT16_C(0xffff), UINT64_C(0x8000000000000000),
+                      UINT16_C(0xfc00U)))
+    return 1;
+
+  // NaN
+  if (test_truncxfhf2(UINT16_C(0x7fff), UINT64_C(0xc000000000000000),
+                      UINT16_C(0x7e00U)))
+    return 1;
+
+  return 0;
+}
+
+#else
+
+int main() {
+  printf("skipped\n");
+  return 0;
+}
+
+#endif

From 70c9152f99818ffd0342260ae12d709268031235 Mon Sep 17 00:00:00 2001
From: Roland McGrath 
Date: Mon, 30 Dec 2024 14:35:31 -0800
Subject: [PATCH 195/567] [libc] Fix non-calls to cpp::is_complex_type_same
 (#121257)

Some uses were not actually calls, just references to the name.
---
 libc/test/UnitTest/FPMatcher.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/libc/test/UnitTest/FPMatcher.h b/libc/test/UnitTest/FPMatcher.h
index 9f2bae3279208..b8e240bf328ce 100644
--- a/libc/test/UnitTest/FPMatcher.h
+++ b/libc/test/UnitTest/FPMatcher.h
@@ -131,11 +131,11 @@ template <typename T> class CFPMatcher : public Matcher<T> {
     else if constexpr (cpp::is_complex_type_same<T, _Complex long double>())
       return matchComplex<long double>();
 #ifdef LIBC_TYPES_HAS_CFLOAT16
-    else if constexpr (cpp::is_complex_type_same<T, cfloat16>)
+    else if constexpr (cpp::is_complex_type_same<T, cfloat16>())
       return matchComplex<float16>();
 #endif
 #ifdef LIBC_TYPES_HAS_CFLOAT128
-    else if constexpr (cpp::is_complex_type_same<T, cfloat128>)
+    else if constexpr (cpp::is_complex_type_same<T, cfloat128>())
       return matchComplex<float128>();
 #endif
   }
   }
@@ -148,11 +148,11 @@ template <typename T> class CFPMatcher : public Matcher<T> {
     else if constexpr (cpp::is_complex_type_same<T, _Complex long double>())
       return explainErrorComplex<long double>();
 #ifdef LIBC_TYPES_HAS_CFLOAT16
-    else if constexpr (cpp::is_complex_type_same<T, cfloat16>)
+    else if constexpr (cpp::is_complex_type_same<T, cfloat16>())
       return explainErrorComplex<float16>();
 #endif
 #ifdef LIBC_TYPES_HAS_CFLOAT128
-    else if constexpr (cpp::is_complex_type_same<T, cfloat128>)
+    else if constexpr (cpp::is_complex_type_same<T, cfloat128>())
       return explainErrorComplex<float128>();
 #endif
   }
   }

From 0897373f1a329a7a02f8ce3c501a05d2f9c89390 Mon Sep 17 00:00:00 2001
From: Zequan Wu 
Date: Mon, 30 Dec 2024 14:39:33 -0800
Subject: [PATCH 196/567] [Clang][test] Relax checking for libclang_rt.asan.so
 and libclang_rt.asan_static.a on arm android. (#121361)

This fixes test breakage on clang bots. See comment
https://github.com/llvm/llvm-project/pull/121081#issuecomment-2565933062
---
 clang/test/Driver/sanitizer-ld.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/test/Driver/sanitizer-ld.c b/clang/test/Driver/sanitizer-ld.c
index a82c45136d7bf..71078342b3617 100644
--- a/clang/test/Driver/sanitizer-ld.c
+++ b/clang/test/Driver/sanitizer-ld.c
@@ -332,8 +332,8 @@
 // RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-ANDROID-SHARED-LIBASAN
 //
 // CHECK-ASAN-ANDROID-SHARED-LIBASAN-NOT: argument unused during compilation: '-shared-libsan'
-// CHECK-ASAN-ANDROID-SHARED-LIBASAN: libclang_rt.asan.so"
-// CHECK-ASAN-ANDROID-SHARED-LIBASAN: libclang_rt.asan_static.a"
+// CHECK-ASAN-ANDROID-SHARED-LIBASAN: libclang_rt.asan{{.*}}.so"
+// CHECK-ASAN-ANDROID-SHARED-LIBASAN: libclang_rt.asan_static{{.*}}.a"
 //
 // RUN: %clang -### %s 2>&1 \
 // RUN:     --target=arm-linux-androideabi -fuse-ld=ld -fsanitize=address \

From 9abcca5e25296aea49288ad63901f9e0a332dad4 Mon Sep 17 00:00:00 2001
From: Roland McGrath 
Date: Mon, 30 Dec 2024 15:36:53 -0800
Subject: [PATCH 197/567] [libc] Move hdrgen into utils/ subdirectory (#121256)

---
 libc/CMakeLists.txt                           |   2 -
 libc/cmake/modules/LLVMLibCHeaderRules.cmake  |   5 +-
 libc/docs/dev/header_generation.rst           |  27 ++---
 libc/docs/dev/source_tree_layout.rst          |  10 --
 libc/docs/full_cross_build.rst                |  14 +--
 libc/include/CMakeLists.txt                   | 110 +++++++++---------
 libc/src/math/docs/add_math_function.md       |   2 +-
 libc/utils/CMakeLists.txt                     |   2 +
 libc/{ => utils}/hdrgen/CMakeLists.txt        |   4 +-
 libc/utils/hdrgen/README.rst                  |   5 +
 .../classes/enumeration.py                    |   0
 .../class_implementation/classes/function.py  |   0
 .../class_implementation/classes/macro.py     |   0
 .../class_implementation/classes/object.py    |   0
 .../class_implementation/classes/type.py      |   0
 libc/{ => utils}/hdrgen/gpu_headers.py        |   0
 libc/{ => utils}/hdrgen/header.py             |   0
 .../tests/expected_output/test_header.h       |   0
 .../hdrgen/tests/input/test_small.h.def       |   0
 .../hdrgen/tests/input/test_small.yaml        |   0
 .../hdrgen/tests/test_integration.py          |  12 +-
 libc/{ => utils}/hdrgen/yaml/arpa/inet.yaml   |   0
 libc/{ => utils}/hdrgen/yaml/assert.yaml      |   0
 libc/{ => utils}/hdrgen/yaml/complex.yaml     |   0
 libc/{ => utils}/hdrgen/yaml/ctype.yaml       |   0
 libc/{ => utils}/hdrgen/yaml/dirent.yaml      |   0
 libc/{ => utils}/hdrgen/yaml/dlfcn.yaml       |   0
 libc/{ => utils}/hdrgen/yaml/elf.yaml         |   0
 libc/{ => utils}/hdrgen/yaml/errno.yaml       |   0
 libc/{ => utils}/hdrgen/yaml/fcntl.yaml       |   0
 libc/{ => utils}/hdrgen/yaml/features.yaml    |   0
 libc/{ => utils}/hdrgen/yaml/fenv.yaml        |   0
 libc/{ => utils}/hdrgen/yaml/float.yaml       |   0
 libc/{ => utils}/hdrgen/yaml/inttypes.yaml    |   0
 libc/{ => utils}/hdrgen/yaml/limits.yaml      |   0
 libc/{ => utils}/hdrgen/yaml/link.yaml        |   0
 libc/{ => utils}/hdrgen/yaml/locale.yaml      |   0
 libc/{ => utils}/hdrgen/yaml/malloc.yaml      |   0
 libc/{ => utils}/hdrgen/yaml/math.yaml        |   0
 libc/{ => utils}/hdrgen/yaml/pthread.yaml     |   0
 libc/{ => utils}/hdrgen/yaml/sched.yaml       |   0
 libc/{ => utils}/hdrgen/yaml/search.yaml      |   0
 libc/{ => utils}/hdrgen/yaml/setjmp.yaml      |   0
 libc/{ => utils}/hdrgen/yaml/signal.yaml      |   0
 libc/{ => utils}/hdrgen/yaml/spawn.yaml       |   0
 libc/{ => utils}/hdrgen/yaml/stdbit.yaml      |   0
 libc/{ => utils}/hdrgen/yaml/stdckdint.yaml   |   0
 libc/{ => utils}/hdrgen/yaml/stdfix.yaml      |   0
 libc/{ => utils}/hdrgen/yaml/stdint.yaml      |   0
 libc/{ => utils}/hdrgen/yaml/stdio.yaml       |   0
 libc/{ => utils}/hdrgen/yaml/stdlib.yaml      |   0
 libc/{ => utils}/hdrgen/yaml/string.yaml      |   0
 libc/{ => utils}/hdrgen/yaml/strings.yaml     |   0
 libc/{ => utils}/hdrgen/yaml/sys/auxv.yaml    |   0
 libc/{ => utils}/hdrgen/yaml/sys/epoll.yaml   |   0
 libc/{ => utils}/hdrgen/yaml/sys/ioctl.yaml   |   0
 libc/{ => utils}/hdrgen/yaml/sys/mman.yaml    |   0
 libc/{ => utils}/hdrgen/yaml/sys/prctl.yaml   |   0
 libc/{ => utils}/hdrgen/yaml/sys/random.yaml  |   0
 .../{ => utils}/hdrgen/yaml/sys/resource.yaml |   0
 libc/{ => utils}/hdrgen/yaml/sys/select.yaml  |   0
 .../{ => utils}/hdrgen/yaml/sys/sendfile.yaml |   0
 libc/{ => utils}/hdrgen/yaml/sys/socket.yaml  |   0
 libc/{ => utils}/hdrgen/yaml/sys/stat.yaml    |   0
 libc/{ => utils}/hdrgen/yaml/sys/statvfs.yaml |   0
 libc/{ => utils}/hdrgen/yaml/sys/syscall.yaml |   0
 libc/{ => utils}/hdrgen/yaml/sys/time.yaml    |   0
 libc/{ => utils}/hdrgen/yaml/sys/types.yaml   |   0
 libc/{ => utils}/hdrgen/yaml/sys/utsname.yaml |   0
 libc/{ => utils}/hdrgen/yaml/sys/wait.yaml    |   0
 libc/{ => utils}/hdrgen/yaml/termios.yaml     |   0
 libc/{ => utils}/hdrgen/yaml/threads.yaml     |   0
 libc/{ => utils}/hdrgen/yaml/time.yaml        |   0
 libc/{ => utils}/hdrgen/yaml/uchar.yaml       |   0
 libc/{ => utils}/hdrgen/yaml/unistd.yaml      |   0
 libc/{ => utils}/hdrgen/yaml/wchar.yaml       |   0
 .../hdrgen/yaml_functions_sorted.py           |   0
 libc/{ => utils}/hdrgen/yaml_to_classes.py    |   0
 78 files changed, 95 insertions(+), 98 deletions(-)
 rename libc/{ => utils}/hdrgen/CMakeLists.txt (68%)
 create mode 100644 libc/utils/hdrgen/README.rst
 rename libc/{ => utils}/hdrgen/class_implementation/classes/enumeration.py (100%)
 rename libc/{ => utils}/hdrgen/class_implementation/classes/function.py (100%)
 rename libc/{ => utils}/hdrgen/class_implementation/classes/macro.py (100%)
 rename libc/{ => utils}/hdrgen/class_implementation/classes/object.py (100%)
 rename libc/{ => utils}/hdrgen/class_implementation/classes/type.py (100%)
 rename libc/{ => utils}/hdrgen/gpu_headers.py (100%)
 rename libc/{ => utils}/hdrgen/header.py (100%)
 rename libc/{ => utils}/hdrgen/tests/expected_output/test_header.h (100%)
 rename libc/{ => utils}/hdrgen/tests/input/test_small.h.def (100%)
 rename libc/{ => utils}/hdrgen/tests/input/test_small.yaml (100%)
 rename libc/{ => utils}/hdrgen/tests/test_integration.py (84%)
 rename libc/{ => utils}/hdrgen/yaml/arpa/inet.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/assert.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/complex.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/ctype.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/dirent.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/dlfcn.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/elf.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/errno.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/fcntl.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/features.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/fenv.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/float.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/inttypes.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/limits.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/link.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/locale.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/malloc.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/math.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/pthread.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/sched.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/search.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/setjmp.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/signal.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/spawn.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/stdbit.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/stdckdint.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/stdfix.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/stdint.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/stdio.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/stdlib.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/string.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/strings.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/sys/auxv.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/sys/epoll.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/sys/ioctl.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/sys/mman.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/sys/prctl.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/sys/random.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/sys/resource.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/sys/select.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/sys/sendfile.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/sys/socket.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/sys/stat.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/sys/statvfs.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/sys/syscall.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/sys/time.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/sys/types.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/sys/utsname.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/sys/wait.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/termios.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/threads.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/time.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/uchar.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/unistd.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml/wchar.yaml (100%)
 rename libc/{ => utils}/hdrgen/yaml_functions_sorted.py (100%)
 rename libc/{ => utils}/hdrgen/yaml_to_classes.py (100%)

diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt
index 00a07ea3c8ac7..6f1c180a3f192 100644
--- a/libc/CMakeLists.txt
+++ b/libc/CMakeLists.txt
@@ -64,8 +64,6 @@ if(LIBC_BUILD_GPU_LOADER OR ((NOT LLVM_RUNTIMES_BUILD) AND LLVM_LIBC_GPU_BUILD))
   return()
 endif()
 
-add_subdirectory(hdrgen)
-
 option(LIBC_CMAKE_VERBOSE_LOGGING
   "Log details warnings and notifications during CMake configuration." OFF)
 
diff --git a/libc/cmake/modules/LLVMLibCHeaderRules.cmake b/libc/cmake/modules/LLVMLibCHeaderRules.cmake
index 31a88f0ef93be..0de5e14359cfb 100644
--- a/libc/cmake/modules/LLVMLibCHeaderRules.cmake
+++ b/libc/cmake/modules/LLVMLibCHeaderRules.cmake
@@ -110,9 +110,10 @@ function(add_gen_header target_name)
   set(entry_points "${TARGET_ENTRYPOINT_NAME_LIST}")
   list(TRANSFORM entry_points PREPEND "--e=")
 
+  set(LIBC_HDRGEN "${LIBC_SOURCE_DIR}/utils/hdrgen/yaml_to_classes.py")
   add_custom_command(
     OUTPUT ${out_file}
-    COMMAND ${Python3_EXECUTABLE} ${LIBC_SOURCE_DIR}/hdrgen/yaml_to_classes.py
+    COMMAND ${Python3_EXECUTABLE} ${LIBC_HDRGEN}
             ${yaml_file}
             --h_def_file ${def_file}
             ${entry_points}
@@ -126,7 +127,7 @@ function(add_gen_header target_name)
     set(decl_out_file ${LIBC_INCLUDE_DIR}/llvm-libc-decls/${relative_path})
     add_custom_command(
       OUTPUT ${decl_out_file}
-      COMMAND ${Python3_EXECUTABLE} ${LIBC_SOURCE_DIR}/hdrgen/yaml_to_classes.py
+      COMMAND ${Python3_EXECUTABLE} ${LIBC_HDRGEN}
               ${yaml_file}
               --export-decls
               ${entry_points}
diff --git a/libc/docs/dev/header_generation.rst b/libc/docs/dev/header_generation.rst
index 2c586cc87b699..17a8d7af3a2c8 100644
--- a/libc/docs/dev/header_generation.rst
+++ b/libc/docs/dev/header_generation.rst
@@ -44,15 +44,15 @@ To add through the command line:
 
    .. code-block:: none
 
-     python3 libc/hdrgen/yaml_to_classes.py
-     libc/hdrgen/yaml/[yaml_file.yaml] --add_function ""  ""   
+     python3 libc/utils/hdrgen/yaml_to_classes.py
+     libc/utils/hdrgen/yaml/[yaml_file.yaml] --add_function ""  ""   
 
    Example:
 
    .. code-block:: none
 
-      python3 libc/hdrgen/yaml_to_classes.py
-      libc/hdrgen/yaml/ctype.yaml --add_function "char" example_function
+      python3 libc/utils/hdrgen/yaml_to_classes.py
+      libc/utils/hdrgen/yaml/ctype.yaml --add_function "char" example_function
       "int, void, const void" stdc example_float example_attribute
 
    Keep in mind only the return_type and arguments have quotes around them. If
@@ -62,7 +62,8 @@ To add through the command line:
    generated header file with the new addition in the hdrgen directory to
    examine.
 
-If you want to sort the functions alphabetically you can check out libc/hdrgen/yaml_functions_sorted.py.
+If you want to sort the functions alphabetically you can check out
+libc/utils/hdrgen/yaml_functions_sorted.py.
 
 
 Testing
@@ -75,10 +76,10 @@ ensures the process of YAML to classes to generate headers works properly. If
 there are any new additions on formatting headers, make sure the test is
 updated with the specific addition.
 
-Integration Test can be found in: ``libc/hdrgen/tests/test_integration.py``
+Integration Test can be found in: ``libc/utils/hdrgen/tests/test_integration.py``
 
 File to modify if adding something to formatting:
-``libc/hdrgen/tests/expected_output/test_header.h``
+``libc/utils/hdrgen/tests/expected_output/test_header.h``
 
 
 Common Errors
@@ -89,7 +90,7 @@ Common Errors
 
    .. code-block:: none
 
-      "/llvm-project/libc/hdrgen/yaml_to_classes.py", line 67, in yaml_to_classes function_data["return_type"]
+      "/llvm-project/libc/utils/hdrgen/yaml_to_classes.py", line 67, in yaml_to_classes function_data["return_type"]
 
    If you receive this error or any error pertaining to
    ``function_data[function_specific_component]`` while building the headers
@@ -117,7 +118,7 @@ Common Errors
    missing. Ensure the correct style and required files are present:
 
    | ``[header_name]``
-   | ``[../libc/hdrgen/yaml/[yaml_file.yaml]``
+   | ``[../libc/utils/hdrgen/yaml/[yaml_file.yaml]``
    | ``[header_name.h.def]``
    | ``[header_name.h]``
    | ``DEPENDS``
@@ -147,13 +148,13 @@ Common Errors
 
    .. code-block:: none
 
-     File "/llvm-project/libc/hdrgen/header.py", line 60, in __str__ for
+     File "/llvm-project/libc/utils/hdrgen/header.py", line 60, in __str__ for
      function in self.functions: AttributeError: 'HeaderFile' object has no
      attribute 'functions'
 
    When running ``ninja libc`` in the build directory to generate headers you
    may receive the error above. Essentially this means that in
-   ``libc/hdrgen/header.py`` there is a missing attribute named functions.
+   ``libc/utils/hdrgen/header.py`` there is a missing attribute named functions.
    Make sure all function components are defined within this file and there are
    no missing functions to add these components.
 
@@ -184,12 +185,12 @@ Common Errors
    Sometimes the integration test will fail but that
    still means the process is working unless the comparison between the output
    and expected_output is not showing. If that is the case make sure in
-   ``libc/hdrgen/tests/test_integration.py`` there are no missing arguments
+   ``libc/utils/hdrgen/tests/test_integration.py`` there are no missing arguments
    that run through the script.
 
    If the integration tests are failing due to mismatching of lines or small
    errors in spacing that is nothing to worry about. If this is happening while
    you are making a new change to the formatting of the headers, then
    ensure the expected output file
-   ``libc/hdrgen/tests/expected_output/test_header.h`` has the changes you
+   ``libc/utils/hdrgen/tests/expected_output/test_header.h`` has the changes you
    are applying.
diff --git a/libc/docs/dev/source_tree_layout.rst b/libc/docs/dev/source_tree_layout.rst
index bd9d6ca453e08..62c0434a0b2aa 100644
--- a/libc/docs/dev/source_tree_layout.rst
+++ b/libc/docs/dev/source_tree_layout.rst
@@ -15,7 +15,6 @@ directories::
         - examples
         - fuzzing
         - hdr
-        - hdrgen
         - include
         - lib
         - src
@@ -88,15 +87,6 @@ The ``lib`` directory
 This directory contains a ``CMakeLists.txt`` file listing the targets for the
 public libraries ``libc.a``, ``libm.a`` etc.
 
-The ``hdrgen`` directory
----------------------------
-
-This directory contains the sources and specifications for the types, macros
-and entrypoint functions. These definitions are organized in the ``yaml``
-subdirectory and match the organization of the ``*.h.def`` files. This folder
-also contains the python sources for headergen, which is what generates the
-headers.
-
 The ``src`` directory
 ---------------------
 
diff --git a/libc/docs/full_cross_build.rst b/libc/docs/full_cross_build.rst
index 5f57169d228ef..cd1ec89e5d5e9 100644
--- a/libc/docs/full_cross_build.rst
+++ b/libc/docs/full_cross_build.rst
@@ -8,7 +8,7 @@ Full Cross Build
    :depth: 1
    :local:
 
-.. note:: 
+.. note::
    Fullbuild requires running headergen, which is a python program that depends on
    pyyaml. The minimum versions are listed on the :ref:`header_generation`
    page, as well as additional information.
@@ -95,8 +95,8 @@ configure step.
 Bootstrap cross build
 =====================
 
-In this recipe, the clang compiler and the ``libc-hdrgen`` binary, both are
-built automatically before building the libc for the target.
+In this recipe, the clang compiler is built automatically before building
+the libc for the target.
 
 CMake configure step
 --------------------
@@ -151,8 +151,8 @@ built using any of the three recipes described above.
 Building for the GPU
 ====================
 
-To build for a GPU architecture, it should only be necessary to specify the 
-target triple as one of the supported GPU targets. Currently, this is either 
-``nvptx64-nvidia-cuda`` for NVIDIA GPUs or ``amdgcn-amd-amdhsa`` for AMD GPUs. 
-More detailed information is provided in the :ref:`GPU 
+To build for a GPU architecture, it should only be necessary to specify the
+target triple as one of the supported GPU targets. Currently, this is either
+``nvptx64-nvidia-cuda`` for NVIDIA GPUs or ``amdgcn-amd-amdhsa`` for AMD GPUs.
+More detailed information is provided in the :ref:`GPU
 documentation`.
diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt
index 3a05c01abba5a..e490840cafedb 100644
--- a/libc/include/CMakeLists.txt
+++ b/libc/include/CMakeLists.txt
@@ -32,7 +32,7 @@ endmacro()
 
 add_header_macro(
   ctype
-  ../libc/hdrgen/yaml/ctype.yaml
+  ../libc/utils/hdrgen/yaml/ctype.yaml
   ctype.h.def
   ctype.h
   DEPENDS
@@ -42,7 +42,7 @@ add_header_macro(
 
 add_header_macro(
   dirent
-  ../libc/hdrgen/yaml/dirent.yaml
+  ../libc/utils/hdrgen/yaml/dirent.yaml
   dirent.h.def
   dirent.h
   DEPENDS
@@ -54,7 +54,7 @@ add_header_macro(
 
 add_header_macro(
   fcntl
-  ../libc/hdrgen/yaml/fcntl.yaml
+  ../libc/utils/hdrgen/yaml/fcntl.yaml
   fcntl.h.def
   fcntl.h
   DEPENDS
@@ -70,7 +70,7 @@ add_header_macro(
 
 add_header_macro(
   dlfcn
-  ../libc/hdrgen/yaml/dlfcn.yaml
+  ../libc/utils/hdrgen/yaml/dlfcn.yaml
   dlfcn.h.def
   dlfcn.h
   DEPENDS
@@ -80,7 +80,7 @@ add_header_macro(
 
 add_header_macro(
   features
-  ../libc/hdrgen/yaml/features.yaml
+  ../libc/utils/hdrgen/yaml/features.yaml
   features.h.def
   features.h
   DEPENDS
@@ -90,7 +90,7 @@ add_header_macro(
 
 add_header_macro(
   fenv
-  ../libc/hdrgen/yaml/fenv.yaml
+  ../libc/utils/hdrgen/yaml/fenv.yaml
   fenv.h.def
   fenv.h
   DEPENDS
@@ -102,7 +102,7 @@ add_header_macro(
 
 add_header_macro(
   inttypes
-  ../libc/hdrgen/yaml/inttypes.yaml
+  ../libc/utils/hdrgen/yaml/inttypes.yaml
   inttypes.h.def
   inttypes.h
   DEPENDS
@@ -113,7 +113,7 @@ add_header_macro(
 
 add_header_macro(
   float
-  ../libc/hdrgen/yaml/float.yaml
+  ../libc/utils/hdrgen/yaml/float.yaml
   float.h.def
   float.h
   DEPENDS
@@ -122,7 +122,7 @@ add_header_macro(
 
 add_header_macro(
   stdint
-  ../libc/hdrgen/yaml/stdint.yaml
+  ../libc/utils/hdrgen/yaml/stdint.yaml
   stdint.h.def
   stdint.h
   DEPENDS
@@ -131,7 +131,7 @@ add_header_macro(
 
 add_header_macro(
   limits
-  ../libc/hdrgen/yaml/limits.yaml
+  ../libc/utils/hdrgen/yaml/limits.yaml
   limits.h.def
   limits.h
   DEPENDS
@@ -140,7 +140,7 @@ add_header_macro(
 
 add_header_macro(
   malloc
-  ../libc/hdrgen/yaml/malloc.yaml
+  ../libc/utils/hdrgen/yaml/malloc.yaml
   malloc.h.def
   malloc.h
   DEPENDS
@@ -150,7 +150,7 @@ add_header_macro(
 
 add_header_macro(
   math
-  ../libc/hdrgen/yaml/math.yaml
+  ../libc/utils/hdrgen/yaml/math.yaml
   math.h.def
   math.h
   DEPENDS
@@ -165,7 +165,7 @@ add_header_macro(
 
 add_header_macro(
   stdfix
-  ../libc/hdrgen/yaml/stdfix.yaml
+  ../libc/utils/hdrgen/yaml/stdfix.yaml
   stdfix.h.def
   stdfix.h
   DEPENDS
@@ -178,7 +178,7 @@ file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/arpa)
 
 add_header_macro(
   arpa_inet
-  ../libc/hdrgen/yaml/arpa/inet.yaml
+  ../libc/utils/hdrgen/yaml/arpa/inet.yaml
   arpa/inet.h.def
   arpa/inet.h
   DEPENDS
@@ -187,7 +187,7 @@ add_header_macro(
 
 add_header_macro(
   assert
-  ../libc/hdrgen/yaml/assert.yaml
+  ../libc/utils/hdrgen/yaml/assert.yaml
   assert.h.def
   assert.h
   DEPENDS
@@ -197,7 +197,7 @@ add_header_macro(
 
 add_header_macro(
   complex
-  ../libc/hdrgen/yaml/complex.yaml
+  ../libc/utils/hdrgen/yaml/complex.yaml
   complex.h.def
   complex.h
   DEPENDS
@@ -207,7 +207,7 @@ add_header_macro(
 
 add_header_macro(
   setjmp
-  ../libc/hdrgen/yaml/setjmp.yaml
+  ../libc/utils/hdrgen/yaml/setjmp.yaml
   setjmp.h.def
   setjmp.h
   DEPENDS
@@ -217,7 +217,7 @@ add_header_macro(
 
 add_header_macro(
   string
-  ../libc/hdrgen/yaml/string.yaml
+  ../libc/utils/hdrgen/yaml/string.yaml
   string.h.def
   string.h
   DEPENDS
@@ -228,7 +228,7 @@ add_header_macro(
 
 add_header_macro(
   strings
-  ../libc/hdrgen/yaml/strings.yaml
+  ../libc/utils/hdrgen/yaml/strings.yaml
   strings.h.def
   strings.h
   DEPENDS
@@ -238,7 +238,7 @@ add_header_macro(
 
 add_header_macro(
   search
-  ../libc/hdrgen/yaml/search.yaml
+  ../libc/utils/hdrgen/yaml/search.yaml
   search.h.def
   search.h
   DEPENDS
@@ -252,7 +252,7 @@ add_header_macro(
 
 add_header_macro(
   time
-  ../libc/hdrgen/yaml/time.yaml
+  ../libc/utils/hdrgen/yaml/time.yaml
   time.h.def
   time.h
   DEPENDS
@@ -268,7 +268,7 @@ add_header_macro(
 
 add_header_macro(
   threads
-  ../libc/hdrgen/yaml/threads.yaml
+  ../libc/utils/hdrgen/yaml/threads.yaml
   threads.h.def
   threads.h
   DEPENDS
@@ -285,7 +285,7 @@ add_header_macro(
 
 add_header_macro(
   errno
-  ../libc/hdrgen/yaml/errno.yaml
+  ../libc/utils/hdrgen/yaml/errno.yaml
   errno.h.def
   errno.h
   DEPENDS
@@ -295,7 +295,7 @@ add_header_macro(
 
 add_header_macro(
   signal
-  ../libc/hdrgen/yaml/signal.yaml
+  ../libc/utils/hdrgen/yaml/signal.yaml
   signal.h.def
   signal.h
   DEPENDS
@@ -311,7 +311,7 @@ add_header_macro(
 
 add_header_macro(
   stdbit
-  ../libc/hdrgen/yaml/stdbit.yaml
+  ../libc/utils/hdrgen/yaml/stdbit.yaml
   stdbit.h.def
   stdbit.h
   DEPENDS
@@ -321,7 +321,7 @@ add_header_macro(
 
 add_header_macro(
   stdckdint
-  ../libc/hdrgen/yaml/stdckdint.yaml
+  ../libc/utils/hdrgen/yaml/stdckdint.yaml
   stdckdint.h.def
   stdckdint.h
   DEPENDS
@@ -331,7 +331,7 @@ add_header_macro(
 
 add_header_macro(
   stdio
-  ../libc/hdrgen/yaml/stdio.yaml
+  ../libc/utils/hdrgen/yaml/stdio.yaml
   stdio.h.def
   stdio.h
   DEPENDS
@@ -347,7 +347,7 @@ add_header_macro(
 
 add_header_macro(
   stdlib
-  ../libc/hdrgen/yaml/stdlib.yaml
+  ../libc/utils/hdrgen/yaml/stdlib.yaml
   stdlib.h.def
   stdlib.h
   DEPENDS
@@ -366,7 +366,7 @@ add_header_macro(
 
 add_header_macro(
   unistd
-  ../libc/hdrgen/yaml/unistd.yaml
+  ../libc/utils/hdrgen/yaml/unistd.yaml
   unistd.h.def
   unistd.h
   DEPENDS
@@ -385,7 +385,7 @@ add_header_macro(
 
 add_header_macro(
   pthread
-  ../libc/hdrgen/yaml/pthread.yaml
+  ../libc/utils/hdrgen/yaml/pthread.yaml
   pthread.h.def
   pthread.h
   DEPENDS
@@ -409,7 +409,7 @@ add_header_macro(
 
 add_header_macro(
   sched
-  ../libc/hdrgen/yaml/sched.yaml
+  ../libc/utils/hdrgen/yaml/sched.yaml
   sched.h.def
   sched.h
   DEPENDS
@@ -426,7 +426,7 @@ add_header_macro(
 
 add_header_macro(
   spawn
-  ../libc/hdrgen/yaml/spawn.yaml
+  ../libc/utils/hdrgen/yaml/spawn.yaml
   spawn.h.def
   spawn.h
   DEPENDS
@@ -439,7 +439,7 @@ add_header_macro(
 
 add_header_macro(
   link
-  ../libc/hdrgen/yaml/link.yaml
+  ../libc/utils/hdrgen/yaml/link.yaml
   link.h.def
   link.h
   DEPENDS
@@ -449,7 +449,7 @@ add_header_macro(
 
 add_header_macro(
   elf
-  ../libc/hdrgen/yaml/elf.yaml
+  ../libc/utils/hdrgen/yaml/elf.yaml
   elf.h.def
   elf.h
   DEPENDS
@@ -463,7 +463,7 @@ file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/sys)
 
 add_header_macro(
   sys_auxv
-  ../libc/hdrgen/yaml/sys/auxv.yaml
+  ../libc/utils/hdrgen/yaml/sys/auxv.yaml
   sys/auxv.h.def
   sys/auxv.h
   DEPENDS
@@ -473,7 +473,7 @@ add_header_macro(
 
 add_header_macro(
   sys_epoll
-  ../libc/hdrgen/yaml/sys/epoll.yaml
+  ../libc/utils/hdrgen/yaml/sys/epoll.yaml
   sys/epoll.h.def
   sys/epoll.h
   DEPENDS
@@ -486,7 +486,7 @@ add_header_macro(
 
 add_header_macro(
   sys_ioctl
-  ../libc/hdrgen/yaml/sys/ioctl.yaml
+  ../libc/utils/hdrgen/yaml/sys/ioctl.yaml
   sys/ioctl.h.def
   sys/ioctl.h
   DEPENDS
@@ -496,7 +496,7 @@ add_header_macro(
 
 add_header_macro(
   sys_mman
-  ../libc/hdrgen/yaml/sys/mman.yaml
+  ../libc/utils/hdrgen/yaml/sys/mman.yaml
   sys/mman.h.def
   sys/mman.h
   DEPENDS
@@ -509,7 +509,7 @@ add_header_macro(
 
 add_header_macro(
   sys_prctl
-  ../libc/hdrgen/yaml/sys/prctl.yaml
+  ../libc/utils/hdrgen/yaml/sys/prctl.yaml
   sys/prctl.h.def
   sys/prctl.h
   DEPENDS
@@ -526,7 +526,7 @@ add_header(
 
 add_header_macro(
   sys_random
-  ../libc/hdrgen/yaml/sys/random.yaml
+  ../libc/utils/hdrgen/yaml/sys/random.yaml
   sys/random.h.def
   sys/random.h
   DEPENDS
@@ -538,7 +538,7 @@ add_header_macro(
 
 add_header_macro(
   sys_resource
-  ../libc/hdrgen/yaml/sys/resource.yaml
+  ../libc/utils/hdrgen/yaml/sys/resource.yaml
   sys/resource.h.def
   sys/resource.h
   DEPENDS
@@ -550,7 +550,7 @@ add_header_macro(
 
 add_header_macro(
   sys_stat
-  ../libc/hdrgen/yaml/sys/stat.yaml
+  ../libc/utils/hdrgen/yaml/sys/stat.yaml
   sys/stat.h.def
   sys/stat.h
   DEPENDS
@@ -572,7 +572,7 @@ add_header_macro(
 
 add_header_macro(
   sys_select
-  ../libc/hdrgen/yaml/sys/select.yaml
+  ../libc/utils/hdrgen/yaml/sys/select.yaml
   sys/select.h.def
   sys/select.h
   DEPENDS
@@ -588,7 +588,7 @@ add_header_macro(
 
 add_header_macro(
   sys_sendfile
-  ../libc/hdrgen/yaml/sys/sendfile.yaml
+  ../libc/utils/hdrgen/yaml/sys/sendfile.yaml
   sys/sendfile.h.def
   sys/sendfile.h
   DEPENDS
@@ -600,7 +600,7 @@ add_header_macro(
 
 add_header_macro(
   sys_socket
-  ../libc/hdrgen/yaml/sys/socket.yaml
+  ../libc/utils/hdrgen/yaml/sys/socket.yaml
   sys/socket.h.def
   sys/socket.h
   DEPENDS
@@ -616,7 +616,7 @@ add_header_macro(
 
 add_header_macro(
   sys_statvfs
-  ../libc/hdrgen/yaml/sys/statvfs.yaml
+  ../libc/utils/hdrgen/yaml/sys/statvfs.yaml
   sys/statvfs.h.def
   sys/statvfs.h
   DEPENDS
@@ -626,7 +626,7 @@ add_header_macro(
 
 add_header_macro(
   sys_syscall
-  ../libc/hdrgen/yaml/sys/syscall.yaml
+  ../libc/utils/hdrgen/yaml/sys/syscall.yaml
   sys/syscall.h.def
   sys/syscall.h
   DEPENDS
@@ -634,7 +634,7 @@ add_header_macro(
 
 add_header_macro(
   sys_time
-  ../libc/hdrgen/yaml/sys/time.yaml
+  ../libc/utils/hdrgen/yaml/sys/time.yaml
   sys/time.h.def
   sys/time.h
   DEPENDS
@@ -645,7 +645,7 @@ add_header_macro(
 
 add_header_macro(
   sys_types
-  ../libc/hdrgen/yaml/sys/types.yaml
+  ../libc/utils/hdrgen/yaml/sys/types.yaml
   sys/types.h.def
   sys/types.h
   DEPENDS
@@ -675,7 +675,7 @@ add_header_macro(
 
 add_header_macro(
   sys_utsname
-  ../libc/hdrgen/yaml/sys/utsname.yaml
+  ../libc/utils/hdrgen/yaml/sys/utsname.yaml
   sys/utsname.h.def
   sys/utsname.h
   DEPENDS
@@ -685,7 +685,7 @@ add_header_macro(
 
 add_header_macro(
   sys_wait
-  ../libc/hdrgen/yaml/sys/wait.yaml
+  ../libc/utils/hdrgen/yaml/sys/wait.yaml
   sys/wait.h.def
   sys/wait.h
   DEPENDS
@@ -698,7 +698,7 @@ add_header_macro(
 
 add_header_macro(
   termios
-  ../libc/hdrgen/yaml/termios.yaml
+  ../libc/utils/hdrgen/yaml/termios.yaml
   termios.h.def
   termios.h
   DEPENDS
@@ -713,7 +713,7 @@ add_header_macro(
 
 add_header_macro(
   uchar
-  ../libc/hdrgen/yaml/uchar.yaml
+  ../libc/utils/hdrgen/yaml/uchar.yaml
   uchar.h.def
   uchar.h
   DEPENDS
@@ -726,7 +726,7 @@ add_header_macro(
 
 add_header_macro(
   wchar
-  ../libc/hdrgen/yaml/wchar.yaml
+  ../libc/utils/hdrgen/yaml/wchar.yaml
   wchar.h.def
   wchar.h
   DEPENDS
@@ -740,7 +740,7 @@ add_header_macro(
 
 add_header_macro(
   locale
-  ../libc/hdrgen/yaml/locale.yaml
+  ../libc/utils/hdrgen/yaml/locale.yaml
   locale.h.def
   locale.h
   DEPENDS
diff --git a/libc/src/math/docs/add_math_function.md b/libc/src/math/docs/add_math_function.md
index f02d502399e2b..7d45bd02c4ff2 100644
--- a/libc/src/math/docs/add_math_function.md
+++ b/libc/src/math/docs/add_math_function.md
@@ -18,7 +18,7 @@ together with its specifications:
 ```
 - Add function specs to the file:
 ```
-  libc/hdrgen/yaml/math.yaml
+  libc/utils/hdrgen/yaml/math.yaml
 ```
 
 ## Implementation
diff --git a/libc/utils/CMakeLists.txt b/libc/utils/CMakeLists.txt
index 11f25503cc13e..a33c13a045a8a 100644
--- a/libc/utils/CMakeLists.txt
+++ b/libc/utils/CMakeLists.txt
@@ -1,3 +1,5 @@
+add_subdirectory(hdrgen)
+
 if(LLVM_INCLUDE_TESTS)
   add_subdirectory(MPFRWrapper)
 endif()
diff --git a/libc/hdrgen/CMakeLists.txt b/libc/utils/hdrgen/CMakeLists.txt
similarity index 68%
rename from libc/hdrgen/CMakeLists.txt
rename to libc/utils/hdrgen/CMakeLists.txt
index 8ebde4e3e4588..c6827da215055 100644
--- a/libc/hdrgen/CMakeLists.txt
+++ b/libc/utils/hdrgen/CMakeLists.txt
@@ -1,12 +1,12 @@
 if(LLVM_LIBC_FULL_BUILD)
   enable_testing()
 
-  set(NEWHDGEN_TESTS_DIR ${CMAKE_CURRENT_SOURCE_DIR}/tests)
+  set(HDRGEN_TESTS_DIR ${CMAKE_CURRENT_SOURCE_DIR}/tests)
   set(TEST_OUTPUT_DIR ${CMAKE_BINARY_DIR}/hdrgen/output)
 
   add_test(
     NAME hdrgen_integration_test
-    COMMAND python3 ${NEWHDGEN_TESTS_DIR}/test_integration.py --output_dir ${TEST_OUTPUT_DIR}
+    COMMAND python3 ${HDRGEN_TESTS_DIR}/test_integration.py --output_dir ${TEST_OUTPUT_DIR}
   )
 
   add_custom_target(check-hdrgen
diff --git a/libc/utils/hdrgen/README.rst b/libc/utils/hdrgen/README.rst
new file mode 100644
index 0000000000000..d16e6c5ccaec1
--- /dev/null
+++ b/libc/utils/hdrgen/README.rst
@@ -0,0 +1,5 @@
+This directory contains the sources and specifications for the types,
+macros and entrypoint functions.  These definitions are organized in the
+``yaml`` subdirectory and match the organization of the ``*.h.def``
+files. This directory also contains the Python sources for hdrgen, which is
+what generates the headers.
diff --git a/libc/hdrgen/class_implementation/classes/enumeration.py b/libc/utils/hdrgen/class_implementation/classes/enumeration.py
similarity index 100%
rename from libc/hdrgen/class_implementation/classes/enumeration.py
rename to libc/utils/hdrgen/class_implementation/classes/enumeration.py
diff --git a/libc/hdrgen/class_implementation/classes/function.py b/libc/utils/hdrgen/class_implementation/classes/function.py
similarity index 100%
rename from libc/hdrgen/class_implementation/classes/function.py
rename to libc/utils/hdrgen/class_implementation/classes/function.py
diff --git a/libc/hdrgen/class_implementation/classes/macro.py b/libc/utils/hdrgen/class_implementation/classes/macro.py
similarity index 100%
rename from libc/hdrgen/class_implementation/classes/macro.py
rename to libc/utils/hdrgen/class_implementation/classes/macro.py
diff --git a/libc/hdrgen/class_implementation/classes/object.py b/libc/utils/hdrgen/class_implementation/classes/object.py
similarity index 100%
rename from libc/hdrgen/class_implementation/classes/object.py
rename to libc/utils/hdrgen/class_implementation/classes/object.py
diff --git a/libc/hdrgen/class_implementation/classes/type.py b/libc/utils/hdrgen/class_implementation/classes/type.py
similarity index 100%
rename from libc/hdrgen/class_implementation/classes/type.py
rename to libc/utils/hdrgen/class_implementation/classes/type.py
diff --git a/libc/hdrgen/gpu_headers.py b/libc/utils/hdrgen/gpu_headers.py
similarity index 100%
rename from libc/hdrgen/gpu_headers.py
rename to libc/utils/hdrgen/gpu_headers.py
diff --git a/libc/hdrgen/header.py b/libc/utils/hdrgen/header.py
similarity index 100%
rename from libc/hdrgen/header.py
rename to libc/utils/hdrgen/header.py
diff --git a/libc/hdrgen/tests/expected_output/test_header.h b/libc/utils/hdrgen/tests/expected_output/test_header.h
similarity index 100%
rename from libc/hdrgen/tests/expected_output/test_header.h
rename to libc/utils/hdrgen/tests/expected_output/test_header.h
diff --git a/libc/hdrgen/tests/input/test_small.h.def b/libc/utils/hdrgen/tests/input/test_small.h.def
similarity index 100%
rename from libc/hdrgen/tests/input/test_small.h.def
rename to libc/utils/hdrgen/tests/input/test_small.h.def
diff --git a/libc/hdrgen/tests/input/test_small.yaml b/libc/utils/hdrgen/tests/input/test_small.yaml
similarity index 100%
rename from libc/hdrgen/tests/input/test_small.yaml
rename to libc/utils/hdrgen/tests/input/test_small.yaml
diff --git a/libc/hdrgen/tests/test_integration.py b/libc/utils/hdrgen/tests/test_integration.py
similarity index 84%
rename from libc/hdrgen/tests/test_integration.py
rename to libc/utils/hdrgen/tests/test_integration.py
index 8ea6d8a708073..ce80026e7bccd 100644
--- a/libc/hdrgen/tests/test_integration.py
+++ b/libc/utils/hdrgen/tests/test_integration.py
@@ -9,19 +9,19 @@
 class TestHeaderGenIntegration(unittest.TestCase):
     def setUp(self):
         self.output_dir = Path(
-            args.output_dir if args.output_dir else "libc/hdrgen/tests/output"
+            args.output_dir if args.output_dir else "libc/utils/hdrgen/tests/output"
         )
 
         self.maxDiff = None
 
-        self.source_dir = Path(__file__).resolve().parent.parent.parent.parent
+        self.source_dir = Path(__file__).resolve().parent.parent.parent.parent.parent
 
     def run_script(self, yaml_file, h_def_file, output_dir, entry_points):
         yaml_file = self.source_dir / yaml_file
         h_def_file = self.source_dir / h_def_file
         command = [
             "python3",
-            str(self.source_dir / "libc/hdrgen/yaml_to_classes.py"),
+            str(self.source_dir / "libc/utils/hdrgen/yaml_to_classes.py"),
             str(yaml_file),
             "--h_def_file",
             str(h_def_file),
@@ -51,10 +51,10 @@ def compare_files(self, generated_file, expected_file):
         self.assertEqual(gen_content, exp_content)
 
     def test_generate_header(self):
-        yaml_file = "libc/hdrgen/tests/input/test_small.yaml"
-        h_def_file = "libc/hdrgen/tests/input/test_small.h.def"
+        yaml_file = "libc/utils/hdrgen/tests/input/test_small.yaml"
+        h_def_file = "libc/utils/hdrgen/tests/input/test_small.h.def"
         expected_output_file = (
-            self.source_dir / "libc/hdrgen/tests/expected_output/test_header.h"
+            self.source_dir / "libc/utils/hdrgen/tests/expected_output/test_header.h"
         )
         output_file = self.output_dir / "test_small.h"
         entry_points = {"func_b", "func_a", "func_c", "func_d", "func_e"}
diff --git a/libc/hdrgen/yaml/arpa/inet.yaml b/libc/utils/hdrgen/yaml/arpa/inet.yaml
similarity index 100%
rename from libc/hdrgen/yaml/arpa/inet.yaml
rename to libc/utils/hdrgen/yaml/arpa/inet.yaml
diff --git a/libc/hdrgen/yaml/assert.yaml b/libc/utils/hdrgen/yaml/assert.yaml
similarity index 100%
rename from libc/hdrgen/yaml/assert.yaml
rename to libc/utils/hdrgen/yaml/assert.yaml
diff --git a/libc/hdrgen/yaml/complex.yaml b/libc/utils/hdrgen/yaml/complex.yaml
similarity index 100%
rename from libc/hdrgen/yaml/complex.yaml
rename to libc/utils/hdrgen/yaml/complex.yaml
diff --git a/libc/hdrgen/yaml/ctype.yaml b/libc/utils/hdrgen/yaml/ctype.yaml
similarity index 100%
rename from libc/hdrgen/yaml/ctype.yaml
rename to libc/utils/hdrgen/yaml/ctype.yaml
diff --git a/libc/hdrgen/yaml/dirent.yaml b/libc/utils/hdrgen/yaml/dirent.yaml
similarity index 100%
rename from libc/hdrgen/yaml/dirent.yaml
rename to libc/utils/hdrgen/yaml/dirent.yaml
diff --git a/libc/hdrgen/yaml/dlfcn.yaml b/libc/utils/hdrgen/yaml/dlfcn.yaml
similarity index 100%
rename from libc/hdrgen/yaml/dlfcn.yaml
rename to libc/utils/hdrgen/yaml/dlfcn.yaml
diff --git a/libc/hdrgen/yaml/elf.yaml b/libc/utils/hdrgen/yaml/elf.yaml
similarity index 100%
rename from libc/hdrgen/yaml/elf.yaml
rename to libc/utils/hdrgen/yaml/elf.yaml
diff --git a/libc/hdrgen/yaml/errno.yaml b/libc/utils/hdrgen/yaml/errno.yaml
similarity index 100%
rename from libc/hdrgen/yaml/errno.yaml
rename to libc/utils/hdrgen/yaml/errno.yaml
diff --git a/libc/hdrgen/yaml/fcntl.yaml b/libc/utils/hdrgen/yaml/fcntl.yaml
similarity index 100%
rename from libc/hdrgen/yaml/fcntl.yaml
rename to libc/utils/hdrgen/yaml/fcntl.yaml
diff --git a/libc/hdrgen/yaml/features.yaml b/libc/utils/hdrgen/yaml/features.yaml
similarity index 100%
rename from libc/hdrgen/yaml/features.yaml
rename to libc/utils/hdrgen/yaml/features.yaml
diff --git a/libc/hdrgen/yaml/fenv.yaml b/libc/utils/hdrgen/yaml/fenv.yaml
similarity index 100%
rename from libc/hdrgen/yaml/fenv.yaml
rename to libc/utils/hdrgen/yaml/fenv.yaml
diff --git a/libc/hdrgen/yaml/float.yaml b/libc/utils/hdrgen/yaml/float.yaml
similarity index 100%
rename from libc/hdrgen/yaml/float.yaml
rename to libc/utils/hdrgen/yaml/float.yaml
diff --git a/libc/hdrgen/yaml/inttypes.yaml b/libc/utils/hdrgen/yaml/inttypes.yaml
similarity index 100%
rename from libc/hdrgen/yaml/inttypes.yaml
rename to libc/utils/hdrgen/yaml/inttypes.yaml
diff --git a/libc/hdrgen/yaml/limits.yaml b/libc/utils/hdrgen/yaml/limits.yaml
similarity index 100%
rename from libc/hdrgen/yaml/limits.yaml
rename to libc/utils/hdrgen/yaml/limits.yaml
diff --git a/libc/hdrgen/yaml/link.yaml b/libc/utils/hdrgen/yaml/link.yaml
similarity index 100%
rename from libc/hdrgen/yaml/link.yaml
rename to libc/utils/hdrgen/yaml/link.yaml
diff --git a/libc/hdrgen/yaml/locale.yaml b/libc/utils/hdrgen/yaml/locale.yaml
similarity index 100%
rename from libc/hdrgen/yaml/locale.yaml
rename to libc/utils/hdrgen/yaml/locale.yaml
diff --git a/libc/hdrgen/yaml/malloc.yaml b/libc/utils/hdrgen/yaml/malloc.yaml
similarity index 100%
rename from libc/hdrgen/yaml/malloc.yaml
rename to libc/utils/hdrgen/yaml/malloc.yaml
diff --git a/libc/hdrgen/yaml/math.yaml b/libc/utils/hdrgen/yaml/math.yaml
similarity index 100%
rename from libc/hdrgen/yaml/math.yaml
rename to libc/utils/hdrgen/yaml/math.yaml
diff --git a/libc/hdrgen/yaml/pthread.yaml b/libc/utils/hdrgen/yaml/pthread.yaml
similarity index 100%
rename from libc/hdrgen/yaml/pthread.yaml
rename to libc/utils/hdrgen/yaml/pthread.yaml
diff --git a/libc/hdrgen/yaml/sched.yaml b/libc/utils/hdrgen/yaml/sched.yaml
similarity index 100%
rename from libc/hdrgen/yaml/sched.yaml
rename to libc/utils/hdrgen/yaml/sched.yaml
diff --git a/libc/hdrgen/yaml/search.yaml b/libc/utils/hdrgen/yaml/search.yaml
similarity index 100%
rename from libc/hdrgen/yaml/search.yaml
rename to libc/utils/hdrgen/yaml/search.yaml
diff --git a/libc/hdrgen/yaml/setjmp.yaml b/libc/utils/hdrgen/yaml/setjmp.yaml
similarity index 100%
rename from libc/hdrgen/yaml/setjmp.yaml
rename to libc/utils/hdrgen/yaml/setjmp.yaml
diff --git a/libc/hdrgen/yaml/signal.yaml b/libc/utils/hdrgen/yaml/signal.yaml
similarity index 100%
rename from libc/hdrgen/yaml/signal.yaml
rename to libc/utils/hdrgen/yaml/signal.yaml
diff --git a/libc/hdrgen/yaml/spawn.yaml b/libc/utils/hdrgen/yaml/spawn.yaml
similarity index 100%
rename from libc/hdrgen/yaml/spawn.yaml
rename to libc/utils/hdrgen/yaml/spawn.yaml
diff --git a/libc/hdrgen/yaml/stdbit.yaml b/libc/utils/hdrgen/yaml/stdbit.yaml
similarity index 100%
rename from libc/hdrgen/yaml/stdbit.yaml
rename to libc/utils/hdrgen/yaml/stdbit.yaml
diff --git a/libc/hdrgen/yaml/stdckdint.yaml b/libc/utils/hdrgen/yaml/stdckdint.yaml
similarity index 100%
rename from libc/hdrgen/yaml/stdckdint.yaml
rename to libc/utils/hdrgen/yaml/stdckdint.yaml
diff --git a/libc/hdrgen/yaml/stdfix.yaml b/libc/utils/hdrgen/yaml/stdfix.yaml
similarity index 100%
rename from libc/hdrgen/yaml/stdfix.yaml
rename to libc/utils/hdrgen/yaml/stdfix.yaml
diff --git a/libc/hdrgen/yaml/stdint.yaml b/libc/utils/hdrgen/yaml/stdint.yaml
similarity index 100%
rename from libc/hdrgen/yaml/stdint.yaml
rename to libc/utils/hdrgen/yaml/stdint.yaml
diff --git a/libc/hdrgen/yaml/stdio.yaml b/libc/utils/hdrgen/yaml/stdio.yaml
similarity index 100%
rename from libc/hdrgen/yaml/stdio.yaml
rename to libc/utils/hdrgen/yaml/stdio.yaml
diff --git a/libc/hdrgen/yaml/stdlib.yaml b/libc/utils/hdrgen/yaml/stdlib.yaml
similarity index 100%
rename from libc/hdrgen/yaml/stdlib.yaml
rename to libc/utils/hdrgen/yaml/stdlib.yaml
diff --git a/libc/hdrgen/yaml/string.yaml b/libc/utils/hdrgen/yaml/string.yaml
similarity index 100%
rename from libc/hdrgen/yaml/string.yaml
rename to libc/utils/hdrgen/yaml/string.yaml
diff --git a/libc/hdrgen/yaml/strings.yaml b/libc/utils/hdrgen/yaml/strings.yaml
similarity index 100%
rename from libc/hdrgen/yaml/strings.yaml
rename to libc/utils/hdrgen/yaml/strings.yaml
diff --git a/libc/hdrgen/yaml/sys/auxv.yaml b/libc/utils/hdrgen/yaml/sys/auxv.yaml
similarity index 100%
rename from libc/hdrgen/yaml/sys/auxv.yaml
rename to libc/utils/hdrgen/yaml/sys/auxv.yaml
diff --git a/libc/hdrgen/yaml/sys/epoll.yaml b/libc/utils/hdrgen/yaml/sys/epoll.yaml
similarity index 100%
rename from libc/hdrgen/yaml/sys/epoll.yaml
rename to libc/utils/hdrgen/yaml/sys/epoll.yaml
diff --git a/libc/hdrgen/yaml/sys/ioctl.yaml b/libc/utils/hdrgen/yaml/sys/ioctl.yaml
similarity index 100%
rename from libc/hdrgen/yaml/sys/ioctl.yaml
rename to libc/utils/hdrgen/yaml/sys/ioctl.yaml
diff --git a/libc/hdrgen/yaml/sys/mman.yaml b/libc/utils/hdrgen/yaml/sys/mman.yaml
similarity index 100%
rename from libc/hdrgen/yaml/sys/mman.yaml
rename to libc/utils/hdrgen/yaml/sys/mman.yaml
diff --git a/libc/hdrgen/yaml/sys/prctl.yaml b/libc/utils/hdrgen/yaml/sys/prctl.yaml
similarity index 100%
rename from libc/hdrgen/yaml/sys/prctl.yaml
rename to libc/utils/hdrgen/yaml/sys/prctl.yaml
diff --git a/libc/hdrgen/yaml/sys/random.yaml b/libc/utils/hdrgen/yaml/sys/random.yaml
similarity index 100%
rename from libc/hdrgen/yaml/sys/random.yaml
rename to libc/utils/hdrgen/yaml/sys/random.yaml
diff --git a/libc/hdrgen/yaml/sys/resource.yaml b/libc/utils/hdrgen/yaml/sys/resource.yaml
similarity index 100%
rename from libc/hdrgen/yaml/sys/resource.yaml
rename to libc/utils/hdrgen/yaml/sys/resource.yaml
diff --git a/libc/hdrgen/yaml/sys/select.yaml b/libc/utils/hdrgen/yaml/sys/select.yaml
similarity index 100%
rename from libc/hdrgen/yaml/sys/select.yaml
rename to libc/utils/hdrgen/yaml/sys/select.yaml
diff --git a/libc/hdrgen/yaml/sys/sendfile.yaml b/libc/utils/hdrgen/yaml/sys/sendfile.yaml
similarity index 100%
rename from libc/hdrgen/yaml/sys/sendfile.yaml
rename to libc/utils/hdrgen/yaml/sys/sendfile.yaml
diff --git a/libc/hdrgen/yaml/sys/socket.yaml b/libc/utils/hdrgen/yaml/sys/socket.yaml
similarity index 100%
rename from libc/hdrgen/yaml/sys/socket.yaml
rename to libc/utils/hdrgen/yaml/sys/socket.yaml
diff --git a/libc/hdrgen/yaml/sys/stat.yaml b/libc/utils/hdrgen/yaml/sys/stat.yaml
similarity index 100%
rename from libc/hdrgen/yaml/sys/stat.yaml
rename to libc/utils/hdrgen/yaml/sys/stat.yaml
diff --git a/libc/hdrgen/yaml/sys/statvfs.yaml b/libc/utils/hdrgen/yaml/sys/statvfs.yaml
similarity index 100%
rename from libc/hdrgen/yaml/sys/statvfs.yaml
rename to libc/utils/hdrgen/yaml/sys/statvfs.yaml
diff --git a/libc/hdrgen/yaml/sys/syscall.yaml b/libc/utils/hdrgen/yaml/sys/syscall.yaml
similarity index 100%
rename from libc/hdrgen/yaml/sys/syscall.yaml
rename to libc/utils/hdrgen/yaml/sys/syscall.yaml
diff --git a/libc/hdrgen/yaml/sys/time.yaml b/libc/utils/hdrgen/yaml/sys/time.yaml
similarity index 100%
rename from libc/hdrgen/yaml/sys/time.yaml
rename to libc/utils/hdrgen/yaml/sys/time.yaml
diff --git a/libc/hdrgen/yaml/sys/types.yaml b/libc/utils/hdrgen/yaml/sys/types.yaml
similarity index 100%
rename from libc/hdrgen/yaml/sys/types.yaml
rename to libc/utils/hdrgen/yaml/sys/types.yaml
diff --git a/libc/hdrgen/yaml/sys/utsname.yaml b/libc/utils/hdrgen/yaml/sys/utsname.yaml
similarity index 100%
rename from libc/hdrgen/yaml/sys/utsname.yaml
rename to libc/utils/hdrgen/yaml/sys/utsname.yaml
diff --git a/libc/hdrgen/yaml/sys/wait.yaml b/libc/utils/hdrgen/yaml/sys/wait.yaml
similarity index 100%
rename from libc/hdrgen/yaml/sys/wait.yaml
rename to libc/utils/hdrgen/yaml/sys/wait.yaml
diff --git a/libc/hdrgen/yaml/termios.yaml b/libc/utils/hdrgen/yaml/termios.yaml
similarity index 100%
rename from libc/hdrgen/yaml/termios.yaml
rename to libc/utils/hdrgen/yaml/termios.yaml
diff --git a/libc/hdrgen/yaml/threads.yaml b/libc/utils/hdrgen/yaml/threads.yaml
similarity index 100%
rename from libc/hdrgen/yaml/threads.yaml
rename to libc/utils/hdrgen/yaml/threads.yaml
diff --git a/libc/hdrgen/yaml/time.yaml b/libc/utils/hdrgen/yaml/time.yaml
similarity index 100%
rename from libc/hdrgen/yaml/time.yaml
rename to libc/utils/hdrgen/yaml/time.yaml
diff --git a/libc/hdrgen/yaml/uchar.yaml b/libc/utils/hdrgen/yaml/uchar.yaml
similarity index 100%
rename from libc/hdrgen/yaml/uchar.yaml
rename to libc/utils/hdrgen/yaml/uchar.yaml
diff --git a/libc/hdrgen/yaml/unistd.yaml b/libc/utils/hdrgen/yaml/unistd.yaml
similarity index 100%
rename from libc/hdrgen/yaml/unistd.yaml
rename to libc/utils/hdrgen/yaml/unistd.yaml
diff --git a/libc/hdrgen/yaml/wchar.yaml b/libc/utils/hdrgen/yaml/wchar.yaml
similarity index 100%
rename from libc/hdrgen/yaml/wchar.yaml
rename to libc/utils/hdrgen/yaml/wchar.yaml
diff --git a/libc/hdrgen/yaml_functions_sorted.py b/libc/utils/hdrgen/yaml_functions_sorted.py
similarity index 100%
rename from libc/hdrgen/yaml_functions_sorted.py
rename to libc/utils/hdrgen/yaml_functions_sorted.py
diff --git a/libc/hdrgen/yaml_to_classes.py b/libc/utils/hdrgen/yaml_to_classes.py
similarity index 100%
rename from libc/hdrgen/yaml_to_classes.py
rename to libc/utils/hdrgen/yaml_to_classes.py

From f385542f62fa1f57001c95c476165e1618cb54ba Mon Sep 17 00:00:00 2001
From: c8ef 
Date: Tue, 31 Dec 2024 09:28:34 +0800
Subject: [PATCH 198/567] [Tooling/Inclusion] Modify the Python script to open
 the C++ reference with UTF-8 encoding. (#121341)

This will prevent the error on systems with a default encoding other
than utf-8.

```
UnicodeDecodeError: 'gbk' codec can't decode byte 0xb6 in position 12958: illegal multibyte sequence
```
---
 clang/tools/include-mapping/cppreference_parser.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/tools/include-mapping/cppreference_parser.py b/clang/tools/include-mapping/cppreference_parser.py
index 9101f3dbff0f9..f7da2ba8bb6d8 100644
--- a/clang/tools/include-mapping/cppreference_parser.py
+++ b/clang/tools/include-mapping/cppreference_parser.py
@@ -139,7 +139,7 @@ def _ParseIndexPage(index_page_html):
 
 
 def _ReadSymbolPage(path, name, qual_name):
-    with open(path) as f:
+    with open(path, encoding="utf-8") as f:
         return _ParseSymbolPage(f.read(), name, qual_name)
 
 
@@ -156,7 +156,7 @@ def _GetSymbols(pool, root_dir, index_page_name, namespace, variants_to_accept):
     #      contains the defined header.
     #   2. Parse the symbol page to get the defined header.
     index_page_path = os.path.join(root_dir, index_page_name)
-    with open(index_page_path, "r") as f:
+    with open(index_page_path, "r", encoding="utf-8") as f:
         # Read each symbol page in parallel.
         results = []  # (symbol_name, promise of [header...])
         for symbol_name, symbol_page_path, variant in _ParseIndexPage(f.read()):

From 07e13b764d6a5002f5bd9a41bf514106607f13f1 Mon Sep 17 00:00:00 2001
From: Roland McGrath 
Date: Mon, 30 Dec 2024 17:28:59 -0800
Subject: [PATCH 199/567] [libc] Remove unnecessary subdirectory layers in
 utils/hdrgen (#121363)

Two extra layers of subdirectory for a handful of single-symbol
Python source files did not improve anything, and it complicated
integration of the hdrgen Python outside the LLVM CMake build.
---
 .../classes => }/enumeration.py                     |  0
 .../{class_implementation/classes => }/function.py  |  0
 .../{class_implementation/classes => }/macro.py     |  0
 .../{class_implementation/classes => }/object.py    |  0
 .../{class_implementation/classes => }/type.py      |  0
 libc/utils/hdrgen/yaml_to_classes.py                | 13 +++++++------
 6 files changed, 7 insertions(+), 6 deletions(-)
 rename libc/utils/hdrgen/{class_implementation/classes => }/enumeration.py (100%)
 rename libc/utils/hdrgen/{class_implementation/classes => }/function.py (100%)
 rename libc/utils/hdrgen/{class_implementation/classes => }/macro.py (100%)
 rename libc/utils/hdrgen/{class_implementation/classes => }/object.py (100%)
 rename libc/utils/hdrgen/{class_implementation/classes => }/type.py (100%)

diff --git a/libc/utils/hdrgen/class_implementation/classes/enumeration.py b/libc/utils/hdrgen/enumeration.py
similarity index 100%
rename from libc/utils/hdrgen/class_implementation/classes/enumeration.py
rename to libc/utils/hdrgen/enumeration.py
diff --git a/libc/utils/hdrgen/class_implementation/classes/function.py b/libc/utils/hdrgen/function.py
similarity index 100%
rename from libc/utils/hdrgen/class_implementation/classes/function.py
rename to libc/utils/hdrgen/function.py
diff --git a/libc/utils/hdrgen/class_implementation/classes/macro.py b/libc/utils/hdrgen/macro.py
similarity index 100%
rename from libc/utils/hdrgen/class_implementation/classes/macro.py
rename to libc/utils/hdrgen/macro.py
diff --git a/libc/utils/hdrgen/class_implementation/classes/object.py b/libc/utils/hdrgen/object.py
similarity index 100%
rename from libc/utils/hdrgen/class_implementation/classes/object.py
rename to libc/utils/hdrgen/object.py
diff --git a/libc/utils/hdrgen/class_implementation/classes/type.py b/libc/utils/hdrgen/type.py
similarity index 100%
rename from libc/utils/hdrgen/class_implementation/classes/type.py
rename to libc/utils/hdrgen/type.py
diff --git a/libc/utils/hdrgen/yaml_to_classes.py b/libc/utils/hdrgen/yaml_to_classes.py
index 0e8ca2d8a82b0..ec2441b78aee5 100644
--- a/libc/utils/hdrgen/yaml_to_classes.py
+++ b/libc/utils/hdrgen/yaml_to_classes.py
@@ -11,13 +11,14 @@
 import yaml
 import argparse
 from pathlib import Path
-from header import HeaderFile
+
+from enumeration import Enumeration
+from function import Function
 from gpu_headers import GpuHeaderFile as GpuHeader
-from class_implementation.classes.macro import Macro
-from class_implementation.classes.type import Type
-from class_implementation.classes.function import Function
-from class_implementation.classes.enumeration import Enumeration
-from class_implementation.classes.object import Object
+from header import HeaderFile
+from macro import Macro
+from object import Object
+from type import Type
 
 
 def yaml_to_classes(yaml_data, header_class, entry_points=None):

From fe1f64e7e935c9905a115842183ea29dd1312dfe Mon Sep 17 00:00:00 2001
From: Nathan Ridge 
Date: Mon, 30 Dec 2024 20:49:26 -0500
Subject: [PATCH 200/567] [clangd] Make EnableFunctionArgSnippets option
 string-typed (#121178)

Fixes https://github.com/clangd/clangd/issues/2232
---
 clang-tools-extra/clangd/tool/ClangdMain.cpp | 27 +++++++++++++++-----
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/clang-tools-extra/clangd/tool/ClangdMain.cpp b/clang-tools-extra/clangd/tool/ClangdMain.cpp
index 80a0653f8f740..714891703b6f3 100644
--- a/clang-tools-extra/clangd/tool/ClangdMain.cpp
+++ b/clang-tools-extra/clangd/tool/ClangdMain.cpp
@@ -242,13 +242,13 @@ opt FallbackStyle{
     init(clang::format::DefaultFallbackStyle),
 };
 
-opt<int> EnableFunctionArgSnippets{
+opt<std::string> EnableFunctionArgSnippets{
     "function-arg-placeholders",
     cat(Features),
     desc("When disabled (0), completions contain only parentheses for "
          "function calls. When enabled (1), completions also contain "
          "placeholders for method parameters"),
-    init(-1),
+    init("-1"),
 };
 
 opt HeaderInsertion{
@@ -636,6 +636,22 @@ loadExternalIndex(const Config::ExternalIndexSpec &External,
   llvm_unreachable("Invalid ExternalIndexKind.");
 }
 
+std::optional<bool> shouldEnableFunctionArgSnippets() {
+  std::string Val = EnableFunctionArgSnippets;
+  // Accept the same values that a bool option parser would, but also accept
+  // -1 to indicate "unspecified", in which case the ArgumentListsPolicy
+  // config option will be respected.
+  if (Val == "1" || Val == "true" || Val == "True" || Val == "TRUE")
+    return true;
+  if (Val == "0" || Val == "false" || Val == "False" || Val == "FALSE")
+    return false;
+  if (Val != "-1")
+    elog("Value specified by --function-arg-placeholders is invalid. Provide a "
+         "boolean value or leave unspecified to use ArgumentListsPolicy from "
+         "config instead.");
+  return std::nullopt;
+}
+
 class FlagsConfigProvider : public config::Provider {
 private:
   config::CompiledFragment Frag;
@@ -696,10 +712,9 @@ class FlagsConfigProvider : public config::Provider {
       BGPolicy = Config::BackgroundPolicy::Skip;
     }
 
-    if (EnableFunctionArgSnippets >= 0) {
-      ArgumentLists = EnableFunctionArgSnippets
-                          ? Config::ArgumentListsPolicy::FullPlaceholders
-                          : Config::ArgumentListsPolicy::Delimiters;
+    if (std::optional<bool> Enable = shouldEnableFunctionArgSnippets()) {
+      ArgumentLists = *Enable ? Config::ArgumentListsPolicy::FullPlaceholders
+                              : Config::ArgumentListsPolicy::Delimiters;
     }
 
     Frag = [=](const config::Params &, Config &C) {

From e50ec3e46bea819a1d7aea1cee2d7e11197bbdd2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jan=20Kokem=C3=BCller?= 
Date: Tue, 31 Dec 2024 02:53:29 +0100
Subject: [PATCH 201/567] [Clang][Sema] Expose static inline functions from GMF
 (#104701)

In C, it is a common pattern to have `static inline` functions in
headers to avoid ODR issues. Currently, when those headers are included
in a GMF, the names are not found when two-phase name lookup and ADL is
involved. Those names are removed by `Sema::AddOverloadCandidate`.

Similarly, in C++, sometimes people use templates with internal linkage
in headers.

As the GMF was designed to be a transitional mechanism for headers,
special case those functions in `Sema::AddOverloadCandidate`.

This fixes .
---
 clang/lib/Sema/SemaOverload.cpp               | 23 +++++++++--
 .../expose-static-inline-from-gmf-1.cppm      | 37 +++++++++++++++++
 .../expose-static-inline-from-gmf-2.cppm      | 22 ++++++++++
 .../expose-static-inline-from-gmf-3.cppm      | 24 +++++++++++
 .../expose-static-inline-from-gmf-4.cppm      | 40 +++++++++++++++++++
 .../expose-static-inline-from-gmf-5.cppm      | 26 ++++++++++++
 6 files changed, 168 insertions(+), 4 deletions(-)
 create mode 100644 clang/test/Modules/expose-static-inline-from-gmf-1.cppm
 create mode 100644 clang/test/Modules/expose-static-inline-from-gmf-2.cppm
 create mode 100644 clang/test/Modules/expose-static-inline-from-gmf-3.cppm
 create mode 100644 clang/test/Modules/expose-static-inline-from-gmf-4.cppm
 create mode 100644 clang/test/Modules/expose-static-inline-from-gmf-5.cppm

diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index fff49b759c935..7589701fb81de 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -6977,11 +6977,26 @@ void Sema::AddOverloadCandidate(
     /// have linkage. So that all entities of the same should share one
     /// linkage. But in clang, different entities of the same could have
     /// different linkage.
-    NamedDecl *ND = Function;
-    if (auto *SpecInfo = Function->getTemplateSpecializationInfo())
+    const NamedDecl *ND = Function;
+    bool IsImplicitlyInstantiated = false;
+    if (auto *SpecInfo = Function->getTemplateSpecializationInfo()) {
       ND = SpecInfo->getTemplate();
-
-    if (ND->getFormalLinkage() == Linkage::Internal) {
+      IsImplicitlyInstantiated = SpecInfo->getTemplateSpecializationKind() ==
+                                 TSK_ImplicitInstantiation;
+    }
+
+    /// Don't remove inline functions with internal linkage from the overload
+    /// set if they are declared in a GMF, in violation of C++ [basic.link]p17.
+    /// However:
+    /// - Inline functions with internal linkage are a common pattern in
+    ///   headers to avoid ODR issues.
+    /// - The global module is meant to be a transition mechanism for C and C++
+    ///   headers, and the current rules as written work against that goal.
+    const bool IsInlineFunctionInGMF =
+        Function->isFromGlobalModule() &&
+        (IsImplicitlyInstantiated || Function->isInlined());
+
+    if (ND->getFormalLinkage() == Linkage::Internal && !IsInlineFunctionInGMF) {
       Candidate.Viable = false;
       Candidate.FailureKind = ovl_fail_module_mismatched;
       return;
diff --git a/clang/test/Modules/expose-static-inline-from-gmf-1.cppm b/clang/test/Modules/expose-static-inline-from-gmf-1.cppm
new file mode 100644
index 0000000000000..4de9b583dac8d
--- /dev/null
+++ b/clang/test/Modules/expose-static-inline-from-gmf-1.cppm
@@ -0,0 +1,37 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang -std=c++20 %t/a.cppm --precompile -o %t/a.pcm \
+// RUN:   -DTEST_INLINE
+// RUN: %clang -std=c++20 %t/test.cc -fprebuilt-module-path=%t -fsyntax-only -Xclang -verify \
+// RUN:   -DTEST_INLINE
+//
+// RUN: %clang -std=c++20 %t/a.cppm --precompile -o %t/a.pcm
+// RUN: %clang -std=c++20 %t/test.cc -fprebuilt-module-path=%t -fsyntax-only -Xclang -verify
+
+//--- a.h
+#ifdef TEST_INLINE
+#define INLINE inline
+#else
+#define INLINE
+#endif
+static INLINE void func(long) {}
+template  void a() { func(T{}); }
+
+//--- a.cppm
+module;
+#include "a.h"
+export module a;
+export using ::a;
+
+//--- test.cc
+import a;
+auto m = (a(), 0);
+
+#ifdef TEST_INLINE
+// expected-no-diagnostics
+#else
+// expected-error@a.h:7 {{no matching function for call to 'func'}}
+// expected-note@test.cc:2 {{in instantiation of function template specialization 'a' requested here}}
+#endif
diff --git a/clang/test/Modules/expose-static-inline-from-gmf-2.cppm b/clang/test/Modules/expose-static-inline-from-gmf-2.cppm
new file mode 100644
index 0000000000000..c89b613f5074b
--- /dev/null
+++ b/clang/test/Modules/expose-static-inline-from-gmf-2.cppm
@@ -0,0 +1,22 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang -std=c++20 %t/a.cppm --precompile -o %t/a.pcm
+// RUN: %clang -std=c++20 %t/test.cc -fprebuilt-module-path=%t -fsyntax-only -Xclang -verify
+
+//--- a.h
+template  static inline void func() {}
+template  void a() { func(); }
+
+//--- a.cppm
+module;
+#include "a.h"
+export module a;
+export using ::a;
+
+//--- test.cc
+import a;
+auto m = (a(), 0);
+
+// expected-no-diagnostics
diff --git a/clang/test/Modules/expose-static-inline-from-gmf-3.cppm b/clang/test/Modules/expose-static-inline-from-gmf-3.cppm
new file mode 100644
index 0000000000000..dee7cddafdf70
--- /dev/null
+++ b/clang/test/Modules/expose-static-inline-from-gmf-3.cppm
@@ -0,0 +1,24 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang -std=c++20 %t/a.cppm --precompile -o %t/a.pcm
+// RUN: %clang -std=c++20 %t/test.cc -fprebuilt-module-path=%t -fsyntax-only -Xclang -verify
+
+//--- a.h
+namespace ns {
+template  static void func() {}
+template  void a() { func(); }
+}
+
+//--- a.cppm
+module;
+#include "a.h"
+export module a;
+export using ns::a;
+
+//--- test.cc
+import a;
+auto m = (a(), 0);
+
+// expected-no-diagnostics
diff --git a/clang/test/Modules/expose-static-inline-from-gmf-4.cppm b/clang/test/Modules/expose-static-inline-from-gmf-4.cppm
new file mode 100644
index 0000000000000..09c6b1ffd9c79
--- /dev/null
+++ b/clang/test/Modules/expose-static-inline-from-gmf-4.cppm
@@ -0,0 +1,40 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang -std=c++20 %t/a.cppm --precompile -o %t/a.pcm \
+// RUN:   -DTEST_INLINE
+// RUN: %clang -std=c++20 %t/test.cc -fprebuilt-module-path=%t -fsyntax-only -Xclang -verify \
+// RUN:   -DTEST_INLINE
+//
+// RUN: %clang -std=c++20 %t/a.cppm --precompile -o %t/a.pcm
+// RUN: %clang -std=c++20 %t/test.cc -fprebuilt-module-path=%t -fsyntax-only -Xclang -verify
+
+//--- a.h
+#ifdef TEST_INLINE
+#define INLINE inline
+#else
+#define INLINE
+#endif
+namespace ns {
+template  static void func() {}
+template <> INLINE void func() {}
+template  void a() { func(); }
+}
+
+//--- a.cppm
+module;
+#include "a.h"
+export module a;
+export using ns::a;
+
+//--- test.cc
+import a;
+auto m = (a(), 0);
+
+#ifdef TEST_INLINE
+// expected-no-diagnostics
+#else
+// expected-error@a.h:9 {{no matching function for call to 'func'}}
+// expected-note@test.cc:2 {{in instantiation of function template specialization 'ns::a' requested here}}
+#endif
diff --git a/clang/test/Modules/expose-static-inline-from-gmf-5.cppm b/clang/test/Modules/expose-static-inline-from-gmf-5.cppm
new file mode 100644
index 0000000000000..334af845a693d
--- /dev/null
+++ b/clang/test/Modules/expose-static-inline-from-gmf-5.cppm
@@ -0,0 +1,26 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang -std=c++20 %t/a.cppm --precompile -o %t/a.pcm
+// RUN: %clang -std=c++20 %t/test.cc -fprebuilt-module-path=%t -fsyntax-only -Xclang -verify
+
+//--- a.h
+namespace ns {
+namespace {
+template  void func() {}
+}
+template  void a() { func(); }
+}
+
+//--- a.cppm
+module;
+#include "a.h"
+export module a;
+export using ns::a;
+
+//--- test.cc
+import a;
+auto m = (a(), 0);
+
+// expected-no-diagnostics

From f590963db836ccbf7c547a3dea9dc719f24444d1 Mon Sep 17 00:00:00 2001
From: Philipp van Kempen 
Date: Tue, 31 Dec 2024 03:56:28 +0100
Subject: [PATCH 202/567] [RISCV] Implement
 RISCVTTIImpl::getPreferredAddressingMode for HasVendorXCVmem (#120533)

For a simple matmult kernel this heuristic reduces the length of the
critical basic block from 15 to 12 instructions, resulting in a 20%
speedup.

**Without heuristic:**

```
       13688: 001b838b      cv.lb   t2, (s7), 0x1
       1368c: 09cdbcab      cv.lb   s9, t3(s11)
       13690: 089db62b      cv.lb   a2, s1(s11)
       13694: 092dbdab      cv.lb   s11, s2(s11)
       13698: 001d028b      cv.lb   t0, (s10), 0x1
       1369c: 00f282b3      add     t0, t0, a5
       136a0: 9072b52b      cv.mac  a0, t0, t2
       136a4: 9192bfab      cv.mac  t6, t0, s9
       136a8: 90c2beab      cv.mac  t4, t0, a2
       136ac: 91b2bf2b      cv.mac  t5, t0, s11
       136b0: fffc0c13      addi    s8, s8, -0x1
       136b4: 018e0633      add     a2, t3, s8
       136b8: 91b2b0ab      cv.mac  ra, t0, s11
       136bc: 000b8d93      mv      s11, s7
       136c0: fc0614e3      bnez    a2, 0x13688 

       #instrs = 15
```

**With heuristic:**

```
        7bc0: 001c860b      cv.lb   a2, (s9), 0x1
        7bc4: 001e0d0b      cv.lb   s10, (t3), 0x1
        7bc8: 001e808b      cv.lb   ra, (t4), 0x1
        7bcc: 0015038b      cv.lb   t2, (a0), 0x1
        7bd0: 001c028b      cv.lb   t0, (s8), 0x1
        7bd4: 00f282b3      add     t0, t0, a5
        7bd8: 90c2bfab      cv.mac  t6, t0, a2
        7bdc: 91a2b92b      cv.mac  s2, t0, s10
        7be0: 9012b5ab      cv.mac  a1, t0, ra
        7be4: 9072b9ab      cv.mac  s3, t0, t2
        7be8: 9072b72b      cv.mac  a4, t0, t2
        7bec: fc851ae3      bne     a0, s0, 0x7bc0 

        #instrs = 12

        improvement = 1 - 12/15 = 0.2 = 20%
```
---
 .../Target/RISCV/RISCVTargetTransformInfo.cpp |  9 +++++
 .../Target/RISCV/RISCVTargetTransformInfo.h   |  3 ++
 llvm/test/CodeGen/RISCV/xcvmem-heuristic.ll   | 34 +++++++++++++++++++
 3 files changed, 46 insertions(+)
 create mode 100644 llvm/test/CodeGen/RISCV/xcvmem-heuristic.ll

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 49192bd638022..2f9beb0b3983c 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -2329,6 +2329,15 @@ unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
   return std::max(1U, RegWidth.getFixedValue() / ElemWidth);
 }
 
+TTI::AddressingModeKind
+RISCVTTIImpl::getPreferredAddressingMode(const Loop *L,
+                                         ScalarEvolution *SE) const {
+  if (ST->hasVendorXCVmem() && !ST->is64Bit())
+    return TTI::AMK_PostIndexed;
+
+  return BasicTTIImplBase::getPreferredAddressingMode(L, SE);
+}
+
 bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                                  const TargetTransformInfo::LSRCost &C2) {
   // RISC-V specific here are "instruction number 1st priority".
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index bd90bfed6e2c9..9b364391f0fa4 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -388,6 +388,9 @@ class RISCVTTIImpl : public BasicTTIImplBase {
     llvm_unreachable("unknown register class");
   }
 
+  TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L,
+                                                     ScalarEvolution *SE) const;
+
   unsigned getRegisterClassForType(bool Vector, Type *Ty = nullptr) const {
     if (Vector)
       return RISCVRegisterClass::VRRC;
diff --git a/llvm/test/CodeGen/RISCV/xcvmem-heuristic.ll b/llvm/test/CodeGen/RISCV/xcvmem-heuristic.ll
new file mode 100644
index 0000000000000..c8832bf49dd6a
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/xcvmem-heuristic.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O3 -mtriple=riscv32 -mattr=+m,+xcvmem -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=CHECK
+
+define i32 @test_heuristic(ptr %b, i32 %e, i1 %0) {
+; CHECK-LABEL: test_heuristic:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    add a3, a0, a1
+; CHECK-NEXT:    andi a2, a2, 1
+; CHECK-NEXT:  .LBB0_1: # %loop
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    cv.lbu a1, (a3), 1
+; CHECK-NEXT:    addi a0, a0, 1
+; CHECK-NEXT:    beqz a2, .LBB0_1
+; CHECK-NEXT:  # %bb.2: # %exit
+; CHECK-NEXT:    mv a0, a1
+; CHECK-NEXT:    ret
+entry:
+  %1 = getelementptr i8, ptr %b, i32 %e
+  br label %loop
+
+loop:                                             ; preds = %loop, %entry
+  %2 = phi ptr [ %b, %entry ], [ %7, %loop ]
+  %3 = phi ptr [ %1, %entry ], [ %8, %loop ]
+  %4 = load i8, ptr %2, align 1
+  %5 = load i8, ptr %3, align 1
+  %6 = zext i8 %5 to i32
+  %7 = getelementptr i8, ptr %2, i32 1
+  %8 = getelementptr i8, ptr %3, i32 1
+  br i1 %0, label %exit, label %loop
+
+exit:                                             ; preds = %loop
+  ret i32 %6
+}

From 2e3d3903e624edd5bb32cdd9a9654c907d4f1744 Mon Sep 17 00:00:00 2001
From: Karthika Devi C 
Date: Tue, 31 Dec 2024 11:57:34 +0530
Subject: [PATCH 203/567] [polly] Skip instructions of different function in
 isHoistableLoad. (#118963)

After patch 5ce47a5, some assert crashes occur in Polly. This issue
arises because an instruction from one function queries the Dominator
Tree (DT) of another function. To fix this, the `isHoistableLoad`
function now skips instructions that belong to different function while
iterating.
---
 polly/lib/Support/ScopHelper.cpp        |  3 ++-
 polly/test/ScopDetect/dom-tree-crash.ll | 31 +++++++++++++++++++++++++
 2 files changed, 33 insertions(+), 1 deletion(-)
 create mode 100644 polly/test/ScopDetect/dom-tree-crash.ll

diff --git a/polly/lib/Support/ScopHelper.cpp b/polly/lib/Support/ScopHelper.cpp
index 6d50e297ef715..d0e305a1bdcde 100644
--- a/polly/lib/Support/ScopHelper.cpp
+++ b/polly/lib/Support/ScopHelper.cpp
@@ -604,7 +604,8 @@ bool polly::isHoistableLoad(LoadInst *LInst, Region &R, LoopInfo &LI,
 
   for (auto *User : Ptr->users()) {
     auto *UserI = dyn_cast<Instruction>(User);
-    if (!UserI || !R.contains(UserI))
+    if (!UserI || UserI->getFunction() != LInst->getFunction() ||
+        !R.contains(UserI))
       continue;
     if (!UserI->mayWriteToMemory())
       continue;
diff --git a/polly/test/ScopDetect/dom-tree-crash.ll b/polly/test/ScopDetect/dom-tree-crash.ll
new file mode 100644
index 0000000000000..efc732c50e177
--- /dev/null
+++ b/polly/test/ScopDetect/dom-tree-crash.ll
@@ -0,0 +1,31 @@
+; RUN: opt %loadNPMPolly '-passes=print' -disable-output < %s 2>&1 | FileCheck %s
+
+; CHECK: Detected Scops in Function foo
+
+; This unit test case is to check if the following IR does not crash in isHoistableLoad function during Scop Detection.
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "aarch64-unknown-linux-gnueabi"
+
+define void @foo(ptr %block) {
+entry:
+  br label %for.body
+
+for.cond1.preheader:                              ; preds = %for.body
+  %0 = load ptr, ptr null, align 8
+  %1 = load i16, ptr %block, align 2
+  %2 = load i16, ptr %0, align 2
+  br label %foo.exit
+
+for.body:                                         ; preds = %for.body, %entry
+  br i1 false, label %for.cond1.preheader, label %for.body
+
+foo.exit:                                     ; preds = %for.cond1.preheader
+  ret void
+}
+
+define void @init_foo() {
+entry:
+  store ptr null, ptr null, align 8
+  ret void
+}

From 6f3d1d3018448fcad1071e2dc308632c19486f65 Mon Sep 17 00:00:00 2001
From: Tristan Ross 
Date: Mon, 30 Dec 2024 23:34:16 -0800
Subject: [PATCH 204/567] [libc] add __stack_chk_guard to generic (#121121)

---
 libc/src/compiler/generic/__stack_chk_fail.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/libc/src/compiler/generic/__stack_chk_fail.cpp b/libc/src/compiler/generic/__stack_chk_fail.cpp
index c76ec1407ad35..183cf9eb2cbf2 100644
--- a/libc/src/compiler/generic/__stack_chk_fail.cpp
+++ b/libc/src/compiler/generic/__stack_chk_fail.cpp
@@ -12,6 +12,8 @@
 
 extern "C" {
 
+uintptr_t __stack_chk_guard = static_cast<uintptr_t>(0xa9fff01234);
+
 void __stack_chk_fail(void) {
   LIBC_NAMESPACE::write_to_stderr("stack smashing detected\n");
   LIBC_NAMESPACE::abort();

From 5b5ef254a341768283035718262a9cad6cc743e8 Mon Sep 17 00:00:00 2001
From: Wang Pengcheng 
Date: Tue, 31 Dec 2024 16:18:13 +0800
Subject: [PATCH 205/567] [RISCV] Fix typo: vmv.x.i -> vmv.v.i

---
 llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 2f9beb0b3983c..0abb270edcabc 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1663,7 +1663,7 @@ InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty,
     return 0;
 
   if (OpInfo.isUniform())
-    // vmv.x.i, vmv.v.x, or vfmv.v.f
+    // vmv.v.i, vmv.v.x, or vfmv.v.f
     // We ignore the cost of the scalar constant materialization to be consistent
     // with how we treat scalar constants themselves just above.
     return 1;

From e3fe41cdf5583d3a2f7454c76fa5cadccdccaf22 Mon Sep 17 00:00:00 2001
From: Wang Pengcheng 
Date: Tue, 31 Dec 2024 17:22:08 +0800
Subject: [PATCH 206/567] [RISCV] Add missing ReadFMA16Addend in
 UnsupportedSchedZfh

---
 llvm/lib/Target/RISCV/RISCVSchedule.td | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/lib/Target/RISCV/RISCVSchedule.td b/llvm/lib/Target/RISCV/RISCVSchedule.td
index 7946a746efd02..ceaeb85d421ff 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedule.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedule.td
@@ -237,6 +237,7 @@ def : ReadAdvance;
 def : ReadAdvance;
 def : ReadAdvance;
 def : ReadAdvance;
+def : ReadAdvance<ReadFMA16Addend, 0>;
 def : ReadAdvance;
 def : ReadAdvance;
 def : ReadAdvance;

From f035351af785b7349ab7bcd55149c781ceca24cb Mon Sep 17 00:00:00 2001
From: Julian Nagele 
Date: Tue, 31 Dec 2024 10:24:48 +0100
Subject: [PATCH 207/567] [SCEV] Make sure starting block is marked as visited
 when recursively collecting loop guards. (#120749)

When `collectFromBlock` is called without a predecessor (in particular
for loops that don't have a unique predecessor outside the loop) we
never start climbing the predecessor chain, and thus don't mark the
starting block as visited.

Fixes https://github.com/llvm/llvm-project/issues/120615.
---
 llvm/lib/Analysis/ScalarEvolution.cpp         |  1 +
 ...t-guard-info-with-multiple-predecessors.ll | 26 +++++++++++++++++++
 2 files changed, 27 insertions(+)

diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 8ab56025546e6..b5668a14a4a21 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -15765,6 +15765,7 @@ void ScalarEvolution::LoopGuards::collectFromBlock(
   // original header.
   // TODO: share this logic with isLoopEntryGuardedByCond.
   unsigned NumCollectedConditions = 0;
+  VisitedBlocks.insert(Block);
   std::pair<const BasicBlock *, const BasicBlock *> Pair(Pred, Block);
   for (; Pair.first;
        Pair = SE.getPredecessorWithUniqueSuccessorForBB(Pair.first)) {
diff --git a/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info-with-multiple-predecessors.ll b/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info-with-multiple-predecessors.ll
index 81fe96a2f30c0..46dccf454f21a 100644
--- a/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info-with-multiple-predecessors.ll
+++ b/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info-with-multiple-predecessors.ll
@@ -310,3 +310,29 @@ inner.header:
 exit:
   ret void
 }
+
+; Checks correct traversal for loops without a unique predecessor
+; outside the loop.
+define void @pr120615() {
+; CHECK-LABEL: pr120615
+; CHECK-NEXT:  Determining loop execution counts for: @pr120615
+; CHECK-NEXT:  Loop %header: backedge-taken count is i32 0
+; CHECK-NEXT:  Loop %header: constant max backedge-taken count is i32 0
+; CHECK-NEXT:  Loop %header: symbolic max backedge-taken count is i32 0
+; CHECK-NEXT:  Loop %header: Trip multiple is 1
+entry:
+  br label %header
+
+bb:
+  br label %header
+
+header:
+  %0 = phi i32 [ %1, %header ], [ 0, %bb ], [ 0, %entry ]
+  %1 = add i32 %0, 1
+  %icmp = icmp slt i32 %0, 0
+  br i1 %icmp, label %header, label %exit
+
+exit:
+  ret void
+
+}

From b35d3453ddf14e9564a6b65bd325051f4492311c Mon Sep 17 00:00:00 2001
From: David Green 
Date: Tue, 31 Dec 2024 11:07:42 +0000
Subject: [PATCH 208/567] [AArch64] Add an option for
 sve-prefer-fixed-over-scalable-if-equal. NFC

Add a new option to control preferFixedOverScalableIfEqualCost, useful for
testing.
---
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 9 +++++++++
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h   | 4 +---
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 0566a87590012..515764c915bf4 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -35,6 +35,9 @@ using namespace llvm::PatternMatch;
 static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
                                                cl::init(true), cl::Hidden);
 
+static cl::opt<bool> SVEPreferFixedOverScalableIfEqualCost(
+    "sve-prefer-fixed-over-scalable-if-equal", cl::Hidden);
+
 static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
                                            cl::Hidden);
 
@@ -4919,6 +4922,12 @@ static bool containsDecreasingPointers(Loop *TheLoop,
   return false;
 }
 
+bool AArch64TTIImpl::preferFixedOverScalableIfEqualCost() const {
+  if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences())
+    return SVEPreferFixedOverScalableIfEqualCost;
+  return ST->useFixedOverScalableIfEqualCost();
+}
+
 unsigned AArch64TTIImpl::getEpilogueVectorizationMinVF() const {
   return ST->getEpilogueVectorizationMinVF();
 }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 83b86e31565e4..214fb4e352eeb 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -387,9 +387,7 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
     return TailFoldingStyle::DataWithoutLaneMask;
   }
 
-  bool preferFixedOverScalableIfEqualCost() const {
-    return ST->useFixedOverScalableIfEqualCost();
-  }
+  bool preferFixedOverScalableIfEqualCost() const;
 
   unsigned getEpilogueVectorizationMinVF() const;
 

From f0d60170cc501447c999569db8fe91aacaad5fe2 Mon Sep 17 00:00:00 2001
From: Timm Baeder 
Date: Tue, 31 Dec 2024 12:14:27 +0100
Subject: [PATCH 209/567] [clang][bytecode] Check memove/memcpy for available
 elements (#121383)

Both destination and source pointer need to have at least as many
elements as requested.
---
 clang/lib/AST/ByteCode/InterpBuiltin.cpp      | 53 +++++++++++++------
 clang/test/AST/ByteCode/builtin-functions.cpp |  9 ++++
 2 files changed, 47 insertions(+), 15 deletions(-)

diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 2ae91feb2d9e8..d0d8b03deab26 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -17,6 +17,7 @@
 #include "clang/Basic/Builtins.h"
 #include "clang/Basic/TargetBuiltins.h"
 #include "clang/Basic/TargetInfo.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/SipHash.h"
 
 namespace clang {
@@ -1837,6 +1838,7 @@ static bool interp__builtin_memcpy(InterpState &S, CodePtr OpPC,
   assert(Call->getNumArgs() == 3);
   unsigned ID = Func->getBuiltinID();
   Pointer DestPtr = getParam(Frame, 0);
+  const ASTContext &ASTCtx = S.getASTContext();
   const Pointer &SrcPtr = getParam(Frame, 1);
   const APSInt &Size =
       peekToAPSInt(S.Stk, *S.getContext().classify(Call->getArg(2)));
@@ -1857,34 +1859,55 @@ static bool interp__builtin_memcpy(InterpState &S, CodePtr OpPC,
     Pointer DiagPtr = (SrcPtr.isZero() ? SrcPtr : DestPtr);
     S.FFDiag(S.Current->getSource(OpPC), diag::note_constexpr_memcpy_null)
         << /*IsMove=*/Move << /*IsWchar=*/false << !SrcPtr.isZero()
-        << DiagPtr.toDiagnosticString(S.getASTContext());
+        << DiagPtr.toDiagnosticString(ASTCtx);
     return false;
   }
 
-  QualType ElemType;
-  if (DestPtr.getFieldDesc()->isArray())
-    ElemType = DestPtr.getFieldDesc()->getElemQualType();
-  else
-    ElemType = DestPtr.getType();
+  QualType DestElemType;
+  size_t RemainingDestElems;
+  if (DestPtr.getFieldDesc()->isArray()) {
+    DestElemType = DestPtr.getFieldDesc()->getElemQualType();
+    RemainingDestElems = (DestPtr.getNumElems() - DestPtr.getIndex());
+  } else {
+    DestElemType = DestPtr.getType();
+    RemainingDestElems = 1;
+  }
+  unsigned DestElemSize = ASTCtx.getTypeSizeInChars(DestElemType).getQuantity();
 
-  unsigned ElemSize =
-      S.getASTContext().getTypeSizeInChars(ElemType).getQuantity();
-  if (Size.urem(ElemSize) != 0) {
+  if (Size.urem(DestElemSize) != 0) {
     S.FFDiag(S.Current->getSource(OpPC),
              diag::note_constexpr_memcpy_unsupported)
-        << Move << /*IsWchar=*/false << 0 << ElemType << Size << ElemSize;
+        << Move << /*IsWchar=*/false << 0 << DestElemType << Size
+        << DestElemSize;
     return false;
   }
 
   QualType SrcElemType;
-  if (SrcPtr.getFieldDesc()->isArray())
+  size_t RemainingSrcElems;
+  if (SrcPtr.getFieldDesc()->isArray()) {
     SrcElemType = SrcPtr.getFieldDesc()->getElemQualType();
-  else
+    RemainingSrcElems = (SrcPtr.getNumElems() - SrcPtr.getIndex());
+  } else {
     SrcElemType = SrcPtr.getType();
+    RemainingSrcElems = 1;
+  }
+  unsigned SrcElemSize = ASTCtx.getTypeSizeInChars(SrcElemType).getQuantity();
 
-  if (!S.getASTContext().hasSameUnqualifiedType(ElemType, SrcElemType)) {
+  if (!ASTCtx.hasSameUnqualifiedType(DestElemType, SrcElemType)) {
     S.FFDiag(S.Current->getSource(OpPC), diag::note_constexpr_memcpy_type_pun)
-        << Move << SrcElemType << ElemType;
+        << Move << SrcElemType << DestElemType;
+    return false;
+  }
+
+  // Check if we have enough elements to read from and write to.
+  size_t RemainingDestBytes = RemainingDestElems * DestElemSize;
+  size_t RemainingSrcBytes = RemainingSrcElems * SrcElemSize;
+  if (Size.ugt(RemainingDestBytes) || Size.ugt(RemainingSrcBytes)) {
+    APInt N = Size.udiv(DestElemSize);
+    S.FFDiag(S.Current->getSource(OpPC),
+             diag::note_constexpr_memcpy_unsupported)
+        << Move << /*IsWChar*/ false << (Size.ugt(RemainingSrcBytes) ? 1 : 2)
+        << DestElemType << toString(N, 10, /*Signed=*/false);
     return false;
   }
 
@@ -1905,7 +1928,7 @@ static bool interp__builtin_memcpy(InterpState &S, CodePtr OpPC,
   // As a last resort, reject dummy pointers.
   if (DestPtr.isDummy() || SrcPtr.isDummy())
     return false;
-  assert(Size.getZExtValue() % ElemSize == 0);
+  assert(Size.getZExtValue() % DestElemSize == 0);
   if (!DoMemcpy(S, OpPC, SrcPtr, DestPtr, Bytes(Size.getZExtValue()).toBits()))
     return false;
 
diff --git a/clang/test/AST/ByteCode/builtin-functions.cpp b/clang/test/AST/ByteCode/builtin-functions.cpp
index c1fd1bc138150..b0f8ea2e55ee0 100644
--- a/clang/test/AST/ByteCode/builtin-functions.cpp
+++ b/clang/test/AST/ByteCode/builtin-functions.cpp
@@ -1244,6 +1244,15 @@ namespace BuiltinMemcpy {
   }
   static_assert(cpyptr());
 
+#ifndef __AVR__
+  constexpr int test_memmove(int a, int b, int n) {
+    int arr[4] = {1, 2, 3, 4};
+    __builtin_memmove(arr + a, arr + b, n); // both-note {{destination is not a contiguous array of at least 3 elements of type 'int'}}
+    return result(arr);
+  }
+  static_assert(test_memmove(2, 0, 12) == 4234); // both-error {{constant}} \
+                                                 // both-note {{in call}}
+#endif
 }
 
 namespace Memcmp {

From 31613de9cf22b2915cb39bfb043d957d513bd1cd Mon Sep 17 00:00:00 2001
From: Matthias Springer 
Date: Tue, 31 Dec 2024 12:44:50 +0100
Subject: [PATCH 210/567] [mlir][ArmSME] Migrate `arm-sme-vector-legalization`
 to dialect conversion (#121101)

Use the regular dialect conversion driver instead of the 1:N dialect
conversion driver. The 1:N dialect conversion driver will be removed
soon.
---
 .../ArmSME/Transforms/VectorLegalization.cpp  | 94 +++++++++++--------
 1 file changed, 56 insertions(+), 38 deletions(-)

diff --git a/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp b/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp
index 61767f3b21c9c..12c65a72babcb 100644
--- a/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp
+++ b/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp
@@ -17,7 +17,7 @@
 #include "mlir/Dialect/ArmSME/Transforms/Passes.h"
 #include "mlir/Dialect/ArmSME/Utils/Utils.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/Dialect/Func/Transforms/OneToNFuncConversions.h"
+#include "mlir/Dialect/Func/Transforms/FuncConversions.h"
 #include "mlir/Dialect/Index/IR/IndexDialect.h"
 #include "mlir/Dialect/Index/IR/IndexOps.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
@@ -25,7 +25,8 @@
 #include "mlir/Dialect/SCF/Transforms/Patterns.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/Dialect/Vector/Utils/VectorUtils.h"
-#include "mlir/Transforms/OneToNTypeConversion.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 
 #define DEBUG_TYPE "arm-sme-vector-legalization"
 
@@ -172,12 +173,12 @@ int getNumberOfSMETilesForVectorType(VectorType type) {
 /// Legalize `arith.constant dense` splat operations to fit within SME
 /// tiles by decomposing them into tile-sized operations.
 struct LegalizeArithConstantOpsByDecomposition
-    : public OneToNOpConversionPattern<arith::ConstantOp> {
-  using OneToNOpConversionPattern::OneToNOpConversionPattern;
+    : public OpConversionPattern<arith::ConstantOp> {
+  using OpConversionPattern::OpConversionPattern;
 
   LogicalResult
   matchAndRewrite(arith::ConstantOp constantOp, OpAdaptor adaptor,
-                  OneToNPatternRewriter &rewriter) const override {
+                  ConversionPatternRewriter &rewriter) const override {
     auto vectorType = dyn_cast<VectorType>(constantOp.getType());
     auto denseAttr = dyn_cast<DenseElementsAttr>(constantOp.getValueAttr());
     if (!vectorType || !denseAttr || !denseAttr.isSplat())
@@ -191,8 +192,8 @@ struct LegalizeArithConstantOpsByDecomposition
     auto tileCount = getNumberOfSMETilesForVectorType(vectorType);
     auto tileSplat = rewriter.create<arith::ConstantOp>(
         constantOp.getLoc(), denseAttr.resizeSplat(smeTileType));
-    rewriter.replaceOp(constantOp, SmallVector<Value>(tileCount, tileSplat),
-                       adaptor.getResultMapping());
+    SmallVector<Value> repl(tileCount, tileSplat);
+    rewriter.replaceOpWithMultiple(constantOp, {repl});
 
     return success();
   }
@@ -201,12 +202,13 @@ struct LegalizeArithConstantOpsByDecomposition
 /// Legalize `vector.outerproduct` operations to fit within SME tiles by
 /// decomposing them into tile-sized operations.
 struct LegalizeVectorOuterProductOpsByDecomposition
-    : public OneToNOpConversionPattern<vector::OuterProductOp> {
-  using OneToNOpConversionPattern::OneToNOpConversionPattern;
+    : public OpConversionPattern<vector::OuterProductOp> {
+  using OpConversionPattern::OpConversionPattern;
 
   LogicalResult
-  matchAndRewrite(vector::OuterProductOp outerProductOp, OpAdaptor adaptor,
-                  OneToNPatternRewriter &rewriter) const override {
+  matchAndRewrite(vector::OuterProductOp outerProductOp,
+                  OneToNOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
     auto vectorType = outerProductOp.getResultVectorType();
     if (!isMultipleOfSMETileVectorType(vectorType))
       return rewriter.notifyMatchFailure(outerProductOp,
@@ -219,6 +221,7 @@ struct LegalizeVectorOuterProductOpsByDecomposition
       auto maskOp = outerProductOp.getMaskingOp();
       mask = maskOp.getMask();
       rootOp = maskOp;
+      rewriter.setInsertionPoint(rootOp);
     }
 
     if (!isSupportedMaskOp(mask))
@@ -248,7 +251,7 @@ struct LegalizeVectorOuterProductOpsByDecomposition
       resultSMETiles.push_back(maskedOuterProduct->getResult(0));
     }
 
-    rewriter.replaceOp(rootOp, resultSMETiles, adaptor.getResultMapping());
+    rewriter.replaceOpWithMultiple(rootOp, {resultSMETiles});
     return success();
   }
 };
@@ -259,12 +262,12 @@ struct LegalizeVectorOuterProductOpsByDecomposition
 // (invalid). This pattern matches on `vector.mask` then calls into the
 // `vector.outerproduct` pattern to work around this issue.
 struct LegalizeMaskedVectorOuterProductOpsByDecomposition
-    : public OneToNOpConversionPattern<vector::MaskOp> {
-  using OneToNOpConversionPattern::OneToNOpConversionPattern;
+    : public OpConversionPattern<vector::MaskOp> {
+  using OpConversionPattern::OpConversionPattern;
 
   LogicalResult
-  matchAndRewrite(vector::MaskOp maskOp, OpAdaptor adaptor,
-                  OneToNPatternRewriter &rewriter) const override {
+  matchAndRewrite(vector::MaskOp maskOp, OneToNOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
     if (auto outerProductOp = llvm::dyn_cast_or_null<vector::OuterProductOp>(
             maskOp.getMaskableOp())) {
       LegalizeVectorOuterProductOpsByDecomposition pattern(*getTypeConverter(),
@@ -279,12 +282,12 @@ struct LegalizeMaskedVectorOuterProductOpsByDecomposition
 /// Legalize `vector.transfer_read` operations to fit within SME tiles by
 /// decomposing them into tile-sized operations.
 struct LegalizeTransferReadOpsByDecomposition
-    : public OneToNOpConversionPattern<vector::TransferReadOp> {
-  using OneToNOpConversionPattern::OneToNOpConversionPattern;
+    : public OpConversionPattern<vector::TransferReadOp> {
+  using OpConversionPattern::OpConversionPattern;
 
   LogicalResult
-  matchAndRewrite(vector::TransferReadOp readOp, OpAdaptor adaptor,
-                  OneToNPatternRewriter &rewriter) const override {
+  matchAndRewrite(vector::TransferReadOp readOp, OneToNOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
     auto vectorType = readOp.getVectorType();
     if (!isMultipleOfSMETileVectorType(vectorType))
       return rewriter.notifyMatchFailure(readOp,
@@ -319,7 +322,7 @@ struct LegalizeTransferReadOpsByDecomposition
       resultSMETiles.push_back(smeRead);
     }
 
-    rewriter.replaceOp(readOp, resultSMETiles, adaptor.getResultMapping());
+    rewriter.replaceOpWithMultiple(readOp, {resultSMETiles});
     return success();
   }
 };
@@ -327,12 +330,12 @@ struct LegalizeTransferReadOpsByDecomposition
 /// Legalize `vector.transfer_write` operations to fit within SME tiles by
 /// decomposing them into tile-sized operations.
 struct LegalizeTransferWriteOpsByDecomposition
-    : public OneToNOpConversionPattern<vector::TransferWriteOp> {
-  using OneToNOpConversionPattern::OneToNOpConversionPattern;
+    : public OpConversionPattern<vector::TransferWriteOp> {
+  using OpConversionPattern::OpConversionPattern;
 
   LogicalResult
-  matchAndRewrite(vector::TransferWriteOp writeOp, OpAdaptor adaptor,
-                  OneToNPatternRewriter &rewriter) const override {
+  matchAndRewrite(vector::TransferWriteOp writeOp, OneToNOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
     auto vectorType = writeOp.getVectorType();
     if (!isMultipleOfSMETileVectorType(vectorType))
       return rewriter.notifyMatchFailure(writeOp,
@@ -409,12 +412,12 @@ struct LegalizeTransferWriteOpsByDecomposition
 /// }
 /// ```
 struct LegalizeMultiTileTransferWriteAsStoreLoop
-    : public OneToNOpConversionPattern<vector::TransferWriteOp> {
-  using OneToNOpConversionPattern::OneToNOpConversionPattern;
+    : public OpConversionPattern<vector::TransferWriteOp> {
+  using OpConversionPattern::OpConversionPattern;
 
   LogicalResult
-  matchAndRewrite(vector::TransferWriteOp writeOp, OpAdaptor adaptor,
-                  OneToNPatternRewriter &rewriter) const override {
+  matchAndRewrite(vector::TransferWriteOp writeOp, OneToNOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
     if (writeOp.hasPureTensorSemantics())
       return rewriter.notifyMatchFailure(
           writeOp, "TODO: tensor semantics are unsupported");
@@ -936,10 +939,16 @@ struct VectorLegalizationPass
           return success();
         });
 
-    patterns.add(context);
+    // Apply preprocessing patterns.
+    RewritePatternSet rewritePatterns(context);
+    rewritePatterns.add(context);
+    if (failed(
+            applyPatternsGreedily(getOperation(), std::move(rewritePatterns))))
+      return signalPassFailure();
+
     // Note: These two patterns are added with a high benefit to ensure:
     //  - Masked outer products are handled before unmasked ones
     //  - Multi-tile writes are lowered as a store loop (if possible)
@@ -950,11 +959,20 @@ struct VectorLegalizationPass
                  LegalizeVectorOuterProductOpsByDecomposition,
                  LegalizeTransferReadOpsByDecomposition,
                  LegalizeTransferWriteOpsByDecomposition>(converter, context);
-    populateFuncTypeConversionPatterns(converter, patterns);
-    scf::populateSCFStructuralOneToNTypeConversions(converter, patterns);
-
-    if (failed(applyPartialOneToNConversion(getOperation(), converter,
-                                            std::move(patterns))))
+    populateFunctionOpInterfaceTypeConversionPattern<func::FuncOp>(patterns,
+                                                                   converter);
+    populateCallOpTypeConversionPattern(patterns, converter);
+    populateReturnOpTypeConversionPattern(patterns, converter);
+    scf::populateSCFStructuralTypeConversions(converter, patterns);
+
+    ConversionTarget target(getContext());
+    target.markUnknownOpDynamicallyLegal(
+        [&](Operation *op) { return converter.isLegal(op); });
+    target.addDynamicallyLegalOp<func::FuncOp>([&](func::FuncOp op) {
+      return converter.isSignatureLegal(op.getFunctionType());
+    });
+    if (failed(applyPartialConversion(getOperation(), target,
+                                      std::move(patterns))))
       return signalPassFailure();
   }
 };

From 80ecbaa3c0ae02cf3f7005bfebef28003c6f8d0e Mon Sep 17 00:00:00 2001
From: Matthias Springer 
Date: Tue, 31 Dec 2024 13:11:19 +0100
Subject: [PATCH 211/567] [mlir][Transforms] Mark 1:N conversion driver as
 deprecated (#121102)

The 1:N conversion driver will be removed soon.

Note for LLVM integration: Please migrate your code base to the regular dialect conversion driver.
---
 .../mlir/Dialect/SCF/Transforms/Patterns.h       |  4 ++++
 .../mlir/Transforms/OneToNTypeConversion.h       | 16 ++++++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/mlir/include/mlir/Dialect/SCF/Transforms/Patterns.h b/mlir/include/mlir/Dialect/SCF/Transforms/Patterns.h
index b87407d302a82..9c1479d28c305 100644
--- a/mlir/include/mlir/Dialect/SCF/Transforms/Patterns.h
+++ b/mlir/include/mlir/Dialect/SCF/Transforms/Patterns.h
@@ -66,6 +66,10 @@ void populateSCFStructuralTypeConversionTarget(
 /// Populates the provided pattern set with patterns that do 1:N type
 /// conversions on (some) SCF ops. This is intended to be used with
 /// applyPartialOneToNConversion.
+/// FIXME: The 1:N dialect conversion is deprecated and will be removed soon.
+/// 1:N support has been added to the regular dialect conversion driver.
+LLVM_DEPRECATED("Use populateSCFStructuralTypeConversions() instead",
+                "populateSCFStructuralTypeConversions")
 void populateSCFStructuralOneToNTypeConversions(
     const TypeConverter &typeConverter, RewritePatternSet &patterns);
 
diff --git a/mlir/include/mlir/Transforms/OneToNTypeConversion.h b/mlir/include/mlir/Transforms/OneToNTypeConversion.h
index 7b4dd65cbff7b..9c74bf916d971 100644
--- a/mlir/include/mlir/Transforms/OneToNTypeConversion.h
+++ b/mlir/include/mlir/Transforms/OneToNTypeConversion.h
@@ -6,6 +6,9 @@
 //
 //===----------------------------------------------------------------------===//
 //
+// Note: The 1:N dialect conversion is deprecated and will be removed soon.
+// 1:N support has been added to the regular dialect conversion driver.
+//
 // This file provides utils for implementing (poor-man's) dialect conversion
 // passes with 1:N type conversions.
 //
@@ -119,6 +122,10 @@ class OneToNPatternRewriter : public PatternRewriter {
   /// types must be the same as the result types of the op) and the new values
   /// (i.e., the converted types must be the same as the types of the new
   /// values).
+  /// FIXME: The 1:N dialect conversion is deprecated and will be removed soon.
+  /// 1:N support has been added to the regular dialect conversion driver.
+  LLVM_DEPRECATED("Use replaceOpWithMultiple() instead",
+                  "replaceOpWithMultiple")
   void replaceOp(Operation *op, ValueRange newValues,
                  const OneToNTypeMapping &resultMapping);
   using PatternRewriter::replaceOp;
@@ -251,6 +258,10 @@ class OneToNOpConversionPattern : public OneToNConversionPattern {
 /// or illegal types; the function simply applies the given patterns and does
 /// not fail if some ops or types remain unconverted (i.e., the conversion is
 /// only "partial").
+/// FIXME: The 1:N dialect conversion is deprecated and will be removed soon.
+/// 1:N support has been added to the regular dialect conversion driver.
+LLVM_DEPRECATED("Use applyPartialConversion() instead",
+                "applyPartialConversion")
 LogicalResult
 applyPartialOneToNConversion(Operation *op, TypeConverter &typeConverter,
                              const FrozenRewritePatternSet &patterns);
@@ -259,6 +270,11 @@ applyPartialOneToNConversion(Operation *op, TypeConverter &typeConverter,
 /// FunctionOpInterface op with the given type converter. This only supports
 /// ops which use FunctionType to represent their type. This is intended to be
 /// used with the 1:N dialect conversion.
+/// FIXME: The 1:N dialect conversion is deprecated and will be removed soon.
+/// 1:N support has been added to the regular dialect conversion driver.
+LLVM_DEPRECATED(
+    "Use populateFunctionOpInterfaceTypeConversionPattern() instead",
+    "populateFunctionOpInterfaceTypeConversionPattern")
 void populateOneToNFunctionOpInterfaceTypeConversionPattern(
     StringRef functionLikeOpName, const TypeConverter &converter,
     RewritePatternSet &patterns);

From 1d5154663509b6200038a2f0b0ac958ea556fa9e Mon Sep 17 00:00:00 2001
From: Zibi Sarbinowski 
Date: Tue, 31 Dec 2024 07:24:59 -0500
Subject: [PATCH 212/567] [SystemZ][z/OS] Open YAML files for read as text
 files (#121340)

This patch makes sure YAML files are opened for reading as text file to
trigger auto-conversion from EBCDIC encoding into expected ASCII
encoding on z/OS platform. This is required to fix the following lit
tests:

```
LLVM :: tools/llvm-gsymutil/ARM_AArch64/macho-gsym-callsite-info-exe.yaml
LLVM :: tools/llvm-gsymutil/ARM_AArch64/macho-gsym-callsite-info-obj.test
LLVM :: tools/llvm-gsymutil/ARM_AArch64/macho-gsym-callsite-info-dsym.yaml
LLVM :: Transforms/PGOProfile/memprof_undrift_missing_leaf.ll
```
---
 llvm/lib/DebugInfo/GSYM/CallSiteInfo.cpp | 2 +-
 llvm/lib/ProfileData/MemProfReader.cpp   | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/DebugInfo/GSYM/CallSiteInfo.cpp b/llvm/lib/DebugInfo/GSYM/CallSiteInfo.cpp
index 85b41e2899131..c112c0bc3ddc9 100644
--- a/llvm/lib/DebugInfo/GSYM/CallSiteInfo.cpp
+++ b/llvm/lib/DebugInfo/GSYM/CallSiteInfo.cpp
@@ -151,7 +151,7 @@ LLVM_YAML_IS_SEQUENCE_VECTOR(FunctionYAML)
 
 Error CallSiteInfoLoader::loadYAML(StringRef YAMLFile) {
   // Step 1: Read YAML file
-  auto BufferOrError = MemoryBuffer::getFile(YAMLFile);
+  auto BufferOrError = MemoryBuffer::getFile(YAMLFile, /*IsText=*/true);
   if (!BufferOrError)
     return errorCodeToError(BufferOrError.getError());
 
diff --git a/llvm/lib/ProfileData/MemProfReader.cpp b/llvm/lib/ProfileData/MemProfReader.cpp
index 10c36f25c4b79..6a4fecd5ae05e 100644
--- a/llvm/lib/ProfileData/MemProfReader.cpp
+++ b/llvm/lib/ProfileData/MemProfReader.cpp
@@ -754,7 +754,7 @@ Error RawMemProfReader::readNextRecord(
 
 Expected<std::unique_ptr<YAMLMemProfReader>>
 YAMLMemProfReader::create(const Twine &Path) {
-  auto BufferOr = MemoryBuffer::getFileOrSTDIN(Path);
+  auto BufferOr = MemoryBuffer::getFileOrSTDIN(Path, /*IsText=*/true);
   if (std::error_code EC = BufferOr.getError())
     return report(errorCodeToError(EC), Path.getSingleStringRef());
 
@@ -770,7 +770,7 @@ YAMLMemProfReader::create(std::unique_ptr<MemoryBuffer> Buffer) {
 }
 
 bool YAMLMemProfReader::hasFormat(const StringRef Path) {
-  auto BufferOr = MemoryBuffer::getFileOrSTDIN(Path);
+  auto BufferOr = MemoryBuffer::getFileOrSTDIN(Path, /*IsText=*/true);
   if (!BufferOr)
     return false;
 

From ddef380cd6c30668cc6f6d952b4c045f724f8d57 Mon Sep 17 00:00:00 2001
From: Florian Hahn 
Date: Tue, 31 Dec 2024 13:23:19 +0000
Subject: [PATCH 213/567] [VPlan] Move simplifyRecipe(s) definitions up to
 allow re-use (NFC)

Move definitions to allow easy reuse in
https://github.com/llvm/llvm-project/pull/108378.
---
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 290 +++++++++---------
 1 file changed, 145 insertions(+), 145 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 1f5acf996a772..89aab71905a29 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -662,6 +662,151 @@ static void recursivelyDeleteDeadRecipes(VPValue *V) {
   }
 }
 
+/// Try to simplify recipe \p R.
+static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
+  using namespace llvm::VPlanPatternMatch;
+
+  if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
+    // Try to remove redundant blend recipes.
+    SmallPtrSet<VPValue *, 4> UniqueValues;
+    if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
+      UniqueValues.insert(Blend->getIncomingValue(0));
+    for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
+      if (!match(Blend->getMask(I), m_False()))
+        UniqueValues.insert(Blend->getIncomingValue(I));
+
+    if (UniqueValues.size() == 1) {
+      Blend->replaceAllUsesWith(*UniqueValues.begin());
+      Blend->eraseFromParent();
+      return;
+    }
+
+    if (Blend->isNormalized())
+      return;
+
+    // Normalize the blend so its first incoming value is used as the initial
+    // value with the others blended into it.
+
+    unsigned StartIndex = 0;
+    for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
+      // If a value's mask is used only by the blend then it can be deadcoded.
+      // TODO: Find the most expensive mask that can be deadcoded, or a mask
+      // that's used by multiple blends where it can be removed from them all.
+      VPValue *Mask = Blend->getMask(I);
+      if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
+        StartIndex = I;
+        break;
+      }
+    }
+
+    SmallVector<VPValue *, 4> OperandsWithMask;
+    OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
+
+    for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
+      if (I == StartIndex)
+        continue;
+      OperandsWithMask.push_back(Blend->getIncomingValue(I));
+      OperandsWithMask.push_back(Blend->getMask(I));
+    }
+
+    auto *NewBlend = new VPBlendRecipe(
+        cast<PHINode>(Blend->getUnderlyingValue()), OperandsWithMask);
+    NewBlend->insertBefore(&R);
+
+    VPValue *DeadMask = Blend->getMask(StartIndex);
+    Blend->replaceAllUsesWith(NewBlend);
+    Blend->eraseFromParent();
+    recursivelyDeleteDeadRecipes(DeadMask);
+    return;
+  }
+
+  VPValue *A;
+  if (match(&R, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
+    VPValue *Trunc = R.getVPSingleValue();
+    Type *TruncTy = TypeInfo.inferScalarType(Trunc);
+    Type *ATy = TypeInfo.inferScalarType(A);
+    if (TruncTy == ATy) {
+      Trunc->replaceAllUsesWith(A);
+    } else {
+      // Don't replace a scalarizing recipe with a widened cast.
+      if (isa<VPReplicateRecipe>(&R))
+        return;
+      if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
+
+        unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue()))
+                                 ? Instruction::SExt
+                                 : Instruction::ZExt;
+        auto *VPC =
+            new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
+        if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) {
+          // UnderlyingExt has distinct return type, used to retain legacy cost.
+          VPC->setUnderlyingValue(UnderlyingExt);
+        }
+        VPC->insertBefore(&R);
+        Trunc->replaceAllUsesWith(VPC);
+      } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
+        auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
+        VPC->insertBefore(&R);
+        Trunc->replaceAllUsesWith(VPC);
+      }
+    }
+#ifndef NDEBUG
+    // Verify that the cached type info for both A and its users is still
+    // accurate by comparing it to freshly computed types.
+    VPTypeAnalysis TypeInfo2(
+        R.getParent()->getPlan()->getCanonicalIV()->getScalarType());
+    assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
+    for (VPUser *U : A->users()) {
+      auto *R = cast(U);
+      for (VPValue *VPV : R->definedValues())
+        assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
+    }
+#endif
+  }
+
+  // Simplify (X && Y) || (X && !Y) -> X.
+  // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
+  // && (Y || Z) and (X || !X) into true. This requires queuing newly created
+  // recipes to be visited during simplification.
+  VPValue *X, *Y, *X1, *Y1;
+  if (match(&R,
+            m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
+                         m_LogicalAnd(m_VPValue(X1), m_Not(m_VPValue(Y1))))) &&
+      X == X1 && Y == Y1) {
+    R.getVPSingleValue()->replaceAllUsesWith(X);
+    R.eraseFromParent();
+    return;
+  }
+
+  if (match(&R, m_c_Mul(m_VPValue(A), m_SpecificInt(1))))
+    return R.getVPSingleValue()->replaceAllUsesWith(A);
+
+  if (match(&R, m_Not(m_Not(m_VPValue(A)))))
+    return R.getVPSingleValue()->replaceAllUsesWith(A);
+
+  // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
+  if ((match(&R,
+             m_DerivedIV(m_SpecificInt(0), m_VPValue(A), m_SpecificInt(1))) ||
+       match(&R,
+             m_DerivedIV(m_SpecificInt(0), m_SpecificInt(0), m_VPValue()))) &&
+      TypeInfo.inferScalarType(R.getOperand(1)) ==
+          TypeInfo.inferScalarType(R.getVPSingleValue()))
+    return R.getVPSingleValue()->replaceAllUsesWith(R.getOperand(1));
+}
+
+/// Try to simplify the recipes in \p Plan.
+static void simplifyRecipes(VPlan &Plan) {
+  ReversePostOrderTraversal> RPOT(
+      Plan.getEntry());
+  Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
+  VPTypeAnalysis TypeInfo(CanonicalIVType);
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(RPOT)) {
+    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+      simplifyRecipe(R, TypeInfo);
+    }
+  }
+}
+
 void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
                                          unsigned BestUF,
                                          PredicatedScalarEvolution &PSE) {
@@ -942,138 +1087,6 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
   }
 }
 
-/// Try to simplify recipe \p R.
-static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
-  using namespace llvm::VPlanPatternMatch;
-
-  if (auto *Blend = dyn_cast(&R)) {
-    // Try to remove redundant blend recipes.
-    SmallPtrSet UniqueValues;
-    if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
-      UniqueValues.insert(Blend->getIncomingValue(0));
-    for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
-      if (!match(Blend->getMask(I), m_False()))
-        UniqueValues.insert(Blend->getIncomingValue(I));
-
-    if (UniqueValues.size() == 1) {
-      Blend->replaceAllUsesWith(*UniqueValues.begin());
-      Blend->eraseFromParent();
-      return;
-    }
-
-    if (Blend->isNormalized())
-      return;
-
-    // Normalize the blend so its first incoming value is used as the initial
-    // value with the others blended into it.
-
-    unsigned StartIndex = 0;
-    for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
-      // If a value's mask is used only by the blend then is can be deadcoded.
-      // TODO: Find the most expensive mask that can be deadcoded, or a mask
-      // that's used by multiple blends where it can be removed from them all.
-      VPValue *Mask = Blend->getMask(I);
-      if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
-        StartIndex = I;
-        break;
-      }
-    }
-
-    SmallVector OperandsWithMask;
-    OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
-
-    for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
-      if (I == StartIndex)
-        continue;
-      OperandsWithMask.push_back(Blend->getIncomingValue(I));
-      OperandsWithMask.push_back(Blend->getMask(I));
-    }
-
-    auto *NewBlend = new VPBlendRecipe(
-        cast(Blend->getUnderlyingValue()), OperandsWithMask);
-    NewBlend->insertBefore(&R);
-
-    VPValue *DeadMask = Blend->getMask(StartIndex);
-    Blend->replaceAllUsesWith(NewBlend);
-    Blend->eraseFromParent();
-    recursivelyDeleteDeadRecipes(DeadMask);
-    return;
-  }
-
-  VPValue *A;
-  if (match(&R, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
-    VPValue *Trunc = R.getVPSingleValue();
-    Type *TruncTy = TypeInfo.inferScalarType(Trunc);
-    Type *ATy = TypeInfo.inferScalarType(A);
-    if (TruncTy == ATy) {
-      Trunc->replaceAllUsesWith(A);
-    } else {
-      // Don't replace a scalarizing recipe with a widened cast.
-      if (isa(&R))
-        return;
-      if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
-
-        unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue()))
-                                 ? Instruction::SExt
-                                 : Instruction::ZExt;
-        auto *VPC =
-            new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
-        if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) {
-          // UnderlyingExt has distinct return type, used to retain legacy cost.
-          VPC->setUnderlyingValue(UnderlyingExt);
-        }
-        VPC->insertBefore(&R);
-        Trunc->replaceAllUsesWith(VPC);
-      } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
-        auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
-        VPC->insertBefore(&R);
-        Trunc->replaceAllUsesWith(VPC);
-      }
-    }
-#ifndef NDEBUG
-    // Verify that the cached type info is for both A and its users is still
-    // accurate by comparing it to freshly computed types.
-    VPTypeAnalysis TypeInfo2(
-        R.getParent()->getPlan()->getCanonicalIV()->getScalarType());
-    assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
-    for (VPUser *U : A->users()) {
-      auto *R = cast(U);
-      for (VPValue *VPV : R->definedValues())
-        assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
-    }
-#endif
-  }
-
-  // Simplify (X && Y) || (X && !Y) -> X.
-  // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
-  // && (Y || Z) and (X || !X) into true. This requires queuing newly created
-  // recipes to be visited during simplification.
-  VPValue *X, *Y, *X1, *Y1;
-  if (match(&R,
-            m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
-                         m_LogicalAnd(m_VPValue(X1), m_Not(m_VPValue(Y1))))) &&
-      X == X1 && Y == Y1) {
-    R.getVPSingleValue()->replaceAllUsesWith(X);
-    R.eraseFromParent();
-    return;
-  }
-
-  if (match(&R, m_c_Mul(m_VPValue(A), m_SpecificInt(1))))
-    return R.getVPSingleValue()->replaceAllUsesWith(A);
-
-  if (match(&R, m_Not(m_Not(m_VPValue(A)))))
-    return R.getVPSingleValue()->replaceAllUsesWith(A);
-
-  // Remove redundant DerviedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
-  if ((match(&R,
-             m_DerivedIV(m_SpecificInt(0), m_VPValue(A), m_SpecificInt(1))) ||
-       match(&R,
-             m_DerivedIV(m_SpecificInt(0), m_SpecificInt(0), m_VPValue()))) &&
-      TypeInfo.inferScalarType(R.getOperand(1)) ==
-          TypeInfo.inferScalarType(R.getVPSingleValue()))
-    return R.getVPSingleValue()->replaceAllUsesWith(R.getOperand(1));
-}
-
 /// Move loop-invariant recipes out of the vector loop region in \p Plan.
 static void licm(VPlan &Plan) {
   VPBasicBlock *Preheader = Plan.getVectorPreheader();
@@ -1108,19 +1121,6 @@ static void licm(VPlan &Plan) {
   }
 }
 
-/// Try to simplify the recipes in \p Plan.
-static void simplifyRecipes(VPlan &Plan) {
-  ReversePostOrderTraversal> RPOT(
-      Plan.getEntry());
-  Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
-  VPTypeAnalysis TypeInfo(CanonicalIVType);
-  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(RPOT)) {
-    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
-      simplifyRecipe(R, TypeInfo);
-    }
-  }
-}
-
 void VPlanTransforms::truncateToMinimalBitwidths(
     VPlan &Plan, const MapVector &MinBWs) {
 #ifndef NDEBUG

From 6279d2e0f33ed1aa686aace48da6ccf912ab4b28 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim 
Date: Tue, 31 Dec 2024 11:57:47 +0000
Subject: [PATCH 214/567] AArch64ABIInfo::passAsAggregateType - don't directly
 dereference getAs<> result. NFC.

Reported by coverity static analyzer - we know the type is a BuiltinType so use castAs<>
---
 clang/lib/CodeGen/Targets/AArch64.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp
index ad7f405cc7255..0680b4828b6da 100644
--- a/clang/lib/CodeGen/Targets/AArch64.cpp
+++ b/clang/lib/CodeGen/Targets/AArch64.cpp
@@ -662,7 +662,7 @@ bool AArch64ABIInfo::isZeroLengthBitfieldPermittedInHomogeneousAggregate()
 
 bool AArch64ABIInfo::passAsAggregateType(QualType Ty) const {
   if (Kind == AArch64ABIKind::AAPCS && Ty->isSVESizelessBuiltinType()) {
-    const auto *BT = Ty->getAs();
+    const auto *BT = Ty->castAs();
     return !BT->isSVECount() &&
            getContext().getBuiltinVectorTypeInfo(BT).NumVectors > 1;
   }

From b195bb87e1a0120d8bc6f7fd7e6a7424bd664004 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim 
Date: Tue, 31 Dec 2024 14:42:39 +0000
Subject: [PATCH 215/567] [VectorCombine] scalarizeLoadExtract - consistently
 use LoadInst and ExtractElementInst specific operand getters. NFC

Noticed while investigating the hung builds reported after af83093933ca73bc82c33130f8bda9f1ae54aae2
---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 2460ccc61d84d..dd109637552c4 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1360,8 +1360,8 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
   if (!match(&I, m_Load(m_Value(Ptr))))
     return false;
 
-  auto *VecTy = cast(I.getType());
   auto *LI = cast(&I);
+  auto *VecTy = cast(LI->getType());
   if (LI->isVolatile() || !DL->typeSizeEqualsStoreSize(VecTy->getScalarType()))
     return false;
 
@@ -1401,7 +1401,8 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
       LastCheckedInst = UI;
     }
 
-    auto ScalarIdx = canScalarizeAccess(VecTy, UI->getOperand(1), &I, AC, DT);
+    auto ScalarIdx =
+        canScalarizeAccess(VecTy, UI->getIndexOperand(), LI, AC, DT);
     if (ScalarIdx.isUnsafe())
       return false;
     if (ScalarIdx.isSafeWithFreeze()) {
@@ -1409,7 +1410,7 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
       ScalarIdx.discard();
     }
 
-    auto *Index = dyn_cast(UI->getOperand(1));
+    auto *Index = dyn_cast(UI->getIndexOperand());
     OriginalCost +=
         TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
                                Index ? Index->getZExtValue() : -1);
@@ -1425,7 +1426,7 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
   // Replace extracts with narrow scalar loads.
   for (User *U : LI->users()) {
     auto *EI = cast(U);
-    Value *Idx = EI->getOperand(1);
+    Value *Idx = EI->getIndexOperand();
 
     // Insert 'freeze' for poison indexes.
     auto It = NeedFreeze.find(EI);

From d1e5e6735a845f1281f11389da1e5a55a0d2e87a Mon Sep 17 00:00:00 2001
From: Simon Pilgrim 
Date: Tue, 31 Dec 2024 15:25:12 +0000
Subject: [PATCH 216/567] [PhaseOrdering] Update test RUN lines to use
 `-passes="default"` to allow evaluation by DOS batch scripts. NFC.

`-passes='default'` isn't correctly parsed on DOS, so when update_test_checks.py runs a system call on the opt RUN line, it fails to evaluate properly - use `-passes="default"` instead.
---
 .../PhaseOrdering/AArch64/extra-unroll-simplifications.ll | 2 +-
 .../PhaseOrdering/AArch64/hoist-runtime-checks.ll         | 2 +-
 .../hoisting-sinking-required-for-vectorization.ll        | 2 +-
 .../PhaseOrdering/AArch64/indvars-vectorization.ll        | 2 +-
 llvm/test/Transforms/PhaseOrdering/AArch64/loopflatten.ll | 2 +-
 .../PhaseOrdering/AArch64/matrix-extract-insert.ll        | 2 +-
 .../PhaseOrdering/AArch64/memcpy-constant-size.ll         | 2 +-
 llvm/test/Transforms/PhaseOrdering/ARM/arm_add_q7.ll      | 2 +-
 llvm/test/Transforms/PhaseOrdering/ARM/arm_fill_q7.ll     | 2 +-
 llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll     | 2 +-
 llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll    | 2 +-
 llvm/test/Transforms/PhaseOrdering/ARM/mve-floatreduce.ll | 2 +-
 .../Transforms/PhaseOrdering/X86/excessive-unrolling.ll   | 2 +-
 .../Transforms/PhaseOrdering/X86/preserve-access-group.ll | 2 +-
 llvm/test/Transforms/PhaseOrdering/bitfield-bittests.ll   | 2 +-
 llvm/test/Transforms/PhaseOrdering/dae-dce.ll             | 2 +-
 .../deletion-of-loops-that-became-side-effect-free.ll     | 2 +-
 llvm/test/Transforms/PhaseOrdering/globalaa-retained.ll   | 2 +-
 .../Transforms/PhaseOrdering/instcombine-sroa-inttoptr.ll | 2 +-
 llvm/test/Transforms/PhaseOrdering/lifetime-sanitizer.ll  | 2 +-
 .../loop-rotation-vs-common-code-hoisting.ll              | 8 ++++----
 llvm/test/Transforms/PhaseOrdering/pr32544.ll             | 2 +-
 llvm/test/Transforms/PhaseOrdering/pr45682.ll             | 2 +-
 llvm/test/Transforms/PhaseOrdering/pr62311.ll             | 2 +-
 llvm/test/Transforms/PhaseOrdering/pr95152.ll             | 2 +-
 llvm/test/Transforms/PhaseOrdering/rotate.ll              | 2 +-
 ...implifycfg-switch-lowering-vs-correlatedpropagation.ll | 2 +-
 llvm/test/Transforms/PhaseOrdering/switch-sext.ll         | 2 +-
 llvm/test/Transforms/PhaseOrdering/switch_with_geps.ll    | 2 +-
 29 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll
index 6c45442bdcd3c..13ea35a87c312 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -passes='default' -S %s | FileCheck %s
+; RUN: opt -passes="default" -S %s | FileCheck %s
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 target triple = "arm64-apple-macosx11.0.0"
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll
index b2d6455a82944..a38413f26ec97 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -passes='default' -S %s | FileCheck %s
+; RUN: opt -passes="default" -S %s | FileCheck %s
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 target triple = "arm64-apple-macosx11.0.0"
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
index 82b1cf90b4720..f583a616dd375 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes='default' -S %s | FileCheck %s
+; RUN: opt -passes="default" -S %s | FileCheck %s
 
 target triple = "arm64-apple-darwin"
 
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll
index 2f61c89241fd4..801a8a05d82ed 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -passes='default' -S -o - %s | FileCheck %s
+; RUN: opt -passes="default" -S -o - %s | FileCheck %s
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 target triple = "arm64-apple-macosx14.0.0"
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/loopflatten.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/loopflatten.ll
index b14a36c8b3bcd..2703d2390ce52 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/loopflatten.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/loopflatten.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes='default' -enable-loop-flatten -loop-flatten-cost-threshold=3 -S %s | FileCheck %s
+; RUN: opt -passes="default" -enable-loop-flatten -loop-flatten-cost-threshold=3 -S %s | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64"
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll
index 7fccfeea39bb9..886e7a758d053 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes='default' -enable-matrix -S %s | FileCheck %s
+; RUN: opt -passes="default" -enable-matrix -S %s | FileCheck %s
 
 target triple = "arm64-apple-ios"
 
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/memcpy-constant-size.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/memcpy-constant-size.ll
index 10b07ad6e7491..d34063824fe79 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/memcpy-constant-size.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/memcpy-constant-size.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
-; RUN: opt %s -mtriple=arm64-apple-macosx -passes='default' -inline-threshold=2 -inline-call-penalty=5 -S | FileCheck %s
+; RUN: opt %s -mtriple=arm64-apple-macosx -passes="default" -inline-threshold=2 -inline-call-penalty=5 -S | FileCheck %s
 
 declare i64 @llvm.objectsize.i64.p0(ptr, i1, i1, i1)
 declare ptr @__memcpy_chk(ptr, ptr, i64, i64)
diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_add_q7.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_add_q7.ll
index b1d0c70abc558..76d9d14b7d14f 100644
--- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_add_q7.ll
+++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_add_q7.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes='default' -S | FileCheck %s
+; RUN: opt < %s -passes="default" -S | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv8.1m.main-arm-none-eabi"
diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_fill_q7.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_fill_q7.ll
index 5b7622bca6b5c..2ab6f2bdca8d6 100644
--- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_fill_q7.ll
+++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_fill_q7.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -O3 -S                                        | FileCheck --check-prefix=OLDPM %s
-; RUN: opt < %s -passes='default' -S | FileCheck --check-prefix=NEWPM %s
+; RUN: opt < %s -passes="default" -S | FileCheck --check-prefix=NEWPM %s
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv6m-none-none-eabi"
diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll
index aab787b2b2d2c..778f25f5620f2 100644
--- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll
+++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes='default' -S | FileCheck %s
+; RUN: opt < %s -passes="default" -S | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv6m-none-none-eabi"
diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
index 66a0771c8d373..9032c363eb936 100644
--- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
+++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes='default' -S | FileCheck %s
+; RUN: opt < %s -passes="default" -S | FileCheck %s
 
 ; This test after a lot of cleanup should produce pick a tail-predicated 8x
 ; vector loop. The 8x will be more profitable, to pick a VQDMULH.s16 instruction.
diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/mve-floatreduce.ll b/llvm/test/Transforms/PhaseOrdering/ARM/mve-floatreduce.ll
index 6cbba5ca2a6ad..664953a71c9a3 100644
--- a/llvm/test/Transforms/PhaseOrdering/ARM/mve-floatreduce.ll
+++ b/llvm/test/Transforms/PhaseOrdering/ARM/mve-floatreduce.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -O3 -S                   | FileCheck %s
-; RUN: opt < %s -passes='default' -S | FileCheck %s
+; RUN: opt < %s -passes="default" -S | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv8.1m.main-none-none-eabi"
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll b/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll
index 57a3d8175ba51..5a3742c7d85bc 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes='default' -unroll-runtime -S %s | FileCheck %s
+; RUN: opt -passes="default" -unroll-runtime -S %s | FileCheck %s
 
 target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx"
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/preserve-access-group.ll b/llvm/test/Transforms/PhaseOrdering/X86/preserve-access-group.ll
index fe5bba1ae9776..7bb22e2e9f5b5 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/preserve-access-group.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/preserve-access-group.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes='default' -S %s | FileCheck %s
+; RUN: opt -passes="default" -S %s | FileCheck %s
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/llvm/test/Transforms/PhaseOrdering/bitfield-bittests.ll b/llvm/test/Transforms/PhaseOrdering/bitfield-bittests.ll
index 2843a7e761234..126be02520623 100644
--- a/llvm/test/Transforms/PhaseOrdering/bitfield-bittests.ll
+++ b/llvm/test/Transforms/PhaseOrdering/bitfield-bittests.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -O3 -S < %s                    | FileCheck %s
-; RUN: opt -passes='default' -S < %s  | FileCheck %s
+; RUN: opt -passes="default" -S < %s  | FileCheck %s
 
 ; These are tests that check for set/clear bits in a bitfield based on PR37098:
 ; https://bugs.llvm.org/show_bug.cgi?id=37098
diff --git a/llvm/test/Transforms/PhaseOrdering/dae-dce.ll b/llvm/test/Transforms/PhaseOrdering/dae-dce.ll
index 7ff3c5dc5536f..7cdddd1e4232b 100644
--- a/llvm/test/Transforms/PhaseOrdering/dae-dce.ll
+++ b/llvm/test/Transforms/PhaseOrdering/dae-dce.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -passes='default' < %s | FileCheck %s --check-prefixes=CHECK,DEFAULT
+; RUN: opt -S -passes="default" < %s | FileCheck %s --check-prefixes=CHECK,DEFAULT
 ; RUN: opt -S -passes='lto' < %s | FileCheck %s --check-prefixes=CHECK,LTO
 
 declare void @llvm.trap()
diff --git a/llvm/test/Transforms/PhaseOrdering/deletion-of-loops-that-became-side-effect-free.ll b/llvm/test/Transforms/PhaseOrdering/deletion-of-loops-that-became-side-effect-free.ll
index 689f4a9798a75..641f216302146 100644
--- a/llvm/test/Transforms/PhaseOrdering/deletion-of-loops-that-became-side-effect-free.ll
+++ b/llvm/test/Transforms/PhaseOrdering/deletion-of-loops-that-became-side-effect-free.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes='default' -S < %s  | FileCheck %s --check-prefixes=ALL,O3
+; RUN: opt -passes="default" -S < %s  | FileCheck %s --check-prefixes=ALL,O3
 ; RUN: opt -passes='default' -S < %s  | FileCheck %s --check-prefixes=ALL,O2
 ; RUN: opt -passes='default' -S < %s  | FileCheck %s --check-prefixes=ALL,O1
 
diff --git a/llvm/test/Transforms/PhaseOrdering/globalaa-retained.ll b/llvm/test/Transforms/PhaseOrdering/globalaa-retained.ll
index 2139542c3d3bc..82a453d72079f 100644
--- a/llvm/test/Transforms/PhaseOrdering/globalaa-retained.ll
+++ b/llvm/test/Transforms/PhaseOrdering/globalaa-retained.ll
@@ -1,4 +1,4 @@
-; RUN: opt -passes='default' -S < %s | FileCheck %s
+; RUN: opt -passes="default" -S < %s | FileCheck %s
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64"
 
diff --git a/llvm/test/Transforms/PhaseOrdering/instcombine-sroa-inttoptr.ll b/llvm/test/Transforms/PhaseOrdering/instcombine-sroa-inttoptr.ll
index ba6c36abd24a1..cc20233a5fefc 100644
--- a/llvm/test/Transforms/PhaseOrdering/instcombine-sroa-inttoptr.ll
+++ b/llvm/test/Transforms/PhaseOrdering/instcombine-sroa-inttoptr.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -O3 -S                                        | FileCheck %s
-; RUN: opt < %s -passes='default' -S | FileCheck %s
+; RUN: opt < %s -passes="default" -S | FileCheck %s
 
 ; This is based on the following most basic C++ code:
 ;
diff --git a/llvm/test/Transforms/PhaseOrdering/lifetime-sanitizer.ll b/llvm/test/Transforms/PhaseOrdering/lifetime-sanitizer.ll
index 21fa234535cad..1239b18c07017 100644
--- a/llvm/test/Transforms/PhaseOrdering/lifetime-sanitizer.ll
+++ b/llvm/test/Transforms/PhaseOrdering/lifetime-sanitizer.ll
@@ -5,7 +5,7 @@
 ; RUN: opt < %s -passes='default' -S | FileCheck %s --check-prefixes=CHECK,NOOPT
 ; RUN: opt < %s -passes='default' -S | FileCheck %s --check-prefixes=CHECK,OPT
 ; RUN: opt < %s -passes='default' -S | FileCheck %s --check-prefixes=CHECK,OPT
-; RUN: opt < %s -passes='default' -S | FileCheck %s --check-prefixes=CHECK,OPT
+; RUN: opt < %s -passes="default" -S | FileCheck %s --check-prefixes=CHECK,OPT
 
 declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
 declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
diff --git a/llvm/test/Transforms/PhaseOrdering/loop-rotation-vs-common-code-hoisting.ll b/llvm/test/Transforms/PhaseOrdering/loop-rotation-vs-common-code-hoisting.ll
index bae3e269b6f07..c6b5e5f3ccae0 100644
--- a/llvm/test/Transforms/PhaseOrdering/loop-rotation-vs-common-code-hoisting.ll
+++ b/llvm/test/Transforms/PhaseOrdering/loop-rotation-vs-common-code-hoisting.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes='default' -rotation-max-header-size=0 -S < %s  | FileCheck %s --check-prefix=HOIST
-; RUN: opt -passes='default' -rotation-max-header-size=1 -S < %s  | FileCheck %s --check-prefix=HOIST
-; RUN: opt -passes='default' -rotation-max-header-size=2 -S < %s  | FileCheck %s --check-prefix=ROTATE
-; RUN: opt -passes='default' -rotation-max-header-size=3 -S < %s  | FileCheck %s --check-prefix=ROTATE
+; RUN: opt -passes="default" -rotation-max-header-size=0 -S < %s  | FileCheck %s --check-prefix=HOIST
+; RUN: opt -passes="default" -rotation-max-header-size=1 -S < %s  | FileCheck %s --check-prefix=HOIST
+; RUN: opt -passes="default" -rotation-max-header-size=2 -S < %s  | FileCheck %s --check-prefix=ROTATE
+; RUN: opt -passes="default" -rotation-max-header-size=3 -S < %s  | FileCheck %s --check-prefix=ROTATE
 
 ; This example is produced from a very basic C code:
 ;
diff --git a/llvm/test/Transforms/PhaseOrdering/pr32544.ll b/llvm/test/Transforms/PhaseOrdering/pr32544.ll
index 421260b102312..135084dbbc7d5 100644
--- a/llvm/test/Transforms/PhaseOrdering/pr32544.ll
+++ b/llvm/test/Transforms/PhaseOrdering/pr32544.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -O3 -S < %s                    | FileCheck %s
-; RUN: opt -passes='default' -S < %s  | FileCheck %s
+; RUN: opt -passes="default" -S < %s  | FileCheck %s
 
 define void @foo(i1 %which, i32 %a, i32 %b, ptr %result) {
 ; CHECK-LABEL: @foo(
diff --git a/llvm/test/Transforms/PhaseOrdering/pr45682.ll b/llvm/test/Transforms/PhaseOrdering/pr45682.ll
index 22305806a19c1..46ee19178e356 100644
--- a/llvm/test/Transforms/PhaseOrdering/pr45682.ll
+++ b/llvm/test/Transforms/PhaseOrdering/pr45682.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -O3 -S < %s                    | FileCheck %s
-; RUN: opt -passes='default' -S < %s  | FileCheck %s
+; RUN: opt -passes="default" -S < %s  | FileCheck %s
 
 define void @PR45682(i32 %x, i32 %y) {
 ; CHECK-LABEL: @PR45682(
diff --git a/llvm/test/Transforms/PhaseOrdering/pr62311.ll b/llvm/test/Transforms/PhaseOrdering/pr62311.ll
index 03276d83d8a01..027df7da6a74a 100644
--- a/llvm/test/Transforms/PhaseOrdering/pr62311.ll
+++ b/llvm/test/Transforms/PhaseOrdering/pr62311.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes='default' -S | FileCheck %s
+; RUN: opt < %s -passes="default" -S | FileCheck %s
 
 ; C++ version of test case
 ; #include 
diff --git a/llvm/test/Transforms/PhaseOrdering/pr95152.ll b/llvm/test/Transforms/PhaseOrdering/pr95152.ll
index 016460fed7c35..6941ea2aece92 100644
--- a/llvm/test/Transforms/PhaseOrdering/pr95152.ll
+++ b/llvm/test/Transforms/PhaseOrdering/pr95152.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -passes='default' < %s | FileCheck %s
+; RUN: opt -S -passes="default" < %s | FileCheck %s
 
 ; Make sure that interaction of "writable" with various passes does not
 ; result in the elimination of the store prior to @j().
diff --git a/llvm/test/Transforms/PhaseOrdering/rotate.ll b/llvm/test/Transforms/PhaseOrdering/rotate.ll
index 9ce196941df9c..9179edc4281eb 100644
--- a/llvm/test/Transforms/PhaseOrdering/rotate.ll
+++ b/llvm/test/Transforms/PhaseOrdering/rotate.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -O3 -S < %s                    | FileCheck %s
-; RUN: opt -passes='default' -S < %s  | FileCheck %s
+; RUN: opt -passes="default" -S < %s  | FileCheck %s
 
 ; This should become a single funnel shift through a combination
 ; of aggressive-instcombine, simplifycfg, and instcombine.
diff --git a/llvm/test/Transforms/PhaseOrdering/simplifycfg-switch-lowering-vs-correlatedpropagation.ll b/llvm/test/Transforms/PhaseOrdering/simplifycfg-switch-lowering-vs-correlatedpropagation.ll
index 03df1384ca40e..9da46bde96ef0 100644
--- a/llvm/test/Transforms/PhaseOrdering/simplifycfg-switch-lowering-vs-correlatedpropagation.ll
+++ b/llvm/test/Transforms/PhaseOrdering/simplifycfg-switch-lowering-vs-correlatedpropagation.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -passes='default' -S < %s | FileCheck %s
 ; RUN: opt -passes='default' -S < %s | FileCheck %s
-; RUN: opt -passes='default' -S < %s | FileCheck %s
+; RUN: opt -passes="default" -S < %s | FileCheck %s
 
 ; We are worse at propagating correlation facts when in select form
 ; as compared to the PHI form, so if we lower switches to early,
diff --git a/llvm/test/Transforms/PhaseOrdering/switch-sext.ll b/llvm/test/Transforms/PhaseOrdering/switch-sext.ll
index 0e352ba52f6a2..3fbb02d046293 100644
--- a/llvm/test/Transforms/PhaseOrdering/switch-sext.ll
+++ b/llvm/test/Transforms/PhaseOrdering/switch-sext.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -S -passes='default' < %s | FileCheck %s
+; RUN: opt -S -passes="default" < %s | FileCheck %s
 
 define i8 @test_switch_with_sext_phi(i8 %code) {
 ; CHECK-LABEL: define noundef i8 @test_switch_with_sext_phi(
diff --git a/llvm/test/Transforms/PhaseOrdering/switch_with_geps.ll b/llvm/test/Transforms/PhaseOrdering/switch_with_geps.ll
index d2f33f9c3b754..33266caf70923 100644
--- a/llvm/test/Transforms/PhaseOrdering/switch_with_geps.ll
+++ b/llvm/test/Transforms/PhaseOrdering/switch_with_geps.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
 ; RUN: opt -S -passes='default' < %s | FileCheck %s
 ; RUN: opt -S -passes='default' < %s | FileCheck %s
-; RUN: opt -S -passes='default' < %s | FileCheck %s
+; RUN: opt -S -passes="default" < %s | FileCheck %s
 
 target datalayout = "n64"
 

From 0b08e095cc05288d1209cf051988621f6935c940 Mon Sep 17 00:00:00 2001
From: Ivan Butygin 
Date: Tue, 31 Dec 2024 16:54:41 +0100
Subject: [PATCH 217/567] [mlir][nfc] GpuToROCDL: Remove some dead code
 (#121395)

---
 .../Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp   | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index d52a86987b1ce..a1cefe289a696 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -47,7 +47,6 @@
 
 #include "../GPUCommon/GPUOpsLowering.h"
 #include "../GPUCommon/IndexIntrinsicsOpLowering.h"
-#include "../GPUCommon/OpToFuncCallLowering.h"
 
 namespace mlir {
 #define GEN_PASS_DEF_CONVERTGPUOPSTOROCDLOPS
@@ -346,16 +345,6 @@ void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) {
   target.addLegalOp();
 }
 
-template 
-static void populateOpPatterns(const LLVMTypeConverter &converter,
-                               RewritePatternSet &patterns, StringRef f32Func,
-                               StringRef f64Func, StringRef f32ApproxFunc,
-                               StringRef f16Func) {
-  patterns.add>(converter);
-  patterns.add>(converter, f32Func, f32ApproxFunc,
-                                           f16Func);
-}
-
 void mlir::populateGpuToROCDLConversionPatterns(
     const LLVMTypeConverter &converter, RewritePatternSet &patterns,
     mlir::gpu::amd::Runtime runtime) {

From 018b32ca1fd0214e4a359ed8388a2c859d0fc841 Mon Sep 17 00:00:00 2001
From: Ivan Butygin 
Date: Tue, 31 Dec 2024 16:55:00 +0100
Subject: [PATCH 218/567] Revert "[mlir][nfc] GpuToROCDL: Remove some dead
 code" (#121402)

Reverts llvm/llvm-project#121395
---
 .../Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp   | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index a1cefe289a696..d52a86987b1ce 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -47,6 +47,7 @@
 
 #include "../GPUCommon/GPUOpsLowering.h"
 #include "../GPUCommon/IndexIntrinsicsOpLowering.h"
+#include "../GPUCommon/OpToFuncCallLowering.h"
 
 namespace mlir {
 #define GEN_PASS_DEF_CONVERTGPUOPSTOROCDLOPS
@@ -345,6 +346,16 @@ void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) {
   target.addLegalOp();
 }
 
+template 
+static void populateOpPatterns(const LLVMTypeConverter &converter,
+                               RewritePatternSet &patterns, StringRef f32Func,
+                               StringRef f64Func, StringRef f32ApproxFunc,
+                               StringRef f16Func) {
+  patterns.add>(converter);
+  patterns.add>(converter, f32Func, f32ApproxFunc,
+                                           f16Func);
+}
+
 void mlir::populateGpuToROCDLConversionPatterns(
     const LLVMTypeConverter &converter, RewritePatternSet &patterns,
     mlir::gpu::amd::Runtime runtime) {

From ebdb6cf2acd6008e8741764a9078e2403cbc4fb5 Mon Sep 17 00:00:00 2001
From: Angus Lees 
Date: Wed, 1 Jan 2025 03:52:35 +1100
Subject: [PATCH 219/567] [lit] Cope with more cat variants (#121376)

BusyBox `cat` has yet another slight variation of error formatting:

```console
$ cat --help 2>&1 | head -1
BusyBox v1.37.0 (2024-09-30 10:39:57 UTC) multi-call binary.

$ cat does-not-exist
cat: can't open 'does-not-exist': No such file or directory
```

Rather than extend the test result regex with a third case,
recognise that we only really care about the filename and errno string.
Weaken the regex to ignore all "noise" around the filename.

Note this also corrects what looks like a bug with the previous regex.
Previously, the `cannot open does-not-exist` alternate did not assert
the following errno message.  This was introduced in
https://reviews.llvm.org/D60553 (apparently) due to differences in the
`cat` command on AIX.  That bug doesn't include the specific
AIX output, so it's unclear if this omission was intended.
---
 llvm/utils/lit/tests/shtest-format.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/utils/lit/tests/shtest-format.py b/llvm/utils/lit/tests/shtest-format.py
index 3a1959549e5d0..d58973993b272 100644
--- a/llvm/utils/lit/tests/shtest-format.py
+++ b/llvm/utils/lit/tests/shtest-format.py
@@ -18,7 +18,7 @@
 # CHECK: Command Output (stderr):
 # CHECK-NEXT: --
 # CHECK-NOT: --
-# CHECK: cat{{(_64)?(\.exe)?}}: {{cannot open does-not-exist|does-not-exist: No such file or directory}}
+# CHECK: cat{{(_64)?(\.exe)?}}: {{.*does-not-exist.*}}: No such file or directory
 # CHECK: --
 
 # CHECK: FAIL: shtest-format :: external_shell/fail_with_bad_encoding.txt

From 40e734e041fa4d5369197e88ce6d354238695e0c Mon Sep 17 00:00:00 2001
From: Ellis Hoag 
Date: Tue, 31 Dec 2024 09:07:00 -0800
Subject: [PATCH 220/567] [lld][MachO] Allow separate --irpgo-profile flag
 (#121354)

---
 lld/MachO/Driver.cpp                     | 2 +-
 lld/MachO/Options.td                     | 4 +++-
 lld/test/MachO/bp-section-orderer-errs.s | 3 ++-
 lld/test/MachO/bp-section-orderer.s      | 2 +-
 4 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp
index 7550b0b9fa531..31630ba7d69de 100644
--- a/lld/MachO/Driver.cpp
+++ b/lld/MachO/Driver.cpp
@@ -1842,7 +1842,7 @@ bool link(ArrayRef argsArr, llvm::raw_ostream &stdoutOS,
       args.hasArg(OPT_irpgo_profile_sort_eq))
     warn("--irpgo-profile-sort is deprecated. Please use "
          "--bp-startup-sort=function");
-  if (const Arg *arg = args.getLastArg(OPT_irpgo_profile_eq))
+  if (const Arg *arg = args.getLastArg(OPT_irpgo_profile))
     config->irpgoProfilePath = arg->getValue();
 
   if (const Arg *arg = args.getLastArg(OPT_irpgo_profile_sort)) {
diff --git a/lld/MachO/Options.td b/lld/MachO/Options.td
index 1d7f1d806cc7f..4b1e9e4391070 100644
--- a/lld/MachO/Options.td
+++ b/lld/MachO/Options.td
@@ -126,8 +126,10 @@ def no_call_graph_profile_sort : Flag<["--"], "no-call-graph-profile-sort">,
 def print_symbol_order_eq: Joined<["--"], "print-symbol-order=">,
     HelpText<"Print a symbol order specified by --call-graph-profile-sort into the specified file">,
     Group;
+def irpgo_profile: Separate<["--"], "irpgo-profile">, Group;
 def irpgo_profile_eq: Joined<["--"], "irpgo-profile=">,
-    HelpText<"Read the IRPGO profile for use with -bp-startup-sort and other profile-guided optimizations">,
+    Alias(irpgo_profile)>, MetaVarName<"">,
+    HelpText<"Read the IRPGO  for use with -bp-startup-sort and other profile-guided optimizations">,
     Group;
 def bp_startup_sort: Joined<["--"], "bp-startup-sort=">,
     MetaVarName<"[none,function]">,
diff --git a/lld/test/MachO/bp-section-orderer-errs.s b/lld/test/MachO/bp-section-orderer-errs.s
index 8d19e01c716ea..abeb25122a929 100644
--- a/lld/test/MachO/bp-section-orderer-errs.s
+++ b/lld/test/MachO/bp-section-orderer-errs.s
@@ -14,8 +14,9 @@
 # RUN: not %lld -o /dev/null --bp-compression-sort-startup-functions 2>&1 | FileCheck %s --check-prefix=STARTUP
 # STARTUP: --bp-compression-sort-startup-functions must be used with --bp-startup-sort=function
 
+# RUN: not %lld -o /dev/null --irpgo-profile %s --bp-startup-sort=function --call-graph-profile-sort 2>&1 | FileCheck %s --check-prefix=IRPGO-STARTUP
 # RUN: not %lld -o /dev/null --irpgo-profile=%s --bp-startup-sort=function --call-graph-profile-sort 2>&1 | FileCheck %s --check-prefix=IRPGO-STARTUP
 # IRPGO-STARTUP: --bp-startup-sort= is incompatible with --call-graph-profile-sort
 
 # RUN: not %lld -o /dev/null --bp-startup-sort=function 2>&1 | FileCheck %s --check-prefix=STARTUP-COMPRESSION
-# STARTUP-COMPRESSION: --bp-startup-sort=function must be used with --irpgo-profile
\ No newline at end of file
+# STARTUP-COMPRESSION: --bp-startup-sort=function must be used with --irpgo-profile
diff --git a/lld/test/MachO/bp-section-orderer.s b/lld/test/MachO/bp-section-orderer.s
index e5d0e7137b30d..2eaff04bdc047 100644
--- a/lld/test/MachO/bp-section-orderer.s
+++ b/lld/test/MachO/bp-section-orderer.s
@@ -7,7 +7,7 @@
 # RUN: %no-fatal-warnings-lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --irpgo-profile-sort=%t/a.profdata --verbose-bp-section-orderer 2>&1 | FileCheck %s --check-prefix=STARTUP
 # RUN: %no-fatal-warnings-lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --irpgo-profile-sort=%t/a.profdata --verbose-bp-section-orderer --icf=all --compression-sort=none 2>&1 | FileCheck %s --check-prefix=STARTUP
 
-# RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --irpgo-profile=%t/a.profdata --bp-startup-sort=function --verbose-bp-section-orderer 2>&1 | FileCheck %s --check-prefix=STARTUP
+# RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --irpgo-profile %t/a.profdata --bp-startup-sort=function --verbose-bp-section-orderer 2>&1 | FileCheck %s --check-prefix=STARTUP
 # RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --irpgo-profile=%t/a.profdata --bp-startup-sort=function --verbose-bp-section-orderer --icf=all --bp-compression-sort=none 2>&1 | FileCheck %s --check-prefix=STARTUP
 # STARTUP: Ordered 3 sections using balanced partitioning
 

From 8e8d0c149960c7d37e5c4487f434cdd41b1d3866 Mon Sep 17 00:00:00 2001
From: B I Mohammed Abbas 
Date: Tue, 31 Dec 2024 22:57:15 +0530
Subject: [PATCH 221/567] Fix integer suffix in truncxfhf2_test (#121388)

Fixes error introduced by #120372.
---
 compiler-rt/test/builtins/Unit/truncxfhf2_test.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/compiler-rt/test/builtins/Unit/truncxfhf2_test.c b/compiler-rt/test/builtins/Unit/truncxfhf2_test.c
index 9038a91a5b4c1..b5e2a91f0b930 100644
--- a/compiler-rt/test/builtins/Unit/truncxfhf2_test.c
+++ b/compiler-rt/test/builtins/Unit/truncxfhf2_test.c
@@ -48,17 +48,17 @@ int main() {
 
   // Positive infinity
   if (test_truncxfhf2(UINT16_C(0x7fff), UINT64_C(0x8000000000000000),
-                      UINT16_C(0x7c00U)))
+                      UINT16_C(0x7c00)))
     return 1;
 
   // Negative infinity
   if (test_truncxfhf2(UINT16_C(0xffff), UINT64_C(0x8000000000000000),
-                      UINT16_C(0xfc00U)))
+                      UINT16_C(0xfc00)))
     return 1;
 
   // NaN
   if (test_truncxfhf2(UINT16_C(0x7fff), UINT64_C(0xc000000000000000),
-                      UINT16_C(0x7e00U)))
+                      UINT16_C(0x7e00)))
     return 1;
 
   return 0;

From a5f3058caae37471f7d2b55a4f621b104c52658f Mon Sep 17 00:00:00 2001
From: Spencer Abson 
Date: Tue, 31 Dec 2024 17:26:31 +0000
Subject: [PATCH 222/567] [AArch64][NFC] Remove redundant comments in
 SMEInstrInfo

---
 .../lib/Target/AArch64/AArch64SMEInstrInfo.td | 32 ++++++++-----------
 1 file changed, 13 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 8b8d73d78a1ea..aee54ed47a3ab 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -979,8 +979,7 @@ defm FSCALE_2ZZ   : sme2_fp_sve_destructive_vector_vg2_single<"fscale", 0b001100
 defm FSCALE_4ZZ   : sme2_fp_sve_destructive_vector_vg4_single<"fscale", 0b0011000>;
 defm FSCALE_2Z2Z  : sme2_fp_sve_destructive_vector_vg2_multi<"fscale",  0b0011000>;
 defm FSCALE_4Z4Z  : sme2_fp_sve_destructive_vector_vg4_multi<"fscale",  0b0011000>;
-
-} // [HasSME2, HasFP8]
+}
 
 let Predicates = [HasSME2, HasFAMINMAX] in {
 defm FAMAX_2Z2Z : sme2_fp_sve_destructive_vector_vg2_multi<"famax", 0b0010100>;
@@ -988,17 +987,16 @@ defm FAMIN_2Z2Z : sme2_fp_sve_destructive_vector_vg2_multi<"famin", 0b0010101>;
 
 defm FAMAX_4Z4Z : sme2_fp_sve_destructive_vector_vg4_multi<"famax", 0b0010100>;
 defm FAMIN_4Z4Z : sme2_fp_sve_destructive_vector_vg4_multi<"famin", 0b0010101>;
-} //[HasSME2, HasFAMINMAX]
-
+}
 
 let Predicates = [HasSME_LUTv2] in {
 defm MOVT_TIZ : sme2_movt_zt_to_zt<"movt",  0b0011111, int_aarch64_sme_write_lane_zt, int_aarch64_sme_write_zt>;
 def LUTI4_4ZZT2Z    : sme2_luti4_vector_vg4<0b00, 0b00,"luti4">;
-} //[HasSME_LUTv2]
+}
 
 let Predicates = [HasSME2p1, HasSME_LUTv2] in {
 def LUTI4_S_4ZZT2Z  : sme2_luti4_vector_vg4_strided<0b00, 0b00, "luti4">;
-} //[HasSME2p1, HasSME_LUTv2]
+}
 
 let Predicates = [HasSMEF8F16] in {
 defm FVDOT_VG2_M2ZZI_BtoH : sme2_fp8_fdot_index_za16_vg1x2<"fvdot", 0b110, int_aarch64_sme_fp8_fvdot_lane_za16_vg1x2>;
@@ -1014,17 +1012,15 @@ defm FMLAL_MZZI_BtoH      : sme2_fp8_fmlal_index_za16<"fmlal",      int_aarch64_
 defm FMLAL_VG2_M2ZZI_BtoH : sme2_fp8_fmlal_index_za16_vgx2<"fmlal", int_aarch64_sme_fp8_fmlal_lane_za16_vg2x2>;
 defm FMLAL_VG4_M4ZZI_BtoH : sme2_fp8_fmlal_index_za16_vgx4<"fmlal", int_aarch64_sme_fp8_fmlal_lane_za16_vg2x4>;
 
-// FP8 FMLAL (single)
 defm FMLAL_VG2_MZZ_BtoH  : sme2_fp8_fmlal_single_za16<"fmlal", int_aarch64_sme_fp8_fmlal_single_za16_vg2x1>;
-defm FMLAL_VG2_M2ZZ_BtoH : sme2_fp_mla_long_array_vg2_single<"fmlal",  0b001, MatrixOp16, ZZ_b, ZPR4b8, nxv16i8,  int_aarch64_sme_fp8_fmlal_single_za16_vg2x2, [FPMR, FPCR]>;
+defm FMLAL_VG2_M2ZZ_BtoH : sme2_fp_mla_long_array_vg2_single<"fmlal", 0b001, MatrixOp16, ZZ_b, ZPR4b8, nxv16i8,   int_aarch64_sme_fp8_fmlal_single_za16_vg2x2, [FPMR, FPCR]>;
 defm FMLAL_VG4_M4ZZ_BtoH : sme2_fp_mla_long_array_vg4_single<"fmlal", 0b001, MatrixOp16, ZZZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_fp8_fmlal_single_za16_vg2x4, [FPMR, FPCR]>;
 
-// FP8 FMLALL (multi)
 defm FMLAL_VG2_M2Z2Z_BtoH : sme2_fp_mla_long_array_vg2_multi<"fmlal", 0b100, MatrixOp16, ZZ_b_mul_r, nxv16i8,   int_aarch64_sme_fp8_fmlal_multi_za16_vg2x2, [FPMR, FPCR]>;
 defm FMLAL_VG4_M4Z4Z_BtoH : sme2_fp_mla_long_array_vg4_multi<"fmlal", 0b100, MatrixOp16, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_fp8_fmlal_multi_za16_vg2x4, [FPMR, FPCR]>;
 
 defm FMOPA_MPPZZ_BtoH : sme2_fp8_fmopa_za16<"fmopa", int_aarch64_sme_fp8_fmopa_za16>;
-} //[HasSMEF8F16]
+}
 
 let Predicates = [HasSMEF8F32] in {
 defm FDOT_VG2_M2ZZI_BtoS : sme2_fp8_fdot_index_za32_vg1x2<"fdot", int_aarch64_sme_fp8_fdot_lane_za32_vg1x2>;
@@ -1042,17 +1038,15 @@ defm FMLALL_MZZI_BtoS      : sme2_mla_ll_array_index_32b<"fmlall",     0b01, 0b0
 defm FMLALL_VG2_M2ZZI_BtoS : sme2_mla_ll_array_vg2_index_32b<"fmlall", 0b10, 0b100,  int_aarch64_sme_fp8_fmlall_lane_za32_vg4x2, [FPMR, FPCR]>;
 defm FMLALL_VG4_M4ZZI_BtoS : sme2_mla_ll_array_vg4_index_32b<"fmlall", 0b00, 0b1000, int_aarch64_sme_fp8_fmlall_lane_za32_vg4x4, [FPMR, FPCR]>;
 
-// FP8 FMLALL (single)
 defm FMLALL_MZZ_BtoS       : sme2_mla_ll_array_single<"fmlall", 0b01000, MatrixOp32, ZPR8, ZPR4b8, nxv16i8, int_aarch64_sme_fp8_fmlall_single_za32_vg4x1, [FPMR, FPCR]>;
 defm FMLALL_VG2_M2ZZ_BtoS  : sme2_mla_ll_array_vg2_single<"fmlall", 0b000001, MatrixOp32, ZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_fp8_fmlall_single_za32_vg4x2, [FPMR, FPCR]>;
 defm FMLALL_VG4_M4ZZ_BtoS  : sme2_mla_ll_array_vg4_single<"fmlall", 0b010001, MatrixOp32, ZZZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_fp8_fmlall_single_za32_vg4x4, [FPMR, FPCR]>;
 
-// FP8 FMLALL (multi)
 defm FMLALL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"fmlall", 0b01000, MatrixOp32, ZZ_b_mul_r, nxv16i8,   int_aarch64_sme_fp8_fmlall_multi_za32_vg4x2, [FPMR, FPCR]>;
 defm FMLALL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"fmlall", 0b01000, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_fp8_fmlall_multi_za32_vg4x4, [FPMR, FPCR]>;
 
 defm FMOPA_MPPZZ_BtoS : sme2_fp8_fmopa_za32<"fmopa", int_aarch64_sme_fp8_fmopa_za32>;
-} //[HasSMEF8F32]
+}
 
 let Predicates = [HasSME2, HasSVEBFSCALE] in {
   defm BFSCALE : sme2_bfscale_single<"bfscale">;
@@ -1077,31 +1071,31 @@ let Predicates = [HasSME2p2] in {
 
   defm FMOP4A : sme2_fmop4as_fp16_fp32_widening<0, "fmop4a">;
   defm FMOP4S : sme2_fmop4as_fp16_fp32_widening<1, "fmop4s">;
-} // [HasSME2p2]
+}
 
 let Predicates = [HasSME2p2, HasSMEB16B16] in {
   def BFTMOPA_M2ZZZI_HtoH : sme_tmopa_16b<0b11001, ZZ_h_mul_r, ZPR16, "bftmopa">;
-} // [HasSME2p2, HasSMEB16B16]
+}
 
 let Predicates = [HasSME2p2, HasSMEF8F32], Uses = [FPMR, FPCR] in {
   def FTMOPA_M2ZZZI_BtoS : sme_tmopa_32b<0b01000, ZZ_b_mul_r, ZPR8, "ftmopa">;
-} // [HasSME2p2, HasSMEF8F32], Uses = [FPMR, FPCR]
+}
 
 let Predicates = [HasSME2p2, HasSMEF8F16], Uses = [FPMR, FPCR] in {
   def FTMOPA_M2ZZZI_BtoH : sme_tmopa_16b<0b01001, ZZ_b_mul_r, ZPR8, "ftmopa">;
   defm FMOP4A : sme2_fmop4a_fp8_fp16_2way<"fmop4a">;
-} // [HasSME2p2, HasSMEF8F16],  Uses = [FPMR, FPCR]
+}
 
 let Predicates = [HasSME2p2, HasSMEF16F16] in {
   def FTMOPA_M2ZZZI_HtoH : sme_tmopa_16b<0b10001, ZZ_h_mul_r, ZPR16, "ftmopa">;
   defm FMOP4A : sme2_fmop4as_fp16_non_widening<0, "fmop4a">;
   defm FMOP4S : sme2_fmop4as_fp16_non_widening<1, "fmop4s">;
-} // [HasSME2p2, HasSMEF16F16]
+}
 
 let Predicates = [HasSME2, HasSVEBFSCALE] in {
   defm BFMUL : sme2_bfmul_single<"bfmul">;
   defm BFMUL : sme2_bfmul_multi<"bfmul">;
-} //[HasSME2, HasSVEBFSCALE]
+}
 
 let Uses = [FPMR, FPCR] in {
 let Predicates = [HasSME2p2, HasSMEF8F32] in {

From 0e23cb0cc5b087f7039e35595a16ae549a42aada Mon Sep 17 00:00:00 2001
From: Ivan Butygin 
Date: Tue, 31 Dec 2024 18:39:31 +0100
Subject: [PATCH 223/567] [mlir][nfc] GpuToROCDL: Remove some dead code
 (#121403)

---
 .../Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp   | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index d52a86987b1ce..a1cefe289a696 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -47,7 +47,6 @@
 
 #include "../GPUCommon/GPUOpsLowering.h"
 #include "../GPUCommon/IndexIntrinsicsOpLowering.h"
-#include "../GPUCommon/OpToFuncCallLowering.h"
 
 namespace mlir {
 #define GEN_PASS_DEF_CONVERTGPUOPSTOROCDLOPS
@@ -346,16 +345,6 @@ void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) {
   target.addLegalOp();
 }
 
-template 
-static void populateOpPatterns(const LLVMTypeConverter &converter,
-                               RewritePatternSet &patterns, StringRef f32Func,
-                               StringRef f64Func, StringRef f32ApproxFunc,
-                               StringRef f16Func) {
-  patterns.add>(converter);
-  patterns.add>(converter, f32Func, f32ApproxFunc,
-                                           f16Func);
-}
-
 void mlir::populateGpuToROCDLConversionPatterns(
     const LLVMTypeConverter &converter, RewritePatternSet &patterns,
     mlir::gpu::amd::Runtime runtime) {

From 2cee9034adb389a20937f0a77d51675ff5a105f8 Mon Sep 17 00:00:00 2001
From: David Green 
Date: Tue, 31 Dec 2024 18:07:10 +0000
Subject: [PATCH 224/567] [AArch64] Add some tests for csel/subs with swapped
 conditions. NFC

---
 .../test/CodeGen/AArch64/csel-subs-swapped.ll | 322 ++++++++++++++++++
 1 file changed, 322 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/csel-subs-swapped.ll

diff --git a/llvm/test/CodeGen/AArch64/csel-subs-swapped.ll b/llvm/test/CodeGen/AArch64/csel-subs-swapped.ll
new file mode 100644
index 0000000000000..7c628cf1683d6
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/csel-subs-swapped.ll
@@ -0,0 +1,322 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+
+define i32 @eq_i32(i32 %x) {
+; CHECK-LABEL: eq_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-2097152 // =0xffe00000
+; CHECK-NEXT:    cmn w0, #512, lsl #12 // =2097152
+; CHECK-NEXT:    sub w8, w8, w0
+; CHECK-NEXT:    csel w0, w0, w8, eq
+; CHECK-NEXT:    ret
+  %cmp = icmp eq i32 %x, -2097152
+  %sub = sub i32 -2097152, %x
+  %retval.0 = select i1 %cmp, i32 %x, i32 %sub
+  ret i32 %retval.0
+}
+
+define i32 @ne_i32(i32 %x) {
+; CHECK-LABEL: ne_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-2097152 // =0xffe00000
+; CHECK-NEXT:    cmn w0, #512, lsl #12 // =2097152
+; CHECK-NEXT:    sub w8, w8, w0
+; CHECK-NEXT:    csel w0, w0, w8, ne
+; CHECK-NEXT:    ret
+  %cmp = icmp ne i32 %x, -2097152
+  %sub = sub i32 -2097152, %x
+  %retval.0 = select i1 %cmp, i32 %x, i32 %sub
+  ret i32 %retval.0
+}
+
+define i32 @sgt_i32(i32 %x) {
+; CHECK-LABEL: sgt_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-2097152 // =0xffe00000
+; CHECK-NEXT:    cmn w0, #512, lsl #12 // =2097152
+; CHECK-NEXT:    sub w8, w8, w0
+; CHECK-NEXT:    csel w0, w0, w8, gt
+; CHECK-NEXT:    ret
+  %cmp = icmp sgt i32 %x, -2097152
+  %sub = sub i32 -2097152, %x
+  %retval.0 = select i1 %cmp, i32 %x, i32 %sub
+  ret i32 %retval.0
+}
+
+define i32 @sge_i32(i32 %x) {
+; CHECK-LABEL: sge_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-2097152 // =0xffe00000
+; CHECK-NEXT:    mov w9, #-2097153 // =0xffdfffff
+; CHECK-NEXT:    sub w8, w8, w0
+; CHECK-NEXT:    cmp w0, w9
+; CHECK-NEXT:    csel w0, w0, w8, gt
+; CHECK-NEXT:    ret
+  %cmp = icmp sge i32 %x, -2097152
+  %sub = sub i32 -2097152, %x
+  %retval.0 = select i1 %cmp, i32 %x, i32 %sub
+  ret i32 %retval.0
+}
+
+define i32 @slt_i32(i32 %x) {
+; CHECK-LABEL: slt_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-2097152 // =0xffe00000
+; CHECK-NEXT:    cmn w0, #512, lsl #12 // =2097152
+; CHECK-NEXT:    sub w8, w8, w0
+; CHECK-NEXT:    csel w0, w0, w8, lt
+; CHECK-NEXT:    ret
+  %cmp = icmp slt i32 %x, -2097152
+  %sub = sub i32 -2097152, %x
+  %retval.0 = select i1 %cmp, i32 %x, i32 %sub
+  ret i32 %retval.0
+}
+
+define i32 @sle_i32(i32 %x) {
+; CHECK-LABEL: sle_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-2097152 // =0xffe00000
+; CHECK-NEXT:    mov w9, #-2097151 // =0xffe00001
+; CHECK-NEXT:    sub w8, w8, w0
+; CHECK-NEXT:    cmp w0, w9
+; CHECK-NEXT:    csel w0, w0, w8, lt
+; CHECK-NEXT:    ret
+  %cmp = icmp sle i32 %x, -2097152
+  %sub = sub i32 -2097152, %x
+  %retval.0 = select i1 %cmp, i32 %x, i32 %sub
+  ret i32 %retval.0
+}
+
+define i32 @ugt_i32(i32 %x) {
+; CHECK-LABEL: ugt_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-2097152 // =0xffe00000
+; CHECK-NEXT:    cmn w0, #512, lsl #12 // =2097152
+; CHECK-NEXT:    sub w8, w8, w0
+; CHECK-NEXT:    csel w0, w0, w8, hi
+; CHECK-NEXT:    ret
+  %cmp = icmp ugt i32 %x, -2097152
+  %sub = sub i32 -2097152, %x
+  %retval.0 = select i1 %cmp, i32 %x, i32 %sub
+  ret i32 %retval.0
+}
+
+define i32 @uge_i32(i32 %x) {
+; CHECK-LABEL: uge_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsr w9, w0, #21
+; CHECK-NEXT:    mov w8, #-2097152 // =0xffe00000
+; CHECK-NEXT:    sub w8, w8, w0
+; CHECK-NEXT:    cmp w9, #2046
+; CHECK-NEXT:    csel w0, w0, w8, hi
+; CHECK-NEXT:    ret
+  %cmp = icmp uge i32 %x, -2097152
+  %sub = sub i32 -2097152, %x
+  %retval.0 = select i1 %cmp, i32 %x, i32 %sub
+  ret i32 %retval.0
+}
+
+define i32 @ult_i32(i32 %x) {
+; CHECK-LABEL: ult_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-2097152 // =0xffe00000
+; CHECK-NEXT:    cmn w0, #512, lsl #12 // =2097152
+; CHECK-NEXT:    sub w8, w8, w0
+; CHECK-NEXT:    csel w0, w0, w8, lo
+; CHECK-NEXT:    ret
+  %cmp = icmp ult i32 %x, -2097152
+  %sub = sub i32 -2097152, %x
+  %retval.0 = select i1 %cmp, i32 %x, i32 %sub
+  ret i32 %retval.0
+}
+
+define i32 @ule_i32(i32 %x) {
+; CHECK-LABEL: ule_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-2097152 // =0xffe00000
+; CHECK-NEXT:    mov w9, #-2097151 // =0xffe00001
+; CHECK-NEXT:    sub w8, w8, w0
+; CHECK-NEXT:    cmp w0, w9
+; CHECK-NEXT:    csel w0, w0, w8, lo
+; CHECK-NEXT:    ret
+  %cmp = icmp ule i32 %x, -2097152
+  %sub = sub i32 -2097152, %x
+  %retval.0 = select i1 %cmp, i32 %x, i32 %sub
+  ret i32 %retval.0
+}
+
+
+define i64 @eq_i64(i64 %x) {
+; CHECK-LABEL: eq_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #100 // =0x64
+; CHECK-NEXT:    cmp x0, #100
+; CHECK-NEXT:    sub x8, x8, x0
+; CHECK-NEXT:    csel x0, x0, x8, eq
+; CHECK-NEXT:    ret
+  %cmp = icmp eq i64 %x, 100
+  %sub = sub i64 100, %x
+  %retval.0 = select i1 %cmp, i64 %x, i64 %sub
+  ret i64 %retval.0
+}
+
+define i64 @ne_i64(i64 %x) {
+; CHECK-LABEL: ne_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #100 // =0x64
+; CHECK-NEXT:    cmp x0, #100
+; CHECK-NEXT:    sub x8, x8, x0
+; CHECK-NEXT:    csel x0, x0, x8, ne
+; CHECK-NEXT:    ret
+  %cmp = icmp ne i64 %x, 100
+  %sub = sub i64 100, %x
+  %retval.0 = select i1 %cmp, i64 %x, i64 %sub
+  ret i64 %retval.0
+}
+
+define i64 @sgt_i64(i64 %x) {
+; CHECK-LABEL: sgt_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #100 // =0x64
+; CHECK-NEXT:    cmp x0, #100
+; CHECK-NEXT:    sub x8, x8, x0
+; CHECK-NEXT:    csel x0, x0, x8, gt
+; CHECK-NEXT:    ret
+  %cmp = icmp sgt i64 %x, 100
+  %sub = sub i64 100, %x
+  %retval.0 = select i1 %cmp, i64 %x, i64 %sub
+  ret i64 %retval.0
+}
+
+define i64 @sge_i64(i64 %x) {
+; CHECK-LABEL: sge_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #100 // =0x64
+; CHECK-NEXT:    cmp x0, #99
+; CHECK-NEXT:    sub x8, x8, x0
+; CHECK-NEXT:    csel x0, x0, x8, gt
+; CHECK-NEXT:    ret
+  %cmp = icmp sge i64 %x, 100
+  %sub = sub i64 100, %x
+  %retval.0 = select i1 %cmp, i64 %x, i64 %sub
+  ret i64 %retval.0
+}
+
+define i64 @slt_i64(i64 %x) {
+; CHECK-LABEL: slt_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #100 // =0x64
+; CHECK-NEXT:    cmp x0, #100
+; CHECK-NEXT:    sub x8, x8, x0
+; CHECK-NEXT:    csel x0, x0, x8, lt
+; CHECK-NEXT:    ret
+  %cmp = icmp slt i64 %x, 100
+  %sub = sub i64 100, %x
+  %retval.0 = select i1 %cmp, i64 %x, i64 %sub
+  ret i64 %retval.0
+}
+
+define i64 @sle_i64(i64 %x) {
+; CHECK-LABEL: sle_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #100 // =0x64
+; CHECK-NEXT:    cmp x0, #101
+; CHECK-NEXT:    sub x8, x8, x0
+; CHECK-NEXT:    csel x0, x0, x8, lt
+; CHECK-NEXT:    ret
+  %cmp = icmp sle i64 %x, 100
+  %sub = sub i64 100, %x
+  %retval.0 = select i1 %cmp, i64 %x, i64 %sub
+  ret i64 %retval.0
+}
+
+define i64 @ugt_i64(i64 %x) {
+; CHECK-LABEL: ugt_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #100 // =0x64
+; CHECK-NEXT:    cmp x0, #100
+; CHECK-NEXT:    sub x8, x8, x0
+; CHECK-NEXT:    csel x0, x0, x8, hi
+; CHECK-NEXT:    ret
+  %cmp = icmp ugt i64 %x, 100
+  %sub = sub i64 100, %x
+  %retval.0 = select i1 %cmp, i64 %x, i64 %sub
+  ret i64 %retval.0
+}
+
+define i64 @uge_i64(i64 %x) {
+; CHECK-LABEL: uge_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #100 // =0x64
+; CHECK-NEXT:    cmp x0, #99
+; CHECK-NEXT:    sub x8, x8, x0
+; CHECK-NEXT:    csel x0, x0, x8, hi
+; CHECK-NEXT:    ret
+  %cmp = icmp uge i64 %x, 100
+  %sub = sub i64 100, %x
+  %retval.0 = select i1 %cmp, i64 %x, i64 %sub
+  ret i64 %retval.0
+}
+
+define i64 @ult_i64(i64 %x) {
+; CHECK-LABEL: ult_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #100 // =0x64
+; CHECK-NEXT:    cmp x0, #100
+; CHECK-NEXT:    sub x8, x8, x0
+; CHECK-NEXT:    csel x0, x0, x8, lo
+; CHECK-NEXT:    ret
+  %cmp = icmp ult i64 %x, 100
+  %sub = sub i64 100, %x
+  %retval.0 = select i1 %cmp, i64 %x, i64 %sub
+  ret i64 %retval.0
+}
+
+define i64 @ule_i64(i64 %x) {
+; CHECK-LABEL: ule_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #100 // =0x64
+; CHECK-NEXT:    cmp x0, #101
+; CHECK-NEXT:    sub x8, x8, x0
+; CHECK-NEXT:    csel x0, x0, x8, lo
+; CHECK-NEXT:    ret
+  %cmp = icmp ule i64 %x, 100
+  %sub = sub i64 100, %x
+  %retval.0 = select i1 %cmp, i64 %x, i64 %sub
+  ret i64 %retval.0
+}
+
+
+define i64 @both(i64 %x) {
+; CHECK-LABEL: both:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #100 // =0x64
+; CHECK-NEXT:    sub x9, x0, #100
+; CHECK-NEXT:    cmp x0, #101
+; CHECK-NEXT:    sub x8, x8, x0
+; CHECK-NEXT:    csel x0, x8, x9, lo
+; CHECK-NEXT:    ret
+  %cmp = icmp ule i64 %x, 100
+  %sub1 = sub i64 100, %x
+  %sub2 = sub i64 %x, 100
+  %retval.0 = select i1 %cmp, i64 %sub1, i64 %sub2
+  ret i64 %retval.0
+}
+
+define i32 @qabs(i32 %0) {
+; CHECK-LABEL: qabs:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-2147483648 // =0x80000000
+; CHECK-NEXT:    cmp w0, w8
+; CHECK-NEXT:    mov w8, #2147483647 // =0x7fffffff
+; CHECK-NEXT:    csneg w8, w8, w0, eq
+; CHECK-NEXT:    cmp w0, #0
+; CHECK-NEXT:    csel w0, w0, w8, gt
+; CHECK-NEXT:    ret
+  %cmp1 = icmp sgt i32 %0, 0
+  %cmp2 = icmp eq i32 %0, -2147483648
+  %sub = sub nsw i32 0, %0
+  %cond = select i1 %cmp2, i32 2147483647, i32 %sub
+  %cond6 = select i1 %cmp1, i32 %0, i32 %cond
+  ret i32 %cond6
+}

From 5056a4b556077da79afe34f54b5447c19a77d97d Mon Sep 17 00:00:00 2001
From: Jason Molenda 
Date: Tue, 31 Dec 2024 10:48:26 -0800
Subject: [PATCH 225/567] [lldb] Update two API tests to fix x86 Darwin
 failures (#121380)

The Intel Darwin CI bots had their Xcode updated, which brought in a
debugserver with Brendan Shanks' change from September
7281e0cb3bbcce396aab8b3ea0967d7a17cd287a
https://github.com/llvm/llvm-project/pull/108663 where four general
purpose registers are sent by debugserver when in certain process
states. But most processes (nearly all in the testsuite) do not have
these registers available, so we will get register read failures when
requesting those four. These two tests would flag those as errors. There
would have been an additional problem with the g/G packet (which lldb
doesn't use w/ debugserver, but the testsuite tests) if placeholder
values were not included in the full register context bytes; I fixed
that issue with the SME patch to debugserver recently already.
---
 .../test/tools/lldb-server/gdbremote_testcase.py     | 12 +++++++++++-
 .../register/register_command/TestRegisters.py       |  7 +++++++
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-server/gdbremote_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-server/gdbremote_testcase.py
index d6cb68f55bf29..cbe430c92fa7f 100644
--- a/lldb/packages/Python/lldbsuite/test/tools/lldb-server/gdbremote_testcase.py
+++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-server/gdbremote_testcase.py
@@ -1410,7 +1410,17 @@ def read_register_values(self, reg_infos, endian, thread_id=None):
             p_response = context.get("p_response")
             self.assertIsNotNone(p_response)
             self.assertTrue(len(p_response) > 0)
-            self.assertFalse(p_response[0] == "E")
+
+            # on x86 Darwin, 4 GPR registers are often
+            # unavailable, this is expected and correct.
+            if (
+                self.getArchitecture() == "x86_64"
+                and self.platformIsDarwin()
+                and p_response[0] == "E"
+            ):
+                values[reg_index] = 0
+            else:
+                self.assertFalse(p_response[0] == "E")
 
             values[reg_index] = unpack_register_hex_unsigned(endian, p_response)
 
diff --git a/lldb/test/API/commands/register/register/register_command/TestRegisters.py b/lldb/test/API/commands/register/register/register_command/TestRegisters.py
index 0b80a09534371..99290e02cd2b0 100644
--- a/lldb/test/API/commands/register/register/register_command/TestRegisters.py
+++ b/lldb/test/API/commands/register/register/register_command/TestRegisters.py
@@ -58,6 +58,13 @@ def test_register_commands(self):
             # could not be read.  This is expected.
             error_str_matched = True
 
+        if self.getArchitecture() == "x86_64" and self.platformIsDarwin():
+            # debugserver on x86 will provide ds/es/ss/gsbase when the
+            # kernel provides them, but most of the time they will be
+            # unavailable.  So "register read -a" will report that
+            # 4 registers were unavailable, it is expected.
+            error_str_matched = True
+
         self.expect(
             "register read -a",
             MISSING_EXPECTED_REGISTERS,

From 71d6b0b0c1e5e7f34ccb710470cb90a9a51005c8 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Tue, 31 Dec 2024 19:08:05 +0000
Subject: [PATCH 226/567] [AArch64][GlobalISel] Lower shuffle vector with
 scalar destinations. (#121384)

I believe these are usually canonicalized to vector extracts in most
situations, but under -O0 we might trigger failures in the widening code
if we do not handle scalar destinations correctly. The simplest solution
should be to lower the shuffle to an extract.

Fixes #121365.
---
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    |  7 +--
 .../AArch64/GlobalISel/legalize-shuffle-1x.ll | 43 +++++++++++++++++++
 .../GlobalISel/legalize-shuffle-vector.mir    | 29 +++++++++++++
 3 files changed, 76 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-1x.ll

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 4b7d4158faf06..7de066e09ed2f 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -1062,10 +1062,11 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
         return llvm::is_contained(
             {v2s64, v2s32, v4s32, v4s16, v16s8, v8s8, v8s16}, DstTy);
       })
-      // G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors), we
-      // just want those lowered into G_BUILD_VECTOR
+      // G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors) or scalar
+      // destinations, we just want those lowered into G_BUILD_VECTOR or
+      // G_EXTRACT_ELEMENT.
       .lowerIf([=](const LegalityQuery &Query) {
-        return !Query.Types[1].isVector();
+        return !Query.Types[0].isVector() || !Query.Types[1].isVector();
       })
       .moreElementsIf(
           [](const LegalityQuery &Query) {
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-1x.ll b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-1x.ll
new file mode 100644
index 0000000000000..b52957767de4d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-1x.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple aarch64 -O0 -global-isel -o - %s | FileCheck %s
+
+define <1 x i1> @shuffle_extract_4(<8 x i1> %a, <8 x i1> %b) {
+; CHECK-LABEL: shuffle_extract_4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-NEXT:    umov w8, v0.h[4]
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+  %extractvec60 = shufflevector <8 x i1> %a, <8 x i1> %b, <1 x i32> <i32 4>
+  ret <1 x i1> %extractvec60
+}
+
+define <1 x i1> @shuffle_extract_12(<8 x i1> %a, <8 x i1> %b) {
+; CHECK-LABEL: shuffle_extract_12:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll v0.8h, v1.8b, #0
+; CHECK-NEXT:    umov w8, v0.h[4]
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+  %extractvec60 = shufflevector <8 x i1> %a, <8 x i1> %b, <1 x i32> <i32 12>
+  ret <1 x i1> %extractvec60
+}
+
+define <1 x i1> @shuffle_extract_p(<8 x i1> %a, <8 x i1> %b) {
+; CHECK-LABEL: shuffle_extract_p:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // implicit-def: $w8
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+  %extractvec60 = shufflevector <8 x i1> %a, <8 x i1> %b, <1 x i32> <i32 poison>
+  ret <1 x i1> %extractvec60
+}
+
+define <1 x i32> @shufflevector_v1i32(<1 x i32> %a, <1 x i32> %b) {
+; CHECK-LABEL: shufflevector_v1i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov d0, d1
+; CHECK-NEXT:    ret
+    %c = shufflevector <1 x i32> %a, <1 x i32> %b, <1 x i32> <i32 1>
+    ret <1 x i32> %c
+}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir
index 2464026aa125b..af03a21806982 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir
@@ -618,3 +618,32 @@ body:             |
     RET_ReallyLR implicit $q0
 
 ...
+---
+name:            shuffle_v8i1_v1i8
+alignment:       4
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    liveins: $d0, $d1
+    ; CHECK-LABEL: name: shuffle_v8i1_v1i8
+    ; CHECK: liveins: $d0, $d1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s8>) = COPY $d1
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<8 x s16>) = G_ANYEXT [[COPY]](<8 x s8>)
+    ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[ANYEXT]](<8 x s16>), [[C]](s64)
+    ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC]](s16)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C1]]
+    ; CHECK-NEXT: $w0 = COPY [[AND]](s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %2:_(<8 x s8>) = COPY $d0
+    %0:_(<8 x s1>) = G_TRUNC %2:_(<8 x s8>)
+    %3:_(<8 x s8>) = COPY $d1
+    %1:_(<8 x s1>) = G_TRUNC %3:_(<8 x s8>)
+    %4:_(s1) = G_SHUFFLE_VECTOR %0:_(<8 x s1>), %1:_, shufflemask(12)
+    %5:_(s8) = G_ZEXT %4:_(s1)
+    %6:_(s32) = G_ANYEXT %5:_(s8)
+    $w0 = COPY %6:_(s32)
+    RET_ReallyLR implicit $w0
+...

From b06a45c66fee13fa9bcab422534cba86541f3dab Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Tue, 31 Dec 2024 19:34:25 +0000
Subject: [PATCH 227/567] [VPlan] Add all blocks to outer loop if present
 during ::execute (NFCI).

This ensures that all blocks created during VPlan execution are properly
added to an enclosing loop, if present.

Split off from https://github.com/llvm/llvm-project/pull/108378 and also
needed once more of the skeleton blocks are created directly via VPlan.

This also allows removing the custom logic for early-exit loop
vectorization added as part of
https://github.com/llvm/llvm-project/pull/117008.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 20 +++----------------
 llvm/lib/Transforms/Vectorize/VPlan.cpp       | 19 +++++++++---------
 llvm/lib/Transforms/Vectorize/VPlan.h         |  7 ++++---
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  2 +-
 4 files changed, 18 insertions(+), 30 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a80f4b67f96e2..4282f815849a8 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2961,22 +2961,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
   PSE.getSE()->forgetLoop(OrigLoop);
   PSE.getSE()->forgetBlockAndLoopDispositions();
 
-  // When dealing with uncountable early exits we create middle.split blocks
-  // between the vector loop region and the exit block. These blocks need
-  // adding to any outer loop.
-  VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
-  Loop *OuterLoop = OrigLoop->getParentLoop();
-  if (Legal->hasUncountableEarlyExit() && OuterLoop) {
-    VPBasicBlock *MiddleVPBB = State.Plan->getMiddleBlock();
-    VPBlockBase *PredVPBB = MiddleVPBB->getSinglePredecessor();
-    while (PredVPBB && PredVPBB != VectorRegion) {
-      BasicBlock *MiddleSplitBB =
-          State.CFG.VPBB2IRBB[cast<VPBasicBlock>(PredVPBB)];
-      OuterLoop->addBasicBlockToLoop(MiddleSplitBB, *LI);
-      PredVPBB = PredVPBB->getSinglePredecessor();
-    }
-  }
-
   // After vectorization, the exit blocks of the original loop will have
   // additional predecessors. Invalidate SCEVs for the exit phis in case SE
   // looked through single-entry phis.
@@ -3007,6 +2991,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
   for (Instruction *PI : PredicatedInstructions)
     sinkScalarOperands(&*PI);
 
+  VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
   VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
   BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];
 
@@ -7715,7 +7700,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
 
   // Perform the actual loop transformation.
   VPTransformState State(&TTI, BestVF, BestUF, LI, DT, ILV.Builder, &ILV,
-                         &BestVPlan, Legal->getWidestInductionType());
+                         &BestVPlan, OrigLoop->getParentLoop(),
+                         Legal->getWidestInductionType());
 
 #ifdef EXPENSIVE_CHECKS
   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 82a42b29c6a7d..0619f47f77cbe 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -216,9 +216,10 @@ VPTransformState::VPTransformState(const TargetTransformInfo *TTI,
                                    ElementCount VF, unsigned UF, LoopInfo *LI,
                                    DominatorTree *DT, IRBuilderBase &Builder,
                                    InnerLoopVectorizer *ILV, VPlan *Plan,
-                                   Type *CanonicalIVTy)
+                                   Loop *CurrentParentLoop, Type *CanonicalIVTy)
     : TTI(TTI), VF(VF), CFG(DT), LI(LI), Builder(Builder), ILV(ILV), Plan(Plan),
-      LVer(nullptr), TypeAnalysis(CanonicalIVTy) {}
+      CurrentParentLoop(CurrentParentLoop), LVer(nullptr),
+      TypeAnalysis(CanonicalIVTy) {}
 
 Value *VPTransformState::get(VPValue *Def, const VPLane &Lane) {
   if (Def->isLiveIn())
@@ -502,8 +503,8 @@ void VPBasicBlock::execute(VPTransformState *State) {
     UnreachableInst *Terminator = State->Builder.CreateUnreachable();
     // Register NewBB in its loop. In innermost loops its the same for all
     // BB's.
-    if (State->CurrentVectorLoop)
-      State->CurrentVectorLoop->addBasicBlockToLoop(NewBB, *State->LI);
+    if (State->CurrentParentLoop)
+      State->CurrentParentLoop->addBasicBlockToLoop(NewBB, *State->LI);
     State->Builder.SetInsertPoint(Terminator);
 
     State->CFG.PrevBB = NewBB;
@@ -713,17 +714,17 @@ void VPRegionBlock::execute(VPTransformState *State) {
 
   if (!isReplicator()) {
     // Create and register the new vector loop.
-    Loop *PrevLoop = State->CurrentVectorLoop;
-    State->CurrentVectorLoop = State->LI->AllocateLoop();
+    Loop *PrevLoop = State->CurrentParentLoop;
+    State->CurrentParentLoop = State->LI->AllocateLoop();
     BasicBlock *VectorPH = State->CFG.VPBB2IRBB[getPreheaderVPBB()];
     Loop *ParentLoop = State->LI->getLoopFor(VectorPH);
 
     // Insert the new loop into the loop nest and register the new basic blocks
     // before calling any utilities such as SCEV that require valid LoopInfo.
     if (ParentLoop)
-      ParentLoop->addChildLoop(State->CurrentVectorLoop);
+      ParentLoop->addChildLoop(State->CurrentParentLoop);
     else
-      State->LI->addTopLevelLoop(State->CurrentVectorLoop);
+      State->LI->addTopLevelLoop(State->CurrentParentLoop);
 
     // Visit the VPBlocks connected to "this", starting from it.
     for (VPBlockBase *Block : RPOT) {
@@ -731,7 +732,7 @@ void VPRegionBlock::execute(VPTransformState *State) {
       Block->execute(State);
     }
 
-    State->CurrentVectorLoop = PrevLoop;
+    State->CurrentParentLoop = PrevLoop;
     return;
   }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 199e0dd7a6bec..88f3f672d3aa3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -236,7 +236,8 @@ class VPLane {
 struct VPTransformState {
   VPTransformState(const TargetTransformInfo *TTI, ElementCount VF, unsigned UF,
                    LoopInfo *LI, DominatorTree *DT, IRBuilderBase &Builder,
-                   InnerLoopVectorizer *ILV, VPlan *Plan, Type *CanonicalIVTy);
+                   InnerLoopVectorizer *ILV, VPlan *Plan,
+                   Loop *CurrentParentLoop, Type *CanonicalIVTy);
   /// Target Transform Info.
   const TargetTransformInfo *TTI;
 
@@ -373,8 +374,8 @@ struct VPTransformState {
   /// Pointer to the VPlan code is generated for.
   VPlan *Plan;
 
-  /// The loop object for the current parent region, or nullptr.
-  Loop *CurrentVectorLoop = nullptr;
+  /// The parent loop object for the current scope, or nullptr.
+  Loop *CurrentParentLoop = nullptr;
 
   /// LoopVersioning.  It's only set up (non-null) if memchecks were
   /// used.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index edba3de0719eb..77c08839dbfa9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3365,7 +3365,7 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
                           : VectorType::get(StartV->getType(), State.VF);
 
   BasicBlock *HeaderBB = State.CFG.PrevBB;
-  assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
+  assert(State.CurrentParentLoop->getHeader() == HeaderBB &&
          "recipe must be in the vector loop header");
   auto *Phi = PHINode::Create(VecTy, 2, "vec.phi");
   Phi->insertBefore(HeaderBB->getFirstInsertionPt());

From 7d6ec3b9680a53e58235743080bf223067050fbc Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Tue, 31 Dec 2024 20:08:54 +0000
Subject: [PATCH 228/567] [LV] Add more tests for vector loop removal.

Add missing test coverage of loops where the vector loop region can be
removed that include replicate recipes as well as nested loops.

Extra test coverage for https://github.com/llvm/llvm-project/pull/108378.
---
 .../vector-loop-backedge-elimination.ll       | 788 +++++++++++++++++-
 1 file changed, 780 insertions(+), 8 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
index fd75177c0d106..8bcba56e0d43a 100644
--- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
+++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
@@ -1,19 +1,141 @@
-; RUN: opt -passes=loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=CHECK,VF8UF1 %s
-; RUN: opt -passes=loop-vectorize -force-vector-width=8 -force-vector-interleave=2 -S %s | FileCheck --check-prefixes=CHECK,VF8UF2 %s
-; RUN: opt -passes=loop-vectorize -force-vector-width=16 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=CHECK,VF16UF1 %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes='loop-vectorize,verify' -force-vector-width=8 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=CHECK,VF8UF1 %s
+; RUN: opt -passes='loop-vectorize,verify' -force-vector-width=8 -force-vector-interleave=2 -S %s | FileCheck --check-prefixes=CHECK,VF8UF2 %s
+; RUN: opt -passes='loop-vectorize,verify' -force-vector-width=16 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=CHECK,VF16UF1 %s
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 
 ; Check if the vector loop condition can be simplified to true for a given
 ; VF/IC combination.
 define void @test_tc_less_than_16(ptr %A, i64 %N) {
-; CHECK-LABEL: define void @test_tc_less_than_16(
-; VF8UF1:       [[CMP:%.+]] = icmp eq i64 %index.next, %n.vec
-; VF8UF1-NEXT:  br i1 [[CMP]], label %middle.block, label %vector.body
+; VF8UF1-LABEL: define void @test_tc_less_than_16(
+; VF8UF1-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) {
+; VF8UF1-NEXT:  [[ENTRY:.*]]:
+; VF8UF1-NEXT:    [[AND:%.*]] = and i64 [[N]], 15
+; VF8UF1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[AND]], 8
+; VF8UF1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF1:       [[VECTOR_PH]]:
+; VF8UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[AND]], 8
+; VF8UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[AND]], [[N_MOD_VF]]
+; VF8UF1-NEXT:    [[TMP0:%.*]] = sub i64 [[AND]], [[N_VEC]]
+; VF8UF1-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
+; VF8UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF8UF1:       [[VECTOR_BODY]]:
+; VF8UF1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF8UF1-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; VF8UF1-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; VF8UF1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; VF8UF1-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP3]], align 1
+; VF8UF1-NEXT:    [[TMP4:%.*]] = add nsw <8 x i8> [[WIDE_LOAD]], splat (i8 10)
+; VF8UF1-NEXT:    store <8 x i8> [[TMP4]], ptr [[TMP3]], align 1
+; VF8UF1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; VF8UF1-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF8UF1-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF8UF1:       [[MIDDLE_BLOCK]]:
+; VF8UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[AND]], [[N_VEC]]
+; VF8UF1-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF8UF1:       [[SCALAR_PH]]:
+; VF8UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[AND]], %[[ENTRY]] ]
+; VF8UF1-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ]
+; VF8UF1-NEXT:    br label %[[LOOP:.*]]
+; VF8UF1:       [[LOOP]]:
+; VF8UF1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF1-NEXT:    [[P_SRC:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[P_SRC_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF1-NEXT:    [[P_SRC_NEXT]] = getelementptr inbounds i8, ptr [[P_SRC]], i64 1
+; VF8UF1-NEXT:    [[L:%.*]] = load i8, ptr [[P_SRC]], align 1
+; VF8UF1-NEXT:    [[ADD:%.*]] = add nsw i8 [[L]], 10
+; VF8UF1-NEXT:    store i8 [[ADD]], ptr [[P_SRC]], align 1
+; VF8UF1-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; VF8UF1-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; VF8UF1-NEXT:    br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF8UF1:       [[EXIT]]:
+; VF8UF1-NEXT:    ret void
 ;
-; VF8UF2:       br i1 true, label %middle.block, label %vector.body
+; VF8UF2-LABEL: define void @test_tc_less_than_16(
+; VF8UF2-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) {
+; VF8UF2-NEXT:  [[ENTRY:.*]]:
+; VF8UF2-NEXT:    [[AND:%.*]] = and i64 [[N]], 15
+; VF8UF2-NEXT:    br i1 true, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF2:       [[VECTOR_PH]]:
+; VF8UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[AND]], 16
+; VF8UF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[AND]], [[N_MOD_VF]]
+; VF8UF2-NEXT:    [[TMP0:%.*]] = sub i64 [[AND]], [[N_VEC]]
+; VF8UF2-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
+; VF8UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF8UF2:       [[VECTOR_BODY]]:
+; VF8UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF8UF2-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; VF8UF2-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]]
+; VF8UF2-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; VF8UF2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 8
+; VF8UF2-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
+; VF8UF2-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP3]], align 1
+; VF8UF2-NEXT:    [[TMP4:%.*]] = add nsw <8 x i8> [[WIDE_LOAD]], splat (i8 10)
+; VF8UF2-NEXT:    [[TMP5:%.*]] = add nsw <8 x i8> [[WIDE_LOAD1]], splat (i8 10)
+; VF8UF2-NEXT:    store <8 x i8> [[TMP4]], ptr [[TMP2]], align 1
+; VF8UF2-NEXT:    store <8 x i8> [[TMP5]], ptr [[TMP3]], align 1
+; VF8UF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; VF8UF2-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF8UF2:       [[MIDDLE_BLOCK]]:
+; VF8UF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[AND]], [[N_VEC]]
+; VF8UF2-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF8UF2:       [[SCALAR_PH]]:
+; VF8UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[AND]], %[[ENTRY]] ]
+; VF8UF2-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ]
+; VF8UF2-NEXT:    br label %[[LOOP:.*]]
+; VF8UF2:       [[LOOP]]:
+; VF8UF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF2-NEXT:    [[P_SRC:%.*]] = phi ptr [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[P_SRC_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF2-NEXT:    [[P_SRC_NEXT]] = getelementptr inbounds i8, ptr [[P_SRC]], i64 1
+; VF8UF2-NEXT:    [[L:%.*]] = load i8, ptr [[P_SRC]], align 1
+; VF8UF2-NEXT:    [[ADD:%.*]] = add nsw i8 [[L]], 10
+; VF8UF2-NEXT:    store i8 [[ADD]], ptr [[P_SRC]], align 1
+; VF8UF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; VF8UF2-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; VF8UF2-NEXT:    br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF8UF2:       [[EXIT]]:
+; VF8UF2-NEXT:    ret void
 ;
-; VF16UF1:      br i1 true, label %middle.block, label %vector.body
+; VF16UF1-LABEL: define void @test_tc_less_than_16(
+; VF16UF1-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) {
+; VF16UF1-NEXT:  [[ENTRY:.*]]:
+; VF16UF1-NEXT:    [[AND:%.*]] = and i64 [[N]], 15
+; VF16UF1-NEXT:    br i1 true, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF16UF1:       [[VECTOR_PH]]:
+; VF16UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[AND]], 16
+; VF16UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[AND]], [[N_MOD_VF]]
+; VF16UF1-NEXT:    [[TMP0:%.*]] = sub i64 [[AND]], [[N_VEC]]
+; VF16UF1-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
+; VF16UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF16UF1:       [[VECTOR_BODY]]:
+; VF16UF1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF16UF1-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; VF16UF1-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP4]]
+; VF16UF1-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; VF16UF1-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; VF16UF1-NEXT:    [[TMP3:%.*]] = add nsw <16 x i8> [[WIDE_LOAD]], splat (i8 10)
+; VF16UF1-NEXT:    store <16 x i8> [[TMP3]], ptr [[TMP2]], align 1
+; VF16UF1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; VF16UF1-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF16UF1:       [[MIDDLE_BLOCK]]:
+; VF16UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[AND]], [[N_VEC]]
+; VF16UF1-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF16UF1:       [[SCALAR_PH]]:
+; VF16UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[AND]], %[[ENTRY]] ]
+; VF16UF1-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ]
+; VF16UF1-NEXT:    br label %[[LOOP:.*]]
+; VF16UF1:       [[LOOP]]:
+; VF16UF1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF16UF1-NEXT:    [[P_SRC:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[P_SRC_NEXT:%.*]], %[[LOOP]] ]
+; VF16UF1-NEXT:    [[P_SRC_NEXT]] = getelementptr inbounds i8, ptr [[P_SRC]], i64 1
+; VF16UF1-NEXT:    [[L:%.*]] = load i8, ptr [[P_SRC]], align 1
+; VF16UF1-NEXT:    [[ADD:%.*]] = add nsw i8 [[L]], 10
+; VF16UF1-NEXT:    store i8 [[ADD]], ptr [[P_SRC]], align 1
+; VF16UF1-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; VF16UF1-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; VF16UF1-NEXT:    br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF16UF1:       [[EXIT]]:
+; VF16UF1-NEXT:    ret void
 ;
 entry:
   %and = and i64 %N, 15
@@ -33,3 +155,653 @@ loop:
 exit:
   ret void
 }
+
+define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5, 10) %N) {
+; VF8UF1-LABEL: define void @remove_loop_region_with_replicate_recipe(
+; VF8UF1-SAME: ptr [[DST:%.*]], i64 range(i64 5, 10) [[N:%.*]]) {
+; VF8UF1-NEXT:  [[ENTRY:.*]]:
+; VF8UF1-NEXT:    [[TMP0:%.*]] = add nsw i64 [[N]], -2
+; VF8UF1-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF1:       [[VECTOR_PH]]:
+; VF8UF1-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], 7
+; VF8UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8
+; VF8UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; VF8UF1-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1
+; VF8UF1-NEXT:    [[TMP1:%.*]] = add i64 2, [[N_VEC]]
+; VF8UF1-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; VF8UF1-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; VF8UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF8UF1:       [[VECTOR_BODY]]:
+; VF8UF1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE16:.*]] ]
+; VF8UF1-NEXT:    [[OFFSET_IDX:%.*]] = add i64 2, [[INDEX]]
+; VF8UF1-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[INDEX]], i64 0
+; VF8UF1-NEXT:    [[BROADCAST_SPLAT1:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer
+; VF8UF1-NEXT:    [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT1]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; VF8UF1-NEXT:    [[TMP2:%.*]] = icmp ule <8 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
+; VF8UF1-NEXT:    [[TMP3:%.*]] = extractelement <8 x i1> [[TMP2]], i32 0
+; VF8UF1-NEXT:    br i1 [[TMP3]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF8UF1:       [[PRED_STORE_IF]]:
+; VF8UF1-NEXT:    [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 0
+; VF8UF1-NEXT:    [[TMP4:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP20]]
+; VF8UF1-NEXT:    store i16 0, ptr [[TMP4]], align 2
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE]]
+; VF8UF1:       [[PRED_STORE_CONTINUE]]:
+; VF8UF1-NEXT:    [[TMP5:%.*]] = extractelement <8 x i1> [[TMP2]], i32 1
+; VF8UF1-NEXT:    br i1 [[TMP5]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; VF8UF1:       [[PRED_STORE_IF3]]:
+; VF8UF1-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 1
+; VF8UF1-NEXT:    [[TMP6:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP21]]
+; VF8UF1-NEXT:    store i16 0, ptr [[TMP6]], align 2
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
+; VF8UF1:       [[PRED_STORE_CONTINUE4]]:
+; VF8UF1-NEXT:    [[TMP7:%.*]] = extractelement <8 x i1> [[TMP2]], i32 2
+; VF8UF1-NEXT:    br i1 [[TMP7]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; VF8UF1:       [[PRED_STORE_IF5]]:
+; VF8UF1-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 2
+; VF8UF1-NEXT:    [[TMP8:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP23]]
+; VF8UF1-NEXT:    store i16 0, ptr [[TMP8]], align 2
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
+; VF8UF1:       [[PRED_STORE_CONTINUE6]]:
+; VF8UF1-NEXT:    [[TMP9:%.*]] = extractelement <8 x i1> [[TMP2]], i32 3
+; VF8UF1-NEXT:    br i1 [[TMP9]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; VF8UF1:       [[PRED_STORE_IF7]]:
+; VF8UF1-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 3
+; VF8UF1-NEXT:    [[TMP10:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP24]]
+; VF8UF1-NEXT:    store i16 0, ptr [[TMP10]], align 2
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE8]]
+; VF8UF1:       [[PRED_STORE_CONTINUE8]]:
+; VF8UF1-NEXT:    [[TMP11:%.*]] = extractelement <8 x i1> [[TMP2]], i32 4
+; VF8UF1-NEXT:    br i1 [[TMP11]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; VF8UF1:       [[PRED_STORE_IF9]]:
+; VF8UF1-NEXT:    [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], 4
+; VF8UF1-NEXT:    [[TMP12:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP26]]
+; VF8UF1-NEXT:    store i16 0, ptr [[TMP12]], align 2
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE10]]
+; VF8UF1:       [[PRED_STORE_CONTINUE10]]:
+; VF8UF1-NEXT:    [[TMP13:%.*]] = extractelement <8 x i1> [[TMP2]], i32 5
+; VF8UF1-NEXT:    br i1 [[TMP13]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
+; VF8UF1:       [[PRED_STORE_IF11]]:
+; VF8UF1-NEXT:    [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 5
+; VF8UF1-NEXT:    [[TMP14:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP19]]
+; VF8UF1-NEXT:    store i16 0, ptr [[TMP14]], align 2
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE12]]
+; VF8UF1:       [[PRED_STORE_CONTINUE12]]:
+; VF8UF1-NEXT:    [[TMP15:%.*]] = extractelement <8 x i1> [[TMP2]], i32 6
+; VF8UF1-NEXT:    br i1 [[TMP15]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
+; VF8UF1:       [[PRED_STORE_IF13]]:
+; VF8UF1-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 6
+; VF8UF1-NEXT:    [[TMP16:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP22]]
+; VF8UF1-NEXT:    store i16 0, ptr [[TMP16]], align 2
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE14]]
+; VF8UF1:       [[PRED_STORE_CONTINUE14]]:
+; VF8UF1-NEXT:    [[TMP17:%.*]] = extractelement <8 x i1> [[TMP2]], i32 7
+; VF8UF1-NEXT:    br i1 [[TMP17]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16]]
+; VF8UF1:       [[PRED_STORE_IF15]]:
+; VF8UF1-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 7
+; VF8UF1-NEXT:    [[TMP18:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP25]]
+; VF8UF1-NEXT:    store i16 0, ptr [[TMP18]], align 2
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE16]]
+; VF8UF1:       [[PRED_STORE_CONTINUE16]]:
+; VF8UF1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; VF8UF1-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF8UF1:       [[MIDDLE_BLOCK]]:
+; VF8UF1-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF8UF1:       [[SCALAR_PH]]:
+; VF8UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ]
+; VF8UF1-NEXT:    br label %[[LOOP:.*]]
+; VF8UF1:       [[LOOP]]:
+; VF8UF1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF1-NEXT:    [[GEP_DST:%.*]] = getelementptr i16, ptr [[DST]], i64 [[IV]]
+; VF8UF1-NEXT:    store i16 0, ptr [[GEP_DST]], align 2
+; VF8UF1-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; VF8UF1-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF8UF1-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF8UF1:       [[EXIT]]:
+; VF8UF1-NEXT:    ret void
+;
+; VF8UF2-LABEL: define void @remove_loop_region_with_replicate_recipe(
+; VF8UF2-SAME: ptr [[DST:%.*]], i64 range(i64 5, 10) [[N:%.*]]) {
+; VF8UF2-NEXT:  [[ENTRY:.*]]:
+; VF8UF2-NEXT:    [[TMP0:%.*]] = add nsw i64 [[N]], -2
+; VF8UF2-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF2:       [[VECTOR_PH]]:
+; VF8UF2-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], 15
+; VF8UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
+; VF8UF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; VF8UF2-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1
+; VF8UF2-NEXT:    [[TMP1:%.*]] = add i64 2, [[N_VEC]]
+; VF8UF2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; VF8UF2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; VF8UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF8UF2:       [[VECTOR_BODY]]:
+; VF8UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE35:.*]] ]
+; VF8UF2-NEXT:    [[OFFSET_IDX:%.*]] = add i64 2, [[INDEX]]
+; VF8UF2-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[INDEX]], i64 0
+; VF8UF2-NEXT:    [[BROADCAST_SPLAT1:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer
+; VF8UF2-NEXT:    [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT1]], 
+; VF8UF2-NEXT:    [[VEC_IV3:%.*]] = add <8 x i64> [[BROADCAST_SPLAT1]], 
+; VF8UF2-NEXT:    [[TMP2:%.*]] = icmp ule <8 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
+; VF8UF2-NEXT:    [[TMP3:%.*]] = icmp ule <8 x i64> [[VEC_IV3]], [[BROADCAST_SPLAT]]
+; VF8UF2-NEXT:    [[TMP4:%.*]] = extractelement <8 x i1> [[TMP2]], i32 0
+; VF8UF2-NEXT:    br i1 [[TMP4]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF8UF2:       [[PRED_STORE_IF]]:
+; VF8UF2-NEXT:    [[TMP36:%.*]] = add i64 [[OFFSET_IDX]], 0
+; VF8UF2-NEXT:    [[TMP5:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP36]]
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP5]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE]]
+; VF8UF2:       [[PRED_STORE_CONTINUE]]:
+; VF8UF2-NEXT:    [[TMP6:%.*]] = extractelement <8 x i1> [[TMP2]], i32 1
+; VF8UF2-NEXT:    br i1 [[TMP6]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7:.*]]
+; VF8UF2:       [[PRED_STORE_IF6]]:
+; VF8UF2-NEXT:    [[TMP37:%.*]] = add i64 [[OFFSET_IDX]], 1
+; VF8UF2-NEXT:    [[TMP7:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP37]]
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP7]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE7]]
+; VF8UF2:       [[PRED_STORE_CONTINUE7]]:
+; VF8UF2-NEXT:    [[TMP8:%.*]] = extractelement <8 x i1> [[TMP2]], i32 2
+; VF8UF2-NEXT:    br i1 [[TMP8]], label %[[PRED_STORE_IF8:.*]], label %[[PRED_STORE_CONTINUE9:.*]]
+; VF8UF2:       [[PRED_STORE_IF8]]:
+; VF8UF2-NEXT:    [[TMP39:%.*]] = add i64 [[OFFSET_IDX]], 2
+; VF8UF2-NEXT:    [[TMP9:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP39]]
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP9]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE9]]
+; VF8UF2:       [[PRED_STORE_CONTINUE9]]:
+; VF8UF2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i1> [[TMP2]], i32 3
+; VF8UF2-NEXT:    br i1 [[TMP10]], label %[[PRED_STORE_IF10:.*]], label %[[PRED_STORE_CONTINUE11:.*]]
+; VF8UF2:       [[PRED_STORE_IF10]]:
+; VF8UF2-NEXT:    [[TMP40:%.*]] = add i64 [[OFFSET_IDX]], 3
+; VF8UF2-NEXT:    [[TMP11:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP40]]
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP11]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE11]]
+; VF8UF2:       [[PRED_STORE_CONTINUE11]]:
+; VF8UF2-NEXT:    [[TMP12:%.*]] = extractelement <8 x i1> [[TMP2]], i32 4
+; VF8UF2-NEXT:    br i1 [[TMP12]], label %[[PRED_STORE_IF12:.*]], label %[[PRED_STORE_CONTINUE13:.*]]
+; VF8UF2:       [[PRED_STORE_IF12]]:
+; VF8UF2-NEXT:    [[TMP42:%.*]] = add i64 [[OFFSET_IDX]], 4
+; VF8UF2-NEXT:    [[TMP13:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP42]]
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP13]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE13]]
+; VF8UF2:       [[PRED_STORE_CONTINUE13]]:
+; VF8UF2-NEXT:    [[TMP14:%.*]] = extractelement <8 x i1> [[TMP2]], i32 5
+; VF8UF2-NEXT:    br i1 [[TMP14]], label %[[PRED_STORE_IF14:.*]], label %[[PRED_STORE_CONTINUE15:.*]]
+; VF8UF2:       [[PRED_STORE_IF14]]:
+; VF8UF2-NEXT:    [[TMP43:%.*]] = add i64 [[OFFSET_IDX]], 5
+; VF8UF2-NEXT:    [[TMP15:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP43]]
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP15]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE15]]
+; VF8UF2:       [[PRED_STORE_CONTINUE15]]:
+; VF8UF2-NEXT:    [[TMP16:%.*]] = extractelement <8 x i1> [[TMP2]], i32 6
+; VF8UF2-NEXT:    br i1 [[TMP16]], label %[[PRED_STORE_IF16:.*]], label %[[PRED_STORE_CONTINUE17:.*]]
+; VF8UF2:       [[PRED_STORE_IF16]]:
+; VF8UF2-NEXT:    [[TMP45:%.*]] = add i64 [[OFFSET_IDX]], 6
+; VF8UF2-NEXT:    [[TMP17:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP45]]
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP17]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE17]]
+; VF8UF2:       [[PRED_STORE_CONTINUE17]]:
+; VF8UF2-NEXT:    [[TMP18:%.*]] = extractelement <8 x i1> [[TMP2]], i32 7
+; VF8UF2-NEXT:    br i1 [[TMP18]], label %[[PRED_STORE_IF18:.*]], label %[[PRED_STORE_CONTINUE19:.*]]
+; VF8UF2:       [[PRED_STORE_IF18]]:
+; VF8UF2-NEXT:    [[TMP46:%.*]] = add i64 [[OFFSET_IDX]], 7
+; VF8UF2-NEXT:    [[TMP19:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP46]]
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP19]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE19]]
+; VF8UF2:       [[PRED_STORE_CONTINUE19]]:
+; VF8UF2-NEXT:    [[TMP20:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0
+; VF8UF2-NEXT:    br i1 [[TMP20]], label %[[PRED_STORE_IF20:.*]], label %[[PRED_STORE_CONTINUE21:.*]]
+; VF8UF2:       [[PRED_STORE_IF20]]:
+; VF8UF2-NEXT:    [[TMP48:%.*]] = add i64 [[OFFSET_IDX]], 8
+; VF8UF2-NEXT:    [[TMP21:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP48]]
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP21]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE21]]
+; VF8UF2:       [[PRED_STORE_CONTINUE21]]:
+; VF8UF2-NEXT:    [[TMP22:%.*]] = extractelement <8 x i1> [[TMP3]], i32 1
+; VF8UF2-NEXT:    br i1 [[TMP22]], label %[[PRED_STORE_IF22:.*]], label %[[PRED_STORE_CONTINUE23:.*]]
+; VF8UF2:       [[PRED_STORE_IF22]]:
+; VF8UF2-NEXT:    [[TMP49:%.*]] = add i64 [[OFFSET_IDX]], 9
+; VF8UF2-NEXT:    [[TMP23:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP49]]
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP23]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE23]]
+; VF8UF2:       [[PRED_STORE_CONTINUE23]]:
+; VF8UF2-NEXT:    [[TMP24:%.*]] = extractelement <8 x i1> [[TMP3]], i32 2
+; VF8UF2-NEXT:    br i1 [[TMP24]], label %[[PRED_STORE_IF24:.*]], label %[[PRED_STORE_CONTINUE25:.*]]
+; VF8UF2:       [[PRED_STORE_IF24]]:
+; VF8UF2-NEXT:    [[TMP51:%.*]] = add i64 [[OFFSET_IDX]], 10
+; VF8UF2-NEXT:    [[TMP25:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP51]]
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP25]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE25]]
+; VF8UF2:       [[PRED_STORE_CONTINUE25]]:
+; VF8UF2-NEXT:    [[TMP26:%.*]] = extractelement <8 x i1> [[TMP3]], i32 3
+; VF8UF2-NEXT:    br i1 [[TMP26]], label %[[PRED_STORE_IF26:.*]], label %[[PRED_STORE_CONTINUE27:.*]]
+; VF8UF2:       [[PRED_STORE_IF26]]:
+; VF8UF2-NEXT:    [[TMP38:%.*]] = add i64 [[OFFSET_IDX]], 11
+; VF8UF2-NEXT:    [[TMP27:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP38]]
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP27]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE27]]
+; VF8UF2:       [[PRED_STORE_CONTINUE27]]:
+; VF8UF2-NEXT:    [[TMP28:%.*]] = extractelement <8 x i1> [[TMP3]], i32 4
+; VF8UF2-NEXT:    br i1 [[TMP28]], label %[[PRED_STORE_IF28:.*]], label %[[PRED_STORE_CONTINUE29:.*]]
+; VF8UF2:       [[PRED_STORE_IF28]]:
+; VF8UF2-NEXT:    [[TMP41:%.*]] = add i64 [[OFFSET_IDX]], 12
+; VF8UF2-NEXT:    [[TMP29:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP41]]
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP29]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE29]]
+; VF8UF2:       [[PRED_STORE_CONTINUE29]]:
+; VF8UF2-NEXT:    [[TMP30:%.*]] = extractelement <8 x i1> [[TMP3]], i32 5
+; VF8UF2-NEXT:    br i1 [[TMP30]], label %[[PRED_STORE_IF30:.*]], label %[[PRED_STORE_CONTINUE31:.*]]
+; VF8UF2:       [[PRED_STORE_IF30]]:
+; VF8UF2-NEXT:    [[TMP44:%.*]] = add i64 [[OFFSET_IDX]], 13
+; VF8UF2-NEXT:    [[TMP31:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP44]]
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP31]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE31]]
+; VF8UF2:       [[PRED_STORE_CONTINUE31]]:
+; VF8UF2-NEXT:    [[TMP32:%.*]] = extractelement <8 x i1> [[TMP3]], i32 6
+; VF8UF2-NEXT:    br i1 [[TMP32]], label %[[PRED_STORE_IF32:.*]], label %[[PRED_STORE_CONTINUE33:.*]]
+; VF8UF2:       [[PRED_STORE_IF32]]:
+; VF8UF2-NEXT:    [[TMP47:%.*]] = add i64 [[OFFSET_IDX]], 14
+; VF8UF2-NEXT:    [[TMP33:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP47]]
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP33]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE33]]
+; VF8UF2:       [[PRED_STORE_CONTINUE33]]:
+; VF8UF2-NEXT:    [[TMP34:%.*]] = extractelement <8 x i1> [[TMP3]], i32 7
+; VF8UF2-NEXT:    br i1 [[TMP34]], label %[[PRED_STORE_IF34:.*]], label %[[PRED_STORE_CONTINUE35]]
+; VF8UF2:       [[PRED_STORE_IF34]]:
+; VF8UF2-NEXT:    [[TMP50:%.*]] = add i64 [[OFFSET_IDX]], 15
+; VF8UF2-NEXT:    [[TMP35:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP50]]
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP35]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE35]]
+; VF8UF2:       [[PRED_STORE_CONTINUE35]]:
+; VF8UF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; VF8UF2-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF8UF2:       [[MIDDLE_BLOCK]]:
+; VF8UF2-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF8UF2:       [[SCALAR_PH]]:
+; VF8UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ]
+; VF8UF2-NEXT:    br label %[[LOOP:.*]]
+; VF8UF2:       [[LOOP]]:
+; VF8UF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF2-NEXT:    [[GEP_DST:%.*]] = getelementptr i16, ptr [[DST]], i64 [[IV]]
+; VF8UF2-NEXT:    store i16 0, ptr [[GEP_DST]], align 2
+; VF8UF2-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; VF8UF2-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF8UF2-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF8UF2:       [[EXIT]]:
+; VF8UF2-NEXT:    ret void
+;
+; VF16UF1-LABEL: define void @remove_loop_region_with_replicate_recipe(
+; VF16UF1-SAME: ptr [[DST:%.*]], i64 range(i64 5, 10) [[N:%.*]]) {
+; VF16UF1-NEXT:  [[ENTRY:.*]]:
+; VF16UF1-NEXT:    [[TMP0:%.*]] = add nsw i64 [[N]], -2
+; VF16UF1-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF16UF1:       [[VECTOR_PH]]:
+; VF16UF1-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], 15
+; VF16UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
+; VF16UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; VF16UF1-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1
+; VF16UF1-NEXT:    [[TMP1:%.*]] = add i64 2, [[N_VEC]]
+; VF16UF1-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; VF16UF1-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer
+; VF16UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF16UF1:       [[VECTOR_BODY]]:
+; VF16UF1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE32:.*]] ]
+; VF16UF1-NEXT:    [[OFFSET_IDX:%.*]] = add i64 2, [[INDEX]]
+; VF16UF1-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[INDEX]], i64 0
+; VF16UF1-NEXT:    [[BROADCAST_SPLAT1:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer
+; VF16UF1-NEXT:    [[VEC_IV:%.*]] = add <16 x i64> [[BROADCAST_SPLAT1]], 
+; VF16UF1-NEXT:    [[TMP2:%.*]] = icmp ule <16 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
+; VF16UF1-NEXT:    [[TMP3:%.*]] = extractelement <16 x i1> [[TMP2]], i32 0
+; VF16UF1-NEXT:    br i1 [[TMP3]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF16UF1:       [[PRED_STORE_IF]]:
+; VF16UF1-NEXT:    [[TMP35:%.*]] = add i64 [[OFFSET_IDX]], 0
+; VF16UF1-NEXT:    [[TMP4:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP35]]
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP4]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE]]
+; VF16UF1:       [[PRED_STORE_CONTINUE]]:
+; VF16UF1-NEXT:    [[TMP5:%.*]] = extractelement <16 x i1> [[TMP2]], i32 1
+; VF16UF1-NEXT:    br i1 [[TMP5]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; VF16UF1:       [[PRED_STORE_IF3]]:
+; VF16UF1-NEXT:    [[TMP36:%.*]] = add i64 [[OFFSET_IDX]], 1
+; VF16UF1-NEXT:    [[TMP6:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP36]]
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP6]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
+; VF16UF1:       [[PRED_STORE_CONTINUE4]]:
+; VF16UF1-NEXT:    [[TMP7:%.*]] = extractelement <16 x i1> [[TMP2]], i32 2
+; VF16UF1-NEXT:    br i1 [[TMP7]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; VF16UF1:       [[PRED_STORE_IF5]]:
+; VF16UF1-NEXT:    [[TMP38:%.*]] = add i64 [[OFFSET_IDX]], 2
+; VF16UF1-NEXT:    [[TMP8:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP38]]
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP8]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
+; VF16UF1:       [[PRED_STORE_CONTINUE6]]:
+; VF16UF1-NEXT:    [[TMP9:%.*]] = extractelement <16 x i1> [[TMP2]], i32 3
+; VF16UF1-NEXT:    br i1 [[TMP9]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; VF16UF1:       [[PRED_STORE_IF7]]:
+; VF16UF1-NEXT:    [[TMP39:%.*]] = add i64 [[OFFSET_IDX]], 3
+; VF16UF1-NEXT:    [[TMP10:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP39]]
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP10]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE8]]
+; VF16UF1:       [[PRED_STORE_CONTINUE8]]:
+; VF16UF1-NEXT:    [[TMP11:%.*]] = extractelement <16 x i1> [[TMP2]], i32 4
+; VF16UF1-NEXT:    br i1 [[TMP11]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; VF16UF1:       [[PRED_STORE_IF9]]:
+; VF16UF1-NEXT:    [[TMP41:%.*]] = add i64 [[OFFSET_IDX]], 4
+; VF16UF1-NEXT:    [[TMP12:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP41]]
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP12]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE10]]
+; VF16UF1:       [[PRED_STORE_CONTINUE10]]:
+; VF16UF1-NEXT:    [[TMP13:%.*]] = extractelement <16 x i1> [[TMP2]], i32 5
+; VF16UF1-NEXT:    br i1 [[TMP13]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
+; VF16UF1:       [[PRED_STORE_IF11]]:
+; VF16UF1-NEXT:    [[TMP42:%.*]] = add i64 [[OFFSET_IDX]], 5
+; VF16UF1-NEXT:    [[TMP14:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP42]]
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP14]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE12]]
+; VF16UF1:       [[PRED_STORE_CONTINUE12]]:
+; VF16UF1-NEXT:    [[TMP15:%.*]] = extractelement <16 x i1> [[TMP2]], i32 6
+; VF16UF1-NEXT:    br i1 [[TMP15]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
+; VF16UF1:       [[PRED_STORE_IF13]]:
+; VF16UF1-NEXT:    [[TMP44:%.*]] = add i64 [[OFFSET_IDX]], 6
+; VF16UF1-NEXT:    [[TMP16:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP44]]
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP16]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE14]]
+; VF16UF1:       [[PRED_STORE_CONTINUE14]]:
+; VF16UF1-NEXT:    [[TMP17:%.*]] = extractelement <16 x i1> [[TMP2]], i32 7
+; VF16UF1-NEXT:    br i1 [[TMP17]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]]
+; VF16UF1:       [[PRED_STORE_IF15]]:
+; VF16UF1-NEXT:    [[TMP45:%.*]] = add i64 [[OFFSET_IDX]], 7
+; VF16UF1-NEXT:    [[TMP18:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP45]]
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP18]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE16]]
+; VF16UF1:       [[PRED_STORE_CONTINUE16]]:
+; VF16UF1-NEXT:    [[TMP19:%.*]] = extractelement <16 x i1> [[TMP2]], i32 8
+; VF16UF1-NEXT:    br i1 [[TMP19]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]]
+; VF16UF1:       [[PRED_STORE_IF17]]:
+; VF16UF1-NEXT:    [[TMP47:%.*]] = add i64 [[OFFSET_IDX]], 8
+; VF16UF1-NEXT:    [[TMP20:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP47]]
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP20]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE18]]
+; VF16UF1:       [[PRED_STORE_CONTINUE18]]:
+; VF16UF1-NEXT:    [[TMP21:%.*]] = extractelement <16 x i1> [[TMP2]], i32 9
+; VF16UF1-NEXT:    br i1 [[TMP21]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]]
+; VF16UF1:       [[PRED_STORE_IF19]]:
+; VF16UF1-NEXT:    [[TMP48:%.*]] = add i64 [[OFFSET_IDX]], 9
+; VF16UF1-NEXT:    [[TMP22:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP48]]
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP22]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE20]]
+; VF16UF1:       [[PRED_STORE_CONTINUE20]]:
+; VF16UF1-NEXT:    [[TMP23:%.*]] = extractelement <16 x i1> [[TMP2]], i32 10
+; VF16UF1-NEXT:    br i1 [[TMP23]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]]
+; VF16UF1:       [[PRED_STORE_IF21]]:
+; VF16UF1-NEXT:    [[TMP50:%.*]] = add i64 [[OFFSET_IDX]], 10
+; VF16UF1-NEXT:    [[TMP24:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP50]]
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP24]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE22]]
+; VF16UF1:       [[PRED_STORE_CONTINUE22]]:
+; VF16UF1-NEXT:    [[TMP25:%.*]] = extractelement <16 x i1> [[TMP2]], i32 11
+; VF16UF1-NEXT:    br i1 [[TMP25]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]]
+; VF16UF1:       [[PRED_STORE_IF23]]:
+; VF16UF1-NEXT:    [[TMP37:%.*]] = add i64 [[OFFSET_IDX]], 11
+; VF16UF1-NEXT:    [[TMP26:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP37]]
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP26]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE24]]
+; VF16UF1:       [[PRED_STORE_CONTINUE24]]:
+; VF16UF1-NEXT:    [[TMP27:%.*]] = extractelement <16 x i1> [[TMP2]], i32 12
+; VF16UF1-NEXT:    br i1 [[TMP27]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]]
+; VF16UF1:       [[PRED_STORE_IF25]]:
+; VF16UF1-NEXT:    [[TMP40:%.*]] = add i64 [[OFFSET_IDX]], 12
+; VF16UF1-NEXT:    [[TMP28:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP40]]
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP28]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE26]]
+; VF16UF1:       [[PRED_STORE_CONTINUE26]]:
+; VF16UF1-NEXT:    [[TMP29:%.*]] = extractelement <16 x i1> [[TMP2]], i32 13
+; VF16UF1-NEXT:    br i1 [[TMP29]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]]
+; VF16UF1:       [[PRED_STORE_IF27]]:
+; VF16UF1-NEXT:    [[TMP43:%.*]] = add i64 [[OFFSET_IDX]], 13
+; VF16UF1-NEXT:    [[TMP30:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP43]]
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP30]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE28]]
+; VF16UF1:       [[PRED_STORE_CONTINUE28]]:
+; VF16UF1-NEXT:    [[TMP31:%.*]] = extractelement <16 x i1> [[TMP2]], i32 14
+; VF16UF1-NEXT:    br i1 [[TMP31]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]]
+; VF16UF1:       [[PRED_STORE_IF29]]:
+; VF16UF1-NEXT:    [[TMP46:%.*]] = add i64 [[OFFSET_IDX]], 14
+; VF16UF1-NEXT:    [[TMP32:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP46]]
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP32]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE30]]
+; VF16UF1:       [[PRED_STORE_CONTINUE30]]:
+; VF16UF1-NEXT:    [[TMP33:%.*]] = extractelement <16 x i1> [[TMP2]], i32 15
+; VF16UF1-NEXT:    br i1 [[TMP33]], label %[[PRED_STORE_IF31:.*]], label %[[PRED_STORE_CONTINUE32]]
+; VF16UF1:       [[PRED_STORE_IF31]]:
+; VF16UF1-NEXT:    [[TMP49:%.*]] = add i64 [[OFFSET_IDX]], 15
+; VF16UF1-NEXT:    [[TMP34:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP49]]
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP34]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE32]]
+; VF16UF1:       [[PRED_STORE_CONTINUE32]]:
+; VF16UF1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; VF16UF1-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF16UF1:       [[MIDDLE_BLOCK]]:
+; VF16UF1-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF16UF1:       [[SCALAR_PH]]:
+; VF16UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ]
+; VF16UF1-NEXT:    br label %[[LOOP:.*]]
+; VF16UF1:       [[LOOP]]:
+; VF16UF1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF16UF1-NEXT:    [[GEP_DST:%.*]] = getelementptr i16, ptr [[DST]], i64 [[IV]]
+; VF16UF1-NEXT:    store i16 0, ptr [[GEP_DST]], align 2
+; VF16UF1-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; VF16UF1-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF16UF1-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF16UF1:       [[EXIT]]:
+; VF16UF1-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 2, %entry ], [ %iv.next, %loop ]
+  %gep.dst = getelementptr i16, ptr %dst, i64 %iv
+  store i16 0, ptr %gep.dst, align 2
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %N
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+declare i1 @cond()
+
+define void @remove_loop_region_outer_loop(i64 range(i64 8, 17) %N, ptr noalias %src, ptr %dst) {
+; VF8UF1-LABEL: define void @remove_loop_region_outer_loop(
+; VF8UF1-SAME: i64 range(i64 8, 17) [[N:%.*]], ptr noalias [[SRC:%.*]], ptr [[DST:%.*]]) {
+; VF8UF1-NEXT:  [[ENTRY:.*]]:
+; VF8UF1-NEXT:    br label %[[OUTER_HEADER:.*]]
+; VF8UF1:       [[OUTER_HEADER]]:
+; VF8UF1-NEXT:    [[OUTER_IV:%.*]] = phi ptr [ [[SRC]], %[[ENTRY]] ], [ [[OUTER_IV_NEXT:%.*]], %[[OUTER_LATCH:.*]] ]
+; VF8UF1-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF1:       [[VECTOR_PH]]:
+; VF8UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; VF8UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF8UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF8UF1:       [[VECTOR_BODY]]:
+; VF8UF1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF8UF1-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF8UF1-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 [[TMP0]]
+; VF8UF1-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; VF8UF1-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
+; VF8UF1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
+; VF8UF1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0
+; VF8UF1-NEXT:    store <8 x i8> [[WIDE_LOAD]], ptr [[TMP4]], align 1
+; VF8UF1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; VF8UF1-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF8UF1-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF8UF1:       [[MIDDLE_BLOCK]]:
+; VF8UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF8UF1-NEXT:    br i1 [[CMP_N]], label %[[OUTER_LATCH]], label %[[SCALAR_PH]]
+; VF8UF1:       [[SCALAR_PH]]:
+; VF8UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_HEADER]] ]
+; VF8UF1-NEXT:    br label %[[INNER:.*]]
+; VF8UF1:       [[INNER]]:
+; VF8UF1-NEXT:    [[INNER_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[INNER]] ]
+; VF8UF1-NEXT:    [[GEP_SRC:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 [[INNER_IV]]
+; VF8UF1-NEXT:    [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1
+; VF8UF1-NEXT:    [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INNER_IV]]
+; VF8UF1-NEXT:    store i8 [[L]], ptr [[GEP_DST]], align 1
+; VF8UF1-NEXT:    [[IV_NEXT]] = add i64 [[INNER_IV]], 1
+; VF8UF1-NEXT:    [[C_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF8UF1-NEXT:    br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop [[LOOP7:![0-9]+]]
+; VF8UF1:       [[OUTER_LATCH]]:
+; VF8UF1-NEXT:    [[OUTER_IV_NEXT]] = getelementptr i8, ptr [[OUTER_IV]], i64 1
+; VF8UF1-NEXT:    [[C_2:%.*]] = call i1 @cond()
+; VF8UF1-NEXT:    br i1 [[C_2]], label %[[OUTER_HEADER]], label %[[EXIT:.*]]
+; VF8UF1:       [[EXIT]]:
+; VF8UF1-NEXT:    ret void
+;
+; VF8UF2-LABEL: define void @remove_loop_region_outer_loop(
+; VF8UF2-SAME: i64 range(i64 8, 17) [[N:%.*]], ptr noalias [[SRC:%.*]], ptr [[DST:%.*]]) {
+; VF8UF2-NEXT:  [[ENTRY:.*]]:
+; VF8UF2-NEXT:    br label %[[OUTER_HEADER:.*]]
+; VF8UF2:       [[OUTER_HEADER]]:
+; VF8UF2-NEXT:    [[OUTER_IV:%.*]] = phi ptr [ [[SRC]], %[[ENTRY]] ], [ [[OUTER_IV_NEXT:%.*]], %[[OUTER_LATCH:.*]] ]
+; VF8UF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
+; VF8UF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF2:       [[VECTOR_PH]]:
+; VF8UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; VF8UF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF8UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF8UF2:       [[VECTOR_BODY]]:
+; VF8UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF8UF2-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; VF8UF2-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 [[TMP6]]
+; VF8UF2-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
+; VF8UF2-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
+; VF8UF2-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
+; VF8UF2-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
+; VF8UF2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]]
+; VF8UF2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0
+; VF8UF2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[TMP3]], i32 8
+; VF8UF2-NEXT:    store <8 x i8> [[WIDE_LOAD]], ptr [[TMP4]], align 1
+; VF8UF2-NEXT:    store <8 x i8> [[WIDE_LOAD1]], ptr [[TMP5]], align 1
+; VF8UF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; VF8UF2-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF8UF2:       [[MIDDLE_BLOCK]]:
+; VF8UF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF8UF2-NEXT:    br i1 [[CMP_N]], label %[[OUTER_LATCH]], label %[[SCALAR_PH]]
+; VF8UF2:       [[SCALAR_PH]]:
+; VF8UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_HEADER]] ]
+; VF8UF2-NEXT:    br label %[[INNER:.*]]
+; VF8UF2:       [[INNER]]:
+; VF8UF2-NEXT:    [[INNER_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[INNER]] ]
+; VF8UF2-NEXT:    [[GEP_SRC:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 [[INNER_IV]]
+; VF8UF2-NEXT:    [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1
+; VF8UF2-NEXT:    [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INNER_IV]]
+; VF8UF2-NEXT:    store i8 [[L]], ptr [[GEP_DST]], align 1
+; VF8UF2-NEXT:    [[IV_NEXT]] = add i64 [[INNER_IV]], 1
+; VF8UF2-NEXT:    [[C_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF8UF2-NEXT:    br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop [[LOOP7:![0-9]+]]
+; VF8UF2:       [[OUTER_LATCH]]:
+; VF8UF2-NEXT:    [[OUTER_IV_NEXT]] = getelementptr i8, ptr [[OUTER_IV]], i64 1
+; VF8UF2-NEXT:    [[C_2:%.*]] = call i1 @cond()
+; VF8UF2-NEXT:    br i1 [[C_2]], label %[[OUTER_HEADER]], label %[[EXIT:.*]]
+; VF8UF2:       [[EXIT]]:
+; VF8UF2-NEXT:    ret void
+;
+; VF16UF1-LABEL: define void @remove_loop_region_outer_loop(
+; VF16UF1-SAME: i64 range(i64 8, 17) [[N:%.*]], ptr noalias [[SRC:%.*]], ptr [[DST:%.*]]) {
+; VF16UF1-NEXT:  [[ENTRY:.*]]:
+; VF16UF1-NEXT:    br label %[[OUTER_HEADER:.*]]
+; VF16UF1:       [[OUTER_HEADER]]:
+; VF16UF1-NEXT:    [[OUTER_IV:%.*]] = phi ptr [ [[SRC]], %[[ENTRY]] ], [ [[OUTER_IV_NEXT:%.*]], %[[OUTER_LATCH:.*]] ]
+; VF16UF1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
+; VF16UF1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF16UF1:       [[VECTOR_PH]]:
+; VF16UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; VF16UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF16UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF16UF1:       [[VECTOR_BODY]]:
+; VF16UF1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF16UF1-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; VF16UF1-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 [[TMP4]]
+; VF16UF1-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
+; VF16UF1-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; VF16UF1-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP4]]
+; VF16UF1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[TMP2]], i32 0
+; VF16UF1-NEXT:    store <16 x i8> [[WIDE_LOAD]], ptr [[TMP3]], align 1
+; VF16UF1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; VF16UF1-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF16UF1:       [[MIDDLE_BLOCK]]:
+; VF16UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF16UF1-NEXT:    br i1 [[CMP_N]], label %[[OUTER_LATCH]], label %[[SCALAR_PH]]
+; VF16UF1:       [[SCALAR_PH]]:
+; VF16UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_HEADER]] ]
+; VF16UF1-NEXT:    br label %[[INNER:.*]]
+; VF16UF1:       [[INNER]]:
+; VF16UF1-NEXT:    [[INNER_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[INNER]] ]
+; VF16UF1-NEXT:    [[GEP_SRC:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 [[INNER_IV]]
+; VF16UF1-NEXT:    [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1
+; VF16UF1-NEXT:    [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INNER_IV]]
+; VF16UF1-NEXT:    store i8 [[L]], ptr [[GEP_DST]], align 1
+; VF16UF1-NEXT:    [[IV_NEXT]] = add i64 [[INNER_IV]], 1
+; VF16UF1-NEXT:    [[C_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF16UF1-NEXT:    br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop [[LOOP7:![0-9]+]]
+; VF16UF1:       [[OUTER_LATCH]]:
+; VF16UF1-NEXT:    [[OUTER_IV_NEXT]] = getelementptr i8, ptr [[OUTER_IV]], i64 1
+; VF16UF1-NEXT:    [[C_2:%.*]] = call i1 @cond()
+; VF16UF1-NEXT:    br i1 [[C_2]], label %[[OUTER_HEADER]], label %[[EXIT:.*]]
+; VF16UF1:       [[EXIT]]:
+; VF16UF1-NEXT:    ret void
+;
+entry:
+  br label %outer.header
+
+outer.header:
+  %outer.iv = phi ptr [ %src, %entry ], [ %outer.iv.next, %outer.latch ]
+  br label %inner
+
+inner:
+  %inner.iv = phi i64 [ 0, %outer.header ], [ %iv.next, %inner ]
+  %gep.src = getelementptr i8, ptr %outer.iv, i64 %inner.iv
+  %l = load i8, ptr %gep.src, align 1
+  %gep.dst = getelementptr i8, ptr %dst, i64 %inner.iv
+  store i8 %l, ptr %gep.dst, align 1
+  %iv.next = add i64 %inner.iv, 1
+  %c.1 = icmp eq i64 %iv.next, %N
+  br i1 %c.1, label %outer.latch, label %inner
+
+outer.latch:
+  %outer.iv.next = getelementptr i8, ptr %outer.iv, i64 1
+  %c.2 = call i1 @cond()
+  br i1 %c.2, label %outer.header, label %exit
+
+exit:
+  ret void
+}
+;.
+; VF8UF1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; VF8UF1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; VF8UF1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; VF8UF1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; VF8UF1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; VF8UF1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; VF8UF1: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; VF8UF1: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+;.
+; VF8UF2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; VF8UF2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; VF8UF2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; VF8UF2: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; VF8UF2: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; VF8UF2: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; VF8UF2: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; VF8UF2: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+;.
+; VF16UF1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; VF16UF1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; VF16UF1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; VF16UF1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; VF16UF1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; VF16UF1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; VF16UF1: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; VF16UF1: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+;.
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}

From 3026ecaff54b220409ecc254b4f6209801a251b9 Mon Sep 17 00:00:00 2001
From: Florian Hahn 
Date: Tue, 31 Dec 2024 20:11:01 +0000
Subject: [PATCH 229/567] [LV] Also verify loops in vector loop removal tests.

Also verify loop info in tests added in 7d6ec3b9680.
---
 .../LoopVectorize/vector-loop-backedge-elimination.ll       | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
index 8bcba56e0d43a..7e6e5249381cd 100644
--- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
+++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes='loop-vectorize,verify' -force-vector-width=8 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=CHECK,VF8UF1 %s
-; RUN: opt -passes='loop-vectorize,verify' -force-vector-width=8 -force-vector-interleave=2 -S %s | FileCheck --check-prefixes=CHECK,VF8UF2 %s
-; RUN: opt -passes='loop-vectorize,verify' -force-vector-width=16 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=CHECK,VF16UF1 %s
+; RUN: opt -passes='loop-vectorize,verify' -force-vector-width=8 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=VF8UF1 %s
+; RUN: opt -passes='loop-vectorize,verify' -force-vector-width=8 -force-vector-interleave=2 -S %s | FileCheck --check-prefixes=VF8UF2 %s
+; RUN: opt -passes='loop-vectorize,verify' -force-vector-width=16 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=VF16UF1 %s
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 

From 1d0f40ba05b76ff028c69054899f88f1c7452b4b Mon Sep 17 00:00:00 2001
From: Tristan Ross 
Date: Tue, 31 Dec 2024 16:36:24 -0800
Subject: [PATCH 230/567] [libc] fix generic __stack_chk_fail for fuchsia
 (#121401)

---
 libc/src/compiler/generic/__stack_chk_fail.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libc/src/compiler/generic/__stack_chk_fail.cpp b/libc/src/compiler/generic/__stack_chk_fail.cpp
index 183cf9eb2cbf2..00e976ad8bc2a 100644
--- a/libc/src/compiler/generic/__stack_chk_fail.cpp
+++ b/libc/src/compiler/generic/__stack_chk_fail.cpp
@@ -9,6 +9,7 @@
 #include "src/compiler/__stack_chk_fail.h"
 #include "src/__support/OSUtil/io.h"
 #include "src/stdlib/abort.h"
+#include <stdint.h> // For uintptr_t
 
 extern "C" {
 

From 2feffecb8853b1cdd38a0653df63d70412e65c12 Mon Sep 17 00:00:00 2001
From: Stephen Senran Zhang 
Date: Wed, 1 Jan 2025 10:40:17 +0800
Subject: [PATCH 231/567] [ConstantRange] Estimate tighter lower (upper) bounds
 for masked binary and (or) (#120352)

Fixes #118108.

Co-author: Yingwei Zheng (@dtcxzyw)
---
 clang/test/CodeGen/AArch64/fpm-helpers.c      | 18 ++--
 llvm/lib/IR/ConstantRange.cpp                 | 76 ++++++++++++++--
 .../SCCP/range-and-or-bit-masked.ll           | 88 +++++++++++++++++++
 llvm/unittests/IR/ConstantRangeTest.cpp       | 31 +++++++
 4 files changed, 198 insertions(+), 15 deletions(-)
 create mode 100644 llvm/test/Transforms/SCCP/range-and-or-bit-masked.ll

diff --git a/clang/test/CodeGen/AArch64/fpm-helpers.c b/clang/test/CodeGen/AArch64/fpm-helpers.c
index 4bced01d5c71f..6264b5caeb4f5 100644
--- a/clang/test/CodeGen/AArch64/fpm-helpers.c
+++ b/clang/test/CodeGen/AArch64/fpm-helpers.c
@@ -35,7 +35,7 @@ extern "C" {
 //
 fpm_t test_init() { return __arm_fpm_init(); }
 
-// CHECK-LABEL: define dso_local noundef i64 @test_src1_1(
+// CHECK-LABEL: define dso_local noundef range(i64 0, -6) i64 @test_src1_1(
 // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    ret i64 -8
@@ -44,7 +44,7 @@ fpm_t test_src1_1() {
   return __arm_set_fpm_src1_format(INIT_ONES, __ARM_FPM_E5M2);
 }
 
-// CHECK-LABEL: define dso_local noundef i64 @test_src1_2(
+// CHECK-LABEL: define dso_local noundef range(i64 0, -6) i64 @test_src1_2(
 // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    ret i64 1
@@ -53,7 +53,7 @@ fpm_t test_src1_2() {
   return __arm_set_fpm_src1_format(INIT_ZERO, __ARM_FPM_E4M3);
 }
 
-// CHECK-LABEL: define dso_local noundef i64 @test_src2_1(
+// CHECK-LABEL: define dso_local noundef range(i64 0, -48) i64 @test_src2_1(
 // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    ret i64 -57
@@ -62,7 +62,7 @@ fpm_t test_src2_1() {
   return __arm_set_fpm_src2_format(INIT_ONES, __ARM_FPM_E5M2);
 }
 
-// CHECK-LABEL: define dso_local noundef i64 @test_src2_2(
+// CHECK-LABEL: define dso_local noundef range(i64 0, -48) i64 @test_src2_2(
 // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    ret i64 8
@@ -71,7 +71,7 @@ fpm_t test_src2_2() {
   return __arm_set_fpm_src2_format(INIT_ZERO, __ARM_FPM_E4M3);
 }
 
-// CHECK-LABEL: define dso_local noundef i64 @test_dst1_1(
+// CHECK-LABEL: define dso_local noundef range(i64 0, -384) i64 @test_dst1_1(
 // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    ret i64 -449
@@ -80,7 +80,7 @@ fpm_t test_dst1_1() {
   return __arm_set_fpm_dst_format(INIT_ONES, __ARM_FPM_E5M2);
 }
 
-// CHECK-LABEL: define dso_local noundef i64 @test_dst2_2(
+// CHECK-LABEL: define dso_local noundef range(i64 0, -384) i64 @test_dst2_2(
 // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    ret i64 64
@@ -139,21 +139,21 @@ fpm_t test_lscale() { return __arm_set_fpm_lscale(INIT_ZERO, 127); }
 //
 fpm_t test_lscale2() { return __arm_set_fpm_lscale2(INIT_ZERO, 63); }
 
-// CHECK-LABEL: define dso_local noundef range(i64 0, 4294967296) i64 @test_nscale_1(
+// CHECK-LABEL: define dso_local noundef range(i64 0, 4278190081) i64 @test_nscale_1(
 // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    ret i64 2147483648
 //
 fpm_t test_nscale_1() { return __arm_set_fpm_nscale(INIT_ZERO, -128); }
 
-// CHECK-LABEL: define dso_local noundef range(i64 0, 4294967296) i64 @test_nscale_2(
+// CHECK-LABEL: define dso_local noundef range(i64 0, 4278190081) i64 @test_nscale_2(
 // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    ret i64 2130706432
 //
 fpm_t test_nscale_2() { return __arm_set_fpm_nscale(INIT_ZERO, 127); }
 
-// CHECK-LABEL: define dso_local noundef range(i64 0, 4294967296) i64 @test_nscale_3(
+// CHECK-LABEL: define dso_local noundef range(i64 0, 4278190081) i64 @test_nscale_3(
 // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    ret i64 4278190080
diff --git a/llvm/lib/IR/ConstantRange.cpp b/llvm/lib/IR/ConstantRange.cpp
index d81a292916fde..3566435398992 100644
--- a/llvm/lib/IR/ConstantRange.cpp
+++ b/llvm/lib/IR/ConstantRange.cpp
@@ -1520,15 +1520,72 @@ ConstantRange ConstantRange::binaryNot() const {
   return ConstantRange(APInt::getAllOnes(getBitWidth())).sub(*this);
 }
 
+/// Estimate the 'bit-masked AND' operation's lower bound.
+///
+/// E.g., given two ranges as follows (single quotes are separators and
+/// have no meaning here),
+///
+///   LHS = [10'00101'1,  ; LLo
+///          10'10000'0]  ; LHi
+///   RHS = [10'11111'0,  ; RLo
+///          10'11111'1]  ; RHi
+///
+/// we know that the higher 2 bits of the result is always 10; and we also
+/// notice that RHS[1:6] are always 1, so the result[1:6] cannot be less than
+/// LHS[1:6] (i.e., 00101). Thus, the lower bound is 10'00101'0.
+///
+/// The algorithm is as follows,
+/// 1. we first calculate a mask to find the higher common bits by
+///       Mask = ~((LLo ^ LHi) | (RLo ^ RHi) | (LLo ^ RLo));
+///       Mask = clear all non-leading-ones bits in Mask;
+///    in the example, the Mask is set to 11'00000'0;
+/// 2. calculate a new mask by setting all common leading bits to 1 in RHS, and
+///    keeping the longest leading ones (i.e., 11'11111'0 in the example);
+/// 3. return (LLo & new mask) as the lower bound;
+/// 4. repeat the step 2 and 3 with LHS and RHS swapped, and update the lower
+///    bound with the larger one.
+static APInt estimateBitMaskedAndLowerBound(const ConstantRange &LHS,
+                                            const ConstantRange &RHS) {
+  auto BitWidth = LHS.getBitWidth();
+  // If either is full set or unsigned wrapped, then the range must contain '0'
+  // which leads the lower bound to 0.
+  if ((LHS.isFullSet() || RHS.isFullSet()) ||
+      (LHS.isWrappedSet() || RHS.isWrappedSet()))
+    return APInt::getZero(BitWidth);
+
+  auto LLo = LHS.getLower();
+  auto LHi = LHS.getUpper() - 1;
+  auto RLo = RHS.getLower();
+  auto RHi = RHS.getUpper() - 1;
+
+  // Calculate the mask for the higher common bits.
+  auto Mask = ~((LLo ^ LHi) | (RLo ^ RHi) | (LLo ^ RLo));
+  unsigned LeadingOnes = Mask.countLeadingOnes();
+  Mask.clearLowBits(BitWidth - LeadingOnes);
+
+  auto estimateBound = [BitWidth, &Mask](APInt ALo, const APInt &BLo,
+                                         const APInt &BHi) {
+    unsigned LeadingOnes = ((BLo & BHi) | Mask).countLeadingOnes();
+    unsigned StartBit = BitWidth - LeadingOnes;
+    ALo.clearLowBits(StartBit);
+    return ALo;
+  };
+
+  auto LowerBoundByLHS = estimateBound(LLo, RLo, RHi);
+  auto LowerBoundByRHS = estimateBound(RLo, LLo, LHi);
+
+  return APIntOps::umax(LowerBoundByLHS, LowerBoundByRHS);
+}
+
 ConstantRange ConstantRange::binaryAnd(const ConstantRange &Other) const {
   if (isEmptySet() || Other.isEmptySet())
     return getEmpty();
 
   ConstantRange KnownBitsRange =
       fromKnownBits(toKnownBits() & Other.toKnownBits(), false);
-  ConstantRange UMinUMaxRange =
-      getNonEmpty(APInt::getZero(getBitWidth()),
-                  APIntOps::umin(Other.getUnsignedMax(), getUnsignedMax()) + 1);
+  auto LowerBound = estimateBitMaskedAndLowerBound(*this, Other);
+  ConstantRange UMinUMaxRange = getNonEmpty(
+      LowerBound, APIntOps::umin(Other.getUnsignedMax(), getUnsignedMax()) + 1);
   return KnownBitsRange.intersectWith(UMinUMaxRange);
 }
 
@@ -1538,10 +1595,17 @@ ConstantRange ConstantRange::binaryOr(const ConstantRange &Other) const {
 
   ConstantRange KnownBitsRange =
       fromKnownBits(toKnownBits() | Other.toKnownBits(), false);
+
+  //      ~a & ~b    >= x
+  // <=>  ~(~a & ~b) <= ~x
+  // <=>  a | b      <= ~x
+  // <=>  a | b      <  ~x + 1 = -x
+  // thus, UpperBound(a | b) == -LowerBound(~a & ~b)
+  auto UpperBound =
+      -estimateBitMaskedAndLowerBound(binaryNot(), Other.binaryNot());
   // Upper wrapped range.
-  ConstantRange UMaxUMinRange =
-      getNonEmpty(APIntOps::umax(getUnsignedMin(), Other.getUnsignedMin()),
-                  APInt::getZero(getBitWidth()));
+  ConstantRange UMaxUMinRange = getNonEmpty(
+      APIntOps::umax(getUnsignedMin(), Other.getUnsignedMin()), UpperBound);
   return KnownBitsRange.intersectWith(UMaxUMinRange);
 }
 
diff --git a/llvm/test/Transforms/SCCP/range-and-or-bit-masked.ll b/llvm/test/Transforms/SCCP/range-and-or-bit-masked.ll
new file mode 100644
index 0000000000000..e81c5d739c6d2
--- /dev/null
+++ b/llvm/test/Transforms/SCCP/range-and-or-bit-masked.ll
@@ -0,0 +1,88 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=ipsccp %s | FileCheck %s
+
+declare void @use(i1)
+
+define i1 @test1(i64 %x) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = icmp ugt i64 [[X:%.*]], 65535
+; CHECK-NEXT:    call void @llvm.assume(i1 [[COND]])
+; CHECK-NEXT:    [[MASK:%.*]] = and i64 [[X]], -65521
+; CHECK-NEXT:    ret i1 false
+;
+entry:
+  %cond = icmp ugt i64 %x, 65535
+  call void @llvm.assume(i1 %cond)
+  %mask = and i64 %x, -65521
+  %cmp = icmp eq i64 %mask, 0
+  ret i1 %cmp
+}
+
+define void @test.and(i64 %x, i64 %y) {
+; CHECK-LABEL: @test.and(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[C0:%.*]] = icmp uge i64 [[X:%.*]], 138
+; CHECK-NEXT:    [[C1:%.*]] = icmp ule i64 [[X]], 161
+; CHECK-NEXT:    call void @llvm.assume(i1 [[C0]])
+; CHECK-NEXT:    call void @llvm.assume(i1 [[C1]])
+; CHECK-NEXT:    [[C2:%.*]] = icmp uge i64 [[Y:%.*]], 186
+; CHECK-NEXT:    [[C3:%.*]] = icmp ule i64 [[Y]], 188
+; CHECK-NEXT:    call void @llvm.assume(i1 [[C2]])
+; CHECK-NEXT:    call void @llvm.assume(i1 [[C3]])
+; CHECK-NEXT:    [[AND:%.*]] = and i64 [[X]], [[Y]]
+; CHECK-NEXT:    call void @use(i1 false)
+; CHECK-NEXT:    [[R1:%.*]] = icmp ult i64 [[AND]], 137
+; CHECK-NEXT:    call void @use(i1 [[R1]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c0 = icmp uge i64 %x, 138 ; 0b10001010
+  %c1 = icmp ule i64 %x, 161 ; 0b10100000
+  call void @llvm.assume(i1 %c0)
+  call void @llvm.assume(i1 %c1)
+  %c2 = icmp uge i64 %y, 186 ; 0b10111010
+  %c3 = icmp ule i64 %y, 188 ; 0b10111110
+  call void @llvm.assume(i1 %c2)
+  call void @llvm.assume(i1 %c3)
+  %and = and i64 %x, %y
+  %r0 = icmp ult i64 %and, 136 ; 0b10001000
+  call void @use(i1 %r0) ; false
+  %r1 = icmp ult i64 %and, 137
+  call void @use(i1 %r1) ; unknown
+  ret void
+}
+
+define void @test.or(i64 %x, i64 %y) {
+; CHECK-LABEL: @test.or(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[C0:%.*]] = icmp ule i64 [[X:%.*]], 117
+; CHECK-NEXT:    [[C1:%.*]] = icmp uge i64 [[X]], 95
+; CHECK-NEXT:    call void @llvm.assume(i1 [[C0]])
+; CHECK-NEXT:    call void @llvm.assume(i1 [[C1]])
+; CHECK-NEXT:    [[C2:%.*]] = icmp ule i64 [[Y:%.*]], 69
+; CHECK-NEXT:    [[C3:%.*]] = icmp uge i64 [[Y]], 67
+; CHECK-NEXT:    call void @llvm.assume(i1 [[C2]])
+; CHECK-NEXT:    call void @llvm.assume(i1 [[C3]])
+; CHECK-NEXT:    [[OR:%.*]] = or i64 [[X]], [[Y]]
+; CHECK-NEXT:    call void @use(i1 false)
+; CHECK-NEXT:    [[R1:%.*]] = icmp ugt i64 [[OR]], 118
+; CHECK-NEXT:    call void @use(i1 [[R1]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c0 = icmp ule i64 %x, 117 ; 0b01110101
+  %c1 = icmp uge i64 %x, 95  ; 0b01011111
+  call void @llvm.assume(i1 %c0)
+  call void @llvm.assume(i1 %c1)
+  %c2 = icmp ule i64 %y, 69  ; 0b01000101
+  %c3 = icmp uge i64 %y, 67  ; 0b01000011
+  call void @llvm.assume(i1 %c2)
+  call void @llvm.assume(i1 %c3)
+  %or = or i64 %x, %y
+  %r0 = icmp ugt i64 %or, 119 ; 0b01110111
+  call void @use(i1 %r0) ; false
+  %r1 = icmp ugt i64 %or, 118
+  call void @use(i1 %r1) ; unknown
+  ret void
+}
diff --git a/llvm/unittests/IR/ConstantRangeTest.cpp b/llvm/unittests/IR/ConstantRangeTest.cpp
index e1d9b3e387b20..c390ffea1c352 100644
--- a/llvm/unittests/IR/ConstantRangeTest.cpp
+++ b/llvm/unittests/IR/ConstantRangeTest.cpp
@@ -2720,6 +2720,37 @@ TEST_F(ConstantRangeTest, binaryAnd) {
   EXPECT_EQ(R16_32.binaryAnd(R0_99), R0_32);
   EXPECT_EQ(R0_99.binaryAnd(R16_32), R0_32);
 
+  // 'And' with leading bits are masked (with common leading bits stripped)
+  ConstantRange RMaskedL(APInt(8, 0b10'00101'1), APInt(8, 0b10'10000'0 + 1));
+  ConstantRange RMaskedR(APInt(8, 0b10'11111'0), APInt(8, 0b10'11111'1 + 1));
+  EXPECT_EQ(RMaskedL.binaryAnd(RMaskedR).getLower(), APInt(8, 0b10'00101'0));
+  EXPECT_EQ(RMaskedR.binaryAnd(RMaskedL).getLower(), APInt(8, 0b10'00101'0));
+
+  ConstantRange RMaskedL1(APInt(8, 0b00'011'010), APInt(8, 0b00'100'100 + 1));
+  ConstantRange RMaskedR1(APInt(8, 0b00'111'010), APInt(8, 0b00'111'110 + 1));
+  EXPECT_EQ(RMaskedL1.binaryAnd(RMaskedR1).getLower(), APInt(8, 0b00'011'000));
+  EXPECT_EQ(RMaskedR1.binaryAnd(RMaskedL1).getLower(), APInt(8, 0b00'011'000));
+
+  ConstantRange RMaskedL2(APInt(8, 0b0000'0111u), APInt(8, 0b0000'1101u + 1u));
+  ConstantRange RMaskedR2(APInt(8, 0xff), APInt(8, 0));
+  EXPECT_EQ(RMaskedL2.binaryAnd(RMaskedR2), RMaskedL2);
+  EXPECT_EQ(RMaskedR2.binaryAnd(RMaskedL2), RMaskedL2);
+
+  ConstantRange RMaskedL3(APInt(4, 0b0011u), APInt(4, 0));
+  ConstantRange RMaskedR3(APInt(4, 0b1011u), APInt(4, 0));
+  APInt Zero_4(4, 0);
+  EXPECT_EQ(RMaskedL3.binaryAnd(RMaskedR3).getLower().uge(Zero_4), true);
+  EXPECT_EQ(RMaskedR3.binaryAnd(RMaskedL3).getLower().uge(Zero_4), true);
+
+  // wrapped set
+  APInt NegSeven(4, 9); // Also -7
+  ConstantRange RMaskedL4(NegSeven, APInt(4, 1));
+  ConstantRange RMaskedR4(NegSeven, APInt(4, 0));
+  EXPECT_EQ(RMaskedL4.binaryAnd(RMaskedR4).contains(Zero_4), true);
+  EXPECT_EQ(RMaskedR4.binaryAnd(RMaskedL4).contains(Zero_4), true);
+  EXPECT_EQ(RMaskedL4.binaryAnd(RMaskedR4).contains(NegSeven), true);
+  EXPECT_EQ(RMaskedR4.binaryAnd(RMaskedL4).contains(NegSeven), true);
+
   TestBinaryOpExhaustive(
       [](const ConstantRange &CR1, const ConstantRange &CR2) {
         return CR1.binaryAnd(CR2);

From f1fa292cd61a70f41aa8dd8c33c4ac8d036dcfd0 Mon Sep 17 00:00:00 2001
From: Vikram Hegde <115221833+vikramRH@users.noreply.github.com>
Date: Wed, 1 Jan 2025 10:17:37 +0530
Subject: [PATCH 232/567] [AMDGPU] Pre-commit tests for "lshr + mad" fold
 (#119509)

---
 llvm/test/CodeGen/AMDGPU/mad_64_32.ll | 663 ++++++++++++++++++++++++++
 1 file changed, 663 insertions(+)

diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index 33007e5b285d8..3be17f9538d0f 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -1333,5 +1333,668 @@ define i48 @mad_i48_i48(i48 %arg0, i48 %arg1, i48 %arg2) #0 {
   ret i48 %a
 }
 
+define i64 @lshr_mad_i64_1(i64 %arg0, i64 %arg1) #0 {
+; CI-LABEL: lshr_mad_i64_1:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_movk_i32 s4, 0xfc19
+; CI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
+; CI-NEXT:    v_sub_i32_e32 v1, vcc, v3, v1
+; CI-NEXT:    v_mov_b32_e32 v0, v2
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_1:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_movk_i32 s4, 0xfc19
+; SI-NEXT:    v_mul_hi_u32 v2, v1, s4
+; SI-NEXT:    v_mul_lo_u32 v3, v1, s4
+; SI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v1
+; SI-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
+; SI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_movk_i32 s4, 0xfc19
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
+; GFX9-NEXT:    v_sub_u32_e32 v1, v3, v1
+; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: lshr_mad_i64_1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, 0xfffffc19, v1, v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, v3, v1
+; GFX11-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mad_co_u64_u32 v[2:3], null, 0xfffffc19, v1, v[0:1]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_sub_nc_u32_e32 v1, v3, v1
+; GFX12-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %lsh = lshr i64 %arg0, 32
+  %mul = mul i64 %lsh, s0xfffffffffffffc19
+  %mad = add i64 %mul, %arg0
+
+  ret i64 %mad
+}
+
+define i64 @lshr_mad_i64_2(i64 %arg0) #0 {
+; CI-LABEL: lshr_mad_i64_2:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_movk_i32 s4, 0xd1
+; CI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
+; CI-NEXT:    v_sub_i32_e32 v1, vcc, v3, v1
+; CI-NEXT:    v_mov_b32_e32 v0, v2
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_2:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_movk_i32 s4, 0xd1
+; SI-NEXT:    v_mul_hi_u32 v2, v1, s4
+; SI-NEXT:    v_mul_lo_u32 v3, v1, s4
+; SI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v1
+; SI-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
+; SI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_movk_i32 s4, 0xd1
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
+; GFX9-NEXT:    v_sub_u32_e32 v1, v3, v1
+; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: lshr_mad_i64_2:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, 0xd1, v1, v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, v3, v1
+; GFX11-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mad_co_u64_u32 v[2:3], null, 0xd1, v1, v[0:1]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_sub_nc_u32_e32 v1, v3, v1
+; GFX12-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %lsh = lshr i64 %arg0, 32
+  %mul = mul i64 %lsh, s0xffffffff000000d1
+  %mad = add i64 %mul, %arg0
+
+  ret i64 %mad
+}
+
+define i64 @lshr_mad_i64_3(i64 %arg0) #0 {
+; CI-LABEL: lshr_mad_i64_3:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_movk_i32 s4, 0xfc88
+; CI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
+; CI-NEXT:    v_sub_i32_e32 v1, vcc, v3, v1
+; CI-NEXT:    v_mov_b32_e32 v0, v2
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_3:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_movk_i32 s4, 0xfc88
+; SI-NEXT:    v_mul_hi_u32 v2, v1, s4
+; SI-NEXT:    v_mul_lo_u32 v3, v1, s4
+; SI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v1
+; SI-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
+; SI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_movk_i32 s4, 0xfc88
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
+; GFX9-NEXT:    v_sub_u32_e32 v1, v3, v1
+; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: lshr_mad_i64_3:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, 0xfffffc88, v1, v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, v3, v1
+; GFX11-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_3:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mad_co_u64_u32 v[2:3], null, 0xfffffc88, v1, v[0:1]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_sub_nc_u32_e32 v1, v3, v1
+; GFX12-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %lsh = lshr i64 %arg0, 32
+  %mul = mul i64 s0xfffffffffffffc88, %lsh
+  %mad = add i64 %mul, %arg0
+
+  ret i64 %mad
+}
+
+define i64 @lshr_mad_i64_4(i32 %arg0, i64 %arg1) #0 {
+; CI-LABEL: lshr_mad_i64_4:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_mul_lo_u32 v3, v2, v0
+; CI-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v1, v0, 0
+; CI-NEXT:    s_movk_i32 s4, 0xfc88
+; CI-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v2, s4, v[1:2]
+; CI-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_4:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_mul_lo_u32 v2, v2, v0
+; SI-NEXT:    v_mul_hi_u32 v3, v1, v0
+; SI-NEXT:    s_movk_i32 s4, 0xfc88
+; SI-NEXT:    v_mul_lo_u32 v0, v1, v0
+; SI-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; SI-NEXT:    v_mul_hi_u32 v3, v2, s4
+; SI-NEXT:    v_mul_lo_u32 v1, v2, s4
+; SI-NEXT:    v_sub_i32_e32 v3, vcc, v3, v2
+; SI-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; SI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v2, vcc
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v5
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v2, v0, v[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v5, v2
+; GFX9-NEXT:    s_movk_i32 s4, 0xfc88
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v2, s4, v[4:5]
+; GFX9-NEXT:    v_sub_u32_e32 v1, v1, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: lshr_mad_i64_4:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mad_u64_u32 v[3:4], null, v1, v0, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v4
+; GFX11-NEXT:    v_mad_u64_u32 v[5:6], null, v2, v0, v[1:2]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v4, v5
+; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, 0xfffffc88, v5, v[3:4]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, v1, v5
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_4:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mad_co_u64_u32 v[3:4], null, v1, v0, 0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_mov_b32_e32 v1, v4
+; GFX12-NEXT:    v_mad_co_u64_u32 v[5:6], null, v2, v0, v[1:2]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_mov_b32_e32 v4, v5
+; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, 0xfffffc88, v5, v[3:4]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_sub_nc_u32_e32 v1, v1, v5
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %ext = zext i32 %arg0 to i64
+  %mul1 = mul i64 %arg1, %ext
+  %lsh = lshr i64 %mul1, 32
+  %mul2 = mul i64 %lsh, s0xfffffffffffffc88
+  %mad = add i64 %mul2, %mul1
+  ret i64 %mad
+}
+
+define i64 @lshr_mad_i64_negative_1(i64 %arg0) #0 {
+; CI-LABEL: lshr_mad_i64_negative_1:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_lshrrev_b32_e32 v2, 4, v1
+; CI-NEXT:    s_movk_i32 s4, 0xfc19
+; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v2, s4, v[0:1]
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_negative_1:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 4, v1
+; SI-NEXT:    s_movk_i32 s4, 0xfc19
+; SI-NEXT:    v_mul_lo_u32 v3, v2, s4
+; SI-NEXT:    v_mul_hi_i32 v2, v2, s4
+; SI-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
+; SI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_negative_1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 4, v1
+; GFX9-NEXT:    s_movk_i32 s4, 0xfc19
+; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v2, s4, v[0:1]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: lshr_mad_i64_negative_1:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_lshrrev_b32_e32 v4, 4, v1
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_mad_i64_i32 v[2:3], null, 0xfffffc19, v4, v[0:1]
+; GFX1100-NEXT:    v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1150-LABEL: lshr_mad_i64_negative_1:
+; GFX1150:       ; %bb.0:
+; GFX1150-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1150-NEXT:    v_lshrrev_b32_e32 v2, 4, v1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_mad_i64_i32 v[0:1], null, 0xfffffc19, v2, v[0:1]
+; GFX1150-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_negative_1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 4, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_mad_co_i64_i32 v[0:1], null, 0xfffffc19, v2, v[0:1]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %lsh = lshr i64 %arg0, 36
+  %mul = mul i64 %lsh, s0xfffffffffffffc19
+  %mad = add i64 %mul, %arg0
+
+  ret i64 %mad
+}
+
+define i64 @lshr_mad_i64_negative_2(i64 %arg0) #0 {
+; CI-LABEL: lshr_mad_i64_negative_2:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_movk_i32 s4, 0xd1
+; CI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
+; CI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
+; CI-NEXT:    v_sub_i32_e32 v1, vcc, v3, v0
+; CI-NEXT:    v_mov_b32_e32 v0, v2
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_negative_2:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_movk_i32 s4, 0xd1
+; SI-NEXT:    v_mul_hi_u32 v2, v1, s4
+; SI-NEXT:    v_mul_lo_u32 v4, v1, s4
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v1
+; SI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
+; SI-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
+; SI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_negative_2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_movk_i32 s4, 0xd1
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
+; GFX9-NEXT:    v_sub_u32_e32 v1, v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: lshr_mad_i64_negative_2:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, 0xd1, v1, v[0:1]
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, v3, v0
+; GFX11-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_negative_2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mad_co_u64_u32 v[2:3], null, 0xd1, v1, v[0:1]
+; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_sub_nc_u32_e32 v1, v3, v0
+; GFX12-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %lsh = lshr i64 %arg0, 32
+  %mul = mul i64 %lsh, s0xffffff00000000d1
+  %mad = add i64 %mul, %arg0
+
+  ret i64 %mad
+}
+
+define i64 @lshr_mad_i64_negative_3(i64 %arg0) #0 {
+; CI-LABEL: lshr_mad_i64_negative_3:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_lshr_b64 v[2:3], v[0:1], 22
+; CI-NEXT:    v_and_b32_e32 v2, 0xfffffc00, v2
+; CI-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
+; CI-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; CI-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
+; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_negative_3:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_lshr_b64 v[2:3], v[0:1], 22
+; SI-NEXT:    v_and_b32_e32 v2, 0xfffffc00, v2
+; SI-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
+; SI-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
+; SI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_negative_3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b64 v[2:3], 22, v[0:1]
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xfffffc00, v2
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: lshr_mad_i64_negative_3:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b64 v[2:3], 22, v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xfffffc00, v2
+; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_negative_3:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_lshrrev_b64 v[2:3], 22, v[0:1]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xfffffc00, v2
+; GFX12-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = add i64 %arg0, 1
+  %lsh = lshr i64 %arg0, 32
+  %mul = mul i64 %lsh, s0xfffffffffffffc00
+  %mad = add i64 %mul, %op
+
+  ret i64 %mad
+}
+
+define i64 @lshr_mad_i64_negative_4(i64 %arg0) #0 {
+; CI-LABEL: lshr_mad_i64_negative_4:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, v0, v[0:1]
+; CI-NEXT:    v_mul_lo_u32 v0, v1, v1
+; CI-NEXT:    v_add_i32_e32 v1, vcc, v0, v3
+; CI-NEXT:    v_mov_b32_e32 v0, v2
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_negative_4:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_mul_hi_u32 v2, v1, v0
+; SI-NEXT:    v_mul_lo_u32 v3, v1, v1
+; SI-NEXT:    v_mul_lo_u32 v4, v1, v0
+; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; SI-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
+; SI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_negative_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, v0, v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, v3
+; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, v1, v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: lshr_mad_i64_negative_4:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_mad_u64_u32 v[2:3], null, v1, v0, v[0:1]
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1100-NEXT:    v_mad_u64_u32 v[3:4], null, v1, v1, v[0:1]
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1150-LABEL: lshr_mad_i64_negative_4:
+; GFX1150:       ; %bb.0:
+; GFX1150-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1150-NEXT:    v_mad_u64_u32 v[3:4], null, v1, v0, v[0:1]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mov_b32_e32 v0, v4
+; GFX1150-NEXT:    v_mad_u64_u32 v[1:2], null, v1, v1, v[0:1]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1150-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1150-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_negative_4:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mad_co_u64_u32 v[3:4], null, v1, v0, v[0:1]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_mov_b32_e32 v0, v4
+; GFX12-NEXT:    v_mad_co_u64_u32 v[1:2], null, v1, v1, v[0:1]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT:    v_mov_b32_e32 v0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %lsh = lshr i64 %arg0, 32
+  %mul = mul i64 %lsh, %arg0
+  %mad = add i64 %mul, %arg0
+
+  ret i64 %mad
+}
+
+define amdgpu_ps i64 @lshr_mad_i64_sgpr(i64 inreg %arg0) #0 {
+; CI-LABEL: lshr_mad_i64_sgpr:
+; CI:       ; %bb.0:
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_mov_b32_e32 v2, 0xffff1c18
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], s1, v2, v[0:1]
+; CI-NEXT:    v_subrev_i32_e32 v1, vcc, s1, v1
+; CI-NEXT:    v_readfirstlane_b32 s0, v0
+; CI-NEXT:    v_readfirstlane_b32 s1, v1
+; CI-NEXT:    ; return to shader part epilog
+;
+; SI-LABEL: lshr_mad_i64_sgpr:
+; SI:       ; %bb.0:
+; SI-NEXT:    v_mov_b32_e32 v0, 0xffff1c18
+; SI-NEXT:    v_mul_hi_u32 v0, s1, v0
+; SI-NEXT:    s_mul_i32 s2, s1, 0xffff1c18
+; SI-NEXT:    v_readfirstlane_b32 s3, v0
+; SI-NEXT:    s_sub_i32 s3, s3, s1
+; SI-NEXT:    s_add_u32 s0, s2, s0
+; SI-NEXT:    s_addc_u32 s1, s3, s1
+; SI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: lshr_mad_i64_sgpr:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mul_hi_u32 s2, s1, 0xffff1c18
+; GFX9-NEXT:    s_sub_i32 s2, s2, s1
+; GFX9-NEXT:    s_mul_i32 s3, s1, 0xffff1c18
+; GFX9-NEXT:    s_add_u32 s0, s3, s0
+; GFX9-NEXT:    s_addc_u32 s1, s2, s1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: lshr_mad_i64_sgpr:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mul_hi_u32 s2, s1, 0xffff1c18
+; GFX11-NEXT:    s_mul_i32 s3, s1, 0xffff1c18
+; GFX11-NEXT:    s_sub_i32 s2, s2, s1
+; GFX11-NEXT:    s_add_u32 s0, s3, s0
+; GFX11-NEXT:    s_addc_u32 s1, s2, s1
+; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: lshr_mad_i64_sgpr:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_mov_b32 s4, 0xffff1c18
+; GFX12-NEXT:    s_mov_b32 s3, 0
+; GFX12-NEXT:    s_mov_b32 s2, s1
+; GFX12-NEXT:    s_mov_b32 s5, -1
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_mul_u64 s[2:3], s[2:3], s[4:5]
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[2:3], s[0:1]
+; GFX12-NEXT:    ; return to shader part epilog
+  %lsh = lshr i64 %arg0, 32
+  %mul = mul i64 %lsh, s0xffffffffffff1c18
+  %mad = add i64 %mul, %arg0
+
+  ret i64 %mad
+}
+
+define <2 x i64> @lshr_mad_i64_vec(<2 x i64> %arg0) #0 {
+; CI-LABEL: lshr_mad_i64_vec:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 s4, 0xffff1c18
+; CI-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, s4, v[0:1]
+; CI-NEXT:    s_mov_b32 s4, 0xffff1118
+; CI-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v3, s4, v[2:3]
+; CI-NEXT:    v_sub_i32_e32 v1, vcc, v5, v1
+; CI-NEXT:    v_sub_i32_e32 v3, vcc, v7, v3
+; CI-NEXT:    v_mov_b32_e32 v0, v4
+; CI-NEXT:    v_mov_b32_e32 v2, v6
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_vec:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s4, 0xffff1118
+; SI-NEXT:    v_mul_lo_u32 v4, v3, s4
+; SI-NEXT:    v_mul_hi_u32 v5, v3, s4
+; SI-NEXT:    s_mov_b32 s4, 0xffff1c18
+; SI-NEXT:    v_mul_hi_u32 v6, v1, s4
+; SI-NEXT:    v_mul_lo_u32 v7, v1, s4
+; SI-NEXT:    v_sub_i32_e32 v5, vcc, v5, v3
+; SI-NEXT:    v_sub_i32_e32 v6, vcc, v6, v1
+; SI-NEXT:    v_add_i32_e32 v0, vcc, v7, v0
+; SI-NEXT:    v_addc_u32_e32 v1, vcc, v6, v1, vcc
+; SI-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; SI-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_vec:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff1c18
+; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, s4, v[0:1]
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff1118
+; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v3, s4, v[2:3]
+; GFX9-NEXT:    v_sub_u32_e32 v1, v5, v1
+; GFX9-NEXT:    v_sub_u32_e32 v3, v7, v3
+; GFX9-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-NEXT:    v_mov_b32_e32 v2, v6
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: lshr_mad_i64_vec:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mad_u64_u32 v[4:5], null, 0xffff1c18, v1, v[0:1]
+; GFX11-NEXT:    v_mad_u64_u32 v[6:7], null, 0xffff1118, v3, v[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, v5, v1
+; GFX11-NEXT:    v_mov_b32_e32 v0, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v3, v7, v3
+; GFX11-NEXT:    v_mov_b32_e32 v2, v6
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_vec:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mad_co_u64_u32 v[4:5], null, 0xffff1c18, v1, v[0:1]
+; GFX12-NEXT:    v_mad_co_u64_u32 v[6:7], null, 0xffff1118, v3, v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_sub_nc_u32_e32 v1, v5, v1
+; GFX12-NEXT:    v_mov_b32_e32 v0, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT:    v_sub_nc_u32_e32 v3, v7, v3
+; GFX12-NEXT:    v_mov_b32_e32 v2, v6
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %lsh = lshr <2 x i64> %arg0, 
+  %mul = mul <2 x i64> %lsh, 
+  %mad = add <2 x i64> %mul, %arg0
+
+  ret <2 x i64> %mad
+}
+
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone speculatable }

From 283806695a6deeacd5d3eaf88becec2b627ab56f Mon Sep 17 00:00:00 2001
From: Vikash Gupta 
Date: Wed, 1 Jan 2025 11:14:53 +0530
Subject: [PATCH 233/567] [GlobalIsel] Add combine for select with constants
 (#121088)

The SelectionDAG ISel supports both versions of the combines mentioned
below:
```
select Cond, Pow2, 0 --> (zext Cond)  << log2(Pow2)
select Cond, 0, Pow2 --> (zext !Cond) << log2(Pow2)
```
GlobalISel currently supports only the first one, defined in its
generic CombinerHelper.cpp. This patch adds the missing second one.
---
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 17 +++++++++++
 .../AArch64/GlobalISel/combine-select.mir     | 30 +++++++++++++++++++
 2 files changed, 47 insertions(+)

diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index c20e9d0c6876e..c061c01d3c1b1 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -6864,6 +6864,23 @@ bool CombinerHelper::tryFoldSelectOfConstants(GSelect *Select,
     };
     return true;
   }
+
+  // select Cond, 0, Pow2 --> (zext (!Cond)) << log2(Pow2)
+  if (FalseValue.isPowerOf2() && TrueValue.isZero()) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.setInstrAndDebugLoc(*Select);
+      Register Not = MRI.createGenericVirtualRegister(CondTy);
+      B.buildNot(Not, Cond);
+      Register Inner = MRI.createGenericVirtualRegister(TrueTy);
+      B.buildZExtOrTrunc(Inner, Not);
+      // The shift amount must be scalar.
+      LLT ShiftTy = TrueTy.isVector() ? TrueTy.getElementType() : TrueTy;
+      auto ShAmtC = B.buildConstant(ShiftTy, FalseValue.exactLogBase2());
+      B.buildShl(Dest, Inner, ShAmtC, Flags);
+    };
+    return true;
+  }
+
   // select Cond, -1, C --> or (sext Cond), C
   if (TrueValue.isAllOnes()) {
     MatchInfo = [=](MachineIRBuilder &B) {
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir
index 86fa12aa064ac..4afa0d4378fe1 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir
@@ -436,6 +436,36 @@ body:             |
     $w0 = COPY %ext(s32)
 ...
 ---
+# select cond, 0, 64 --> (zext (!Cond)) << log2(Pow2)
+name:            select_cond_0_64_to_shift
+body:             |
+  bb.1:
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: select_cond_0_64_to_shift
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+    ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR %c, [[C]]
+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s8) = G_ZEXT [[XOR]](s1)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s8) = G_CONSTANT i8 6
+    ; CHECK-NEXT: %sel:_(s8) = G_SHL [[ZEXT]], [[C1]](s8)
+    ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s8)
+    ; CHECK-NEXT: $w0 = COPY %ext(s32)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = COPY $x2
+    %c:_(s1) = G_TRUNC %0
+    %t:_(s1) = G_TRUNC %1
+    %f:_(s1) = G_TRUNC %2
+    %two:_(s8) = G_CONSTANT i8 0
+    %one:_(s8) = G_CONSTANT i8 64
+    %sel:_(s8) = G_SELECT %c, %two, %one
+    %ext:_(s32) = G_ANYEXT %sel
+    $w0 = COPY %ext(s32)
+...
+---
 # select cond, -1, 0 --> sext Cond
 name:            select_cond_minus_1_0_to_sext_cond
 body:             |

From eafbab6facb0627e11757efb1eae98f806387b55 Mon Sep 17 00:00:00 2001
From: Yingwei Zheng 
Date: Wed, 1 Jan 2025 15:02:08 +0800
Subject: [PATCH 234/567] [EntryExitInstrumenter][AArch64][RISCV][LoongArch]
 Pass `__builtin_return_address(0)` into `_mcount` (#121107)

On RISC-V, AArch64, and LoongArch, the `_mcount` function takes
`__builtin_return_address(0)` as an argument since
`__builtin_return_address(1)` is not available on these platforms. This
patch fixes the argument passing to match the behavior of glibc/gcc.

Closes https://github.com/llvm/llvm-project/issues/121103.
---
 .../Utils/EntryExitInstrumenter.cpp           | 15 +++++++++++
 .../mcount-with-frompc.ll                     | 25 +++++++++++++++++++
 2 files changed, 40 insertions(+)
 create mode 100644 llvm/test/Transforms/EntryExitInstrumenter/mcount-with-frompc.ll

diff --git a/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp b/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp
index 47bb31905d1ac..5b33edd51cffa 100644
--- a/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp
+++ b/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp
@@ -48,6 +48,21 @@ static void insertCall(Function &CurFn, StringRef Func,
                                                   /*isVarArg=*/false)),
           {GV}, "", InsertionPt);
       Call->setDebugLoc(DL);
+    } else if (TargetTriple.isRISCV() || TargetTriple.isAArch64() ||
+               TargetTriple.isLoongArch()) {
+      // On RISC-V, AArch64, and LoongArch, the `_mcount` function takes
+      // `__builtin_return_address(0)` as an argument since
+      // `__builtin_return_address(1)` is not available on these platforms.
+      Instruction *RetAddr = CallInst::Create(
+          Intrinsic::getOrInsertDeclaration(&M, Intrinsic::returnaddress),
+          ConstantInt::get(Type::getInt32Ty(C), 0), "", InsertionPt);
+      RetAddr->setDebugLoc(DL);
+
+      FunctionCallee Fn = M.getOrInsertFunction(
+          Func, FunctionType::get(Type::getVoidTy(C), PointerType::getUnqual(C),
+                                  false));
+      CallInst *Call = CallInst::Create(Fn, RetAddr, "", InsertionPt);
+      Call->setDebugLoc(DL);
     } else {
       FunctionCallee Fn = M.getOrInsertFunction(Func, Type::getVoidTy(C));
       CallInst *Call = CallInst::Create(Fn, "", InsertionPt);
diff --git a/llvm/test/Transforms/EntryExitInstrumenter/mcount-with-frompc.ll b/llvm/test/Transforms/EntryExitInstrumenter/mcount-with-frompc.ll
new file mode 100644
index 0000000000000..0f8cf5c735453
--- /dev/null
+++ b/llvm/test/Transforms/EntryExitInstrumenter/mcount-with-frompc.ll
@@ -0,0 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=riscv64 -passes="ee-instrument" -S < %s | FileCheck %s --check-prefixes=CHECK,RISCV64
+; RUN: opt -mtriple=riscv32 -passes="ee-instrument" -S < %s | FileCheck %s --check-prefixes=CHECK,RISCV32
+; RUN: opt -mtriple=loongarch64 -passes="ee-instrument" -S < %s | FileCheck %s --check-prefixes=CHECK,LOONGARCH64
+; RUN: opt -mtriple=loongarch32 -passes="ee-instrument" -S < %s | FileCheck %s --check-prefixes=CHECK,LOONGARCH32
+; RUN: opt -mtriple=aarch64 -passes="ee-instrument" -S < %s | FileCheck %s --check-prefixes=CHECK,AARCH64
+; RUN: opt -mtriple=aarch64_be -passes="ee-instrument" -S < %s | FileCheck %s --check-prefixes=CHECK,AARCH64_BE
+; RUN: opt -mtriple=aarch64_32 -passes="ee-instrument" -S < %s | FileCheck %s --check-prefixes=CHECK,AARCH64_32
+
+define void @f1() "instrument-function-entry-inlined"="_mcount" {
+; CHECK-LABEL: define void @f1() {
+; CHECK-NEXT:    [[TMP1:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT:    call void @_mcount(ptr [[TMP1]])
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AARCH64: {{.*}}
+; AARCH64_32: {{.*}}
+; AARCH64_BE: {{.*}}
+; LOONGARCH32: {{.*}}
+; LOONGARCH64: {{.*}}
+; RISCV32: {{.*}}
+; RISCV64: {{.*}}

From a29bd8cbab7390e5a9e3182e6e87908f4e7daced Mon Sep 17 00:00:00 2001
From: Akshat Oke 
Date: Wed, 1 Jan 2025 12:36:10 +0530
Subject: [PATCH 235/567] [CodeGen][NewPM] Record parameterized machine pass
 names to PIC (#120554)

Required for the `{start|stop}-{after|before}` CLI options
---
 llvm/include/llvm/Passes/MachinePassRegistry.def | 2 +-
 llvm/lib/Passes/PassBuilder.cpp                  | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def
index 5a4e79d7225db..29763995e8b51 100644
--- a/llvm/include/llvm/Passes/MachinePassRegistry.def
+++ b/llvm/include/llvm/Passes/MachinePassRegistry.def
@@ -179,7 +179,7 @@ MACHINE_FUNCTION_PASS("verify", MachineTraceMetricsVerifi
                                           PARAMS)
 #endif
 MACHINE_FUNCTION_PASS_WITH_PARAMS(
-    "regallocfast", "RegAllocFast",
+    "regallocfast", "RegAllocFastPass",
     [](RegAllocFastPassOptions Opts) { return RegAllocFastPass(Opts); },
     [PB = this](StringRef Params) {
       return parseRegAllocFastPassOptions(*PB, Params);
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index a936f5381137c..30b8d7c949948 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -492,6 +492,9 @@ PassBuilder::PassBuilder(TargetMachine *TM, PipelineTuningOptions PTO,
   PIC->addClassToPassName(decltype(CREATE_PASS)::name(), NAME);
 #define MACHINE_FUNCTION_PASS(NAME, CREATE_PASS)                               \
   PIC->addClassToPassName(decltype(CREATE_PASS)::name(), NAME);
+#define MACHINE_FUNCTION_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER,    \
+                                          PARAMS)                              \
+  PIC->addClassToPassName(CLASS, NAME);
 #include "llvm/Passes/MachinePassRegistry.def"
     });
   }

From 1623c435948ae305220e638066e968cb3296e567 Mon Sep 17 00:00:00 2001
From: TilakChad <49703944+TilakChad@users.noreply.github.com>
Date: Wed, 1 Jan 2025 13:35:05 +0545
Subject: [PATCH 236/567] [Clang] Resolved type of expression indexing into
 pack of values of a non-dependent type (#121405)

---
 clang/docs/ReleaseNotes.rst                |  1 +
 clang/lib/AST/ExprCXX.cpp                  |  2 +-
 clang/test/SemaCXX/cxx2c-pack-indexing.cpp | 16 ++++++++++++++++
 3 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index b7da12bcf6581..2a688a677294f 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -886,6 +886,7 @@ Bug Fixes to C++ Support
   out of a module (which is the case e.g. in MSVC's implementation of ``std`` module). (#GH118218)
 - Fixed a pack expansion issue in checking unexpanded parameter sizes. (#GH17042)
 - Fixed a bug where captured structured bindings were modifiable inside non-mutable lambda (#GH95081)
+- Fixed an issue while resolving type of expression indexing into a pack of values of non-dependent type (#GH121242)
 
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/lib/AST/ExprCXX.cpp b/clang/lib/AST/ExprCXX.cpp
index fc09d24fc30cb..5bf5d6adf525a 100644
--- a/clang/lib/AST/ExprCXX.cpp
+++ b/clang/lib/AST/ExprCXX.cpp
@@ -1722,7 +1722,7 @@ PackIndexingExpr *PackIndexingExpr::Create(
   if (Index && FullySubstituted && !SubstitutedExprs.empty())
     Type = SubstitutedExprs[*Index]->getType();
   else
-    Type = Context.DependentTy;
+    Type = PackIdExpr->getType();
 
   void *Storage =
       Context.Allocate(totalSizeToAlloc(SubstitutedExprs.size()));
diff --git a/clang/test/SemaCXX/cxx2c-pack-indexing.cpp b/clang/test/SemaCXX/cxx2c-pack-indexing.cpp
index cb679a6c3ad87..58b642d2735b6 100644
--- a/clang/test/SemaCXX/cxx2c-pack-indexing.cpp
+++ b/clang/test/SemaCXX/cxx2c-pack-indexing.cpp
@@ -305,3 +305,19 @@ template  struct mdispatch_ {
 mdispatch_ d;
 
 } // namespace GH116105
+
+namespace GH121242 {
+    // Non-dependent type pack access
+    template 
+    int y = x...[0];
+
+    struct X {};
+
+    template 
+    X z = x...[0];
+
+    void foo() {
+        (void)y<0>;
+        (void)z;
+    }
+} // namespace GH121242

From ca2ab74838c41a4146835b5bcc91ce4732273f7d Mon Sep 17 00:00:00 2001
From: xtex 
Date: Wed, 1 Jan 2025 08:29:04 +0000
Subject: [PATCH 237/567] [clang] Canonicalize absolute paths in dependency
 file (#117458)

This fixes #117438.

If paths in the dependency file are not absolute, make (or ninja) will
canonicalize them.
While their canonicalization does not involve symbolic link expansion
(for IO performance reasons), leaving a non-absolute path in the
dependency file may lead to unexpected canonicalization.
For example, given '/a/../b' where '/a' is a symlink to '/c/d', the path
should resolve to '/c/b', but make (and ninja) canonicalize it as '/b'
and fail with file not found.
---
 clang/include/clang/Frontend/Utils.h          |  1 +
 clang/lib/Frontend/DependencyFile.cpp         | 22 +++++++++++++++----
 clang/test/Frontend/dependency-gen-symlink.c  |  2 +-
 .../dependency-gen-windows-duplicates.c       |  2 +-
 clang/test/VFS/external-names.c               |  2 +-
 5 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/clang/include/clang/Frontend/Utils.h b/clang/include/clang/Frontend/Utils.h
index 604e42067a3f1..8ed17179c9824 100644
--- a/clang/include/clang/Frontend/Utils.h
+++ b/clang/include/clang/Frontend/Utils.h
@@ -120,6 +120,7 @@ class DependencyFileGenerator : public DependencyCollector {
 private:
   void outputDependencyFile(DiagnosticsEngine &Diags);
 
+  llvm::IntrusiveRefCntPtr FS;
   std::string OutputFile;
   std::vector Targets;
   bool IncludeSystemHeaders;
diff --git a/clang/lib/Frontend/DependencyFile.cpp b/clang/lib/Frontend/DependencyFile.cpp
index 528eae2c5283e..8a36d835d82b3 100644
--- a/clang/lib/Frontend/DependencyFile.cpp
+++ b/clang/lib/Frontend/DependencyFile.cpp
@@ -10,11 +10,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Frontend/Utils.h"
 #include "clang/Basic/FileManager.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Frontend/DependencyOutputOptions.h"
 #include "clang/Frontend/FrontendDiagnostic.h"
+#include "clang/Frontend/Utils.h"
 #include "clang/Lex/DirectoryLookup.h"
 #include "clang/Lex/ModuleMap.h"
 #include "clang/Lex/PPCallbacks.h"
@@ -23,8 +23,10 @@
 #include "llvm/ADT/StringSet.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
+#include "llvm/Support/VirtualFileSystem.h"
 #include "llvm/Support/raw_ostream.h"
 #include 
+#include 
 
 using namespace clang;
 
@@ -236,6 +238,7 @@ void DependencyFileGenerator::attachToPreprocessor(Preprocessor &PP) {
     PP.SetSuppressIncludeNotFoundError(true);
 
   DependencyCollector::attachToPreprocessor(PP);
+  FS = PP.getFileManager().getVirtualFileSystemPtr();
 }
 
 bool DependencyFileGenerator::sawDependency(StringRef Filename, bool FromModule,
@@ -312,11 +315,22 @@ void DependencyFileGenerator::finishedMainFile(DiagnosticsEngine &Diags) {
 /// https://msdn.microsoft.com/en-us/library/dd9y37ha.aspx for NMake info,
 /// https://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx
 /// for Windows file-naming info.
-static void PrintFilename(raw_ostream &OS, StringRef Filename,
+static void printFilename(raw_ostream &OS, llvm::vfs::FileSystem *FS,
+                          StringRef Filename,
                           DependencyOutputFormat OutputFormat) {
   // Convert filename to platform native path
   llvm::SmallString<256> NativePath;
   llvm::sys::path::native(Filename.str(), NativePath);
+  // Resolve absolute path. Make and Ninja canonicalize paths
+  // without checking for symbolic links in the path, for performance concerns.
+  // If there is something like `/bin/../lib64` -> `/usr/lib64`
+  // (where `/bin` links to `/usr/bin`), Make will see them as `/lib64`.
+  if (FS != nullptr && llvm::sys::path::is_absolute(NativePath)) {
+    llvm::SmallString<256> NativePathTmp = NativePath;
+    std::error_code EC = FS->getRealPath(NativePathTmp, NativePath);
+    if (EC)
+      NativePath = NativePathTmp;
+  }
 
   if (OutputFormat == DependencyOutputFormat::NMake) {
     // Add quotes if needed. These are the characters listed as "special" to
@@ -400,7 +414,7 @@ void DependencyFileGenerator::outputDependencyFile(llvm::raw_ostream &OS) {
       Columns = 2;
     }
     OS << ' ';
-    PrintFilename(OS, File, OutputFormat);
+    printFilename(OS, FS.get(), File, OutputFormat);
     Columns += N + 1;
   }
   OS << '\n';
@@ -411,7 +425,7 @@ void DependencyFileGenerator::outputDependencyFile(llvm::raw_ostream &OS) {
     for (auto I = Files.begin(), E = Files.end(); I != E; ++I) {
       if (Index++ == InputFileIndex)
         continue;
-      PrintFilename(OS, *I, OutputFormat);
+      printFilename(OS, FS.get(), *I, OutputFormat);
       OS << ":\n";
     }
   }
diff --git a/clang/test/Frontend/dependency-gen-symlink.c b/clang/test/Frontend/dependency-gen-symlink.c
index 2fa339ad2abf2..15664a46b90c8 100644
--- a/clang/test/Frontend/dependency-gen-symlink.c
+++ b/clang/test/Frontend/dependency-gen-symlink.c
@@ -15,7 +15,7 @@
 // CHECK: dependency-gen-symlink.c.o
 // CHECK: dependency-gen-symlink.c
 // CHECK: a/header.h
-// CHECK: b/header.h
+// CHECK-NOT: b/header.h
 // CHECK-NOT: with-header-guard.h
 #include "a/header.h"
 #include "b/header.h"
diff --git a/clang/test/Frontend/dependency-gen-windows-duplicates.c b/clang/test/Frontend/dependency-gen-windows-duplicates.c
index abd351377dc33..0ecc23226fb9c 100644
--- a/clang/test/Frontend/dependency-gen-windows-duplicates.c
+++ b/clang/test/Frontend/dependency-gen-windows-duplicates.c
@@ -9,7 +9,7 @@
 // RUN: %clang -MD -MF - %t.dir/test.c -fsyntax-only -I %t.dir/subdir | FileCheck %s
 // CHECK: test.o:
 // CHECK-NEXT: \test.c
-// CHECK-NEXT: \SubDir\X.h
+// CHECK-NEXT: \subdir\x.h
 // File x.h must appear only once (case insensitive check).
 // CHECK-NOT: {{\\|/}}{{x|X}}.{{h|H}}
 
diff --git a/clang/test/VFS/external-names.c b/clang/test/VFS/external-names.c
index 5b7c443b36e56..dd0b5eb501840 100644
--- a/clang/test/VFS/external-names.c
+++ b/clang/test/VFS/external-names.c
@@ -47,4 +47,4 @@
 
 // RUN: %clang_cc1 -D REINCLUDE -I %t -ivfsoverlay %t.yaml -Eonly %s -MTfoo -dependency-file %t.dep
 // RUN: cat %t.dep | FileCheck --check-prefix=CHECK-DEP %s
-// CHECK-DEP-NOT: Inputs
+// CHECK-DEP: Inputs{{..?}}external-names.h

From 50054ba2f446c29b46f0fe55e7b8c48b3818a23f Mon Sep 17 00:00:00 2001
From: Akshat Oke 
Date: Wed, 1 Jan 2025 14:54:08 +0530
Subject: [PATCH 238/567] [CodeGen] LiveRegMatrix: Use allocator through a
 unique_ptr (#120556)

`LIU::Matrix` holds on to a pointer to the allocator in LiveRegMatrix and is left hanging when the allocator moves with the LiveRegMatrix.

This extends the lifetime of the allocator so that it does not get destroyed when moving a LiveRegMatrix object.
---
 llvm/include/llvm/CodeGen/LiveRegMatrix.h | 11 ++++-------
 llvm/lib/CodeGen/LiveRegMatrix.cpp        |  2 +-
 2 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/LiveRegMatrix.h b/llvm/include/llvm/CodeGen/LiveRegMatrix.h
index 486392ca3c49d..373f4402dd8d6 100644
--- a/llvm/include/llvm/CodeGen/LiveRegMatrix.h
+++ b/llvm/include/llvm/CodeGen/LiveRegMatrix.h
@@ -48,7 +48,7 @@ class LiveRegMatrix {
   unsigned UserTag = 0;
 
   // The matrix is represented as a LiveIntervalUnion per register unit.
-  LiveIntervalUnion::Allocator LIUAlloc;
+  std::unique_ptr LIUAlloc;
   LiveIntervalUnion::Array Matrix;
 
   // Cached queries per register unit.
@@ -59,15 +59,12 @@ class LiveRegMatrix {
   unsigned RegMaskVirtReg = 0;
   BitVector RegMaskUsable;
 
-  LiveRegMatrix() = default;
+  LiveRegMatrix()
+      : LIUAlloc(std::make_unique()) {};
   void releaseMemory();
 
 public:
-  LiveRegMatrix(LiveRegMatrix &&Other)
-      : TRI(Other.TRI), LIS(Other.LIS), VRM(Other.VRM), UserTag(Other.UserTag),
-        Matrix(std::move(Other.Matrix)), Queries(std::move(Other.Queries)),
-        RegMaskTag(Other.RegMaskTag), RegMaskVirtReg(Other.RegMaskVirtReg),
-        RegMaskUsable(std::move(Other.RegMaskUsable)) {}
+  LiveRegMatrix(LiveRegMatrix &&Other) = default;
 
   void init(MachineFunction &MF, LiveIntervals &LIS, VirtRegMap &VRM);
 
diff --git a/llvm/lib/CodeGen/LiveRegMatrix.cpp b/llvm/lib/CodeGen/LiveRegMatrix.cpp
index 9744c47d5a851..3367171a15662 100644
--- a/llvm/lib/CodeGen/LiveRegMatrix.cpp
+++ b/llvm/lib/CodeGen/LiveRegMatrix.cpp
@@ -66,7 +66,7 @@ void LiveRegMatrix::init(MachineFunction &MF, LiveIntervals &pLIS,
   unsigned NumRegUnits = TRI->getNumRegUnits();
   if (NumRegUnits != Matrix.size())
     Queries.reset(new LiveIntervalUnion::Query[NumRegUnits]);
-  Matrix.init(LIUAlloc, NumRegUnits);
+  Matrix.init(*LIUAlloc, NumRegUnits);
 
   // Make sure no stale queries get reused.
   invalidateVirtRegs();

From ed572f2003275da8e06a634b4d6658b7921e8334 Mon Sep 17 00:00:00 2001
From: Vitaly Buka 
Date: Wed, 1 Jan 2025 02:30:35 -0800
Subject: [PATCH 239/567] Reapply "[libc++] Explicitly convert to masks in SIMD
 code (#107983)" (#121352)

This reverts commit 0ea40bf02138c02e7680ce6fa8169502f2a8bd42.

Passes with https://github.com/llvm/llvm-project/issues/121365 fix:
https://lab.llvm.org/buildbot/#/builders/55/builds/4930
---
 libcxx/include/__algorithm/mismatch.h   |  8 +--
 libcxx/include/__algorithm/simd_utils.h | 77 ++++++++++++++++---------
 2 files changed, 54 insertions(+), 31 deletions(-)

diff --git a/libcxx/include/__algorithm/mismatch.h b/libcxx/include/__algorithm/mismatch.h
index a6836792c0581..f5855379f6878 100644
--- a/libcxx/include/__algorithm/mismatch.h
+++ b/libcxx/include/__algorithm/mismatch.h
@@ -78,7 +78,7 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) {
       }
 
       for (size_t __i = 0; __i != __unroll_count; ++__i) {
-        if (auto __cmp_res = __lhs[__i] == __rhs[__i]; !std::__all_of(__cmp_res)) {
+        if (auto __cmp_res = std::__as_mask(__lhs[__i] == __rhs[__i]); !std::__all_of(__cmp_res)) {
           auto __offset = __i * __vec_size + std::__find_first_not_set(__cmp_res);
           return {__first1 + __offset, __first2 + __offset};
         }
@@ -90,7 +90,7 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) {
 
     // check the remaining 0-3 vectors
     while (static_cast(__last1 - __first1) >= __vec_size) {
-      if (auto __cmp_res = std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2);
+      if (auto __cmp_res = std::__as_mask(std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2));
           !std::__all_of(__cmp_res)) {
         auto __offset = std::__find_first_not_set(__cmp_res);
         return {__first1 + __offset, __first2 + __offset};
@@ -107,8 +107,8 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) {
     if (static_cast(__first1 - __orig_first1) >= __vec_size) {
       __first1 = __last1 - __vec_size;
       __first2 = __last2 - __vec_size;
-      auto __offset =
-          std::__find_first_not_set(std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2));
+      auto __offset = std::__find_first_not_set(
+          std::__as_mask(std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2)));
       return {__first1 + __offset, __first2 + __offset};
     } // else loop over the elements individually
   }
diff --git a/libcxx/include/__algorithm/simd_utils.h b/libcxx/include/__algorithm/simd_utils.h
index 4e3e4f2b9404e..3ca79247bbd03 100644
--- a/libcxx/include/__algorithm/simd_utils.h
+++ b/libcxx/include/__algorithm/simd_utils.h
@@ -116,42 +116,65 @@ template 
   }(make_index_sequence<__simd_vector_size_v<_VecT>>{});
 }
 
-template 
-[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI bool __all_of(__simd_vector<_Tp, _Np> __vec) noexcept {
-  return __builtin_reduce_and(__builtin_convertvector(__vec, __simd_vector));
+template 
+[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI bool __all_of(__simd_vector __vec) noexcept {
+  return __builtin_reduce_and(__vec);
 }
 
 template 
-[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI size_t __find_first_set(__simd_vector<_Tp, _Np> __vec) noexcept {
-  using __mask_vec = __simd_vector;
+[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI auto __as_mask(__simd_vector<_Tp, _Np> __vec) noexcept {
+  static_assert(!is_same<_Tp, bool>::value, "vector type should not be a bool!");
+  return __builtin_convertvector(__vec, __simd_vector);
+}
 
-  // This has MSan disabled du to https://github.com/llvm/llvm-project/issues/85876
-  auto __impl = [&](_MaskT) _LIBCPP_NO_SANITIZE("memory") noexcept {
-#  if defined(_LIBCPP_BIG_ENDIAN)
-    return std::min(
-        _Np, std::__countl_zero(__builtin_bit_cast(_MaskT, __builtin_convertvector(__vec, __mask_vec))));
-#  else
-    return std::min(
-        _Np, std::__countr_zero(__builtin_bit_cast(_MaskT, __builtin_convertvector(__vec, __mask_vec))));
-#  endif
-  };
-
-  if constexpr (sizeof(__mask_vec) == sizeof(uint8_t)) {
-    return __impl(uint8_t{});
-  } else if constexpr (sizeof(__mask_vec) == sizeof(uint16_t)) {
-    return __impl(uint16_t{});
-  } else if constexpr (sizeof(__mask_vec) == sizeof(uint32_t)) {
-    return __impl(uint32_t{});
-  } else if constexpr (sizeof(__mask_vec) == sizeof(uint64_t)) {
-    return __impl(uint64_t{});
+// This uses __builtin_convertvector around the __builtin_shufflevector to work around #107981.
+template 
+[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI __simd_vector
+__extend_vector(__simd_vector __vec) noexcept {
+  using _VecT = __simd_vector;
+  if constexpr (_Np == 4) {
+    return __builtin_convertvector(
+        __builtin_shufflevector(__vec, _VecT{}, 0, 1, 2, 3, 4, 5, 6, 7), __simd_vector);
+  } else if constexpr (_Np == 2) {
+    return std::__extend_vector(
+        __builtin_convertvector(__builtin_shufflevector(__vec, _VecT{}, 0, 1, 2, 3), __simd_vector));
+  } else if constexpr (_Np == 1) {
+    return std::__extend_vector(
+        __builtin_convertvector(__builtin_shufflevector(__vec, _VecT{}, 0, 1), __simd_vector));
   } else {
-    static_assert(sizeof(__mask_vec) == 0, "unexpected required size for mask integer type");
+    static_assert(sizeof(_VecT) == 0, "Unexpected vector size");
+  }
+}
+
+template 
+[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI auto __to_int_mask(__simd_vector __vec) {
+  if constexpr (_Np < 8) {
+    return std::__bit_cast(std::__extend_vector(__vec));
+  } else if constexpr (_Np == 8) {
+    return std::__bit_cast(__vec);
+  } else if constexpr (_Np == 16) {
+    return std::__bit_cast(__vec);
+  } else if constexpr (_Np == 32) {
+    return std::__bit_cast(__vec);
+  } else if constexpr (_Np == 64) {
+    return std::__bit_cast(__vec);
+  } else {
+    static_assert(sizeof(__simd_vector) == 0, "Unexpected vector size");
     return 0;
   }
 }
 
-template 
-[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI size_t __find_first_not_set(__simd_vector<_Tp, _Np> __vec) noexcept {
+template 
+[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI size_t __find_first_set(__simd_vector __vec) noexcept {
+#  if defined(_LIBCPP_BIG_ENDIAN)
+  return std::min(_Np, std::__countl_zero(std::__to_int_mask(__vec)));
+#  else
+  return std::min(_Np, std::__countr_zero(std::__to_int_mask(__vec)));
+#  endif
+}
+
+template 
+[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI size_t __find_first_not_set(__simd_vector __vec) noexcept {
   return std::__find_first_set(~__vec);
 }
 

From fac6be61c6ccbae15647f4f2485415f630341e2a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20G=C3=B3rny?= 
Date: Wed, 1 Jan 2025 15:31:49 +0000
Subject: [PATCH 240/567] [mlir] Prepend include directories before LLVM
 includes (#121223)

Prepend mlir's include directories before system LLVM include
directories. This is particularly important for standalone builds, where
system include directory may contain the previous version of mlir, and
therefore various mlir targets (particularly tablegen) end up using the
headers from the previous version over the fresh ones. The new logic is
copied from clang.
---
 mlir/CMakeLists.txt | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt
index 5ea49c0dbfa7e..7416e522083b7 100644
--- a/mlir/CMakeLists.txt
+++ b/mlir/CMakeLists.txt
@@ -196,8 +196,10 @@ endif()
 
 set(CMAKE_INCLUDE_CURRENT_DIR ON)
 
-include_directories( "include")
-include_directories( ${MLIR_INCLUDE_DIR})
+include_directories(BEFORE
+  "include"
+  ${MLIR_INCLUDE_DIR}
+  )
 
 # Adding tools/mlir-tblgen here as calling add_tablegen sets some variables like
 # MLIR_TABLEGEN_EXE in PARENT_SCOPE which gets lost if that folder is included

From 6d3d952f11d350f6ad2cd6199681af158f4bd397 Mon Sep 17 00:00:00 2001
From: Tomer Shafir <86107951+tomershafir@users.noreply.github.com>
Date: Wed, 1 Jan 2025 19:52:38 +0200
Subject: [PATCH 241/567] [XRay][account] add account test for nonempty exit
 mismatch (#93564)

- Add `llvm-xray account` regression test for an exit mismatch with a
non-empty stack (it's a different code path than the empty-stack case).
- Align empty stack test case name.
---
 ...ount-exit-mismatch-empty-stack-error.yaml} |  0
 ...t-exit-mismatch-non-empty-stack-error.yaml | 31 +++++++++++++++++++
 .../llvm-xray/X86/account-keep-going.yaml     |  4 +--
 3 files changed, 33 insertions(+), 2 deletions(-)
 rename llvm/test/tools/llvm-xray/X86/{account-empty-stack-error.yaml => account-exit-mismatch-empty-stack-error.yaml} (100%)
 create mode 100644 llvm/test/tools/llvm-xray/X86/account-exit-mismatch-non-empty-stack-error.yaml

diff --git a/llvm/test/tools/llvm-xray/X86/account-empty-stack-error.yaml b/llvm/test/tools/llvm-xray/X86/account-exit-mismatch-empty-stack-error.yaml
similarity index 100%
rename from llvm/test/tools/llvm-xray/X86/account-empty-stack-error.yaml
rename to llvm/test/tools/llvm-xray/X86/account-exit-mismatch-empty-stack-error.yaml
diff --git a/llvm/test/tools/llvm-xray/X86/account-exit-mismatch-non-empty-stack-error.yaml b/llvm/test/tools/llvm-xray/X86/account-exit-mismatch-non-empty-stack-error.yaml
new file mode 100644
index 0000000000000..72331107057fe
--- /dev/null
+++ b/llvm/test/tools/llvm-xray/X86/account-exit-mismatch-non-empty-stack-error.yaml
@@ -0,0 +1,31 @@
+#RUN: not llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -d 2>&1 | FileCheck %s
+#RUN: llvm-xray account %s -k -o - -m %S/Inputs/simple-instrmap.yaml -d 2>&1 | FileCheck %s --check-prefix=KEEPGOING
+
+---
+header:
+  version: 1
+  type: 0
+  constant-tsc: true
+  nonstop-tsc: true
+  cycle-frequency: 0
+records:
+# An exit record doesn't match an entry record on a non empty stack with sibling call deduction. 
+# This can happen for example when an instrumented function does a 'fork()', 
+# where the child process will not see
+# the entry record but see the exit record. This is completely valid data,
+# which should be handled with grace (i.e. we treat it as an error, but since
+# the llvm-xray account tool has an option to keep going, gives the user a
+# chance to retry).
+  - { type: 0, func-id: 1, cpu: 1, thread: 1, kind: function-enter, tsc: 10000 }
+  - { type: 0, func-id: 4, cpu: 1, thread: 1, kind: function-exit,  tsc: 10001 }
+  - { type: 0, func-id: 1, cpu: 1, thread: 1, kind: function-exit,  tsc: 10002 }
+...
+
+#CHECK:      Error processing record: {{.*}}
+#CHECK-NEXT: Thread ID: 1
+#CHECK-NEXT:   #1 @(1)
+#CHECK-NEXT: llvm-xray: Failed accounting function calls in file '{{.*}}'.
+
+#KEEPGOING:      Error processing record: {{.*}}
+#KEEPGOING-NEXT: Thread ID: 1
+#KEEPGOING-NEXT:   #1 @(1)
diff --git a/llvm/test/tools/llvm-xray/X86/account-keep-going.yaml b/llvm/test/tools/llvm-xray/X86/account-keep-going.yaml
index 76011ee8e6e5e..fb1a8f422bad7 100644
--- a/llvm/test/tools/llvm-xray/X86/account-keep-going.yaml
+++ b/llvm/test/tools/llvm-xray/X86/account-keep-going.yaml
@@ -7,8 +7,8 @@ header:
   nonstop-tsc: true
   cycle-frequency: 0
 records:
-# We want to test the case for when we see spurious exits, but keep going
-# anyway ignoring the records in the process.
+# We want to test the case for when we see spurious exits without sibling call deduction, 
+# but keep going anyway ignoring the records in the process.
   - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-enter, tsc: 10000 }
   - { type: 0, func-id: 2, cpu: 1, thread: 111, kind: function-enter, tsc: 10001 }
   - { type: 0, func-id: 3, cpu: 1, thread: 111, kind: function-enter, tsc: 10002 }

From 2a90efd854f5bcce65a7b0fe5a75860160dc103b Mon Sep 17 00:00:00 2001
From: Stephen Senran Zhang 
Date: Thu, 2 Jan 2025 01:53:31 +0800
Subject: [PATCH 242/567] [NFC][ConstraintElimination] Optimize code styles
 (#121422)

This patch does the following things:

- prefer early exits;
- add missing std::move;
- avoid duplicate map lookups;
- prefer emplace_back to avoid unnecessary copies.
---
 .../Scalar/ConstraintElimination.cpp          | 75 +++++++++----------
 1 file changed, 37 insertions(+), 38 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index ead07ed37f215..91a3c3f0d392a 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -216,7 +216,7 @@ struct StackEntry {
   StackEntry(unsigned NumIn, unsigned NumOut, bool IsSigned,
              SmallVector ValuesToRelease)
       : NumIn(NumIn), NumOut(NumOut), IsSigned(IsSigned),
-        ValuesToRelease(ValuesToRelease) {}
+        ValuesToRelease(std::move(ValuesToRelease)) {}
 };
 
 struct ConstraintTy {
@@ -726,8 +726,8 @@ ConstraintInfo::getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1,
   }
 
   for (const auto &KV : VariablesB) {
-    if (SubOverflow(R[GetOrAddIndex(KV.Variable)], KV.Coefficient,
-                    R[GetOrAddIndex(KV.Variable)]))
+    auto &Coeff = R[GetOrAddIndex(KV.Variable)];
+    if (SubOverflow(Coeff, KV.Coefficient, Coeff))
       return {};
     auto I =
         KnownNonNegativeVariables.insert({KV.Variable, KV.IsKnownNonNegative});
@@ -759,9 +759,9 @@ ConstraintInfo::getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1,
     if (!KV.second ||
         (!Value2Index.contains(KV.first) && !NewIndexMap.contains(KV.first)))
       continue;
-    SmallVector C(Value2Index.size() + NewVariables.size() + 1, 0);
+    auto &C = Res.ExtraInfo.emplace_back(
+        Value2Index.size() + NewVariables.size() + 1, 0);
     C[GetOrAddIndex(KV.first)] = -1;
-    Res.ExtraInfo.push_back(C);
   }
   return Res;
 }
@@ -1591,53 +1591,52 @@ void ConstraintInfo::addFact(CmpInst::Predicate Pred, Value *A, Value *B,
 
   LLVM_DEBUG(dbgs() << "Adding '"; dumpUnpackedICmp(dbgs(), Pred, A, B);
              dbgs() << "'\n");
-  bool Added = false;
   auto &CSToUse = getCS(R.IsSigned);
   if (R.Coefficients.empty())
     return;
 
-  Added |= CSToUse.addVariableRowFill(R.Coefficients);
+  bool Added = CSToUse.addVariableRowFill(R.Coefficients);
+  if (!Added)
+    return;
 
   // If R has been added to the system, add the new variables and queue it for
   // removal once it goes out-of-scope.
-  if (Added) {
-    SmallVector ValuesToRelease;
-    auto &Value2Index = getValue2Index(R.IsSigned);
-    for (Value *V : NewVariables) {
-      Value2Index.insert({V, Value2Index.size() + 1});
-      ValuesToRelease.push_back(V);
-    }
-
-    LLVM_DEBUG({
-      dbgs() << "  constraint: ";
-      dumpConstraint(R.Coefficients, getValue2Index(R.IsSigned));
-      dbgs() << "\n";
-    });
+  SmallVector ValuesToRelease;
+  auto &Value2Index = getValue2Index(R.IsSigned);
+  for (Value *V : NewVariables) {
+    Value2Index.insert({V, Value2Index.size() + 1});
+    ValuesToRelease.push_back(V);
+  }
 
-    DFSInStack.emplace_back(NumIn, NumOut, R.IsSigned,
-                            std::move(ValuesToRelease));
-
-    if (!R.IsSigned) {
-      for (Value *V : NewVariables) {
-        ConstraintTy VarPos(SmallVector(Value2Index.size() + 1, 0),
-                            false, false, false);
-        VarPos.Coefficients[Value2Index[V]] = -1;
-        CSToUse.addVariableRow(VarPos.Coefficients);
-        DFSInStack.emplace_back(NumIn, NumOut, R.IsSigned,
-                                SmallVector());
-      }
-    }
+  LLVM_DEBUG({
+    dbgs() << "  constraint: ";
+    dumpConstraint(R.Coefficients, getValue2Index(R.IsSigned));
+    dbgs() << "\n";
+  });
 
-    if (R.isEq()) {
-      // Also add the inverted constraint for equality constraints.
-      for (auto &Coeff : R.Coefficients)
-        Coeff *= -1;
-      CSToUse.addVariableRowFill(R.Coefficients);
+  DFSInStack.emplace_back(NumIn, NumOut, R.IsSigned,
+                          std::move(ValuesToRelease));
 
+  if (!R.IsSigned) {
+    for (Value *V : NewVariables) {
+      ConstraintTy VarPos(SmallVector(Value2Index.size() + 1, 0),
+                          false, false, false);
+      VarPos.Coefficients[Value2Index[V]] = -1;
+      CSToUse.addVariableRow(VarPos.Coefficients);
       DFSInStack.emplace_back(NumIn, NumOut, R.IsSigned,
                               SmallVector());
     }
   }
+
+  if (R.isEq()) {
+    // Also add the inverted constraint for equality constraints.
+    for (auto &Coeff : R.Coefficients)
+      Coeff *= -1;
+    CSToUse.addVariableRowFill(R.Coefficients);
+
+    DFSInStack.emplace_back(NumIn, NumOut, R.IsSigned,
+                            SmallVector());
+  }
 }
 
 static bool replaceSubOverflowUses(IntrinsicInst *II, Value *A, Value *B,

From 1135d36f869541f14934471e255c6e2631d6eebe Mon Sep 17 00:00:00 2001
From: Mark Danial <118996571+madanial0@users.noreply.github.com>
Date: Wed, 1 Jan 2025 13:34:28 -0500
Subject: [PATCH 243/567] [AIX] [lit] Fix shtest-format.py to account for
 behaviour on AIX (#121426)

The changes from https://github.com/llvm/llvm-project/pull/121376 have
broken the ppc64 aix bot:
https://lab.llvm.org/buildbot/#/builders/64/builds/1835. Adjusted the
testcase to account for `cat` behaviour on AIX prior to the changes
---
 llvm/utils/lit/tests/shtest-format.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/utils/lit/tests/shtest-format.py b/llvm/utils/lit/tests/shtest-format.py
index d58973993b272..fda3ef52e8043 100644
--- a/llvm/utils/lit/tests/shtest-format.py
+++ b/llvm/utils/lit/tests/shtest-format.py
@@ -18,7 +18,7 @@
 # CHECK: Command Output (stderr):
 # CHECK-NEXT: --
 # CHECK-NOT: --
-# CHECK: cat{{(_64)?(\.exe)?}}: {{.*does-not-exist.*}}: No such file or directory
+# CHECK: cat{{(_64)?(\.exe)?}}: {{(cannot open does-not-exist|.*does-not-exist.*: No such file or directory)}}
 # CHECK: --
 
 # CHECK: FAIL: shtest-format :: external_shell/fail_with_bad_encoding.txt

From 8435225374e1dc17fddf618eec3cf8c396ac669f Mon Sep 17 00:00:00 2001
From: Jacek Caban 
Date: Wed, 1 Jan 2025 19:42:49 +0100
Subject: [PATCH 244/567] [LLD][COFF] Move addFile implementation to
 LinkerDriver (NFC) (#121342)

The addFile implementation does not rely on the SymbolTable object. With
#119294, the symbol table for input files is determined during the
construction of the objects representing them. To clarify that
relationship, this change moves the implementation from the SymbolTable
class to the LinkerDriver class.
---
 lld/COFF/Driver.cpp      | 82 ++++++++++++++++++++++++++++++++++++----
 lld/COFF/Driver.h        | 11 ++++--
 lld/COFF/InputFiles.cpp  |  2 +-
 lld/COFF/SymbolTable.cpp | 70 +---------------------------------
 lld/COFF/SymbolTable.h   |  3 --
 5 files changed, 84 insertions(+), 84 deletions(-)

diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp
index be01ee41c9a2f..83d3f5d4cf99c 100644
--- a/lld/COFF/Driver.cpp
+++ b/lld/COFF/Driver.cpp
@@ -189,6 +189,71 @@ bool LinkerDriver::findUnderscoreMangle(StringRef sym) {
   return s && !isa(s);
 }
 
+static bool compatibleMachineType(COFFLinkerContext &ctx, MachineTypes mt) {
+  if (mt == IMAGE_FILE_MACHINE_UNKNOWN)
+    return true;
+  switch (ctx.config.machine) {
+  case ARM64:
+    return mt == ARM64 || mt == ARM64X;
+  case ARM64EC:
+    return isArm64EC(mt) || mt == AMD64;
+  case ARM64X:
+    return isAnyArm64(mt) || mt == AMD64;
+  case IMAGE_FILE_MACHINE_UNKNOWN:
+    return true;
+  default:
+    return ctx.config.machine == mt;
+  }
+}
+
+void LinkerDriver::addFile(InputFile *file) {
+  Log(ctx) << "Reading " << toString(file);
+  if (file->lazy) {
+    if (auto *f = dyn_cast(file))
+      f->parseLazy();
+    else
+      cast(file)->parseLazy();
+  } else {
+    file->parse();
+    if (auto *f = dyn_cast(file)) {
+      ctx.objFileInstances.push_back(f);
+    } else if (auto *f = dyn_cast(file)) {
+      if (ltoCompilationDone) {
+        Err(ctx) << "LTO object file " << toString(file)
+                 << " linked in after "
+                    "doing LTO compilation.";
+      }
+      ctx.bitcodeFileInstances.push_back(f);
+    } else if (auto *f = dyn_cast(file)) {
+      ctx.importFileInstances.push_back(f);
+    }
+  }
+
+  MachineTypes mt = file->getMachineType();
+  // The ARM64EC target must be explicitly specified and cannot be inferred.
+  if (mt == ARM64EC &&
+      (ctx.config.machine == IMAGE_FILE_MACHINE_UNKNOWN ||
+       (ctx.config.machineInferred &&
+        (ctx.config.machine == ARM64 || ctx.config.machine == AMD64)))) {
+    Err(ctx) << toString(file)
+             << ": machine type arm64ec is ambiguous and cannot be "
+                "inferred, use /machine:arm64ec or /machine:arm64x";
+    return;
+  }
+  if (!compatibleMachineType(ctx, mt)) {
+    Err(ctx) << toString(file) << ": machine type " << machineToStr(mt)
+             << " conflicts with " << machineToStr(ctx.config.machine);
+    return;
+  }
+  if (ctx.config.machine == IMAGE_FILE_MACHINE_UNKNOWN &&
+      mt != IMAGE_FILE_MACHINE_UNKNOWN) {
+    ctx.config.machineInferred = true;
+    setMachine(mt);
+  }
+
+  parseDirectives(file);
+}
+
 MemoryBufferRef LinkerDriver::takeBuffer(std::unique_ptr mb) {
   MemoryBufferRef mbref = *mb;
   make>(std::move(mb)); // take ownership
@@ -222,17 +287,17 @@ void LinkerDriver::addBuffer(std::unique_ptr mb,
         addArchiveBuffer(m, "", filename, memberIndex++);
       return;
     }
-    ctx.symtab.addFile(make(ctx, mbref));
+    addFile(make(ctx, mbref));
     break;
   case file_magic::bitcode:
-    ctx.symtab.addFile(make(ctx, mbref, "", 0, lazy));
+    addFile(make(ctx, mbref, "", 0, lazy));
     break;
   case file_magic::coff_object:
   case file_magic::coff_import_library:
-    ctx.symtab.addFile(ObjFile::create(ctx, mbref, lazy));
+    addFile(ObjFile::create(ctx, mbref, lazy));
     break;
   case file_magic::pdb:
-    ctx.symtab.addFile(make(ctx, mbref));
+    addFile(make(ctx, mbref));
     break;
   case file_magic::coff_cl_gl_object:
     Err(ctx) << filename
@@ -240,7 +305,7 @@ void LinkerDriver::addBuffer(std::unique_ptr mb,
     break;
   case file_magic::pecoff_executable:
     if (ctx.config.mingw) {
-      ctx.symtab.addFile(make(ctx.symtab, mbref));
+      addFile(make(ctx.symtab, mbref));
       break;
     }
     if (filename.ends_with_insensitive(".dll")) {
@@ -306,7 +371,7 @@ void LinkerDriver::addArchiveBuffer(MemoryBufferRef mb, StringRef symName,
   if (magic == file_magic::coff_import_library) {
     InputFile *imp = make(ctx, mb);
     imp->parentName = parentName;
-    ctx.symtab.addFile(imp);
+    addFile(imp);
     return;
   }
 
@@ -326,7 +391,7 @@ void LinkerDriver::addArchiveBuffer(MemoryBufferRef mb, StringRef symName,
   }
 
   obj->parentName = parentName;
-  ctx.symtab.addFile(obj);
+  addFile(obj);
   Log(ctx) << "Loaded " << obj << " for " << symName;
 }
 
@@ -1400,7 +1465,7 @@ void LinkerDriver::convertResources() {
   }
   ObjFile *f =
       ObjFile::create(ctx, convertResToCOFF(resources, resourceObjFiles));
-  ctx.symtab.addFile(f);
+  addFile(f);
   f->includeResourceChunks();
 }
 
@@ -2702,6 +2767,7 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) {
   // Do LTO by compiling bitcode input files to a set of native COFF files then
   // link those files (unless -thinlto-index-only was given, in which case we
   // resolve symbols and write indices, but don't generate native code or link).
+  ltoCompilationDone = true;
   ctx.symtab.compileBitcodeFiles();
 
   if (Defined *d =
diff --git a/lld/COFF/Driver.h b/lld/COFF/Driver.h
index c2b92f61dcf4b..b04a00e2d1cd1 100644
--- a/lld/COFF/Driver.h
+++ b/lld/COFF/Driver.h
@@ -80,13 +80,10 @@ class LinkerDriver {
 
   void linkerMain(llvm::ArrayRef args);
 
-  void setMachine(llvm::COFF::MachineTypes machine);
+  void addFile(InputFile *file);
 
   void addClangLibSearchPaths(const std::string &argv0);
 
-  // Used by the resolver to parse .drectve section contents.
-  void parseDirectives(InputFile *file);
-
   // Used by ArchiveFile to enqueue members.
   void enqueueArchiveMember(const Archive::Child &c, const Archive::Symbol &sym,
                             StringRef parentName);
@@ -121,6 +118,7 @@ class LinkerDriver {
   // Symbol names are mangled by prepending "_" on x86.
   StringRef mangle(StringRef sym);
 
+  void setMachine(llvm::COFF::MachineTypes machine);
   llvm::Triple::ArchType getArch();
 
   uint64_t getDefaultImageBase();
@@ -144,6 +142,9 @@ class LinkerDriver {
 
   void createImportLibrary(bool asLib);
 
+  // Used by the resolver to parse .drectve section contents.
+  void parseDirectives(InputFile *file);
+
   void parseModuleDefs(StringRef path);
 
   // Parse an /order file. If an option is given, the linker places COMDAT
@@ -279,6 +280,8 @@ class LinkerDriver {
   // Create export thunks for exported and patchable Arm64EC function symbols.
   void createECExportThunks();
   void maybeCreateECExportThunk(StringRef name, Symbol *&sym);
+
+  bool ltoCompilationDone = false;
 };
 
 // Create enum with OPT_xxx values for each option in Options.td
diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp
index ad21311e8b28f..e698f66b84f62 100644
--- a/lld/COFF/InputFiles.cpp
+++ b/lld/COFF/InputFiles.cpp
@@ -1412,5 +1412,5 @@ void DLLFile::makeImport(DLLFile::Symbol *s) {
   memcpy(p, s->dllName.data(), s->dllName.size());
   MemoryBufferRef mbref = MemoryBufferRef(StringRef(buf, size), s->dllName);
   ImportFile *impFile = make(symtab.ctx, mbref);
-  symtab.addFile(impFile);
+  symtab.ctx.driver.addFile(impFile);
 }
diff --git a/lld/COFF/SymbolTable.cpp b/lld/COFF/SymbolTable.cpp
index fc78afb4c9e40..6f25ad0620927 100644
--- a/lld/COFF/SymbolTable.cpp
+++ b/lld/COFF/SymbolTable.cpp
@@ -37,71 +37,6 @@ StringRef ltrim1(StringRef s, const char *chars) {
   return s;
 }
 
-static bool compatibleMachineType(COFFLinkerContext &ctx, MachineTypes mt) {
-  if (mt == IMAGE_FILE_MACHINE_UNKNOWN)
-    return true;
-  switch (ctx.config.machine) {
-  case ARM64:
-    return mt == ARM64 || mt == ARM64X;
-  case ARM64EC:
-    return COFF::isArm64EC(mt) || mt == AMD64;
-  case ARM64X:
-    return COFF::isAnyArm64(mt) || mt == AMD64;
-  case IMAGE_FILE_MACHINE_UNKNOWN:
-    return true;
-  default:
-    return ctx.config.machine == mt;
-  }
-}
-
-void SymbolTable::addFile(InputFile *file) {
-  Log(ctx) << "Reading " << toString(file);
-  if (file->lazy) {
-    if (auto *f = dyn_cast(file))
-      f->parseLazy();
-    else
-      cast(file)->parseLazy();
-  } else {
-    file->parse();
-    if (auto *f = dyn_cast(file)) {
-      ctx.objFileInstances.push_back(f);
-    } else if (auto *f = dyn_cast(file)) {
-      if (ltoCompilationDone) {
-        Err(ctx) << "LTO object file " << toString(file)
-                 << " linked in after "
-                    "doing LTO compilation.";
-      }
-      ctx.bitcodeFileInstances.push_back(f);
-    } else if (auto *f = dyn_cast(file)) {
-      ctx.importFileInstances.push_back(f);
-    }
-  }
-
-  MachineTypes mt = file->getMachineType();
-  // The ARM64EC target must be explicitly specified and cannot be inferred.
-  if (mt == ARM64EC &&
-      (ctx.config.machine == IMAGE_FILE_MACHINE_UNKNOWN ||
-       (ctx.config.machineInferred &&
-        (ctx.config.machine == ARM64 || ctx.config.machine == AMD64)))) {
-    Err(ctx) << toString(file)
-             << ": machine type arm64ec is ambiguous and cannot be "
-                "inferred, use /machine:arm64ec or /machine:arm64x";
-    return;
-  }
-  if (!compatibleMachineType(ctx, mt)) {
-    Err(ctx) << toString(file) << ": machine type " << machineToStr(mt)
-             << " conflicts with " << machineToStr(ctx.config.machine);
-    return;
-  }
-  if (ctx.config.machine == IMAGE_FILE_MACHINE_UNKNOWN &&
-      mt != IMAGE_FILE_MACHINE_UNKNOWN) {
-    ctx.config.machineInferred = true;
-    ctx.driver.setMachine(mt);
-  }
-
-  ctx.driver.parseDirectives(file);
-}
-
 static COFFSyncStream errorOrWarn(COFFLinkerContext &ctx) {
   return {ctx, ctx.config.forceUnresolved ? DiagLevel::Warn : DiagLevel::Err};
 }
@@ -118,7 +53,7 @@ static void forceLazy(Symbol *s) {
   case Symbol::Kind::LazyObjectKind: {
     InputFile *file = cast(s)->file;
     file->lazy = false;
-    file->symtab.addFile(file);
+    file->symtab.ctx.driver.addFile(file);
     break;
   }
   case Symbol::Kind::LazyDLLSymbolKind: {
@@ -776,7 +711,7 @@ void SymbolTable::addLazyObject(InputFile *f, StringRef n) {
     return;
   s->pendingArchiveLoad = true;
   f->lazy = false;
-  addFile(f);
+  ctx.driver.addFile(f);
 }
 
 void SymbolTable::addLazyDLLSymbol(DLLFile *f, DLLFile::Symbol *sym,
@@ -1054,7 +989,6 @@ Symbol *SymbolTable::addUndefined(StringRef name) {
 }
 
 void SymbolTable::compileBitcodeFiles() {
-  ltoCompilationDone = true;
   if (ctx.bitcodeFileInstances.empty())
     return;
 
diff --git a/lld/COFF/SymbolTable.h b/lld/COFF/SymbolTable.h
index 8548a6d036a9d..5443815172dfd 100644
--- a/lld/COFF/SymbolTable.h
+++ b/lld/COFF/SymbolTable.h
@@ -51,8 +51,6 @@ class SymbolTable {
               llvm::COFF::MachineTypes machine = IMAGE_FILE_MACHINE_UNKNOWN)
       : ctx(c), machine(machine) {}
 
-  void addFile(InputFile *file);
-
   // Emit errors for symbols that cannot be resolved.
   void reportUnresolvable();
 
@@ -155,7 +153,6 @@ class SymbolTable {
 
   llvm::DenseMap symMap;
   std::unique_ptr lto;
-  bool ltoCompilationDone = false;
   std::vector> entryThunks;
   llvm::DenseMap exitThunks;
 };

From 684052173971868aab0e6b62d7770a6299e84141 Mon Sep 17 00:00:00 2001
From: Philip Reames 
Date: Wed, 1 Jan 2025 10:53:24 -0800
Subject: [PATCH 245/567] Revert "[RISCV][CG]Use processShuffleMasks for
 per-register shuffles"

This reverts commit b8952d4b1b0c73bf39d6440ad3166a088ced563f.

spec x264 fails to build in all VLS configurations, with the assertion
failure: clang: ../llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.cpp:5246: llvm::SDValue lowerShuffleViaVRegSplitting(llvm::ShuffleVectorSDNode*, llvm::SelectionDAG&, const llvm::RISCVSubtarget&): Assertion `RegCnt == NumOfDestRegs && "Whole vector must be processed"' failed.

I can reduce a failing piece of IR, but the failure appears pretty
broad, so I suspect any reasonable vls build will hit it.
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   | 93 ++++++++-----------
 .../rvv/fixed-vectors-shuffle-exact-vlen.ll   | 89 ++++++++++--------
 2 files changed, 89 insertions(+), 93 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index cda64ae5f498d..04dd23d9cdaa2 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -5104,6 +5104,7 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN,
   SDValue V1 = SVN->getOperand(0);
   SDValue V2 = SVN->getOperand(1);
   ArrayRef Mask = SVN->getMask();
+  unsigned NumElts = VT.getVectorNumElements();
 
   // If we don't know exact data layout, not much we can do.  If this
   // is already m1 or smaller, no point in splitting further.
@@ -5120,70 +5121,58 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN,
 
   MVT ElemVT = VT.getVectorElementType();
   unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
+  unsigned VRegsPerSrc = NumElts / ElemsPerVReg;
+
+  SmallVector>>
+    OutMasks(VRegsPerSrc, {-1, {}});
+
+  // Check if our mask can be done as a 1-to-1 mapping from source
+  // to destination registers in the group without needing to
+  // write each destination more than once.
+  for (unsigned DstIdx = 0; DstIdx < Mask.size(); DstIdx++) {
+    int DstVecIdx = DstIdx / ElemsPerVReg;
+    int DstSubIdx = DstIdx % ElemsPerVReg;
+    int SrcIdx = Mask[DstIdx];
+    if (SrcIdx < 0 || (unsigned)SrcIdx >= 2 * NumElts)
+      continue;
+    int SrcVecIdx = SrcIdx / ElemsPerVReg;
+    int SrcSubIdx = SrcIdx % ElemsPerVReg;
+    if (OutMasks[DstVecIdx].first == -1)
+      OutMasks[DstVecIdx].first = SrcVecIdx;
+    if (OutMasks[DstVecIdx].first != SrcVecIdx)
+      // Note: This case could easily be handled by keeping track of a chain
+      // of source values and generating two element shuffles below.  This is
+      // less an implementation question, and more a profitability one.
+      return SDValue();
+
+    OutMasks[DstVecIdx].second.resize(ElemsPerVReg, -1);
+    OutMasks[DstVecIdx].second[DstSubIdx] = SrcSubIdx;
+  }
 
   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
   MVT OneRegVT = MVT::getVectorVT(ElemVT, ElemsPerVReg);
   MVT M1VT = getContainerForFixedLengthVector(DAG, OneRegVT, Subtarget);
   assert(M1VT == getLMUL1VT(M1VT));
   unsigned NumOpElts = M1VT.getVectorMinNumElements();
-  unsigned NormalizedVF = ContainerVT.getVectorMinNumElements();
-  unsigned NumOfSrcRegs = NormalizedVF / NumOpElts;
-  unsigned NumOfDestRegs = NormalizedVF / NumOpElts;
+  SDValue Vec = DAG.getUNDEF(ContainerVT);
   // The following semantically builds up a fixed length concat_vector
   // of the component shuffle_vectors.  We eagerly lower to scalable here
   // to avoid DAG combining it back to a large shuffle_vector again.
   V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
   V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget);
-  SmallVector SubRegs(NumOfDestRegs);
-  unsigned RegCnt = 0;
-  unsigned PrevCnt = 0;
-  processShuffleMasks(
-      Mask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs,
-      [&]() {
-        PrevCnt = RegCnt;
-        ++RegCnt;
-      },
-      [&, &DAG = DAG](ArrayRef SrcSubMask, unsigned SrcVecIdx,
-                      unsigned DstVecIdx) {
-        SDValue SrcVec = SrcVecIdx >= NumOfSrcRegs ? V2 : V1;
-        unsigned ExtractIdx = (SrcVecIdx % NumOfSrcRegs) * NumOpElts;
-        SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, SrcVec,
-                                     DAG.getVectorIdxConstant(ExtractIdx, DL));
-        SubVec = convertFromScalableVector(OneRegVT, SubVec, DAG, Subtarget);
-        SubVec = DAG.getVectorShuffle(OneRegVT, DL, SubVec, SubVec, SrcSubMask);
-        SubRegs[RegCnt] = convertToScalableVector(M1VT, SubVec, DAG, Subtarget);
-        PrevCnt = RegCnt;
-        ++RegCnt;
-      },
-      [&, &DAG = DAG](ArrayRef SrcSubMask, unsigned Idx1, unsigned Idx2) {
-        if (PrevCnt + 1 == RegCnt)
-          ++RegCnt;
-        SDValue SubVec1 = SubRegs[PrevCnt + 1];
-        if (!SubVec1) {
-          SDValue SrcVec = Idx1 >= NumOfSrcRegs ? V2 : V1;
-          unsigned ExtractIdx = (Idx1 % NumOfSrcRegs) * NumOpElts;
-          SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, SrcVec,
-                                DAG.getVectorIdxConstant(ExtractIdx, DL));
-        }
-        SubVec1 = convertFromScalableVector(OneRegVT, SubVec1, DAG, Subtarget);
-        SDValue SrcVec = Idx2 >= NumOfSrcRegs ? V2 : V1;
-        unsigned ExtractIdx = (Idx2 % NumOfSrcRegs) * NumOpElts;
-        SDValue SubVec2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, SrcVec,
-                                      DAG.getVectorIdxConstant(ExtractIdx, DL));
-        SubVec2 = convertFromScalableVector(OneRegVT, SubVec2, DAG, Subtarget);
-        SubVec1 =
-            DAG.getVectorShuffle(OneRegVT, DL, SubVec1, SubVec2, SrcSubMask);
-        SubVec1 = convertToScalableVector(M1VT, SubVec1, DAG, Subtarget);
-        SubRegs[PrevCnt + 1] = SubVec1;
-      });
-  assert(RegCnt == NumOfDestRegs && "Whole vector must be processed");
-  SDValue Vec = DAG.getUNDEF(ContainerVT);
-  for (auto [I, V] : enumerate(SubRegs)) {
-    if (!V)
+  for (unsigned DstVecIdx = 0 ; DstVecIdx < OutMasks.size(); DstVecIdx++) {
+    auto &[SrcVecIdx, SrcSubMask] = OutMasks[DstVecIdx];
+    if (SrcVecIdx == -1)
       continue;
-    unsigned InsertIdx = I * NumOpElts;
-
-    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT, Vec, V,
+    unsigned ExtractIdx = (SrcVecIdx % VRegsPerSrc) * NumOpElts;
+    SDValue SrcVec = (unsigned)SrcVecIdx >= VRegsPerSrc ? V2 : V1;
+    SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, SrcVec,
+                                 DAG.getVectorIdxConstant(ExtractIdx, DL));
+    SubVec = convertFromScalableVector(OneRegVT, SubVec, DAG, Subtarget);
+    SubVec = DAG.getVectorShuffle(OneRegVT, DL, SubVec, SubVec, SrcSubMask);
+    SubVec = convertToScalableVector(M1VT, SubVec, DAG, Subtarget);
+    unsigned InsertIdx = DstVecIdx * NumOpElts;
+    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT, Vec, SubVec,
                       DAG.getVectorIdxConstant(InsertIdx, DL));
   }
   return convertFromScalableVector(VT, Vec, DAG, Subtarget);
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
index 4e06d0094d945..f0ee780137300 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
@@ -168,11 +168,12 @@ define <4 x i64> @m2_splat_into_slide_two_source_v2_lo(<4 x i64> %v1, <4 x i64>
 define <4 x i64> @m2_splat_into_slide_two_source(<4 x i64> %v1, <4 x i64> %v2) vscale_range(2,2) {
 ; CHECK-LABEL: m2_splat_into_slide_two_source:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v13, v10, 1
-; CHECK-NEXT:    vslideup.vi v13, v11, 1
+; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.i v0, 12
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; CHECK-NEXT:    vrgather.vi v12, v8, 0
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vslideup.vi v12, v10, 1, v0.t
+; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
   %res = shufflevector <4 x i64> %v1, <4 x i64> %v2, <4 x i32> 
   ret <4 x i64> %res
@@ -182,17 +183,18 @@ define void @shuffle1(ptr %explicit_0, ptr %explicit_1) vscale_range(2,2) {
 ; CHECK-LABEL: shuffle1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi a0, a0, 252
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vid.v v10
+; CHECK-NEXT:    vid.v v8
 ; CHECK-NEXT:    vsetivli zero, 3, e32, m1, ta, ma
-; CHECK-NEXT:    vle32.v v11, (a0)
-; CHECK-NEXT:    vmv.v.i v0, 5
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; CHECK-NEXT:    vsrl.vi v10, v10, 1
-; CHECK-NEXT:    vadd.vi v10, v10, 1
-; CHECK-NEXT:    vrgather.vv v9, v11, v10, v0.t
+; CHECK-NEXT:    vle32.v v9, (a0)
+; CHECK-NEXT:    li a0, 175
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vsrl.vi v8, v8, 1
+; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vadd.vi v8, v8, 1
+; CHECK-NEXT:    vrgather.vv v11, v9, v8
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vmerge.vim v8, v10, 0, v0
 ; CHECK-NEXT:    addi a0, a1, 672
 ; CHECK-NEXT:    vs2r.v v8, (a0)
 ; CHECK-NEXT:    ret
@@ -209,15 +211,15 @@ define void @shuffle1(ptr %explicit_0, ptr %explicit_1) vscale_range(2,2) {
 define <16 x float> @shuffle2(<4 x float> %a) vscale_range(2,2) {
 ; CHECK-LABEL: shuffle2:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vid.v v9
+; CHECK-NEXT:    li a0, -97
+; CHECK-NEXT:    vadd.vv v9, v9, v9
+; CHECK-NEXT:    vrsub.vi v9, v9, 4
+; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vrgather.vv v13, v8, v9
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vmv1r.v v12, v8
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; CHECK-NEXT:    vid.v v13
-; CHECK-NEXT:    vadd.vv v13, v13, v13
-; CHECK-NEXT:    vmv.v.i v0, 6
-; CHECK-NEXT:    vrsub.vi v13, v13, 4
-; CHECK-NEXT:    vrgather.vv v9, v12, v13, v0.t
+; CHECK-NEXT:    vmerge.vim v8, v12, 0, v0
 ; CHECK-NEXT:    ret
   %b = extractelement <4 x float> %a, i32 2
   %c = insertelement <16 x float> , float %b, i32 5
@@ -229,15 +231,16 @@ define <16 x float> @shuffle2(<4 x float> %a) vscale_range(2,2) {
 define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) vscale_range(2,2) {
 ; RV32-LABEL: extract_any_extend_vector_inreg_v16i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    li a1, 16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, mu
 ; RV32-NEXT:    vmv.v.i v16, 0
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; RV32-NEXT:    vmv.v.i v0, 1
+; RV32-NEXT:    vmv.s.x v0, a1
 ; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vrgather.vi v18, v15, 1, v0.t
-; RV32-NEXT:    vsetivli zero, 1, e64, m8, ta, ma
+; RV32-NEXT:    vrgather.vi v16, v8, 15, v0.t
+; RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vx v8, v16, a0
 ; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    vsetivli zero, 1, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    ret
@@ -255,14 +258,13 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) vsca
 ; RV64-NEXT:    addi s0, sp, 256
 ; RV64-NEXT:    .cfi_def_cfa s0, 0
 ; RV64-NEXT:    andi sp, sp, -128
-; RV64-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; RV64-NEXT:    vmv.v.i v0, 1
+; RV64-NEXT:    li a1, -17
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vmv.v.i v16, 0
-; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; RV64-NEXT:    vrgather.vi v18, v15, 1, v0.t
+; RV64-NEXT:    vmv.s.x v0, a1
+; RV64-NEXT:    vrgather.vi v16, v8, 15
+; RV64-NEXT:    vmerge.vim v8, v16, 0, v0
 ; RV64-NEXT:    mv s2, sp
-; RV64-NEXT:    vs8r.v v16, (s2)
+; RV64-NEXT:    vs8r.v v8, (s2)
 ; RV64-NEXT:    andi a0, a0, 15
 ; RV64-NEXT:    li a1, 8
 ; RV64-NEXT:    call __muldi3
@@ -288,16 +290,21 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) vsca
 define <4 x double> @shuffles_add(<4 x double> %0, <4 x double> %1) vscale_range(2,2) {
 ; CHECK-LABEL: shuffles_add:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; CHECK-NEXT:    vmv1r.v v13, v10
-; CHECK-NEXT:    vslideup.vi v13, v11, 1
-; CHECK-NEXT:    vmv1r.v v8, v9
-; CHECK-NEXT:    vmv.v.i v0, 1
-; CHECK-NEXT:    vrgather.vi v12, v9, 0
-; CHECK-NEXT:    vmv1r.v v9, v11
-; CHECK-NEXT:    vrgather.vi v9, v10, 1, v0.t
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vfadd.vv v8, v12, v8
+; CHECK-NEXT:    vrgather.vi v12, v8, 2
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vid.v v14
+; CHECK-NEXT:    vmv.v.i v0, 12
+; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-NEXT:    vrgather.vi v16, v8, 3
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vadd.vv v8, v14, v14
+; CHECK-NEXT:    vadd.vi v9, v8, -4
+; CHECK-NEXT:    vadd.vi v8, v8, -3
+; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
+; CHECK-NEXT:    vrgatherei16.vv v12, v10, v9, v0.t
+; CHECK-NEXT:    vrgatherei16.vv v16, v10, v8, v0.t
+; CHECK-NEXT:    vfadd.vv v8, v12, v16
 ; CHECK-NEXT:    ret
   %3 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> 
   %4 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> 

From 0cbe28df7100bf4384f84542d602f90cb783a2d4 Mon Sep 17 00:00:00 2001
From: Nico Weber 
Date: Wed, 1 Jan 2025 14:24:25 -0500
Subject: [PATCH 246/567] [gn] port 28ae2ff2a44c

---
 llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn
index 67343297fae41..d1048259bcd44 100644
--- a/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn
+++ b/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn
@@ -299,6 +299,7 @@ static_library("builtins") {
         "mulxc3.c",
         "powixf2.c",
         "trunctfxf2.c",
+        "truncxfhf2.c",
       ]
     }
   }

From bd154e823eba4d62366dfa3d56ae0b99ab171b96 Mon Sep 17 00:00:00 2001
From: Peng Huang 
Date: Wed, 1 Jan 2025 16:19:39 -0500
Subject: [PATCH 247/567] Reapply "[Driver][OHOS] Fix lld link issue for OHOS
 (#118192)" (#120159)

The problem in original change is because OHOS::getCompilerRT()
pickes a wrong builtin runtime
`./lib/clang/20/lib/linux/libclang_rt.builtins-x86_64.a`,
if `./lib/clang/20/lib/linux/libclang_rt.builtins-x86_64.a` exist on the
test filesystem. It shouldn't happen with a clean build.
---
 clang/lib/Driver/ToolChains/OHOS.cpp | 60 ++++++++++++----------------
 1 file changed, 26 insertions(+), 34 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/OHOS.cpp b/clang/lib/Driver/ToolChains/OHOS.cpp
index 6e1a09ae908b2..c9a532771b99e 100644
--- a/clang/lib/Driver/ToolChains/OHOS.cpp
+++ b/clang/lib/Driver/ToolChains/OHOS.cpp
@@ -19,8 +19,8 @@
 #include "llvm/ProfileData/InstrProf.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
-#include "llvm/Support/VirtualFileSystem.h"
 #include "llvm/Support/ScopedPrinter.h"
+#include "llvm/Support/VirtualFileSystem.h"
 
 using namespace clang::driver;
 using namespace clang::driver::toolchains;
@@ -58,11 +58,9 @@ static bool findOHOSMuslMultilibs(const Driver &D,
   return false;
 }
 
-static bool findOHOSMultilibs(const Driver &D,
-                                      const ToolChain &TC,
-                                      const llvm::Triple &TargetTriple,
-                                      StringRef Path, const ArgList &Args,
-                                      DetectedMultilibs &Result) {
+static bool findOHOSMultilibs(const Driver &D, const ToolChain &TC,
+                              const llvm::Triple &TargetTriple, StringRef Path,
+                              const ArgList &Args, DetectedMultilibs &Result) {
   Multilib::flags_list Flags;
   bool IsA7 = false;
   if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ))
@@ -172,8 +170,7 @@ OHOS::OHOS(const Driver &D, const llvm::Triple &Triple, const ArgList &Args)
       Paths);
 }
 
-ToolChain::RuntimeLibType OHOS::GetRuntimeLibType(
-    const ArgList &Args) const {
+ToolChain::RuntimeLibType OHOS::GetRuntimeLibType(const ArgList &Args) const {
   if (Arg *A = Args.getLastArg(clang::driver::options::OPT_rtlib_EQ)) {
     StringRef Value = A->getValue();
     if (Value != "compiler-rt")
@@ -184,20 +181,19 @@ ToolChain::RuntimeLibType OHOS::GetRuntimeLibType(
   return ToolChain::RLT_CompilerRT;
 }
 
-ToolChain::CXXStdlibType
-OHOS::GetCXXStdlibType(const ArgList &Args) const {
+ToolChain::CXXStdlibType OHOS::GetCXXStdlibType(const ArgList &Args) const {
   if (Arg *A = Args.getLastArg(options::OPT_stdlib_EQ)) {
     StringRef Value = A->getValue();
     if (Value != "libc++")
       getDriver().Diag(diag::err_drv_invalid_stdlib_name)
-        << A->getAsString(Args);
+          << A->getAsString(Args);
   }
 
   return ToolChain::CST_Libcxx;
 }
 
 void OHOS::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
-                                        ArgStringList &CC1Args) const {
+                                     ArgStringList &CC1Args) const {
   const Driver &D = getDriver();
   const llvm::Triple &Triple = getTriple();
   std::string SysRoot = computeSysRoot();
@@ -258,7 +254,7 @@ void OHOS::AddClangCXXStdlibIncludeArgs(const ArgList &DriverArgs,
 }
 
 void OHOS::AddCXXStdlibLibArgs(const ArgList &Args,
-                                  ArgStringList &CmdArgs) const {
+                               ArgStringList &CmdArgs) const {
   switch (GetCXXStdlibType(Args)) {
   case ToolChain::CST_Libcxx:
     CmdArgs.push_back("-lc++");
@@ -291,7 +287,8 @@ ToolChain::path_list OHOS::getRuntimePaths() const {
 
   // First try the triple passed to driver as --target=.
   P.assign(D.ResourceDir);
-  llvm::sys::path::append(P, "lib", D.getTargetTriple(), SelectedMultilib.gccSuffix());
+  llvm::sys::path::append(P, "lib", D.getTargetTriple(),
+                          SelectedMultilib.gccSuffix());
   Paths.push_back(P.c_str());
 
   // Second try the normalized triple.
@@ -340,26 +337,20 @@ std::string OHOS::getDynamicLinker(const ArgList &Args) const {
 
 std::string OHOS::getCompilerRT(const ArgList &Args, StringRef Component,
                                 FileType Type) const {
+  std::string CRTBasename =
+      buildCompilerRTBasename(Args, Component, Type, /*AddArch=*/false);
+
   SmallString<128> Path(getDriver().ResourceDir);
   llvm::sys::path::append(Path, "lib", getMultiarchTriple(getTriple()),
-                          SelectedMultilib.gccSuffix());
-  const char *Prefix =
-      Type == ToolChain::FT_Object ? "" : "lib";
-  const char *Suffix;
-  switch (Type) {
-  case ToolChain::FT_Object:
-    Suffix = ".o";
-    break;
-  case ToolChain::FT_Static:
-    Suffix = ".a";
-    break;
-  case ToolChain::FT_Shared:
-    Suffix = ".so";
-    break;
-  }
-  llvm::sys::path::append(
-      Path, Prefix + Twine("clang_rt.") + Component + Suffix);
-  return static_cast(Path.str());
+                          SelectedMultilib.gccSuffix(), CRTBasename);
+  if (getVFS().exists(Path))
+    return std::string(Path);
+
+  std::string NewPath = ToolChain::getCompilerRT(Args, Component, Type);
+  if (getVFS().exists(NewPath))
+    return NewPath;
+
+  return std::string(Path);
 }
 
 void OHOS::addExtraOpts(llvm::opt::ArgStringList &CmdArgs) const {
@@ -396,7 +387,7 @@ SanitizerMask OHOS::getSupportedSanitizers() const {
 
 // TODO: Make a base class for Linux and OHOS and move this there.
 void OHOS::addProfileRTLibs(const llvm::opt::ArgList &Args,
-                             llvm::opt::ArgStringList &CmdArgs) const {
+                            llvm::opt::ArgStringList &CmdArgs) const {
   // Add linker option -u__llvm_profile_runtime to cause runtime
   // initialization module to be linked in.
   if (needsProfileRT(Args))
@@ -413,7 +404,8 @@ ToolChain::path_list OHOS::getArchSpecificLibPaths() const {
   return Paths;
 }
 
-ToolChain::UnwindLibType OHOS::GetUnwindLibType(const llvm::opt::ArgList &Args) const {
+ToolChain::UnwindLibType
+OHOS::GetUnwindLibType(const llvm::opt::ArgList &Args) const {
   if (Args.getLastArg(options::OPT_unwindlib_EQ))
     return Generic_ELF::GetUnwindLibType(Args);
   return GetDefaultUnwindLibType();

From 418dedc2341e807fe7501ad95526b0d7c2f167c4 Mon Sep 17 00:00:00 2001
From: Florian Hahn 
Date: Wed, 1 Jan 2025 21:44:21 +0000
Subject: [PATCH 248/567] [VPlan] Remove redundant setting of insert point in
 ::executePlan (NFC).

The entry block is a VPIRBasicBkock wrapping the original loop's
preheader, so the insert point doesn't need to be set.
---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 4282f815849a8..bf1cde52f0a6f 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7709,11 +7709,9 @@ DenseMap LoopVectorizationPlanner::executePlan(
 
   // 0. Generate SCEV-dependent code in the entry, including TripCount, before
   // making any changes to the CFG.
-  if (!BestVPlan.getEntry()->empty()) {
-    State.CFG.PrevBB = OrigLoop->getLoopPreheader();
-    State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator());
+  if (!BestVPlan.getEntry()->empty())
     BestVPlan.getEntry()->execute(&State);
-  }
+
   if (!ILV.getTripCount())
     ILV.setTripCount(State.get(BestVPlan.getTripCount(), VPLane(0)));
   else

From c7ebe4fd0afadcddd53ec89e6030f9a8f5370e1f Mon Sep 17 00:00:00 2001
From: Florian Hahn 
Date: Wed, 1 Jan 2025 22:05:21 +0000
Subject: [PATCH 249/567] [VPlan] Replace VPBBs with VPIRBBs during skeleton
 creation (NFC).

Move replacement of VPBBs for vector preheader, middle block and scalar
preheader from VPlan::execute to skeleton creation, which actually
creates the IR basic blocks.

For now, the vector preheader can only be replaced after
prepareToExecute as it may create new instructions in the vector
preheader.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 18 +++++++++++
 llvm/lib/Transforms/Vectorize/VPlan.cpp       | 31 ++-----------------
 2 files changed, 21 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index bf1cde52f0a6f..1daed0ebe08b9 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2603,6 +2603,21 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
   return MemCheckBlock;
 }
 
+/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
+/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
+/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
+/// successors of VPBB, if any, are rewired to the new VPIRBasicBlock.
+static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
+  VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB);
+  for (auto &R : make_early_inc_range(*VPBB)) {
+    assert(!R.isPhi() && "Tried to move phi recipe to end of block");
+    R.moveBefore(*IRVPBB, IRVPBB->end());
+  }
+
+  VPBlockUtils::reassociateBlocks(VPBB, IRVPBB);
+  // VPBB is now dead and will be cleaned up when the plan gets destroyed.
+}
+
 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
   assert(LoopVectorPreHeader && "Invalid loop structure");
@@ -2613,9 +2628,11 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
   LoopMiddleBlock =
       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                  LI, nullptr, Twine(Prefix) + "middle.block");
+  replaceVPBBWithIRVPBB(Plan.getMiddleBlock(), LoopMiddleBlock);
   LoopScalarPreHeader =
       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
                  nullptr, Twine(Prefix) + "scalar.ph");
+  replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader);
 }
 
 /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
@@ -7757,6 +7774,7 @@ DenseMap LoopVectorizationPlanner::executePlan(
   BestVPlan.prepareToExecute(
       ILV.getTripCount(),
       ILV.getOrCreateVectorTripCount(ILV.LoopVectorPreHeader), State);
+  replaceVPBBWithIRVPBB(BestVPlan.getVectorPreheader(), State.CFG.PrevBB);
 
   BestVPlan.execute(&State);
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 0619f47f77cbe..6d02efc05614a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -949,21 +949,6 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
   }
 }
 
-/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
-/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
-/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
-/// successors of VPBB, if any, are rewired to the new VPIRBasicBlock.
-static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
-  VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB);
-  for (auto &R : make_early_inc_range(*VPBB)) {
-    assert(!R.isPhi() && "Tried to move phi recipe to end of block");
-    R.moveBefore(*IRVPBB, IRVPBB->end());
-  }
-
-  VPBlockUtils::reassociateBlocks(VPBB, IRVPBB);
-  // VPBB is now dead and will be cleaned up when the plan gets destroyed.
-}
-
 /// Generate the code inside the preheader and body of the vectorized loop.
 /// Assumes a single pre-header basic-block was created for this. Introduce
 /// additional basic-blocks as needed, and fill them all.
@@ -971,25 +956,13 @@ void VPlan::execute(VPTransformState *State) {
   // Initialize CFG state.
   State->CFG.PrevVPBB = nullptr;
   State->CFG.ExitBB = State->CFG.PrevBB->getSingleSuccessor();
-  BasicBlock *VectorPreHeader = State->CFG.PrevBB;
-  State->Builder.SetInsertPoint(VectorPreHeader->getTerminator());
 
   // Disconnect VectorPreHeader from ExitBB in both the CFG and DT.
+  BasicBlock *VectorPreHeader = State->CFG.PrevBB;
   cast(VectorPreHeader->getTerminator())->setSuccessor(0, nullptr);
   State->CFG.DTU.applyUpdates(
       {{DominatorTree::Delete, VectorPreHeader, State->CFG.ExitBB}});
 
-  // Replace regular VPBB's for the vector preheader, middle and scalar
-  // preheader blocks with VPIRBasicBlocks wrapping their IR blocks. The IR
-  // blocks are created during skeleton creation, so we can only create the
-  // VPIRBasicBlocks now during VPlan execution rather than earlier during VPlan
-  // construction.
-  BasicBlock *MiddleBB = State->CFG.ExitBB;
-  BasicBlock *ScalarPh = MiddleBB->getSingleSuccessor();
-  replaceVPBBWithIRVPBB(getVectorPreheader(), VectorPreHeader);
-  replaceVPBBWithIRVPBB(getMiddleBlock(), MiddleBB);
-  replaceVPBBWithIRVPBB(getScalarPreheader(), ScalarPh);
-
   LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << State->VF
                     << ", UF=" << getUF() << '\n');
   setName("Final VPlan");
@@ -998,6 +971,8 @@ void VPlan::execute(VPTransformState *State) {
   // Disconnect the middle block from its single successor (the scalar loop
   // header) in both the CFG and DT. The branch will be recreated during VPlan
   // execution.
+  BasicBlock *MiddleBB = State->CFG.ExitBB;
+  BasicBlock *ScalarPh = MiddleBB->getSingleSuccessor();
   auto *BrInst = new UnreachableInst(MiddleBB->getContext());
   BrInst->insertBefore(MiddleBB->getTerminator());
   MiddleBB->getTerminator()->eraseFromParent();

From cd239493c1023cbccfe6b1e9be32e68592a7f304 Mon Sep 17 00:00:00 2001
From: Owen Pan 
Date: Wed, 1 Jan 2025 15:37:59 -0800
Subject: [PATCH 250/567] [clang-format] Support globstar in
 .clang-format-ignore (#121404)

Closes #110160.
Closes #114969.
---
 clang/docs/ClangFormat.rst                   |  1 +
 clang/docs/ReleaseNotes.rst                  |  1 +
 clang/lib/Format/MatchFilePath.cpp           | 38 +++++++++++++-------
 clang/unittests/Format/MatchFilePathTest.cpp | 35 ++++++++++++++++++
 4 files changed, 63 insertions(+), 12 deletions(-)

diff --git a/clang/docs/ClangFormat.rst b/clang/docs/ClangFormat.rst
index c8f1d7f5a7758..e1f677178c00a 100644
--- a/clang/docs/ClangFormat.rst
+++ b/clang/docs/ClangFormat.rst
@@ -150,6 +150,7 @@ names. It has the following format:
 * Patterns follow the rules specified in `POSIX 2.13.1, 2.13.2, and Rule 1 of
   2.13.3 `_.
+* Bash globstar (``**``) is supported.
 * A pattern is negated if it starts with a bang (``!``).
 
 To match all files in a directory, use e.g. ``foo/bar/*``. To match all files in
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 2a688a677294f..662c575bad3e8 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -1125,6 +1125,7 @@ clang-format
 - Adds ``RemoveEmptyLinesInUnwrappedLines`` option.
 - Adds ``KeepFormFeed`` option and set it to ``true`` for ``GNU`` style.
 - Adds ``AllowShortNamespacesOnASingleLine`` option.
+- Adds support for bash globstar in ``.clang-format-ignore``.
 
 libclang
 --------
diff --git a/clang/lib/Format/MatchFilePath.cpp b/clang/lib/Format/MatchFilePath.cpp
index 062b334dcdd8f..3d8614838e535 100644
--- a/clang/lib/Format/MatchFilePath.cpp
+++ b/clang/lib/Format/MatchFilePath.cpp
@@ -25,9 +25,11 @@ bool matchFilePath(StringRef Pattern, StringRef FilePath) {
   assert(!Pattern.empty());
   assert(!FilePath.empty());
 
+  const auto FilePathBack = FilePath.back();
+
   // No match if `Pattern` ends with a non-meta character not equal to the last
   // character of `FilePath`.
-  if (const auto C = Pattern.back(); !strchr("?*]", C) && C != FilePath.back())
+  if (const auto C = Pattern.back(); !strchr("?*]", C) && C != FilePathBack)
     return false;
 
   constexpr auto Separator = '/';
@@ -49,25 +51,37 @@ bool matchFilePath(StringRef Pattern, StringRef FilePath) {
         return false;
       break;
     case '*': {
-      while (++I < EOP && Pattern[I] == '*') { // Skip consecutive stars.
+      bool Globstar = I == 0 || Pattern[I - 1] == Separator;
+      int StarCount = 1;
+      for (; ++I < EOP && Pattern[I] == '*'; ++StarCount) {
+        // Skip consecutive stars.
       }
+      if (StarCount != 2)
+        Globstar = false;
       const auto K = FilePath.find(Separator, J); // Index of next `Separator`.
       const bool NoMoreSeparatorsInFilePath = K == StringRef::npos;
       if (I == EOP) // `Pattern` ends with a star.
-        return NoMoreSeparatorsInFilePath;
-      // `Pattern` ends with a lone backslash.
-      if (Pattern[I] == '\\' && ++I == EOP)
-        return false;
+        return Globstar || NoMoreSeparatorsInFilePath;
+      if (Pattern[I] != Separator) {
+        Globstar = false;
+        // `Pattern` ends with a lone backslash.
+        if (Pattern[I] == '\\' && ++I == EOP)
+          return false;
+      }
       // The star is followed by a (possibly escaped) `Separator`.
       if (Pattern[I] == Separator) {
-        if (NoMoreSeparatorsInFilePath)
-          return false;
-        J = K; // Skip to next `Separator` in `FilePath`.
-        break;
+        if (!Globstar) {
+          if (NoMoreSeparatorsInFilePath)
+            return false;
+          J = K; // Skip to next `Separator` in `FilePath`.
+          break;
+        }
+        if (++I == EOP)
+          return FilePathBack == Separator;
       }
       // Recurse.
-      for (auto Pat = Pattern.substr(I); J < End && FilePath[J] != Separator;
-           ++J) {
+      for (auto Pat = Pattern.substr(I);
+           J < End && (Globstar || FilePath[J] != Separator); ++J) {
         if (matchFilePath(Pat, FilePath.substr(J)))
           return true;
       }
diff --git a/clang/unittests/Format/MatchFilePathTest.cpp b/clang/unittests/Format/MatchFilePathTest.cpp
index 28f665635718e..346ea7c31e615 100644
--- a/clang/unittests/Format/MatchFilePathTest.cpp
+++ b/clang/unittests/Format/MatchFilePathTest.cpp
@@ -164,6 +164,41 @@ TEST_F(MatchFilePathTest, Path) {
   EXPECT_FALSE(match("foo\\", R"(foo*\)"));
 }
 
+TEST_F(MatchFilePathTest, Globstar) {
+  EXPECT_TRUE(match("/", "**"));
+  EXPECT_TRUE(match("foo", "**"));
+  EXPECT_TRUE(match("/foo", "**"));
+  EXPECT_TRUE(match("foo/", "**"));
+  EXPECT_TRUE(match("foo/bar", "**"));
+
+  EXPECT_TRUE(match("/", "**/"));
+  EXPECT_TRUE(match("foo/", "**/"));
+  EXPECT_TRUE(match("/foo/", "**/"));
+  EXPECT_TRUE(match("foo/bar/", "**/"));
+
+  EXPECT_TRUE(match("/", "/**"));
+  EXPECT_TRUE(match("/foo", "/**"));
+  EXPECT_TRUE(match("/foo/", "/**"));
+  EXPECT_TRUE(match("/foo/bar", "/**"));
+
+  EXPECT_TRUE(match("foo", "**/foo"));
+  EXPECT_TRUE(match("/foo", "**/foo"));
+  EXPECT_TRUE(match("foo/bar", "**/bar"));
+  EXPECT_TRUE(match("/foo/bar", "**/foo/bar"));
+  EXPECT_TRUE(match("foo/bar/baz", "**/bar/baz"));
+
+  EXPECT_TRUE(match("abc/foo", "abc/**"));
+  EXPECT_TRUE(match("abc/foo/", "abc/**"));
+  EXPECT_TRUE(match("abc/foo/bar", "abc/**"));
+
+  EXPECT_TRUE(match("a/b", "a/**/b"));
+  EXPECT_TRUE(match("a/x/b", "a/**/b"));
+  EXPECT_TRUE(match("a/x/y/b", "a/**/b"));
+
+  EXPECT_FALSE(match("a/x/b", "a**/b"));
+  EXPECT_FALSE(match("a/x/b", "a/**b"));
+}
+
 } // namespace
 } // namespace format
 } // namespace clang

From 62cd050b635cbb201dd08188696448cf5ab23260 Mon Sep 17 00:00:00 2001
From: Roland McGrath 
Date: Wed, 1 Jan 2025 17:03:35 -0800
Subject: [PATCH 251/567] [libc] Move hdrgen yaml files into include/ (#121443)

The .yaml files should live next to the corresponding .h.def
files in libc/include/, rather than next to the implementation of
the tool in libc/utils/hdrgen/.  As with the .h.def files, there
is no need for a yaml/ subdirectory under include/.  This simpler
layout is more natural for maintenance and also simplifies build
integration outside the LLVM CMake build.
---
 libc/docs/dev/header_generation.rst           |   6 +-
 libc/include/CMakeLists.txt                   | 110 +++++++++---------
 .../hdrgen/yaml => include}/arpa/inet.yaml    |   0
 .../hdrgen/yaml => include}/assert.yaml       |   0
 .../hdrgen/yaml => include}/complex.yaml      |   0
 .../{utils/hdrgen/yaml => include}/ctype.yaml |   0
 .../hdrgen/yaml => include}/dirent.yaml       |   0
 .../{utils/hdrgen/yaml => include}/dlfcn.yaml |   0
 libc/{utils/hdrgen/yaml => include}/elf.yaml  |   0
 .../{utils/hdrgen/yaml => include}/errno.yaml |   0
 .../{utils/hdrgen/yaml => include}/fcntl.yaml |   0
 .../hdrgen/yaml => include}/features.yaml     |   0
 libc/{utils/hdrgen/yaml => include}/fenv.yaml |   0
 .../{utils/hdrgen/yaml => include}/float.yaml |   0
 .../hdrgen/yaml => include}/inttypes.yaml     |   0
 .../hdrgen/yaml => include}/limits.yaml       |   0
 libc/{utils/hdrgen/yaml => include}/link.yaml |   0
 .../hdrgen/yaml => include}/locale.yaml       |   0
 .../hdrgen/yaml => include}/malloc.yaml       |   0
 libc/{utils/hdrgen/yaml => include}/math.yaml |   0
 .../hdrgen/yaml => include}/pthread.yaml      |   0
 .../{utils/hdrgen/yaml => include}/sched.yaml |   0
 .../hdrgen/yaml => include}/search.yaml       |   0
 .../hdrgen/yaml => include}/setjmp.yaml       |   0
 .../hdrgen/yaml => include}/signal.yaml       |   0
 .../{utils/hdrgen/yaml => include}/spawn.yaml |   0
 .../hdrgen/yaml => include}/stdbit.yaml       |   0
 .../hdrgen/yaml => include}/stdckdint.yaml    |   0
 .../hdrgen/yaml => include}/stdfix.yaml       |   0
 .../hdrgen/yaml => include}/stdint.yaml       |   0
 .../{utils/hdrgen/yaml => include}/stdio.yaml |   0
 .../hdrgen/yaml => include}/stdlib.yaml       |   0
 .../hdrgen/yaml => include}/string.yaml       |   0
 .../hdrgen/yaml => include}/strings.yaml      |   0
 .../hdrgen/yaml => include}/sys/auxv.yaml     |   0
 .../hdrgen/yaml => include}/sys/epoll.yaml    |   0
 .../hdrgen/yaml => include}/sys/ioctl.yaml    |   0
 .../hdrgen/yaml => include}/sys/mman.yaml     |   0
 .../hdrgen/yaml => include}/sys/prctl.yaml    |   0
 .../hdrgen/yaml => include}/sys/random.yaml   |   0
 .../hdrgen/yaml => include}/sys/resource.yaml |   0
 .../hdrgen/yaml => include}/sys/select.yaml   |   0
 .../hdrgen/yaml => include}/sys/sendfile.yaml |   0
 .../hdrgen/yaml => include}/sys/socket.yaml   |   0
 .../hdrgen/yaml => include}/sys/stat.yaml     |   0
 .../hdrgen/yaml => include}/sys/statvfs.yaml  |   0
 .../hdrgen/yaml => include}/sys/syscall.yaml  |   0
 .../hdrgen/yaml => include}/sys/time.yaml     |   0
 .../hdrgen/yaml => include}/sys/types.yaml    |   0
 .../hdrgen/yaml => include}/sys/utsname.yaml  |   0
 .../hdrgen/yaml => include}/sys/wait.yaml     |   0
 .../hdrgen/yaml => include}/termios.yaml      |   0
 .../hdrgen/yaml => include}/threads.yaml      |   0
 libc/{utils/hdrgen/yaml => include}/time.yaml |   0
 .../{utils/hdrgen/yaml => include}/uchar.yaml |   0
 .../hdrgen/yaml => include}/unistd.yaml       |   0
 .../{utils/hdrgen/yaml => include}/wchar.yaml |   0
 libc/src/math/docs/add_math_function.md       |   2 +-
 58 files changed, 59 insertions(+), 59 deletions(-)
 rename libc/{utils/hdrgen/yaml => include}/arpa/inet.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/assert.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/complex.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/ctype.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/dirent.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/dlfcn.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/elf.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/errno.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/fcntl.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/features.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/fenv.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/float.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/inttypes.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/limits.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/link.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/locale.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/malloc.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/math.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/pthread.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/sched.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/search.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/setjmp.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/signal.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/spawn.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/stdbit.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/stdckdint.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/stdfix.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/stdint.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/stdio.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/stdlib.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/string.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/strings.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/sys/auxv.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/sys/epoll.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/sys/ioctl.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/sys/mman.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/sys/prctl.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/sys/random.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/sys/resource.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/sys/select.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/sys/sendfile.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/sys/socket.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/sys/stat.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/sys/statvfs.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/sys/syscall.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/sys/time.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/sys/types.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/sys/utsname.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/sys/wait.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/termios.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/threads.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/time.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/uchar.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/unistd.yaml (100%)
 rename libc/{utils/hdrgen/yaml => include}/wchar.yaml (100%)

diff --git a/libc/docs/dev/header_generation.rst b/libc/docs/dev/header_generation.rst
index 17a8d7af3a2c8..a946106fc7097 100644
--- a/libc/docs/dev/header_generation.rst
+++ b/libc/docs/dev/header_generation.rst
@@ -45,14 +45,14 @@ To add through the command line:
    .. code-block:: none
 
      python3 libc/utils/hdrgen/yaml_to_classes.py
-     libc/utils/hdrgen/yaml/[yaml_file.yaml] --add_function ""  ""   
+     libc/include/[yaml_file.yaml] --add_function ""  ""   
 
    Example:
 
    .. code-block:: none
 
       python3 libc/utils/hdrgen/yaml_to_classes.py
-      libc/utils/hdrgen/yaml/ctype.yaml --add_function "char" example_function
+      libc/include/ctype.yaml --add_function "char" example_function
       "int, void, const void" stdc example_float example_attribute
 
    Keep in mind only the return_type and arguments have quotes around them. If
@@ -118,7 +118,7 @@ Common Errors
    missing. Ensure the correct style and required files are present:
 
    | ``[header_name]``
-   | ``[../libc/utils/hdrgen/yaml/[yaml_file.yaml]``
+   | ``[../libc/include/[yaml_file.yaml]``
    | ``[header_name.h.def]``
    | ``[header_name.h]``
    | ``DEPENDS``
diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt
index e490840cafedb..eb407183c99f5 100644
--- a/libc/include/CMakeLists.txt
+++ b/libc/include/CMakeLists.txt
@@ -32,7 +32,7 @@ endmacro()
 
 add_header_macro(
   ctype
-  ../libc/utils/hdrgen/yaml/ctype.yaml
+  ../libc/include/ctype.yaml
   ctype.h.def
   ctype.h
   DEPENDS
@@ -42,7 +42,7 @@ add_header_macro(
 
 add_header_macro(
   dirent
-  ../libc/utils/hdrgen/yaml/dirent.yaml
+  ../libc/include/dirent.yaml
   dirent.h.def
   dirent.h
   DEPENDS
@@ -54,7 +54,7 @@ add_header_macro(
 
 add_header_macro(
   fcntl
-  ../libc/utils/hdrgen/yaml/fcntl.yaml
+  ../libc/include/fcntl.yaml
   fcntl.h.def
   fcntl.h
   DEPENDS
@@ -70,7 +70,7 @@ add_header_macro(
 
 add_header_macro(
   dlfcn
-  ../libc/utils/hdrgen/yaml/dlfcn.yaml
+  ../libc/include/dlfcn.yaml
   dlfcn.h.def
   dlfcn.h
   DEPENDS
@@ -80,7 +80,7 @@ add_header_macro(
 
 add_header_macro(
   features
-  ../libc/utils/hdrgen/yaml/features.yaml
+  ../libc/include/features.yaml
   features.h.def
   features.h
   DEPENDS
@@ -90,7 +90,7 @@ add_header_macro(
 
 add_header_macro(
   fenv
-  ../libc/utils/hdrgen/yaml/fenv.yaml
+  ../libc/include/fenv.yaml
   fenv.h.def
   fenv.h
   DEPENDS
@@ -102,7 +102,7 @@ add_header_macro(
 
 add_header_macro(
   inttypes
-  ../libc/utils/hdrgen/yaml/inttypes.yaml
+  ../libc/include/inttypes.yaml
   inttypes.h.def
   inttypes.h
   DEPENDS
@@ -113,7 +113,7 @@ add_header_macro(
 
 add_header_macro(
   float
-  ../libc/utils/hdrgen/yaml/float.yaml
+  ../libc/include/float.yaml
   float.h.def
   float.h
   DEPENDS
@@ -122,7 +122,7 @@ add_header_macro(
 
 add_header_macro(
   stdint
-  ../libc/utils/hdrgen/yaml/stdint.yaml
+  ../libc/include/stdint.yaml
   stdint.h.def
   stdint.h
   DEPENDS
@@ -131,7 +131,7 @@ add_header_macro(
 
 add_header_macro(
   limits
-  ../libc/utils/hdrgen/yaml/limits.yaml
+  ../libc/include/limits.yaml
   limits.h.def
   limits.h
   DEPENDS
@@ -140,7 +140,7 @@ add_header_macro(
 
 add_header_macro(
   malloc
-  ../libc/utils/hdrgen/yaml/malloc.yaml
+  ../libc/include/malloc.yaml
   malloc.h.def
   malloc.h
   DEPENDS
@@ -150,7 +150,7 @@ add_header_macro(
 
 add_header_macro(
   math
-  ../libc/utils/hdrgen/yaml/math.yaml
+  ../libc/include/math.yaml
   math.h.def
   math.h
   DEPENDS
@@ -165,7 +165,7 @@ add_header_macro(
 
 add_header_macro(
   stdfix
-  ../libc/utils/hdrgen/yaml/stdfix.yaml
+  ../libc/include/stdfix.yaml
   stdfix.h.def
   stdfix.h
   DEPENDS
@@ -178,7 +178,7 @@ file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/arpa)
 
 add_header_macro(
   arpa_inet
-  ../libc/utils/hdrgen/yaml/arpa/inet.yaml
+  ../libc/include/arpa/inet.yaml
   arpa/inet.h.def
   arpa/inet.h
   DEPENDS
@@ -187,7 +187,7 @@ add_header_macro(
 
 add_header_macro(
   assert
-  ../libc/utils/hdrgen/yaml/assert.yaml
+  ../libc/include/assert.yaml
   assert.h.def
   assert.h
   DEPENDS
@@ -197,7 +197,7 @@ add_header_macro(
 
 add_header_macro(
   complex
-  ../libc/utils/hdrgen/yaml/complex.yaml
+  ../libc/include/complex.yaml
   complex.h.def
   complex.h
   DEPENDS
@@ -207,7 +207,7 @@ add_header_macro(
 
 add_header_macro(
   setjmp
-  ../libc/utils/hdrgen/yaml/setjmp.yaml
+  ../libc/include/setjmp.yaml
   setjmp.h.def
   setjmp.h
   DEPENDS
@@ -217,7 +217,7 @@ add_header_macro(
 
 add_header_macro(
   string
-  ../libc/utils/hdrgen/yaml/string.yaml
+  ../libc/include/string.yaml
   string.h.def
   string.h
   DEPENDS
@@ -228,7 +228,7 @@ add_header_macro(
 
 add_header_macro(
   strings
-  ../libc/utils/hdrgen/yaml/strings.yaml
+  ../libc/include/strings.yaml
   strings.h.def
   strings.h
   DEPENDS
@@ -238,7 +238,7 @@ add_header_macro(
 
 add_header_macro(
   search
-  ../libc/utils/hdrgen/yaml/search.yaml
+  ../libc/include/search.yaml
   search.h.def
   search.h
   DEPENDS
@@ -252,7 +252,7 @@ add_header_macro(
 
 add_header_macro(
   time
-  ../libc/utils/hdrgen/yaml/time.yaml
+  ../libc/include/time.yaml
   time.h.def
   time.h
   DEPENDS
@@ -268,7 +268,7 @@ add_header_macro(
 
 add_header_macro(
   threads
-  ../libc/utils/hdrgen/yaml/threads.yaml
+  ../libc/include/threads.yaml
   threads.h.def
   threads.h
   DEPENDS
@@ -285,7 +285,7 @@ add_header_macro(
 
 add_header_macro(
   errno
-  ../libc/utils/hdrgen/yaml/errno.yaml
+  ../libc/include/errno.yaml
   errno.h.def
   errno.h
   DEPENDS
@@ -295,7 +295,7 @@ add_header_macro(
 
 add_header_macro(
   signal
-  ../libc/utils/hdrgen/yaml/signal.yaml
+  ../libc/include/signal.yaml
   signal.h.def
   signal.h
   DEPENDS
@@ -311,7 +311,7 @@ add_header_macro(
 
 add_header_macro(
   stdbit
-  ../libc/utils/hdrgen/yaml/stdbit.yaml
+  ../libc/include/stdbit.yaml
   stdbit.h.def
   stdbit.h
   DEPENDS
@@ -321,7 +321,7 @@ add_header_macro(
 
 add_header_macro(
   stdckdint
-  ../libc/utils/hdrgen/yaml/stdckdint.yaml
+  ../libc/include/stdckdint.yaml
   stdckdint.h.def
   stdckdint.h
   DEPENDS
@@ -331,7 +331,7 @@ add_header_macro(
 
 add_header_macro(
   stdio
-  ../libc/utils/hdrgen/yaml/stdio.yaml
+  ../libc/include/stdio.yaml
   stdio.h.def
   stdio.h
   DEPENDS
@@ -347,7 +347,7 @@ add_header_macro(
 
 add_header_macro(
   stdlib
-  ../libc/utils/hdrgen/yaml/stdlib.yaml
+  ../libc/include/stdlib.yaml
   stdlib.h.def
   stdlib.h
   DEPENDS
@@ -366,7 +366,7 @@ add_header_macro(
 
 add_header_macro(
   unistd
-  ../libc/utils/hdrgen/yaml/unistd.yaml
+  ../libc/include/unistd.yaml
   unistd.h.def
   unistd.h
   DEPENDS
@@ -385,7 +385,7 @@ add_header_macro(
 
 add_header_macro(
   pthread
-  ../libc/utils/hdrgen/yaml/pthread.yaml
+  ../libc/include/pthread.yaml
   pthread.h.def
   pthread.h
   DEPENDS
@@ -409,7 +409,7 @@ add_header_macro(
 
 add_header_macro(
   sched
-  ../libc/utils/hdrgen/yaml/sched.yaml
+  ../libc/include/sched.yaml
   sched.h.def
   sched.h
   DEPENDS
@@ -426,7 +426,7 @@ add_header_macro(
 
 add_header_macro(
   spawn
-  ../libc/utils/hdrgen/yaml/spawn.yaml
+  ../libc/include/spawn.yaml
   spawn.h.def
   spawn.h
   DEPENDS
@@ -439,7 +439,7 @@ add_header_macro(
 
 add_header_macro(
   link
-  ../libc/utils/hdrgen/yaml/link.yaml
+  ../libc/include/link.yaml
   link.h.def
   link.h
   DEPENDS
@@ -449,7 +449,7 @@ add_header_macro(
 
 add_header_macro(
   elf
-  ../libc/utils/hdrgen/yaml/elf.yaml
+  ../libc/include/elf.yaml
   elf.h.def
   elf.h
   DEPENDS
@@ -463,7 +463,7 @@ file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/sys)
 
 add_header_macro(
   sys_auxv
-  ../libc/utils/hdrgen/yaml/sys/auxv.yaml
+  ../libc/include/sys/auxv.yaml
   sys/auxv.h.def
   sys/auxv.h
   DEPENDS
@@ -473,7 +473,7 @@ add_header_macro(
 
 add_header_macro(
   sys_epoll
-  ../libc/utils/hdrgen/yaml/sys/epoll.yaml
+  ../libc/include/sys/epoll.yaml
   sys/epoll.h.def
   sys/epoll.h
   DEPENDS
@@ -486,7 +486,7 @@ add_header_macro(
 
 add_header_macro(
   sys_ioctl
-  ../libc/utils/hdrgen/yaml/sys/ioctl.yaml
+  ../libc/include/sys/ioctl.yaml
   sys/ioctl.h.def
   sys/ioctl.h
   DEPENDS
@@ -496,7 +496,7 @@ add_header_macro(
 
 add_header_macro(
   sys_mman
-  ../libc/utils/hdrgen/yaml/sys/mman.yaml
+  ../libc/include/sys/mman.yaml
   sys/mman.h.def
   sys/mman.h
   DEPENDS
@@ -509,7 +509,7 @@ add_header_macro(
 
 add_header_macro(
   sys_prctl
-  ../libc/utils/hdrgen/yaml/sys/prctl.yaml
+  ../libc/include/sys/prctl.yaml
   sys/prctl.h.def
   sys/prctl.h
   DEPENDS
@@ -526,7 +526,7 @@ add_header(
 
 add_header_macro(
   sys_random
-  ../libc/utils/hdrgen/yaml/sys/random.yaml
+  ../libc/include/sys/random.yaml
   sys/random.h.def
   sys/random.h
   DEPENDS
@@ -538,7 +538,7 @@ add_header_macro(
 
 add_header_macro(
   sys_resource
-  ../libc/utils/hdrgen/yaml/sys/resource.yaml
+  ../libc/include/sys/resource.yaml
   sys/resource.h.def
   sys/resource.h
   DEPENDS
@@ -550,7 +550,7 @@ add_header_macro(
 
 add_header_macro(
   sys_stat
-  ../libc/utils/hdrgen/yaml/sys/stat.yaml
+  ../libc/include/sys/stat.yaml
   sys/stat.h.def
   sys/stat.h
   DEPENDS
@@ -572,7 +572,7 @@ add_header_macro(
 
 add_header_macro(
   sys_select
-  ../libc/utils/hdrgen/yaml/sys/select.yaml
+  ../libc/include/sys/select.yaml
   sys/select.h.def
   sys/select.h
   DEPENDS
@@ -588,7 +588,7 @@ add_header_macro(
 
 add_header_macro(
   sys_sendfile
-  ../libc/utils/hdrgen/yaml/sys/sendfile.yaml
+  ../libc/include/sys/sendfile.yaml
   sys/sendfile.h.def
   sys/sendfile.h
   DEPENDS
@@ -600,7 +600,7 @@ add_header_macro(
 
 add_header_macro(
   sys_socket
-  ../libc/utils/hdrgen/yaml/sys/socket.yaml
+  ../libc/include/sys/socket.yaml
   sys/socket.h.def
   sys/socket.h
   DEPENDS
@@ -616,7 +616,7 @@ add_header_macro(
 
 add_header_macro(
   sys_statvfs
-  ../libc/utils/hdrgen/yaml/sys/statvfs.yaml
+  ../libc/include/sys/statvfs.yaml
   sys/statvfs.h.def
   sys/statvfs.h
   DEPENDS
@@ -626,7 +626,7 @@ add_header_macro(
 
 add_header_macro(
   sys_syscall
-  ../libc/utils/hdrgen/yaml/sys/syscall.yaml
+  ../libc/include/sys/syscall.yaml
   sys/syscall.h.def
   sys/syscall.h
   DEPENDS
@@ -634,7 +634,7 @@ add_header_macro(
 
 add_header_macro(
   sys_time
-  ../libc/utils/hdrgen/yaml/sys/time.yaml
+  ../libc/include/sys/time.yaml
   sys/time.h.def
   sys/time.h
   DEPENDS
@@ -645,7 +645,7 @@ add_header_macro(
 
 add_header_macro(
   sys_types
-  ../libc/utils/hdrgen/yaml/sys/types.yaml
+  ../libc/include/sys/types.yaml
   sys/types.h.def
   sys/types.h
   DEPENDS
@@ -675,7 +675,7 @@ add_header_macro(
 
 add_header_macro(
   sys_utsname
-  ../libc/utils/hdrgen/yaml/sys/utsname.yaml
+  ../libc/include/sys/utsname.yaml
   sys/utsname.h.def
   sys/utsname.h
   DEPENDS
@@ -685,7 +685,7 @@ add_header_macro(
 
 add_header_macro(
   sys_wait
-  ../libc/utils/hdrgen/yaml/sys/wait.yaml
+  ../libc/include/sys/wait.yaml
   sys/wait.h.def
   sys/wait.h
   DEPENDS
@@ -698,7 +698,7 @@ add_header_macro(
 
 add_header_macro(
   termios
-  ../libc/utils/hdrgen/yaml/termios.yaml
+  ../libc/include/termios.yaml
   termios.h.def
   termios.h
   DEPENDS
@@ -713,7 +713,7 @@ add_header_macro(
 
 add_header_macro(
   uchar
-  ../libc/utils/hdrgen/yaml/uchar.yaml
+  ../libc/include/uchar.yaml
   uchar.h.def
   uchar.h
   DEPENDS
@@ -726,7 +726,7 @@ add_header_macro(
 
 add_header_macro(
   wchar
-  ../libc/utils/hdrgen/yaml/wchar.yaml
+  ../libc/include/wchar.yaml
   wchar.h.def
   wchar.h
   DEPENDS
@@ -740,7 +740,7 @@ add_header_macro(
 
 add_header_macro(
   locale
-  ../libc/utils/hdrgen/yaml/locale.yaml
+  ../libc/include/locale.yaml
   locale.h.def
   locale.h
   DEPENDS
diff --git a/libc/utils/hdrgen/yaml/arpa/inet.yaml b/libc/include/arpa/inet.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/arpa/inet.yaml
rename to libc/include/arpa/inet.yaml
diff --git a/libc/utils/hdrgen/yaml/assert.yaml b/libc/include/assert.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/assert.yaml
rename to libc/include/assert.yaml
diff --git a/libc/utils/hdrgen/yaml/complex.yaml b/libc/include/complex.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/complex.yaml
rename to libc/include/complex.yaml
diff --git a/libc/utils/hdrgen/yaml/ctype.yaml b/libc/include/ctype.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/ctype.yaml
rename to libc/include/ctype.yaml
diff --git a/libc/utils/hdrgen/yaml/dirent.yaml b/libc/include/dirent.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/dirent.yaml
rename to libc/include/dirent.yaml
diff --git a/libc/utils/hdrgen/yaml/dlfcn.yaml b/libc/include/dlfcn.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/dlfcn.yaml
rename to libc/include/dlfcn.yaml
diff --git a/libc/utils/hdrgen/yaml/elf.yaml b/libc/include/elf.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/elf.yaml
rename to libc/include/elf.yaml
diff --git a/libc/utils/hdrgen/yaml/errno.yaml b/libc/include/errno.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/errno.yaml
rename to libc/include/errno.yaml
diff --git a/libc/utils/hdrgen/yaml/fcntl.yaml b/libc/include/fcntl.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/fcntl.yaml
rename to libc/include/fcntl.yaml
diff --git a/libc/utils/hdrgen/yaml/features.yaml b/libc/include/features.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/features.yaml
rename to libc/include/features.yaml
diff --git a/libc/utils/hdrgen/yaml/fenv.yaml b/libc/include/fenv.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/fenv.yaml
rename to libc/include/fenv.yaml
diff --git a/libc/utils/hdrgen/yaml/float.yaml b/libc/include/float.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/float.yaml
rename to libc/include/float.yaml
diff --git a/libc/utils/hdrgen/yaml/inttypes.yaml b/libc/include/inttypes.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/inttypes.yaml
rename to libc/include/inttypes.yaml
diff --git a/libc/utils/hdrgen/yaml/limits.yaml b/libc/include/limits.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/limits.yaml
rename to libc/include/limits.yaml
diff --git a/libc/utils/hdrgen/yaml/link.yaml b/libc/include/link.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/link.yaml
rename to libc/include/link.yaml
diff --git a/libc/utils/hdrgen/yaml/locale.yaml b/libc/include/locale.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/locale.yaml
rename to libc/include/locale.yaml
diff --git a/libc/utils/hdrgen/yaml/malloc.yaml b/libc/include/malloc.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/malloc.yaml
rename to libc/include/malloc.yaml
diff --git a/libc/utils/hdrgen/yaml/math.yaml b/libc/include/math.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/math.yaml
rename to libc/include/math.yaml
diff --git a/libc/utils/hdrgen/yaml/pthread.yaml b/libc/include/pthread.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/pthread.yaml
rename to libc/include/pthread.yaml
diff --git a/libc/utils/hdrgen/yaml/sched.yaml b/libc/include/sched.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/sched.yaml
rename to libc/include/sched.yaml
diff --git a/libc/utils/hdrgen/yaml/search.yaml b/libc/include/search.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/search.yaml
rename to libc/include/search.yaml
diff --git a/libc/utils/hdrgen/yaml/setjmp.yaml b/libc/include/setjmp.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/setjmp.yaml
rename to libc/include/setjmp.yaml
diff --git a/libc/utils/hdrgen/yaml/signal.yaml b/libc/include/signal.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/signal.yaml
rename to libc/include/signal.yaml
diff --git a/libc/utils/hdrgen/yaml/spawn.yaml b/libc/include/spawn.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/spawn.yaml
rename to libc/include/spawn.yaml
diff --git a/libc/utils/hdrgen/yaml/stdbit.yaml b/libc/include/stdbit.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/stdbit.yaml
rename to libc/include/stdbit.yaml
diff --git a/libc/utils/hdrgen/yaml/stdckdint.yaml b/libc/include/stdckdint.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/stdckdint.yaml
rename to libc/include/stdckdint.yaml
diff --git a/libc/utils/hdrgen/yaml/stdfix.yaml b/libc/include/stdfix.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/stdfix.yaml
rename to libc/include/stdfix.yaml
diff --git a/libc/utils/hdrgen/yaml/stdint.yaml b/libc/include/stdint.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/stdint.yaml
rename to libc/include/stdint.yaml
diff --git a/libc/utils/hdrgen/yaml/stdio.yaml b/libc/include/stdio.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/stdio.yaml
rename to libc/include/stdio.yaml
diff --git a/libc/utils/hdrgen/yaml/stdlib.yaml b/libc/include/stdlib.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/stdlib.yaml
rename to libc/include/stdlib.yaml
diff --git a/libc/utils/hdrgen/yaml/string.yaml b/libc/include/string.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/string.yaml
rename to libc/include/string.yaml
diff --git a/libc/utils/hdrgen/yaml/strings.yaml b/libc/include/strings.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/strings.yaml
rename to libc/include/strings.yaml
diff --git a/libc/utils/hdrgen/yaml/sys/auxv.yaml b/libc/include/sys/auxv.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/sys/auxv.yaml
rename to libc/include/sys/auxv.yaml
diff --git a/libc/utils/hdrgen/yaml/sys/epoll.yaml b/libc/include/sys/epoll.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/sys/epoll.yaml
rename to libc/include/sys/epoll.yaml
diff --git a/libc/utils/hdrgen/yaml/sys/ioctl.yaml b/libc/include/sys/ioctl.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/sys/ioctl.yaml
rename to libc/include/sys/ioctl.yaml
diff --git a/libc/utils/hdrgen/yaml/sys/mman.yaml b/libc/include/sys/mman.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/sys/mman.yaml
rename to libc/include/sys/mman.yaml
diff --git a/libc/utils/hdrgen/yaml/sys/prctl.yaml b/libc/include/sys/prctl.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/sys/prctl.yaml
rename to libc/include/sys/prctl.yaml
diff --git a/libc/utils/hdrgen/yaml/sys/random.yaml b/libc/include/sys/random.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/sys/random.yaml
rename to libc/include/sys/random.yaml
diff --git a/libc/utils/hdrgen/yaml/sys/resource.yaml b/libc/include/sys/resource.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/sys/resource.yaml
rename to libc/include/sys/resource.yaml
diff --git a/libc/utils/hdrgen/yaml/sys/select.yaml b/libc/include/sys/select.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/sys/select.yaml
rename to libc/include/sys/select.yaml
diff --git a/libc/utils/hdrgen/yaml/sys/sendfile.yaml b/libc/include/sys/sendfile.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/sys/sendfile.yaml
rename to libc/include/sys/sendfile.yaml
diff --git a/libc/utils/hdrgen/yaml/sys/socket.yaml b/libc/include/sys/socket.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/sys/socket.yaml
rename to libc/include/sys/socket.yaml
diff --git a/libc/utils/hdrgen/yaml/sys/stat.yaml b/libc/include/sys/stat.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/sys/stat.yaml
rename to libc/include/sys/stat.yaml
diff --git a/libc/utils/hdrgen/yaml/sys/statvfs.yaml b/libc/include/sys/statvfs.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/sys/statvfs.yaml
rename to libc/include/sys/statvfs.yaml
diff --git a/libc/utils/hdrgen/yaml/sys/syscall.yaml b/libc/include/sys/syscall.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/sys/syscall.yaml
rename to libc/include/sys/syscall.yaml
diff --git a/libc/utils/hdrgen/yaml/sys/time.yaml b/libc/include/sys/time.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/sys/time.yaml
rename to libc/include/sys/time.yaml
diff --git a/libc/utils/hdrgen/yaml/sys/types.yaml b/libc/include/sys/types.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/sys/types.yaml
rename to libc/include/sys/types.yaml
diff --git a/libc/utils/hdrgen/yaml/sys/utsname.yaml b/libc/include/sys/utsname.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/sys/utsname.yaml
rename to libc/include/sys/utsname.yaml
diff --git a/libc/utils/hdrgen/yaml/sys/wait.yaml b/libc/include/sys/wait.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/sys/wait.yaml
rename to libc/include/sys/wait.yaml
diff --git a/libc/utils/hdrgen/yaml/termios.yaml b/libc/include/termios.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/termios.yaml
rename to libc/include/termios.yaml
diff --git a/libc/utils/hdrgen/yaml/threads.yaml b/libc/include/threads.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/threads.yaml
rename to libc/include/threads.yaml
diff --git a/libc/utils/hdrgen/yaml/time.yaml b/libc/include/time.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/time.yaml
rename to libc/include/time.yaml
diff --git a/libc/utils/hdrgen/yaml/uchar.yaml b/libc/include/uchar.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/uchar.yaml
rename to libc/include/uchar.yaml
diff --git a/libc/utils/hdrgen/yaml/unistd.yaml b/libc/include/unistd.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/unistd.yaml
rename to libc/include/unistd.yaml
diff --git a/libc/utils/hdrgen/yaml/wchar.yaml b/libc/include/wchar.yaml
similarity index 100%
rename from libc/utils/hdrgen/yaml/wchar.yaml
rename to libc/include/wchar.yaml
diff --git a/libc/src/math/docs/add_math_function.md b/libc/src/math/docs/add_math_function.md
index 7d45bd02c4ff2..daaf1a3ec5639 100644
--- a/libc/src/math/docs/add_math_function.md
+++ b/libc/src/math/docs/add_math_function.md
@@ -18,7 +18,7 @@ together with its specifications:
 ```
 - Add function specs to the file:
 ```
-  libc/utils/hdrgen/yaml/math.yaml
+  libc/include/math.yaml
 ```
 
 ## Implementation

From 508929d42a8735b05815b5332a4733ed8d0bf1eb Mon Sep 17 00:00:00 2001
From: Roland McGrath 
Date: Wed, 1 Jan 2025 18:16:32 -0800
Subject: [PATCH 252/567] [libc] Make GpuHeaderFile a subclass (#121445)

Replace copy&paste from HeaderFile to GpuHeaderFile with
subclassing.  Only the __str__ method differs.
---
 libc/utils/hdrgen/gpu_headers.py | 26 ++------------------------
 1 file changed, 2 insertions(+), 24 deletions(-)

diff --git a/libc/utils/hdrgen/gpu_headers.py b/libc/utils/hdrgen/gpu_headers.py
index b26b3a88557b4..8c4ff6e08b112 100644
--- a/libc/utils/hdrgen/gpu_headers.py
+++ b/libc/utils/hdrgen/gpu_headers.py
@@ -6,31 +6,9 @@
 #
 # ==-------------------------------------------------------------------------==#
 
+from header import HeaderFile
 
-class GpuHeaderFile:
-    def __init__(self, name):
-        self.name = name
-        self.macros = []
-        self.types = []
-        self.enumerations = []
-        self.objects = []
-        self.functions = []
-
-    def add_macro(self, macro):
-        self.macros.append(macro)
-
-    def add_type(self, type_):
-        self.types.append(type_)
-
-    def add_enumeration(self, enumeration):
-        self.enumerations.append(enumeration)
-
-    def add_object(self, object):
-        self.objects.append(object)
-
-    def add_function(self, function):
-        self.functions.append(function)
-
+class GpuHeaderFile(HeaderFile):
     def __str__(self):
         content = []
 

From 1a0d0ae234544dc4978f1e12730408cb83f6b923 Mon Sep 17 00:00:00 2001
From: Owen Pan 
Date: Wed, 1 Jan 2025 18:24:56 -0800
Subject: [PATCH 253/567] [clang-format] Add `VariableTemplates` option
 (#121318)

Closes #120148.
---
 clang/docs/ClangFormatStyleOptions.rst        |  9 +++++++++
 clang/docs/ReleaseNotes.rst                   |  1 +
 clang/include/clang/Format/Format.h           | 11 ++++++++++-
 clang/lib/Format/Format.cpp                   |  1 +
 clang/lib/Format/FormatToken.h                |  1 +
 clang/lib/Format/FormatTokenLexer.cpp         |  4 ++++
 clang/lib/Format/FormatTokenLexer.h           |  3 ++-
 clang/lib/Format/TokenAnnotator.cpp           | 19 +++++++++++++++----
 clang/unittests/Format/TokenAnnotatorTest.cpp | 13 +++++++++++++
 9 files changed, 56 insertions(+), 6 deletions(-)

diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst
index c175436a2817a..d9b3f666df03c 100644
--- a/clang/docs/ClangFormatStyleOptions.rst
+++ b/clang/docs/ClangFormatStyleOptions.rst
@@ -6798,6 +6798,15 @@ the configuration (without a prefix: ``Auto``).
 
 
 
+.. _VariableTemplates:
+
+**VariableTemplates** (``List of Strings``) :versionbadge:`clang-format 20` :ref:`¶ <VariableTemplates>`
+  A vector of non-keyword identifiers that should be interpreted as variable
+  template names.
+
+  A ``)`` after a variable template instantiation is **not** annotated as
+  the closing parenthesis of C-style cast operator.
+
 .. _VerilogBreakBetweenInstancePorts:
 
**VerilogBreakBetweenInstancePorts** (``Boolean``) :versionbadge:`clang-format 17` :ref:`¶ <VerilogBreakBetweenInstancePorts>`
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 662c575bad3e8..e0aef1af2135c 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -1125,6 +1125,7 @@ clang-format
 - Adds ``RemoveEmptyLinesInUnwrappedLines`` option.
 - Adds ``KeepFormFeed`` option and set it to ``true`` for ``GNU`` style.
 - Adds ``AllowShortNamespacesOnASingleLine`` option.
+- Adds ``VariableTemplates`` option.
 - Adds support for bash globstar in ``.clang-format-ignore``.
 
 libclang
diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h
index eefaabf9392fd..bb34f2d33ac15 100644
--- a/clang/include/clang/Format/Format.h
+++ b/clang/include/clang/Format/Format.h
@@ -5103,6 +5103,15 @@ struct FormatStyle {
   /// \version 3.7
   UseTabStyle UseTab;
 
+  /// A vector of non-keyword identifiers that should be interpreted as variable
+  /// template names.
+  ///
+  /// A ``)`` after a variable template instantiation is **not** annotated as
+  /// the closing parenthesis of C-style cast operator.
+  ///
+  /// \version 20
+  std::vector<std::string> VariableTemplates;
+
   /// For Verilog, put each port on its own line in module instantiations.
   /// \code
   ///    true:
@@ -5314,7 +5323,7 @@ struct FormatStyle {
            TableGenBreakInsideDAGArg == R.TableGenBreakInsideDAGArg &&
            TabWidth == R.TabWidth && TemplateNames == R.TemplateNames &&
            TypeNames == R.TypeNames && TypenameMacros == R.TypenameMacros &&
-           UseTab == R.UseTab &&
+           UseTab == R.UseTab && VariableTemplates == R.VariableTemplates &&
            VerilogBreakBetweenInstancePorts ==
                R.VerilogBreakBetweenInstancePorts &&
            WhitespaceSensitiveMacros == R.WhitespaceSensitiveMacros;
diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp
index 8f44e9f00212c..a5657f2d910f6 100644
--- a/clang/lib/Format/Format.cpp
+++ b/clang/lib/Format/Format.cpp
@@ -1166,6 +1166,7 @@ template <> struct MappingTraits<FormatStyle> {
     IO.mapOptional("TypeNames", Style.TypeNames);
     IO.mapOptional("TypenameMacros", Style.TypenameMacros);
     IO.mapOptional("UseTab", Style.UseTab);
+    IO.mapOptional("VariableTemplates", Style.VariableTemplates);
     IO.mapOptional("VerilogBreakBetweenInstancePorts",
                    Style.VerilogBreakBetweenInstancePorts);
     IO.mapOptional("WhitespaceSensitiveMacros",
diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h
index f6bb860a1fea3..8917049cefb86 100644
--- a/clang/lib/Format/FormatToken.h
+++ b/clang/lib/Format/FormatToken.h
@@ -186,6 +186,7 @@ namespace format {
   TYPE(UnionLBrace)                                                            \
   TYPE(UnionRBrace)                                                            \
   TYPE(UntouchableMacroFunc)                                                   \
+  TYPE(VariableTemplate)                                                       \
   /* Like in 'assign x = 0, y = 1;' . */                                       \
   TYPE(VerilogAssignComma)                                                     \
   /* like in begin : block */                                                  \
diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp
index 7a264bddcdfe1..0f8d4940d4369 100644
--- a/clang/lib/Format/FormatTokenLexer.cpp
+++ b/clang/lib/Format/FormatTokenLexer.cpp
@@ -76,6 +76,8 @@ FormatTokenLexer::FormatTokenLexer(
     TemplateNames.insert(&IdentTable.get(TemplateName));
   for (const auto &TypeName : Style.TypeNames)
     TypeNames.insert(&IdentTable.get(TypeName));
+  for (const auto &VariableTemplate : Style.VariableTemplates)
+    VariableTemplates.insert(&IdentTable.get(VariableTemplate));
 }
 
 ArrayRef<FormatToken *> FormatTokenLexer::lex() {
@@ -1382,6 +1384,8 @@ FormatToken *FormatTokenLexer::getNextToken() {
         FormatTok->setFinalizedType(TT_TemplateName);
       else if (TypeNames.contains(Identifier))
         FormatTok->setFinalizedType(TT_TypeName);
+      else if (VariableTemplates.contains(Identifier))
+        FormatTok->setFinalizedType(TT_VariableTemplate);
     }
   }
 
diff --git a/clang/lib/Format/FormatTokenLexer.h b/clang/lib/Format/FormatTokenLexer.h
index 71389d2ade2b7..61474a3f9ada8 100644
--- a/clang/lib/Format/FormatTokenLexer.h
+++ b/clang/lib/Format/FormatTokenLexer.h
@@ -129,7 +129,8 @@ class FormatTokenLexer {
 
   llvm::SmallMapVector<IdentifierInfo *, TokenType, 8> Macros;
 
-  llvm::SmallPtrSet<IdentifierInfo *, 8> TemplateNames, TypeNames;
+  llvm::SmallPtrSet<IdentifierInfo *, 8> TemplateNames, TypeNames,
+      VariableTemplates;
 
   bool FormattingDisabled;
 
diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index f2cfa7f49f62f..b0f570966a63f 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -2792,6 +2792,16 @@ class AnnotatingParser {
       return true;
     }
 
+    auto IsNonVariableTemplate = [](const FormatToken &Tok) {
+      if (Tok.isNot(TT_TemplateCloser))
+        return false;
+      const auto *Less = Tok.MatchingParen;
+      if (!Less)
+        return false;
+      const auto *BeforeLess = Less->getPreviousNonComment();
+      return BeforeLess && BeforeLess->isNot(TT_VariableTemplate);
+    };
+
     // Heuristically try to determine whether the parentheses contain a type.
     auto IsQualifiedPointerOrReference = [](const FormatToken *T,
                                             const LangOptions &LangOpts) {
@@ -2825,10 +2835,11 @@ class AnnotatingParser {
       }
       return T && T->is(TT_PointerOrReference);
     };
-    bool ParensAreType =
-        BeforeRParen->isOneOf(TT_TemplateCloser, TT_TypeDeclarationParen) ||
-        BeforeRParen->isTypeName(LangOpts) ||
-        IsQualifiedPointerOrReference(BeforeRParen, LangOpts);
+
+    bool ParensAreType = IsNonVariableTemplate(*BeforeRParen) ||
+                         BeforeRParen->is(TT_TypeDeclarationParen) ||
+                         BeforeRParen->isTypeName(LangOpts) ||
+                         IsQualifiedPointerOrReference(BeforeRParen, LangOpts);
     bool ParensCouldEndDecl =
         AfterRParen->isOneOf(tok::equal, tok::semi, tok::l_brace, tok::greater);
     if (ParensAreType && !ParensCouldEndDecl)
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index b2fb5227993c3..d61b9adf4f58c 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -3615,6 +3615,19 @@ TEST_F(TokenAnnotatorTest, TemplateInstantiation) {
   EXPECT_TOKEN(Tokens[18], tok::greater, TT_TemplateCloser);
 }
 
+TEST_F(TokenAnnotatorTest, VariableTemplate) {
+  auto Style = getLLVMStyle();
+  Style.VariableTemplates.push_back("a");
+
+  auto Tokens = annotate("auto t3 = (a) + b;", Style);
+  ASSERT_EQ(Tokens.size(), 13u) << Tokens;
+  EXPECT_TOKEN(Tokens[4], tok::identifier, TT_VariableTemplate);
+  EXPECT_TOKEN(Tokens[5], tok::less, TT_TemplateOpener);
+  EXPECT_TOKEN(Tokens[7], tok::greater, TT_TemplateCloser);
+  EXPECT_TOKEN(Tokens[8], tok::r_paren, TT_Unknown); // Not TT_CastRParen
+  EXPECT_TOKEN(Tokens[9], tok::plus, TT_BinaryOperator);
+}
+
 TEST_F(TokenAnnotatorTest, SwitchInMacroArgument) {
   auto Tokens = annotate("FOOBAR(switch);\n"
                          "void f() {}");

From 23ec9ee17eacb18eeb01d3c5a0d424852c7c1909 Mon Sep 17 00:00:00 2001
From: Phoebe Wang <phoebe.wang@intel.com>
Date: Thu, 2 Jan 2025 11:30:26 +0800
Subject: [PATCH 254/567] [X86][AVX10.2] Lower fmininum/fmaximum to VMINMAX*
 (#121373)

---
 llvm/lib/Target/X86/X86ISelLowering.cpp    |  18 ++
 llvm/lib/Target/X86/X86InstrAVX10.td       |  52 ++++--
 llvm/test/CodeGen/X86/fminimum-fmaximum.ll | 192 +++++++++++++++++++++
 llvm/test/TableGen/x86-fold-tables.inc     |  21 ++-
 4 files changed, 255 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e7f6032ee7d74..a0514e93d6598 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2442,6 +2442,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::FSQRT, VT, Legal);
       setOperationAction(ISD::FMA, VT, Legal);
       setOperationAction(ISD::SETCC, VT, Custom);
+      setOperationAction(ISD::FMINIMUM, VT, Custom);
+      setOperationAction(ISD::FMAXIMUM, VT, Custom);
     }
     if (Subtarget.hasAVX10_2_512()) {
       setOperationAction(ISD::FADD, MVT::v32bf16, Legal);
@@ -2451,6 +2453,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal);
       setOperationAction(ISD::FMA, MVT::v32bf16, Legal);
       setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
+      setOperationAction(ISD::FMINIMUM, MVT::v32bf16, Custom);
+      setOperationAction(ISD::FMAXIMUM, MVT::v32bf16, Custom);
     }
     for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
       setCondCodeAction(ISD::SETOEQ, VT, Custom);
@@ -28842,6 +28846,20 @@ static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
   SDValue X = Op.getOperand(0);
   SDValue Y = Op.getOperand(1);
   SDLoc DL(Op);
+  if (Subtarget.hasAVX10_2() && TLI.isTypeLegal(VT)) {
+    unsigned Opc = 0;
+    if (VT.isVector())
+      Opc = X86ISD::VMINMAX;
+    else if (VT == MVT::f16 || VT == MVT::f32 || VT == MVT::f64)
+      Opc = X86ISD::VMINMAXS;
+
+    if (Opc) {
+      SDValue Imm =
+          DAG.getTargetConstant(Op.getOpcode() == ISD::FMAXIMUM, DL, MVT::i32);
+      return DAG.getNode(Opc, DL, VT, X, Y, Imm, Op->getFlags());
+    }
+  }
+
   uint64_t SizeInBits = VT.getScalarSizeInBits();
   APInt PreferredZero = APInt::getZero(SizeInBits);
   APInt OppositeZero = PreferredZero;
diff --git a/llvm/lib/Target/X86/X86InstrAVX10.td b/llvm/lib/Target/X86/X86InstrAVX10.td
index 0301c07dfb540..3bc64eda01a9c 100644
--- a/llvm/lib/Target/X86/X86InstrAVX10.td
+++ b/llvm/lib/Target/X86/X86InstrAVX10.td
@@ -403,28 +403,42 @@ multiclass avx10_minmax_scalar<string OpStr, X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE> {
   let ExeDomain = _.ExeDomain, Predicates = [HasAVX10_2] in {
     let mayRaiseFPException = 1 in {
-      defm rri : AVX512_maskable<0x53, MRMSrcReg, _, (outs VR128X:$dst),
-                               (ins VR128X:$src1, VR128X:$src2, i32u8imm:$src3),
-                                OpStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
-                                (_.VT (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
-                                              (i32 timm:$src3)))>,
-                                Sched<[WriteFMAX]>;
-
-      defm rmi : AVX512_maskable<0x53, MRMSrcMem, _, (outs VR128X:$dst),
-                       (ins VR128X:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
-                       OpStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
-                       (_.VT (OpNode (_.VT _.RC:$src1), (_.ScalarIntMemFrags addr:$src2),
-                                     (i32 timm:$src3)))>,
+      let isCodeGenOnly = 1 in {
+        def rri : AVX512Ii8<0x53, MRMSrcReg, (outs _.FRC:$dst),
+                            (ins _.FRC:$src1, _.FRC:$src2, i32u8imm:$src3),
+                             !strconcat(OpStr, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+                             [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2, (i32 timm:$src3)))]>,
+                       Sched<[WriteFMAX]>;
+
+        def rmi : AVX512Ii8<0x53, MRMSrcMem, (outs _.FRC:$dst),
+                            (ins _.FRC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
+                             !strconcat(OpStr, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+                             [(set _.FRC:$dst, (OpNode _.FRC:$src1, (_.ScalarLdFrag addr:$src2),
+                                                       (i32 timm:$src3)))]>,
+                       Sched<[WriteFMAX.Folded, WriteFMAX.ReadAfterFold]>;
+      }
+      defm rri_Int : AVX512_maskable<0x53, MRMSrcReg, _, (outs VR128X:$dst),
+                                     (ins VR128X:$src1, VR128X:$src2, i32u8imm:$src3),
+                                      OpStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+                                      (_.VT (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+                                                    (i32 timm:$src3)))>,
+                       Sched<[WriteFMAX]>;
+
+      defm rmi_Int : AVX512_maskable<0x53, MRMSrcMem, _, (outs VR128X:$dst),
+                                     (ins VR128X:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
+                                      OpStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+                                      (_.VT (OpNode (_.VT _.RC:$src1), (_.ScalarIntMemFrags addr:$src2),
+                                                    (i32 timm:$src3)))>,
                        Sched<[WriteFMAX.Folded, WriteFMAX.ReadAfterFold]>;
     }
     let Uses = [], mayRaiseFPException = 0 in
-      defm rrib : AVX512_maskable<0x53, MRMSrcReg, _, (outs VR128X:$dst),
-                        (ins VR128X:$src1, VR128X:$src2, i32u8imm:$src3),
-                        OpStr, "$src3, {sae}, $src2, $src1",
-                        "$src1, $src2, {sae}, $src3",
-                        (_.VT (OpNodeSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2),
-                                         (i32 timm:$src3)))>,
-                        Sched<[WriteFMAX]>, EVEX_B;
+      defm rrib_Int : AVX512_maskable<0x53, MRMSrcReg, _, (outs VR128X:$dst),
+                                      (ins VR128X:$src1, VR128X:$src2, i32u8imm:$src3),
+                                       OpStr, "$src3, {sae}, $src2, $src1",
+                                       "$src1, $src2, {sae}, $src3",
+                                       (_.VT (OpNodeSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+                                                        (i32 timm:$src3)))>,
+                       Sched<[WriteFMAX]>, EVEX_B;
   }
 }
 
diff --git a/llvm/test/CodeGen/X86/fminimum-fmaximum.ll b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll
index c6da0c5ca4792..1dcce5336895f 100644
--- a/llvm/test/CodeGen/X86/fminimum-fmaximum.ll
+++ b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll
@@ -3,6 +3,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx      | FileCheck %s --check-prefixes=AVX,AVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f  | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=AVX10_2
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx        | FileCheck %s --check-prefixes=X86
 
 declare float @llvm.maximum.f32(float, float)
@@ -73,6 +74,11 @@ define float @test_fmaximum(float %x, float %y) nounwind {
 ; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
 ; AVX512-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxss $1, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
@@ -110,6 +116,11 @@ define <4 x float> @test_fmaximum_scalarize(<4 x float> %x, <4 x float> %y) "no-
 ; AVX-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_scalarize:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxps $1, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_scalarize:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
@@ -129,6 +140,11 @@ define float @test_fmaximum_nan0(float %x, float %y) {
 ; AVX-NEXT:    vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_nan0:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_nan0:
 ; X86:       # %bb.0:
 ; X86-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
@@ -148,6 +164,11 @@ define float @test_fmaximum_nan1(float %x, float %y) {
 ; AVX-NEXT:    vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_nan1:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_nan1:
 ; X86:       # %bb.0:
 ; X86-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
@@ -215,6 +236,13 @@ define float @test_fmaximum_nnan(float %x, float %y) nounwind {
 ; AVX512DQ-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
 ; AVX512DQ-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_nnan:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vaddss %xmm1, %xmm0, %xmm2
+; AVX10_2-NEXT:    vsubss %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    vminmaxss $1, %xmm0, %xmm2
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_nnan:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
@@ -272,6 +300,12 @@ define double @test_fmaximum_zero0(double %x, double %y) nounwind {
 ; AVX512-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
 ; AVX512-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_zero0:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
+; AVX10_2-NEXT:    vminmaxsd $1, %xmm0, %xmm1
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_zero0:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
@@ -323,6 +357,12 @@ define double @test_fmaximum_zero1(double %x, double %y) nounwind {
 ; AVX512-NEXT:    vmovapd %xmm1, %xmm0
 ; AVX512-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_zero1:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT:    vminmaxsd $1, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_zero1:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
@@ -354,6 +394,11 @@ define double @test_fmaximum_zero2(double %x, double %y) {
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_zero2:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_zero2:
 ; X86:       # %bb.0:
 ; X86-NEXT:    fldz
@@ -390,6 +435,11 @@ define float @test_fmaximum_nsz(float %x, float %y) "no-signed-zeros-fp-math"="t
 ; AVX512-NEXT:    vmovaps %xmm1, %xmm0
 ; AVX512-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_nsz:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxss $1, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_nsz:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
@@ -474,6 +524,12 @@ define float @test_fmaximum_combine_cmps(float %x, float %y) nounwind {
 ; AVX512DQ-NEXT:    vmaxss %xmm2, %xmm0, %xmm0
 ; AVX512DQ-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_combine_cmps:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vdivss %xmm0, %xmm1, %xmm1
+; AVX10_2-NEXT:    vminmaxss $1, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_combine_cmps:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
@@ -562,6 +618,11 @@ define float @test_fminimum(float %x, float %y) nounwind {
 ; AVX512-NEXT:    vmovaps %xmm1, %xmm0
 ; AVX512-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxss $0, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
@@ -599,6 +660,11 @@ define <2 x double> @test_fminimum_scalarize(<2 x double> %x, <2 x double> %y) "
 ; AVX-NEXT:    vminpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_scalarize:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxpd $0, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_scalarize:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vminpd %xmm1, %xmm0, %xmm0
@@ -618,6 +684,11 @@ define float @test_fminimum_nan0(float %x, float %y) {
 ; AVX-NEXT:    vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_nan0:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_nan0:
 ; X86:       # %bb.0:
 ; X86-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
@@ -637,6 +708,11 @@ define float @test_fminimum_nan1(float %x, float %y) {
 ; AVX-NEXT:    vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_nan1:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_nan1:
 ; X86:       # %bb.0:
 ; X86-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
@@ -695,6 +771,11 @@ define double @test_fminimum_nnan(double %x, double %y) "no-nans-fp-math"="true"
 ; AVX512DQ-NEXT:    vminsd %xmm2, %xmm1, %xmm0
 ; AVX512DQ-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_nnan:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxsd $0, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_nnan:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
@@ -749,6 +830,11 @@ define double @test_fminimum_zero0(double %x, double %y) nounwind {
 ; AVX512-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
 ; AVX512-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_zero0:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxsd $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_zero0:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
@@ -796,6 +882,11 @@ define double @test_fminimum_zero1(double %x, double %y) nounwind {
 ; AVX512-NEXT:    vmovapd %xmm1, %xmm0
 ; AVX512-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_zero1:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxsd $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_zero1:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
@@ -826,6 +917,11 @@ define double @test_fminimum_zero2(double %x, double %y) {
 ; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0]
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_zero2:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vmovsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0]
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_zero2:
 ; X86:       # %bb.0:
 ; X86-NEXT:    fldz
@@ -863,6 +959,11 @@ define float @test_fminimum_nsz(float %x, float %y) nounwind {
 ; AVX512-NEXT:    vmovaps %xmm1, %xmm0
 ; AVX512-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_nsz:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxss $0, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_nsz:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
@@ -948,6 +1049,12 @@ define float @test_fminimum_combine_cmps(float %x, float %y) nounwind {
 ; AVX512DQ-NEXT:    vminss %xmm2, %xmm0, %xmm0
 ; AVX512DQ-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_combine_cmps:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vdivss %xmm0, %xmm1, %xmm1
+; AVX10_2-NEXT:    vminmaxss $0, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_combine_cmps:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
@@ -1009,6 +1116,11 @@ define <2 x double> @test_fminimum_vector(<2 x double> %x, <2 x double> %y) {
 ; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_vector:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxpd $0, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_vector:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vblendvpd %xmm0, %xmm0, %xmm1, %xmm2
@@ -1032,6 +1144,11 @@ define <4 x float> @test_fmaximum_vector(<4 x float> %x, <4 x float> %y) "no-nan
 ; AVX-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_vector:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxps $1, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_vector:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
@@ -1054,6 +1171,12 @@ define <2 x double> @test_fminimum_vector_zero(<2 x double> %x) {
 ; AVX-NEXT:    vminpd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_vector_zero:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT:    vminmaxpd $0, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_vector_zero:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
@@ -1077,6 +1200,11 @@ define <4 x float> @test_fmaximum_vector_signed_zero(<4 x float> %x) {
 ; AVX-NEXT:    vmaxps %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_vector_signed_zero:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxps $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_vector_signed_zero:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
@@ -1102,6 +1230,13 @@ define <2 x double> @test_fminimum_vector_partially_zero(<2 x double> %x) {
 ; AVX-NEXT:    vminpd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_vector_partially_zero:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX10_2-NEXT:    vminmaxpd $0, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_vector_partially_zero:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
@@ -1149,6 +1284,13 @@ define <2 x double> @test_fminimum_vector_different_zeros(<2 x double> %x) {
 ; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_vector_different_zeros:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX10_2-NEXT:    vminmaxpd $0, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_vector_different_zeros:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
@@ -1177,6 +1319,11 @@ define <4 x float> @test_fmaximum_vector_non_zero(<4 x float> %x) {
 ; AVX-NEXT:    vmaxps %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_vector_non_zero:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxps $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_vector_non_zero:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovaps {{.*#+}} xmm1 = [5.0E+0,4.0E+0,3.0E+0,2.0E+0]
@@ -1206,6 +1353,13 @@ define <2 x double> @test_fminimum_vector_nan(<2 x double> %x) {
 ; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_vector_nan:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX10_2-NEXT:    vminmaxpd $0, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_vector_nan:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
@@ -1232,6 +1386,12 @@ define <2 x double> @test_fminimum_vector_zero_first(<2 x double> %x) {
 ; AVX-NEXT:    vminpd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_vector_zero_first:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT:    vminmaxpd $0, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_vector_zero_first:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
@@ -1260,6 +1420,11 @@ define <2 x double> @test_fminimum_vector_signed_zero(<2 x double> %x) {
 ; AVX-NEXT:    vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_vector_signed_zero:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxpd $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_vector_signed_zero:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vcmpunordpd %xmm0, %xmm0, %xmm1
@@ -1284,6 +1449,11 @@ define <4 x float> @test_fmaximum_vector_signed_zero_first(<4 x float> %x) {
 ; AVX-NEXT:    vmaxps %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_vector_signed_zero_first:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxps $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_vector_signed_zero_first:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
@@ -1314,6 +1484,12 @@ define <4 x float> @test_fmaximum_vector_zero(<4 x float> %x) {
 ; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_vector_zero:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT:    vminmaxps $1, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_vector_zero:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
@@ -1369,6 +1545,12 @@ define <4 x float> @test_fmaximum_v4f32_splat(<4 x float> %x, float %y) {
 ; AVX512-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_v4f32_splat:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vbroadcastss %xmm1, %xmm1
+; AVX10_2-NEXT:    vminmaxps $1, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_v4f32_splat:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm1
@@ -1803,6 +1985,11 @@ define <4 x half> @test_fmaximum_v4f16(<4 x half> %x, <4 x half> %y) nounwind {
 ; AVX512-NEXT:    popq %rbp
 ; AVX512-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_v4f16:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxph $1, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_v4f16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    subl $164, %esp
@@ -2330,6 +2517,11 @@ define <4 x bfloat> @test_fmaximum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; AVX512-NEXT:    .cfi_def_cfa_offset 8
 ; AVX512-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_v4bf16:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxnepbf16 $1, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_v4bf16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc
index 36f6afacdf09d..8cfaa18a5cfac 100644
--- a/llvm/test/TableGen/x86-fold-tables.inc
+++ b/llvm/test/TableGen/x86-fold-tables.inc
@@ -3085,9 +3085,12 @@ static const X86FoldTableEntry Table2[] = {
   {X86::VMINMAXPSZ128rri, X86::VMINMAXPSZ128rmi, 0},
   {X86::VMINMAXPSZ256rri, X86::VMINMAXPSZ256rmi, 0},
   {X86::VMINMAXPSZrri, X86::VMINMAXPSZrmi, 0},
-  {X86::VMINMAXSDrri, X86::VMINMAXSDrmi, TB_NO_REVERSE},
-  {X86::VMINMAXSHrri, X86::VMINMAXSHrmi, TB_NO_REVERSE},
-  {X86::VMINMAXSSrri, X86::VMINMAXSSrmi, TB_NO_REVERSE},
+  {X86::VMINMAXSDrri, X86::VMINMAXSDrmi, 0},
+  {X86::VMINMAXSDrri_Int, X86::VMINMAXSDrmi_Int, TB_NO_REVERSE},
+  {X86::VMINMAXSHrri, X86::VMINMAXSHrmi, 0},
+  {X86::VMINMAXSHrri_Int, X86::VMINMAXSHrmi_Int, TB_NO_REVERSE},
+  {X86::VMINMAXSSrri, X86::VMINMAXSSrmi, 0},
+  {X86::VMINMAXSSrri_Int, X86::VMINMAXSSrmi_Int, TB_NO_REVERSE},
   {X86::VMINPBF16Z128rr, X86::VMINPBF16Z128rm, 0},
   {X86::VMINPBF16Z256rr, X86::VMINPBF16Z256rm, 0},
   {X86::VMINPBF16Zrr, X86::VMINPBF16Zrm, 0},
@@ -5131,9 +5134,9 @@ static const X86FoldTableEntry Table3[] = {
   {X86::VMINMAXPSZ128rrikz, X86::VMINMAXPSZ128rmikz, 0},
   {X86::VMINMAXPSZ256rrikz, X86::VMINMAXPSZ256rmikz, 0},
   {X86::VMINMAXPSZrrikz, X86::VMINMAXPSZrmikz, 0},
-  {X86::VMINMAXSDrrikz, X86::VMINMAXSDrmikz, TB_NO_REVERSE},
-  {X86::VMINMAXSHrrikz, X86::VMINMAXSHrmikz, TB_NO_REVERSE},
-  {X86::VMINMAXSSrrikz, X86::VMINMAXSSrmikz, TB_NO_REVERSE},
+  {X86::VMINMAXSDrri_Intkz, X86::VMINMAXSDrmi_Intkz, TB_NO_REVERSE},
+  {X86::VMINMAXSHrri_Intkz, X86::VMINMAXSHrmi_Intkz, TB_NO_REVERSE},
+  {X86::VMINMAXSSrri_Intkz, X86::VMINMAXSSrmi_Intkz, TB_NO_REVERSE},
   {X86::VMINPBF16Z128rrkz, X86::VMINPBF16Z128rmkz, 0},
   {X86::VMINPBF16Z256rrkz, X86::VMINPBF16Z256rmkz, 0},
   {X86::VMINPBF16Zrrkz, X86::VMINPBF16Zrmkz, 0},
@@ -6753,9 +6756,9 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VMINMAXPSZ128rrik, X86::VMINMAXPSZ128rmik, 0},
   {X86::VMINMAXPSZ256rrik, X86::VMINMAXPSZ256rmik, 0},
   {X86::VMINMAXPSZrrik, X86::VMINMAXPSZrmik, 0},
-  {X86::VMINMAXSDrrik, X86::VMINMAXSDrmik, TB_NO_REVERSE},
-  {X86::VMINMAXSHrrik, X86::VMINMAXSHrmik, TB_NO_REVERSE},
-  {X86::VMINMAXSSrrik, X86::VMINMAXSSrmik, TB_NO_REVERSE},
+  {X86::VMINMAXSDrri_Intk, X86::VMINMAXSDrmi_Intk, TB_NO_REVERSE},
+  {X86::VMINMAXSHrri_Intk, X86::VMINMAXSHrmi_Intk, TB_NO_REVERSE},
+  {X86::VMINMAXSSrri_Intk, X86::VMINMAXSSrmi_Intk, TB_NO_REVERSE},
   {X86::VMINPBF16Z128rrk, X86::VMINPBF16Z128rmk, 0},
   {X86::VMINPBF16Z256rrk, X86::VMINPBF16Z256rmk, 0},
   {X86::VMINPBF16Zrrk, X86::VMINPBF16Zrmk, 0},

From 096551537b2a747a3387726ca618ceeb3950e9bc Mon Sep 17 00:00:00 2001
From: Lang Hames 
Date: Thu, 2 Jan 2025 14:35:27 +1100
Subject: [PATCH 255/567] "Reapply "[llvm-jitlink] Use concurrent linking by
 default." with more fixes.

This reapplies edca1d9bad2 which was reverted in 7ec139ad4bc due to bot
failures.

LocalDependencyPropagation.s is updated to use -num-threads=0 in order to
avoid interleaving debugging output.

ELFNixPlatform.h is updated to protect the deferred runtime function calls
map during bootstrap.
---
 .../llvm/ExecutionEngine/Orc/ELFNixPlatform.h |  1 +
 .../x86-64/LocalDependencyPropagation.s       |  6 +-
 llvm/tools/llvm-jitlink/llvm-jitlink-coff.cpp |  2 +
 llvm/tools/llvm-jitlink/llvm-jitlink-elf.cpp  |  2 +
 .../tools/llvm-jitlink/llvm-jitlink-macho.cpp |  2 +
 llvm/tools/llvm-jitlink/llvm-jitlink.cpp      | 56 ++++++++++++++++---
 6 files changed, 58 insertions(+), 11 deletions(-)

diff --git a/llvm/include/llvm/ExecutionEngine/Orc/ELFNixPlatform.h b/llvm/include/llvm/ExecutionEngine/Orc/ELFNixPlatform.h
index 3da5e90a0ec5b..f19cfce16d4ea 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/ELFNixPlatform.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/ELFNixPlatform.h
@@ -156,6 +156,7 @@ class ELFNixPlatform : public Platform {
         RuntimeFunction *func1, RuntimeFunction *func2,
         const shared::WrapperFunctionCall::ArgDataBufferType &arg1,
         const shared::WrapperFunctionCall::ArgDataBufferType &arg2) {
+      std::lock_guard Lock(Mutex);
       auto &argList = DeferredRTFnMap[std::make_pair(func1, func2)];
       argList.emplace_back(arg1, arg2);
     }
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/LocalDependencyPropagation.s b/llvm/test/ExecutionEngine/JITLink/x86-64/LocalDependencyPropagation.s
index 139ef149710c8..0898ad8b18230 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/LocalDependencyPropagation.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/LocalDependencyPropagation.s
@@ -1,14 +1,14 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=x86_64-apple-macosx10.9 -filetype=obj -o %t %s
-# RUN: llvm-jitlink -debug-only=orc -noexec -abs _external_func=0x1 \
-# RUN:   -entry=_foo %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -debug-only=orc -num-threads=0 -noexec \
+# RUN:     -abs _external_func=0x1 -entry=_foo %t 2>&1 | FileCheck %s
 #
 # Check that simplification eliminates dependencies on symbols in this unit,
 # and correctly propagates dependencies on symbols outside the unit (including
 # via locally scoped symbols). In this test _baz depends on _foo indirectly via
 # the local symbol _bar. Initially we expect _baz to depend on _foo, and _foo
 # on _external_func, after simplification we expect both to depend on
-# _external_func only.	
+# _external_func only.
 
 # CHECK: In main emitting {{.*}}_foo{{.*}}
 # CHECK-NEXT: Initial dependencies:
diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink-coff.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink-coff.cpp
index 5271fdb556590..6db78926101fd 100644
--- a/llvm/tools/llvm-jitlink/llvm-jitlink-coff.cpp
+++ b/llvm/tools/llvm-jitlink/llvm-jitlink-coff.cpp
@@ -66,6 +66,8 @@ static Expected getCOFFStubTarget(LinkGraph &G, Block &B) {
 
 namespace llvm {
 Error registerCOFFGraphInfo(Session &S, LinkGraph &G) {
+  std::lock_guard Lock(S.M);
+
   auto FileName = sys::path::filename(G.getName());
   if (S.FileInfos.count(FileName)) {
     return make_error("When -check is passed, file names must be "
diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink-elf.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink-elf.cpp
index a8c804a459e3c..6aa89413b7230 100644
--- a/llvm/tools/llvm-jitlink/llvm-jitlink-elf.cpp
+++ b/llvm/tools/llvm-jitlink/llvm-jitlink-elf.cpp
@@ -101,6 +101,8 @@ static Error registerSymbol(LinkGraph &G, Symbol &Sym, Session::FileInfo &FI,
 namespace llvm {
 
 Error registerELFGraphInfo(Session &S, LinkGraph &G) {
+  std::lock_guard Lock(S.M);
+
   auto FileName = sys::path::filename(G.getName());
   if (S.FileInfos.count(FileName)) {
     return make_error("When -check is passed, file names must be "
diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink-macho.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink-macho.cpp
index 2c60c802293a1..2fc56c9fcc72a 100644
--- a/llvm/tools/llvm-jitlink/llvm-jitlink-macho.cpp
+++ b/llvm/tools/llvm-jitlink/llvm-jitlink-macho.cpp
@@ -69,6 +69,8 @@ static Expected getMachOStubTarget(LinkGraph &G, Block &B) {
 namespace llvm {
 
 Error registerMachOGraphInfo(Session &S, LinkGraph &G) {
+  std::lock_guard Lock(S.M);
+
   auto FileName = sys::path::filename(G.getName());
   if (S.FileInfos.count(FileName)) {
     return make_error("When -check is passed, file names must be "
diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
index 96a3e5b2acdf4..5b23823317279 100644
--- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
+++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
@@ -91,6 +91,10 @@ static cl::list InputFiles(cl::Positional, cl::OneOrMore,
                                         cl::desc("input files"),
                                         cl::cat(JITLinkCategory));
 
+static cl::opt MaterializationThreads(
+    "num-threads", cl::desc("Number of materialization threads to use"),
+    cl::init(std::numeric_limits::max()), cl::cat(JITLinkCategory));
+
 static cl::list
     LibrarySearchPaths("L",
                        cl::desc("Add dir to the list of library search paths"),
@@ -400,6 +404,7 @@ bool lazyLinkingRequested() {
 }
 
 static Error applyHarnessPromotions(Session &S, LinkGraph &G) {
+  std::lock_guard Lock(S.M);
 
   // If this graph is part of the test harness there's nothing to do.
   if (S.HarnessFiles.empty() || S.HarnessFiles.count(G.getName()))
@@ -450,7 +455,11 @@ static Error applyHarnessPromotions(Session &S, LinkGraph &G) {
   return Error::success();
 }
 
-static void dumpSectionContents(raw_ostream &OS, LinkGraph &G) {
+static void dumpSectionContents(raw_ostream &OS, Session &S, LinkGraph &G) {
+  std::lock_guard Lock(S.M);
+
+  outs() << "Relocated section contents for " << G.getName() << ":\n";
+
   constexpr orc::ExecutorAddrDiff DumpWidth = 16;
   static_assert(isPowerOf2_64(DumpWidth), "DumpWidth must be a power of two");
 
@@ -842,7 +851,7 @@ static Expected> launchExecutor() {
     S.CreateMemoryManager = createSharedMemoryManager;
 
   return SimpleRemoteEPC::Create(
-      std::make_unique(std::nullopt),
+      std::make_unique(MaterializationThreads),
       std::move(S), FromExecutor[ReadEnd], ToExecutor[WriteEnd]);
 #endif
 }
@@ -984,10 +993,16 @@ Expected> Session::Create(Triple TT,
     auto PageSize = sys::Process::getPageSize();
     if (!PageSize)
       return PageSize.takeError();
+    std::unique_ptr Dispatcher;
+    if (MaterializationThreads == 0)
+      Dispatcher = std::make_unique();
+    else
+      Dispatcher = std::make_unique(
+          MaterializationThreads);
+
     EPC = std::make_unique(
-        std::make_shared(),
-        std::make_unique(), std::move(TT), *PageSize,
-        createInProcessMemoryManager());
+        std::make_shared(), std::move(Dispatcher),
+        std::move(TT), *PageSize, createInProcessMemoryManager());
   }
 
   Error Err = Error::success();
@@ -1221,6 +1236,7 @@ void Session::modifyPassConfig(LinkGraph &G, PassConfiguration &PassConfig) {
 
   if (ShowGraphsRegex)
     PassConfig.PostFixupPasses.push_back([this](LinkGraph &G) -> Error {
+      std::lock_guard Lock(M);
       // Print graph if ShowLinkGraphs is specified-but-empty, or if
       // it contains the given graph.
       if (ShowGraphsRegex->match(G.getName())) {
@@ -1239,9 +1255,8 @@ void Session::modifyPassConfig(LinkGraph &G, PassConfiguration &PassConfig) {
       [this](LinkGraph &G) { return applyHarnessPromotions(*this, G); });
 
   if (ShowRelocatedSectionContents)
-    PassConfig.PostFixupPasses.push_back([](LinkGraph &G) -> Error {
-      outs() << "Relocated section contents for " << G.getName() << ":\n";
-      dumpSectionContents(outs(), G);
+    PassConfig.PostFixupPasses.push_back([this](LinkGraph &G) -> Error {
+      dumpSectionContents(outs(), *this, G);
       return Error::success();
     });
 
@@ -1613,6 +1628,31 @@ static Error sanitizeArguments(const Triple &TT, const char *ArgV0) {
     }
   }
 
+  if (MaterializationThreads == std::numeric_limits::max()) {
+    if (auto HC = std::thread::hardware_concurrency())
+      MaterializationThreads = HC;
+    else {
+      errs() << "Warning: std::thread::hardware_concurrency() returned 0, "
+                "defaulting to -threads=1.\n";
+      MaterializationThreads = 1;
+    }
+  }
+
+  if (!!OutOfProcessExecutor.getNumOccurrences() ||
+      !!OutOfProcessExecutorConnect.getNumOccurrences()) {
+    if (NoExec)
+      return make_error("-noexec cannot be used with " +
+                                         OutOfProcessExecutor.ArgStr + " or " +
+                                         OutOfProcessExecutorConnect.ArgStr,
+                                     inconvertibleErrorCode());
+
+    if (MaterializationThreads == 0)
+      return make_error("-threads=0 cannot be used with " +
+                                         OutOfProcessExecutor.ArgStr + " or " +
+                                         OutOfProcessExecutorConnect.ArgStr,
+                                     inconvertibleErrorCode());
+  }
+
   // Only one of -oop-executor and -oop-executor-connect can be used.
   if (!!OutOfProcessExecutor.getNumOccurrences() &&
       !!OutOfProcessExecutorConnect.getNumOccurrences())

From bc87a537d9b8117cfd63d5d9b798d6017a99097f Mon Sep 17 00:00:00 2001
From: Nathan Ridge 
Date: Thu, 2 Jan 2025 01:42:21 -0500
Subject: [PATCH 256/567] [clangd] Remove clangd's HasValue GMock matcher
 (#121309)

An equivalent matcher under the name Optional has since been added
upstream to GMock.

Fixes https://github.com/llvm/llvm-project/issues/121308
---
 clang-tools-extra/clangd/unittests/Matchers.h | 68 -------------------
 .../clangd/unittests/TypeHierarchyTests.cpp   | 11 +--
 2 files changed, 6 insertions(+), 73 deletions(-)

diff --git a/clang-tools-extra/clangd/unittests/Matchers.h b/clang-tools-extra/clangd/unittests/Matchers.h
index 0fbd93b2e6882..17d18dd9b85b6 100644
--- a/clang-tools-extra/clangd/unittests/Matchers.h
+++ b/clang-tools-extra/clangd/unittests/Matchers.h
@@ -127,74 +127,6 @@ PolySubsequenceMatcher HasSubsequence(Args &&... M) {
     llvm::consumeError(ComputedValue.takeError());                             \
   } while (false)
 
-// Implements the HasValue(m) matcher for matching an Optional whose
-// value matches matcher m.
-template  class OptionalMatcher {
-public:
-  explicit OptionalMatcher(const InnerMatcher &matcher) : matcher_(matcher) {}
-  OptionalMatcher(const OptionalMatcher&) = default;
-  OptionalMatcher &operator=(const OptionalMatcher&) = delete;
-
-  // This type conversion operator template allows Optional(m) to be
-  // used as a matcher for any Optional type whose value type is
-  // compatible with the inner matcher.
-  //
-  // The reason we do this instead of relying on
-  // MakePolymorphicMatcher() is that the latter is not flexible
-  // enough for implementing the DescribeTo() method of Optional().
-  template  operator Matcher() const {
-    return MakeMatcher(new Impl(matcher_));
-  }
-
-private:
-  // The monomorphic implementation that works for a particular optional type.
-  template 
-  class Impl : public ::testing::MatcherInterface {
-  public:
-    using Value = typename std::remove_const<
-        typename std::remove_reference::type>::type::value_type;
-
-    explicit Impl(const InnerMatcher &matcher)
-        : matcher_(::testing::MatcherCast(matcher)) {}
-
-    Impl(const Impl&) = default;
-    Impl &operator=(const Impl&) = delete;
-
-    virtual void DescribeTo(::std::ostream *os) const {
-      *os << "has a value that ";
-      matcher_.DescribeTo(os);
-    }
-
-    virtual void DescribeNegationTo(::std::ostream *os) const {
-      *os << "does not have a value that ";
-      matcher_.DescribeTo(os);
-    }
-
-    virtual bool
-    MatchAndExplain(Optional optional,
-                    ::testing::MatchResultListener *listener) const {
-      if (!optional)
-        return false;
-
-      *listener << "which has a value ";
-      return MatchPrintAndExplain(*optional, matcher_, listener);
-    }
-
-  private:
-    const Matcher matcher_;
-  };
-
-  const InnerMatcher matcher_;
-};
-
-// Creates a matcher that matches an Optional that has a value
-// that matches inner_matcher.
-template 
-inline OptionalMatcher
-HasValue(const InnerMatcher &inner_matcher) {
-  return OptionalMatcher(inner_matcher);
-}
-
 } // namespace clangd
 } // namespace clang
 #endif
diff --git a/clang-tools-extra/clangd/unittests/TypeHierarchyTests.cpp b/clang-tools-extra/clangd/unittests/TypeHierarchyTests.cpp
index 15158d8a45ca8..406a842f5a008 100644
--- a/clang-tools-extra/clangd/unittests/TypeHierarchyTests.cpp
+++ b/clang-tools-extra/clangd/unittests/TypeHierarchyTests.cpp
@@ -28,6 +28,7 @@ using ::testing::ElementsAre;
 using ::testing::Field;
 using ::testing::IsEmpty;
 using ::testing::Matcher;
+using ::testing::Optional;
 using ::testing::SizeIs;
 using ::testing::UnorderedElementsAre;
 
@@ -38,12 +39,12 @@ MATCHER_P(selectionRangeIs, R, "") { return arg.selectionRange == R; }
 template 
 ::testing::Matcher parents(ParentMatchers... ParentsM) {
   return Field(&TypeHierarchyItem::parents,
-               HasValue(UnorderedElementsAre(ParentsM...)));
+               Optional(UnorderedElementsAre(ParentsM...)));
 }
 template 
 ::testing::Matcher children(ChildMatchers... ChildrenM) {
   return Field(&TypeHierarchyItem::children,
-               HasValue(UnorderedElementsAre(ChildrenM...)));
+               Optional(UnorderedElementsAre(ChildrenM...)));
 }
 // Note: "not resolved" is different from "resolved but empty"!
 MATCHER(parentsNotResolved, "") { return !arg.parents; }
@@ -790,7 +791,7 @@ struct Child : Parent1, Parent2 {};
       Children,
       UnorderedElementsAre(
           AllOf(withName("Child"),
-                withResolveParents(HasValue(UnorderedElementsAre(withResolveID(
+                withResolveParents(Optional(UnorderedElementsAre(withResolveID(
                     getSymbolID(&findDecl(AST, "Parent1")).str())))))));
 }
 
@@ -810,9 +811,9 @@ struct Chil^d : Parent {};
   ASSERT_THAT(Result, SizeIs(1));
   auto Parents = superTypes(Result.front(), Index.get());
 
-  EXPECT_THAT(Parents, HasValue(UnorderedElementsAre(
+  EXPECT_THAT(Parents, Optional(UnorderedElementsAre(
                            AllOf(withName("Parent"),
-                                 withResolveParents(HasValue(IsEmpty()))))));
+                                 withResolveParents(Optional(IsEmpty()))))));
 }
 } // namespace
 } // namespace clangd

From 0c68155002edb30d6b0df3f17fe1f44a01afacd9 Mon Sep 17 00:00:00 2001
From: Lang Hames 
Date: Thu, 2 Jan 2025 18:20:28 +1100
Subject: [PATCH 257/567] [llvm-jitlink] Fix llvm-jitlink for
 LLVM_ENABLE_THREADS=Off.

Commit edca1d9bad2 enabled threaded linking by default in llvm-jitlink, but we
need to handle the case where LLVM is built with -DLLVM_ENABLE_THREADS=Off.

This patch updates the llvm-jitlink tool to switch back to materialization on
the main thread (equivalent to llvm-jitlink -num-threads=0 ...) when LLVM is
built without thread support.
---
 llvm/tools/llvm-jitlink/llvm-jitlink.cpp | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
index 5b23823317279..646d4cef01a57 100644
--- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
+++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
@@ -996,9 +996,14 @@ Expected> Session::Create(Triple TT,
     std::unique_ptr Dispatcher;
     if (MaterializationThreads == 0)
       Dispatcher = std::make_unique();
-    else
+    else {
+#if LLVM_ENABLE_THREADS
       Dispatcher = std::make_unique(
           MaterializationThreads);
+#else
+      llvm_unreachable("MaterializationThreads should be 0");
+#endif
+    }
 
     EPC = std::make_unique(
         std::make_shared(), std::move(Dispatcher),
@@ -1628,15 +1633,24 @@ static Error sanitizeArguments(const Triple &TT, const char *ArgV0) {
     }
   }
 
+#if LLVM_ENABLE_THREADS
   if (MaterializationThreads == std::numeric_limits::max()) {
     if (auto HC = std::thread::hardware_concurrency())
       MaterializationThreads = HC;
     else {
       errs() << "Warning: std::thread::hardware_concurrency() returned 0, "
-                "defaulting to -threads=1.\n";
+                "defaulting to -num-threads=1.\n";
       MaterializationThreads = 1;
     }
   }
+#else
+  if (MaterializationThreads.getNumOccurrences() &&
+      MaterializationThreads != 0) {
+    errs() << "Warning: -num-threads was set, but LLVM was built with threads "
+              "disabled. Resetting to -num-threads=0\n";
+  }
+  MaterializationThreads = 0;
+#endif
 
   if (!!OutOfProcessExecutor.getNumOccurrences() ||
       !!OutOfProcessExecutorConnect.getNumOccurrences()) {

From 641a786bedc054e71764f69f6f448ad4e090c9a4 Mon Sep 17 00:00:00 2001
From: David Green 
Date: Thu, 2 Jan 2025 08:05:44 +0000
Subject: [PATCH 258/567] [AArch64] Add codegen shuffle-select test. NFC

This splits the shuffle-select CostModel test into a separate CodeGen test and
removes the codegen from the CostModel version. An extra fp16 test is added
too.
---
 .../CostModel/AArch64/shuffle-select.ll       | 123 +++++++++-------
 llvm/test/CodeGen/AArch64/shuffle-select.ll   | 137 ++++++++++++++++++
 2 files changed, 206 insertions(+), 54 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/shuffle-select.ll

diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll
index 075397afdce79..9c573c7eb49c7 100644
--- a/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll
@@ -1,97 +1,112 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt < %s -mtriple=aarch64--linux-gnu -passes="print" 2>&1 -disable-output | FileCheck %s --check-prefix=COST
-; RUN: llc < %s -mtriple=aarch64--linux-gnu | FileCheck %s --check-prefix=CODE
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
-; COST-LABEL: sel.v8i8
-; COST:       Found an estimated cost of 28 for instruction: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> 
-; CODE-LABEL: sel.v8i8
-; CODE:       tbl v0.8b, { v0.16b }, v1.8b
-define <8 x i8> @sel.v8i8(<8 x i8> %v0, <8 x i8> %v1) {
+define <8 x i8> @sel_v8i8(<8 x i8> %v0, <8 x i8> %v1) {
+; COST-LABEL: 'sel_v8i8'
+; COST-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> 
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %tmp0
+;
   %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> 
   ret <8 x i8> %tmp0
 }
 
-; COST-LABEL: sel.v16i8
-; COST:       Found an estimated cost of 60 for instruction: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> 
-; CODE-LABEL: sel.v16i8
-; CODE:       tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-define <16 x i8> @sel.v16i8(<16 x i8> %v0, <16 x i8> %v1) {
+define <16 x i8> @sel_v16i8(<16 x i8> %v0, <16 x i8> %v1) {
+; COST-LABEL: 'sel_v16i8'
+; COST-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> 
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %tmp0
+;
   %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> 
   ret <16 x i8> %tmp0
 }
 
-; COST-LABEL: sel.v4i16
-; COST:       Found an estimated cost of 2 for instruction: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> 
-; CODE-LABEL: sel.v4i16
-; CODE:       rev32 v0.4h, v0.4h
-; CODE:       trn2 v0.4h, v0.4h, v1.4h
-define <4 x i16> @sel.v4i16(<4 x i16> %v0, <4 x i16> %v1) {
+define <4 x i16> @sel_v4i16(<4 x i16> %v0, <4 x i16> %v1) {
+; COST-LABEL: 'sel_v4i16'
+; COST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> 
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i16> %tmp0
+;
   %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> 
   ret <4 x i16> %tmp0
 }
 
-; COST-LABEL: sel.v8i16
-; COST:       Found an estimated cost of 28 for instruction: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> 
-; CODE-LABEL: sel.v8i16
-; CODE:       tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-define <8 x i16> @sel.v8i16(<8 x i16> %v0, <8 x i16> %v1) {
+define <8 x i16> @sel_v8i16(<8 x i16> %v0, <8 x i16> %v1) {
+; COST-LABEL: 'sel_v8i16'
+; COST-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> 
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %tmp0
+;
   %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> 
   ret <8 x i16> %tmp0
 }
 
-; COST-LABEL: sel.v2i32
-; COST:        Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> 
-; CODE-LABEL: sel.v2i32
-; CODE:       mov v0.s[1], v1.s[1]
-define <2 x i32> @sel.v2i32(<2 x i32> %v0, <2 x i32> %v1) {
+define <2 x i32> @sel_v2i32(<2 x i32> %v0, <2 x i32> %v1) {
+; COST-LABEL: 'sel_v2i32'
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> 
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %tmp0
+;
   %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> 
   ret <2 x i32> %tmp0
 }
 
-; COST-LABEL: sel.v4i32
-; COST:       Found an estimated cost of 2 for instruction: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> 
-; CODE-LABEL: sel.v4i32
-; CODE:       rev64 v0.4s, v0.4s
-; CODE:       trn2 v0.4s, v0.4s, v1.4s
-define <4 x i32> @sel.v4i32(<4 x i32> %v0, <4 x i32> %v1) {
+define <4 x i32> @sel_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
+; COST-LABEL: 'sel_v4i32'
+; COST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> 
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %tmp0
+;
   %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> 
   ret <4 x i32> %tmp0
 }
 
-; COST-LABEL: sel.v2i64
-; COST:       Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> 
-; CODE-LABEL: sel.v2i64
-; CODE:       mov v0.d[1], v1.d[1]
-define <2 x i64> @sel.v2i64(<2 x i64> %v0, <2 x i64> %v1) {
+define <2 x i64> @sel_v2i64(<2 x i64> %v0, <2 x i64> %v1) {
+; COST-LABEL: 'sel_v2i64'
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> 
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %tmp0
+;
   %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> 
   ret <2 x i64> %tmp0
 }
 
-; COST-LABEL: sel.v2f32
-; COST:       Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> 
-; CODE-LABEL: sel.v2f32
-; CODE:       mov v0.s[1], v1.s[1]
-define <2 x float> @sel.v2f32(<2 x float> %v0, <2 x float> %v1) {
+define <4 x half> @sel_v4f16(<4 x half> %v0, <4 x half> %v1) {
+; COST-LABEL: 'sel_v4f16'
+; COST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> 
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x half> %tmp0
+;
+  %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> 
+  ret <4 x half> %tmp0
+}
+
+define <8 x half> @sel_v8f16(<8 x half> %v0, <8 x half> %v1) {
+; COST-LABEL: 'sel_v8f16'
+; COST-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> 
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x half> %tmp0
+;
+  %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> 
+  ret <8 x half> %tmp0
+}
+
+define <2 x float> @sel_v2f32(<2 x float> %v0, <2 x float> %v1) {
+; COST-LABEL: 'sel_v2f32'
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> 
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %tmp0
+;
   %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> 
   ret <2 x float> %tmp0
 }
 
-; COST-LABEL: sel.v4f32
-; COST:       Found an estimated cost of 2 for instruction: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> 
-; CODE-LABEL: sel.v4f32
-; CODE:       rev64 v0.4s, v0.4s
-; CODE:       trn2 v0.4s, v0.4s, v1.4s
-define <4 x float> @sel.v4f32(<4 x float> %v0, <4 x float> %v1) {
+define <4 x float> @sel_v4f32(<4 x float> %v0, <4 x float> %v1) {
+; COST-LABEL: 'sel_v4f32'
+; COST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> 
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %tmp0
+;
   %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> 
   ret <4 x float> %tmp0
 }
 
-; COST-LABEL: sel.v2f64
-; COST:       Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> 
-; CODE-LABEL: sel.v2f64
-; CODE:       mov v0.d[1], v1.d[1]
-define <2 x double> @sel.v2f64(<2 x double> %v0, <2 x double> %v1) {
+define <2 x double> @sel_v2f64(<2 x double> %v0, <2 x double> %v1) {
+; COST-LABEL: 'sel_v2f64'
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> 
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %tmp0
+;
   %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> 
   ret <2 x double> %tmp0
 }
diff --git a/llvm/test/CodeGen/AArch64/shuffle-select.ll b/llvm/test/CodeGen/AArch64/shuffle-select.ll
new file mode 100644
index 0000000000000..25a935f067bd6
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/shuffle-select.ll
@@ -0,0 +1,137 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=aarch64--linux-gnu | FileCheck %s
+
+define <8 x i8> @sel_v8i8(<8 x i8> %v0, <8 x i8> %v1) {
+; CHECK-LABEL: sel_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    adrp x8, .LCPI0_0
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI0_0]
+; CHECK-NEXT:    tbl v0.8b, { v0.16b }, v1.8b
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> 
+  ret <8 x i8> %tmp0
+}
+
+define <16 x i8> @sel_v16i8(<16 x i8> %v0, <16 x i8> %v1) {
+; CHECK-LABEL: sel_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI1_0
+; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI1_0]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> 
+  ret <16 x i8> %tmp0
+}
+
+define <4 x i16> @sel_v4i16(<4 x i16> %v0, <4 x i16> %v1) {
+; CHECK-LABEL: sel_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev32 v0.4h, v0.4h
+; CHECK-NEXT:    trn2 v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> 
+  ret <4 x i16> %tmp0
+}
+
+define <8 x i16> @sel_v8i16(<8 x i16> %v0, <8 x i16> %v1) {
+; CHECK-LABEL: sel_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI3_0
+; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI3_0]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> 
+  ret <8 x i16> %tmp0
+}
+
+define <2 x i32> @sel_v2i32(<2 x i32> %v0, <2 x i32> %v1) {
+; CHECK-LABEL: sel_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> 
+  ret <2 x i32> %tmp0
+}
+
+define <4 x i32> @sel_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: sel_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev64 v0.4s, v0.4s
+; CHECK-NEXT:    trn2 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> 
+  ret <4 x i32> %tmp0
+}
+
+define <2 x i64> @sel_v2i64(<2 x i64> %v0, <2 x i64> %v1) {
+; CHECK-LABEL: sel_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v0.d[1], v1.d[1]
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> 
+  ret <2 x i64> %tmp0
+}
+
+define <4 x half> @sel_v4f16(<4 x half> %v0, <4 x half> %v1) {
+; CHECK-LABEL: sel_v4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev32 v0.4h, v0.4h
+; CHECK-NEXT:    trn2 v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> 
+  ret <4 x half> %tmp0
+}
+
+define <8 x half> @sel_v8f16(<8 x half> %v0, <8 x half> %v1) {
+; CHECK-LABEL: sel_v8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI8_0
+; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI8_0]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> 
+  ret <8 x half> %tmp0
+}
+
+define <2 x float> @sel_v2f32(<2 x float> %v0, <2 x float> %v1) {
+; CHECK-LABEL: sel_v2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> 
+  ret <2 x float> %tmp0
+}
+
+define <4 x float> @sel_v4f32(<4 x float> %v0, <4 x float> %v1) {
+; CHECK-LABEL: sel_v4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev64 v0.4s, v0.4s
+; CHECK-NEXT:    trn2 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> 
+  ret <4 x float> %tmp0
+}
+
+define <2 x double> @sel_v2f64(<2 x double> %v0, <2 x double> %v1) {
+; CHECK-LABEL: sel_v2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v0.d[1], v1.d[1]
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> 
+  ret <2 x double> %tmp0
+}

From 45e874e39030bc622ea43fbcfc4fcdd1dd404353 Mon Sep 17 00:00:00 2001
From: Timm Baeder 
Date: Thu, 2 Jan 2025 09:15:14 +0100
Subject: [PATCH 259/567] [clang][bytecode] Check for memcpy/memmove dummy
 pointers earlier (#121453)

---
 clang/lib/AST/ByteCode/InterpBuiltin.cpp | 7 ++++---
 clang/test/CodeGen/builtin-memfns.c      | 1 +
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index d0d8b03deab26..e9f3303f958d3 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -1863,6 +1863,10 @@ static bool interp__builtin_memcpy(InterpState &S, CodePtr OpPC,
     return false;
   }
 
+  // Can't read from dummy pointers.
+  if (DestPtr.isDummy() || SrcPtr.isDummy())
+    return false;
+
   QualType DestElemType;
   size_t RemainingDestElems;
   if (DestPtr.getFieldDesc()->isArray()) {
@@ -1925,9 +1929,6 @@ static bool interp__builtin_memcpy(InterpState &S, CodePtr OpPC,
     }
   }
 
-  // As a last resort, reject dummy pointers.
-  if (DestPtr.isDummy() || SrcPtr.isDummy())
-    return false;
   assert(Size.getZExtValue() % DestElemSize == 0);
   if (!DoMemcpy(S, OpPC, SrcPtr, DestPtr, Bytes(Size.getZExtValue()).toBits()))
     return false;
diff --git a/clang/test/CodeGen/builtin-memfns.c b/clang/test/CodeGen/builtin-memfns.c
index 23c3c60b779b3..581eb85eb28e6 100644
--- a/clang/test/CodeGen/builtin-memfns.c
+++ b/clang/test/CodeGen/builtin-memfns.c
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -triple i386-pc-linux-gnu -emit-llvm < %s| FileCheck %s
+// RUN: %clang_cc1 -triple i386-pc-linux-gnu -emit-llvm -fexperimental-new-constant-interpreter < %s| FileCheck %s
 
 typedef __WCHAR_TYPE__ wchar_t;
 typedef __SIZE_TYPE__ size_t;

From a3744f065a3ce38deaf650a8f92941c19980b32a Mon Sep 17 00:00:00 2001
From: Spencer Abson 
Date: Tue, 31 Dec 2024 18:06:02 +0000
Subject: [PATCH 260/567] [clang][AArch64] Remove references to vector size in
 SVE immediate range checking. NFC

---
 .../include/clang/Basic/arm_immcheck_incl.td  | 10 ++++----
 clang/lib/Sema/SemaARM.cpp                    | 24 +++++++++----------
 2 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/clang/include/clang/Basic/arm_immcheck_incl.td b/clang/include/clang/Basic/arm_immcheck_incl.td
index 9d7f74a35aaa8..6892b8299771b 100644
--- a/clang/include/clang/Basic/arm_immcheck_incl.td
+++ b/clang/include/clang/Basic/arm_immcheck_incl.td
@@ -2,7 +2,9 @@ class ImmCheckType {
   int Value = val;
 }
 
-// These must be kept in sync with the flags in include/clang/Basic/TargetBuiltins.h
+
+// For SVE, container_size refers to the width of a vector segment (128b).
+// For NEON, container_size refers to the vector width (64b or 128b).
 def ImmCheck0_31                : ImmCheckType<0>;  // 0..31 (used for e.g. predicate patterns)
 def ImmCheck1_16                : ImmCheckType<1>;  // 1..16
 def ImmCheckExtract             : ImmCheckType<2>;  // 0..(2048/sizeinbits(elt) - 1)
@@ -10,10 +12,10 @@ def ImmCheckShiftRight          : ImmCheckType<3>;  // 1..sizeinbits(elt)
 def ImmCheckShiftRightNarrow    : ImmCheckType<4>;  // 1..sizeinbits(elt)/2
 def ImmCheckShiftLeft           : ImmCheckType<5>;  // 0..(sizeinbits(elt) - 1)
 def ImmCheck0_7                 : ImmCheckType<6>;  // 0..7
-def ImmCheckLaneIndex           : ImmCheckType<7>;  // 0..(sizeinbits(vec)/(sizeinbits(elt)) - 1)
+def ImmCheckLaneIndex           : ImmCheckType<7>;  // 0..(container_size/(sizeinbits(elt)) - 1)
 def ImmCheckCvt                 : ImmCheckType<8>;  // 1..sizeinbits(elt) (same as ShiftRight)
-def ImmCheckLaneIndexCompRotate : ImmCheckType<9>;  // 0..(sizeinbits(vec)/(2*sizeinbits(elt)) - 1)
-def ImmCheckLaneIndexDot        : ImmCheckType<10>; // 0..(sizeinbits(vec)/(4*sizeinbits(elt)) - 1)
+def ImmCheckLaneIndexCompRotate : ImmCheckType<9>;  // 0..(container_size/(2*sizeinbits(elt)) - 1)
+def ImmCheckLaneIndexDot        : ImmCheckType<10>; // 0..(container_size/(4*sizeinbits(elt)) - 1)
 def ImmCheckComplexRot90_270    : ImmCheckType<11>; // [90,270]
 def ImmCheckComplexRotAll90     : ImmCheckType<12>; // [0, 90, 180,270]
 def ImmCheck0_13                : ImmCheckType<13>; // 0..13
diff --git a/clang/lib/Sema/SemaARM.cpp b/clang/lib/Sema/SemaARM.cpp
index 3e93b38143f3b..411baa066f709 100644
--- a/clang/lib/Sema/SemaARM.cpp
+++ b/clang/lib/Sema/SemaARM.cpp
@@ -372,7 +372,7 @@ enum ArmSMEState : unsigned {
 
 bool SemaARM::CheckImmediateArg(CallExpr *TheCall, unsigned CheckTy,
                                 unsigned ArgIdx, unsigned EltBitWidth,
-                                unsigned VecBitWidth) {
+                                unsigned ContainerBitWidth) {
   // Function that checks whether the operand (ArgIdx) is an immediate
   // that is one of a given set of values.
   auto CheckImmediateInSet = [&](std::initializer_list Set,
@@ -445,17 +445,17 @@ bool SemaARM::CheckImmediateArg(CallExpr *TheCall, unsigned CheckTy,
     break;
   case ImmCheckType::ImmCheckLaneIndex:
     if (SemaRef.BuiltinConstantArgRange(TheCall, ArgIdx, 0,
-                                        (VecBitWidth / EltBitWidth) - 1))
+                                        (ContainerBitWidth / EltBitWidth) - 1))
       return true;
     break;
   case ImmCheckType::ImmCheckLaneIndexCompRotate:
-    if (SemaRef.BuiltinConstantArgRange(TheCall, ArgIdx, 0,
-                                        (VecBitWidth / (2 * EltBitWidth)) - 1))
+    if (SemaRef.BuiltinConstantArgRange(
+            TheCall, ArgIdx, 0, (ContainerBitWidth / (2 * EltBitWidth)) - 1))
       return true;
     break;
   case ImmCheckType::ImmCheckLaneIndexDot:
-    if (SemaRef.BuiltinConstantArgRange(TheCall, ArgIdx, 0,
-                                        (VecBitWidth / (4 * EltBitWidth)) - 1))
+    if (SemaRef.BuiltinConstantArgRange(
+            TheCall, ArgIdx, 0, (ContainerBitWidth / (4 * EltBitWidth)) - 1))
       return true;
     break;
   case ImmCheckType::ImmCheckComplexRot90_270:
@@ -515,13 +515,13 @@ bool SemaARM::PerformNeonImmChecks(
   bool HasError = false;
 
   for (const auto &I : ImmChecks) {
-    auto [ArgIdx, CheckTy, ElementSizeInBits, VecSizeInBits] = I;
+    auto [ArgIdx, CheckTy, ElementBitWidth, VecBitWidth] = I;
 
     if (OverloadType >= 0)
-      ElementSizeInBits = NeonTypeFlags(OverloadType).getEltSizeInBits();
+      ElementBitWidth = NeonTypeFlags(OverloadType).getEltSizeInBits();
 
-    HasError |= CheckImmediateArg(TheCall, CheckTy, ArgIdx, ElementSizeInBits,
-                                  VecSizeInBits);
+    HasError |= CheckImmediateArg(TheCall, CheckTy, ArgIdx, ElementBitWidth,
+                                  VecBitWidth);
   }
 
   return HasError;
@@ -532,9 +532,9 @@ bool SemaARM::PerformSVEImmChecks(
   bool HasError = false;
 
   for (const auto &I : ImmChecks) {
-    auto [ArgIdx, CheckTy, ElementSizeInBits] = I;
+    auto [ArgIdx, CheckTy, ElementBitWidth] = I;
     HasError |=
-        CheckImmediateArg(TheCall, CheckTy, ArgIdx, ElementSizeInBits, 128);
+        CheckImmediateArg(TheCall, CheckTy, ArgIdx, ElementBitWidth, 128);
   }
 
   return HasError;

From d5c8af492f2d8620b04330024d46a5f48db546fe Mon Sep 17 00:00:00 2001
From: Timm Baeder 
Date: Thu, 2 Jan 2025 10:31:49 +0100
Subject: [PATCH 261/567] [clang][bytecode] Consider start index when copying
 composite array (#121461)

... elements.
---
 clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp |  2 +-
 clang/test/AST/ByteCode/builtin-functions.cpp   | 10 ++++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp b/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp
index 0fc94e1694822..57c1fab5d6ab4 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp
@@ -110,7 +110,7 @@ static bool enumerateData(const Pointer &P, const Context &Ctx, Bits Offset,
   if (FieldDesc->isCompositeArray()) {
     QualType ElemType = FieldDesc->getElemQualType();
     Bits ElemSize = Bits(Ctx.getASTContext().getTypeSize(ElemType));
-    for (unsigned I = 0; I != FieldDesc->getNumElems(); ++I) {
+    for (unsigned I = P.getIndex(); I != FieldDesc->getNumElems(); ++I) {
       enumerateData(P.atIndex(I).narrow(), Ctx, Offset, BitsToRead, F);
       Offset += ElemSize;
       if (Offset >= BitsToRead)
diff --git a/clang/test/AST/ByteCode/builtin-functions.cpp b/clang/test/AST/ByteCode/builtin-functions.cpp
index b0f8ea2e55ee0..0188e8297db52 100644
--- a/clang/test/AST/ByteCode/builtin-functions.cpp
+++ b/clang/test/AST/ByteCode/builtin-functions.cpp
@@ -1253,6 +1253,16 @@ namespace BuiltinMemcpy {
   static_assert(test_memmove(2, 0, 12) == 4234); // both-error {{constant}} \
                                                  // both-note {{in call}}
 #endif
+
+  struct Trivial { char k; short s; constexpr bool ok() { return k == 3 && s == 4; } };
+  constexpr bool test_trivial() {
+    Trivial arr[3] = {{1, 2}, {3, 4}, {5, 6}};
+    __builtin_memcpy(arr, arr+1, sizeof(Trivial));
+    __builtin_memmove(arr+1, arr, 2 * sizeof(Trivial));
+
+    return arr[0].ok() && arr[1].ok() && arr[2].ok();
+  }
+  static_assert(test_trivial());
 }
 
 namespace Memcmp {

From 34097c07e151fef0e5c645e1dac7f4872774317b Mon Sep 17 00:00:00 2001
From: Timm Baeder 
Date: Thu, 2 Jan 2025 10:59:08 +0100
Subject: [PATCH 262/567] [clang][bytecode] Consider unknown-size arrays in
 memcpy/memcmp (#121462)

When emitting diagnostics for the number of elements.
---
 clang/lib/AST/ByteCode/InterpBuiltin.cpp      | 8 ++++++--
 clang/test/AST/ByteCode/builtin-functions.cpp | 9 +++++++++
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index e9f3303f958d3..b5849553d0bf5 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -1871,7 +1871,9 @@ static bool interp__builtin_memcpy(InterpState &S, CodePtr OpPC,
   size_t RemainingDestElems;
   if (DestPtr.getFieldDesc()->isArray()) {
     DestElemType = DestPtr.getFieldDesc()->getElemQualType();
-    RemainingDestElems = (DestPtr.getNumElems() - DestPtr.getIndex());
+    RemainingDestElems = DestPtr.isUnknownSizeArray()
+                             ? 0
+                             : (DestPtr.getNumElems() - DestPtr.getIndex());
   } else {
     DestElemType = DestPtr.getType();
     RemainingDestElems = 1;
@@ -1890,7 +1892,9 @@ static bool interp__builtin_memcpy(InterpState &S, CodePtr OpPC,
   size_t RemainingSrcElems;
   if (SrcPtr.getFieldDesc()->isArray()) {
     SrcElemType = SrcPtr.getFieldDesc()->getElemQualType();
-    RemainingSrcElems = (SrcPtr.getNumElems() - SrcPtr.getIndex());
+    RemainingSrcElems = SrcPtr.isUnknownSizeArray()
+                            ? 0
+                            : (SrcPtr.getNumElems() - SrcPtr.getIndex());
   } else {
     SrcElemType = SrcPtr.getType();
     RemainingSrcElems = 1;
diff --git a/clang/test/AST/ByteCode/builtin-functions.cpp b/clang/test/AST/ByteCode/builtin-functions.cpp
index 0188e8297db52..723764010d9a3 100644
--- a/clang/test/AST/ByteCode/builtin-functions.cpp
+++ b/clang/test/AST/ByteCode/builtin-functions.cpp
@@ -1263,6 +1263,15 @@ namespace BuiltinMemcpy {
     return arr[0].ok() && arr[1].ok() && arr[2].ok();
   }
   static_assert(test_trivial());
+
+  // Check that an incomplete array is rejected.
+  constexpr int test_incomplete_array_type() { // both-error {{never produces a constant}}
+    extern int arr[];
+    __builtin_memmove(arr, arr, 4 * sizeof(arr[0]));
+    // both-note@-1 2{{'memmove' not supported: source is not a contiguous array of at least 4 elements of type 'int'}}
+    return arr[0] * 1000 + arr[1] * 100 + arr[2] * 10 + arr[3];
+  }
+  static_assert(test_incomplete_array_type() == 1234); // both-error {{constant}} both-note {{in call}}
 }
 
 namespace Memcmp {

From 4c5c5e2f41e62e4be9ca7bb6c42221cc11700321 Mon Sep 17 00:00:00 2001
From: ZhaoQi 
Date: Thu, 2 Jan 2025 18:09:06 +0800
Subject: [PATCH 263/567] [JITLink][LoongArch] Add R_LARCH_{B16,B21}
 relocations support (#121096)

---
 .../llvm/ExecutionEngine/JITLink/loongarch.h  | 75 +++++++++++++++++++
 .../ExecutionEngine/JITLink/ELF_loongarch.cpp |  4 +
 .../lib/ExecutionEngine/JITLink/loongarch.cpp |  2 +
 .../LoongArch/ELF_loongarch32_relocations.s   | 24 ++++++
 .../LoongArch/ELF_loongarch64_relocations.s   | 26 ++++++-
 5 files changed, 130 insertions(+), 1 deletion(-)

diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/loongarch.h b/llvm/include/llvm/ExecutionEngine/JITLink/loongarch.h
index 39a7db32258ce..d31c749bad1b1 100644
--- a/llvm/include/llvm/ExecutionEngine/JITLink/loongarch.h
+++ b/llvm/include/llvm/ExecutionEngine/JITLink/loongarch.h
@@ -41,6 +41,50 @@ enum EdgeKind_loongarch : Edge::Kind {
   ///
   Pointer32,
 
+  /// A 16-bit PC-relative branch.
+  ///
+  /// Represents a PC-relative branch to a target within +/-128Kb. The target
+  /// must be 4-byte aligned.
+  ///
+  /// Fixup expression:
+  ///   Fixup <- (Target - Fixup + Addend) >> 2 : int16
+  ///
+  /// Notes:
+  ///   The '16' in the name refers to the number operand bits and follows the
+  /// naming convention used by the corresponding ELF relocations. Since the low
+  /// two bits must be zero (because of the 4-byte alignment of the target) the
+  /// operand is effectively a signed 18-bit number.
+  ///
+  /// Errors:
+  ///   - The result of the unshifted part of the fixup expression must be
+  ///     4-byte aligned otherwise an alignment error will be returned.
+  ///   - The result of the fixup expression must fit into an int16 otherwise an
+  ///     out-of-range error will be returned.
+  ///
+  Branch16PCRel,
+
+  /// A 21-bit PC-relative branch.
+  ///
+  /// Represents a PC-relative branch to a target within +/-4Mb. The Target must
+  /// be 4-byte aligned.
+  ///
+  /// Fixup expression:
+  ///   Fixup <- (Target - Fixup + Addend) >> 2 : int21
+  ///
+  /// Notes:
+  ///   The '21' in the name refers to the number operand bits and follows the
+  /// naming convention used by the corresponding ELF relocations. Since the low
+  /// two bits must be zero (because of the 4-byte alignment of the target) the
+  /// operand is effectively a signed 23-bit number.
+  ///
+  /// Errors:
+  ///   - The result of the unshifted part of the fixup expression must be
+  ///     4-byte aligned otherwise an alignment error will be returned.
+  ///   - The result of the fixup expression must fit into an int21 otherwise an
+  ///     out-of-range error will be returned.
+  ///
+  Branch21PCRel,
+
   /// A 26-bit PC-relative branch.
   ///
   /// Represents a PC-relative call or branch to a target within +/-128Mb. The
@@ -213,6 +257,37 @@ inline Error applyFixup(LinkGraph &G, Block &B, const Edge &E) {
     *(ulittle32_t *)FixupPtr = Value;
     break;
   }
+  case Branch16PCRel: {
+    int64_t Value = TargetAddress - FixupAddress + Addend;
+
+    if (!isInt<18>(Value))
+      return makeTargetOutOfRangeError(G, B, E);
+
+    if (!isShiftedInt<16, 2>(Value))
+      return makeAlignmentError(orc::ExecutorAddr(FixupAddress), Value, 4, E);
+
+    uint32_t RawInstr = *(little32_t *)FixupPtr;
+    uint32_t Imm = static_cast(Value >> 2);
+    uint32_t Imm15_0 = extractBits(Imm, /*Hi=*/15, /*Lo=*/0) << 10;
+    *(little32_t *)FixupPtr = RawInstr | Imm15_0;
+    break;
+  }
+  case Branch21PCRel: {
+    int64_t Value = TargetAddress - FixupAddress + Addend;
+
+    if (!isInt<23>(Value))
+      return makeTargetOutOfRangeError(G, B, E);
+
+    if (!isShiftedInt<21, 2>(Value))
+      return makeAlignmentError(orc::ExecutorAddr(FixupAddress), Value, 4, E);
+
+    uint32_t RawInstr = *(little32_t *)FixupPtr;
+    uint32_t Imm = static_cast(Value >> 2);
+    uint32_t Imm15_0 = extractBits(Imm, /*Hi=*/15, /*Lo=*/0) << 10;
+    uint32_t Imm20_16 = extractBits(Imm, /*Hi=*/20, /*Lo=*/16);
+    *(little32_t *)FixupPtr = RawInstr | Imm15_0 | Imm20_16;
+    break;
+  }
   case Branch26PCRel: {
     int64_t Value = TargetAddress - FixupAddress + Addend;
 
diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_loongarch.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_loongarch.cpp
index 56c32aeecf55a..a12e9f33e80a6 100644
--- a/llvm/lib/ExecutionEngine/JITLink/ELF_loongarch.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/ELF_loongarch.cpp
@@ -58,6 +58,10 @@ class ELFLinkGraphBuilder_loongarch : public ELFLinkGraphBuilder {
       return Pointer32;
     case ELF::R_LARCH_32_PCREL:
       return Delta32;
+    case ELF::R_LARCH_B16:
+      return Branch16PCRel;
+    case ELF::R_LARCH_B21:
+      return Branch21PCRel;
     case ELF::R_LARCH_B26:
       return Branch26PCRel;
     case ELF::R_LARCH_PCALA_HI20:
diff --git a/llvm/lib/ExecutionEngine/JITLink/loongarch.cpp b/llvm/lib/ExecutionEngine/JITLink/loongarch.cpp
index 010c0ed6713d4..cdb3da04354ee 100644
--- a/llvm/lib/ExecutionEngine/JITLink/loongarch.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/loongarch.cpp
@@ -44,6 +44,8 @@ const char *getEdgeKindName(Edge::Kind K) {
     KIND_NAME_CASE(Delta32)
     KIND_NAME_CASE(NegDelta32)
     KIND_NAME_CASE(Delta64)
+    KIND_NAME_CASE(Branch16PCRel)
+    KIND_NAME_CASE(Branch21PCRel)
     KIND_NAME_CASE(Branch26PCRel)
     KIND_NAME_CASE(Page20)
     KIND_NAME_CASE(PageOffset12)
diff --git a/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch32_relocations.s b/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch32_relocations.s
index 23f6acc307b98..da9f9982aade7 100644
--- a/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch32_relocations.s
+++ b/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch32_relocations.s
@@ -103,6 +103,30 @@ test_gotoffset12_external:
     ld.w $a0, $a0, %got_pc_lo12(external_data)
     .size test_gotoffset12_external, .-test_gotoffset12_external
 
+## Check R_LARCH_B16 relocation for compare and branch instructions.
+
+# jitlink-check: decode_operand(test_br16, 2)[17:0] = \
+# jitlink-check:   (test_br16_target - test_br16)[17:0]
+    .globl test_br16, test_br16_target
+    .p2align 2
+test_br16:
+    beq $t1, $t2, %b16(test_br16_target)
+    .skip (1 << 16)
+test_br16_target:
+    .size test_br16, .-test_br16
+
+## Check R_LARCH_B21 relocation for compare and branch instructions.
+
+# jitlink-check: decode_operand(test_br21, 1)[22:0] = \
+# jitlink-check:   (test_br21_target - test_br21)[22:0]
+    .globl test_br21, test_br21_target
+    .p2align 2
+test_br21:
+    beqz $t1, %b21(test_br21_target)
+    .skip (1 << 21)
+test_br21_target:
+    .size test_br21, .-test_br21
+
 
     .globl named_data
     .p2align 4
diff --git a/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch64_relocations.s b/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch64_relocations.s
index f07ac5422b8fc..a390d1b895f79 100644
--- a/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch64_relocations.s
+++ b/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch64_relocations.s
@@ -116,7 +116,6 @@ test_gotoffset12_external:
     ld.d $a0, $a0, %got_pc_lo12(external_data)
     .size test_gotoffset12_external, .-test_gotoffset12_external
 
-
 ## Check R_LARCH_CALL36 relocation of a local function call.
 
 # jitlink-check: decode_operand(local_func_call36, 1)[19:0] = \
@@ -130,6 +129,31 @@ local_func_call36:
     jirl $ra, $ra, 0
     .size local_func_call36, .-local_func_call36
 
+## Check R_LARCH_B16 relocation for compare and branch instructions.
+
+# jitlink-check: decode_operand(test_br16, 2)[17:0] = \
+# jitlink-check:   (test_br16_target - test_br16)[17:0]
+    .globl test_br16, test_br16_target
+    .p2align 2
+test_br16:
+    beq $t1, $t2, %b16(test_br16_target)
+    .skip (1 << 16)
+test_br16_target:
+    .size test_br16, .-test_br16
+
+## Check R_LARCH_B21 relocation for compare and branch instructions.
+
+# jitlink-check: decode_operand(test_br21, 1)[22:0] = \
+# jitlink-check:   (test_br21_target - test_br21)[22:0]
+    .globl test_br21, test_br21_target
+    .p2align 2
+test_br21:
+    beqz $t1, %b21(test_br21_target)
+    .skip (1 << 21)
+test_br21_target:
+    .size test_br21, .-test_br21
+
+
     .globl named_data
     .p2align 4
     .type named_data,@object

From 3ddc9f06ae61e916b333b096cef3560f0f5c6272 Mon Sep 17 00:00:00 2001
From: David Green 
Date: Thu, 2 Jan 2025 10:13:51 +0000
Subject: [PATCH 264/567] [AArch64] Additional shuffle subvector-extract cost
 tests. NFC

A Phase Ordering test for intrinsic shuffles is also added, showing a recent
regression from vector combining.
---
 .../CostModel/AArch64/shuffle-extract.ll      | 174 ++++
 llvm/test/CodeGen/AArch64/shuffle-select.ll   |  34 +-
 .../AArch64/block_scaling_decompr_8bit.ll     | 804 ++++++++++++++++++
 3 files changed, 1008 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/Analysis/CostModel/AArch64/shuffle-extract.ll
 create mode 100644 llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll

diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-extract.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-extract.ll
new file mode 100644
index 0000000000000..50356196b8381
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-extract.ll
@@ -0,0 +1,174 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -mtriple=aarch64--linux-gnu -passes="print" 2>&1 -disable-output | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+define void @extract_half() {
+; CHECK-LABEL: 'extract_half'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2i8_lo = shufflevector <2 x i8> poison, <2 x i8> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_hi = shufflevector <2 x i8> poison, <2 x i8> poison, <1 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i8_lo = shufflevector <4 x i8> poison, <4 x i8> poison, <2 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_mi = shufflevector <4 x i8> poison, <4 x i8> poison, <2 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_hi = shufflevector <4 x i8> poison, <4 x i8> poison, <2 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i8_lo = shufflevector <8 x i8> poison, <8 x i8> poison, <4 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i8_mi = shufflevector <8 x i8> poison, <8 x i8> poison, <4 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i8_hi = shufflevector <8 x i8> poison, <8 x i8> poison, <4 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i8_lo = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_mi = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_hi = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2i16_lo = shufflevector <2 x i16> poison, <2 x i16> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_hi = shufflevector <2 x i16> poison, <2 x i16> poison, <1 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i16_lo = shufflevector <4 x i16> poison, <4 x i16> poison, <2 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16_mi = shufflevector <4 x i16> poison, <4 x i16> poison, <2 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16_hi = shufflevector <4 x i16> poison, <4 x i16> poison, <2 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i16_lo = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16_mi = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16_hi = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i16_lo = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i16_mi = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i16_hi = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2i32_lo = shufflevector <2 x i32> poison, <2 x i32> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_hi = shufflevector <2 x i32> poison, <2 x i32> poison, <1 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i32_lo = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_mi = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_hi = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i32_lo = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i32_mi = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i32_hi = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i32_lo = shufflevector <16 x i32> poison, <16 x i32> poison, <8 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i32_mi = shufflevector <16 x i32> poison, <16 x i32> poison, <8 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i32_hi = shufflevector <16 x i32> poison, <16 x i32> poison, <8 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2i64_lo = shufflevector <2 x i64> poison, <2 x i64> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_hi = shufflevector <2 x i64> poison, <2 x i64> poison, <1 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i64_lo = shufflevector <4 x i64> poison, <4 x i64> poison, <2 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_mi = shufflevector <4 x i64> poison, <4 x i64> poison, <2 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_hi = shufflevector <4 x i64> poison, <4 x i64> poison, <2 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i64_lo = shufflevector <8 x i64> poison, <8 x i64> poison, <4 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_mi = shufflevector <8 x i64> poison, <8 x i64> poison, <4 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_hi = shufflevector <8 x i64> poison, <8 x i64> poison, <4 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v2i8_lo = shufflevector <2 x i8> poison, <2 x i8> poison, <1 x i32> 
+  %v2i8_hi = shufflevector <2 x i8> poison, <2 x i8> poison, <1 x i32> 
+  %v4i8_lo = shufflevector <4 x i8> poison, <4 x i8> poison, <2 x i32> 
+  %v4i8_mi = shufflevector <4 x i8> poison, <4 x i8> poison, <2 x i32> 
+  %v4i8_hi = shufflevector <4 x i8> poison, <4 x i8> poison, <2 x i32> 
+  %v8i8_lo = shufflevector <8 x i8> poison, <8 x i8> poison, <4 x i32> 
+  %v8i8_mi = shufflevector <8 x i8> poison, <8 x i8> poison, <4 x i32> 
+  %v8i8_hi = shufflevector <8 x i8> poison, <8 x i8> poison, <4 x i32> 
+  %v16i8_lo = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> 
+  %v16i8_mi = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> 
+  %v16i8_hi = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> 
+
+  %v2i16_lo = shufflevector <2 x i16> poison, <2 x i16> poison, <1 x i32> 
+  %v2i16_hi = shufflevector <2 x i16> poison, <2 x i16> poison, <1 x i32> 
+  %v4i16_lo = shufflevector <4 x i16> poison, <4 x i16> poison, <2 x i32> 
+  %v4i16_mi = shufflevector <4 x i16> poison, <4 x i16> poison, <2 x i32> 
+  %v4i16_hi = shufflevector <4 x i16> poison, <4 x i16> poison, <2 x i32> 
+  %v8i16_lo = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> 
+  %v8i16_mi = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> 
+  %v8i16_hi = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> 
+  %v16i16_lo = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> 
+  %v16i16_mi = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> 
+  %v16i16_hi = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> 
+
+  %v2i32_lo = shufflevector <2 x i32> poison, <2 x i32> poison, <1 x i32> 
+  %v2i32_hi = shufflevector <2 x i32> poison, <2 x i32> poison, <1 x i32> 
+  %v4i32_lo = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> 
+  %v4i32_mi = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> 
+  %v4i32_hi = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> 
+  %v8i32_lo = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> 
+  %v8i32_mi = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> 
+  %v8i32_hi = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> 
+  %v16i32_lo = shufflevector <16 x i32> poison, <16 x i32> poison, <8 x i32> 
+  %v16i32_mi = shufflevector <16 x i32> poison, <16 x i32> poison, <8 x i32> 
+  %v16i32_hi = shufflevector <16 x i32> poison, <16 x i32> poison, <8 x i32> 
+
+  %v2i64_lo = shufflevector <2 x i64> poison, <2 x i64> poison, <1 x i32> 
+  %v2i64_hi = shufflevector <2 x i64> poison, <2 x i64> poison, <1 x i32> 
+  %v4i64_lo = shufflevector <4 x i64> poison, <4 x i64> poison, <2 x i32> 
+  %v4i64_mi = shufflevector <4 x i64> poison, <4 x i64> poison, <2 x i32> 
+  %v4i64_hi = shufflevector <4 x i64> poison, <4 x i64> poison, <2 x i32> 
+  %v8i64_lo = shufflevector <8 x i64> poison, <8 x i64> poison, <4 x i32> 
+  %v8i64_mi = shufflevector <8 x i64> poison, <8 x i64> poison, <4 x i32> 
+  %v8i64_hi = shufflevector <8 x i64> poison, <8 x i64> poison, <4 x i32> 
+
+  ret void
+}
+
+define void @extract_qtr() { ; Cost-model coverage: extracting a quarter-width subvector at the low/middle/high position.
+; CHECK-LABEL: 'extract_qtr'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i8_lo = shufflevector <4 x i8> poison, <4 x i8> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_mi = shufflevector <4 x i8> poison, <4 x i8> poison, <1 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_hi = shufflevector <4 x i8> poison, <4 x i8> poison, <1 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i8_lo = shufflevector <8 x i8> poison, <8 x i8> poison, <2 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i8_mi = shufflevector <8 x i8> poison, <8 x i8> poison, <2 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i8_hi = shufflevector <8 x i8> poison, <8 x i8> poison, <2 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i8_lo = shufflevector <16 x i8> poison, <16 x i8> poison, <4 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_mi = shufflevector <16 x i8> poison, <16 x i8> poison, <4 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_hi = shufflevector <16 x i8> poison, <16 x i8> poison, <4 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i16_lo = shufflevector <4 x i16> poison, <4 x i16> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16_mi = shufflevector <4 x i16> poison, <4 x i16> poison, <1 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16_hi = shufflevector <4 x i16> poison, <4 x i16> poison, <1 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i16_lo = shufflevector <8 x i16> poison, <8 x i16> poison, <2 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16_mi = shufflevector <8 x i16> poison, <8 x i16> poison, <2 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16_hi = shufflevector <8 x i16> poison, <8 x i16> poison, <2 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i16_lo = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i16_mi = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i16_hi = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i32_lo = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_mi = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_hi = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i32_lo = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i32_mi = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i32_hi = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i32_lo = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i32_mi = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i32_hi = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i64_lo = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_mi = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_hi = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i64_lo = shufflevector <8 x i64> poison, <8 x i64> poison, <2 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_mi = shufflevector <8 x i64> poison, <8 x i64> poison, <2 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_hi = shufflevector <8 x i64> poison, <8 x i64> poison, <2 x i32> 
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v4i8_lo = shufflevector <4 x i8> poison, <4 x i8> poison, <1 x i32> ; i8 cases. NOTE(review): shuffle-mask constants appear truncated throughout this patch — verify against the regenerated test (update_analyze_test_checks.py).
+  %v4i8_mi = shufflevector <4 x i8> poison, <4 x i8> poison, <1 x i32> 
+  %v4i8_hi = shufflevector <4 x i8> poison, <4 x i8> poison, <1 x i32> 
+  %v8i8_lo = shufflevector <8 x i8> poison, <8 x i8> poison, <2 x i32> 
+  %v8i8_mi = shufflevector <8 x i8> poison, <8 x i8> poison, <2 x i32> 
+  %v8i8_hi = shufflevector <8 x i8> poison, <8 x i8> poison, <2 x i32> 
+  %v16i8_lo = shufflevector <16 x i8> poison, <16 x i8> poison, <4 x i32> 
+  %v16i8_mi = shufflevector <16 x i8> poison, <16 x i8> poison, <4 x i32> 
+  %v16i8_hi = shufflevector <16 x i8> poison, <16 x i8> poison, <4 x i32> 
+
+  %v4i16_lo = shufflevector <4 x i16> poison, <4 x i16> poison, <1 x i32> ; i16 cases.
+  %v4i16_mi = shufflevector <4 x i16> poison, <4 x i16> poison, <1 x i32> 
+  %v4i16_hi = shufflevector <4 x i16> poison, <4 x i16> poison, <1 x i32> 
+  %v8i16_lo = shufflevector <8 x i16> poison, <8 x i16> poison, <2 x i32> 
+  %v8i16_mi = shufflevector <8 x i16> poison, <8 x i16> poison, <2 x i32> 
+  %v8i16_hi = shufflevector <8 x i16> poison, <8 x i16> poison, <2 x i32> 
+  %v16i16_lo = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> 
+  %v16i16_mi = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> 
+  %v16i16_hi = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> 
+
+  %v4i32_lo = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> ; i32 cases.
+  %v4i32_mi = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> 
+  %v4i32_hi = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> 
+  %v8i32_lo = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> 
+  %v8i32_mi = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> 
+  %v8i32_hi = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> 
+  %v16i32_lo = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> 
+  %v16i32_mi = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> 
+  %v16i32_hi = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> 
+
+  %v4i64_lo = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> ; i64 cases.
+  %v4i64_mi = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> 
+  %v4i64_hi = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> 
+  %v8i64_lo = shufflevector <8 x i64> poison, <8 x i64> poison, <2 x i32> 
+  %v8i64_mi = shufflevector <8 x i64> poison, <8 x i64> poison, <2 x i32> 
+  %v8i64_hi = shufflevector <8 x i64> poison, <8 x i64> poison, <2 x i32> 
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/shuffle-select.ll b/llvm/test/CodeGen/AArch64/shuffle-select.ll
index 25a935f067bd6..eeccaa170397d 100644
--- a/llvm/test/CodeGen/AArch64/shuffle-select.ll
+++ b/llvm/test/CodeGen/AArch64/shuffle-select.ll
@@ -28,6 +28,32 @@ define <16 x i8> @sel_v16i8(<16 x i8> %v0, <16 x i8> %v1) {
   ret <16 x i8> %tmp0
 }
 
+define <16 x i8> @sel_v16i8_poison(<16 x i8> %v0, <16 x i8> %v1) { ; Select-style two-input shuffle containing poison lanes; expected to lower to a two-register TBL per the CHECK lines below.
+; CHECK-LABEL: sel_v16i8_poison:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI2_0
+; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI2_0]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> ; NOTE(review): the mask constant appears truncated in this patch — confirm against the upstream test file.
+  ret <16 x i8> %tmp0
+}
+
+define <16 x i8> @sel_v16i8_unregular(<16 x i8> %v0, <16 x i8> %v1) { ; Irregular (non-select-pattern) two-input shuffle; also expected to lower to a two-register TBL per the CHECK lines below.
+; CHECK-LABEL: sel_v16i8_unregular:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI3_0
+; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI3_0]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> ; NOTE(review): the mask constant appears truncated in this patch — confirm against the upstream test file.
+  ret <16 x i8> %tmp0
+}
+
 define <4 x i16> @sel_v4i16(<4 x i16> %v0, <4 x i16> %v1) {
 ; CHECK-LABEL: sel_v4i16:
 ; CHECK:       // %bb.0:
@@ -41,9 +67,9 @@ define <4 x i16> @sel_v4i16(<4 x i16> %v0, <4 x i16> %v1) {
 define <8 x i16> @sel_v8i16(<8 x i16> %v0, <8 x i16> %v1) {
 ; CHECK-LABEL: sel_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI3_0
+; CHECK-NEXT:    adrp x8, .LCPI5_0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI3_0]
+; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI5_0]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-NEXT:    ret
@@ -95,9 +121,9 @@ define <4 x half> @sel_v4f16(<4 x half> %v0, <4 x half> %v1) {
 define <8 x half> @sel_v8f16(<8 x half> %v0, <8 x half> %v1) {
 ; CHECK-LABEL: sel_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI8_0
+; CHECK-NEXT:    adrp x8, .LCPI10_0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI8_0]
+; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI10_0]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll
new file mode 100644
index 0000000000000..7d9524420286d
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll
@@ -0,0 +1,804 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes='default' -S %s | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "aarch64"
+
+%struct.cmplx_int16_t = type { i16, i16 }
+%struct.compressed_data_8bit = type { i8, [24 x i8] }
+
+define dso_local noundef i32 @_Z33block_scaling_decompr_8bitjPK27compressed_data_8bitP20cmplx_int16_tPKS2_(i32 noundef %n_prb, ptr noundef %src, ptr noundef %dst, ptr noundef %scale) #0 {
+; CHECK-LABEL: define dso_local noundef i32 @_Z33block_scaling_decompr_8bitjPK27compressed_data_8bitP20cmplx_int16_tPKS2_(
+; CHECK-SAME: i32 noundef [[N_PRB:%.*]], ptr nocapture noundef readonly [[SRC:%.*]], ptr nocapture noundef writeonly [[DST:%.*]], ptr noundef readonly [[SCALE:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP47_NOT:%.*]] = icmp eq i32 [[N_PRB]], 0
+; CHECK-NEXT:    br i1 [[CMP47_NOT]], label %[[FOR_END:.*]], label %[[FOR_BODY_LR_PH:.*]]
+; CHECK:       [[FOR_BODY_LR_PH]]:
+; CHECK-NEXT:    [[CMP31_NOT:%.*]] = icmp eq ptr [[SCALE]], null
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT58:%.*]] = zext i32 [[N_PRB]] to i64
+; CHECK-NEXT:    br i1 [[CMP31_NOT]], label %[[FOR_BODY_US:.*]], label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY_US]]:
+; CHECK-NEXT:    [[INDVARS_IV55:%.*]] = phi i64 [ [[INDVARS_IV_NEXT56:%.*]], %[[FOR_BODY_US]] ], [ 0, %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[DST_ADDR_052_US:%.*]] = phi ptr [ [[DST_ADDR_1_US:%.*]], %[[FOR_BODY_US]] ], [ [[DST]], %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds nuw [[STRUCT_COMPRESSED_DATA_8BIT:%.*]], ptr [[SRC]], i64 [[INDVARS_IV55]]
+; CHECK-NEXT:    [[MANTISSA_US:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX_US]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[MANTISSA_US]], align 1
+; CHECK-NEXT:    [[VMOVL_I59_US:%.*]] = sext <8 x i8> [[TMP0]] to <8 x i16>
+; CHECK-NEXT:    [[ARRAYIDX7_US:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX_US]], i64 9
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX7_US]], align 1
+; CHECK-NEXT:    [[VMOVL_I56_US:%.*]] = sext <8 x i8> [[TMP1]] to <8 x i16>
+; CHECK-NEXT:    [[ARRAYIDX15_US:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX_US]], i64 17
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX15_US]], align 1
+; CHECK-NEXT:    [[VMOVL_I_US:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX_US]], align 1
+; CHECK-NEXT:    [[CONV_US:%.*]] = sext i8 [[TMP3]] to i16
+; CHECK-NEXT:    [[MUL_US:%.*]] = shl nsw i16 [[CONV_US]], 1
+; CHECK-NEXT:    [[VECINIT_I79_US:%.*]] = insertelement <8 x i16> poison, i16 [[MUL_US]], i64 0
+; CHECK-NEXT:    [[VECINIT7_I86_US:%.*]] = shufflevector <8 x i16> [[VECINIT_I79_US]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[MUL_I87_US:%.*]] = mul <8 x i16> [[VECINIT7_I86_US]], [[VMOVL_I59_US]]
+; CHECK-NEXT:    [[MUL_I74_US:%.*]] = mul <8 x i16> [[VECINIT7_I86_US]], [[VMOVL_I56_US]]
+; CHECK-NEXT:    [[MUL_I_US:%.*]] = mul <8 x i16> [[VECINIT7_I86_US]], [[VMOVL_I_US]]
+; CHECK-NEXT:    store <8 x i16> [[MUL_I87_US]], ptr [[DST_ADDR_052_US]], align 2
+; CHECK-NEXT:    [[ADD_PTR47_US:%.*]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052_US]], i64 16
+; CHECK-NEXT:    store <8 x i16> [[MUL_I74_US]], ptr [[ADD_PTR47_US]], align 2
+; CHECK-NEXT:    [[ADD_PTR50_US:%.*]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052_US]], i64 32
+; CHECK-NEXT:    store <8 x i16> [[MUL_I_US]], ptr [[ADD_PTR50_US]], align 2
+; CHECK-NEXT:    [[DST_ADDR_1_US]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052_US]], i64 48
+; CHECK-NEXT:    [[INDVARS_IV_NEXT56]] = add nuw nsw i64 [[INDVARS_IV55]], 1
+; CHECK-NEXT:    [[EXITCOND59_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT56]], [[WIDE_TRIP_COUNT58]]
+; CHECK-NEXT:    br i1 [[EXITCOND59_NOT]], label %[[FOR_END]], label %[[FOR_BODY_US]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[DST_ADDR_052:%.*]] = phi ptr [ [[DST_ADDR_1:%.*]], %[[FOR_BODY]] ], [ [[DST]], %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[AGG_TMP_COERCE_050:%.*]] = phi i64 [ [[AGG_TMP_COERCE_0_INSERT_INSERT:%.*]], %[[FOR_BODY]] ], [ undef, %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[AGG_TMP42_COERCE_049:%.*]] = phi i64 [ [[AGG_TMP42_COERCE_0_INSERT_INSERT:%.*]], %[[FOR_BODY]] ], [ undef, %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[AGG_TMP37_COERCE_048:%.*]] = phi i64 [ [[AGG_TMP37_COERCE_0_INSERT_INSERT:%.*]], %[[FOR_BODY]] ], [ undef, %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [[STRUCT_COMPRESSED_DATA_8BIT]], ptr [[SRC]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[MANTISSA:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, ptr [[MANTISSA]], align 1
+; CHECK-NEXT:    [[VMOVL_I59:%.*]] = sext <8 x i8> [[TMP4]] to <8 x i16>
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 9
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX7]], align 1
+; CHECK-NEXT:    [[VMOVL_I56:%.*]] = sext <8 x i8> [[TMP5]] to <8 x i16>
+; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 17
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX15]], align 1
+; CHECK-NEXT:    [[VMOVL_I:%.*]] = sext <8 x i8> [[TMP6]] to <8 x i16>
+; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CONV:%.*]] = sext i8 [[TMP7]] to i16
+; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i16 [[CONV]], 1
+; CHECK-NEXT:    [[VECINIT_I79:%.*]] = insertelement <8 x i16> poison, i16 [[MUL]], i64 0
+; CHECK-NEXT:    [[VECINIT7_I86:%.*]] = shufflevector <8 x i16> [[VECINIT_I79]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[MUL_I87:%.*]] = mul <8 x i16> [[VECINIT7_I86]], [[VMOVL_I59]]
+; CHECK-NEXT:    [[MUL_I74:%.*]] = mul <8 x i16> [[VECINIT7_I86]], [[VMOVL_I56]]
+; CHECK-NEXT:    [[MUL_I:%.*]] = mul <8 x i16> [[VECINIT7_I86]], [[VMOVL_I]]
+; CHECK-NEXT:    [[AGG_TMP_SROA_0_0_COPYLOAD:%.*]] = load i32, ptr [[SCALE]], align 2
+; CHECK-NEXT:    [[AGG_TMP_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[AGG_TMP_SROA_0_0_COPYLOAD]] to i64
+; CHECK-NEXT:    [[AGG_TMP_COERCE_0_INSERT_MASK:%.*]] = and i64 [[AGG_TMP_COERCE_050]], -4294967296
+; CHECK-NEXT:    [[AGG_TMP_COERCE_0_INSERT_INSERT]] = or disjoint i64 [[AGG_TMP_COERCE_0_INSERT_MASK]], [[AGG_TMP_COERCE_0_INSERT_EXT]]
+; CHECK-NEXT:    [[CALL33:%.*]] = tail call fastcc noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef [[MUL_I87]], i64 [[AGG_TMP_COERCE_0_INSERT_INSERT]])
+; CHECK-NEXT:    store <8 x i16> [[CALL33]], ptr [[DST_ADDR_052]], align 2
+; CHECK-NEXT:    [[AGG_TMP37_SROA_0_0_COPYLOAD:%.*]] = load i32, ptr [[SCALE]], align 2
+; CHECK-NEXT:    [[AGG_TMP37_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[AGG_TMP37_SROA_0_0_COPYLOAD]] to i64
+; CHECK-NEXT:    [[AGG_TMP37_COERCE_0_INSERT_MASK:%.*]] = and i64 [[AGG_TMP37_COERCE_048]], -4294967296
+; CHECK-NEXT:    [[AGG_TMP37_COERCE_0_INSERT_INSERT]] = or disjoint i64 [[AGG_TMP37_COERCE_0_INSERT_MASK]], [[AGG_TMP37_COERCE_0_INSERT_EXT]]
+; CHECK-NEXT:    [[CALL38:%.*]] = tail call fastcc noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef [[MUL_I74]], i64 [[AGG_TMP37_COERCE_0_INSERT_INSERT]])
+; CHECK-NEXT:    [[ARRAYIDX39:%.*]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052]], i64 16
+; CHECK-NEXT:    store <8 x i16> [[CALL38]], ptr [[ARRAYIDX39]], align 2
+; CHECK-NEXT:    [[AGG_TMP42_SROA_0_0_COPYLOAD:%.*]] = load i32, ptr [[SCALE]], align 2
+; CHECK-NEXT:    [[AGG_TMP42_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[AGG_TMP42_SROA_0_0_COPYLOAD]] to i64
+; CHECK-NEXT:    [[AGG_TMP42_COERCE_0_INSERT_MASK:%.*]] = and i64 [[AGG_TMP42_COERCE_049]], -4294967296
+; CHECK-NEXT:    [[AGG_TMP42_COERCE_0_INSERT_INSERT]] = or disjoint i64 [[AGG_TMP42_COERCE_0_INSERT_MASK]], [[AGG_TMP42_COERCE_0_INSERT_EXT]]
+; CHECK-NEXT:    [[CALL43:%.*]] = tail call fastcc noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef [[MUL_I]], i64 [[AGG_TMP42_COERCE_0_INSERT_INSERT]])
+; CHECK-NEXT:    [[ARRAYIDX44:%.*]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052]], i64 32
+; CHECK-NEXT:    store <8 x i16> [[CALL43]], ptr [[ARRAYIDX44]], align 2
+; CHECK-NEXT:    [[DST_ADDR_1]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052]], i64 48
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT58]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP4]]
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %__p0.addr.i75 = alloca <8 x i16>, align 16
+  %__p1.addr.i76 = alloca i16, align 2
+  %__ret.i77 = alloca <8 x i16>, align 16
+  %.compoundliteral.i78 = alloca <8 x i16>, align 16
+  %__p0.addr.i62 = alloca <8 x i16>, align 16
+  %__p1.addr.i63 = alloca i16, align 2
+  %__ret.i64 = alloca <8 x i16>, align 16
+  %.compoundliteral.i65 = alloca <8 x i16>, align 16
+  %__p0.addr.i60 = alloca <8 x i16>, align 16
+  %__p1.addr.i = alloca i16, align 2
+  %__ret.i61 = alloca <8 x i16>, align 16
+  %.compoundliteral.i = alloca <8 x i16>, align 16
+  %__p0.addr.i57 = alloca <8 x i8>, align 8
+  %__ret.i58 = alloca <8 x i16>, align 16
+  %__p0.addr.i54 = alloca <8 x i8>, align 8
+  %__ret.i55 = alloca <8 x i16>, align 16
+  %__p0.addr.i = alloca <8 x i8>, align 8
+  %__ret.i = alloca <8 x i16>, align 16
+  %n_prb.addr = alloca i32, align 4
+  %src.addr = alloca ptr, align 8
+  %dst.addr = alloca ptr, align 8
+  %scale.addr = alloca ptr, align 8
+  %i = alloca i32, align 4
+  %prb_comp_in = alloca [3 x <8 x i16>], align 16
+  %__ret = alloca <8 x i8>, align 8
+  %tmp = alloca <8 x i8>, align 8
+  %__ret3 = alloca <8 x i8>, align 8
+  %tmp8 = alloca <8 x i8>, align 8
+  %__ret11 = alloca <8 x i8>, align 8
+  %tmp16 = alloca <8 x i8>, align 8
+  %prb_decomp = alloca [3 x <8 x i16>], align 16
+  %scaling_factor = alloca i16, align 2
+  %__s1 = alloca <8 x i16>, align 16
+  %agg.tmp = alloca %struct.cmplx_int16_t, align 2
+  %agg.tmp.coerce = alloca i64, align 8
+  %__s135 = alloca <8 x i16>, align 16
+  %agg.tmp37 = alloca %struct.cmplx_int16_t, align 2
+  %agg.tmp37.coerce = alloca i64, align 8
+  %__s140 = alloca <8 x i16>, align 16
+  %agg.tmp42 = alloca %struct.cmplx_int16_t, align 2
+  %agg.tmp42.coerce = alloca i64, align 8
+  %__s145 = alloca <8 x i16>, align 16
+  %__s148 = alloca <8 x i16>, align 16
+  %__s151 = alloca <8 x i16>, align 16
+  store i32 %n_prb, ptr %n_prb.addr, align 4
+  store ptr %src, ptr %src.addr, align 8
+  store ptr %dst, ptr %dst.addr, align 8
+  store ptr %scale, ptr %scale.addr, align 8
+  store i32 0, ptr %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, ptr %i, align 4
+  %1 = load i32, ptr %n_prb.addr, align 4
+  %cmp = icmp ult i32 %0, %1
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %2 = load ptr, ptr %src.addr, align 8
+  %3 = load i32, ptr %i, align 4
+  %idxprom = zext i32 %3 to i64
+  %arrayidx = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %2, i64 %idxprom
+  %mantissa = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %arrayidx, i32 0, i32 1
+  %arrayidx1 = getelementptr inbounds [24 x i8], ptr %mantissa, i64 0, i64 0
+  %4 = load <8 x i8>, ptr %arrayidx1, align 1
+  store <8 x i8> %4, ptr %__ret, align 8
+  %5 = load <8 x i8>, ptr %__ret, align 8
+  store <8 x i8> %5, ptr %tmp, align 8
+  %6 = load <8 x i8>, ptr %tmp, align 8
+  store <8 x i8> %6, ptr %__p0.addr.i57, align 8
+  %7 = load <8 x i8>, ptr %__p0.addr.i57, align 8
+  %vmovl.i59 = sext <8 x i8> %7 to <8 x i16>
+  store <8 x i16> %vmovl.i59, ptr %__ret.i58, align 16
+  %8 = load <8 x i16>, ptr %__ret.i58, align 16
+  %arrayidx2 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 0
+  store <8 x i16> %8, ptr %arrayidx2, align 16
+  %9 = load ptr, ptr %src.addr, align 8
+  %10 = load i32, ptr %i, align 4
+  %idxprom4 = zext i32 %10 to i64
+  %arrayidx5 = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %9, i64 %idxprom4
+  %mantissa6 = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %arrayidx5, i32 0, i32 1
+  %arrayidx7 = getelementptr inbounds [24 x i8], ptr %mantissa6, i64 0, i64 8
+  %11 = load <8 x i8>, ptr %arrayidx7, align 1
+  store <8 x i8> %11, ptr %__ret3, align 8
+  %12 = load <8 x i8>, ptr %__ret3, align 8
+  store <8 x i8> %12, ptr %tmp8, align 8
+  %13 = load <8 x i8>, ptr %tmp8, align 8
+  store <8 x i8> %13, ptr %__p0.addr.i54, align 8
+  %14 = load <8 x i8>, ptr %__p0.addr.i54, align 8
+  %vmovl.i56 = sext <8 x i8> %14 to <8 x i16>
+  store <8 x i16> %vmovl.i56, ptr %__ret.i55, align 16
+  %15 = load <8 x i16>, ptr %__ret.i55, align 16
+  %arrayidx10 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 1
+  store <8 x i16> %15, ptr %arrayidx10, align 16
+  %16 = load ptr, ptr %src.addr, align 8
+  %17 = load i32, ptr %i, align 4
+  %idxprom12 = zext i32 %17 to i64
+  %arrayidx13 = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %16, i64 %idxprom12
+  %mantissa14 = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %arrayidx13, i32 0, i32 1
+  %arrayidx15 = getelementptr inbounds [24 x i8], ptr %mantissa14, i64 0, i64 16
+  %18 = load <8 x i8>, ptr %arrayidx15, align 1
+  store <8 x i8> %18, ptr %__ret11, align 8
+  %19 = load <8 x i8>, ptr %__ret11, align 8
+  store <8 x i8> %19, ptr %tmp16, align 8
+  %20 = load <8 x i8>, ptr %tmp16, align 8
+  store <8 x i8> %20, ptr %__p0.addr.i, align 8
+  %21 = load <8 x i8>, ptr %__p0.addr.i, align 8
+  %vmovl.i = sext <8 x i8> %21 to <8 x i16>
+  store <8 x i16> %vmovl.i, ptr %__ret.i, align 16
+  %22 = load <8 x i16>, ptr %__ret.i, align 16
+  %arrayidx18 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 2
+  store <8 x i16> %22, ptr %arrayidx18, align 16
+  %23 = load ptr, ptr %src.addr, align 8
+  %24 = load i32, ptr %i, align 4
+  %idxprom19 = zext i32 %24 to i64
+  %arrayidx20 = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %23, i64 %idxprom19
+  %exp = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %arrayidx20, i32 0, i32 0
+  %25 = load i8, ptr %exp, align 1
+  %conv = sext i8 %25 to i32
+  %mul = mul nsw i32 %conv, 2
+  %conv21 = trunc i32 %mul to i16
+  store i16 %conv21, ptr %scaling_factor, align 2
+  %arrayidx22 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 0
+  %26 = load <8 x i16>, ptr %arrayidx22, align 16
+  %27 = load i16, ptr %scaling_factor, align 2
+  store <8 x i16> %26, ptr %__p0.addr.i75, align 16
+  store i16 %27, ptr %__p1.addr.i76, align 2
+  %28 = load <8 x i16>, ptr %__p0.addr.i75, align 16
+  %29 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit.i79 = insertelement <8 x i16> poison, i16 %29, i32 0
+  %30 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit1.i80 = insertelement <8 x i16> %vecinit.i79, i16 %30, i32 1
+  %31 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit2.i81 = insertelement <8 x i16> %vecinit1.i80, i16 %31, i32 2
+  %32 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit3.i82 = insertelement <8 x i16> %vecinit2.i81, i16 %32, i32 3
+  %33 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit4.i83 = insertelement <8 x i16> %vecinit3.i82, i16 %33, i32 4
+  %34 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit5.i84 = insertelement <8 x i16> %vecinit4.i83, i16 %34, i32 5
+  %35 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit6.i85 = insertelement <8 x i16> %vecinit5.i84, i16 %35, i32 6
+  %36 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit7.i86 = insertelement <8 x i16> %vecinit6.i85, i16 %36, i32 7
+  store <8 x i16> %vecinit7.i86, ptr %.compoundliteral.i78, align 16
+  %37 = load <8 x i16>, ptr %.compoundliteral.i78, align 16
+  %mul.i87 = mul <8 x i16> %28, %37
+  store <8 x i16> %mul.i87, ptr %__ret.i77, align 16
+  %38 = load <8 x i16>, ptr %__ret.i77, align 16
+  %arrayidx24 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 0
+  store <8 x i16> %38, ptr %arrayidx24, align 16
+  %arrayidx25 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 1
+  %39 = load <8 x i16>, ptr %arrayidx25, align 16
+  %40 = load i16, ptr %scaling_factor, align 2
+  store <8 x i16> %39, ptr %__p0.addr.i62, align 16
+  store i16 %40, ptr %__p1.addr.i63, align 2
+  %41 = load <8 x i16>, ptr %__p0.addr.i62, align 16
+  %42 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit.i66 = insertelement <8 x i16> poison, i16 %42, i32 0
+  %43 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit1.i67 = insertelement <8 x i16> %vecinit.i66, i16 %43, i32 1
+  %44 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit2.i68 = insertelement <8 x i16> %vecinit1.i67, i16 %44, i32 2
+  %45 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit3.i69 = insertelement <8 x i16> %vecinit2.i68, i16 %45, i32 3
+  %46 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit4.i70 = insertelement <8 x i16> %vecinit3.i69, i16 %46, i32 4
+  %47 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit5.i71 = insertelement <8 x i16> %vecinit4.i70, i16 %47, i32 5
+  %48 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit6.i72 = insertelement <8 x i16> %vecinit5.i71, i16 %48, i32 6
+  %49 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit7.i73 = insertelement <8 x i16> %vecinit6.i72, i16 %49, i32 7
+  store <8 x i16> %vecinit7.i73, ptr %.compoundliteral.i65, align 16
+  %50 = load <8 x i16>, ptr %.compoundliteral.i65, align 16
+  %mul.i74 = mul <8 x i16> %41, %50
+  store <8 x i16> %mul.i74, ptr %__ret.i64, align 16
+  %51 = load <8 x i16>, ptr %__ret.i64, align 16
+  %arrayidx27 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 1
+  store <8 x i16> %51, ptr %arrayidx27, align 16
+  %arrayidx28 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 2
+  %52 = load <8 x i16>, ptr %arrayidx28, align 16
+  %53 = load i16, ptr %scaling_factor, align 2
+  store <8 x i16> %52, ptr %__p0.addr.i60, align 16
+  store i16 %53, ptr %__p1.addr.i, align 2
+  %54 = load <8 x i16>, ptr %__p0.addr.i60, align 16
+  %55 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit.i = insertelement <8 x i16> poison, i16 %55, i32 0
+  %56 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %56, i32 1
+  %57 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %57, i32 2
+  %58 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %58, i32 3
+  %59 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %59, i32 4
+  %60 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %60, i32 5
+  %61 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %61, i32 6
+  %62 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %62, i32 7
+  store <8 x i16> %vecinit7.i, ptr %.compoundliteral.i, align 16
+  %63 = load <8 x i16>, ptr %.compoundliteral.i, align 16
+  %mul.i = mul <8 x i16> %54, %63
+  store <8 x i16> %mul.i, ptr %__ret.i61, align 16
+  %64 = load <8 x i16>, ptr %__ret.i61, align 16
+  %arrayidx30 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 2
+  store <8 x i16> %64, ptr %arrayidx30, align 16
+  %65 = load ptr, ptr %scale.addr, align 8
+  %cmp31 = icmp ne ptr %65, null
+  br i1 %cmp31, label %if.then, label %if.else
+
+if.then:                                          ; preds = %for.body
+  %arrayidx32 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 0
+  %66 = load <8 x i16>, ptr %arrayidx32, align 16
+  %67 = load ptr, ptr %scale.addr, align 8
+  call void @llvm.memcpy.p0.p0.i64(ptr align 2 %agg.tmp, ptr align 2 %67, i64 4, i1 false)
+  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %agg.tmp.coerce, ptr align 2 %agg.tmp, i64 4, i1 false)
+  %68 = load i64, ptr %agg.tmp.coerce, align 8
+  %call33 = call noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef %66, i64 %68)
+  store <8 x i16> %call33, ptr %__s1, align 16
+  %69 = load ptr, ptr %dst.addr, align 8
+  %arrayidx34 = getelementptr inbounds %struct.cmplx_int16_t, ptr %69, i64 0
+  %70 = load <8 x i16>, ptr %__s1, align 16
+  %71 = bitcast <8 x i16> %70 to <16 x i8>
+  %72 = bitcast <16 x i8> %71 to <8 x i16>
+  store <8 x i16> %72, ptr %arrayidx34, align 2
+  %arrayidx36 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 1
+  %73 = load <8 x i16>, ptr %arrayidx36, align 16
+  %74 = load ptr, ptr %scale.addr, align 8
+  call void @llvm.memcpy.p0.p0.i64(ptr align 2 %agg.tmp37, ptr align 2 %74, i64 4, i1 false)
+  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %agg.tmp37.coerce, ptr align 2 %agg.tmp37, i64 4, i1 false)
+  %75 = load i64, ptr %agg.tmp37.coerce, align 8
+  %call38 = call noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef %73, i64 %75)
+  store <8 x i16> %call38, ptr %__s135, align 16
+  %76 = load ptr, ptr %dst.addr, align 8
+  %arrayidx39 = getelementptr inbounds %struct.cmplx_int16_t, ptr %76, i64 4
+  %77 = load <8 x i16>, ptr %__s135, align 16
+  %78 = bitcast <8 x i16> %77 to <16 x i8>
+  %79 = bitcast <16 x i8> %78 to <8 x i16>
+  store <8 x i16> %79, ptr %arrayidx39, align 2
+  %arrayidx41 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 2
+  %80 = load <8 x i16>, ptr %arrayidx41, align 16
+  %81 = load ptr, ptr %scale.addr, align 8
+  call void @llvm.memcpy.p0.p0.i64(ptr align 2 %agg.tmp42, ptr align 2 %81, i64 4, i1 false)
+  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %agg.tmp42.coerce, ptr align 2 %agg.tmp42, i64 4, i1 false)
+  %82 = load i64, ptr %agg.tmp42.coerce, align 8
+  %call43 = call noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef %80, i64 %82)
+  store <8 x i16> %call43, ptr %__s140, align 16
+  %83 = load ptr, ptr %dst.addr, align 8
+  %arrayidx44 = getelementptr inbounds %struct.cmplx_int16_t, ptr %83, i64 8
+  %84 = load <8 x i16>, ptr %__s140, align 16
+  %85 = bitcast <8 x i16> %84 to <16 x i8>
+  %86 = bitcast <16 x i8> %85 to <8 x i16>
+  store <8 x i16> %86, ptr %arrayidx44, align 2
+  %87 = load ptr, ptr %dst.addr, align 8
+  %add.ptr = getelementptr inbounds %struct.cmplx_int16_t, ptr %87, i64 12
+  store ptr %add.ptr, ptr %dst.addr, align 8
+  br label %if.end
+
+if.else:                                          ; preds = %for.body
+  %arrayidx46 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 0
+  %88 = load <8 x i16>, ptr %arrayidx46, align 16
+  store <8 x i16> %88, ptr %__s145, align 16
+  %89 = load ptr, ptr %dst.addr, align 8
+  %90 = load <8 x i16>, ptr %__s145, align 16
+  %91 = bitcast <8 x i16> %90 to <16 x i8>
+  %92 = bitcast <16 x i8> %91 to <8 x i16>
+  store <8 x i16> %92, ptr %89, align 2
+  %93 = load ptr, ptr %dst.addr, align 8
+  %add.ptr47 = getelementptr inbounds %struct.cmplx_int16_t, ptr %93, i64 4
+  store ptr %add.ptr47, ptr %dst.addr, align 8
+  %arrayidx49 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 1
+  %94 = load <8 x i16>, ptr %arrayidx49, align 16
+  store <8 x i16> %94, ptr %__s148, align 16
+  %95 = load ptr, ptr %dst.addr, align 8
+  %96 = load <8 x i16>, ptr %__s148, align 16
+  %97 = bitcast <8 x i16> %96 to <16 x i8>
+  %98 = bitcast <16 x i8> %97 to <8 x i16>
+  store <8 x i16> %98, ptr %95, align 2
+  %99 = load ptr, ptr %dst.addr, align 8
+  %add.ptr50 = getelementptr inbounds %struct.cmplx_int16_t, ptr %99, i64 4
+  store ptr %add.ptr50, ptr %dst.addr, align 8
+  %arrayidx52 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 2
+  %100 = load <8 x i16>, ptr %arrayidx52, align 16
+  store <8 x i16> %100, ptr %__s151, align 16
+  %101 = load ptr, ptr %dst.addr, align 8
+  %102 = load <8 x i16>, ptr %__s151, align 16
+  %103 = bitcast <8 x i16> %102 to <16 x i8>
+  %104 = bitcast <16 x i8> %103 to <8 x i16>
+  store <8 x i16> %104, ptr %101, align 2
+  %105 = load ptr, ptr %dst.addr, align 8
+  %add.ptr53 = getelementptr inbounds %struct.cmplx_int16_t, ptr %105, i64 4
+  store ptr %add.ptr53, ptr %dst.addr, align 8
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %106 = load i32, ptr %i, align 4
+  %inc = add i32 %106, 1
+  store i32 %inc, ptr %i, align 4
+  br label %for.cond, !llvm.loop !4
+
+for.end:                                          ; preds = %for.cond
+  ret i32 0
+}
+
+define internal noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef %a, i64 %scale.coerce) #0 {
+; CHECK-LABEL: define internal fastcc noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(
+; CHECK-SAME: <8 x i16> noundef [[A:%.*]], i64 [[SCALE_COERCE:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[SCALE_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[SCALE_COERCE]] to i16
+; CHECK-NEXT:    [[SCALE_SROA_2_0_EXTRACT_SHIFT36:%.*]] = lshr i64 [[SCALE_COERCE]], 16
+; CHECK-NEXT:    [[SCALE_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[SCALE_SROA_2_0_EXTRACT_SHIFT36]] to i16
+; CHECK-NEXT:    [[VECINIT_I19:%.*]] = insertelement <8 x i16> poison, i16 [[SCALE_SROA_0_0_EXTRACT_TRUNC]], i64 0
+; CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[SCALE_SROA_2_0_EXTRACT_TRUNC]], i64 0
+; CHECK-NEXT:    [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[VQNEGQ_V1_I:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> [[VECINIT7_I]])
+; CHECK-NEXT:    [[SHUFFLE_I85:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> 
+; CHECK-NEXT:    [[SHUFFLE_I82:%.*]] = shufflevector <8 x i16> [[VECINIT_I19]], <8 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[VQDMULL_V2_I72:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I85]], <4 x i16> [[SHUFFLE_I82]])
+; CHECK-NEXT:    [[SHUFFLE_I97:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> 
+; CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I97]], <4 x i16> [[SHUFFLE_I82]])
+; CHECK-NEXT:    [[SHUFFLE_I79:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> 
+; CHECK-NEXT:    [[SHUFFLE_I76:%.*]] = shufflevector <8 x i16> [[VQNEGQ_V1_I]], <8 x i16> [[VECINIT_I]], <4 x i32> 
+; CHECK-NEXT:    [[VQDMLAL2_I106:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I79]], <4 x i16> [[SHUFFLE_I76]])
+; CHECK-NEXT:    [[VQDMLAL_V3_I107:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMULL_V2_I72]], <4 x i32> [[VQDMLAL2_I106]])
+; CHECK-NEXT:    [[SHUFFLE_I91:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> 
+; CHECK-NEXT:    [[SHUFFLE_I88:%.*]] = shufflevector <8 x i16> [[VQNEGQ_V1_I]], <8 x i16> [[VECINIT_I]], <4 x i32> 
+; CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I91]], <4 x i16> [[SHUFFLE_I88]])
+; CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMULL_V2_I]], <4 x i32> [[VQDMLAL2_I]])
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[VQDMLAL_V3_I107]] to <8 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[VQDMLAL_V3_I]] to <8 x i16>
+; CHECK-NEXT:    [[SHUFFLE_I61:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i32> 
+; CHECK-NEXT:    ret <8 x i16> [[SHUFFLE_I61]]
+;
+entry:
+  %__p0.addr.i102 = alloca <4 x i32>, align 16
+  %__p1.addr.i103 = alloca <4 x i16>, align 8
+  %__p2.addr.i104 = alloca <4 x i16>, align 8
+  %__ret.i105 = alloca <4 x i32>, align 16
+  %__p0.addr.i98 = alloca <4 x i32>, align 16
+  %__p1.addr.i99 = alloca <4 x i16>, align 8
+  %__p2.addr.i100 = alloca <4 x i16>, align 8
+  %__ret.i101 = alloca <4 x i32>, align 16
+  %__p0.addr.i95 = alloca <8 x i16>, align 16
+  %__ret.i96 = alloca <4 x i16>, align 8
+  %__p0.addr.i92 = alloca <8 x i16>, align 16
+  %__ret.i93 = alloca <4 x i16>, align 8
+  %__p0.addr.i89 = alloca <8 x i16>, align 16
+  %__ret.i90 = alloca <4 x i16>, align 8
+  %__p0.addr.i86 = alloca <8 x i16>, align 16
+  %__ret.i87 = alloca <4 x i16>, align 8
+  %__p0.addr.i83 = alloca <8 x i16>, align 16
+  %__ret.i84 = alloca <4 x i16>, align 8
+  %__p0.addr.i80 = alloca <8 x i16>, align 16
+  %__ret.i81 = alloca <4 x i16>, align 8
+  %__p0.addr.i77 = alloca <8 x i16>, align 16
+  %__ret.i78 = alloca <4 x i16>, align 8
+  %__p0.addr.i74 = alloca <8 x i16>, align 16
+  %__ret.i75 = alloca <4 x i16>, align 8
+  %__p0.addr.i69 = alloca <4 x i16>, align 8
+  %__p1.addr.i70 = alloca <4 x i16>, align 8
+  %__ret.i71 = alloca <4 x i32>, align 16
+  %__p0.addr.i66 = alloca <4 x i16>, align 8
+  %__p1.addr.i67 = alloca <4 x i16>, align 8
+  %__ret.i68 = alloca <4 x i32>, align 16
+  %__p0.addr.i64 = alloca <4 x i32>, align 16
+  %__ret.i65 = alloca <8 x i16>, align 16
+  %__p0.addr.i62 = alloca <4 x i32>, align 16
+  %__ret.i63 = alloca <8 x i16>, align 16
+  %__p0.addr.i58 = alloca <8 x i16>, align 16
+  %__p1.addr.i59 = alloca <8 x i16>, align 16
+  %__ret.i60 = alloca <8 x i16>, align 16
+  %__p0.addr.i51 = alloca <4 x i32>, align 16
+  %__p1.addr.i52 = alloca <8 x i16>, align 16
+  %__p2.addr.i53 = alloca <8 x i16>, align 16
+  %__ret.i54 = alloca <4 x i32>, align 16
+  %a.addr.i46 = alloca <4 x i32>, align 16
+  %b.addr.i47 = alloca <8 x i16>, align 16
+  %c.addr.i = alloca <8 x i16>, align 16
+  %__p0.addr.i40 = alloca <8 x i16>, align 16
+  %__p1.addr.i41 = alloca <8 x i16>, align 16
+  %__ret.i42 = alloca <4 x i32>, align 16
+  %a.addr.i = alloca <8 x i16>, align 16
+  %b.addr.i = alloca <8 x i16>, align 16
+  %__p0.addr.i38 = alloca <8 x i16>, align 16
+  %__ret.i39 = alloca <8 x i16>, align 16
+  %__p0.addr.i36 = alloca <8 x i16>, align 16
+  %__p1.addr.i = alloca <8 x i16>, align 16
+  %__p2.addr.i = alloca <8 x i16>, align 16
+  %__ret.i37 = alloca <8 x i16>, align 16
+  %__p0.addr.i29 = alloca i32, align 4
+  %__ret.i30 = alloca <4 x i32>, align 16
+  %.compoundliteral.i31 = alloca <4 x i32>, align 16
+  %__p0.addr.i27 = alloca <4 x i32>, align 16
+  %__ret.i28 = alloca <8 x i16>, align 16
+  %__p0.addr.i16 = alloca i16, align 2
+  %__ret.i17 = alloca <8 x i16>, align 16
+  %.compoundliteral.i18 = alloca <8 x i16>, align 16
+  %__p0.addr.i14 = alloca i16, align 2
+  %__ret.i15 = alloca <8 x i16>, align 16
+  %.compoundliteral.i = alloca <8 x i16>, align 16
+  %__p0.addr.i = alloca <8 x i16>, align 16
+  %__ret.i = alloca <8 x i16>, align 16
+  %scale = alloca %struct.cmplx_int16_t, align 2
+  %a.addr = alloca <8 x i16>, align 16
+  %a_rev = alloca <8 x i16>, align 16
+  %cc = alloca <8 x i16>, align 16
+  %dd = alloca <8 x i16>, align 16
+  %mult_mask = alloca <8 x i16>, align 16
+  %lo32 = alloca <4 x i32>, align 16
+  %hi32 = alloca <4 x i32>, align 16
+  %coerce.val.ii = trunc i64 %scale.coerce to i32
+  store i32 %coerce.val.ii, ptr %scale, align 2
+  store <8 x i16> %a, ptr %a.addr, align 16
+  %0 = load <8 x i16>, ptr %a.addr, align 16
+  store <8 x i16> %0, ptr %__p0.addr.i, align 16
+  %1 = load <8 x i16>, ptr %__p0.addr.i, align 16
+  %2 = load <8 x i16>, ptr %__p0.addr.i, align 16
+  %shuffle.i = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> 
+  store <8 x i16> %shuffle.i, ptr %__ret.i, align 16
+  %3 = load <8 x i16>, ptr %__ret.i, align 16
+  store <8 x i16> %3, ptr %a_rev, align 16
+  %re = getelementptr inbounds nuw %struct.cmplx_int16_t, ptr %scale, i32 0, i32 0
+  %4 = load i16, ptr %re, align 2
+  store i16 %4, ptr %__p0.addr.i16, align 2
+  %5 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit.i19 = insertelement <8 x i16> poison, i16 %5, i32 0
+  %6 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit1.i20 = insertelement <8 x i16> %vecinit.i19, i16 %6, i32 1
+  %7 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit2.i21 = insertelement <8 x i16> %vecinit1.i20, i16 %7, i32 2
+  %8 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit3.i22 = insertelement <8 x i16> %vecinit2.i21, i16 %8, i32 3
+  %9 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit4.i23 = insertelement <8 x i16> %vecinit3.i22, i16 %9, i32 4
+  %10 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit5.i24 = insertelement <8 x i16> %vecinit4.i23, i16 %10, i32 5
+  %11 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit6.i25 = insertelement <8 x i16> %vecinit5.i24, i16 %11, i32 6
+  %12 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit7.i26 = insertelement <8 x i16> %vecinit6.i25, i16 %12, i32 7
+  store <8 x i16> %vecinit7.i26, ptr %.compoundliteral.i18, align 16
+  %13 = load <8 x i16>, ptr %.compoundliteral.i18, align 16
+  store <8 x i16> %13, ptr %__ret.i17, align 16
+  %14 = load <8 x i16>, ptr %__ret.i17, align 16
+  store <8 x i16> %14, ptr %cc, align 16
+  %im = getelementptr inbounds nuw %struct.cmplx_int16_t, ptr %scale, i32 0, i32 1
+  %15 = load i16, ptr %im, align 2
+  store i16 %15, ptr %__p0.addr.i14, align 2
+  %16 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit.i = insertelement <8 x i16> poison, i16 %16, i32 0
+  %17 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %17, i32 1
+  %18 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %18, i32 2
+  %19 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %19, i32 3
+  %20 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %20, i32 4
+  %21 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %21, i32 5
+  %22 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %22, i32 6
+  %23 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %23, i32 7
+  store <8 x i16> %vecinit7.i, ptr %.compoundliteral.i, align 16
+  %24 = load <8 x i16>, ptr %.compoundliteral.i, align 16
+  store <8 x i16> %24, ptr %__ret.i15, align 16
+  %25 = load <8 x i16>, ptr %__ret.i15, align 16
+  store <8 x i16> %25, ptr %dd, align 16
+  store i32 65535, ptr %__p0.addr.i29, align 4
+  %26 = load i32, ptr %__p0.addr.i29, align 4
+  %vecinit.i32 = insertelement <4 x i32> poison, i32 %26, i32 0
+  %27 = load i32, ptr %__p0.addr.i29, align 4
+  %vecinit1.i33 = insertelement <4 x i32> %vecinit.i32, i32 %27, i32 1
+  %28 = load i32, ptr %__p0.addr.i29, align 4
+  %vecinit2.i34 = insertelement <4 x i32> %vecinit1.i33, i32 %28, i32 2
+  %29 = load i32, ptr %__p0.addr.i29, align 4
+  %vecinit3.i35 = insertelement <4 x i32> %vecinit2.i34, i32 %29, i32 3
+  store <4 x i32> %vecinit3.i35, ptr %.compoundliteral.i31, align 16
+  %30 = load <4 x i32>, ptr %.compoundliteral.i31, align 16
+  store <4 x i32> %30, ptr %__ret.i30, align 16
+  %31 = load <4 x i32>, ptr %__ret.i30, align 16
+  store <4 x i32> %31, ptr %__p0.addr.i27, align 16
+  %32 = load <4 x i32>, ptr %__p0.addr.i27, align 16
+  %33 = bitcast <4 x i32> %32 to <8 x i16>
+  store <8 x i16> %33, ptr %__ret.i28, align 16
+  %34 = load <8 x i16>, ptr %__ret.i28, align 16
+  store <8 x i16> %34, ptr %mult_mask, align 16
+  %35 = load <8 x i16>, ptr %mult_mask, align 16
+  %36 = load <8 x i16>, ptr %dd, align 16
+  store <8 x i16> %36, ptr %__p0.addr.i38, align 16
+  %37 = load <8 x i16>, ptr %__p0.addr.i38, align 16
+  %38 = bitcast <8 x i16> %37 to <16 x i8>
+  %vqnegq_v1.i = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> %37)
+  %vqnegq_v2.i = bitcast <8 x i16> %vqnegq_v1.i to <16 x i8>
+  store <8 x i16> %vqnegq_v1.i, ptr %__ret.i39, align 16
+  %39 = load <8 x i16>, ptr %__ret.i39, align 16
+  %40 = load <8 x i16>, ptr %dd, align 16
+  store <8 x i16> %35, ptr %__p0.addr.i36, align 16
+  store <8 x i16> %39, ptr %__p1.addr.i, align 16
+  store <8 x i16> %40, ptr %__p2.addr.i, align 16
+  %41 = load <8 x i16>, ptr %__p0.addr.i36, align 16
+  %42 = bitcast <8 x i16> %41 to <16 x i8>
+  %43 = load <8 x i16>, ptr %__p1.addr.i, align 16
+  %44 = bitcast <8 x i16> %43 to <16 x i8>
+  %45 = load <8 x i16>, ptr %__p2.addr.i, align 16
+  %46 = bitcast <8 x i16> %45 to <16 x i8>
+  %vbsl3.i = and <8 x i16> %41, %43
+  %47 = xor <8 x i16> %41, splat (i16 -1)
+  %vbsl4.i = and <8 x i16> %47, %45
+  %vbsl5.i = or <8 x i16> %vbsl3.i, %vbsl4.i
+  store <8 x i16> %vbsl5.i, ptr %__ret.i37, align 16
+  %48 = load <8 x i16>, ptr %__ret.i37, align 16
+  store <8 x i16> %48, ptr %dd, align 16
+  %49 = load <8 x i16>, ptr %a.addr, align 16
+  %50 = load <8 x i16>, ptr %cc, align 16
+  store <8 x i16> %49, ptr %a.addr.i, align 16
+  store <8 x i16> %50, ptr %b.addr.i, align 16
+  %51 = load <8 x i16>, ptr %a.addr.i, align 16
+  store <8 x i16> %51, ptr %__p0.addr.i83, align 16
+  %52 = load <8 x i16>, ptr %__p0.addr.i83, align 16
+  %53 = load <8 x i16>, ptr %__p0.addr.i83, align 16
+  %shuffle.i85 = shufflevector <8 x i16> %52, <8 x i16> %53, <4 x i32> 
+  store <4 x i16> %shuffle.i85, ptr %__ret.i84, align 8
+  %54 = load <4 x i16>, ptr %__ret.i84, align 8
+  %55 = load <8 x i16>, ptr %b.addr.i, align 16
+  store <8 x i16> %55, ptr %__p0.addr.i80, align 16
+  %56 = load <8 x i16>, ptr %__p0.addr.i80, align 16
+  %57 = load <8 x i16>, ptr %__p0.addr.i80, align 16
+  %shuffle.i82 = shufflevector <8 x i16> %56, <8 x i16> %57, <4 x i32> 
+  store <4 x i16> %shuffle.i82, ptr %__ret.i81, align 8
+  %58 = load <4 x i16>, ptr %__ret.i81, align 8
+  store <4 x i16> %54, ptr %__p0.addr.i69, align 8
+  store <4 x i16> %58, ptr %__p1.addr.i70, align 8
+  %59 = load <4 x i16>, ptr %__p0.addr.i69, align 8
+  %60 = bitcast <4 x i16> %59 to <8 x i8>
+  %61 = load <4 x i16>, ptr %__p1.addr.i70, align 8
+  %62 = bitcast <4 x i16> %61 to <8 x i8>
+  %vqdmull_v2.i72 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %59, <4 x i16> %61)
+  %vqdmull_v3.i73 = bitcast <4 x i32> %vqdmull_v2.i72 to <16 x i8>
+  store <4 x i32> %vqdmull_v2.i72, ptr %__ret.i71, align 16
+  %63 = load <4 x i32>, ptr %__ret.i71, align 16
+  store <4 x i32> %63, ptr %lo32, align 16
+  %64 = load <8 x i16>, ptr %a.addr, align 16
+  %65 = load <8 x i16>, ptr %cc, align 16
+  store <8 x i16> %64, ptr %__p0.addr.i40, align 16
+  store <8 x i16> %65, ptr %__p1.addr.i41, align 16
+  %66 = load <8 x i16>, ptr %__p0.addr.i40, align 16
+  store <8 x i16> %66, ptr %__p0.addr.i95, align 16
+  %67 = load <8 x i16>, ptr %__p0.addr.i95, align 16
+  %68 = load <8 x i16>, ptr %__p0.addr.i95, align 16
+  %shuffle.i97 = shufflevector <8 x i16> %67, <8 x i16> %68, <4 x i32> 
+  store <4 x i16> %shuffle.i97, ptr %__ret.i96, align 8
+  %69 = load <4 x i16>, ptr %__ret.i96, align 8
+  %70 = load <8 x i16>, ptr %__p1.addr.i41, align 16
+  store <8 x i16> %70, ptr %__p0.addr.i92, align 16
+  %71 = load <8 x i16>, ptr %__p0.addr.i92, align 16
+  %72 = load <8 x i16>, ptr %__p0.addr.i92, align 16
+  %shuffle.i94 = shufflevector <8 x i16> %71, <8 x i16> %72, <4 x i32> 
+  store <4 x i16> %shuffle.i94, ptr %__ret.i93, align 8
+  %73 = load <4 x i16>, ptr %__ret.i93, align 8
+  store <4 x i16> %69, ptr %__p0.addr.i66, align 8
+  store <4 x i16> %73, ptr %__p1.addr.i67, align 8
+  %74 = load <4 x i16>, ptr %__p0.addr.i66, align 8
+  %75 = bitcast <4 x i16> %74 to <8 x i8>
+  %76 = load <4 x i16>, ptr %__p1.addr.i67, align 8
+  %77 = bitcast <4 x i16> %76 to <8 x i8>
+  %vqdmull_v2.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %74, <4 x i16> %76)
+  %vqdmull_v3.i = bitcast <4 x i32> %vqdmull_v2.i to <16 x i8>
+  store <4 x i32> %vqdmull_v2.i, ptr %__ret.i68, align 16
+  %78 = load <4 x i32>, ptr %__ret.i68, align 16
+  store <4 x i32> %78, ptr %__ret.i42, align 16
+  %79 = load <4 x i32>, ptr %__ret.i42, align 16
+  store <4 x i32> %79, ptr %hi32, align 16
+  %80 = load <4 x i32>, ptr %lo32, align 16
+  %81 = load <8 x i16>, ptr %a_rev, align 16
+  %82 = load <8 x i16>, ptr %dd, align 16
+  store <4 x i32> %80, ptr %a.addr.i46, align 16
+  store <8 x i16> %81, ptr %b.addr.i47, align 16
+  store <8 x i16> %82, ptr %c.addr.i, align 16
+  %83 = load <4 x i32>, ptr %a.addr.i46, align 16
+  %84 = load <8 x i16>, ptr %b.addr.i47, align 16
+  store <8 x i16> %84, ptr %__p0.addr.i77, align 16
+  %85 = load <8 x i16>, ptr %__p0.addr.i77, align 16
+  %86 = load <8 x i16>, ptr %__p0.addr.i77, align 16
+  %shuffle.i79 = shufflevector <8 x i16> %85, <8 x i16> %86, <4 x i32> 
+  store <4 x i16> %shuffle.i79, ptr %__ret.i78, align 8
+  %87 = load <4 x i16>, ptr %__ret.i78, align 8
+  %88 = load <8 x i16>, ptr %c.addr.i, align 16
+  store <8 x i16> %88, ptr %__p0.addr.i74, align 16
+  %89 = load <8 x i16>, ptr %__p0.addr.i74, align 16
+  %90 = load <8 x i16>, ptr %__p0.addr.i74, align 16
+  %shuffle.i76 = shufflevector <8 x i16> %89, <8 x i16> %90, <4 x i32> 
+  store <4 x i16> %shuffle.i76, ptr %__ret.i75, align 8
+  %91 = load <4 x i16>, ptr %__ret.i75, align 8
+  store <4 x i32> %83, ptr %__p0.addr.i102, align 16
+  store <4 x i16> %87, ptr %__p1.addr.i103, align 8
+  store <4 x i16> %91, ptr %__p2.addr.i104, align 8
+  %92 = load <4 x i32>, ptr %__p0.addr.i102, align 16
+  %93 = bitcast <4 x i32> %92 to <16 x i8>
+  %94 = load <4 x i16>, ptr %__p1.addr.i103, align 8
+  %95 = bitcast <4 x i16> %94 to <8 x i8>
+  %96 = load <4 x i16>, ptr %__p2.addr.i104, align 8
+  %97 = bitcast <4 x i16> %96 to <8 x i8>
+  %vqdmlal2.i106 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %94, <4 x i16> %96)
+  %vqdmlal_v3.i107 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %92, <4 x i32> %vqdmlal2.i106)
+  store <4 x i32> %vqdmlal_v3.i107, ptr %__ret.i105, align 16
+  %98 = load <4 x i32>, ptr %__ret.i105, align 16
+  store <4 x i32> %98, ptr %lo32, align 16
+  %99 = load <4 x i32>, ptr %hi32, align 16
+  %100 = load <8 x i16>, ptr %a_rev, align 16
+  %101 = load <8 x i16>, ptr %dd, align 16
+  store <4 x i32> %99, ptr %__p0.addr.i51, align 16
+  store <8 x i16> %100, ptr %__p1.addr.i52, align 16
+  store <8 x i16> %101, ptr %__p2.addr.i53, align 16
+  %102 = load <4 x i32>, ptr %__p0.addr.i51, align 16
+  %103 = load <8 x i16>, ptr %__p1.addr.i52, align 16
+  store <8 x i16> %103, ptr %__p0.addr.i89, align 16
+  %104 = load <8 x i16>, ptr %__p0.addr.i89, align 16
+  %105 = load <8 x i16>, ptr %__p0.addr.i89, align 16
+  %shuffle.i91 = shufflevector <8 x i16> %104, <8 x i16> %105, <4 x i32> 
+  store <4 x i16> %shuffle.i91, ptr %__ret.i90, align 8
+  %106 = load <4 x i16>, ptr %__ret.i90, align 8
+  %107 = load <8 x i16>, ptr %__p2.addr.i53, align 16
+  store <8 x i16> %107, ptr %__p0.addr.i86, align 16
+  %108 = load <8 x i16>, ptr %__p0.addr.i86, align 16
+  %109 = load <8 x i16>, ptr %__p0.addr.i86, align 16
+  %shuffle.i88 = shufflevector <8 x i16> %108, <8 x i16> %109, <4 x i32> 
+  store <4 x i16> %shuffle.i88, ptr %__ret.i87, align 8
+  %110 = load <4 x i16>, ptr %__ret.i87, align 8
+  store <4 x i32> %102, ptr %__p0.addr.i98, align 16
+  store <4 x i16> %106, ptr %__p1.addr.i99, align 8
+  store <4 x i16> %110, ptr %__p2.addr.i100, align 8
+  %111 = load <4 x i32>, ptr %__p0.addr.i98, align 16
+  %112 = bitcast <4 x i32> %111 to <16 x i8>
+  %113 = load <4 x i16>, ptr %__p1.addr.i99, align 8
+  %114 = bitcast <4 x i16> %113 to <8 x i8>
+  %115 = load <4 x i16>, ptr %__p2.addr.i100, align 8
+  %116 = bitcast <4 x i16> %115 to <8 x i8>
+  %vqdmlal2.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %113, <4 x i16> %115)
+  %vqdmlal_v3.i = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %111, <4 x i32> %vqdmlal2.i)
+  store <4 x i32> %vqdmlal_v3.i, ptr %__ret.i101, align 16
+  %117 = load <4 x i32>, ptr %__ret.i101, align 16
+  store <4 x i32> %117, ptr %__ret.i54, align 16
+  %118 = load <4 x i32>, ptr %__ret.i54, align 16
+  store <4 x i32> %118, ptr %hi32, align 16
+  %119 = load <4 x i32>, ptr %lo32, align 16
+  store <4 x i32> %119, ptr %__p0.addr.i64, align 16
+  %120 = load <4 x i32>, ptr %__p0.addr.i64, align 16
+  %121 = bitcast <4 x i32> %120 to <8 x i16>
+  store <8 x i16> %121, ptr %__ret.i65, align 16
+  %122 = load <8 x i16>, ptr %__ret.i65, align 16
+  %123 = load <4 x i32>, ptr %hi32, align 16
+  store <4 x i32> %123, ptr %__p0.addr.i62, align 16
+  %124 = load <4 x i32>, ptr %__p0.addr.i62, align 16
+  %125 = bitcast <4 x i32> %124 to <8 x i16>
+  store <8 x i16> %125, ptr %__ret.i63, align 16
+  %126 = load <8 x i16>, ptr %__ret.i63, align 16
+  store <8 x i16> %122, ptr %__p0.addr.i58, align 16
+  store <8 x i16> %126, ptr %__p1.addr.i59, align 16
+  %127 = load <8 x i16>, ptr %__p0.addr.i58, align 16
+  %128 = load <8 x i16>, ptr %__p1.addr.i59, align 16
+  %shuffle.i61 = shufflevector <8 x i16> %127, <8 x i16> %128, <8 x i32> 
+  store <8 x i16> %shuffle.i61, ptr %__ret.i60, align 16
+  %129 = load <8 x i16>, ptr %__ret.i60, align 16
+  ret <8 x i16> %129
+}
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16>) #2
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>) #2
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>) #2
+
+attributes #0 = { mustprogress noinline uwtable vscale_range(1,16) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+bf16,+bti,+ccidx,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fullfp16,+jsconv,+lse,+neon,+pauth,+predres,+ras,+rcpc,+rdm,+sb,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a,-fmv" }
+attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+attributes #2 = { nocallback nofree nosync nounwind willreturn memory(none) }
+
+!llvm.module.flags = !{!0, !1, !2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 2}
+!2 = !{i32 7, !"frame-pointer", i32 1}
+!3 = !{!"clang version 20.0.0git"}
+!4 = distinct !{!4, !5}
+!5 = !{!"llvm.loop.mustprogress"}
+;.
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META5:![0-9]+]]}
+; CHECK: [[META5]] = !{!"llvm.loop.mustprogress"}
+;.

From 460e7d5f3054855af83e50f32a81ce71cac55a04 Mon Sep 17 00:00:00 2001
From: Abid Qadeer 
Date: Thu, 2 Jan 2025 10:26:27 +0000
Subject: [PATCH 265/567] [flang][debug] Correct pointer size. (#120781)

We were passing size in bytes for the sizeInBits field in
DIDerivedTypeAttr with DW_TAG_pointer_type. Although this field is
un-used in this case but better to be accurate.
---
 flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp
index cc99698ead33f..5218ad3264954 100644
--- a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp
+++ b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp
@@ -638,7 +638,7 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertPointerLikeType(
 
   return mlir::LLVM::DIDerivedTypeAttr::get(
       context, llvm::dwarf::DW_TAG_pointer_type,
-      mlir::StringAttr::get(context, ""), elTyAttr, ptrSize,
+      mlir::StringAttr::get(context, ""), elTyAttr, /*sizeInBits=*/ptrSize * 8,
       /*alignInBits=*/0, /*offset=*/0,
       /*optional
=*/std::nullopt, /*extra data=*/nullptr); } From 328ff042e33767d9abdc56bf62132e00b832ecc4 Mon Sep 17 00:00:00 2001 From: Abid Qadeer Date: Thu, 2 Jan 2025 10:30:08 +0000 Subject: [PATCH 266/567] [flang][NFC] Replace dyn_cast_or_null with dyn_cast_if_present. (#120785) --- .../Transforms/DebugTypeGenerator.cpp | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp index 5218ad3264954..8ae3d313d881c 100644 --- a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp +++ b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp @@ -325,7 +325,7 @@ static bool canCacheThisType(mlir::LLVM::DICompositeTypeAttr comTy) { std::pair DebugTypeGenerator::getFieldSizeAndAlign(mlir::Type fieldTy) { mlir::Type llvmTy; - if (auto boxTy = mlir::dyn_cast_or_null(fieldTy)) + if (auto boxTy = mlir::dyn_cast_if_present(fieldTy)) llvmTy = llvmTypeConverter.convertBoxTypeAsStruct(boxTy, getBoxRank(boxTy)); else llvmTy = llvmTypeConverter.convertType(fieldTy); @@ -371,7 +371,7 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertRecordType( std::optional> lowerBounds = fir::getComponentLowerBoundsIfNonDefault(Ty, fieldName, module, symbolTable); - auto seqTy = mlir::dyn_cast_or_null(fieldTy); + auto seqTy = mlir::dyn_cast_if_present(fieldTy); // For members of the derived types, the information about the shift in // lower bounds is not part of the declOp but has to be extracted from the @@ -622,10 +622,10 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertPointerLikeType( // Arrays and character need different treatment because DWARF have special // constructs for them to get the location from the descriptor. Rest of // types are handled like pointer to underlying type. 
- if (auto seqTy = mlir::dyn_cast_or_null(elTy)) + if (auto seqTy = mlir::dyn_cast_if_present(elTy)) return convertBoxedSequenceType(seqTy, fileAttr, scope, declOp, genAllocated, genAssociated); - if (auto charTy = mlir::dyn_cast_or_null(elTy)) + if (auto charTy = mlir::dyn_cast_if_present(elTy)) return convertCharacterType(charTy, fileAttr, scope, declOp, /*hasDescriptor=*/true); @@ -654,22 +654,22 @@ DebugTypeGenerator::convertType(mlir::Type Ty, mlir::LLVM::DIFileAttr fileAttr, } else if (mlir::isa(Ty)) { return genBasicType(context, mlir::StringAttr::get(context, "real"), Ty.getIntOrFloatBitWidth(), llvm::dwarf::DW_ATE_float); - } else if (auto logTy = mlir::dyn_cast_or_null(Ty)) { + } else if (auto logTy = mlir::dyn_cast_if_present(Ty)) { return genBasicType(context, mlir::StringAttr::get(context, logTy.getMnemonic()), kindMapping.getLogicalBitsize(logTy.getFKind()), llvm::dwarf::DW_ATE_boolean); - } else if (auto cplxTy = mlir::dyn_cast_or_null(Ty)) { + } else if (auto cplxTy = mlir::dyn_cast_if_present(Ty)) { auto floatTy = mlir::cast(cplxTy.getElementType()); unsigned bitWidth = floatTy.getWidth(); return genBasicType(context, mlir::StringAttr::get(context, "complex"), bitWidth * 2, llvm::dwarf::DW_ATE_complex_float); - } else if (auto seqTy = mlir::dyn_cast_or_null(Ty)) { + } else if (auto seqTy = mlir::dyn_cast_if_present(Ty)) { return convertSequenceType(seqTy, fileAttr, scope, declOp); - } else if (auto charTy = mlir::dyn_cast_or_null(Ty)) { + } else if (auto charTy = mlir::dyn_cast_if_present(Ty)) { return convertCharacterType(charTy, fileAttr, scope, declOp, /*hasDescriptor=*/false); - } else if (auto recTy = mlir::dyn_cast_or_null(Ty)) { + } else if (auto recTy = mlir::dyn_cast_if_present(Ty)) { return convertRecordType(recTy, fileAttr, scope, declOp); } else if (auto tupleTy = mlir::dyn_cast_if_present(Ty)) { return convertTupleType(tupleTy, fileAttr, scope, declOp); @@ -678,22 +678,22 @@ DebugTypeGenerator::convertType(mlir::Type Ty, 
mlir::LLVM::DIFileAttr fileAttr, return convertPointerLikeType(elTy, fileAttr, scope, declOp, /*genAllocated=*/false, /*genAssociated=*/false); - } else if (auto vecTy = mlir::dyn_cast_or_null(Ty)) { + } else if (auto vecTy = mlir::dyn_cast_if_present(Ty)) { return convertVectorType(vecTy, fileAttr, scope, declOp); } else if (mlir::isa(Ty)) { return genBasicType(context, mlir::StringAttr::get(context, "integer"), llvmTypeConverter.getIndexTypeBitwidth(), llvm::dwarf::DW_ATE_signed); - } else if (auto boxTy = mlir::dyn_cast_or_null(Ty)) { + } else if (auto boxTy = mlir::dyn_cast_if_present(Ty)) { auto elTy = boxTy.getEleTy(); - if (auto seqTy = mlir::dyn_cast_or_null(elTy)) + if (auto seqTy = mlir::dyn_cast_if_present(elTy)) return convertBoxedSequenceType(seqTy, fileAttr, scope, declOp, false, false); - if (auto heapTy = mlir::dyn_cast_or_null(elTy)) + if (auto heapTy = mlir::dyn_cast_if_present(elTy)) return convertPointerLikeType(heapTy.getElementType(), fileAttr, scope, declOp, /*genAllocated=*/true, /*genAssociated=*/false); - if (auto ptrTy = mlir::dyn_cast_or_null(elTy)) + if (auto ptrTy = mlir::dyn_cast_if_present(elTy)) return convertPointerLikeType(ptrTy.getElementType(), fileAttr, scope, declOp, /*genAllocated=*/false, /*genAssociated=*/true); From 450c6b02d224245656c41033cc0c849bde2045f3 Mon Sep 17 00:00:00 2001 From: josel-amd Date: Thu, 2 Jan 2025 11:36:23 +0100 Subject: [PATCH 267/567] [MLIR][SCFToEmitC] Convert types while converting from SCF to EmitC (#118940) Switch from rewrite patterns to conversion patterns. This allows to perform type conversions together with other parts of the IR. For example, this allows to convert from index to emit.size_t types. 
--- .../mlir/Conversion/SCFToEmitC/SCFToEmitC.h | 4 +- mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp | 206 ++++++++++++------ mlir/test/Conversion/SCFToEmitC/for.mlir | 89 +++++++- mlir/test/Conversion/SCFToEmitC/switch.mlir | 9 +- 4 files changed, 228 insertions(+), 80 deletions(-) diff --git a/mlir/include/mlir/Conversion/SCFToEmitC/SCFToEmitC.h b/mlir/include/mlir/Conversion/SCFToEmitC/SCFToEmitC.h index 22df7f1c5dcf2..acc39e6acf726 100644 --- a/mlir/include/mlir/Conversion/SCFToEmitC/SCFToEmitC.h +++ b/mlir/include/mlir/Conversion/SCFToEmitC/SCFToEmitC.h @@ -9,6 +9,7 @@ #ifndef MLIR_CONVERSION_SCFTOEMITC_SCFTOEMITC_H #define MLIR_CONVERSION_SCFTOEMITC_SCFTOEMITC_H +#include "mlir/Transforms/DialectConversion.h" #include namespace mlir { @@ -19,7 +20,8 @@ class RewritePatternSet; #include "mlir/Conversion/Passes.h.inc" /// Collect a set of patterns to convert SCF operations to the EmitC dialect. -void populateSCFToEmitCConversionPatterns(RewritePatternSet &patterns); +void populateSCFToEmitCConversionPatterns(RewritePatternSet &patterns, + TypeConverter &typeConverter); } // namespace mlir #endif // MLIR_CONVERSION_SCFTOEMITC_SCFTOEMITC_H diff --git a/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp b/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp index 67a43c43d608b..92523ca4f12b2 100644 --- a/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp +++ b/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp @@ -14,6 +14,7 @@ #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/EmitC/IR/EmitC.h" +#include "mlir/Dialect/EmitC/Transforms/TypeConversions.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinOps.h" @@ -39,21 +40,22 @@ struct SCFToEmitCPass : public impl::SCFToEmitCBase { // Lower scf::for to emitc::for, implementing result values using // emitc::variable's updated within the loop body. 
-struct ForLowering : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; +struct ForLowering : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite(ForOp forOp, - PatternRewriter &rewriter) const override; + LogicalResult + matchAndRewrite(ForOp forOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override; }; // Create an uninitialized emitc::variable op for each result of the given op. template -static SmallVector createVariablesForResults(T op, - PatternRewriter &rewriter) { - SmallVector resultVariables; - +static LogicalResult +createVariablesForResults(T op, const TypeConverter *typeConverter, + ConversionPatternRewriter &rewriter, + SmallVector &resultVariables) { if (!op.getNumResults()) - return resultVariables; + return success(); Location loc = op->getLoc(); MLIRContext *context = op.getContext(); @@ -62,7 +64,9 @@ static SmallVector createVariablesForResults(T op, rewriter.setInsertionPoint(op); for (OpResult result : op.getResults()) { - Type resultType = result.getType(); + Type resultType = typeConverter->convertType(result.getType()); + if (!resultType) + return rewriter.notifyMatchFailure(op, "result type conversion failed"); Type varType = emitc::LValueType::get(resultType); emitc::OpaqueAttr noInit = emitc::OpaqueAttr::get(context, ""); emitc::VariableOp var = @@ -70,13 +74,13 @@ static SmallVector createVariablesForResults(T op, resultVariables.push_back(var); } - return resultVariables; + return success(); } // Create a series of assign ops assigning given values to given variables at // the current insertion point of given rewriter. 
-static void assignValues(ValueRange values, SmallVector &variables, - PatternRewriter &rewriter, Location loc) { +static void assignValues(ValueRange values, ValueRange variables, + ConversionPatternRewriter &rewriter, Location loc) { for (auto [value, var] : llvm::zip(values, variables)) rewriter.create(loc, var, value); } @@ -89,18 +93,25 @@ SmallVector loadValues(const SmallVector &variables, }); } -static void lowerYield(SmallVector &resultVariables, - PatternRewriter &rewriter, scf::YieldOp yield) { +static LogicalResult lowerYield(Operation *op, ValueRange resultVariables, + ConversionPatternRewriter &rewriter, + scf::YieldOp yield) { Location loc = yield.getLoc(); - ValueRange operands = yield.getOperands(); OpBuilder::InsertionGuard guard(rewriter); rewriter.setInsertionPoint(yield); - assignValues(operands, resultVariables, rewriter, loc); + SmallVector yieldOperands; + if (failed(rewriter.getRemappedValues(yield.getOperands(), yieldOperands))) { + return rewriter.notifyMatchFailure(op, "failed to lower yield operands"); + } + + assignValues(yieldOperands, resultVariables, rewriter, loc); rewriter.create(loc); rewriter.eraseOp(yield); + + return success(); } // Lower the contents of an scf::if/scf::index_switch regions to an @@ -108,27 +119,32 @@ static void lowerYield(SmallVector &resultVariables, // moved into the respective lowered region, but the scf::yield is replaced not // only with an emitc::yield, but also with a sequence of emitc::assign ops that // set the yielded values into the result variables. 
-static void lowerRegion(SmallVector &resultVariables, - PatternRewriter &rewriter, Region ®ion, - Region &loweredRegion) { +static LogicalResult lowerRegion(Operation *op, ValueRange resultVariables, + ConversionPatternRewriter &rewriter, + Region ®ion, Region &loweredRegion) { rewriter.inlineRegionBefore(region, loweredRegion, loweredRegion.end()); Operation *terminator = loweredRegion.back().getTerminator(); - lowerYield(resultVariables, rewriter, cast(terminator)); + return lowerYield(op, resultVariables, rewriter, + cast(terminator)); } -LogicalResult ForLowering::matchAndRewrite(ForOp forOp, - PatternRewriter &rewriter) const { +LogicalResult +ForLowering::matchAndRewrite(ForOp forOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const { Location loc = forOp.getLoc(); // Create an emitc::variable op for each result. These variables will be // assigned to by emitc::assign ops within the loop body. - SmallVector resultVariables = - createVariablesForResults(forOp, rewriter); + SmallVector resultVariables; + if (failed(createVariablesForResults(forOp, getTypeConverter(), rewriter, + resultVariables))) + return rewriter.notifyMatchFailure(forOp, + "create variables for results failed"); - assignValues(forOp.getInits(), resultVariables, rewriter, loc); + assignValues(adaptor.getInitArgs(), resultVariables, rewriter, loc); emitc::ForOp loweredFor = rewriter.create( - loc, forOp.getLowerBound(), forOp.getUpperBound(), forOp.getStep()); + loc, adaptor.getLowerBound(), adaptor.getUpperBound(), adaptor.getStep()); Block *loweredBody = loweredFor.getBody(); @@ -143,13 +159,27 @@ LogicalResult ForLowering::matchAndRewrite(ForOp forOp, rewriter.restoreInsertionPoint(ip); + // Convert the original region types into the new types by adding unrealized + // casts in the beginning of the loop. This performs the conversion in place. 
+ if (failed(rewriter.convertRegionTypes(&forOp.getRegion(), + *getTypeConverter(), nullptr))) { + return rewriter.notifyMatchFailure(forOp, "region types conversion failed"); + } + + // Register the replacements for the block arguments and inline the body of + // the scf.for loop into the body of the emitc::for loop. + Block *scfBody = &(forOp.getRegion().front()); SmallVector replacingValues; replacingValues.push_back(loweredFor.getInductionVar()); replacingValues.append(iterArgsValues.begin(), iterArgsValues.end()); + rewriter.mergeBlocks(scfBody, loweredBody, replacingValues); - rewriter.mergeBlocks(forOp.getBody(), loweredBody, replacingValues); - lowerYield(resultVariables, rewriter, - cast(loweredBody->getTerminator())); + auto result = lowerYield(forOp, resultVariables, rewriter, + cast(loweredBody->getTerminator())); + + if (failed(result)) { + return result; + } // Load variables into SSA values after the for loop. SmallVector resultValues = loadValues(resultVariables, rewriter, loc); @@ -160,38 +190,66 @@ LogicalResult ForLowering::matchAndRewrite(ForOp forOp, // Lower scf::if to emitc::if, implementing result values as emitc::variable's // updated within the then and else regions. -struct IfLowering : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; +struct IfLowering : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite(IfOp ifOp, - PatternRewriter &rewriter) const override; + LogicalResult + matchAndRewrite(IfOp ifOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override; }; } // namespace -LogicalResult IfLowering::matchAndRewrite(IfOp ifOp, - PatternRewriter &rewriter) const { +LogicalResult +IfLowering::matchAndRewrite(IfOp ifOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const { Location loc = ifOp.getLoc(); // Create an emitc::variable op for each result. 
These variables will be // assigned to by emitc::assign ops within the then & else regions. - SmallVector resultVariables = - createVariablesForResults(ifOp, rewriter); - - Region &thenRegion = ifOp.getThenRegion(); - Region &elseRegion = ifOp.getElseRegion(); + SmallVector resultVariables; + if (failed(createVariablesForResults(ifOp, getTypeConverter(), rewriter, + resultVariables))) + return rewriter.notifyMatchFailure(ifOp, + "create variables for results failed"); + + // Utility function to lower the contents of an scf::if region to an emitc::if + // region. The contents of the scf::if regions is moved into the respective + // emitc::if regions, but the scf::yield is replaced not only with an + // emitc::yield, but also with a sequence of emitc::assign ops that set the + // yielded values into the result variables. + auto lowerRegion = [&resultVariables, &rewriter, + &ifOp](Region ®ion, Region &loweredRegion) { + rewriter.inlineRegionBefore(region, loweredRegion, loweredRegion.end()); + Operation *terminator = loweredRegion.back().getTerminator(); + auto result = lowerYield(ifOp, resultVariables, rewriter, + cast(terminator)); + if (failed(result)) { + return result; + } + return success(); + }; + + Region &thenRegion = adaptor.getThenRegion(); + Region &elseRegion = adaptor.getElseRegion(); bool hasElseBlock = !elseRegion.empty(); auto loweredIf = - rewriter.create(loc, ifOp.getCondition(), false, false); + rewriter.create(loc, adaptor.getCondition(), false, false); Region &loweredThenRegion = loweredIf.getThenRegion(); - lowerRegion(resultVariables, rewriter, thenRegion, loweredThenRegion); + auto result = lowerRegion(thenRegion, loweredThenRegion); + if (failed(result)) { + return result; + } if (hasElseBlock) { Region &loweredElseRegion = loweredIf.getElseRegion(); - lowerRegion(resultVariables, rewriter, elseRegion, loweredElseRegion); + auto result = lowerRegion(elseRegion, loweredElseRegion); + if (failed(result)) { + return result; + } } 
rewriter.setInsertionPointAfter(ifOp); @@ -203,37 +261,46 @@ LogicalResult IfLowering::matchAndRewrite(IfOp ifOp, // Lower scf::index_switch to emitc::switch, implementing result values as // emitc::variable's updated within the case and default regions. -struct IndexSwitchOpLowering : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; +struct IndexSwitchOpLowering : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite(IndexSwitchOp indexSwitchOp, - PatternRewriter &rewriter) const override; + LogicalResult + matchAndRewrite(IndexSwitchOp indexSwitchOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override; }; -LogicalResult -IndexSwitchOpLowering::matchAndRewrite(IndexSwitchOp indexSwitchOp, - PatternRewriter &rewriter) const { +LogicalResult IndexSwitchOpLowering::matchAndRewrite( + IndexSwitchOp indexSwitchOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const { Location loc = indexSwitchOp.getLoc(); // Create an emitc::variable op for each result. These variables will be // assigned to by emitc::assign ops within the case and default regions. - SmallVector resultVariables = - createVariablesForResults(indexSwitchOp, rewriter); + SmallVector resultVariables; + if (failed(createVariablesForResults(indexSwitchOp, getTypeConverter(), + rewriter, resultVariables))) { + return rewriter.notifyMatchFailure(indexSwitchOp, + "create variables for results failed"); + } auto loweredSwitch = rewriter.create( - loc, indexSwitchOp.getArg(), indexSwitchOp.getCases(), - indexSwitchOp.getNumCases()); + loc, adaptor.getArg(), adaptor.getCases(), indexSwitchOp.getNumCases()); // Lowering all case regions. 
- for (auto pair : llvm::zip(indexSwitchOp.getCaseRegions(), - loweredSwitch.getCaseRegions())) { - lowerRegion(resultVariables, rewriter, std::get<0>(pair), - std::get<1>(pair)); + for (auto pair : + llvm::zip(adaptor.getCaseRegions(), loweredSwitch.getCaseRegions())) { + if (failed(lowerRegion(indexSwitchOp, resultVariables, rewriter, + *std::get<0>(pair), std::get<1>(pair)))) { + return failure(); + } } // Lowering default region. - lowerRegion(resultVariables, rewriter, indexSwitchOp.getDefaultRegion(), - loweredSwitch.getDefaultRegion()); + if (failed(lowerRegion(indexSwitchOp, resultVariables, rewriter, + adaptor.getDefaultRegion(), + loweredSwitch.getDefaultRegion()))) { + return failure(); + } rewriter.setInsertionPointAfter(indexSwitchOp); SmallVector results = loadValues(resultVariables, rewriter, loc); @@ -242,15 +309,22 @@ IndexSwitchOpLowering::matchAndRewrite(IndexSwitchOp indexSwitchOp, return success(); } -void mlir::populateSCFToEmitCConversionPatterns(RewritePatternSet &patterns) { - patterns.add(patterns.getContext()); - patterns.add(patterns.getContext()); - patterns.add(patterns.getContext()); +void mlir::populateSCFToEmitCConversionPatterns(RewritePatternSet &patterns, + TypeConverter &typeConverter) { + patterns.add(typeConverter, patterns.getContext()); + patterns.add(typeConverter, patterns.getContext()); + patterns.add(typeConverter, patterns.getContext()); } void SCFToEmitCPass::runOnOperation() { RewritePatternSet patterns(&getContext()); - populateSCFToEmitCConversionPatterns(patterns); + TypeConverter typeConverter; + // Fallback converter + // See note https://mlir.llvm.org/docs/DialectConversion/#type-converter + // Type converters are called most to least recently inserted + typeConverter.addConversion([](Type t) { return t; }); + populateEmitCSizeTTypeConversions(typeConverter); + populateSCFToEmitCConversionPatterns(patterns, typeConverter); // Configure conversion to lower out SCF operations. 
ConversionTarget target(getContext()); diff --git a/mlir/test/Conversion/SCFToEmitC/for.mlir b/mlir/test/Conversion/SCFToEmitC/for.mlir index 83592187a9b68..7f41e636936b8 100644 --- a/mlir/test/Conversion/SCFToEmitC/for.mlir +++ b/mlir/test/Conversion/SCFToEmitC/for.mlir @@ -7,8 +7,11 @@ func.func @simple_std_for_loop(%arg0 : index, %arg1 : index, %arg2 : index) { return } // CHECK-LABEL: func.func @simple_std_for_loop( -// CHECK-SAME: %[[VAL_0:.*]]: index, %[[VAL_1:.*]]: index, %[[VAL_2:.*]]: index) { -// CHECK-NEXT: emitc.for %[[VAL_3:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] { +// CHECK-SAME: %[[ARG_0:.*]]: index, %[[ARG_1:.*]]: index, %[[ARG_2:.*]]: index) { +// CHECK-NEXT: %[[VAL_2:.*]] = builtin.unrealized_conversion_cast %[[ARG_2]] : index to !emitc.size_t +// CHECK-NEXT: %[[VAL_1:.*]] = builtin.unrealized_conversion_cast %[[ARG_1]] : index to !emitc.size_t +// CHECK-NEXT: %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t +// CHECK-NEXT: emitc.for %[[VAL_3:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] : !emitc.size_t { // CHECK-NEXT: %[[VAL_4:.*]] = arith.constant 1 : index // CHECK-NEXT: } // CHECK-NEXT: return @@ -24,10 +27,13 @@ func.func @simple_std_2_for_loops(%arg0 : index, %arg1 : index, %arg2 : index) { return } // CHECK-LABEL: func.func @simple_std_2_for_loops( -// CHECK-SAME: %[[VAL_0:.*]]: index, %[[VAL_1:.*]]: index, %[[VAL_2:.*]]: index) { -// CHECK-NEXT: emitc.for %[[VAL_3:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] { +// CHECK-SAME: %[[ARG_0:.*]]: index, %[[ARG_1:.*]]: index, %[[ARG_2:.*]]: index) { +// CHECK-NEXT: %[[VAL_2:.*]] = builtin.unrealized_conversion_cast %[[ARG_2]] : index to !emitc.size_t +// CHECK-NEXT: %[[VAL_1:.*]] = builtin.unrealized_conversion_cast %[[ARG_1]] : index to !emitc.size_t +// CHECK-NEXT: %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t +// CHECK-NEXT: emitc.for %[[VAL_3:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] : 
!emitc.size_t { // CHECK-NEXT: %[[VAL_4:.*]] = arith.constant 1 : index -// CHECK-NEXT: emitc.for %[[VAL_5:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] { +// CHECK-NEXT: emitc.for %[[VAL_5:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] : !emitc.size_t { // CHECK-NEXT: %[[VAL_6:.*]] = arith.constant 1 : index // CHECK-NEXT: } // CHECK-NEXT: } @@ -44,14 +50,17 @@ func.func @for_yield(%arg0 : index, %arg1 : index, %arg2 : index) -> (f32, f32) return %result#0, %result#1 : f32, f32 } // CHECK-LABEL: func.func @for_yield( -// CHECK-SAME: %[[VAL_0:.*]]: index, %[[VAL_1:.*]]: index, %[[VAL_2:.*]]: index) -> (f32, f32) { +// CHECK-SAME: %[[ARG_0:.*]]: index, %[[ARG_1:.*]]: index, %[[ARG_2:.*]]: index) -> (f32, f32) { +// CHECK-NEXT: %[[VAL_2:.*]] = builtin.unrealized_conversion_cast %[[ARG_2]] : index to !emitc.size_t +// CHECK-NEXT: %[[VAL_1:.*]] = builtin.unrealized_conversion_cast %[[ARG_1]] : index to !emitc.size_t +// CHECK-NEXT: %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t // CHECK-NEXT: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 // CHECK-NEXT: %[[VAL_4:.*]] = arith.constant 1.000000e+00 : f32 // CHECK-NEXT: %[[VAL_5:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue // CHECK-NEXT: %[[VAL_6:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue // CHECK-NEXT: emitc.assign %[[VAL_3]] : f32 to %[[VAL_5]] : // CHECK-NEXT: emitc.assign %[[VAL_4]] : f32 to %[[VAL_6]] : -// CHECK-NEXT: emitc.for %[[VAL_7:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] { +// CHECK-NEXT: emitc.for %[[VAL_7:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] : !emitc.size_t { // CHECK-NEXT: %[[VAL_8:.*]] = emitc.load %[[VAL_5]] : // CHECK-NEXT: %[[VAL_9:.*]] = emitc.load %[[VAL_6]] : // CHECK-NEXT: %[[VAL_10:.*]] = arith.addf %[[VAL_8]], %[[VAL_9]] : f32 @@ -75,15 +84,18 @@ func.func @nested_for_yield(%arg0 : index, %arg1 : index, %arg2 : index) -> f32 return %r : f32 } // CHECK-LABEL: func.func 
@nested_for_yield( -// CHECK-SAME: %[[VAL_0:.*]]: index, %[[VAL_1:.*]]: index, %[[VAL_2:.*]]: index) -> f32 { +// CHECK-SAME: %[[ARG_0:.*]]: index, %[[ARG_1:.*]]: index, %[[ARG_2:.*]]: index) -> f32 { +// CHECK-NEXT: %[[VAL_2:.*]] = builtin.unrealized_conversion_cast %[[ARG_2]] : index to !emitc.size_t +// CHECK-NEXT: %[[VAL_1:.*]] = builtin.unrealized_conversion_cast %[[ARG_1]] : index to !emitc.size_t +// CHECK-NEXT: %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t // CHECK-NEXT: %[[VAL_3:.*]] = arith.constant 1.000000e+00 : f32 // CHECK-NEXT: %[[VAL_4:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue // CHECK-NEXT: emitc.assign %[[VAL_3]] : f32 to %[[VAL_4]] : -// CHECK-NEXT: emitc.for %[[VAL_5:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] { +// CHECK-NEXT: emitc.for %[[VAL_5:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] : !emitc.size_t { // CHECK-NEXT: %[[VAL_6:.*]] = emitc.load %[[VAL_4]] : // CHECK-NEXT: %[[VAL_7:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue // CHECK-NEXT: emitc.assign %[[VAL_6]] : f32 to %[[VAL_7]] : -// CHECK-NEXT: emitc.for %[[VAL_8:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] { +// CHECK-NEXT: emitc.for %[[VAL_8:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] : !emitc.size_t { // CHECK-NEXT: %[[VAL_9:.*]] = emitc.load %[[VAL_7]] : // CHECK-NEXT: %[[VAL_10:.*]] = arith.addf %[[VAL_9]], %[[VAL_9]] : f32 // CHECK-NEXT: emitc.assign %[[VAL_10]] : f32 to %[[VAL_7]] : @@ -94,3 +106,60 @@ func.func @nested_for_yield(%arg0 : index, %arg1 : index, %arg2 : index) -> f32 // CHECK-NEXT: %[[VAL_12:.*]] = emitc.load %[[VAL_4]] : // CHECK-NEXT: return %[[VAL_12]] : f32 // CHECK-NEXT: } + +func.func @for_yield_index(%arg0 : index, %arg1 : index, %arg2 : index) -> index { + %zero = arith.constant 0 : index + %r = scf.for %i0 = %arg0 to %arg1 step %arg2 iter_args(%acc = %zero) -> index { + scf.yield %acc : index + } + return %r : index +} + +// 
CHECK-LABEL: func.func @for_yield_index( +// CHECK-SAME: %[[ARG_0:.*]]: index, %[[ARG_1:.*]]: index, %[[ARG_2:.*]]: index) -> index { +// CHECK: %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_2]] : index to !emitc.size_t +// CHECK: %[[VAL_1:.*]] = builtin.unrealized_conversion_cast %[[ARG_1]] : index to !emitc.size_t +// CHECK: %[[VAL_2:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_3:.*]] = builtin.unrealized_conversion_cast %[[C0]] : index to !emitc.size_t +// CHECK: %[[VAL_4:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue +// CHECK: emitc.assign %[[VAL_3]] : !emitc.size_t to %[[VAL_4]] : +// CHECK: emitc.for %[[VAL_5:.*]] = %[[VAL_2]] to %[[VAL_1]] step %[[VAL_0]] : !emitc.size_t { +// CHECK: %[[V:.*]] = emitc.load %[[VAL_4]] : +// CHECK: emitc.assign %[[V]] : !emitc.size_t to %[[VAL_4]] : +// CHECK: } +// CHECK: %[[V2:.*]] = emitc.load %[[VAL_4]] : +// CHECK: %[[VAL_8:.*]] = builtin.unrealized_conversion_cast %[[V2]] : !emitc.size_t to index +// CHECK: return %[[VAL_8]] : index +// CHECK: } + + +func.func @for_yield_update_loop_carried_var(%arg0 : index, %arg1 : index, %arg2 : index) -> index { + %zero = arith.constant 0 : index + %r = scf.for %i0 = %arg0 to %arg1 step %arg2 iter_args(%acc = %zero) -> index { + %sn = arith.addi %acc, %acc : index + scf.yield %sn: index + } + return %r : index + } + +// CHECK-LABEL: func.func @for_yield_update_loop_carried_var( +// CHECK-SAME: %[[ARG_0:.*]]: index, %[[ARG_1:.*]]: index, %[[ARG_2:.*]]: index) -> index { +// CHECK: %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_2]] : index to !emitc.size_t +// CHECK: %[[VAL_1:.*]] = builtin.unrealized_conversion_cast %[[ARG_1]] : index to !emitc.size_t +// CHECK: %[[VAL_2:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_3:.*]] = 
builtin.unrealized_conversion_cast %[[C0]] : index to !emitc.size_t +// CHECK: %[[VAL_4:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue +// CHECK: emitc.assign %[[VAL_3]] : !emitc.size_t to %[[VAL_4]] : +// CHECK: emitc.for %[[ARG_3:.*]] = %[[VAL_2]] to %[[VAL_1]] step %[[VAL_0]] : !emitc.size_t { +// CHECK: %[[V:.*]] = emitc.load %[[VAL_4]] : +// CHECK: %[[VAL_5:.*]] = builtin.unrealized_conversion_cast %[[V]] : !emitc.size_t to index +// CHECK: %[[VAL_6:.*]] = arith.addi %[[VAL_5]], %[[VAL_5]] : index +// CHECK: %[[VAL_8:.*]] = builtin.unrealized_conversion_cast %[[VAL_6]] : index to !emitc.size_t +// CHECK: emitc.assign %[[VAL_8]] : !emitc.size_t to %[[VAL_4]] : +// CHECK: } +// CHECK: %[[V2:.*]] = emitc.load %[[VAL_4]] : +// CHECK: %[[VAL_9:.*]] = builtin.unrealized_conversion_cast %[[V2]] : !emitc.size_t to index +// CHECK: return %[[VAL_9]] : index +// CHECK: } diff --git a/mlir/test/Conversion/SCFToEmitC/switch.mlir b/mlir/test/Conversion/SCFToEmitC/switch.mlir index 86d96ed21f1b5..61015b0ae483b 100644 --- a/mlir/test/Conversion/SCFToEmitC/switch.mlir +++ b/mlir/test/Conversion/SCFToEmitC/switch.mlir @@ -1,7 +1,8 @@ // RUN: mlir-opt -allow-unregistered-dialect -convert-scf-to-emitc %s | FileCheck %s // CHECK-LABEL: func.func @switch_no_result( -// CHECK-SAME: %[[VAL_0:.*]]: index) { +// CHECK-SAME: %[[ARG_0:.*]]: index) { +// CHECK: %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t // CHECK: emitc.switch %[[VAL_0]] // CHECK: case 2 { // CHECK: %[[VAL_1:.*]] = arith.constant 10 : i32 @@ -33,7 +34,8 @@ func.func @switch_no_result(%arg0 : index) { } // CHECK-LABEL: func.func @switch_one_result( -// CHECK-SAME: %[[VAL_0:.*]]: index) { +// CHECK-SAME: %[[ARG_0:.*]]: index) { +// CHECK: %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t // CHECK: %[[VAL_1:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue // CHECK: emitc.switch 
%[[VAL_0]] // CHECK: case 2 { @@ -70,7 +72,8 @@ func.func @switch_one_result(%arg0 : index) { } // CHECK-LABEL: func.func @switch_two_results( -// CHECK-SAME: %[[VAL_0:.*]]: index) -> (i32, f32) { +// CHECK-SAME: %[[ARG_0:.*]]: index) -> (i32, f32) { +// CHECK: %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t // CHECK: %[[VAL_1:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue // CHECK: %[[VAL_2:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue // CHECK: emitc.switch %[[VAL_0]] From df728cf1d7959e214af68dbf4d6e3750fc7b5b13 Mon Sep 17 00:00:00 2001 From: Matthias Gehre Date: Thu, 2 Jan 2025 11:55:35 +0100 Subject: [PATCH 268/567] Revert "[MLIR][SCFToEmitC] Convert types while converting from SCF to EmitC (#118940)" This reverts commit 450c6b02d224245656c41033cc0c849bde2045f3. --- .../mlir/Conversion/SCFToEmitC/SCFToEmitC.h | 4 +- mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp | 206 ++++++------------ mlir/test/Conversion/SCFToEmitC/for.mlir | 89 +------- mlir/test/Conversion/SCFToEmitC/switch.mlir | 9 +- 4 files changed, 80 insertions(+), 228 deletions(-) diff --git a/mlir/include/mlir/Conversion/SCFToEmitC/SCFToEmitC.h b/mlir/include/mlir/Conversion/SCFToEmitC/SCFToEmitC.h index acc39e6acf726..22df7f1c5dcf2 100644 --- a/mlir/include/mlir/Conversion/SCFToEmitC/SCFToEmitC.h +++ b/mlir/include/mlir/Conversion/SCFToEmitC/SCFToEmitC.h @@ -9,7 +9,6 @@ #ifndef MLIR_CONVERSION_SCFTOEMITC_SCFTOEMITC_H #define MLIR_CONVERSION_SCFTOEMITC_SCFTOEMITC_H -#include "mlir/Transforms/DialectConversion.h" #include namespace mlir { @@ -20,8 +19,7 @@ class RewritePatternSet; #include "mlir/Conversion/Passes.h.inc" /// Collect a set of patterns to convert SCF operations to the EmitC dialect. 
-void populateSCFToEmitCConversionPatterns(RewritePatternSet &patterns, - TypeConverter &typeConverter); +void populateSCFToEmitCConversionPatterns(RewritePatternSet &patterns); } // namespace mlir #endif // MLIR_CONVERSION_SCFTOEMITC_SCFTOEMITC_H diff --git a/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp b/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp index 92523ca4f12b2..67a43c43d608b 100644 --- a/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp +++ b/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp @@ -14,7 +14,6 @@ #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/EmitC/IR/EmitC.h" -#include "mlir/Dialect/EmitC/Transforms/TypeConversions.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinOps.h" @@ -40,22 +39,21 @@ struct SCFToEmitCPass : public impl::SCFToEmitCBase { // Lower scf::for to emitc::for, implementing result values using // emitc::variable's updated within the loop body. -struct ForLowering : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; +struct ForLowering : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; - LogicalResult - matchAndRewrite(ForOp forOp, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override; + LogicalResult matchAndRewrite(ForOp forOp, + PatternRewriter &rewriter) const override; }; // Create an uninitialized emitc::variable op for each result of the given op. 
template -static LogicalResult -createVariablesForResults(T op, const TypeConverter *typeConverter, - ConversionPatternRewriter &rewriter, - SmallVector &resultVariables) { +static SmallVector createVariablesForResults(T op, + PatternRewriter &rewriter) { + SmallVector resultVariables; + if (!op.getNumResults()) - return success(); + return resultVariables; Location loc = op->getLoc(); MLIRContext *context = op.getContext(); @@ -64,9 +62,7 @@ createVariablesForResults(T op, const TypeConverter *typeConverter, rewriter.setInsertionPoint(op); for (OpResult result : op.getResults()) { - Type resultType = typeConverter->convertType(result.getType()); - if (!resultType) - return rewriter.notifyMatchFailure(op, "result type conversion failed"); + Type resultType = result.getType(); Type varType = emitc::LValueType::get(resultType); emitc::OpaqueAttr noInit = emitc::OpaqueAttr::get(context, ""); emitc::VariableOp var = @@ -74,13 +70,13 @@ createVariablesForResults(T op, const TypeConverter *typeConverter, resultVariables.push_back(var); } - return success(); + return resultVariables; } // Create a series of assign ops assigning given values to given variables at // the current insertion point of given rewriter. 
-static void assignValues(ValueRange values, ValueRange variables, - ConversionPatternRewriter &rewriter, Location loc) { +static void assignValues(ValueRange values, SmallVector &variables, + PatternRewriter &rewriter, Location loc) { for (auto [value, var] : llvm::zip(values, variables)) rewriter.create(loc, var, value); } @@ -93,25 +89,18 @@ SmallVector loadValues(const SmallVector &variables, }); } -static LogicalResult lowerYield(Operation *op, ValueRange resultVariables, - ConversionPatternRewriter &rewriter, - scf::YieldOp yield) { +static void lowerYield(SmallVector &resultVariables, + PatternRewriter &rewriter, scf::YieldOp yield) { Location loc = yield.getLoc(); + ValueRange operands = yield.getOperands(); OpBuilder::InsertionGuard guard(rewriter); rewriter.setInsertionPoint(yield); - SmallVector yieldOperands; - if (failed(rewriter.getRemappedValues(yield.getOperands(), yieldOperands))) { - return rewriter.notifyMatchFailure(op, "failed to lower yield operands"); - } - - assignValues(yieldOperands, resultVariables, rewriter, loc); + assignValues(operands, resultVariables, rewriter, loc); rewriter.create(loc); rewriter.eraseOp(yield); - - return success(); } // Lower the contents of an scf::if/scf::index_switch regions to an @@ -119,32 +108,27 @@ static LogicalResult lowerYield(Operation *op, ValueRange resultVariables, // moved into the respective lowered region, but the scf::yield is replaced not // only with an emitc::yield, but also with a sequence of emitc::assign ops that // set the yielded values into the result variables. 
-static LogicalResult lowerRegion(Operation *op, ValueRange resultVariables, - ConversionPatternRewriter &rewriter, - Region ®ion, Region &loweredRegion) { +static void lowerRegion(SmallVector &resultVariables, + PatternRewriter &rewriter, Region ®ion, + Region &loweredRegion) { rewriter.inlineRegionBefore(region, loweredRegion, loweredRegion.end()); Operation *terminator = loweredRegion.back().getTerminator(); - return lowerYield(op, resultVariables, rewriter, - cast(terminator)); + lowerYield(resultVariables, rewriter, cast(terminator)); } -LogicalResult -ForLowering::matchAndRewrite(ForOp forOp, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const { +LogicalResult ForLowering::matchAndRewrite(ForOp forOp, + PatternRewriter &rewriter) const { Location loc = forOp.getLoc(); // Create an emitc::variable op for each result. These variables will be // assigned to by emitc::assign ops within the loop body. - SmallVector resultVariables; - if (failed(createVariablesForResults(forOp, getTypeConverter(), rewriter, - resultVariables))) - return rewriter.notifyMatchFailure(forOp, - "create variables for results failed"); + SmallVector resultVariables = + createVariablesForResults(forOp, rewriter); - assignValues(adaptor.getInitArgs(), resultVariables, rewriter, loc); + assignValues(forOp.getInits(), resultVariables, rewriter, loc); emitc::ForOp loweredFor = rewriter.create( - loc, adaptor.getLowerBound(), adaptor.getUpperBound(), adaptor.getStep()); + loc, forOp.getLowerBound(), forOp.getUpperBound(), forOp.getStep()); Block *loweredBody = loweredFor.getBody(); @@ -159,27 +143,13 @@ ForLowering::matchAndRewrite(ForOp forOp, OpAdaptor adaptor, rewriter.restoreInsertionPoint(ip); - // Convert the original region types into the new types by adding unrealized - // casts in the beginning of the loop. This performs the conversion in place. 
- if (failed(rewriter.convertRegionTypes(&forOp.getRegion(), - *getTypeConverter(), nullptr))) { - return rewriter.notifyMatchFailure(forOp, "region types conversion failed"); - } - - // Register the replacements for the block arguments and inline the body of - // the scf.for loop into the body of the emitc::for loop. - Block *scfBody = &(forOp.getRegion().front()); SmallVector replacingValues; replacingValues.push_back(loweredFor.getInductionVar()); replacingValues.append(iterArgsValues.begin(), iterArgsValues.end()); - rewriter.mergeBlocks(scfBody, loweredBody, replacingValues); - auto result = lowerYield(forOp, resultVariables, rewriter, - cast(loweredBody->getTerminator())); - - if (failed(result)) { - return result; - } + rewriter.mergeBlocks(forOp.getBody(), loweredBody, replacingValues); + lowerYield(resultVariables, rewriter, + cast(loweredBody->getTerminator())); // Load variables into SSA values after the for loop. SmallVector resultValues = loadValues(resultVariables, rewriter, loc); @@ -190,66 +160,38 @@ ForLowering::matchAndRewrite(ForOp forOp, OpAdaptor adaptor, // Lower scf::if to emitc::if, implementing result values as emitc::variable's // updated within the then and else regions. -struct IfLowering : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; +struct IfLowering : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; - LogicalResult - matchAndRewrite(IfOp ifOp, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override; + LogicalResult matchAndRewrite(IfOp ifOp, + PatternRewriter &rewriter) const override; }; } // namespace -LogicalResult -IfLowering::matchAndRewrite(IfOp ifOp, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const { +LogicalResult IfLowering::matchAndRewrite(IfOp ifOp, + PatternRewriter &rewriter) const { Location loc = ifOp.getLoc(); // Create an emitc::variable op for each result. 
These variables will be // assigned to by emitc::assign ops within the then & else regions. - SmallVector resultVariables; - if (failed(createVariablesForResults(ifOp, getTypeConverter(), rewriter, - resultVariables))) - return rewriter.notifyMatchFailure(ifOp, - "create variables for results failed"); - - // Utility function to lower the contents of an scf::if region to an emitc::if - // region. The contents of the scf::if regions is moved into the respective - // emitc::if regions, but the scf::yield is replaced not only with an - // emitc::yield, but also with a sequence of emitc::assign ops that set the - // yielded values into the result variables. - auto lowerRegion = [&resultVariables, &rewriter, - &ifOp](Region ®ion, Region &loweredRegion) { - rewriter.inlineRegionBefore(region, loweredRegion, loweredRegion.end()); - Operation *terminator = loweredRegion.back().getTerminator(); - auto result = lowerYield(ifOp, resultVariables, rewriter, - cast(terminator)); - if (failed(result)) { - return result; - } - return success(); - }; - - Region &thenRegion = adaptor.getThenRegion(); - Region &elseRegion = adaptor.getElseRegion(); + SmallVector resultVariables = + createVariablesForResults(ifOp, rewriter); + + Region &thenRegion = ifOp.getThenRegion(); + Region &elseRegion = ifOp.getElseRegion(); bool hasElseBlock = !elseRegion.empty(); auto loweredIf = - rewriter.create(loc, adaptor.getCondition(), false, false); + rewriter.create(loc, ifOp.getCondition(), false, false); Region &loweredThenRegion = loweredIf.getThenRegion(); - auto result = lowerRegion(thenRegion, loweredThenRegion); - if (failed(result)) { - return result; - } + lowerRegion(resultVariables, rewriter, thenRegion, loweredThenRegion); if (hasElseBlock) { Region &loweredElseRegion = loweredIf.getElseRegion(); - auto result = lowerRegion(elseRegion, loweredElseRegion); - if (failed(result)) { - return result; - } + lowerRegion(resultVariables, rewriter, elseRegion, loweredElseRegion); } 
rewriter.setInsertionPointAfter(ifOp); @@ -261,46 +203,37 @@ IfLowering::matchAndRewrite(IfOp ifOp, OpAdaptor adaptor, // Lower scf::index_switch to emitc::switch, implementing result values as // emitc::variable's updated within the case and default regions. -struct IndexSwitchOpLowering : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; +struct IndexSwitchOpLowering : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; - LogicalResult - matchAndRewrite(IndexSwitchOp indexSwitchOp, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override; + LogicalResult matchAndRewrite(IndexSwitchOp indexSwitchOp, + PatternRewriter &rewriter) const override; }; -LogicalResult IndexSwitchOpLowering::matchAndRewrite( - IndexSwitchOp indexSwitchOp, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const { +LogicalResult +IndexSwitchOpLowering::matchAndRewrite(IndexSwitchOp indexSwitchOp, + PatternRewriter &rewriter) const { Location loc = indexSwitchOp.getLoc(); // Create an emitc::variable op for each result. These variables will be // assigned to by emitc::assign ops within the case and default regions. - SmallVector resultVariables; - if (failed(createVariablesForResults(indexSwitchOp, getTypeConverter(), - rewriter, resultVariables))) { - return rewriter.notifyMatchFailure(indexSwitchOp, - "create variables for results failed"); - } + SmallVector resultVariables = + createVariablesForResults(indexSwitchOp, rewriter); auto loweredSwitch = rewriter.create( - loc, adaptor.getArg(), adaptor.getCases(), indexSwitchOp.getNumCases()); + loc, indexSwitchOp.getArg(), indexSwitchOp.getCases(), + indexSwitchOp.getNumCases()); // Lowering all case regions. 
- for (auto pair : - llvm::zip(adaptor.getCaseRegions(), loweredSwitch.getCaseRegions())) { - if (failed(lowerRegion(indexSwitchOp, resultVariables, rewriter, - *std::get<0>(pair), std::get<1>(pair)))) { - return failure(); - } + for (auto pair : llvm::zip(indexSwitchOp.getCaseRegions(), + loweredSwitch.getCaseRegions())) { + lowerRegion(resultVariables, rewriter, std::get<0>(pair), + std::get<1>(pair)); } // Lowering default region. - if (failed(lowerRegion(indexSwitchOp, resultVariables, rewriter, - adaptor.getDefaultRegion(), - loweredSwitch.getDefaultRegion()))) { - return failure(); - } + lowerRegion(resultVariables, rewriter, indexSwitchOp.getDefaultRegion(), + loweredSwitch.getDefaultRegion()); rewriter.setInsertionPointAfter(indexSwitchOp); SmallVector results = loadValues(resultVariables, rewriter, loc); @@ -309,22 +242,15 @@ LogicalResult IndexSwitchOpLowering::matchAndRewrite( return success(); } -void mlir::populateSCFToEmitCConversionPatterns(RewritePatternSet &patterns, - TypeConverter &typeConverter) { - patterns.add(typeConverter, patterns.getContext()); - patterns.add(typeConverter, patterns.getContext()); - patterns.add(typeConverter, patterns.getContext()); +void mlir::populateSCFToEmitCConversionPatterns(RewritePatternSet &patterns) { + patterns.add(patterns.getContext()); + patterns.add(patterns.getContext()); + patterns.add(patterns.getContext()); } void SCFToEmitCPass::runOnOperation() { RewritePatternSet patterns(&getContext()); - TypeConverter typeConverter; - // Fallback converter - // See note https://mlir.llvm.org/docs/DialectConversion/#type-converter - // Type converters are called most to least recently inserted - typeConverter.addConversion([](Type t) { return t; }); - populateEmitCSizeTTypeConversions(typeConverter); - populateSCFToEmitCConversionPatterns(patterns, typeConverter); + populateSCFToEmitCConversionPatterns(patterns); // Configure conversion to lower out SCF operations. 
ConversionTarget target(getContext()); diff --git a/mlir/test/Conversion/SCFToEmitC/for.mlir b/mlir/test/Conversion/SCFToEmitC/for.mlir index 7f41e636936b8..83592187a9b68 100644 --- a/mlir/test/Conversion/SCFToEmitC/for.mlir +++ b/mlir/test/Conversion/SCFToEmitC/for.mlir @@ -7,11 +7,8 @@ func.func @simple_std_for_loop(%arg0 : index, %arg1 : index, %arg2 : index) { return } // CHECK-LABEL: func.func @simple_std_for_loop( -// CHECK-SAME: %[[ARG_0:.*]]: index, %[[ARG_1:.*]]: index, %[[ARG_2:.*]]: index) { -// CHECK-NEXT: %[[VAL_2:.*]] = builtin.unrealized_conversion_cast %[[ARG_2]] : index to !emitc.size_t -// CHECK-NEXT: %[[VAL_1:.*]] = builtin.unrealized_conversion_cast %[[ARG_1]] : index to !emitc.size_t -// CHECK-NEXT: %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t -// CHECK-NEXT: emitc.for %[[VAL_3:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] : !emitc.size_t { +// CHECK-SAME: %[[VAL_0:.*]]: index, %[[VAL_1:.*]]: index, %[[VAL_2:.*]]: index) { +// CHECK-NEXT: emitc.for %[[VAL_3:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] { // CHECK-NEXT: %[[VAL_4:.*]] = arith.constant 1 : index // CHECK-NEXT: } // CHECK-NEXT: return @@ -27,13 +24,10 @@ func.func @simple_std_2_for_loops(%arg0 : index, %arg1 : index, %arg2 : index) { return } // CHECK-LABEL: func.func @simple_std_2_for_loops( -// CHECK-SAME: %[[ARG_0:.*]]: index, %[[ARG_1:.*]]: index, %[[ARG_2:.*]]: index) { -// CHECK-NEXT: %[[VAL_2:.*]] = builtin.unrealized_conversion_cast %[[ARG_2]] : index to !emitc.size_t -// CHECK-NEXT: %[[VAL_1:.*]] = builtin.unrealized_conversion_cast %[[ARG_1]] : index to !emitc.size_t -// CHECK-NEXT: %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t -// CHECK-NEXT: emitc.for %[[VAL_3:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] : !emitc.size_t { +// CHECK-SAME: %[[VAL_0:.*]]: index, %[[VAL_1:.*]]: index, %[[VAL_2:.*]]: index) { +// CHECK-NEXT: emitc.for %[[VAL_3:.*]] = %[[VAL_0]] to %[[VAL_1]] step 
%[[VAL_2]] { // CHECK-NEXT: %[[VAL_4:.*]] = arith.constant 1 : index -// CHECK-NEXT: emitc.for %[[VAL_5:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] : !emitc.size_t { +// CHECK-NEXT: emitc.for %[[VAL_5:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] { // CHECK-NEXT: %[[VAL_6:.*]] = arith.constant 1 : index // CHECK-NEXT: } // CHECK-NEXT: } @@ -50,17 +44,14 @@ func.func @for_yield(%arg0 : index, %arg1 : index, %arg2 : index) -> (f32, f32) return %result#0, %result#1 : f32, f32 } // CHECK-LABEL: func.func @for_yield( -// CHECK-SAME: %[[ARG_0:.*]]: index, %[[ARG_1:.*]]: index, %[[ARG_2:.*]]: index) -> (f32, f32) { -// CHECK-NEXT: %[[VAL_2:.*]] = builtin.unrealized_conversion_cast %[[ARG_2]] : index to !emitc.size_t -// CHECK-NEXT: %[[VAL_1:.*]] = builtin.unrealized_conversion_cast %[[ARG_1]] : index to !emitc.size_t -// CHECK-NEXT: %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t +// CHECK-SAME: %[[VAL_0:.*]]: index, %[[VAL_1:.*]]: index, %[[VAL_2:.*]]: index) -> (f32, f32) { // CHECK-NEXT: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 // CHECK-NEXT: %[[VAL_4:.*]] = arith.constant 1.000000e+00 : f32 // CHECK-NEXT: %[[VAL_5:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue // CHECK-NEXT: %[[VAL_6:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue // CHECK-NEXT: emitc.assign %[[VAL_3]] : f32 to %[[VAL_5]] : // CHECK-NEXT: emitc.assign %[[VAL_4]] : f32 to %[[VAL_6]] : -// CHECK-NEXT: emitc.for %[[VAL_7:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] : !emitc.size_t { +// CHECK-NEXT: emitc.for %[[VAL_7:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] { // CHECK-NEXT: %[[VAL_8:.*]] = emitc.load %[[VAL_5]] : // CHECK-NEXT: %[[VAL_9:.*]] = emitc.load %[[VAL_6]] : // CHECK-NEXT: %[[VAL_10:.*]] = arith.addf %[[VAL_8]], %[[VAL_9]] : f32 @@ -84,18 +75,15 @@ func.func @nested_for_yield(%arg0 : index, %arg1 : index, %arg2 : index) -> f32 return %r : f32 } // CHECK-LABEL: func.func 
@nested_for_yield( -// CHECK-SAME: %[[ARG_0:.*]]: index, %[[ARG_1:.*]]: index, %[[ARG_2:.*]]: index) -> f32 { -// CHECK-NEXT: %[[VAL_2:.*]] = builtin.unrealized_conversion_cast %[[ARG_2]] : index to !emitc.size_t -// CHECK-NEXT: %[[VAL_1:.*]] = builtin.unrealized_conversion_cast %[[ARG_1]] : index to !emitc.size_t -// CHECK-NEXT: %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t +// CHECK-SAME: %[[VAL_0:.*]]: index, %[[VAL_1:.*]]: index, %[[VAL_2:.*]]: index) -> f32 { // CHECK-NEXT: %[[VAL_3:.*]] = arith.constant 1.000000e+00 : f32 // CHECK-NEXT: %[[VAL_4:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue // CHECK-NEXT: emitc.assign %[[VAL_3]] : f32 to %[[VAL_4]] : -// CHECK-NEXT: emitc.for %[[VAL_5:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] : !emitc.size_t { +// CHECK-NEXT: emitc.for %[[VAL_5:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] { // CHECK-NEXT: %[[VAL_6:.*]] = emitc.load %[[VAL_4]] : // CHECK-NEXT: %[[VAL_7:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue // CHECK-NEXT: emitc.assign %[[VAL_6]] : f32 to %[[VAL_7]] : -// CHECK-NEXT: emitc.for %[[VAL_8:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] : !emitc.size_t { +// CHECK-NEXT: emitc.for %[[VAL_8:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] { // CHECK-NEXT: %[[VAL_9:.*]] = emitc.load %[[VAL_7]] : // CHECK-NEXT: %[[VAL_10:.*]] = arith.addf %[[VAL_9]], %[[VAL_9]] : f32 // CHECK-NEXT: emitc.assign %[[VAL_10]] : f32 to %[[VAL_7]] : @@ -106,60 +94,3 @@ func.func @nested_for_yield(%arg0 : index, %arg1 : index, %arg2 : index) -> f32 // CHECK-NEXT: %[[VAL_12:.*]] = emitc.load %[[VAL_4]] : // CHECK-NEXT: return %[[VAL_12]] : f32 // CHECK-NEXT: } - -func.func @for_yield_index(%arg0 : index, %arg1 : index, %arg2 : index) -> index { - %zero = arith.constant 0 : index - %r = scf.for %i0 = %arg0 to %arg1 step %arg2 iter_args(%acc = %zero) -> index { - scf.yield %acc : index - } - return %r : index -} - -// 
CHECK-LABEL: func.func @for_yield_index( -// CHECK-SAME: %[[ARG_0:.*]]: index, %[[ARG_1:.*]]: index, %[[ARG_2:.*]]: index) -> index { -// CHECK: %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_2]] : index to !emitc.size_t -// CHECK: %[[VAL_1:.*]] = builtin.unrealized_conversion_cast %[[ARG_1]] : index to !emitc.size_t -// CHECK: %[[VAL_2:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t -// CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_3:.*]] = builtin.unrealized_conversion_cast %[[C0]] : index to !emitc.size_t -// CHECK: %[[VAL_4:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue -// CHECK: emitc.assign %[[VAL_3]] : !emitc.size_t to %[[VAL_4]] : -// CHECK: emitc.for %[[VAL_5:.*]] = %[[VAL_2]] to %[[VAL_1]] step %[[VAL_0]] : !emitc.size_t { -// CHECK: %[[V:.*]] = emitc.load %[[VAL_4]] : -// CHECK: emitc.assign %[[V]] : !emitc.size_t to %[[VAL_4]] : -// CHECK: } -// CHECK: %[[V2:.*]] = emitc.load %[[VAL_4]] : -// CHECK: %[[VAL_8:.*]] = builtin.unrealized_conversion_cast %[[V2]] : !emitc.size_t to index -// CHECK: return %[[VAL_8]] : index -// CHECK: } - - -func.func @for_yield_update_loop_carried_var(%arg0 : index, %arg1 : index, %arg2 : index) -> index { - %zero = arith.constant 0 : index - %r = scf.for %i0 = %arg0 to %arg1 step %arg2 iter_args(%acc = %zero) -> index { - %sn = arith.addi %acc, %acc : index - scf.yield %sn: index - } - return %r : index - } - -// CHECK-LABEL: func.func @for_yield_update_loop_carried_var( -// CHECK-SAME: %[[ARG_0:.*]]: index, %[[ARG_1:.*]]: index, %[[ARG_2:.*]]: index) -> index { -// CHECK: %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_2]] : index to !emitc.size_t -// CHECK: %[[VAL_1:.*]] = builtin.unrealized_conversion_cast %[[ARG_1]] : index to !emitc.size_t -// CHECK: %[[VAL_2:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t -// CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_3:.*]] = 
builtin.unrealized_conversion_cast %[[C0]] : index to !emitc.size_t -// CHECK: %[[VAL_4:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue -// CHECK: emitc.assign %[[VAL_3]] : !emitc.size_t to %[[VAL_4]] : -// CHECK: emitc.for %[[ARG_3:.*]] = %[[VAL_2]] to %[[VAL_1]] step %[[VAL_0]] : !emitc.size_t { -// CHECK: %[[V:.*]] = emitc.load %[[VAL_4]] : -// CHECK: %[[VAL_5:.*]] = builtin.unrealized_conversion_cast %[[V]] : !emitc.size_t to index -// CHECK: %[[VAL_6:.*]] = arith.addi %[[VAL_5]], %[[VAL_5]] : index -// CHECK: %[[VAL_8:.*]] = builtin.unrealized_conversion_cast %[[VAL_6]] : index to !emitc.size_t -// CHECK: emitc.assign %[[VAL_8]] : !emitc.size_t to %[[VAL_4]] : -// CHECK: } -// CHECK: %[[V2:.*]] = emitc.load %[[VAL_4]] : -// CHECK: %[[VAL_9:.*]] = builtin.unrealized_conversion_cast %[[V2]] : !emitc.size_t to index -// CHECK: return %[[VAL_9]] : index -// CHECK: } diff --git a/mlir/test/Conversion/SCFToEmitC/switch.mlir b/mlir/test/Conversion/SCFToEmitC/switch.mlir index 61015b0ae483b..86d96ed21f1b5 100644 --- a/mlir/test/Conversion/SCFToEmitC/switch.mlir +++ b/mlir/test/Conversion/SCFToEmitC/switch.mlir @@ -1,8 +1,7 @@ // RUN: mlir-opt -allow-unregistered-dialect -convert-scf-to-emitc %s | FileCheck %s // CHECK-LABEL: func.func @switch_no_result( -// CHECK-SAME: %[[ARG_0:.*]]: index) { -// CHECK: %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t +// CHECK-SAME: %[[VAL_0:.*]]: index) { // CHECK: emitc.switch %[[VAL_0]] // CHECK: case 2 { // CHECK: %[[VAL_1:.*]] = arith.constant 10 : i32 @@ -34,8 +33,7 @@ func.func @switch_no_result(%arg0 : index) { } // CHECK-LABEL: func.func @switch_one_result( -// CHECK-SAME: %[[ARG_0:.*]]: index) { -// CHECK: %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t +// CHECK-SAME: %[[VAL_0:.*]]: index) { // CHECK: %[[VAL_1:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue // CHECK: emitc.switch 
%[[VAL_0]] // CHECK: case 2 { @@ -72,8 +70,7 @@ func.func @switch_one_result(%arg0 : index) { } // CHECK-LABEL: func.func @switch_two_results( -// CHECK-SAME: %[[ARG_0:.*]]: index) -> (i32, f32) { -// CHECK: %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t +// CHECK-SAME: %[[VAL_0:.*]]: index) -> (i32, f32) { // CHECK: %[[VAL_1:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue // CHECK: %[[VAL_2:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue // CHECK: emitc.switch %[[VAL_0]] From 76a4c4593ba9f8cad894d80ea75687cd86be3cdd Mon Sep 17 00:00:00 2001 From: Rajat Bajpai Date: Thu, 2 Jan 2025 16:33:10 +0530 Subject: [PATCH 269/567] [InstCombine] Fix constant swap case of fcmp + fadd + sel xfrm (#119419) The fcmp + fadd + sel => fcmp + sel + fadd xfrm performs incorrect transformation when select branch values are swapped. This change fixes this. --- .../InstCombine/InstCombineSelect.cpp | 44 +++++++++++-------- .../InstCombine/fcmp-fadd-select.ll | 16 +++---- 2 files changed, 34 insertions(+), 26 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 3d251d662bd53..e7a8e947705f8 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -3769,22 +3769,9 @@ static Value *foldSelectIntoAddConstant(SelectInst &SI, if (!SIFOp || !SIFOp->hasNoSignedZeros() || !SIFOp->hasNoNaNs()) return nullptr; - // select((fcmp Pred, X, 0), (fadd X, C), C) - // => fadd((select (fcmp Pred, X, 0), X, 0), C) - // - // Pred := OGT, OGE, OLT, OLE, UGT, UGE, ULT, and ULE - Instruction *FAdd; - Constant *C; - Value *X, *Z; - CmpPredicate Pred; - - // Note: OneUse check for `Cmp` is necessary because it makes sure that other - // InstCombine folds don't undo this transformation and cause an infinite - // loop. 
Furthermore, it could also increase the operation count. - if (match(&SI, m_Select(m_OneUse(m_FCmp(Pred, m_Value(X), m_Value(Z))), - m_OneUse(m_Instruction(FAdd)), m_Constant(C))) || - match(&SI, m_Select(m_OneUse(m_FCmp(Pred, m_Value(X), m_Value(Z))), - m_Constant(C), m_OneUse(m_Instruction(FAdd))))) { + auto TryFoldIntoAddConstant = + [&Builder, &SI](CmpInst::Predicate Pred, Value *X, Value *Z, + Instruction *FAdd, Constant *C, bool Swapped) -> Value * { // Only these relational predicates can be transformed into maxnum/minnum // intrinsic. if (!CmpInst::isRelational(Pred) || !match(Z, m_AnyZeroFP())) @@ -3793,7 +3780,8 @@ static Value *foldSelectIntoAddConstant(SelectInst &SI, if (!match(FAdd, m_FAdd(m_Specific(X), m_Specific(C)))) return nullptr; - Value *NewSelect = Builder.CreateSelect(SI.getCondition(), X, Z, "", &SI); + Value *NewSelect = Builder.CreateSelect(SI.getCondition(), Swapped ? Z : X, + Swapped ? X : Z, "", &SI); NewSelect->takeName(&SI); Value *NewFAdd = Builder.CreateFAdd(NewSelect, C); @@ -3808,7 +3796,27 @@ static Value *foldSelectIntoAddConstant(SelectInst &SI, cast(NewSelect)->setFastMathFlags(NewFMF); return NewFAdd; - } + }; + + // select((fcmp Pred, X, 0), (fadd X, C), C) + // => fadd((select (fcmp Pred, X, 0), X, 0), C) + // + // Pred := OGT, OGE, OLT, OLE, UGT, UGE, ULT, and ULE + Instruction *FAdd; + Constant *C; + Value *X, *Z; + CmpPredicate Pred; + + // Note: OneUse check for `Cmp` is necessary because it makes sure that other + // InstCombine folds don't undo this transformation and cause an infinite + // loop. Furthermore, it could also increase the operation count. 
+ if (match(&SI, m_Select(m_OneUse(m_FCmp(Pred, m_Value(X), m_Value(Z))), + m_OneUse(m_Instruction(FAdd)), m_Constant(C)))) + return TryFoldIntoAddConstant(Pred, X, Z, FAdd, C, /*Swapped=*/false); + + if (match(&SI, m_Select(m_OneUse(m_FCmp(Pred, m_Value(X), m_Value(Z))), + m_Constant(C), m_OneUse(m_Instruction(FAdd))))) + return TryFoldIntoAddConstant(Pred, X, Z, FAdd, C, /*Swapped=*/true); return nullptr; } diff --git a/llvm/test/Transforms/InstCombine/fcmp-fadd-select.ll b/llvm/test/Transforms/InstCombine/fcmp-fadd-select.ll index 0d0af91608e7a..15fad55db8df1 100644 --- a/llvm/test/Transforms/InstCombine/fcmp-fadd-select.ll +++ b/llvm/test/Transforms/InstCombine/fcmp-fadd-select.ll @@ -19,7 +19,7 @@ define float @test_fcmp_ogt_fadd_select_constant(float %in) { define float @test_fcmp_ogt_fadd_select_constant_swapped(float %in) { ; CHECK-LABEL: define float @test_fcmp_ogt_fadd_select_constant_swapped( ; CHECK-SAME: float [[IN:%.*]]) { -; CHECK-NEXT: [[SEL_NEW:%.*]] = call nsz float @llvm.maxnum.f32(float [[IN]], float 0.000000e+00) +; CHECK-NEXT: [[SEL_NEW:%.*]] = call nsz float @llvm.minnum.f32(float [[IN]], float 0.000000e+00) ; CHECK-NEXT: [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00 ; CHECK-NEXT: ret float [[ADD_NEW]] ; @@ -87,7 +87,7 @@ define float @test_fcmp_olt_fadd_select_constant(float %in) { define float @test_fcmp_olt_fadd_select_constant_swapped(float %in) { ; CHECK-LABEL: define float @test_fcmp_olt_fadd_select_constant_swapped( ; CHECK-SAME: float [[IN:%.*]]) { -; CHECK-NEXT: [[SEL_NEW:%.*]] = call nsz float @llvm.minnum.f32(float [[IN]], float 0.000000e+00) +; CHECK-NEXT: [[SEL_NEW:%.*]] = call nsz float @llvm.maxnum.f32(float [[IN]], float 0.000000e+00) ; CHECK-NEXT: [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00 ; CHECK-NEXT: ret float [[ADD_NEW]] ; @@ -155,7 +155,7 @@ define float @test_fcmp_oge_fadd_select_constant(float %in) { define float @test_fcmp_oge_fadd_select_constant_swapped(float %in) { ; 
CHECK-LABEL: define float @test_fcmp_oge_fadd_select_constant_swapped( ; CHECK-SAME: float [[IN:%.*]]) { -; CHECK-NEXT: [[SEL_NEW:%.*]] = call nsz float @llvm.maxnum.f32(float [[IN]], float 0.000000e+00) +; CHECK-NEXT: [[SEL_NEW:%.*]] = call nsz float @llvm.minnum.f32(float [[IN]], float 0.000000e+00) ; CHECK-NEXT: [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00 ; CHECK-NEXT: ret float [[ADD_NEW]] ; @@ -223,7 +223,7 @@ define float @test_fcmp_ole_fadd_select_constant(float %in) { define float @test_fcmp_ole_fadd_select_constant_swapped(float %in) { ; CHECK-LABEL: define float @test_fcmp_ole_fadd_select_constant_swapped( ; CHECK-SAME: float [[IN:%.*]]) { -; CHECK-NEXT: [[SEL_NEW:%.*]] = call nsz float @llvm.minnum.f32(float [[IN]], float 0.000000e+00) +; CHECK-NEXT: [[SEL_NEW:%.*]] = call nsz float @llvm.maxnum.f32(float [[IN]], float 0.000000e+00) ; CHECK-NEXT: [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00 ; CHECK-NEXT: ret float [[ADD_NEW]] ; @@ -293,7 +293,7 @@ define float @test_fcmp_ugt_fadd_select_constant_swapped(float %in) { ; CHECK-LABEL: define float @test_fcmp_ugt_fadd_select_constant_swapped( ; CHECK-SAME: float [[IN:%.*]]) { ; CHECK-NEXT: [[CMP1_INV:%.*]] = fcmp ole float [[IN]], 0.000000e+00 -; CHECK-NEXT: [[SEL_NEW:%.*]] = select i1 [[CMP1_INV]], float 0.000000e+00, float [[IN]] +; CHECK-NEXT: [[SEL_NEW:%.*]] = select i1 [[CMP1_INV]], float [[IN]], float 0.000000e+00 ; CHECK-NEXT: [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00 ; CHECK-NEXT: ret float [[ADD_NEW]] ; @@ -366,7 +366,7 @@ define float @test_fcmp_uge_fadd_select_constant_swapped(float %in) { ; CHECK-LABEL: define float @test_fcmp_uge_fadd_select_constant_swapped( ; CHECK-SAME: float [[IN:%.*]]) { ; CHECK-NEXT: [[CMP1_INV:%.*]] = fcmp olt float [[IN]], 0.000000e+00 -; CHECK-NEXT: [[SEL_NEW:%.*]] = select i1 [[CMP1_INV]], float 0.000000e+00, float [[IN]] +; CHECK-NEXT: [[SEL_NEW:%.*]] = select i1 [[CMP1_INV]], float [[IN]], float 0.000000e+00 
; CHECK-NEXT: [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00 ; CHECK-NEXT: ret float [[ADD_NEW]] ; @@ -439,7 +439,7 @@ define float @test_fcmp_ult_fadd_select_constant_swapped(float %in) { ; CHECK-LABEL: define float @test_fcmp_ult_fadd_select_constant_swapped( ; CHECK-SAME: float [[IN:%.*]]) { ; CHECK-NEXT: [[CMP1_INV:%.*]] = fcmp oge float [[IN]], 0.000000e+00 -; CHECK-NEXT: [[SEL_NEW:%.*]] = select i1 [[CMP1_INV]], float 0.000000e+00, float [[IN]] +; CHECK-NEXT: [[SEL_NEW:%.*]] = select i1 [[CMP1_INV]], float [[IN]], float 0.000000e+00 ; CHECK-NEXT: [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00 ; CHECK-NEXT: ret float [[ADD_NEW]] ; @@ -512,7 +512,7 @@ define float @test_fcmp_ule_fadd_select_constant_swapped(float %in) { ; CHECK-LABEL: define float @test_fcmp_ule_fadd_select_constant_swapped( ; CHECK-SAME: float [[IN:%.*]]) { ; CHECK-NEXT: [[CMP1_INV:%.*]] = fcmp ogt float [[IN]], 0.000000e+00 -; CHECK-NEXT: [[SEL_NEW:%.*]] = select i1 [[CMP1_INV]], float 0.000000e+00, float [[IN]] +; CHECK-NEXT: [[SEL_NEW:%.*]] = select i1 [[CMP1_INV]], float [[IN]], float 0.000000e+00 ; CHECK-NEXT: [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00 ; CHECK-NEXT: ret float [[ADD_NEW]] ; From 1fa0302ba2efc5374b5d1cdc8715dfc0f1048c6c Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Thu, 2 Jan 2025 12:06:58 +0100 Subject: [PATCH 270/567] [LLD][COFF] Emit warnings for missing load config on EC targets (#121339) ARM64EC and ARM64X images require a load configuration to be valid. 
--- lld/COFF/SymbolTable.cpp | 9 +++++++++ lld/test/COFF/arm64x-loadconfig.s | 14 +++++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/lld/COFF/SymbolTable.cpp b/lld/COFF/SymbolTable.cpp index 6f25ad0620927..ae88675ab93a1 100644 --- a/lld/COFF/SymbolTable.cpp +++ b/lld/COFF/SymbolTable.cpp @@ -536,6 +536,15 @@ void SymbolTable::initializeLoadConfig() { auto sym = dyn_cast_or_null(findUnderscore("_load_config_used")); if (!sym) { + if (isEC()) { + Warn(ctx) << "EC version of '_load_config_used' is missing"; + return; + } + if (ctx.hybridSymtab) { + Warn(ctx) << "native version of '_load_config_used' is missing for " + "ARM64X target"; + return; + } if (ctx.config.guardCF != GuardCFLevel::Off) Warn(ctx) << "Control Flow Guard is enabled but '_load_config_used' is missing"; diff --git a/lld/test/COFF/arm64x-loadconfig.s b/lld/test/COFF/arm64x-loadconfig.s index 8d2ab55554634..d21f4bfe95b84 100644 --- a/lld/test/COFF/arm64x-loadconfig.s +++ b/lld/test/COFF/arm64x-loadconfig.s @@ -8,7 +8,19 @@ // RUN: llvm-mc -filetype=obj -triple=aarch64-windows loadconfig-short.s -o loadconfig-short.obj // RUN: llvm-mc -filetype=obj -triple=arm64ec-windows loadconfig-short.s -o loadconfig-short-arm64ec.obj -// RUN: lld-link -machine:arm64x -out:out.dll -dll -noentry loadconfig.obj test.obj +// RUN: lld-link -machine:arm64x -out:out-warn.dll -dll -noentry test.obj \ +// RUN: 2>&1 | FileCheck --check-prefixes=WARN-LOADCFG,WARN-EC-LOADCFG %s +// WARN-LOADCFG: lld-link: warning: native version of '_load_config_used' is missing for ARM64X target +// WARN-EC-LOADCFG: lld-link: warning: EC version of '_load_config_used' is missing + +// RUN: lld-link -machine:arm64x -out:out-nonative.dll -dll -noentry loadconfig-ec.obj chpe.obj \ +// RUN: 2>&1 | FileCheck --check-prefixes=WARN-LOADCFG --implicit-check-not EC %s + +// RUN: lld-link -machine:arm64ec -out:out-ec.dll -dll -noentry chpe.obj \ +// RUN: 2>&1 | FileCheck --check-prefixes=WARN-EC-LOADCFG --implicit-check-not 
native %s + +// RUN: lld-link -machine:arm64x -out:out.dll -dll -noentry loadconfig.obj test.obj \ +// RUN: 2>&1 | FileCheck --check-prefixes=WARN-EC-LOADCFG --implicit-check-not native %s // RUN: llvm-readobj --coff-load-config out.dll | FileCheck --check-prefix=DYNRELOCS %s // DYNRELOCS: DynamicValueRelocTableOffset: 0xC From 207e485f4b7e8113b8b329ddcde423aafc0a8832 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 2 Jan 2025 11:09:02 +0000 Subject: [PATCH 271/567] [VPlan] Track VectorPH during skeleton creation. (NFC) Split off from https://github.com/llvm/llvm-project/pull/108378. This ensures that the logic works even if now vector region exits. --- .../Transforms/Vectorize/LoopVectorize.cpp | 32 +++++++++++-------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 1daed0ebe08b9..52186882b4f20 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -479,7 +479,8 @@ class InnerLoopVectorizer { AC(AC), ORE(ORE), VF(VecWidth), MinProfitableTripCount(MinProfitableTripCount), UF(UnrollFactor), Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI), - PSI(PSI), RTChecks(RTChecks), Plan(Plan) { + PSI(PSI), RTChecks(RTChecks), Plan(Plan), + VectorPHVPB(Plan.getEntry()->getSingleSuccessor()) { // Query this against the original loop and save it here because the profile // of the original loop header may change as the transformation happens. OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( @@ -582,6 +583,11 @@ class InnerLoopVectorizer { virtual void printDebugTracesAtStart() {} virtual void printDebugTracesAtEnd() {} + /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the + /// vector preheader and its predecessor, also connecting the new block to the + /// scalar preheader. 
+ void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB); + /// The original loop. Loop *OrigLoop; @@ -676,6 +682,10 @@ class InnerLoopVectorizer { BasicBlock *AdditionalBypassBlock = nullptr; VPlan &Plan; + + /// The vector preheader block of \p Plan, used as target for check blocks + /// introduced during skeleton creation. + VPBlockBase *VectorPHVPB; }; /// Encapsulate information regarding vectorization of a loop and its epilogue. @@ -2443,19 +2453,15 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { return VectorTripCount; } -/// Introduces a new VPIRBasicBlock for \p CheckIRBB to \p Plan between the -/// vector preheader and its predecessor, also connecting the new block to the -/// scalar preheader. -static void introduceCheckBlockInVPlan(VPlan &Plan, BasicBlock *CheckIRBB) { +void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) { VPBlockBase *ScalarPH = Plan.getScalarPreheader(); - VPBlockBase *VectorPH = Plan.getVectorPreheader(); - VPBlockBase *PreVectorPH = VectorPH->getSinglePredecessor(); + VPBlockBase *PreVectorPH = VectorPHVPB->getSinglePredecessor(); if (PreVectorPH->getNumSuccessors() != 1) { assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors"); assert(PreVectorPH->getSuccessors()[0] == ScalarPH && "Unexpected successor"); VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB); - VPBlockUtils::insertOnEdge(PreVectorPH, VectorPH, CheckVPIRBB); + VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPB, CheckVPIRBB); PreVectorPH = CheckVPIRBB; } VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH); @@ -2544,7 +2550,7 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { LoopBypassBlocks.push_back(TCCheckBlock); // TODO: Wrap LoopVectorPreHeader in VPIRBasicBlock here. 
- introduceCheckBlockInVPlan(Plan, TCCheckBlock); + introduceCheckBlockInVPlan(TCCheckBlock); } BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { @@ -2562,7 +2568,7 @@ BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { LoopBypassBlocks.push_back(SCEVCheckBlock); AddedSafetyChecks = true; - introduceCheckBlockInVPlan(Plan, SCEVCheckBlock); + introduceCheckBlockInVPlan(SCEVCheckBlock); return SCEVCheckBlock; } @@ -2599,7 +2605,7 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) { AddedSafetyChecks = true; - introduceCheckBlockInVPlan(Plan, MemCheckBlock); + introduceCheckBlockInVPlan(MemCheckBlock); return MemCheckBlock; } @@ -7952,7 +7958,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false); ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); - introduceCheckBlockInVPlan(Plan, TCCheckBlock); + introduceCheckBlockInVPlan(TCCheckBlock); return TCCheckBlock; } @@ -8092,7 +8098,7 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( Plan.setEntry(NewEntry); // OldEntry is now dead and will be cleaned up when the plan gets destroyed. - introduceCheckBlockInVPlan(Plan, Insert); + introduceCheckBlockInVPlan(Insert); return Insert; } From 67c974bfd6b3dbba83865e8319a3e9d3274a44e9 Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Thu, 2 Jan 2025 12:26:34 +0000 Subject: [PATCH 272/567] [lldb][DWARFASTParserClang][NFC] Remove redundant parameter to ParseChildParameters (#121033) This was never set to anything other than `true`. 
--- .../SymbolFile/DWARF/DWARFASTParserClang.cpp | 16 +++++++--------- .../SymbolFile/DWARF/DWARFASTParserClang.h | 2 +- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp index 28e7cceb39715..e2f76e88dd6f0 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp @@ -1222,11 +1222,9 @@ DWARFASTParserClang::ParseSubroutine(const DWARFDIE &die, } if (die.HasChildren()) { - bool skip_artificial = true; - ParseChildParameters(containing_decl_ctx, die, skip_artificial, is_static, - is_variadic, has_template_params, - function_param_types, function_param_decls, - type_quals); + ParseChildParameters(containing_decl_ctx, die, is_static, is_variadic, + has_template_params, function_param_types, + function_param_decls, type_quals); } bool ignore_containing_context = false; @@ -2325,7 +2323,7 @@ DWARFASTParserClang::ConstructDemangledNameFromDWARF(const DWARFDIE &die) { clang::DeclContext *containing_decl_ctx = GetClangDeclContextContainingDIE(die, nullptr); - ParseChildParameters(containing_decl_ctx, die, true, is_static, is_variadic, + ParseChildParameters(containing_decl_ctx, die, is_static, is_variadic, has_template_params, param_types, param_decls, type_quals); sstr << "("; @@ -3069,8 +3067,8 @@ bool DWARFASTParserClang::ParseChildMembers( size_t DWARFASTParserClang::ParseChildParameters( clang::DeclContext *containing_decl_ctx, const DWARFDIE &parent_die, - bool skip_artificial, bool &is_static, bool &is_variadic, - bool &has_template_params, std::vector &function_param_types, + bool &is_static, bool &is_variadic, bool &has_template_params, + std::vector &function_param_types, std::vector &function_param_decls, unsigned &type_quals) { if (!parent_die) @@ -3125,7 +3123,7 @@ size_t DWARFASTParserClang::ParseChildParameters( } bool skip = false; - if 
(skip_artificial && is_artificial) { + if (is_artificial) { // In order to determine if a C++ member function is "const" we // have to look at the const-ness of "this"... if (arg_idx == 0 && diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h index 55f8e38d7486d..5b1c204bbe815 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h @@ -189,7 +189,7 @@ class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser { size_t ParseChildParameters(clang::DeclContext *containing_decl_ctx, const lldb_private::plugin::dwarf::DWARFDIE &parent_die, - bool skip_artificial, bool &is_static, bool &is_variadic, + bool &is_static, bool &is_variadic, bool &has_template_params, std::vector &function_args, std::vector &function_param_decls, From 02b30128e8e87795b9262035a48990648cbec586 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Thu, 2 Jan 2025 13:44:45 +0100 Subject: [PATCH 273/567] [clang][bytecode] Always return false for invalid bcp results (#121467) --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 3 ++- clang/test/AST/ByteCode/builtin-constant-p.cpp | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index b5849553d0bf5..731c9290993f1 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -1544,9 +1544,10 @@ static bool interp__builtin_constant_p(InterpState &S, CodePtr OpPC, if (Res.isInvalid()) { C.cleanup(); Stk.clear(); + return returnInt(false); } - if (!Res.isInvalid() && !Res.empty()) { + if (!Res.empty()) { const APValue &LV = Res.toAPValue(); if (LV.isLValue()) { APValue::LValueBase Base = LV.getLValueBase(); diff --git a/clang/test/AST/ByteCode/builtin-constant-p.cpp b/clang/test/AST/ByteCode/builtin-constant-p.cpp index 
0d222d1c96277..62899b60064c2 100644 --- a/clang/test/AST/ByteCode/builtin-constant-p.cpp +++ b/clang/test/AST/ByteCode/builtin-constant-p.cpp @@ -12,3 +12,9 @@ static_assert(__builtin_constant_p(I + 10.0), ""); static_assert(__builtin_constant_p(nullptr), ""); static_assert(__builtin_constant_p(&I), ""); // both-error {{failed due to requirement}} static_assert(__builtin_constant_p((void)I), ""); // both-error {{failed due to requirement}} + +extern int z; +constexpr int foo(int &a) { + return __builtin_constant_p(a); +} +static_assert(!foo(z)); From 8178e7218833bcbcf4263016252681c81fb167db Mon Sep 17 00:00:00 2001 From: Marius Brehler Date: Thu, 2 Jan 2025 14:02:07 +0100 Subject: [PATCH 274/567] [mlir][func] Fix return op example (#121470) Similiar to #121112. --- mlir/include/mlir/Dialect/Func/IR/FuncOps.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/include/mlir/Dialect/Func/IR/FuncOps.td b/mlir/include/mlir/Dialect/Func/IR/FuncOps.td index 237a825c19104..211201802b08c 100644 --- a/mlir/include/mlir/Dialect/Func/IR/FuncOps.td +++ b/mlir/include/mlir/Dialect/Func/IR/FuncOps.td @@ -352,7 +352,7 @@ def ReturnOp : Func_Op<"return", [Pure, HasParent<"FuncOp">, Example: ```mlir - func.func @foo() : (i32, f8) { + func.func @foo() -> (i32, f8) { ... 
return %0, %1 : i32, f8 } From 9cd774d1e49f792b7546e5309c7b27d653b37132 Mon Sep 17 00:00:00 2001 From: Phoebe Wang Date: Thu, 2 Jan 2025 21:02:19 +0800 Subject: [PATCH 275/567] [X86][NFC] Move "_Int" after "k"/"kz" (#121450) Address comment at https://github.com/llvm/llvm-project/pull/121373#discussion_r1900402932 --- .../X86/MCTargetDesc/X86ATTInstPrinter.cpp | 12 +- .../X86/MCTargetDesc/X86InstComments.cpp | 19 +- .../X86/MCTargetDesc/X86InstPrinterCommon.cpp | 12 +- .../X86/MCTargetDesc/X86IntelInstPrinter.cpp | 12 +- llvm/lib/Target/X86/X86InstrAVX10.td | 35 ++- llvm/lib/Target/X86/X86InstrAVX512.td | 246 ++++++++------- llvm/lib/Target/X86/X86InstrFMA3Info.cpp | 15 +- llvm/lib/Target/X86/X86InstrInfo.cpp | 294 +++++++++--------- llvm/lib/Target/X86/X86SchedSapphireRapids.td | 52 ++-- llvm/lib/Target/X86/X86ScheduleZnver4.td | 4 +- llvm/test/TableGen/x86-fold-tables.inc | 282 ++++++++--------- 11 files changed, 505 insertions(+), 478 deletions(-) diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp index b67c573e217ba..abe0cc6365dd4 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp @@ -140,8 +140,8 @@ bool X86ATTInstPrinter::printVecCompareInstr(const MCInst *MI, case X86::VCMPPSZ128rmik: case X86::VCMPPSZ128rrik: case X86::VCMPPSZ256rmik: case X86::VCMPPSZ256rrik: case X86::VCMPPSZrmik: case X86::VCMPPSZrrik: - case X86::VCMPSDZrmi_Intk: case X86::VCMPSDZrri_Intk: - case X86::VCMPSSZrmi_Intk: case X86::VCMPSSZrri_Intk: + case X86::VCMPSDZrmik_Int: case X86::VCMPSDZrrik_Int: + case X86::VCMPSSZrmik_Int: case X86::VCMPSSZrrik_Int: case X86::VCMPPDZ128rmbi: case X86::VCMPPDZ128rmbik: case X86::VCMPPDZ256rmbi: case X86::VCMPPDZ256rmbik: case X86::VCMPPDZrmbi: case X86::VCMPPDZrmbik: @@ -150,8 +150,8 @@ bool X86ATTInstPrinter::printVecCompareInstr(const MCInst *MI, case X86::VCMPPSZrmbi: case X86::VCMPPSZrmbik: case 
X86::VCMPPDZrrib: case X86::VCMPPDZrribk: case X86::VCMPPSZrrib: case X86::VCMPPSZrribk: - case X86::VCMPSDZrrib_Int: case X86::VCMPSDZrrib_Intk: - case X86::VCMPSSZrrib_Int: case X86::VCMPSSZrrib_Intk: + case X86::VCMPSDZrrib_Int: case X86::VCMPSDZrribk_Int: + case X86::VCMPSSZrrib_Int: case X86::VCMPSSZrribk_Int: case X86::VCMPPHZ128rmi: case X86::VCMPPHZ128rri: case X86::VCMPPHZ256rmi: case X86::VCMPPHZ256rri: case X86::VCMPPHZrmi: case X86::VCMPPHZrri: @@ -160,12 +160,12 @@ bool X86ATTInstPrinter::printVecCompareInstr(const MCInst *MI, case X86::VCMPPHZ128rmik: case X86::VCMPPHZ128rrik: case X86::VCMPPHZ256rmik: case X86::VCMPPHZ256rrik: case X86::VCMPPHZrmik: case X86::VCMPPHZrrik: - case X86::VCMPSHZrmi_Intk: case X86::VCMPSHZrri_Intk: + case X86::VCMPSHZrmik_Int: case X86::VCMPSHZrrik_Int: case X86::VCMPPHZ128rmbi: case X86::VCMPPHZ128rmbik: case X86::VCMPPHZ256rmbi: case X86::VCMPPHZ256rmbik: case X86::VCMPPHZrmbi: case X86::VCMPPHZrmbik: case X86::VCMPPHZrrib: case X86::VCMPPHZrribk: - case X86::VCMPSHZrrib_Int: case X86::VCMPSHZrrib_Intk: + case X86::VCMPSHZrrib_Int: case X86::VCMPSHZrribk_Int: case X86::VCMPPBF16Z128rmi: case X86::VCMPPBF16Z128rri: case X86::VCMPPBF16Z256rmi: case X86::VCMPPBF16Z256rri: case X86::VCMPPBF16Zrmi: case X86::VCMPPBF16Zrri: diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp index 9f8bc57fbc76d..681d0dab37d09 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp @@ -40,6 +40,17 @@ using namespace llvm; CASE_MASK_INS_COMMON(Inst, Suffix, src) \ CASE_MASKZ_INS_COMMON(Inst, Suffix, src) +#define CASE_MASK_INS_COMMON_INT(Inst, Suffix, src) \ + case X86::V##Inst##Suffix##src##k_Int: + +#define CASE_MASKZ_INS_COMMON_INT(Inst, Suffix, src) \ + case X86::V##Inst##Suffix##src##kz_Int: + +#define CASE_AVX512_INS_COMMON_INT(Inst, Suffix, src) \ + CASE_AVX_INS_COMMON(Inst, Suffix, src##_Int) \ + 
CASE_MASK_INS_COMMON_INT(Inst, Suffix, src) \ + CASE_MASKZ_INS_COMMON_INT(Inst, Suffix, src) + #define CASE_FPCLASS_PACKED(Inst, src) \ CASE_AVX_INS_COMMON(Inst, Z, src##i) \ CASE_AVX_INS_COMMON(Inst, Z256, src##i) \ @@ -196,8 +207,8 @@ using namespace llvm; CASE_AVX_INS_COMMON(Inst##SS, , r_Int) \ CASE_AVX_INS_COMMON(Inst##SD, Z, r) \ CASE_AVX_INS_COMMON(Inst##SS, Z, r) \ - CASE_AVX512_INS_COMMON(Inst##SD, Z, r_Int) \ - CASE_AVX512_INS_COMMON(Inst##SS, Z, r_Int) + CASE_AVX512_INS_COMMON_INT(Inst##SD, Z, r) \ + CASE_AVX512_INS_COMMON_INT(Inst##SS, Z, r) #define CASE_FMA_SCALAR_MEM(Inst) \ CASE_AVX_INS_COMMON(Inst##SD, , m) \ @@ -206,8 +217,8 @@ using namespace llvm; CASE_AVX_INS_COMMON(Inst##SS, , m_Int) \ CASE_AVX_INS_COMMON(Inst##SD, Z, m) \ CASE_AVX_INS_COMMON(Inst##SS, Z, m) \ - CASE_AVX512_INS_COMMON(Inst##SD, Z, m_Int) \ - CASE_AVX512_INS_COMMON(Inst##SS, Z, m_Int) + CASE_AVX512_INS_COMMON_INT(Inst##SD, Z, m) \ + CASE_AVX512_INS_COMMON_INT(Inst##SS, Z, m) #define CASE_FMA4(Inst, suf) \ CASE_AVX_INS_COMMON(Inst, 4, suf) \ diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp index fafcc737ff983..01e2d4ace9773 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp @@ -277,8 +277,8 @@ void X86InstPrinterCommon::printCMPMnemonic(const MCInst *MI, bool IsVCmp, case X86::VCMPSDrmi_Int: case X86::VCMPSDrri_Int: case X86::VCMPSDZrmi: case X86::VCMPSDZrri: case X86::VCMPSDZrmi_Int: case X86::VCMPSDZrri_Int: - case X86::VCMPSDZrmi_Intk: case X86::VCMPSDZrri_Intk: - case X86::VCMPSDZrrib_Int: case X86::VCMPSDZrrib_Intk: + case X86::VCMPSDZrmik_Int: case X86::VCMPSDZrrik_Int: + case X86::VCMPSDZrrib_Int: case X86::VCMPSDZrribk_Int: OS << "sd\t"; break; case X86::CMPSSrmi: case X86::CMPSSrri: @@ -287,8 +287,8 @@ void X86InstPrinterCommon::printCMPMnemonic(const MCInst *MI, bool IsVCmp, case X86::VCMPSSrmi_Int: case 
X86::VCMPSSrri_Int: case X86::VCMPSSZrmi: case X86::VCMPSSZrri: case X86::VCMPSSZrmi_Int: case X86::VCMPSSZrri_Int: - case X86::VCMPSSZrmi_Intk: case X86::VCMPSSZrri_Intk: - case X86::VCMPSSZrrib_Int: case X86::VCMPSSZrrib_Intk: + case X86::VCMPSSZrmik_Int: case X86::VCMPSSZrrik_Int: + case X86::VCMPSSZrrib_Int: case X86::VCMPSSZrribk_Int: OS << "ss\t"; break; case X86::VCMPPHZ128rmi: case X86::VCMPPHZ128rri: @@ -305,8 +305,8 @@ void X86InstPrinterCommon::printCMPMnemonic(const MCInst *MI, bool IsVCmp, break; case X86::VCMPSHZrmi: case X86::VCMPSHZrri: case X86::VCMPSHZrmi_Int: case X86::VCMPSHZrri_Int: - case X86::VCMPSHZrrib_Int: case X86::VCMPSHZrrib_Intk: - case X86::VCMPSHZrmi_Intk: case X86::VCMPSHZrri_Intk: + case X86::VCMPSHZrrib_Int: case X86::VCMPSHZrribk_Int: + case X86::VCMPSHZrmik_Int: case X86::VCMPSHZrrik_Int: OS << "sh\t"; break; case X86::VCMPPBF16Z128rmi: case X86::VCMPPBF16Z128rri: diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp index 680092679c903..c26dc2ca5a7a4 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp @@ -119,8 +119,8 @@ bool X86IntelInstPrinter::printVecCompareInstr(const MCInst *MI, raw_ostream &OS case X86::VCMPPSZ128rmik: case X86::VCMPPSZ128rrik: case X86::VCMPPSZ256rmik: case X86::VCMPPSZ256rrik: case X86::VCMPPSZrmik: case X86::VCMPPSZrrik: - case X86::VCMPSDZrmi_Intk: case X86::VCMPSDZrri_Intk: - case X86::VCMPSSZrmi_Intk: case X86::VCMPSSZrri_Intk: + case X86::VCMPSDZrmik_Int: case X86::VCMPSDZrrik_Int: + case X86::VCMPSSZrmik_Int: case X86::VCMPSSZrrik_Int: case X86::VCMPPDZ128rmbi: case X86::VCMPPDZ128rmbik: case X86::VCMPPDZ256rmbi: case X86::VCMPPDZ256rmbik: case X86::VCMPPDZrmbi: case X86::VCMPPDZrmbik: @@ -129,8 +129,8 @@ bool X86IntelInstPrinter::printVecCompareInstr(const MCInst *MI, raw_ostream &OS case X86::VCMPPSZrmbi: case X86::VCMPPSZrmbik: case 
X86::VCMPPDZrrib: case X86::VCMPPDZrribk: case X86::VCMPPSZrrib: case X86::VCMPPSZrribk: - case X86::VCMPSDZrrib_Int: case X86::VCMPSDZrrib_Intk: - case X86::VCMPSSZrrib_Int: case X86::VCMPSSZrrib_Intk: + case X86::VCMPSDZrrib_Int: case X86::VCMPSDZrribk_Int: + case X86::VCMPSSZrrib_Int: case X86::VCMPSSZrribk_Int: case X86::VCMPPHZ128rmi: case X86::VCMPPHZ128rri: case X86::VCMPPHZ256rmi: case X86::VCMPPHZ256rri: case X86::VCMPPHZrmi: case X86::VCMPPHZrri: @@ -139,12 +139,12 @@ bool X86IntelInstPrinter::printVecCompareInstr(const MCInst *MI, raw_ostream &OS case X86::VCMPPHZ128rmik: case X86::VCMPPHZ128rrik: case X86::VCMPPHZ256rmik: case X86::VCMPPHZ256rrik: case X86::VCMPPHZrmik: case X86::VCMPPHZrrik: - case X86::VCMPSHZrmi_Intk: case X86::VCMPSHZrri_Intk: + case X86::VCMPSHZrmik_Int: case X86::VCMPSHZrrik_Int: case X86::VCMPPHZ128rmbi: case X86::VCMPPHZ128rmbik: case X86::VCMPPHZ256rmbi: case X86::VCMPPHZ256rmbik: case X86::VCMPPHZrmbi: case X86::VCMPPHZrmbik: case X86::VCMPPHZrrib: case X86::VCMPPHZrribk: - case X86::VCMPSHZrrib_Int: case X86::VCMPSHZrrib_Intk: + case X86::VCMPSHZrrib_Int: case X86::VCMPSHZrribk_Int: case X86::VCMPPBF16Z128rmi: case X86::VCMPPBF16Z128rri: case X86::VCMPPBF16Z256rmi: case X86::VCMPPBF16Z256rri: case X86::VCMPPBF16Zrmi: case X86::VCMPPBF16Zrri: diff --git a/llvm/lib/Target/X86/X86InstrAVX10.td b/llvm/lib/Target/X86/X86InstrAVX10.td index 3bc64eda01a9c..cda6998778bc4 100644 --- a/llvm/lib/Target/X86/X86InstrAVX10.td +++ b/llvm/lib/Target/X86/X86InstrAVX10.td @@ -417,27 +417,30 @@ multiclass avx10_minmax_scalar, Sched<[WriteFMAX.Folded, WriteFMAX.ReadAfterFold]>; } - defm rri_Int : AVX512_maskable<0x53, MRMSrcReg, _, (outs VR128X:$dst), - (ins VR128X:$src1, VR128X:$src2, i32u8imm:$src3), - OpStr, "$src3, $src2, $src1", "$src1, $src2, $src3", - (_.VT (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 timm:$src3)))>, + defm rri : AVX512_maskable<0x53, MRMSrcReg, _, (outs VR128X:$dst), + (ins VR128X:$src1, VR128X:$src2, 
i32u8imm:$src3), + OpStr, "$src3, $src2, $src1", "$src1, $src2, $src3", + (_.VT (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 timm:$src3))), + 0, 0, 0, vselect_mask, "", "_Int">, Sched<[WriteFMAX]>; - defm rmi_Int : AVX512_maskable<0x53, MRMSrcMem, _, (outs VR128X:$dst), - (ins VR128X:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3), - OpStr, "$src3, $src2, $src1", "$src1, $src2, $src3", - (_.VT (OpNode (_.VT _.RC:$src1), (_.ScalarIntMemFrags addr:$src2), - (i32 timm:$src3)))>, + defm rmi : AVX512_maskable<0x53, MRMSrcMem, _, (outs VR128X:$dst), + (ins VR128X:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3), + OpStr, "$src3, $src2, $src1", "$src1, $src2, $src3", + (_.VT (OpNode (_.VT _.RC:$src1), (_.ScalarIntMemFrags addr:$src2), + (i32 timm:$src3))), + 0, 0, 0, vselect_mask, "", "_Int">, Sched<[WriteFMAX.Folded, WriteFMAX.ReadAfterFold]>; } let Uses = [], mayRaiseFPException = 0 in - defm rrib_Int : AVX512_maskable<0x53, MRMSrcReg, _, (outs VR128X:$dst), - (ins VR128X:$src1, VR128X:$src2, i32u8imm:$src3), - OpStr, "$src3, {sae}, $src2, $src1", - "$src1, $src2, {sae}, $src3", - (_.VT (OpNodeSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 timm:$src3)))>, + defm rrib : AVX512_maskable<0x53, MRMSrcReg, _, (outs VR128X:$dst), + (ins VR128X:$src1, VR128X:$src2, i32u8imm:$src3), + OpStr, "$src3, {sae}, $src2, $src1", + "$src1, $src2, {sae}, $src3", + (_.VT (OpNodeSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 timm:$src3))), + 0, 0, 0, vselect_mask, "", "_Int">, Sched<[WriteFMAX]>, EVEX_B; } } diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index e899807cd1b7c..d6ca4b142afe0 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -28,19 +28,20 @@ multiclass AVX512_maskable_custom O, Format F, bit IsCommutable = 0, bit IsKCommutable = 0, bit IsKZCommutable = IsCommutable, - string ClobberConstraint = ""> { + string ClobberConstraint = "", + string Suffix = ""> { let isCommutable = 
IsCommutable, Constraints = ClobberConstraint in - def NAME: AVX512; + def Suffix: AVX512; // Prefer over VMOV*rrk Pat<> let isCommutable = IsKCommutable in - def NAME#k: AVX512, + def k#Suffix: AVX512, EVEX_K { // In case of the 3src subclass this is overridden with a let. string Constraints = !if(!eq(ClobberConstraint, ""), MaskingConstraint, @@ -52,10 +53,10 @@ multiclass AVX512_maskable_custom O, Format F, // So, it is Ok to use IsCommutable instead of IsKCommutable. let isCommutable = IsKZCommutable, // Prefer over VMOV*rrkz Pat<> Constraints = ClobberConstraint in - def NAME#kz: AVX512, + def kz#Suffix: AVX512, EVEX_KZ; } @@ -72,7 +73,8 @@ multiclass AVX512_maskable_common O, Format F, X86VectorVTInfo _, bit IsCommutable = 0, bit IsKCommutable = 0, bit IsKZCommutable = IsCommutable, - string ClobberConstraint = ""> : + string ClobberConstraint = "", + string Suffix = ""> : AVX512_maskable_custom O, Format F, X86VectorVTInfo _, [(set _.RC:$dst, (Select _.KRCWM:$mask, RHS, _.ImmAllZerosV))], MaskingConstraint, IsCommutable, - IsKCommutable, IsKZCommutable, ClobberConstraint>; + IsKCommutable, IsKZCommutable, ClobberConstraint, + Suffix>; // This multiclass generates the unconditional/non-masking, the masking and // the zero-masking variant of the vector instruction. In the masking case, the @@ -115,23 +118,24 @@ multiclass AVX512_maskable O, Format F, X86VectorVTInfo _, bit IsCommutable = 0, bit IsKCommutable = 0, bit IsKZCommutable = IsCommutable, SDPatternOperator Select = vselect_mask, - string ClobberConstraint = ""> : + string ClobberConstraint = "", + string Suffix = ""> : AVX512_maskable_common; + IsKZCommutable, ClobberConstraint, Suffix>; // This multiclass generates the unconditional/non-masking, the masking and // the zero-masking variant of the scalar instruction. 
multiclass AVX512_maskable_scalar O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, - dag RHS> : + dag RHS, string Suffix = ""> : AVX512_maskable; + RHS, 0, 0, 0, X86selects_mask, "", Suffix>; // Similar to AVX512_maskable but in this case one of the source operands // ($src1) is already tied to $dst so we just use that for the preserved @@ -144,7 +148,7 @@ multiclass AVX512_maskable_3src O, Format F, X86VectorVTInfo _, bit IsCommutable = 0, bit IsKCommutable = 0, SDPatternOperator Select = vselect_mask, - bit MaskOnly = 0> : + bit MaskOnly = 0, string Suffix = ""> : AVX512_maskable_common O, Format F, X86VectorVTInfo _, OpcodeStr, AttSrcAsm, IntelSrcAsm, !if(MaskOnly, (null_frag), RHS), (Select _.KRCWM:$mask, RHS, _.RC:$src1), - Select, "", IsCommutable, IsKCommutable>; + Select, "", IsCommutable, IsKCommutable, + IsCommutable, "", Suffix>; // Similar to AVX512_maskable_3src but in this case the input VT for the tied // operand differs from the output VT. 
This requires a bitconvert on @@ -178,10 +183,10 @@ multiclass AVX512_maskable_3src_scalar O, Format F, X86VectorVTInfo _, dag RHS, bit IsCommutable = 0, bit IsKCommutable = 0, - bit MaskOnly = 0> : + bit MaskOnly = 0, string Suffix = ""> : AVX512_maskable_3src; + X86selects_mask, MaskOnly, Suffix>; multiclass AVX512_maskable_in_asm O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, @@ -215,17 +220,18 @@ multiclass AVX512_maskable_custom_cmp O, Format F, string AttSrcAsm, string IntelSrcAsm, list Pattern, list MaskingPattern, - bit IsCommutable = 0> { + bit IsCommutable = 0, + string Suffix = ""> { let isCommutable = IsCommutable in { - def NAME: AVX512; - def NAME#k: AVX512, EVEX_K; + def k#Suffix: AVX512, EVEX_K; } } @@ -235,20 +241,22 @@ multiclass AVX512_maskable_common_cmp O, Format F, X86VectorVTInfo _, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, dag RHS, dag MaskingRHS, - bit IsCommutable = 0> : + bit IsCommutable = 0, + string Suffix = ""> : AVX512_maskable_custom_cmp; + [(set _.KRC:$dst, MaskingRHS)], IsCommutable, Suffix>; multiclass AVX512_maskable_cmp O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, - dag RHS, dag RHS_su, bit IsCommutable = 0> : + dag RHS, dag RHS_su, bit IsCommutable = 0, + string Suffix = ""> : AVX512_maskable_common_cmp; + (and _.KRCWM:$mask, RHS_su), IsCommutable, Suffix>; // Used by conversion instructions. 
multiclass AVX512_maskable_cvt O, Format F, X86VectorVTInfo _, @@ -1937,37 +1945,37 @@ defm VPBLENDMW : blendmask_bw<0x66, "vpblendmw", SchedWriteVarBlend, multiclass avx512_cmp_scalar { - defm rri_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, - (outs _.KRC:$dst), - (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), - "vcmp"#_.Suffix, - "$cc, $src2, $src1", "$src1, $src2, $cc", - (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc), - (OpNode_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc)>, - EVEX, VVVV, VEX_LIG, Sched<[sched]>, SIMD_EXC; + defm rri : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, $src2, $src1", "$src1, $src2, $cc", + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc), + (OpNode_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc), 0, "_Int">, + EVEX, VVVV, VEX_LIG, Sched<[sched]>, SIMD_EXC; let mayLoad = 1 in - defm rmi_Int : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, - (outs _.KRC:$dst), - (ins _.RC:$src1, _.IntScalarMemOp:$src2, u8imm:$cc), - "vcmp"#_.Suffix, - "$cc, $src2, $src1", "$src1, $src2, $cc", - (OpNode (_.VT _.RC:$src1), (_.ScalarIntMemFrags addr:$src2), - timm:$cc), - (OpNode_su (_.VT _.RC:$src1), (_.ScalarIntMemFrags addr:$src2), - timm:$cc)>, EVEX, VVVV, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>, - Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; + defm rmi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.IntScalarMemOp:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, $src2, $src1", "$src1, $src2, $cc", + (OpNode (_.VT _.RC:$src1), (_.ScalarIntMemFrags addr:$src2), + timm:$cc), + (OpNode_su (_.VT _.RC:$src1), (_.ScalarIntMemFrags addr:$src2), + timm:$cc), 0, "_Int">, EVEX, VVVV, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>, + Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; let Uses = [MXCSR] in - defm rrib_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, - (outs _.KRC:$dst), - (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), - 
"vcmp"#_.Suffix, - "$cc, {sae}, $src2, $src1","$src1, $src2, {sae}, $cc", - (OpNodeSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2), - timm:$cc), - (OpNodeSAE_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), - timm:$cc)>, - EVEX, VVVV, VEX_LIG, EVEX_B, Sched<[sched]>; + defm rrib : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, {sae}, $src2, $src1","$src1, $src2, {sae}, $cc", + (OpNodeSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2), + timm:$cc), + (OpNodeSAE_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), + timm:$cc), 0, "_Int">, + EVEX, VVVV, VEX_LIG, EVEX_B, Sched<[sched]>; let isCodeGenOnly = 1 in { let isCommutable = 1 in @@ -5354,17 +5362,17 @@ multiclass avx512_fp_scalar opc, string OpcodeStr,X86VectorVTInfo _, SDPatternOperator OpNode, SDNode VecNode, X86FoldableSchedWrite sched, bit IsCommutable> { let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in { - defm rr_Int : AVX512_maskable_scalar, + (_.VT (VecNode _.RC:$src1, _.RC:$src2)), "_Int">, Sched<[sched]>; - defm rm_Int : AVX512_maskable_scalar, + (_.ScalarIntMemFrags addr:$src2))), "_Int">, Sched<[sched.Folded, sched.ReadAfterFold]>; let isCodeGenOnly = 1, Predicates = [HasAVX512] in { def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst), @@ -5387,28 +5395,28 @@ multiclass avx512_fp_scalar opc, string OpcodeStr,X86VectorVTInfo _, multiclass avx512_fp_scalar_round opc, string OpcodeStr,X86VectorVTInfo _, SDNode VecNode, X86FoldableSchedWrite sched> { let ExeDomain = _.ExeDomain, Uses = [MXCSR] in - defm rrb_Int : AVX512_maskable_scalar, + (i32 timm:$rc)), "_Int">, EVEX_B, EVEX_RC, Sched<[sched]>; } multiclass avx512_fp_scalar_sae opc, string OpcodeStr,X86VectorVTInfo _, SDPatternOperator OpNode, SDNode VecNode, SDNode SaeNode, X86FoldableSchedWrite sched, bit IsCommutable> { let ExeDomain = _.ExeDomain in { - defm rr_Int : AVX512_maskable_scalar, + (_.VT (VecNode _.RC:$src1, _.RC:$src2)), "_Int">, Sched<[sched]>, SIMD_EXC; - defm rm_Int : 
AVX512_maskable_scalar, + (_.ScalarIntMemFrags addr:$src2))), "_Int">, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; let isCodeGenOnly = 1, Predicates = [HasAVX512], @@ -5429,10 +5437,10 @@ multiclass avx512_fp_scalar_sae opc, string OpcodeStr,X86VectorVTInfo _, } let Uses = [MXCSR] in - defm rrb_Int : AVX512_maskable_scalar, + (SaeNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)), "_Int">, EVEX_B, Sched<[sched]>; } } @@ -6835,22 +6843,22 @@ defm VFNMSUB132 : avx512_fma3p_132_f<0x9E, "vfnmsub132", X86any_Fnmsub, multiclass avx512_fma3s_common opc, string OpcodeStr, X86VectorVTInfo _, dag RHS_r, dag RHS_m, dag RHS_b, bit MaskOnlyReg> { let Constraints = "$src1 = $dst", hasSideEffects = 0 in { - defm r_Int: AVX512_maskable_3src_scalar, + "$src3, $src2", "$src2, $src3", (null_frag), 1, 1, 0, "_Int">, EVEX, VVVV, Sched<[SchedWriteFMA.Scl]>, SIMD_EXC; let mayLoad = 1 in - defm m_Int: AVX512_maskable_3src_scalar, + "$src3, $src2", "$src2, $src3", (null_frag), 1, 1, 0, "_Int">, EVEX, VVVV, Sched<[SchedWriteFMA.Scl.Folded, SchedWriteFMA.Scl.ReadAfterFold, SchedWriteFMA.Scl.ReadAfterFold]>, SIMD_EXC; let Uses = [MXCSR] in - defm rb_Int: AVX512_maskable_3src_scalar, + OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", (null_frag), 1, 1, 0, "_Int">, EVEX, VVVV, EVEX_B, EVEX_RC, Sched<[SchedWriteFMA.Scl]>; let isCodeGenOnly = 1, isCommutable = 1 in { @@ -6982,7 +6990,7 @@ multiclass avx512_scalar_fma_patterns(Prefix#"213"#Suffix#"Zr_Intk") + (!cast(Prefix#"213"#Suffix#"Zrk_Int") VR128X:$src1, VK1WM:$mask, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>; @@ -6993,7 +7001,7 @@ multiclass avx512_scalar_fma_patterns(Prefix#"213"#Suffix#"Zm_Intk") + (!cast(Prefix#"213"#Suffix#"Zmk_Int") VR128X:$src1, VK1WM:$mask, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>; @@ -7002,7 +7010,7 @@ multiclass avx512_scalar_fma_patterns(Prefix#"132"#Suffix#"Zm_Intk") + (!cast(Prefix#"132"#Suffix#"Zmk_Int") VR128X:$src1, VK1WM:$mask, 
(_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>; @@ -7011,7 +7019,7 @@ multiclass avx512_scalar_fma_patterns(Prefix#"231"#Suffix#"Zr_Intk") + (!cast(Prefix#"231"#Suffix#"Zrk_Int") VR128X:$src1, VK1WM:$mask, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>; @@ -7021,7 +7029,7 @@ multiclass avx512_scalar_fma_patterns(Prefix#"231"#Suffix#"Zm_Intk") + (!cast(Prefix#"231"#Suffix#"Zmk_Int") VR128X:$src1, VK1WM:$mask, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>; @@ -7031,7 +7039,7 @@ multiclass avx512_scalar_fma_patterns(Prefix#"213"#Suffix#"Zr_Intkz") + (!cast(Prefix#"213"#Suffix#"Zrkz_Int") VR128X:$src1, VK1WM:$mask, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>; @@ -7041,7 +7049,7 @@ multiclass avx512_scalar_fma_patterns(Prefix#"231"#Suffix#"Zr_Intkz") + (!cast(Prefix#"231"#Suffix#"Zrkz_Int") VR128X:$src1, VK1WM:$mask, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>; @@ -7052,7 +7060,7 @@ multiclass avx512_scalar_fma_patterns(Prefix#"213"#Suffix#"Zm_Intkz") + (!cast(Prefix#"213"#Suffix#"Zmkz_Int") VR128X:$src1, VK1WM:$mask, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>; @@ -7061,7 +7069,7 @@ multiclass avx512_scalar_fma_patterns(Prefix#"132"#Suffix#"Zm_Intkz") + (!cast(Prefix#"132"#Suffix#"Zmkz_Int") VR128X:$src1, VK1WM:$mask, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>; @@ -7070,7 +7078,7 @@ multiclass avx512_scalar_fma_patterns(Prefix#"231"#Suffix#"Zm_Intkz") + (!cast(Prefix#"231"#Suffix#"Zmkz_Int") VR128X:$src1, VK1WM:$mask, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>; @@ -7097,7 +7105,7 @@ multiclass avx512_scalar_fma_patterns(Prefix#"213"#Suffix#"Zrb_Intk") + (!cast(Prefix#"213"#Suffix#"Zrbk_Int") VR128X:$src1, VK1WM:$mask, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>; @@ -7108,7 +7116,7 
@@ multiclass avx512_scalar_fma_patterns(Prefix#"231"#Suffix#"Zrb_Intk") + (!cast(Prefix#"231"#Suffix#"Zrbk_Int") VR128X:$src1, VK1WM:$mask, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>; @@ -7119,7 +7127,7 @@ multiclass avx512_scalar_fma_patterns(Prefix#"213"#Suffix#"Zrb_Intkz") + (!cast(Prefix#"213"#Suffix#"Zrbkz_Int") VR128X:$src1, VK1WM:$mask, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>; @@ -7130,7 +7138,7 @@ multiclass avx512_scalar_fma_patterns(Prefix#"231"#Suffix#"Zrb_Intkz") + (!cast(Prefix#"231"#Suffix#"Zrbkz_Int") VR128X:$src1, VK1WM:$mask, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>; @@ -7628,17 +7636,17 @@ let Uses = [MXCSR], mayRaiseFPException = 1 in multiclass avx512_cvt_fp_scalar opc, string OpcodeStr, X86VectorVTInfo _, X86VectorVTInfo _Src, SDNode OpNode, X86FoldableSchedWrite sched> { - defm rr_Int : AVX512_maskable_scalar, + (_Src.VT _Src.RC:$src2))), "_Int">, EVEX, VVVV, VEX_LIG, Sched<[sched]>; - defm rm_Int : AVX512_maskable_scalar, + (_Src.ScalarIntMemFrags addr:$src2))), "_Int">, EVEX, VVVV, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>; @@ -7660,11 +7668,11 @@ multiclass avx512_cvt_fp_sae_scalar opc, string OpcodeStr, X86VectorVTIn X86VectorVTInfo _Src, SDNode OpNodeSAE, X86FoldableSchedWrite sched> { let Uses = [MXCSR] in - defm rrb_Int : AVX512_maskable_scalar, + (_Src.VT _Src.RC:$src2))), "_Int">, EVEX, VVVV, VEX_LIG, EVEX_B, Sched<[sched]>; } @@ -7673,11 +7681,11 @@ multiclass avx512_cvt_fp_rc_scalar opc, string OpcodeStr, X86VectorVTInf X86VectorVTInfo _Src, SDNode OpNodeRnd, X86FoldableSchedWrite sched> { let Uses = [MXCSR] in - defm rrb_Int : AVX512_maskable_scalar, + (_Src.VT _Src.RC:$src2), (i32 timm:$rc))), "_Int">, EVEX, VVVV, VEX_LIG, Sched<[sched]>, EVEX_B, EVEX_RC; } @@ -9531,25 +9539,25 @@ multiclass avx512_sqrt_packed_all_round 
opc, string OpcodeStr, multiclass avx512_sqrt_scalar opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _, string Name, Predicate prd = HasAVX512> { let ExeDomain = _.ExeDomain, Predicates = [prd] in { - defm r_Int : AVX512_maskable_scalar, + (_.VT _.RC:$src2)), "_Int">, Sched<[sched]>, SIMD_EXC; - defm m_Int : AVX512_maskable_scalar, + (_.ScalarIntMemFrags addr:$src2)), "_Int">, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; let Uses = [MXCSR] in - defm rb_Int : AVX512_maskable_scalar, + (i32 timm:$rc)), "_Int">, EVEX_B, EVEX_RC, Sched<[sched]>; let isCodeGenOnly = 1, hasSideEffects = 0 in { @@ -9596,27 +9604,27 @@ defm VSQRT : avx512_sqrt_scalar_all<0x51, "vsqrt", SchedWriteFSqrtSizes>, VEX_LI multiclass avx512_rndscale_scalar opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _> { let ExeDomain = _.ExeDomain in { - defm rri_Int : AVX512_maskable_scalar, + (i32 timm:$src3))), "_Int">, Sched<[sched]>, SIMD_EXC; let Uses = [MXCSR] in - defm rrib_Int : AVX512_maskable_scalar, EVEX_B, + (i32 timm:$src3))), "_Int">, EVEX_B, Sched<[sched]>; - defm rmi_Int : AVX512_maskable_scalar, + (_.ScalarIntMemFrags addr:$src2), (i32 timm:$src3))), "_Int">, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [HasAVX512] in { @@ -9669,13 +9677,13 @@ multiclass avx512_masked_scalar("V"#OpcPrefix#r_Intk) + (!cast("V"#OpcPrefix#rk_Int) _.VT:$dst, OutMask, _.VT:$src2, _.VT:$src1)>; def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects_mask Mask, (OpNode (extractelt _.VT:$src2, (iPTR 0))), ZeroFP))), - (!cast("V"#OpcPrefix#r_Intkz) + (!cast("V"#OpcPrefix#rkz_Int) OutMask, _.VT:$src2, _.VT:$src1)>; } } @@ -12174,7 +12182,7 @@ multiclass AVX512_scalar_math_fp_patterns("V"#OpcPrefix#"Zrr_Intk") + (!cast("V"#OpcPrefix#"Zrrk_Int") (_.VT (COPY_TO_REGCLASS _.FRC:$src0, VR128X)), VK1WM:$mask, _.VT:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)))>; @@ -12185,7 +12193,7 @@ multiclass 
AVX512_scalar_math_fp_patterns("V"#OpcPrefix#"Zrm_Intk") + (!cast("V"#OpcPrefix#"Zrmk_Int") (_.VT (COPY_TO_REGCLASS _.FRC:$src0, VR128X)), VK1WM:$mask, _.VT:$src1, addr:$src2)>; @@ -12196,7 +12204,7 @@ multiclass AVX512_scalar_math_fp_patterns("V"#OpcPrefix#"Zrr_Intkz") + (!cast("V"#OpcPrefix#"Zrrkz_Int") VK1WM:$mask, _.VT:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)))>; def : Pat<(MoveNode (_.VT VR128X:$src1), @@ -12205,7 +12213,7 @@ multiclass AVX512_scalar_math_fp_patterns("V"#OpcPrefix#"Zrm_Intkz") VK1WM:$mask, _.VT:$src1, addr:$src2)>; + (!cast("V"#OpcPrefix#"Zrmkz_Int") VK1WM:$mask, _.VT:$src1, addr:$src2)>; } } diff --git a/llvm/lib/Target/X86/X86InstrFMA3Info.cpp b/llvm/lib/Target/X86/X86InstrFMA3Info.cpp index 090ec687d28c4..0da4857d66748 100644 --- a/llvm/lib/Target/X86/X86InstrFMA3Info.cpp +++ b/llvm/lib/Target/X86/X86InstrFMA3Info.cpp @@ -27,6 +27,11 @@ using namespace llvm; FMA3GROUP(Name, Suf##k, Attrs | X86InstrFMA3Group::KMergeMasked) \ FMA3GROUP(Name, Suf##kz, Attrs | X86InstrFMA3Group::KZeroMasked) +#define FMA3GROUP_MASKED_INT(Name, Suf, Attrs) \ + FMA3GROUP(Name, Suf##_Int, Attrs) \ + FMA3GROUP(Name, Suf##k_Int, Attrs | X86InstrFMA3Group::KMergeMasked) \ + FMA3GROUP(Name, Suf##kz_Int, Attrs | X86InstrFMA3Group::KZeroMasked) + #define FMA3GROUP_PACKED_WIDTHS_Z(Name, Suf, Attrs) \ FMA3GROUP_MASKED(Name, Suf##Z128m, Attrs) \ FMA3GROUP_MASKED(Name, Suf##Z128r, Attrs) \ @@ -52,9 +57,9 @@ using namespace llvm; #define FMA3GROUP_SCALAR_WIDTHS_Z(Name, Suf, Attrs) \ FMA3GROUP(Name, Suf##Zm, Attrs) \ - FMA3GROUP_MASKED(Name, Suf##Zm_Int, Attrs | X86InstrFMA3Group::Intrinsic) \ + FMA3GROUP_MASKED_INT(Name, Suf##Zm, Attrs | X86InstrFMA3Group::Intrinsic) \ FMA3GROUP(Name, Suf##Zr, Attrs) \ - FMA3GROUP_MASKED(Name, Suf##Zr_Int, Attrs | X86InstrFMA3Group::Intrinsic) \ + FMA3GROUP_MASKED_INT(Name, Suf##Zr, Attrs | X86InstrFMA3Group::Intrinsic) \ #define FMA3GROUP_SCALAR_WIDTHS_ALL(Name, Suf, Attrs) \ FMA3GROUP_SCALAR_WIDTHS_Z(Name, Suf, Attrs) \ @@ 
-108,11 +113,11 @@ static const X86InstrFMA3Group Groups[] = { #define FMA3GROUP_SCALAR_AVX512_ROUND(Name, Suf, Attrs) \ FMA3GROUP(Name, SDZ##Suf, Attrs) \ - FMA3GROUP_MASKED(Name, SDZ##Suf##_Int, Attrs) \ + FMA3GROUP_MASKED_INT(Name, SDZ##Suf, Attrs) \ FMA3GROUP(Name, SHZ##Suf, Attrs) \ - FMA3GROUP_MASKED(Name, SHZ##Suf##_Int, Attrs) \ + FMA3GROUP_MASKED_INT(Name, SHZ##Suf, Attrs) \ FMA3GROUP(Name, SSZ##Suf, Attrs) \ - FMA3GROUP_MASKED(Name, SSZ##Suf##_Int, Attrs) + FMA3GROUP_MASKED_INT(Name, SSZ##Suf, Attrs) static const X86InstrFMA3Group BroadcastGroups[] = { FMA3GROUP_PACKED_AVX512_ALL(VFMADD, mb, 0) diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 5a6ea1182ccb8..30a5161bbcc50 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -7646,8 +7646,8 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, case X86::CVTSS2SDrr_Int: case X86::VCVTSS2SDrr_Int: case X86::VCVTSS2SDZrr_Int: - case X86::VCVTSS2SDZrr_Intk: - case X86::VCVTSS2SDZrr_Intkz: + case X86::VCVTSS2SDZrrk_Int: + case X86::VCVTSS2SDZrrkz_Int: case X86::CVTSS2SIrr_Int: case X86::CVTSS2SI64rr_Int: case X86::VCVTSS2SIrr_Int: @@ -7700,21 +7700,21 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int: case X86::VSUBSSZrr_Int: - case X86::VADDSSZrr_Intk: - case X86::VADDSSZrr_Intkz: - case X86::VCMPSSZrri_Intk: - case X86::VDIVSSZrr_Intk: - case X86::VDIVSSZrr_Intkz: - case X86::VMAXSSZrr_Intk: - case X86::VMAXSSZrr_Intkz: - case X86::VMINSSZrr_Intk: - case X86::VMINSSZrr_Intkz: - case X86::VMULSSZrr_Intk: - case X86::VMULSSZrr_Intkz: - case X86::VSQRTSSZr_Intk: - case X86::VSQRTSSZr_Intkz: - case X86::VSUBSSZrr_Intk: - case X86::VSUBSSZrr_Intkz: + case X86::VADDSSZrrk_Int: + case X86::VADDSSZrrkz_Int: + case X86::VCMPSSZrrik_Int: + case X86::VDIVSSZrrk_Int: + case X86::VDIVSSZrrkz_Int: + case X86::VMAXSSZrrk_Int: + case 
X86::VMAXSSZrrkz_Int: + case X86::VMINSSZrrk_Int: + case X86::VMINSSZrrkz_Int: + case X86::VMULSSZrrk_Int: + case X86::VMULSSZrrkz_Int: + case X86::VSQRTSSZrk_Int: + case X86::VSQRTSSZrkz_Int: + case X86::VSUBSSZrrk_Int: + case X86::VSUBSSZrrkz_Int: case X86::VFMADDSS4rr_Int: case X86::VFNMADDSS4rr_Int: case X86::VFMSUBSS4rr_Int: @@ -7743,30 +7743,30 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, case X86::VFNMSUB213SSZr_Int: case X86::VFMSUB231SSZr_Int: case X86::VFNMSUB231SSZr_Int: - case X86::VFMADD132SSZr_Intk: - case X86::VFNMADD132SSZr_Intk: - case X86::VFMADD213SSZr_Intk: - case X86::VFNMADD213SSZr_Intk: - case X86::VFMADD231SSZr_Intk: - case X86::VFNMADD231SSZr_Intk: - case X86::VFMSUB132SSZr_Intk: - case X86::VFNMSUB132SSZr_Intk: - case X86::VFMSUB213SSZr_Intk: - case X86::VFNMSUB213SSZr_Intk: - case X86::VFMSUB231SSZr_Intk: - case X86::VFNMSUB231SSZr_Intk: - case X86::VFMADD132SSZr_Intkz: - case X86::VFNMADD132SSZr_Intkz: - case X86::VFMADD213SSZr_Intkz: - case X86::VFNMADD213SSZr_Intkz: - case X86::VFMADD231SSZr_Intkz: - case X86::VFNMADD231SSZr_Intkz: - case X86::VFMSUB132SSZr_Intkz: - case X86::VFNMSUB132SSZr_Intkz: - case X86::VFMSUB213SSZr_Intkz: - case X86::VFNMSUB213SSZr_Intkz: - case X86::VFMSUB231SSZr_Intkz: - case X86::VFNMSUB231SSZr_Intkz: + case X86::VFMADD132SSZrk_Int: + case X86::VFNMADD132SSZrk_Int: + case X86::VFMADD213SSZrk_Int: + case X86::VFNMADD213SSZrk_Int: + case X86::VFMADD231SSZrk_Int: + case X86::VFNMADD231SSZrk_Int: + case X86::VFMSUB132SSZrk_Int: + case X86::VFNMSUB132SSZrk_Int: + case X86::VFMSUB213SSZrk_Int: + case X86::VFNMSUB213SSZrk_Int: + case X86::VFMSUB231SSZrk_Int: + case X86::VFNMSUB231SSZrk_Int: + case X86::VFMADD132SSZrkz_Int: + case X86::VFNMADD132SSZrkz_Int: + case X86::VFMADD213SSZrkz_Int: + case X86::VFNMADD213SSZrkz_Int: + case X86::VFMADD231SSZrkz_Int: + case X86::VFNMADD231SSZrkz_Int: + case X86::VFMSUB132SSZrkz_Int: + case X86::VFNMSUB132SSZrkz_Int: + case 
X86::VFMSUB213SSZrkz_Int: + case X86::VFNMSUB213SSZrkz_Int: + case X86::VFMSUB231SSZrkz_Int: + case X86::VFNMSUB231SSZrkz_Int: case X86::VFIXUPIMMSSZrri: case X86::VFIXUPIMMSSZrrik: case X86::VFIXUPIMMSSZrrikz: @@ -7791,8 +7791,8 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, case X86::VREDUCESSZrrik: case X86::VREDUCESSZrrikz: case X86::VRNDSCALESSZrri_Int: - case X86::VRNDSCALESSZrri_Intk: - case X86::VRNDSCALESSZrri_Intkz: + case X86::VRNDSCALESSZrrik_Int: + case X86::VRNDSCALESSZrrikz_Int: case X86::VRSQRT14SSZrr: case X86::VRSQRT14SSZrrk: case X86::VRSQRT14SSZrrkz: @@ -7819,8 +7819,8 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, case X86::CVTSD2SSrr_Int: case X86::VCVTSD2SSrr_Int: case X86::VCVTSD2SSZrr_Int: - case X86::VCVTSD2SSZrr_Intk: - case X86::VCVTSD2SSZrr_Intkz: + case X86::VCVTSD2SSZrrk_Int: + case X86::VCVTSD2SSZrrkz_Int: case X86::CVTSD2SIrr_Int: case X86::CVTSD2SI64rr_Int: case X86::VCVTSD2SIrr_Int: @@ -7869,21 +7869,21 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int: case X86::VSUBSDZrr_Int: - case X86::VADDSDZrr_Intk: - case X86::VADDSDZrr_Intkz: - case X86::VCMPSDZrri_Intk: - case X86::VDIVSDZrr_Intk: - case X86::VDIVSDZrr_Intkz: - case X86::VMAXSDZrr_Intk: - case X86::VMAXSDZrr_Intkz: - case X86::VMINSDZrr_Intk: - case X86::VMINSDZrr_Intkz: - case X86::VMULSDZrr_Intk: - case X86::VMULSDZrr_Intkz: - case X86::VSQRTSDZr_Intk: - case X86::VSQRTSDZr_Intkz: - case X86::VSUBSDZrr_Intk: - case X86::VSUBSDZrr_Intkz: + case X86::VADDSDZrrk_Int: + case X86::VADDSDZrrkz_Int: + case X86::VCMPSDZrrik_Int: + case X86::VDIVSDZrrk_Int: + case X86::VDIVSDZrrkz_Int: + case X86::VMAXSDZrrk_Int: + case X86::VMAXSDZrrkz_Int: + case X86::VMINSDZrrk_Int: + case X86::VMINSDZrrkz_Int: + case X86::VMULSDZrrk_Int: + case X86::VMULSDZrrkz_Int: + case X86::VSQRTSDZrk_Int: + case X86::VSQRTSDZrkz_Int: + case X86::VSUBSDZrrk_Int: + case 
X86::VSUBSDZrrkz_Int: case X86::VFMADDSD4rr_Int: case X86::VFNMADDSD4rr_Int: case X86::VFMSUBSD4rr_Int: @@ -7912,30 +7912,30 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, case X86::VFNMSUB213SDZr_Int: case X86::VFMSUB231SDZr_Int: case X86::VFNMSUB231SDZr_Int: - case X86::VFMADD132SDZr_Intk: - case X86::VFNMADD132SDZr_Intk: - case X86::VFMADD213SDZr_Intk: - case X86::VFNMADD213SDZr_Intk: - case X86::VFMADD231SDZr_Intk: - case X86::VFNMADD231SDZr_Intk: - case X86::VFMSUB132SDZr_Intk: - case X86::VFNMSUB132SDZr_Intk: - case X86::VFMSUB213SDZr_Intk: - case X86::VFNMSUB213SDZr_Intk: - case X86::VFMSUB231SDZr_Intk: - case X86::VFNMSUB231SDZr_Intk: - case X86::VFMADD132SDZr_Intkz: - case X86::VFNMADD132SDZr_Intkz: - case X86::VFMADD213SDZr_Intkz: - case X86::VFNMADD213SDZr_Intkz: - case X86::VFMADD231SDZr_Intkz: - case X86::VFNMADD231SDZr_Intkz: - case X86::VFMSUB132SDZr_Intkz: - case X86::VFNMSUB132SDZr_Intkz: - case X86::VFMSUB213SDZr_Intkz: - case X86::VFNMSUB213SDZr_Intkz: - case X86::VFMSUB231SDZr_Intkz: - case X86::VFNMSUB231SDZr_Intkz: + case X86::VFMADD132SDZrk_Int: + case X86::VFNMADD132SDZrk_Int: + case X86::VFMADD213SDZrk_Int: + case X86::VFNMADD213SDZrk_Int: + case X86::VFMADD231SDZrk_Int: + case X86::VFNMADD231SDZrk_Int: + case X86::VFMSUB132SDZrk_Int: + case X86::VFNMSUB132SDZrk_Int: + case X86::VFMSUB213SDZrk_Int: + case X86::VFNMSUB213SDZrk_Int: + case X86::VFMSUB231SDZrk_Int: + case X86::VFNMSUB231SDZrk_Int: + case X86::VFMADD132SDZrkz_Int: + case X86::VFNMADD132SDZrkz_Int: + case X86::VFMADD213SDZrkz_Int: + case X86::VFNMADD213SDZrkz_Int: + case X86::VFMADD231SDZrkz_Int: + case X86::VFNMADD231SDZrkz_Int: + case X86::VFMSUB132SDZrkz_Int: + case X86::VFNMSUB132SDZrkz_Int: + case X86::VFMSUB213SDZrkz_Int: + case X86::VFNMSUB213SDZrkz_Int: + case X86::VFMSUB231SDZrkz_Int: + case X86::VFNMSUB231SDZrkz_Int: case X86::VFIXUPIMMSDZrri: case X86::VFIXUPIMMSDZrrik: case X86::VFIXUPIMMSDZrrikz: @@ -7960,8 +7960,8 @@ static bool 
isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, case X86::VREDUCESDZrrik: case X86::VREDUCESDZrrikz: case X86::VRNDSCALESDZrri_Int: - case X86::VRNDSCALESDZrri_Intk: - case X86::VRNDSCALESDZrri_Intkz: + case X86::VRNDSCALESDZrrik_Int: + case X86::VRNDSCALESDZrrikz_Int: case X86::VRSQRT14SDZrr: case X86::VRSQRT14SDZrrk: case X86::VRSQRT14SDZrrkz: @@ -7989,19 +7989,19 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, case X86::VMINSHZrr_Int: case X86::VMULSHZrr_Int: case X86::VSUBSHZrr_Int: - case X86::VADDSHZrr_Intk: - case X86::VADDSHZrr_Intkz: - case X86::VCMPSHZrri_Intk: - case X86::VDIVSHZrr_Intk: - case X86::VDIVSHZrr_Intkz: - case X86::VMAXSHZrr_Intk: - case X86::VMAXSHZrr_Intkz: - case X86::VMINSHZrr_Intk: - case X86::VMINSHZrr_Intkz: - case X86::VMULSHZrr_Intk: - case X86::VMULSHZrr_Intkz: - case X86::VSUBSHZrr_Intk: - case X86::VSUBSHZrr_Intkz: + case X86::VADDSHZrrk_Int: + case X86::VADDSHZrrkz_Int: + case X86::VCMPSHZrrik_Int: + case X86::VDIVSHZrrk_Int: + case X86::VDIVSHZrrkz_Int: + case X86::VMAXSHZrrk_Int: + case X86::VMAXSHZrrkz_Int: + case X86::VMINSHZrrk_Int: + case X86::VMINSHZrrkz_Int: + case X86::VMULSHZrrk_Int: + case X86::VMULSHZrrkz_Int: + case X86::VSUBSHZrrk_Int: + case X86::VSUBSHZrrkz_Int: case X86::VFMADD132SHZr_Int: case X86::VFNMADD132SHZr_Int: case X86::VFMADD213SHZr_Int: @@ -8014,30 +8014,30 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, case X86::VFNMSUB213SHZr_Int: case X86::VFMSUB231SHZr_Int: case X86::VFNMSUB231SHZr_Int: - case X86::VFMADD132SHZr_Intk: - case X86::VFNMADD132SHZr_Intk: - case X86::VFMADD213SHZr_Intk: - case X86::VFNMADD213SHZr_Intk: - case X86::VFMADD231SHZr_Intk: - case X86::VFNMADD231SHZr_Intk: - case X86::VFMSUB132SHZr_Intk: - case X86::VFNMSUB132SHZr_Intk: - case X86::VFMSUB213SHZr_Intk: - case X86::VFNMSUB213SHZr_Intk: - case X86::VFMSUB231SHZr_Intk: - case X86::VFNMSUB231SHZr_Intk: - case X86::VFMADD132SHZr_Intkz: - case 
X86::VFNMADD132SHZr_Intkz: - case X86::VFMADD213SHZr_Intkz: - case X86::VFNMADD213SHZr_Intkz: - case X86::VFMADD231SHZr_Intkz: - case X86::VFNMADD231SHZr_Intkz: - case X86::VFMSUB132SHZr_Intkz: - case X86::VFNMSUB132SHZr_Intkz: - case X86::VFMSUB213SHZr_Intkz: - case X86::VFNMSUB213SHZr_Intkz: - case X86::VFMSUB231SHZr_Intkz: - case X86::VFNMSUB231SHZr_Intkz: + case X86::VFMADD132SHZrk_Int: + case X86::VFNMADD132SHZrk_Int: + case X86::VFMADD213SHZrk_Int: + case X86::VFNMADD213SHZrk_Int: + case X86::VFMADD231SHZrk_Int: + case X86::VFNMADD231SHZrk_Int: + case X86::VFMSUB132SHZrk_Int: + case X86::VFNMSUB132SHZrk_Int: + case X86::VFMSUB213SHZrk_Int: + case X86::VFNMSUB213SHZrk_Int: + case X86::VFMSUB231SHZrk_Int: + case X86::VFNMSUB231SHZrk_Int: + case X86::VFMADD132SHZrkz_Int: + case X86::VFNMADD132SHZrkz_Int: + case X86::VFMADD213SHZrkz_Int: + case X86::VFNMADD213SHZrkz_Int: + case X86::VFMADD231SHZrkz_Int: + case X86::VFNMADD231SHZrkz_Int: + case X86::VFMSUB132SHZrkz_Int: + case X86::VFNMSUB132SHZrkz_Int: + case X86::VFMSUB213SHZrkz_Int: + case X86::VFNMSUB213SHZrkz_Int: + case X86::VFMSUB231SHZrkz_Int: + case X86::VFNMSUB231SHZrkz_Int: return false; default: return true; @@ -9489,25 +9489,25 @@ bool X86InstrInfo::isHighLatencyDef(int opc) const { case X86::VDIVSDZrm: case X86::VDIVSDZrr: case X86::VDIVSDZrm_Int: - case X86::VDIVSDZrm_Intk: - case X86::VDIVSDZrm_Intkz: + case X86::VDIVSDZrmk_Int: + case X86::VDIVSDZrmkz_Int: case X86::VDIVSDZrr_Int: - case X86::VDIVSDZrr_Intk: - case X86::VDIVSDZrr_Intkz: + case X86::VDIVSDZrrk_Int: + case X86::VDIVSDZrrkz_Int: case X86::VDIVSDZrrb_Int: - case X86::VDIVSDZrrb_Intk: - case X86::VDIVSDZrrb_Intkz: + case X86::VDIVSDZrrbk_Int: + case X86::VDIVSDZrrbkz_Int: case X86::VDIVSSZrm: case X86::VDIVSSZrr: case X86::VDIVSSZrm_Int: - case X86::VDIVSSZrm_Intk: - case X86::VDIVSSZrm_Intkz: + case X86::VDIVSSZrmk_Int: + case X86::VDIVSSZrmkz_Int: case X86::VDIVSSZrr_Int: - case X86::VDIVSSZrr_Intk: - case X86::VDIVSSZrr_Intkz: + 
case X86::VDIVSSZrrk_Int: + case X86::VDIVSSZrrkz_Int: case X86::VDIVSSZrrb_Int: - case X86::VDIVSSZrrb_Intk: - case X86::VDIVSSZrrb_Intkz: + case X86::VDIVSSZrrbk_Int: + case X86::VDIVSSZrrbkz_Int: case X86::VSQRTPDZ128m: case X86::VSQRTPDZ128mb: case X86::VSQRTPDZ128mbk: @@ -9570,26 +9570,26 @@ bool X86InstrInfo::isHighLatencyDef(int opc) const { case X86::VSQRTPSZrkz: case X86::VSQRTSDZm: case X86::VSQRTSDZm_Int: - case X86::VSQRTSDZm_Intk: - case X86::VSQRTSDZm_Intkz: + case X86::VSQRTSDZmk_Int: + case X86::VSQRTSDZmkz_Int: case X86::VSQRTSDZr: case X86::VSQRTSDZr_Int: - case X86::VSQRTSDZr_Intk: - case X86::VSQRTSDZr_Intkz: + case X86::VSQRTSDZrk_Int: + case X86::VSQRTSDZrkz_Int: case X86::VSQRTSDZrb_Int: - case X86::VSQRTSDZrb_Intk: - case X86::VSQRTSDZrb_Intkz: + case X86::VSQRTSDZrbk_Int: + case X86::VSQRTSDZrbkz_Int: case X86::VSQRTSSZm: case X86::VSQRTSSZm_Int: - case X86::VSQRTSSZm_Intk: - case X86::VSQRTSSZm_Intkz: + case X86::VSQRTSSZmk_Int: + case X86::VSQRTSSZmkz_Int: case X86::VSQRTSSZr: case X86::VSQRTSSZr_Int: - case X86::VSQRTSSZr_Intk: - case X86::VSQRTSSZr_Intkz: + case X86::VSQRTSSZrk_Int: + case X86::VSQRTSSZrkz_Int: case X86::VSQRTSSZrb_Int: - case X86::VSQRTSSZrb_Intk: - case X86::VSQRTSSZrb_Intkz: + case X86::VSQRTSSZrbk_Int: + case X86::VSQRTSSZrbkz_Int: case X86::VGATHERDPDYrm: case X86::VGATHERDPDZ128rm: diff --git a/llvm/lib/Target/X86/X86SchedSapphireRapids.td b/llvm/lib/Target/X86/X86SchedSapphireRapids.td index e04ff68d278b2..4f0d3669a311d 100644 --- a/llvm/lib/Target/X86/X86SchedSapphireRapids.td +++ b/llvm/lib/Target/X86/X86SchedSapphireRapids.td @@ -669,7 +669,7 @@ def : InstRW<[SPRWriteResGroup12], (instregex "^ADD_F(P?)rST0$", "^VALIGN(D|Q)Z256rri((k|kz)?)$", "^VCMPP(D|H|S)Z(128|256)rri(k?)$", "^VCMPS(D|H|S)Zrri$", - "^VCMPS(D|H|S)Zrr(b?)i_Int(k?)$", + "^VCMPS(D|H|S)Zrr(b?)i(k?)_Int$", "^VFPCLASSP(D|H|S)Z(128|256)ri(k?)$", "^VFPCLASSS(D|H|S)Zri(k?)$", "^VPACK(S|U)S(DW|WB)Yrr$", @@ -977,7 +977,7 @@ def SPRWriteResGroup49 : 
SchedWriteRes<[SPRPort00, SPRPort02_03_10]> { let NumMicroOps = 2; } def : InstRW<[SPRWriteResGroup49], (instregex "^DIV_F(32|64)m$")>; -def : InstRW<[SPRWriteResGroup49, ReadAfterVecLd], (instregex "^VSQRTSHZm_Int((k|kz)?)$")>; +def : InstRW<[SPRWriteResGroup49, ReadAfterVecLd], (instregex "^VSQRTSHZm((k|kz)?)_Int$")>; def : InstRW<[SPRWriteResGroup49, ReadAfterVecLd], (instrs VSQRTSHZm)>; def SPRWriteResGroup50 : SchedWriteRes<[SPRPort00, SPRPort02_03_10, SPRPort05]> { @@ -1166,11 +1166,11 @@ def : InstRW<[SPRWriteResGroup73, ReadAfterVecXLd], (instregex "^(V?)GF2P8AFFINE def : InstRW<[SPRWriteResGroup73, ReadAfterVecXLd], (instrs VGETEXPPHZ128mbkz, VGF2P8MULBZ128rm)>; def : InstRW<[SPRWriteResGroup73, ReadAfterVecLd], (instregex "^V(ADD|SUB)SHZrm$", - "^V(ADD|SUB)SHZrm_Int((k|kz)?)$", + "^V(ADD|SUB)SHZrm((k|kz)?)_Int$", "^VCVTSH2SSZrm((_Int)?)$", "^VM(AX|IN)CSHZrm$", "^VM(AX|IN|UL)SHZrm$", - "^VM(AX|IN|UL)SHZrm_Int((k|kz)?)$")>; + "^VM(AX|IN|UL)SHZrm((k|kz)?)_Int$")>; def : InstRW<[SPRWriteResGroup73, ReadAfterVecYLd], (instregex "^VGF2P8AFFINE((INV)?)QBYrmi$", "^VGF2P8AFFINE((INV)?)QBZ256rm(b?)i$", "^VGF2P8MULB(Y|Z256)rm$")>; @@ -1181,7 +1181,7 @@ def : InstRW<[SPRWriteResGroup73, ReadAfterVecXLd, ReadAfterVecXLd], (instregex "^VFMSUBADD(132|213|231)PHZ128m((b|k|bk|kz)?)$", "^VFMSUBADD(132|213|231)PHZ128mbkz$")>; def : InstRW<[SPRWriteResGroup73, ReadAfterVecLd, ReadAfterVecLd], (instregex "^VF(N?)M(ADD|SUB)(132|213|231)SHZm$", - "^VF(N?)M(ADD|SUB)(132|213|231)SHZm_Int((k|kz)?)$")>; + "^VF(N?)M(ADD|SUB)(132|213|231)SHZm((k|kz)?)_Int$")>; def : InstRW<[SPRWriteResGroup73, ReadAfterVecYLd, ReadAfterVecYLd], (instregex "^VPMADD52(H|L)UQZ256m((b|k|bk|kz)?)$", "^VPMADD52(H|L)UQZ256mbkz$")>; @@ -2301,7 +2301,7 @@ def : InstRW<[SPRWriteResGroup218, ReadAfterVecXLd], (instregex "^(V?)ROUNDS(D|S "^VRNDSCALEP(D|S)Z128rmbik(z?)$", "^VRNDSCALEP(D|S)Z128rmi((kz)?)$", "^VRNDSCALES(D|S)Zrmi$", - "^VRNDSCALES(D|S)Zrmi_Int((k|kz)?)$")>; + "^VRNDSCALES(D|S)Zrmi((k|kz)?)_Int$")>; 
def SPRWriteResGroup219 : SchedWriteRes<[SPRPort00_01]> { let ReleaseAtCycles = [2]; @@ -2313,7 +2313,7 @@ def : InstRW<[SPRWriteResGroup219], (instregex "^(V?)ROUND(PD|SS)ri$", "^(V?)ROUNDS(D|S)ri_Int$", "^VRNDSCALEP(D|S)Z(128|256)rri((k|kz)?)$", "^VRNDSCALES(D|S)Zrri$", - "^VRNDSCALES(D|S)Zrri(b?)_Int((k|kz)?)$", + "^VRNDSCALES(D|S)Zrri(b?)((k|kz)?)_Int$", "^VROUNDP(D|S)Yri$")>; def SPRWriteResGroup220 : SchedWriteRes<[SPRPort00_06]> { @@ -2530,7 +2530,7 @@ def SPRWriteResGroup249 : SchedWriteRes<[SPRPort01_05]> { let Latency = 4; } def : InstRW<[SPRWriteResGroup249], (instregex "^V(ADD|SUB)P(D|S)Z(128|256)rrkz$", - "^V(ADD|SUB)S(D|S)Zrr(b?)_Intkz$")>; + "^V(ADD|SUB)S(D|S)Zrr(b?)kz_Int$")>; def SPRWriteResGroup250 : SchedWriteRes<[SPRPort00_05]> { let Latency = 3; @@ -2545,11 +2545,11 @@ def SPRWriteResGroup251 : SchedWriteRes<[SPRPort00_01]> { let Latency = 6; } def : InstRW<[SPRWriteResGroup251], (instregex "^V(ADD|SUB)PHZ(128|256)rrk(z?)$", - "^V(ADD|SUB)SHZrr(b?)_Intk(z?)$", + "^V(ADD|SUB)SHZrr(b?)k(z?)_Int$", "^VCVT(T?)PH2(U?)WZ(128|256)rrk(z?)$", "^VCVT(U?)W2PHZ(128|256)rrk(z?)$", "^VF(N?)M(ADD|SUB)(132|213|231)PHZ(128|256)rk(z?)$", - "^VF(N?)M(ADD|SUB)(132|213|231)SHZr(b?)_Intk(z?)$", + "^VF(N?)M(ADD|SUB)(132|213|231)SHZr(b?)k(z?)_Int$", "^VFMADDSUB(132|213|231)PHZ(128|256)rk(z?)$", "^VFMSUBADD(132|213|231)PHZ(128|256)rk(z?)$", "^VGETEXPPHZ(128|256)rk(z?)$", @@ -2560,7 +2560,7 @@ def : InstRW<[SPRWriteResGroup251], (instregex "^V(ADD|SUB)PHZ(128|256)rrk(z?)$" "^VGETMANTSHZrri(k|bkz)$", "^VM(AX|IN)CPHZ(128|256)rrk(z?)$", "^VM(AX|IN|UL)PHZ(128|256)rrk(z?)$", - "^VM(AX|IN|UL)SHZrr(b?)_Intk(z?)$")>; + "^VM(AX|IN|UL)SHZrr(b?)k(z?)_Int$")>; def SPRWriteResGroup252 : SchedWriteRes<[SPRPort00]> { let Latency = 5; @@ -2745,7 +2745,7 @@ def : InstRW<[SPRWriteResGroup263, ReadAfterVecYLd], (instregex "^VCMPP(D|H|S)Z( "^VPTEST(N?)M(B|D|Q|W)Z((256)?)rm(k?)$", "^VPTEST(N?)M(D|Q)Z((256)?)rmb(k?)$")>; def : InstRW<[SPRWriteResGroup263, ReadAfterVecLd], (instregex 
"^VCMPS(D|H|S)Zrmi$", - "^VCMPS(D|H|S)Zrmi_Int(k?)$", + "^VCMPS(D|H|S)Zrmi(k?)_Int$", "^VFPCLASSS(D|H|S)Zmik$")>; def SPRWriteResGroup264 : SchedWriteRes<[SPRPort00, SPRPort02_03_10]> { @@ -3171,7 +3171,7 @@ def : InstRW<[SPRWriteResGroup314], (instregex "^VCVT(T?)PD2(U?)QQZ(128|256)rr(( "^VPLZCNT(D|Q)Z(128|256)rr((k|kz)?)$", "^VPMADD52(H|L)UQZ(128|256)r((k|kz)?)$", "^VSCALEFS(D|S)Zrr((k|kz)?)$", - "^VSCALEFS(D|S)Zrrb_Int((k|kz)?)$")>; + "^VSCALEFS(D|S)Zrrb((k|kz)?)_Int$")>; def : InstRW<[SPRWriteResGroup314, ReadAfterVecLd], (instregex "^VFIXUPIMMS(D|S)Zrrib((k|kz)?)$")>; def SPRWriteResGroup315 : SchedWriteRes<[SPRPort00_01, SPRPort02_03_10, SPRPort05]> { @@ -3300,7 +3300,7 @@ def SPRWriteResGroup331 : SchedWriteRes<[SPRPort00_01, SPRPort02_03_10]> { let NumMicroOps = 2; } def : InstRW<[SPRWriteResGroup331], (instregex "^VCVTPH2PSZ(128|256)rmk(z?)$")>; -def : InstRW<[SPRWriteResGroup331, ReadAfterVecLd], (instregex "^VCVTSH2SSZrm_Intk(z?)$")>; +def : InstRW<[SPRWriteResGroup331, ReadAfterVecLd], (instregex "^VCVTSH2SSZrmk(z?)_Int$")>; def : InstRW<[SPRWriteResGroup331, ReadAfterVecXLd], (instregex "^VPMADDUBSWZ128rmk(z?)$", "^VPMULH((U|RS)?)WZ128rmk(z?)$", "^VPMULLWZ128rmk(z?)$")>; @@ -3460,7 +3460,7 @@ def SPRWriteResGroup353 : SchedWriteRes<[SPRPort00_01, SPRPort00_01_05, SPRPort0 let Latency = 21; let NumMicroOps = 7; } -def : InstRW<[SPRWriteResGroup353, ReadAfterVecLd], (instregex "^VCVTSD2SHZrm_Intk(z?)$")>; +def : InstRW<[SPRWriteResGroup353, ReadAfterVecLd], (instregex "^VCVTSD2SHZrmk(z?)_Int$")>; def SPRWriteResGroup354 : SchedWriteRes<[SPRPort00_01, SPRPort00_01_05, SPRPort05]> { let ReleaseAtCycles = [2, 1, 1]; @@ -3475,7 +3475,7 @@ def SPRWriteResGroup355 : SchedWriteRes<[SPRPort00_01, SPRPort00_01_05, SPRPort0 let Latency = 14; let NumMicroOps = 4; } -def : InstRW<[SPRWriteResGroup355], (instregex "^VCVTSD2SHZrr(b?)_Intk(z?)$")>; +def : InstRW<[SPRWriteResGroup355], (instregex "^VCVTSD2SHZrr(b?)k(z?)_Int$")>; def SPRWriteResGroup356 : 
SchedWriteRes<[SPRPort00_01, SPRPort02_03_10, SPRPort05]> { let ReleaseAtCycles = [2, 1, 1]; @@ -3489,7 +3489,7 @@ def SPRWriteResGroup357 : SchedWriteRes<[SPRPort00_01, SPRPort02_03_10, SPRPort0 let Latency = 20; let NumMicroOps = 4; } -def : InstRW<[SPRWriteResGroup357, ReadAfterVecLd], (instregex "^VCVTSH2SDZrm_Intk(z?)$")>; +def : InstRW<[SPRWriteResGroup357, ReadAfterVecLd], (instregex "^VCVTSH2SDZrmk(z?)_Int$")>; def SPRWriteResGroup358 : SchedWriteRes<[SPRPort00_01, SPRPort05]> { let ReleaseAtCycles = [2, 1]; @@ -3504,7 +3504,7 @@ def SPRWriteResGroup359 : SchedWriteRes<[SPRPort00_01, SPRPort05]> { let Latency = 13; let NumMicroOps = 3; } -def : InstRW<[SPRWriteResGroup359], (instregex "^VCVTSH2SDZrr(b?)_Intk(z?)$")>; +def : InstRW<[SPRWriteResGroup359], (instregex "^VCVTSH2SDZrr(b?)k(z?)_Int$")>; def SPRWriteResGroup360 : SchedWriteRes<[SPRPort00, SPRPort00_01, SPRPort02_03_10]> { let Latency = 13; @@ -3523,7 +3523,7 @@ def : InstRW<[SPRWriteResGroup361], (instregex "^VCVT(T?)SH2(U?)SI((64)?)Zrr(b?) 
def SPRWriteResGroup362 : SchedWriteRes<[SPRPort00_01]> { let Latency = 8; } -def : InstRW<[SPRWriteResGroup362], (instregex "^VCVTSH2SSZrr(b?)_Intk(z?)$")>; +def : InstRW<[SPRWriteResGroup362], (instregex "^VCVTSH2SSZrr(b?)k(z?)_Int$")>; def SPRWriteResGroup363 : SchedWriteRes<[SPRPort00_01, SPRPort00_01_05, SPRPort02_03_10]> { let Latency = 14; @@ -3536,7 +3536,7 @@ def SPRWriteResGroup364 : SchedWriteRes<[SPRPort00_01, SPRPort00_01_05, SPRPort0 let Latency = 16; let NumMicroOps = 3; } -def : InstRW<[SPRWriteResGroup364, ReadAfterVecLd], (instregex "^VCVTSS2SHZrm_Intk(z?)$")>; +def : InstRW<[SPRWriteResGroup364, ReadAfterVecLd], (instregex "^VCVTSS2SHZrmk(z?)_Int$")>; def SPRWriteResGroup365 : SchedWriteRes<[SPRPort00_01, SPRPort00_01_05]> { let Latency = 6; @@ -3549,7 +3549,7 @@ def SPRWriteResGroup366 : SchedWriteRes<[SPRPort00_01, SPRPort00_01_05]> { let Latency = 9; let NumMicroOps = 2; } -def : InstRW<[SPRWriteResGroup366], (instregex "^VCVTSS2SHZrr(b?)_Intk(z?)$")>; +def : InstRW<[SPRWriteResGroup366], (instregex "^VCVTSS2SHZrr(b?)k(z?)_Int$")>; def SPRWriteResGroup367 : SchedWriteRes<[SPRPort05]> { let Latency = 5; @@ -3667,7 +3667,7 @@ def SPRWriteResGroup380 : SchedWriteRes<[SPRPort00, SPRPort02_03_10]> { let Latency = 21; let NumMicroOps = 2; } -def : InstRW<[SPRWriteResGroup380, ReadAfterVecLd], (instregex "^VDIVSHZrm_Int((k|kz)?)$")>; +def : InstRW<[SPRWriteResGroup380, ReadAfterVecLd], (instregex "^VDIVSHZrm((k|kz)?)_Int$")>; def : InstRW<[SPRWriteResGroup380, ReadAfterVecLd], (instrs VDIVSHZrm)>; def SPRWriteResGroup381 : SchedWriteRes<[SPRPort00]> { @@ -4884,7 +4884,7 @@ def SPRWriteResGroup534 : SchedWriteRes<[SPRPort00_01, SPRPort02_03_10]> { let NumMicroOps = 3; } def : InstRW<[SPRWriteResGroup534, ReadAfterVecXLd], (instregex "^VRNDSCALEPHZ128rm(b?)ik(z?)$", - "^VRNDSCALESHZrmi_Intk(z?)$", + "^VRNDSCALESHZrmik(z?)_Int$", "^VSCALEFPHZ128rm(bk|kz)$", "^VSCALEFPHZ128rm(k|bkz)$")>; def : InstRW<[SPRWriteResGroup534, ReadAfterVecYLd], (instregex 
"^VRNDSCALEPHZ256rm(b?)ik(z?)$", @@ -4898,9 +4898,9 @@ def SPRWriteResGroup535 : SchedWriteRes<[SPRPort00_01]> { let NumMicroOps = 2; } def : InstRW<[SPRWriteResGroup535], (instregex "^VRNDSCALEPHZ(128|256)rrik(z?)$", - "^VRNDSCALESHZrri(b?)_Intk(z?)$", + "^VRNDSCALESHZrri(b?)k(z?)_Int$", "^VSCALEFPHZ(128|256)rrk(z?)$", - "^VSCALEFSHZrrb_Intk(z?)$", + "^VSCALEFSHZrrbk(z?)_Int$", "^VSCALEFSHZrrk(z?)$")>; def SPRWriteResGroup536 : SchedWriteRes<[SPRPort00, SPRPort02_03_10]> { @@ -4944,7 +4944,7 @@ def SPRWriteResGroup540 : SchedWriteRes<[SPRPort00, SPRPort02_03_10]> { } def : InstRW<[SPRWriteResGroup540, ReadAfterVecXLd], (instregex "^VSQRTPDZ128m(bk|kz)$", "^VSQRTPDZ128m(k|bkz)$")>; -def : InstRW<[SPRWriteResGroup540, ReadAfterVecLd], (instregex "^VSQRTSDZm_Intk(z?)$")>; +def : InstRW<[SPRWriteResGroup540, ReadAfterVecLd], (instregex "^VSQRTSDZmk(z?)_Int$")>; def SPRWriteResGroup541 : SchedWriteRes<[SPRPort00, SPRPort00_05, SPRPort02_03_10]> { let ReleaseAtCycles = [2, 1, 1]; diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td index 38f9b5ef1d80b..c5478dd9fc13d 100644 --- a/llvm/lib/Target/X86/X86ScheduleZnver4.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td @@ -1545,7 +1545,7 @@ def Zn4WriteSCALErr: SchedWriteRes<[Zn4FPFMisc23]> { let NumMicroOps = 2; } def : InstRW<[Zn4WriteSCALErr], (instregex - "V(SCALEF|REDUCE)(S|P)(S|D)(Z?|Z128?|Z256?)(rr|rrb|rrkz|rrik|rrikz|rri)(_Int?|_Intkz?)", + "V(SCALEF|REDUCE)(S|P)(S|D)(Z?|Z128?|Z256?)(rr|rrb|rrkz|rrik|rrikz|rri)(_Int?)", "(V?)REDUCE(PD|PS|SD|SS)(Z?|Z128?)(rri|rrikz|rrib)" )>; @@ -1585,7 +1585,7 @@ def : InstRW<[Zn4WriteSHIFTrr], (instregex "(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z?|Z128?|Z256?)(rr|rrk|rrkz)", "(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z256?)(ri|rik|rikz)", "(V?)P(ROL|ROR)(D|Q)(Z?|Z128?)(ri|rik|rikz)", - "VPSHUFBITQMBZ128rr", "VFMSUB231SSZr_Intkz" + "VPSHUFBITQMBZ128rr", "VFMSUB231SSZrkz_Int" )>; def Zn4WriteSHIFTri: SchedWriteRes<[Zn4FPFMisc01]> { diff --git 
a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc index 8cfaa18a5cfac..954c05bdb2076 100644 --- a/llvm/test/TableGen/x86-fold-tables.inc +++ b/llvm/test/TableGen/x86-fold-tables.inc @@ -4239,9 +4239,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VADDPSZ128rrkz, X86::VADDPSZ128rmkz, 0}, {X86::VADDPSZ256rrkz, X86::VADDPSZ256rmkz, 0}, {X86::VADDPSZrrkz, X86::VADDPSZrmkz, 0}, - {X86::VADDSDZrr_Intkz, X86::VADDSDZrm_Intkz, TB_NO_REVERSE}, - {X86::VADDSHZrr_Intkz, X86::VADDSHZrm_Intkz, TB_NO_REVERSE}, - {X86::VADDSSZrr_Intkz, X86::VADDSSZrm_Intkz, TB_NO_REVERSE}, + {X86::VADDSDZrrkz_Int, X86::VADDSDZrmkz_Int, TB_NO_REVERSE}, + {X86::VADDSHZrrkz_Int, X86::VADDSHZrmkz_Int, TB_NO_REVERSE}, + {X86::VADDSSZrrkz_Int, X86::VADDSSZrmkz_Int, TB_NO_REVERSE}, {X86::VALIGNDZ128rrikz, X86::VALIGNDZ128rmikz, 0}, {X86::VALIGNDZ256rrikz, X86::VALIGNDZ256rmikz, 0}, {X86::VALIGNDZrrikz, X86::VALIGNDZrmikz, 0}, @@ -4288,9 +4288,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VCMPPSZ128rrik, X86::VCMPPSZ128rmik, 0}, {X86::VCMPPSZ256rrik, X86::VCMPPSZ256rmik, 0}, {X86::VCMPPSZrrik, X86::VCMPPSZrmik, 0}, - {X86::VCMPSDZrri_Intk, X86::VCMPSDZrmi_Intk, TB_NO_REVERSE}, - {X86::VCMPSHZrri_Intk, X86::VCMPSHZrmi_Intk, TB_NO_REVERSE}, - {X86::VCMPSSZrri_Intk, X86::VCMPSSZrmi_Intk, TB_NO_REVERSE}, + {X86::VCMPSDZrrik_Int, X86::VCMPSDZrmik_Int, TB_NO_REVERSE}, + {X86::VCMPSHZrrik_Int, X86::VCMPSHZrmik_Int, TB_NO_REVERSE}, + {X86::VCMPSSZrrik_Int, X86::VCMPSSZrmik_Int, TB_NO_REVERSE}, {X86::VCVT2PS2PHXZ128rrkz, X86::VCVT2PS2PHXZ128rmkz, 0}, {X86::VCVT2PS2PHXZ256rrkz, X86::VCVT2PS2PHXZ256rmkz, 0}, {X86::VCVT2PS2PHXZrrkz, X86::VCVT2PS2PHXZrmkz, 0}, @@ -4438,12 +4438,12 @@ static const X86FoldTableEntry Table3[] = { {X86::VCVTQQ2PSZ128rrk, X86::VCVTQQ2PSZ128rmk, 0}, {X86::VCVTQQ2PSZ256rrk, X86::VCVTQQ2PSZ256rmk, 0}, {X86::VCVTQQ2PSZrrk, X86::VCVTQQ2PSZrmk, 0}, - {X86::VCVTSD2SHZrr_Intkz, X86::VCVTSD2SHZrm_Intkz, TB_NO_REVERSE}, - {X86::VCVTSD2SSZrr_Intkz, 
X86::VCVTSD2SSZrm_Intkz, TB_NO_REVERSE}, - {X86::VCVTSH2SDZrr_Intkz, X86::VCVTSH2SDZrm_Intkz, TB_NO_REVERSE}, - {X86::VCVTSH2SSZrr_Intkz, X86::VCVTSH2SSZrm_Intkz, TB_NO_REVERSE}, - {X86::VCVTSS2SDZrr_Intkz, X86::VCVTSS2SDZrm_Intkz, TB_NO_REVERSE}, - {X86::VCVTSS2SHZrr_Intkz, X86::VCVTSS2SHZrm_Intkz, TB_NO_REVERSE}, + {X86::VCVTSD2SHZrrkz_Int, X86::VCVTSD2SHZrmkz_Int, TB_NO_REVERSE}, + {X86::VCVTSD2SSZrrkz_Int, X86::VCVTSD2SSZrmkz_Int, TB_NO_REVERSE}, + {X86::VCVTSH2SDZrrkz_Int, X86::VCVTSH2SDZrmkz_Int, TB_NO_REVERSE}, + {X86::VCVTSH2SSZrrkz_Int, X86::VCVTSH2SSZrmkz_Int, TB_NO_REVERSE}, + {X86::VCVTSS2SDZrrkz_Int, X86::VCVTSS2SDZrmkz_Int, TB_NO_REVERSE}, + {X86::VCVTSS2SHZrrkz_Int, X86::VCVTSS2SHZrmkz_Int, TB_NO_REVERSE}, {X86::VCVTTNEBF162IBSZ128rrk, X86::VCVTTNEBF162IBSZ128rmk, 0}, {X86::VCVTTNEBF162IBSZ256rrk, X86::VCVTTNEBF162IBSZ256rmk, 0}, {X86::VCVTTNEBF162IBSZrrk, X86::VCVTTNEBF162IBSZrmk, 0}, @@ -4567,9 +4567,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VDIVPSZ128rrkz, X86::VDIVPSZ128rmkz, 0}, {X86::VDIVPSZ256rrkz, X86::VDIVPSZ256rmkz, 0}, {X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0}, - {X86::VDIVSDZrr_Intkz, X86::VDIVSDZrm_Intkz, TB_NO_REVERSE}, - {X86::VDIVSHZrr_Intkz, X86::VDIVSHZrm_Intkz, TB_NO_REVERSE}, - {X86::VDIVSSZrr_Intkz, X86::VDIVSSZrm_Intkz, TB_NO_REVERSE}, + {X86::VDIVSDZrrkz_Int, X86::VDIVSDZrmkz_Int, TB_NO_REVERSE}, + {X86::VDIVSHZrrkz_Int, X86::VDIVSHZrmkz_Int, TB_NO_REVERSE}, + {X86::VDIVSSZrrkz_Int, X86::VDIVSSZrmkz_Int, TB_NO_REVERSE}, {X86::VDPBF16PSZ128r, X86::VDPBF16PSZ128m, 0}, {X86::VDPBF16PSZ256r, X86::VDPBF16PSZ256m, 0}, {X86::VDPBF16PSZr, X86::VDPBF16PSZm, 0}, @@ -5110,9 +5110,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VMAXPSZ128rrkz, X86::VMAXPSZ128rmkz, 0}, {X86::VMAXPSZ256rrkz, X86::VMAXPSZ256rmkz, 0}, {X86::VMAXPSZrrkz, X86::VMAXPSZrmkz, 0}, - {X86::VMAXSDZrr_Intkz, X86::VMAXSDZrm_Intkz, TB_NO_REVERSE}, - {X86::VMAXSHZrr_Intkz, X86::VMAXSHZrm_Intkz, TB_NO_REVERSE}, - {X86::VMAXSSZrr_Intkz, 
X86::VMAXSSZrm_Intkz, TB_NO_REVERSE}, + {X86::VMAXSDZrrkz_Int, X86::VMAXSDZrmkz_Int, TB_NO_REVERSE}, + {X86::VMAXSHZrrkz_Int, X86::VMAXSHZrmkz_Int, TB_NO_REVERSE}, + {X86::VMAXSSZrrkz_Int, X86::VMAXSSZrmkz_Int, TB_NO_REVERSE}, {X86::VMINCPDZ128rrkz, X86::VMINCPDZ128rmkz, 0}, {X86::VMINCPDZ256rrkz, X86::VMINCPDZ256rmkz, 0}, {X86::VMINCPDZrrkz, X86::VMINCPDZrmkz, 0}, @@ -5134,9 +5134,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VMINMAXPSZ128rrikz, X86::VMINMAXPSZ128rmikz, 0}, {X86::VMINMAXPSZ256rrikz, X86::VMINMAXPSZ256rmikz, 0}, {X86::VMINMAXPSZrrikz, X86::VMINMAXPSZrmikz, 0}, - {X86::VMINMAXSDrri_Intkz, X86::VMINMAXSDrmi_Intkz, TB_NO_REVERSE}, - {X86::VMINMAXSHrri_Intkz, X86::VMINMAXSHrmi_Intkz, TB_NO_REVERSE}, - {X86::VMINMAXSSrri_Intkz, X86::VMINMAXSSrmi_Intkz, TB_NO_REVERSE}, + {X86::VMINMAXSDrrikz_Int, X86::VMINMAXSDrmikz_Int, TB_NO_REVERSE}, + {X86::VMINMAXSHrrikz_Int, X86::VMINMAXSHrmikz_Int, TB_NO_REVERSE}, + {X86::VMINMAXSSrrikz_Int, X86::VMINMAXSSrmikz_Int, TB_NO_REVERSE}, {X86::VMINPBF16Z128rrkz, X86::VMINPBF16Z128rmkz, 0}, {X86::VMINPBF16Z256rrkz, X86::VMINPBF16Z256rmkz, 0}, {X86::VMINPBF16Zrrkz, X86::VMINPBF16Zrmkz, 0}, @@ -5149,9 +5149,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VMINPSZ128rrkz, X86::VMINPSZ128rmkz, 0}, {X86::VMINPSZ256rrkz, X86::VMINPSZ256rmkz, 0}, {X86::VMINPSZrrkz, X86::VMINPSZrmkz, 0}, - {X86::VMINSDZrr_Intkz, X86::VMINSDZrm_Intkz, TB_NO_REVERSE}, - {X86::VMINSHZrr_Intkz, X86::VMINSHZrm_Intkz, TB_NO_REVERSE}, - {X86::VMINSSZrr_Intkz, X86::VMINSSZrm_Intkz, TB_NO_REVERSE}, + {X86::VMINSDZrrkz_Int, X86::VMINSDZrmkz_Int, TB_NO_REVERSE}, + {X86::VMINSHZrrkz_Int, X86::VMINSHZrmkz_Int, TB_NO_REVERSE}, + {X86::VMINSSZrrkz_Int, X86::VMINSSZrmkz_Int, TB_NO_REVERSE}, {X86::VMOVAPDZ128rrk, X86::VMOVAPDZ128rmk, TB_NO_REVERSE|TB_ALIGN_16}, {X86::VMOVAPDZ256rrk, X86::VMOVAPDZ256rmk, TB_NO_REVERSE|TB_ALIGN_32}, {X86::VMOVAPDZrrk, X86::VMOVAPDZrmk, TB_NO_REVERSE|TB_ALIGN_64}, @@ -5206,9 +5206,9 @@ static const X86FoldTableEntry 
Table3[] = { {X86::VMULPSZ128rrkz, X86::VMULPSZ128rmkz, 0}, {X86::VMULPSZ256rrkz, X86::VMULPSZ256rmkz, 0}, {X86::VMULPSZrrkz, X86::VMULPSZrmkz, 0}, - {X86::VMULSDZrr_Intkz, X86::VMULSDZrm_Intkz, TB_NO_REVERSE}, - {X86::VMULSHZrr_Intkz, X86::VMULSHZrm_Intkz, TB_NO_REVERSE}, - {X86::VMULSSZrr_Intkz, X86::VMULSSZrm_Intkz, TB_NO_REVERSE}, + {X86::VMULSDZrrkz_Int, X86::VMULSDZrmkz_Int, TB_NO_REVERSE}, + {X86::VMULSHZrrkz_Int, X86::VMULSHZrmkz_Int, TB_NO_REVERSE}, + {X86::VMULSSZrrkz_Int, X86::VMULSSZrmkz_Int, TB_NO_REVERSE}, {X86::VORPDZ128rrkz, X86::VORPDZ128rmkz, 0}, {X86::VORPDZ256rrkz, X86::VORPDZ256rmkz, 0}, {X86::VORPDZrrkz, X86::VORPDZrmkz, 0}, @@ -5972,9 +5972,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VRNDSCALEPSZ128rrik, X86::VRNDSCALEPSZ128rmik, 0}, {X86::VRNDSCALEPSZ256rrik, X86::VRNDSCALEPSZ256rmik, 0}, {X86::VRNDSCALEPSZrrik, X86::VRNDSCALEPSZrmik, 0}, - {X86::VRNDSCALESDZrri_Intkz, X86::VRNDSCALESDZrmi_Intkz, TB_NO_REVERSE}, - {X86::VRNDSCALESHZrri_Intkz, X86::VRNDSCALESHZrmi_Intkz, TB_NO_REVERSE}, - {X86::VRNDSCALESSZrri_Intkz, X86::VRNDSCALESSZrmi_Intkz, TB_NO_REVERSE}, + {X86::VRNDSCALESDZrrikz_Int, X86::VRNDSCALESDZrmikz_Int, TB_NO_REVERSE}, + {X86::VRNDSCALESHZrrikz_Int, X86::VRNDSCALESHZrmikz_Int, TB_NO_REVERSE}, + {X86::VRNDSCALESSZrrikz_Int, X86::VRNDSCALESSZrmikz_Int, TB_NO_REVERSE}, {X86::VRSQRT14PDZ128rk, X86::VRSQRT14PDZ128mk, 0}, {X86::VRSQRT14PDZ256rk, X86::VRSQRT14PDZ256mk, 0}, {X86::VRSQRT14PDZrk, X86::VRSQRT14PDZmk, 0}, @@ -6038,9 +6038,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VSQRTPSZ128rk, X86::VSQRTPSZ128mk, 0}, {X86::VSQRTPSZ256rk, X86::VSQRTPSZ256mk, 0}, {X86::VSQRTPSZrk, X86::VSQRTPSZmk, 0}, - {X86::VSQRTSDZr_Intkz, X86::VSQRTSDZm_Intkz, TB_NO_REVERSE}, - {X86::VSQRTSHZr_Intkz, X86::VSQRTSHZm_Intkz, TB_NO_REVERSE}, - {X86::VSQRTSSZr_Intkz, X86::VSQRTSSZm_Intkz, TB_NO_REVERSE}, + {X86::VSQRTSDZrkz_Int, X86::VSQRTSDZmkz_Int, TB_NO_REVERSE}, + {X86::VSQRTSHZrkz_Int, X86::VSQRTSHZmkz_Int, TB_NO_REVERSE}, + 
{X86::VSQRTSSZrkz_Int, X86::VSQRTSSZmkz_Int, TB_NO_REVERSE}, {X86::VSUBNEPBF16Z128rrkz, X86::VSUBNEPBF16Z128rmkz, 0}, {X86::VSUBNEPBF16Z256rrkz, X86::VSUBNEPBF16Z256rmkz, 0}, {X86::VSUBNEPBF16Zrrkz, X86::VSUBNEPBF16Zrmkz, 0}, @@ -6053,9 +6053,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VSUBPSZ128rrkz, X86::VSUBPSZ128rmkz, 0}, {X86::VSUBPSZ256rrkz, X86::VSUBPSZ256rmkz, 0}, {X86::VSUBPSZrrkz, X86::VSUBPSZrmkz, 0}, - {X86::VSUBSDZrr_Intkz, X86::VSUBSDZrm_Intkz, TB_NO_REVERSE}, - {X86::VSUBSHZrr_Intkz, X86::VSUBSHZrm_Intkz, TB_NO_REVERSE}, - {X86::VSUBSSZrr_Intkz, X86::VSUBSSZrm_Intkz, TB_NO_REVERSE}, + {X86::VSUBSDZrrkz_Int, X86::VSUBSDZrmkz_Int, TB_NO_REVERSE}, + {X86::VSUBSHZrrkz_Int, X86::VSUBSHZrmkz_Int, TB_NO_REVERSE}, + {X86::VSUBSSZrrkz_Int, X86::VSUBSSZrmkz_Int, TB_NO_REVERSE}, {X86::VUNPCKHPDZ128rrkz, X86::VUNPCKHPDZ128rmkz, 0}, {X86::VUNPCKHPDZ256rrkz, X86::VUNPCKHPDZ256rmkz, 0}, {X86::VUNPCKHPDZrrkz, X86::VUNPCKHPDZrmkz, 0}, @@ -6089,9 +6089,9 @@ static const X86FoldTableEntry Table4[] = { {X86::VADDPSZ128rrk, X86::VADDPSZ128rmk, 0}, {X86::VADDPSZ256rrk, X86::VADDPSZ256rmk, 0}, {X86::VADDPSZrrk, X86::VADDPSZrmk, 0}, - {X86::VADDSDZrr_Intk, X86::VADDSDZrm_Intk, TB_NO_REVERSE}, - {X86::VADDSHZrr_Intk, X86::VADDSHZrm_Intk, TB_NO_REVERSE}, - {X86::VADDSSZrr_Intk, X86::VADDSSZrm_Intk, TB_NO_REVERSE}, + {X86::VADDSDZrrk_Int, X86::VADDSDZrmk_Int, TB_NO_REVERSE}, + {X86::VADDSHZrrk_Int, X86::VADDSHZrmk_Int, TB_NO_REVERSE}, + {X86::VADDSSZrrk_Int, X86::VADDSSZrmk_Int, TB_NO_REVERSE}, {X86::VALIGNDZ128rrik, X86::VALIGNDZ128rmik, 0}, {X86::VALIGNDZ256rrik, X86::VALIGNDZ256rmik, 0}, {X86::VALIGNDZrrik, X86::VALIGNDZrmik, 0}, @@ -6140,12 +6140,12 @@ static const X86FoldTableEntry Table4[] = { {X86::VCVTNE2PS2BF16Z128rrk, X86::VCVTNE2PS2BF16Z128rmk, 0}, {X86::VCVTNE2PS2BF16Z256rrk, X86::VCVTNE2PS2BF16Z256rmk, 0}, {X86::VCVTNE2PS2BF16Zrrk, X86::VCVTNE2PS2BF16Zrmk, 0}, - {X86::VCVTSD2SHZrr_Intk, X86::VCVTSD2SHZrm_Intk, TB_NO_REVERSE}, - {X86::VCVTSD2SSZrr_Intk, 
X86::VCVTSD2SSZrm_Intk, TB_NO_REVERSE}, - {X86::VCVTSH2SDZrr_Intk, X86::VCVTSH2SDZrm_Intk, TB_NO_REVERSE}, - {X86::VCVTSH2SSZrr_Intk, X86::VCVTSH2SSZrm_Intk, TB_NO_REVERSE}, - {X86::VCVTSS2SDZrr_Intk, X86::VCVTSS2SDZrm_Intk, TB_NO_REVERSE}, - {X86::VCVTSS2SHZrr_Intk, X86::VCVTSS2SHZrm_Intk, TB_NO_REVERSE}, + {X86::VCVTSD2SHZrrk_Int, X86::VCVTSD2SHZrmk_Int, TB_NO_REVERSE}, + {X86::VCVTSD2SSZrrk_Int, X86::VCVTSD2SSZrmk_Int, TB_NO_REVERSE}, + {X86::VCVTSH2SDZrrk_Int, X86::VCVTSH2SDZrmk_Int, TB_NO_REVERSE}, + {X86::VCVTSH2SSZrrk_Int, X86::VCVTSH2SSZrmk_Int, TB_NO_REVERSE}, + {X86::VCVTSS2SDZrrk_Int, X86::VCVTSS2SDZrmk_Int, TB_NO_REVERSE}, + {X86::VCVTSS2SHZrrk_Int, X86::VCVTSS2SHZrmk_Int, TB_NO_REVERSE}, {X86::VDBPSADBWZ128rrik, X86::VDBPSADBWZ128rmik, 0}, {X86::VDBPSADBWZ256rrik, X86::VDBPSADBWZ256rmik, 0}, {X86::VDBPSADBWZrrik, X86::VDBPSADBWZrmik, 0}, @@ -6161,9 +6161,9 @@ static const X86FoldTableEntry Table4[] = { {X86::VDIVPSZ128rrk, X86::VDIVPSZ128rmk, 0}, {X86::VDIVPSZ256rrk, X86::VDIVPSZ256rmk, 0}, {X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0}, - {X86::VDIVSDZrr_Intk, X86::VDIVSDZrm_Intk, TB_NO_REVERSE}, - {X86::VDIVSHZrr_Intk, X86::VDIVSHZrm_Intk, TB_NO_REVERSE}, - {X86::VDIVSSZrr_Intk, X86::VDIVSSZrm_Intk, TB_NO_REVERSE}, + {X86::VDIVSDZrrk_Int, X86::VDIVSDZrmk_Int, TB_NO_REVERSE}, + {X86::VDIVSHZrrk_Int, X86::VDIVSHZrmk_Int, TB_NO_REVERSE}, + {X86::VDIVSSZrrk_Int, X86::VDIVSSZrmk_Int, TB_NO_REVERSE}, {X86::VDPBF16PSZ128rk, X86::VDPBF16PSZ128mk, 0}, {X86::VDPBF16PSZ128rkz, X86::VDPBF16PSZ128mkz, 0}, {X86::VDPBF16PSZ256rk, X86::VDPBF16PSZ256mk, 0}, @@ -6228,12 +6228,12 @@ static const X86FoldTableEntry Table4[] = { {X86::VFMADD132PSZ256rkz, X86::VFMADD132PSZ256mkz, 0}, {X86::VFMADD132PSZrk, X86::VFMADD132PSZmk, 0}, {X86::VFMADD132PSZrkz, X86::VFMADD132PSZmkz, 0}, - {X86::VFMADD132SDZr_Intk, X86::VFMADD132SDZm_Intk, TB_NO_REVERSE}, - {X86::VFMADD132SDZr_Intkz, X86::VFMADD132SDZm_Intkz, TB_NO_REVERSE}, - {X86::VFMADD132SHZr_Intk, X86::VFMADD132SHZm_Intk, 
TB_NO_REVERSE}, - {X86::VFMADD132SHZr_Intkz, X86::VFMADD132SHZm_Intkz, TB_NO_REVERSE}, - {X86::VFMADD132SSZr_Intk, X86::VFMADD132SSZm_Intk, TB_NO_REVERSE}, - {X86::VFMADD132SSZr_Intkz, X86::VFMADD132SSZm_Intkz, TB_NO_REVERSE}, + {X86::VFMADD132SDZrk_Int, X86::VFMADD132SDZmk_Int, TB_NO_REVERSE}, + {X86::VFMADD132SDZrkz_Int, X86::VFMADD132SDZmkz_Int, TB_NO_REVERSE}, + {X86::VFMADD132SHZrk_Int, X86::VFMADD132SHZmk_Int, TB_NO_REVERSE}, + {X86::VFMADD132SHZrkz_Int, X86::VFMADD132SHZmkz_Int, TB_NO_REVERSE}, + {X86::VFMADD132SSZrk_Int, X86::VFMADD132SSZmk_Int, TB_NO_REVERSE}, + {X86::VFMADD132SSZrkz_Int, X86::VFMADD132SSZmkz_Int, TB_NO_REVERSE}, {X86::VFMADD213NEPBF16Z128rk, X86::VFMADD213NEPBF16Z128mk, 0}, {X86::VFMADD213NEPBF16Z128rkz, X86::VFMADD213NEPBF16Z128mkz, 0}, {X86::VFMADD213NEPBF16Z256rk, X86::VFMADD213NEPBF16Z256mk, 0}, @@ -6258,12 +6258,12 @@ static const X86FoldTableEntry Table4[] = { {X86::VFMADD213PSZ256rkz, X86::VFMADD213PSZ256mkz, 0}, {X86::VFMADD213PSZrk, X86::VFMADD213PSZmk, 0}, {X86::VFMADD213PSZrkz, X86::VFMADD213PSZmkz, 0}, - {X86::VFMADD213SDZr_Intk, X86::VFMADD213SDZm_Intk, TB_NO_REVERSE}, - {X86::VFMADD213SDZr_Intkz, X86::VFMADD213SDZm_Intkz, TB_NO_REVERSE}, - {X86::VFMADD213SHZr_Intk, X86::VFMADD213SHZm_Intk, TB_NO_REVERSE}, - {X86::VFMADD213SHZr_Intkz, X86::VFMADD213SHZm_Intkz, TB_NO_REVERSE}, - {X86::VFMADD213SSZr_Intk, X86::VFMADD213SSZm_Intk, TB_NO_REVERSE}, - {X86::VFMADD213SSZr_Intkz, X86::VFMADD213SSZm_Intkz, TB_NO_REVERSE}, + {X86::VFMADD213SDZrk_Int, X86::VFMADD213SDZmk_Int, TB_NO_REVERSE}, + {X86::VFMADD213SDZrkz_Int, X86::VFMADD213SDZmkz_Int, TB_NO_REVERSE}, + {X86::VFMADD213SHZrk_Int, X86::VFMADD213SHZmk_Int, TB_NO_REVERSE}, + {X86::VFMADD213SHZrkz_Int, X86::VFMADD213SHZmkz_Int, TB_NO_REVERSE}, + {X86::VFMADD213SSZrk_Int, X86::VFMADD213SSZmk_Int, TB_NO_REVERSE}, + {X86::VFMADD213SSZrkz_Int, X86::VFMADD213SSZmkz_Int, TB_NO_REVERSE}, {X86::VFMADD231NEPBF16Z128rk, X86::VFMADD231NEPBF16Z128mk, 0}, {X86::VFMADD231NEPBF16Z128rkz, 
X86::VFMADD231NEPBF16Z128mkz, 0}, {X86::VFMADD231NEPBF16Z256rk, X86::VFMADD231NEPBF16Z256mk, 0}, @@ -6288,12 +6288,12 @@ static const X86FoldTableEntry Table4[] = { {X86::VFMADD231PSZ256rkz, X86::VFMADD231PSZ256mkz, 0}, {X86::VFMADD231PSZrk, X86::VFMADD231PSZmk, 0}, {X86::VFMADD231PSZrkz, X86::VFMADD231PSZmkz, 0}, - {X86::VFMADD231SDZr_Intk, X86::VFMADD231SDZm_Intk, TB_NO_REVERSE}, - {X86::VFMADD231SDZr_Intkz, X86::VFMADD231SDZm_Intkz, TB_NO_REVERSE}, - {X86::VFMADD231SHZr_Intk, X86::VFMADD231SHZm_Intk, TB_NO_REVERSE}, - {X86::VFMADD231SHZr_Intkz, X86::VFMADD231SHZm_Intkz, TB_NO_REVERSE}, - {X86::VFMADD231SSZr_Intk, X86::VFMADD231SSZm_Intk, TB_NO_REVERSE}, - {X86::VFMADD231SSZr_Intkz, X86::VFMADD231SSZm_Intkz, TB_NO_REVERSE}, + {X86::VFMADD231SDZrk_Int, X86::VFMADD231SDZmk_Int, TB_NO_REVERSE}, + {X86::VFMADD231SDZrkz_Int, X86::VFMADD231SDZmkz_Int, TB_NO_REVERSE}, + {X86::VFMADD231SHZrk_Int, X86::VFMADD231SHZmk_Int, TB_NO_REVERSE}, + {X86::VFMADD231SHZrkz_Int, X86::VFMADD231SHZmkz_Int, TB_NO_REVERSE}, + {X86::VFMADD231SSZrk_Int, X86::VFMADD231SSZmk_Int, TB_NO_REVERSE}, + {X86::VFMADD231SSZrkz_Int, X86::VFMADD231SSZmkz_Int, TB_NO_REVERSE}, {X86::VFMADDCPHZ128rk, X86::VFMADDCPHZ128mk, 0}, {X86::VFMADDCPHZ128rkz, X86::VFMADDCPHZ128mkz, 0}, {X86::VFMADDCPHZ256rk, X86::VFMADDCPHZ256mk, 0}, @@ -6380,12 +6380,12 @@ static const X86FoldTableEntry Table4[] = { {X86::VFMSUB132PSZ256rkz, X86::VFMSUB132PSZ256mkz, 0}, {X86::VFMSUB132PSZrk, X86::VFMSUB132PSZmk, 0}, {X86::VFMSUB132PSZrkz, X86::VFMSUB132PSZmkz, 0}, - {X86::VFMSUB132SDZr_Intk, X86::VFMSUB132SDZm_Intk, TB_NO_REVERSE}, - {X86::VFMSUB132SDZr_Intkz, X86::VFMSUB132SDZm_Intkz, TB_NO_REVERSE}, - {X86::VFMSUB132SHZr_Intk, X86::VFMSUB132SHZm_Intk, TB_NO_REVERSE}, - {X86::VFMSUB132SHZr_Intkz, X86::VFMSUB132SHZm_Intkz, TB_NO_REVERSE}, - {X86::VFMSUB132SSZr_Intk, X86::VFMSUB132SSZm_Intk, TB_NO_REVERSE}, - {X86::VFMSUB132SSZr_Intkz, X86::VFMSUB132SSZm_Intkz, TB_NO_REVERSE}, + {X86::VFMSUB132SDZrk_Int, X86::VFMSUB132SDZmk_Int, 
TB_NO_REVERSE}, + {X86::VFMSUB132SDZrkz_Int, X86::VFMSUB132SDZmkz_Int, TB_NO_REVERSE}, + {X86::VFMSUB132SHZrk_Int, X86::VFMSUB132SHZmk_Int, TB_NO_REVERSE}, + {X86::VFMSUB132SHZrkz_Int, X86::VFMSUB132SHZmkz_Int, TB_NO_REVERSE}, + {X86::VFMSUB132SSZrk_Int, X86::VFMSUB132SSZmk_Int, TB_NO_REVERSE}, + {X86::VFMSUB132SSZrkz_Int, X86::VFMSUB132SSZmkz_Int, TB_NO_REVERSE}, {X86::VFMSUB213NEPBF16Z128rk, X86::VFMSUB213NEPBF16Z128mk, 0}, {X86::VFMSUB213NEPBF16Z128rkz, X86::VFMSUB213NEPBF16Z128mkz, 0}, {X86::VFMSUB213NEPBF16Z256rk, X86::VFMSUB213NEPBF16Z256mk, 0}, @@ -6410,12 +6410,12 @@ static const X86FoldTableEntry Table4[] = { {X86::VFMSUB213PSZ256rkz, X86::VFMSUB213PSZ256mkz, 0}, {X86::VFMSUB213PSZrk, X86::VFMSUB213PSZmk, 0}, {X86::VFMSUB213PSZrkz, X86::VFMSUB213PSZmkz, 0}, - {X86::VFMSUB213SDZr_Intk, X86::VFMSUB213SDZm_Intk, TB_NO_REVERSE}, - {X86::VFMSUB213SDZr_Intkz, X86::VFMSUB213SDZm_Intkz, TB_NO_REVERSE}, - {X86::VFMSUB213SHZr_Intk, X86::VFMSUB213SHZm_Intk, TB_NO_REVERSE}, - {X86::VFMSUB213SHZr_Intkz, X86::VFMSUB213SHZm_Intkz, TB_NO_REVERSE}, - {X86::VFMSUB213SSZr_Intk, X86::VFMSUB213SSZm_Intk, TB_NO_REVERSE}, - {X86::VFMSUB213SSZr_Intkz, X86::VFMSUB213SSZm_Intkz, TB_NO_REVERSE}, + {X86::VFMSUB213SDZrk_Int, X86::VFMSUB213SDZmk_Int, TB_NO_REVERSE}, + {X86::VFMSUB213SDZrkz_Int, X86::VFMSUB213SDZmkz_Int, TB_NO_REVERSE}, + {X86::VFMSUB213SHZrk_Int, X86::VFMSUB213SHZmk_Int, TB_NO_REVERSE}, + {X86::VFMSUB213SHZrkz_Int, X86::VFMSUB213SHZmkz_Int, TB_NO_REVERSE}, + {X86::VFMSUB213SSZrk_Int, X86::VFMSUB213SSZmk_Int, TB_NO_REVERSE}, + {X86::VFMSUB213SSZrkz_Int, X86::VFMSUB213SSZmkz_Int, TB_NO_REVERSE}, {X86::VFMSUB231NEPBF16Z128rk, X86::VFMSUB231NEPBF16Z128mk, 0}, {X86::VFMSUB231NEPBF16Z128rkz, X86::VFMSUB231NEPBF16Z128mkz, 0}, {X86::VFMSUB231NEPBF16Z256rk, X86::VFMSUB231NEPBF16Z256mk, 0}, @@ -6440,12 +6440,12 @@ static const X86FoldTableEntry Table4[] = { {X86::VFMSUB231PSZ256rkz, X86::VFMSUB231PSZ256mkz, 0}, {X86::VFMSUB231PSZrk, X86::VFMSUB231PSZmk, 0}, 
{X86::VFMSUB231PSZrkz, X86::VFMSUB231PSZmkz, 0}, - {X86::VFMSUB231SDZr_Intk, X86::VFMSUB231SDZm_Intk, TB_NO_REVERSE}, - {X86::VFMSUB231SDZr_Intkz, X86::VFMSUB231SDZm_Intkz, TB_NO_REVERSE}, - {X86::VFMSUB231SHZr_Intk, X86::VFMSUB231SHZm_Intk, TB_NO_REVERSE}, - {X86::VFMSUB231SHZr_Intkz, X86::VFMSUB231SHZm_Intkz, TB_NO_REVERSE}, - {X86::VFMSUB231SSZr_Intk, X86::VFMSUB231SSZm_Intk, TB_NO_REVERSE}, - {X86::VFMSUB231SSZr_Intkz, X86::VFMSUB231SSZm_Intkz, TB_NO_REVERSE}, + {X86::VFMSUB231SDZrk_Int, X86::VFMSUB231SDZmk_Int, TB_NO_REVERSE}, + {X86::VFMSUB231SDZrkz_Int, X86::VFMSUB231SDZmkz_Int, TB_NO_REVERSE}, + {X86::VFMSUB231SHZrk_Int, X86::VFMSUB231SHZmk_Int, TB_NO_REVERSE}, + {X86::VFMSUB231SHZrkz_Int, X86::VFMSUB231SHZmkz_Int, TB_NO_REVERSE}, + {X86::VFMSUB231SSZrk_Int, X86::VFMSUB231SSZmk_Int, TB_NO_REVERSE}, + {X86::VFMSUB231SSZrkz_Int, X86::VFMSUB231SSZmkz_Int, TB_NO_REVERSE}, {X86::VFMSUBADD132PDZ128rk, X86::VFMSUBADD132PDZ128mk, 0}, {X86::VFMSUBADD132PDZ128rkz, X86::VFMSUBADD132PDZ128mkz, 0}, {X86::VFMSUBADD132PDZ256rk, X86::VFMSUBADD132PDZ256mk, 0}, @@ -6528,12 +6528,12 @@ static const X86FoldTableEntry Table4[] = { {X86::VFNMADD132PSZ256rkz, X86::VFNMADD132PSZ256mkz, 0}, {X86::VFNMADD132PSZrk, X86::VFNMADD132PSZmk, 0}, {X86::VFNMADD132PSZrkz, X86::VFNMADD132PSZmkz, 0}, - {X86::VFNMADD132SDZr_Intk, X86::VFNMADD132SDZm_Intk, TB_NO_REVERSE}, - {X86::VFNMADD132SDZr_Intkz, X86::VFNMADD132SDZm_Intkz, TB_NO_REVERSE}, - {X86::VFNMADD132SHZr_Intk, X86::VFNMADD132SHZm_Intk, TB_NO_REVERSE}, - {X86::VFNMADD132SHZr_Intkz, X86::VFNMADD132SHZm_Intkz, TB_NO_REVERSE}, - {X86::VFNMADD132SSZr_Intk, X86::VFNMADD132SSZm_Intk, TB_NO_REVERSE}, - {X86::VFNMADD132SSZr_Intkz, X86::VFNMADD132SSZm_Intkz, TB_NO_REVERSE}, + {X86::VFNMADD132SDZrk_Int, X86::VFNMADD132SDZmk_Int, TB_NO_REVERSE}, + {X86::VFNMADD132SDZrkz_Int, X86::VFNMADD132SDZmkz_Int, TB_NO_REVERSE}, + {X86::VFNMADD132SHZrk_Int, X86::VFNMADD132SHZmk_Int, TB_NO_REVERSE}, + {X86::VFNMADD132SHZrkz_Int, X86::VFNMADD132SHZmkz_Int, 
TB_NO_REVERSE}, + {X86::VFNMADD132SSZrk_Int, X86::VFNMADD132SSZmk_Int, TB_NO_REVERSE}, + {X86::VFNMADD132SSZrkz_Int, X86::VFNMADD132SSZmkz_Int, TB_NO_REVERSE}, {X86::VFNMADD213NEPBF16Z128rk, X86::VFNMADD213NEPBF16Z128mk, 0}, {X86::VFNMADD213NEPBF16Z128rkz, X86::VFNMADD213NEPBF16Z128mkz, 0}, {X86::VFNMADD213NEPBF16Z256rk, X86::VFNMADD213NEPBF16Z256mk, 0}, @@ -6558,12 +6558,12 @@ static const X86FoldTableEntry Table4[] = { {X86::VFNMADD213PSZ256rkz, X86::VFNMADD213PSZ256mkz, 0}, {X86::VFNMADD213PSZrk, X86::VFNMADD213PSZmk, 0}, {X86::VFNMADD213PSZrkz, X86::VFNMADD213PSZmkz, 0}, - {X86::VFNMADD213SDZr_Intk, X86::VFNMADD213SDZm_Intk, TB_NO_REVERSE}, - {X86::VFNMADD213SDZr_Intkz, X86::VFNMADD213SDZm_Intkz, TB_NO_REVERSE}, - {X86::VFNMADD213SHZr_Intk, X86::VFNMADD213SHZm_Intk, TB_NO_REVERSE}, - {X86::VFNMADD213SHZr_Intkz, X86::VFNMADD213SHZm_Intkz, TB_NO_REVERSE}, - {X86::VFNMADD213SSZr_Intk, X86::VFNMADD213SSZm_Intk, TB_NO_REVERSE}, - {X86::VFNMADD213SSZr_Intkz, X86::VFNMADD213SSZm_Intkz, TB_NO_REVERSE}, + {X86::VFNMADD213SDZrk_Int, X86::VFNMADD213SDZmk_Int, TB_NO_REVERSE}, + {X86::VFNMADD213SDZrkz_Int, X86::VFNMADD213SDZmkz_Int, TB_NO_REVERSE}, + {X86::VFNMADD213SHZrk_Int, X86::VFNMADD213SHZmk_Int, TB_NO_REVERSE}, + {X86::VFNMADD213SHZrkz_Int, X86::VFNMADD213SHZmkz_Int, TB_NO_REVERSE}, + {X86::VFNMADD213SSZrk_Int, X86::VFNMADD213SSZmk_Int, TB_NO_REVERSE}, + {X86::VFNMADD213SSZrkz_Int, X86::VFNMADD213SSZmkz_Int, TB_NO_REVERSE}, {X86::VFNMADD231NEPBF16Z128rk, X86::VFNMADD231NEPBF16Z128mk, 0}, {X86::VFNMADD231NEPBF16Z128rkz, X86::VFNMADD231NEPBF16Z128mkz, 0}, {X86::VFNMADD231NEPBF16Z256rk, X86::VFNMADD231NEPBF16Z256mk, 0}, @@ -6588,12 +6588,12 @@ static const X86FoldTableEntry Table4[] = { {X86::VFNMADD231PSZ256rkz, X86::VFNMADD231PSZ256mkz, 0}, {X86::VFNMADD231PSZrk, X86::VFNMADD231PSZmk, 0}, {X86::VFNMADD231PSZrkz, X86::VFNMADD231PSZmkz, 0}, - {X86::VFNMADD231SDZr_Intk, X86::VFNMADD231SDZm_Intk, TB_NO_REVERSE}, - {X86::VFNMADD231SDZr_Intkz, X86::VFNMADD231SDZm_Intkz, 
TB_NO_REVERSE}, - {X86::VFNMADD231SHZr_Intk, X86::VFNMADD231SHZm_Intk, TB_NO_REVERSE}, - {X86::VFNMADD231SHZr_Intkz, X86::VFNMADD231SHZm_Intkz, TB_NO_REVERSE}, - {X86::VFNMADD231SSZr_Intk, X86::VFNMADD231SSZm_Intk, TB_NO_REVERSE}, - {X86::VFNMADD231SSZr_Intkz, X86::VFNMADD231SSZm_Intkz, TB_NO_REVERSE}, + {X86::VFNMADD231SDZrk_Int, X86::VFNMADD231SDZmk_Int, TB_NO_REVERSE}, + {X86::VFNMADD231SDZrkz_Int, X86::VFNMADD231SDZmkz_Int, TB_NO_REVERSE}, + {X86::VFNMADD231SHZrk_Int, X86::VFNMADD231SHZmk_Int, TB_NO_REVERSE}, + {X86::VFNMADD231SHZrkz_Int, X86::VFNMADD231SHZmkz_Int, TB_NO_REVERSE}, + {X86::VFNMADD231SSZrk_Int, X86::VFNMADD231SSZmk_Int, TB_NO_REVERSE}, + {X86::VFNMADD231SSZrkz_Int, X86::VFNMADD231SSZmkz_Int, TB_NO_REVERSE}, {X86::VFNMSUB132NEPBF16Z128rk, X86::VFNMSUB132NEPBF16Z128mk, 0}, {X86::VFNMSUB132NEPBF16Z128rkz, X86::VFNMSUB132NEPBF16Z128mkz, 0}, {X86::VFNMSUB132NEPBF16Z256rk, X86::VFNMSUB132NEPBF16Z256mk, 0}, @@ -6618,12 +6618,12 @@ static const X86FoldTableEntry Table4[] = { {X86::VFNMSUB132PSZ256rkz, X86::VFNMSUB132PSZ256mkz, 0}, {X86::VFNMSUB132PSZrk, X86::VFNMSUB132PSZmk, 0}, {X86::VFNMSUB132PSZrkz, X86::VFNMSUB132PSZmkz, 0}, - {X86::VFNMSUB132SDZr_Intk, X86::VFNMSUB132SDZm_Intk, TB_NO_REVERSE}, - {X86::VFNMSUB132SDZr_Intkz, X86::VFNMSUB132SDZm_Intkz, TB_NO_REVERSE}, - {X86::VFNMSUB132SHZr_Intk, X86::VFNMSUB132SHZm_Intk, TB_NO_REVERSE}, - {X86::VFNMSUB132SHZr_Intkz, X86::VFNMSUB132SHZm_Intkz, TB_NO_REVERSE}, - {X86::VFNMSUB132SSZr_Intk, X86::VFNMSUB132SSZm_Intk, TB_NO_REVERSE}, - {X86::VFNMSUB132SSZr_Intkz, X86::VFNMSUB132SSZm_Intkz, TB_NO_REVERSE}, + {X86::VFNMSUB132SDZrk_Int, X86::VFNMSUB132SDZmk_Int, TB_NO_REVERSE}, + {X86::VFNMSUB132SDZrkz_Int, X86::VFNMSUB132SDZmkz_Int, TB_NO_REVERSE}, + {X86::VFNMSUB132SHZrk_Int, X86::VFNMSUB132SHZmk_Int, TB_NO_REVERSE}, + {X86::VFNMSUB132SHZrkz_Int, X86::VFNMSUB132SHZmkz_Int, TB_NO_REVERSE}, + {X86::VFNMSUB132SSZrk_Int, X86::VFNMSUB132SSZmk_Int, TB_NO_REVERSE}, + {X86::VFNMSUB132SSZrkz_Int, 
X86::VFNMSUB132SSZmkz_Int, TB_NO_REVERSE}, {X86::VFNMSUB213NEPBF16Z128rk, X86::VFNMSUB213NEPBF16Z128mk, 0}, {X86::VFNMSUB213NEPBF16Z128rkz, X86::VFNMSUB213NEPBF16Z128mkz, 0}, {X86::VFNMSUB213NEPBF16Z256rk, X86::VFNMSUB213NEPBF16Z256mk, 0}, @@ -6648,12 +6648,12 @@ static const X86FoldTableEntry Table4[] = { {X86::VFNMSUB213PSZ256rkz, X86::VFNMSUB213PSZ256mkz, 0}, {X86::VFNMSUB213PSZrk, X86::VFNMSUB213PSZmk, 0}, {X86::VFNMSUB213PSZrkz, X86::VFNMSUB213PSZmkz, 0}, - {X86::VFNMSUB213SDZr_Intk, X86::VFNMSUB213SDZm_Intk, TB_NO_REVERSE}, - {X86::VFNMSUB213SDZr_Intkz, X86::VFNMSUB213SDZm_Intkz, TB_NO_REVERSE}, - {X86::VFNMSUB213SHZr_Intk, X86::VFNMSUB213SHZm_Intk, TB_NO_REVERSE}, - {X86::VFNMSUB213SHZr_Intkz, X86::VFNMSUB213SHZm_Intkz, TB_NO_REVERSE}, - {X86::VFNMSUB213SSZr_Intk, X86::VFNMSUB213SSZm_Intk, TB_NO_REVERSE}, - {X86::VFNMSUB213SSZr_Intkz, X86::VFNMSUB213SSZm_Intkz, TB_NO_REVERSE}, + {X86::VFNMSUB213SDZrk_Int, X86::VFNMSUB213SDZmk_Int, TB_NO_REVERSE}, + {X86::VFNMSUB213SDZrkz_Int, X86::VFNMSUB213SDZmkz_Int, TB_NO_REVERSE}, + {X86::VFNMSUB213SHZrk_Int, X86::VFNMSUB213SHZmk_Int, TB_NO_REVERSE}, + {X86::VFNMSUB213SHZrkz_Int, X86::VFNMSUB213SHZmkz_Int, TB_NO_REVERSE}, + {X86::VFNMSUB213SSZrk_Int, X86::VFNMSUB213SSZmk_Int, TB_NO_REVERSE}, + {X86::VFNMSUB213SSZrkz_Int, X86::VFNMSUB213SSZmkz_Int, TB_NO_REVERSE}, {X86::VFNMSUB231NEPBF16Z128rk, X86::VFNMSUB231NEPBF16Z128mk, 0}, {X86::VFNMSUB231NEPBF16Z128rkz, X86::VFNMSUB231NEPBF16Z128mkz, 0}, {X86::VFNMSUB231NEPBF16Z256rk, X86::VFNMSUB231NEPBF16Z256mk, 0}, @@ -6678,12 +6678,12 @@ static const X86FoldTableEntry Table4[] = { {X86::VFNMSUB231PSZ256rkz, X86::VFNMSUB231PSZ256mkz, 0}, {X86::VFNMSUB231PSZrk, X86::VFNMSUB231PSZmk, 0}, {X86::VFNMSUB231PSZrkz, X86::VFNMSUB231PSZmkz, 0}, - {X86::VFNMSUB231SDZr_Intk, X86::VFNMSUB231SDZm_Intk, TB_NO_REVERSE}, - {X86::VFNMSUB231SDZr_Intkz, X86::VFNMSUB231SDZm_Intkz, TB_NO_REVERSE}, - {X86::VFNMSUB231SHZr_Intk, X86::VFNMSUB231SHZm_Intk, TB_NO_REVERSE}, - {X86::VFNMSUB231SHZr_Intkz, 
X86::VFNMSUB231SHZm_Intkz, TB_NO_REVERSE}, - {X86::VFNMSUB231SSZr_Intk, X86::VFNMSUB231SSZm_Intk, TB_NO_REVERSE}, - {X86::VFNMSUB231SSZr_Intkz, X86::VFNMSUB231SSZm_Intkz, TB_NO_REVERSE}, + {X86::VFNMSUB231SDZrk_Int, X86::VFNMSUB231SDZmk_Int, TB_NO_REVERSE}, + {X86::VFNMSUB231SDZrkz_Int, X86::VFNMSUB231SDZmkz_Int, TB_NO_REVERSE}, + {X86::VFNMSUB231SHZrk_Int, X86::VFNMSUB231SHZmk_Int, TB_NO_REVERSE}, + {X86::VFNMSUB231SHZrkz_Int, X86::VFNMSUB231SHZmkz_Int, TB_NO_REVERSE}, + {X86::VFNMSUB231SSZrk_Int, X86::VFNMSUB231SSZmk_Int, TB_NO_REVERSE}, + {X86::VFNMSUB231SSZrkz_Int, X86::VFNMSUB231SSZmkz_Int, TB_NO_REVERSE}, {X86::VGETEXPSDZrk, X86::VGETEXPSDZmk, TB_NO_REVERSE}, {X86::VGETEXPSHZrk, X86::VGETEXPSHZmk, TB_NO_REVERSE}, {X86::VGETEXPSSZrk, X86::VGETEXPSSZmk, TB_NO_REVERSE}, @@ -6732,9 +6732,9 @@ static const X86FoldTableEntry Table4[] = { {X86::VMAXPSZ128rrk, X86::VMAXPSZ128rmk, 0}, {X86::VMAXPSZ256rrk, X86::VMAXPSZ256rmk, 0}, {X86::VMAXPSZrrk, X86::VMAXPSZrmk, 0}, - {X86::VMAXSDZrr_Intk, X86::VMAXSDZrm_Intk, TB_NO_REVERSE}, - {X86::VMAXSHZrr_Intk, X86::VMAXSHZrm_Intk, TB_NO_REVERSE}, - {X86::VMAXSSZrr_Intk, X86::VMAXSSZrm_Intk, TB_NO_REVERSE}, + {X86::VMAXSDZrrk_Int, X86::VMAXSDZrmk_Int, TB_NO_REVERSE}, + {X86::VMAXSHZrrk_Int, X86::VMAXSHZrmk_Int, TB_NO_REVERSE}, + {X86::VMAXSSZrrk_Int, X86::VMAXSSZrmk_Int, TB_NO_REVERSE}, {X86::VMINCPDZ128rrk, X86::VMINCPDZ128rmk, 0}, {X86::VMINCPDZ256rrk, X86::VMINCPDZ256rmk, 0}, {X86::VMINCPDZrrk, X86::VMINCPDZrmk, 0}, @@ -6756,9 +6756,9 @@ static const X86FoldTableEntry Table4[] = { {X86::VMINMAXPSZ128rrik, X86::VMINMAXPSZ128rmik, 0}, {X86::VMINMAXPSZ256rrik, X86::VMINMAXPSZ256rmik, 0}, {X86::VMINMAXPSZrrik, X86::VMINMAXPSZrmik, 0}, - {X86::VMINMAXSDrri_Intk, X86::VMINMAXSDrmi_Intk, TB_NO_REVERSE}, - {X86::VMINMAXSHrri_Intk, X86::VMINMAXSHrmi_Intk, TB_NO_REVERSE}, - {X86::VMINMAXSSrri_Intk, X86::VMINMAXSSrmi_Intk, TB_NO_REVERSE}, + {X86::VMINMAXSDrrik_Int, X86::VMINMAXSDrmik_Int, TB_NO_REVERSE}, + {X86::VMINMAXSHrrik_Int, 
X86::VMINMAXSHrmik_Int, TB_NO_REVERSE}, + {X86::VMINMAXSSrrik_Int, X86::VMINMAXSSrmik_Int, TB_NO_REVERSE}, {X86::VMINPBF16Z128rrk, X86::VMINPBF16Z128rmk, 0}, {X86::VMINPBF16Z256rrk, X86::VMINPBF16Z256rmk, 0}, {X86::VMINPBF16Zrrk, X86::VMINPBF16Zrmk, 0}, @@ -6771,9 +6771,9 @@ static const X86FoldTableEntry Table4[] = { {X86::VMINPSZ128rrk, X86::VMINPSZ128rmk, 0}, {X86::VMINPSZ256rrk, X86::VMINPSZ256rmk, 0}, {X86::VMINPSZrrk, X86::VMINPSZrmk, 0}, - {X86::VMINSDZrr_Intk, X86::VMINSDZrm_Intk, TB_NO_REVERSE}, - {X86::VMINSHZrr_Intk, X86::VMINSHZrm_Intk, TB_NO_REVERSE}, - {X86::VMINSSZrr_Intk, X86::VMINSSZrm_Intk, TB_NO_REVERSE}, + {X86::VMINSDZrrk_Int, X86::VMINSDZrmk_Int, TB_NO_REVERSE}, + {X86::VMINSHZrrk_Int, X86::VMINSHZrmk_Int, TB_NO_REVERSE}, + {X86::VMINSSZrrk_Int, X86::VMINSSZrmk_Int, TB_NO_REVERSE}, {X86::VMPSADBWZ128rrik, X86::VMPSADBWZ128rmik, 0}, {X86::VMPSADBWZ256rrik, X86::VMPSADBWZ256rmik, 0}, {X86::VMPSADBWZrrik, X86::VMPSADBWZrmik, 0}, @@ -6789,9 +6789,9 @@ static const X86FoldTableEntry Table4[] = { {X86::VMULPSZ128rrk, X86::VMULPSZ128rmk, 0}, {X86::VMULPSZ256rrk, X86::VMULPSZ256rmk, 0}, {X86::VMULPSZrrk, X86::VMULPSZrmk, 0}, - {X86::VMULSDZrr_Intk, X86::VMULSDZrm_Intk, TB_NO_REVERSE}, - {X86::VMULSHZrr_Intk, X86::VMULSHZrm_Intk, TB_NO_REVERSE}, - {X86::VMULSSZrr_Intk, X86::VMULSSZrm_Intk, TB_NO_REVERSE}, + {X86::VMULSDZrrk_Int, X86::VMULSDZrmk_Int, TB_NO_REVERSE}, + {X86::VMULSHZrrk_Int, X86::VMULSHZrmk_Int, TB_NO_REVERSE}, + {X86::VMULSSZrrk_Int, X86::VMULSSZrmk_Int, TB_NO_REVERSE}, {X86::VORPDZ128rrk, X86::VORPDZ128rmk, 0}, {X86::VORPDZ256rrk, X86::VORPDZ256rmk, 0}, {X86::VORPDZrrk, X86::VORPDZrmk, 0}, @@ -7347,9 +7347,9 @@ static const X86FoldTableEntry Table4[] = { {X86::VREDUCESDZrrik, X86::VREDUCESDZrmik, TB_NO_REVERSE}, {X86::VREDUCESHZrrik, X86::VREDUCESHZrmik, TB_NO_REVERSE}, {X86::VREDUCESSZrrik, X86::VREDUCESSZrmik, TB_NO_REVERSE}, - {X86::VRNDSCALESDZrri_Intk, X86::VRNDSCALESDZrmi_Intk, TB_NO_REVERSE}, - {X86::VRNDSCALESHZrri_Intk, 
X86::VRNDSCALESHZrmi_Intk, TB_NO_REVERSE}, - {X86::VRNDSCALESSZrri_Intk, X86::VRNDSCALESSZrmi_Intk, TB_NO_REVERSE}, + {X86::VRNDSCALESDZrrik_Int, X86::VRNDSCALESDZrmik_Int, TB_NO_REVERSE}, + {X86::VRNDSCALESHZrrik_Int, X86::VRNDSCALESHZrmik_Int, TB_NO_REVERSE}, + {X86::VRNDSCALESSZrrik_Int, X86::VRNDSCALESSZrmik_Int, TB_NO_REVERSE}, {X86::VRSQRT14SDZrrk, X86::VRSQRT14SDZrmk, TB_NO_REVERSE}, {X86::VRSQRT14SSZrrk, X86::VRSQRT14SSZrmk, TB_NO_REVERSE}, {X86::VRSQRT28SDZrk, X86::VRSQRT28SDZmk, TB_NO_REVERSE}, @@ -7384,9 +7384,9 @@ static const X86FoldTableEntry Table4[] = { {X86::VSHUFPSZ128rrik, X86::VSHUFPSZ128rmik, 0}, {X86::VSHUFPSZ256rrik, X86::VSHUFPSZ256rmik, 0}, {X86::VSHUFPSZrrik, X86::VSHUFPSZrmik, 0}, - {X86::VSQRTSDZr_Intk, X86::VSQRTSDZm_Intk, TB_NO_REVERSE}, - {X86::VSQRTSHZr_Intk, X86::VSQRTSHZm_Intk, TB_NO_REVERSE}, - {X86::VSQRTSSZr_Intk, X86::VSQRTSSZm_Intk, TB_NO_REVERSE}, + {X86::VSQRTSDZrk_Int, X86::VSQRTSDZmk_Int, TB_NO_REVERSE}, + {X86::VSQRTSHZrk_Int, X86::VSQRTSHZmk_Int, TB_NO_REVERSE}, + {X86::VSQRTSSZrk_Int, X86::VSQRTSSZmk_Int, TB_NO_REVERSE}, {X86::VSUBNEPBF16Z128rrk, X86::VSUBNEPBF16Z128rmk, 0}, {X86::VSUBNEPBF16Z256rrk, X86::VSUBNEPBF16Z256rmk, 0}, {X86::VSUBNEPBF16Zrrk, X86::VSUBNEPBF16Zrmk, 0}, @@ -7399,9 +7399,9 @@ static const X86FoldTableEntry Table4[] = { {X86::VSUBPSZ128rrk, X86::VSUBPSZ128rmk, 0}, {X86::VSUBPSZ256rrk, X86::VSUBPSZ256rmk, 0}, {X86::VSUBPSZrrk, X86::VSUBPSZrmk, 0}, - {X86::VSUBSDZrr_Intk, X86::VSUBSDZrm_Intk, TB_NO_REVERSE}, - {X86::VSUBSHZrr_Intk, X86::VSUBSHZrm_Intk, TB_NO_REVERSE}, - {X86::VSUBSSZrr_Intk, X86::VSUBSSZrm_Intk, TB_NO_REVERSE}, + {X86::VSUBSDZrrk_Int, X86::VSUBSDZrmk_Int, TB_NO_REVERSE}, + {X86::VSUBSHZrrk_Int, X86::VSUBSHZrmk_Int, TB_NO_REVERSE}, + {X86::VSUBSSZrrk_Int, X86::VSUBSSZrmk_Int, TB_NO_REVERSE}, {X86::VUNPCKHPDZ128rrk, X86::VUNPCKHPDZ128rmk, 0}, {X86::VUNPCKHPDZ256rrk, X86::VUNPCKHPDZ256rmk, 0}, {X86::VUNPCKHPDZrrk, X86::VUNPCKHPDZrmk, 0}, From 4b22ef7d6b9f739de63b6d07777424cf5dfb9f92 Mon 
Sep 17 00:00:00 2001 From: Nico Weber Date: Thu, 2 Jan 2025 08:24:50 -0500 Subject: [PATCH 276/567] [lld/COFF] Fix comment typo to cycle bots --- lld/COFF/Writer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp index b3dd5f6cf4926..a5b2b6238db1a 100644 --- a/lld/COFF/Writer.cpp +++ b/lld/COFF/Writer.cpp @@ -946,7 +946,7 @@ void Writer::appendECImportTables() { const uint32_t rdata = IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_READ; - // IAT is always placed at the begining of .rdata section and its size + // IAT is always placed at the beginning of .rdata section and its size // is aligned to 4KB. Insert it here, after all merges all done. if (PartialSection *importAddresses = findPartialSection(".idata$5", rdata)) { if (!rdataSec->chunks.empty()) From d622b66a820a0e5e61c131e9ae5b4db35292aa14 Mon Sep 17 00:00:00 2001 From: josel-amd Date: Thu, 2 Jan 2025 14:58:15 +0100 Subject: [PATCH 277/567] Re-introduce Type Conversion on EmitC (#121476) This PR reintroduces https://github.com/llvm/llvm-project/pull/118940 with a fix for the build issues on cd9caf3aeed55280537052227f08bb1b41154efd --- .../mlir/Conversion/SCFToEmitC/SCFToEmitC.h | 4 +- mlir/lib/Conversion/SCFToEmitC/CMakeLists.txt | 1 + mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp | 206 ++++++++++++------ mlir/test/Conversion/SCFToEmitC/for.mlir | 89 +++++++- mlir/test/Conversion/SCFToEmitC/switch.mlir | 9 +- 5 files changed, 229 insertions(+), 80 deletions(-) diff --git a/mlir/include/mlir/Conversion/SCFToEmitC/SCFToEmitC.h b/mlir/include/mlir/Conversion/SCFToEmitC/SCFToEmitC.h index 22df7f1c5dcf2..acc39e6acf726 100644 --- a/mlir/include/mlir/Conversion/SCFToEmitC/SCFToEmitC.h +++ b/mlir/include/mlir/Conversion/SCFToEmitC/SCFToEmitC.h @@ -9,6 +9,7 @@ #ifndef MLIR_CONVERSION_SCFTOEMITC_SCFTOEMITC_H #define MLIR_CONVERSION_SCFTOEMITC_SCFTOEMITC_H +#include "mlir/Transforms/DialectConversion.h" #include namespace mlir { @@ -19,7 +20,8 @@ class 
RewritePatternSet; #include "mlir/Conversion/Passes.h.inc" /// Collect a set of patterns to convert SCF operations to the EmitC dialect. -void populateSCFToEmitCConversionPatterns(RewritePatternSet &patterns); +void populateSCFToEmitCConversionPatterns(RewritePatternSet &patterns, + TypeConverter &typeConverter); } // namespace mlir #endif // MLIR_CONVERSION_SCFTOEMITC_SCFTOEMITC_H diff --git a/mlir/lib/Conversion/SCFToEmitC/CMakeLists.txt b/mlir/lib/Conversion/SCFToEmitC/CMakeLists.txt index 79119d374f7a5..af5493be8a4b3 100644 --- a/mlir/lib/Conversion/SCFToEmitC/CMakeLists.txt +++ b/mlir/lib/Conversion/SCFToEmitC/CMakeLists.txt @@ -13,6 +13,7 @@ add_mlir_conversion_library(MLIRSCFToEmitC LINK_LIBS PUBLIC MLIRArithDialect MLIREmitCDialect + MLIREmitCTransforms MLIRSCFDialect MLIRTransforms ) diff --git a/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp b/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp index 67a43c43d608b..92523ca4f12b2 100644 --- a/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp +++ b/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp @@ -14,6 +14,7 @@ #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/EmitC/IR/EmitC.h" +#include "mlir/Dialect/EmitC/Transforms/TypeConversions.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinOps.h" @@ -39,21 +40,22 @@ struct SCFToEmitCPass : public impl::SCFToEmitCBase { // Lower scf::for to emitc::for, implementing result values using // emitc::variable's updated within the loop body. -struct ForLowering : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; +struct ForLowering : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite(ForOp forOp, - PatternRewriter &rewriter) const override; + LogicalResult + matchAndRewrite(ForOp forOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override; }; // Create an uninitialized emitc::variable op for each result of the given op. 
template -static SmallVector createVariablesForResults(T op, - PatternRewriter &rewriter) { - SmallVector resultVariables; - +static LogicalResult +createVariablesForResults(T op, const TypeConverter *typeConverter, + ConversionPatternRewriter &rewriter, + SmallVector &resultVariables) { if (!op.getNumResults()) - return resultVariables; + return success(); Location loc = op->getLoc(); MLIRContext *context = op.getContext(); @@ -62,7 +64,9 @@ static SmallVector createVariablesForResults(T op, rewriter.setInsertionPoint(op); for (OpResult result : op.getResults()) { - Type resultType = result.getType(); + Type resultType = typeConverter->convertType(result.getType()); + if (!resultType) + return rewriter.notifyMatchFailure(op, "result type conversion failed"); Type varType = emitc::LValueType::get(resultType); emitc::OpaqueAttr noInit = emitc::OpaqueAttr::get(context, ""); emitc::VariableOp var = @@ -70,13 +74,13 @@ static SmallVector createVariablesForResults(T op, resultVariables.push_back(var); } - return resultVariables; + return success(); } // Create a series of assign ops assigning given values to given variables at // the current insertion point of given rewriter. 
-static void assignValues(ValueRange values, SmallVector &variables, - PatternRewriter &rewriter, Location loc) { +static void assignValues(ValueRange values, ValueRange variables, + ConversionPatternRewriter &rewriter, Location loc) { for (auto [value, var] : llvm::zip(values, variables)) rewriter.create(loc, var, value); } @@ -89,18 +93,25 @@ SmallVector loadValues(const SmallVector &variables, }); } -static void lowerYield(SmallVector &resultVariables, - PatternRewriter &rewriter, scf::YieldOp yield) { +static LogicalResult lowerYield(Operation *op, ValueRange resultVariables, + ConversionPatternRewriter &rewriter, + scf::YieldOp yield) { Location loc = yield.getLoc(); - ValueRange operands = yield.getOperands(); OpBuilder::InsertionGuard guard(rewriter); rewriter.setInsertionPoint(yield); - assignValues(operands, resultVariables, rewriter, loc); + SmallVector yieldOperands; + if (failed(rewriter.getRemappedValues(yield.getOperands(), yieldOperands))) { + return rewriter.notifyMatchFailure(op, "failed to lower yield operands"); + } + + assignValues(yieldOperands, resultVariables, rewriter, loc); rewriter.create(loc); rewriter.eraseOp(yield); + + return success(); } // Lower the contents of an scf::if/scf::index_switch regions to an @@ -108,27 +119,32 @@ static void lowerYield(SmallVector &resultVariables, // moved into the respective lowered region, but the scf::yield is replaced not // only with an emitc::yield, but also with a sequence of emitc::assign ops that // set the yielded values into the result variables. 
-static void lowerRegion(SmallVector &resultVariables, - PatternRewriter &rewriter, Region ®ion, - Region &loweredRegion) { +static LogicalResult lowerRegion(Operation *op, ValueRange resultVariables, + ConversionPatternRewriter &rewriter, + Region ®ion, Region &loweredRegion) { rewriter.inlineRegionBefore(region, loweredRegion, loweredRegion.end()); Operation *terminator = loweredRegion.back().getTerminator(); - lowerYield(resultVariables, rewriter, cast(terminator)); + return lowerYield(op, resultVariables, rewriter, + cast(terminator)); } -LogicalResult ForLowering::matchAndRewrite(ForOp forOp, - PatternRewriter &rewriter) const { +LogicalResult +ForLowering::matchAndRewrite(ForOp forOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const { Location loc = forOp.getLoc(); // Create an emitc::variable op for each result. These variables will be // assigned to by emitc::assign ops within the loop body. - SmallVector resultVariables = - createVariablesForResults(forOp, rewriter); + SmallVector resultVariables; + if (failed(createVariablesForResults(forOp, getTypeConverter(), rewriter, + resultVariables))) + return rewriter.notifyMatchFailure(forOp, + "create variables for results failed"); - assignValues(forOp.getInits(), resultVariables, rewriter, loc); + assignValues(adaptor.getInitArgs(), resultVariables, rewriter, loc); emitc::ForOp loweredFor = rewriter.create( - loc, forOp.getLowerBound(), forOp.getUpperBound(), forOp.getStep()); + loc, adaptor.getLowerBound(), adaptor.getUpperBound(), adaptor.getStep()); Block *loweredBody = loweredFor.getBody(); @@ -143,13 +159,27 @@ LogicalResult ForLowering::matchAndRewrite(ForOp forOp, rewriter.restoreInsertionPoint(ip); + // Convert the original region types into the new types by adding unrealized + // casts in the beginning of the loop. This performs the conversion in place. 
+ if (failed(rewriter.convertRegionTypes(&forOp.getRegion(), + *getTypeConverter(), nullptr))) { + return rewriter.notifyMatchFailure(forOp, "region types conversion failed"); + } + + // Register the replacements for the block arguments and inline the body of + // the scf.for loop into the body of the emitc::for loop. + Block *scfBody = &(forOp.getRegion().front()); SmallVector replacingValues; replacingValues.push_back(loweredFor.getInductionVar()); replacingValues.append(iterArgsValues.begin(), iterArgsValues.end()); + rewriter.mergeBlocks(scfBody, loweredBody, replacingValues); - rewriter.mergeBlocks(forOp.getBody(), loweredBody, replacingValues); - lowerYield(resultVariables, rewriter, - cast(loweredBody->getTerminator())); + auto result = lowerYield(forOp, resultVariables, rewriter, + cast(loweredBody->getTerminator())); + + if (failed(result)) { + return result; + } // Load variables into SSA values after the for loop. SmallVector resultValues = loadValues(resultVariables, rewriter, loc); @@ -160,38 +190,66 @@ LogicalResult ForLowering::matchAndRewrite(ForOp forOp, // Lower scf::if to emitc::if, implementing result values as emitc::variable's // updated within the then and else regions. -struct IfLowering : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; +struct IfLowering : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite(IfOp ifOp, - PatternRewriter &rewriter) const override; + LogicalResult + matchAndRewrite(IfOp ifOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override; }; } // namespace -LogicalResult IfLowering::matchAndRewrite(IfOp ifOp, - PatternRewriter &rewriter) const { +LogicalResult +IfLowering::matchAndRewrite(IfOp ifOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const { Location loc = ifOp.getLoc(); // Create an emitc::variable op for each result. 
These variables will be // assigned to by emitc::assign ops within the then & else regions. - SmallVector resultVariables = - createVariablesForResults(ifOp, rewriter); - - Region &thenRegion = ifOp.getThenRegion(); - Region &elseRegion = ifOp.getElseRegion(); + SmallVector resultVariables; + if (failed(createVariablesForResults(ifOp, getTypeConverter(), rewriter, + resultVariables))) + return rewriter.notifyMatchFailure(ifOp, + "create variables for results failed"); + + // Utility function to lower the contents of an scf::if region to an emitc::if + // region. The contents of the scf::if regions is moved into the respective + // emitc::if regions, but the scf::yield is replaced not only with an + // emitc::yield, but also with a sequence of emitc::assign ops that set the + // yielded values into the result variables. + auto lowerRegion = [&resultVariables, &rewriter, + &ifOp](Region ®ion, Region &loweredRegion) { + rewriter.inlineRegionBefore(region, loweredRegion, loweredRegion.end()); + Operation *terminator = loweredRegion.back().getTerminator(); + auto result = lowerYield(ifOp, resultVariables, rewriter, + cast(terminator)); + if (failed(result)) { + return result; + } + return success(); + }; + + Region &thenRegion = adaptor.getThenRegion(); + Region &elseRegion = adaptor.getElseRegion(); bool hasElseBlock = !elseRegion.empty(); auto loweredIf = - rewriter.create(loc, ifOp.getCondition(), false, false); + rewriter.create(loc, adaptor.getCondition(), false, false); Region &loweredThenRegion = loweredIf.getThenRegion(); - lowerRegion(resultVariables, rewriter, thenRegion, loweredThenRegion); + auto result = lowerRegion(thenRegion, loweredThenRegion); + if (failed(result)) { + return result; + } if (hasElseBlock) { Region &loweredElseRegion = loweredIf.getElseRegion(); - lowerRegion(resultVariables, rewriter, elseRegion, loweredElseRegion); + auto result = lowerRegion(elseRegion, loweredElseRegion); + if (failed(result)) { + return result; + } } 
rewriter.setInsertionPointAfter(ifOp); @@ -203,37 +261,46 @@ LogicalResult IfLowering::matchAndRewrite(IfOp ifOp, // Lower scf::index_switch to emitc::switch, implementing result values as // emitc::variable's updated within the case and default regions. -struct IndexSwitchOpLowering : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; +struct IndexSwitchOpLowering : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite(IndexSwitchOp indexSwitchOp, - PatternRewriter &rewriter) const override; + LogicalResult + matchAndRewrite(IndexSwitchOp indexSwitchOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override; }; -LogicalResult -IndexSwitchOpLowering::matchAndRewrite(IndexSwitchOp indexSwitchOp, - PatternRewriter &rewriter) const { +LogicalResult IndexSwitchOpLowering::matchAndRewrite( + IndexSwitchOp indexSwitchOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const { Location loc = indexSwitchOp.getLoc(); // Create an emitc::variable op for each result. These variables will be // assigned to by emitc::assign ops within the case and default regions. - SmallVector resultVariables = - createVariablesForResults(indexSwitchOp, rewriter); + SmallVector resultVariables; + if (failed(createVariablesForResults(indexSwitchOp, getTypeConverter(), + rewriter, resultVariables))) { + return rewriter.notifyMatchFailure(indexSwitchOp, + "create variables for results failed"); + } auto loweredSwitch = rewriter.create( - loc, indexSwitchOp.getArg(), indexSwitchOp.getCases(), - indexSwitchOp.getNumCases()); + loc, adaptor.getArg(), adaptor.getCases(), indexSwitchOp.getNumCases()); // Lowering all case regions. 
- for (auto pair : llvm::zip(indexSwitchOp.getCaseRegions(), - loweredSwitch.getCaseRegions())) { - lowerRegion(resultVariables, rewriter, std::get<0>(pair), - std::get<1>(pair)); + for (auto pair : + llvm::zip(adaptor.getCaseRegions(), loweredSwitch.getCaseRegions())) { + if (failed(lowerRegion(indexSwitchOp, resultVariables, rewriter, + *std::get<0>(pair), std::get<1>(pair)))) { + return failure(); + } } // Lowering default region. - lowerRegion(resultVariables, rewriter, indexSwitchOp.getDefaultRegion(), - loweredSwitch.getDefaultRegion()); + if (failed(lowerRegion(indexSwitchOp, resultVariables, rewriter, + adaptor.getDefaultRegion(), + loweredSwitch.getDefaultRegion()))) { + return failure(); + } rewriter.setInsertionPointAfter(indexSwitchOp); SmallVector results = loadValues(resultVariables, rewriter, loc); @@ -242,15 +309,22 @@ IndexSwitchOpLowering::matchAndRewrite(IndexSwitchOp indexSwitchOp, return success(); } -void mlir::populateSCFToEmitCConversionPatterns(RewritePatternSet &patterns) { - patterns.add(patterns.getContext()); - patterns.add(patterns.getContext()); - patterns.add(patterns.getContext()); +void mlir::populateSCFToEmitCConversionPatterns(RewritePatternSet &patterns, + TypeConverter &typeConverter) { + patterns.add(typeConverter, patterns.getContext()); + patterns.add(typeConverter, patterns.getContext()); + patterns.add(typeConverter, patterns.getContext()); } void SCFToEmitCPass::runOnOperation() { RewritePatternSet patterns(&getContext()); - populateSCFToEmitCConversionPatterns(patterns); + TypeConverter typeConverter; + // Fallback converter + // See note https://mlir.llvm.org/docs/DialectConversion/#type-converter + // Type converters are called most to least recently inserted + typeConverter.addConversion([](Type t) { return t; }); + populateEmitCSizeTTypeConversions(typeConverter); + populateSCFToEmitCConversionPatterns(patterns, typeConverter); // Configure conversion to lower out SCF operations. 
ConversionTarget target(getContext()); diff --git a/mlir/test/Conversion/SCFToEmitC/for.mlir b/mlir/test/Conversion/SCFToEmitC/for.mlir index 83592187a9b68..7f41e636936b8 100644 --- a/mlir/test/Conversion/SCFToEmitC/for.mlir +++ b/mlir/test/Conversion/SCFToEmitC/for.mlir @@ -7,8 +7,11 @@ func.func @simple_std_for_loop(%arg0 : index, %arg1 : index, %arg2 : index) { return } // CHECK-LABEL: func.func @simple_std_for_loop( -// CHECK-SAME: %[[VAL_0:.*]]: index, %[[VAL_1:.*]]: index, %[[VAL_2:.*]]: index) { -// CHECK-NEXT: emitc.for %[[VAL_3:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] { +// CHECK-SAME: %[[ARG_0:.*]]: index, %[[ARG_1:.*]]: index, %[[ARG_2:.*]]: index) { +// CHECK-NEXT: %[[VAL_2:.*]] = builtin.unrealized_conversion_cast %[[ARG_2]] : index to !emitc.size_t +// CHECK-NEXT: %[[VAL_1:.*]] = builtin.unrealized_conversion_cast %[[ARG_1]] : index to !emitc.size_t +// CHECK-NEXT: %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t +// CHECK-NEXT: emitc.for %[[VAL_3:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] : !emitc.size_t { // CHECK-NEXT: %[[VAL_4:.*]] = arith.constant 1 : index // CHECK-NEXT: } // CHECK-NEXT: return @@ -24,10 +27,13 @@ func.func @simple_std_2_for_loops(%arg0 : index, %arg1 : index, %arg2 : index) { return } // CHECK-LABEL: func.func @simple_std_2_for_loops( -// CHECK-SAME: %[[VAL_0:.*]]: index, %[[VAL_1:.*]]: index, %[[VAL_2:.*]]: index) { -// CHECK-NEXT: emitc.for %[[VAL_3:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] { +// CHECK-SAME: %[[ARG_0:.*]]: index, %[[ARG_1:.*]]: index, %[[ARG_2:.*]]: index) { +// CHECK-NEXT: %[[VAL_2:.*]] = builtin.unrealized_conversion_cast %[[ARG_2]] : index to !emitc.size_t +// CHECK-NEXT: %[[VAL_1:.*]] = builtin.unrealized_conversion_cast %[[ARG_1]] : index to !emitc.size_t +// CHECK-NEXT: %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t +// CHECK-NEXT: emitc.for %[[VAL_3:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] : 
!emitc.size_t { // CHECK-NEXT: %[[VAL_4:.*]] = arith.constant 1 : index -// CHECK-NEXT: emitc.for %[[VAL_5:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] { +// CHECK-NEXT: emitc.for %[[VAL_5:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] : !emitc.size_t { // CHECK-NEXT: %[[VAL_6:.*]] = arith.constant 1 : index // CHECK-NEXT: } // CHECK-NEXT: } @@ -44,14 +50,17 @@ func.func @for_yield(%arg0 : index, %arg1 : index, %arg2 : index) -> (f32, f32) return %result#0, %result#1 : f32, f32 } // CHECK-LABEL: func.func @for_yield( -// CHECK-SAME: %[[VAL_0:.*]]: index, %[[VAL_1:.*]]: index, %[[VAL_2:.*]]: index) -> (f32, f32) { +// CHECK-SAME: %[[ARG_0:.*]]: index, %[[ARG_1:.*]]: index, %[[ARG_2:.*]]: index) -> (f32, f32) { +// CHECK-NEXT: %[[VAL_2:.*]] = builtin.unrealized_conversion_cast %[[ARG_2]] : index to !emitc.size_t +// CHECK-NEXT: %[[VAL_1:.*]] = builtin.unrealized_conversion_cast %[[ARG_1]] : index to !emitc.size_t +// CHECK-NEXT: %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t // CHECK-NEXT: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 // CHECK-NEXT: %[[VAL_4:.*]] = arith.constant 1.000000e+00 : f32 // CHECK-NEXT: %[[VAL_5:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue // CHECK-NEXT: %[[VAL_6:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue // CHECK-NEXT: emitc.assign %[[VAL_3]] : f32 to %[[VAL_5]] : // CHECK-NEXT: emitc.assign %[[VAL_4]] : f32 to %[[VAL_6]] : -// CHECK-NEXT: emitc.for %[[VAL_7:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] { +// CHECK-NEXT: emitc.for %[[VAL_7:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] : !emitc.size_t { // CHECK-NEXT: %[[VAL_8:.*]] = emitc.load %[[VAL_5]] : // CHECK-NEXT: %[[VAL_9:.*]] = emitc.load %[[VAL_6]] : // CHECK-NEXT: %[[VAL_10:.*]] = arith.addf %[[VAL_8]], %[[VAL_9]] : f32 @@ -75,15 +84,18 @@ func.func @nested_for_yield(%arg0 : index, %arg1 : index, %arg2 : index) -> f32 return %r : f32 } // CHECK-LABEL: func.func 
@nested_for_yield( -// CHECK-SAME: %[[VAL_0:.*]]: index, %[[VAL_1:.*]]: index, %[[VAL_2:.*]]: index) -> f32 { +// CHECK-SAME: %[[ARG_0:.*]]: index, %[[ARG_1:.*]]: index, %[[ARG_2:.*]]: index) -> f32 { +// CHECK-NEXT: %[[VAL_2:.*]] = builtin.unrealized_conversion_cast %[[ARG_2]] : index to !emitc.size_t +// CHECK-NEXT: %[[VAL_1:.*]] = builtin.unrealized_conversion_cast %[[ARG_1]] : index to !emitc.size_t +// CHECK-NEXT: %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t // CHECK-NEXT: %[[VAL_3:.*]] = arith.constant 1.000000e+00 : f32 // CHECK-NEXT: %[[VAL_4:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue // CHECK-NEXT: emitc.assign %[[VAL_3]] : f32 to %[[VAL_4]] : -// CHECK-NEXT: emitc.for %[[VAL_5:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] { +// CHECK-NEXT: emitc.for %[[VAL_5:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] : !emitc.size_t { // CHECK-NEXT: %[[VAL_6:.*]] = emitc.load %[[VAL_4]] : // CHECK-NEXT: %[[VAL_7:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue // CHECK-NEXT: emitc.assign %[[VAL_6]] : f32 to %[[VAL_7]] : -// CHECK-NEXT: emitc.for %[[VAL_8:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] { +// CHECK-NEXT: emitc.for %[[VAL_8:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] : !emitc.size_t { // CHECK-NEXT: %[[VAL_9:.*]] = emitc.load %[[VAL_7]] : // CHECK-NEXT: %[[VAL_10:.*]] = arith.addf %[[VAL_9]], %[[VAL_9]] : f32 // CHECK-NEXT: emitc.assign %[[VAL_10]] : f32 to %[[VAL_7]] : @@ -94,3 +106,60 @@ func.func @nested_for_yield(%arg0 : index, %arg1 : index, %arg2 : index) -> f32 // CHECK-NEXT: %[[VAL_12:.*]] = emitc.load %[[VAL_4]] : // CHECK-NEXT: return %[[VAL_12]] : f32 // CHECK-NEXT: } + +func.func @for_yield_index(%arg0 : index, %arg1 : index, %arg2 : index) -> index { + %zero = arith.constant 0 : index + %r = scf.for %i0 = %arg0 to %arg1 step %arg2 iter_args(%acc = %zero) -> index { + scf.yield %acc : index + } + return %r : index +} + +// 
CHECK-LABEL: func.func @for_yield_index( +// CHECK-SAME: %[[ARG_0:.*]]: index, %[[ARG_1:.*]]: index, %[[ARG_2:.*]]: index) -> index { +// CHECK: %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_2]] : index to !emitc.size_t +// CHECK: %[[VAL_1:.*]] = builtin.unrealized_conversion_cast %[[ARG_1]] : index to !emitc.size_t +// CHECK: %[[VAL_2:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_3:.*]] = builtin.unrealized_conversion_cast %[[C0]] : index to !emitc.size_t +// CHECK: %[[VAL_4:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue +// CHECK: emitc.assign %[[VAL_3]] : !emitc.size_t to %[[VAL_4]] : +// CHECK: emitc.for %[[VAL_5:.*]] = %[[VAL_2]] to %[[VAL_1]] step %[[VAL_0]] : !emitc.size_t { +// CHECK: %[[V:.*]] = emitc.load %[[VAL_4]] : +// CHECK: emitc.assign %[[V]] : !emitc.size_t to %[[VAL_4]] : +// CHECK: } +// CHECK: %[[V2:.*]] = emitc.load %[[VAL_4]] : +// CHECK: %[[VAL_8:.*]] = builtin.unrealized_conversion_cast %[[V2]] : !emitc.size_t to index +// CHECK: return %[[VAL_8]] : index +// CHECK: } + + +func.func @for_yield_update_loop_carried_var(%arg0 : index, %arg1 : index, %arg2 : index) -> index { + %zero = arith.constant 0 : index + %r = scf.for %i0 = %arg0 to %arg1 step %arg2 iter_args(%acc = %zero) -> index { + %sn = arith.addi %acc, %acc : index + scf.yield %sn: index + } + return %r : index + } + +// CHECK-LABEL: func.func @for_yield_update_loop_carried_var( +// CHECK-SAME: %[[ARG_0:.*]]: index, %[[ARG_1:.*]]: index, %[[ARG_2:.*]]: index) -> index { +// CHECK: %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_2]] : index to !emitc.size_t +// CHECK: %[[VAL_1:.*]] = builtin.unrealized_conversion_cast %[[ARG_1]] : index to !emitc.size_t +// CHECK: %[[VAL_2:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_3:.*]] = 
builtin.unrealized_conversion_cast %[[C0]] : index to !emitc.size_t +// CHECK: %[[VAL_4:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue +// CHECK: emitc.assign %[[VAL_3]] : !emitc.size_t to %[[VAL_4]] : +// CHECK: emitc.for %[[ARG_3:.*]] = %[[VAL_2]] to %[[VAL_1]] step %[[VAL_0]] : !emitc.size_t { +// CHECK: %[[V:.*]] = emitc.load %[[VAL_4]] : +// CHECK: %[[VAL_5:.*]] = builtin.unrealized_conversion_cast %[[V]] : !emitc.size_t to index +// CHECK: %[[VAL_6:.*]] = arith.addi %[[VAL_5]], %[[VAL_5]] : index +// CHECK: %[[VAL_8:.*]] = builtin.unrealized_conversion_cast %[[VAL_6]] : index to !emitc.size_t +// CHECK: emitc.assign %[[VAL_8]] : !emitc.size_t to %[[VAL_4]] : +// CHECK: } +// CHECK: %[[V2:.*]] = emitc.load %[[VAL_4]] : +// CHECK: %[[VAL_9:.*]] = builtin.unrealized_conversion_cast %[[V2]] : !emitc.size_t to index +// CHECK: return %[[VAL_9]] : index +// CHECK: } diff --git a/mlir/test/Conversion/SCFToEmitC/switch.mlir b/mlir/test/Conversion/SCFToEmitC/switch.mlir index 86d96ed21f1b5..61015b0ae483b 100644 --- a/mlir/test/Conversion/SCFToEmitC/switch.mlir +++ b/mlir/test/Conversion/SCFToEmitC/switch.mlir @@ -1,7 +1,8 @@ // RUN: mlir-opt -allow-unregistered-dialect -convert-scf-to-emitc %s | FileCheck %s // CHECK-LABEL: func.func @switch_no_result( -// CHECK-SAME: %[[VAL_0:.*]]: index) { +// CHECK-SAME: %[[ARG_0:.*]]: index) { +// CHECK: %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t // CHECK: emitc.switch %[[VAL_0]] // CHECK: case 2 { // CHECK: %[[VAL_1:.*]] = arith.constant 10 : i32 @@ -33,7 +34,8 @@ func.func @switch_no_result(%arg0 : index) { } // CHECK-LABEL: func.func @switch_one_result( -// CHECK-SAME: %[[VAL_0:.*]]: index) { +// CHECK-SAME: %[[ARG_0:.*]]: index) { +// CHECK: %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t // CHECK: %[[VAL_1:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue // CHECK: emitc.switch 
%[[VAL_0]] // CHECK: case 2 { @@ -70,7 +72,8 @@ func.func @switch_one_result(%arg0 : index) { } // CHECK-LABEL: func.func @switch_two_results( -// CHECK-SAME: %[[VAL_0:.*]]: index) -> (i32, f32) { +// CHECK-SAME: %[[ARG_0:.*]]: index) -> (i32, f32) { +// CHECK: %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t // CHECK: %[[VAL_1:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue // CHECK: %[[VAL_2:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue // CHECK: emitc.switch %[[VAL_0]] From a4deb809be8f5ec3adec3626e9d700f6168d0e9f Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Thu, 2 Jan 2025 09:13:40 -0500 Subject: [PATCH 278/567] Revert "Reapply "[Driver][OHOS] Fix lld link issue for OHOS (#118192)" (#120159)" This reverts commit bd154e823eba4d62366dfa3d56ae0b99ab171b96. Test fails with -DLLVM_ENABLE_PER_TARGET_RUNTIME_DIR=OFF, see https://github.com/llvm/llvm-project/pull/120159#issuecomment-2567836727 --- clang/lib/Driver/ToolChains/OHOS.cpp | 60 ++++++++++++++++------------ 1 file changed, 34 insertions(+), 26 deletions(-) diff --git a/clang/lib/Driver/ToolChains/OHOS.cpp b/clang/lib/Driver/ToolChains/OHOS.cpp index c9a532771b99e..6e1a09ae908b2 100644 --- a/clang/lib/Driver/ToolChains/OHOS.cpp +++ b/clang/lib/Driver/ToolChains/OHOS.cpp @@ -19,8 +19,8 @@ #include "llvm/ProfileData/InstrProf.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" -#include "llvm/Support/ScopedPrinter.h" #include "llvm/Support/VirtualFileSystem.h" +#include "llvm/Support/ScopedPrinter.h" using namespace clang::driver; using namespace clang::driver::toolchains; @@ -58,9 +58,11 @@ static bool findOHOSMuslMultilibs(const Driver &D, return false; } -static bool findOHOSMultilibs(const Driver &D, const ToolChain &TC, - const llvm::Triple &TargetTriple, StringRef Path, - const ArgList &Args, DetectedMultilibs &Result) { +static bool findOHOSMultilibs(const Driver &D, + const ToolChain 
&TC, + const llvm::Triple &TargetTriple, + StringRef Path, const ArgList &Args, + DetectedMultilibs &Result) { Multilib::flags_list Flags; bool IsA7 = false; if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) @@ -170,7 +172,8 @@ OHOS::OHOS(const Driver &D, const llvm::Triple &Triple, const ArgList &Args) Paths); } -ToolChain::RuntimeLibType OHOS::GetRuntimeLibType(const ArgList &Args) const { +ToolChain::RuntimeLibType OHOS::GetRuntimeLibType( + const ArgList &Args) const { if (Arg *A = Args.getLastArg(clang::driver::options::OPT_rtlib_EQ)) { StringRef Value = A->getValue(); if (Value != "compiler-rt") @@ -181,19 +184,20 @@ ToolChain::RuntimeLibType OHOS::GetRuntimeLibType(const ArgList &Args) const { return ToolChain::RLT_CompilerRT; } -ToolChain::CXXStdlibType OHOS::GetCXXStdlibType(const ArgList &Args) const { +ToolChain::CXXStdlibType +OHOS::GetCXXStdlibType(const ArgList &Args) const { if (Arg *A = Args.getLastArg(options::OPT_stdlib_EQ)) { StringRef Value = A->getValue(); if (Value != "libc++") getDriver().Diag(diag::err_drv_invalid_stdlib_name) - << A->getAsString(Args); + << A->getAsString(Args); } return ToolChain::CST_Libcxx; } void OHOS::AddClangSystemIncludeArgs(const ArgList &DriverArgs, - ArgStringList &CC1Args) const { + ArgStringList &CC1Args) const { const Driver &D = getDriver(); const llvm::Triple &Triple = getTriple(); std::string SysRoot = computeSysRoot(); @@ -254,7 +258,7 @@ void OHOS::AddClangCXXStdlibIncludeArgs(const ArgList &DriverArgs, } void OHOS::AddCXXStdlibLibArgs(const ArgList &Args, - ArgStringList &CmdArgs) const { + ArgStringList &CmdArgs) const { switch (GetCXXStdlibType(Args)) { case ToolChain::CST_Libcxx: CmdArgs.push_back("-lc++"); @@ -287,8 +291,7 @@ ToolChain::path_list OHOS::getRuntimePaths() const { // First try the triple passed to driver as --target=. 
P.assign(D.ResourceDir); - llvm::sys::path::append(P, "lib", D.getTargetTriple(), - SelectedMultilib.gccSuffix()); + llvm::sys::path::append(P, "lib", D.getTargetTriple(), SelectedMultilib.gccSuffix()); Paths.push_back(P.c_str()); // Second try the normalized triple. @@ -337,20 +340,26 @@ std::string OHOS::getDynamicLinker(const ArgList &Args) const { std::string OHOS::getCompilerRT(const ArgList &Args, StringRef Component, FileType Type) const { - std::string CRTBasename = - buildCompilerRTBasename(Args, Component, Type, /*AddArch=*/false); - SmallString<128> Path(getDriver().ResourceDir); llvm::sys::path::append(Path, "lib", getMultiarchTriple(getTriple()), - SelectedMultilib.gccSuffix(), CRTBasename); - if (getVFS().exists(Path)) - return std::string(Path); - - std::string NewPath = ToolChain::getCompilerRT(Args, Component, Type); - if (getVFS().exists(NewPath)) - return NewPath; - - return std::string(Path); + SelectedMultilib.gccSuffix()); + const char *Prefix = + Type == ToolChain::FT_Object ? "" : "lib"; + const char *Suffix; + switch (Type) { + case ToolChain::FT_Object: + Suffix = ".o"; + break; + case ToolChain::FT_Static: + Suffix = ".a"; + break; + case ToolChain::FT_Shared: + Suffix = ".so"; + break; + } + llvm::sys::path::append( + Path, Prefix + Twine("clang_rt.") + Component + Suffix); + return static_cast(Path.str()); } void OHOS::addExtraOpts(llvm::opt::ArgStringList &CmdArgs) const { @@ -387,7 +396,7 @@ SanitizerMask OHOS::getSupportedSanitizers() const { // TODO: Make a base class for Linux and OHOS and move this there. void OHOS::addProfileRTLibs(const llvm::opt::ArgList &Args, - llvm::opt::ArgStringList &CmdArgs) const { + llvm::opt::ArgStringList &CmdArgs) const { // Add linker option -u__llvm_profile_runtime to cause runtime // initialization module to be linked in. 
if (needsProfileRT(Args)) @@ -404,8 +413,7 @@ ToolChain::path_list OHOS::getArchSpecificLibPaths() const { return Paths; } -ToolChain::UnwindLibType -OHOS::GetUnwindLibType(const llvm::opt::ArgList &Args) const { +ToolChain::UnwindLibType OHOS::GetUnwindLibType(const llvm::opt::ArgList &Args) const { if (Args.getLastArg(options::OPT_unwindlib_EQ)) return Generic_ELF::GetUnwindLibType(Args); return GetDefaultUnwindLibType(); From 6d604ba36326de849ccf00f30351ce21fde19471 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Thu, 2 Jan 2025 14:14:43 +0000 Subject: [PATCH 279/567] [gn build] Port e45e091b9089 --- .../gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn | 1 + .../clang-tools-extra/clang-tidy/cppcoreguidelines/BUILD.gn | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn index 61e4f8da3c04d..670f24c242a89 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn @@ -54,6 +54,7 @@ static_library("bugprone") { "MultiLevelImplicitPointerConversionCheck.cpp", "MultipleNewInOneExpressionCheck.cpp", "MultipleStatementMacroCheck.cpp", + "NarrowingConversionsCheck.cpp", "NoEscapeCheck.cpp", "NonZeroEnumToBoolConversionCheck.cpp", "NondeterministicPointerIterationOrderCheck.cpp", diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cppcoreguidelines/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cppcoreguidelines/BUILD.gn index be444d47aa12a..a06b2f11b452a 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cppcoreguidelines/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cppcoreguidelines/BUILD.gn @@ -28,7 +28,6 @@ static_library("cppcoreguidelines") { "MacroUsageCheck.cpp", "MisleadingCaptureDefaultByValueCheck.cpp", 
"MissingStdForwardCheck.cpp", - "NarrowingConversionsCheck.cpp", "NoMallocCheck.cpp", "NoSuspendWithLockCheck.cpp", "OwningMemoryCheck.cpp", From 073e65a8e5f92ca9c63c3fcd1c0ce2a36913f9a6 Mon Sep 17 00:00:00 2001 From: John Brawn Date: Thu, 2 Jan 2025 14:31:36 +0000 Subject: [PATCH 280/567] [LoopVectorize] Make needsExtract notice scalarized instructions (#119720) LoopVectorizationCostModel::needsExtract should recognise instructions that have been widened by scalarizing as scalar instructions, and thus not needing an extract when used by later scalarized instructions. This fixes an incorrect cost calculation in computePredInstDiscount, where we are adding a scalarization overhead cost when we shouldn't, though I haven't come up with a test case where it makes a difference. It will make a difference when the cost model switches to using the cost kind TCK_CodeSize for optsize, as not doing this causes the test LoopVectorize/X86/small-size.ll to get worse. --- .../Transforms/Vectorize/LoopVectorize.cpp | 3 +- .../LoopVectorize/AArch64/interleaved_cost.ll | 4 +- .../LoopVectorize/ARM/mve-interleaved-cost.ll | 320 +++++++++--------- 3 files changed, 164 insertions(+), 163 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 52186882b4f20..f2f8a85b7cc23 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1731,7 +1731,8 @@ class LoopVectorizationCostModel { bool needsExtract(Value *V, ElementCount VF) const { Instruction *I = dyn_cast(V); if (VF.isScalar() || !I || !TheLoop->contains(I) || - TheLoop->isLoopInvariant(I)) + TheLoop->isLoopInvariant(I) || + getWideningDecision(I, VF) == CM_Scalarize) return false; // Assume we can vectorize V (and hence we need extraction) if the diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll index 
dec124b55cd4e..a550f1ca14c8b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll @@ -170,8 +170,8 @@ entry: ; VF_2-LABEL: Checking a loop in 'i64_factor_8' ; VF_2: Found an estimated cost of 8 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 ; VF_2-NEXT: Found an estimated cost of 8 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp1, align 8 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i64 %tmp2, ptr %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8 +; VF_2-NEXT: Found an estimated cost of 8 for VF 2 For instruction: store i64 %tmp2, ptr %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 8 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i64.8, ptr %data, i64 %i, i32 2 diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll index 976c6a9a570af..551b85b7d0357 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll @@ -17,8 +17,8 @@ entry: ; VF_2-LABEL: Checking a loop in 'i8_factor_2' ; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load i8, ptr %tmp0, align 1 ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i8, ptr %tmp1, align 1 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp2, ptr %tmp0, align 1 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp3, ptr %tmp1, align 1 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp2, ptr %tmp0, align 1 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp3, 
ptr %tmp1, align 1 ; VF_4-LABEL: Checking a loop in 'i8_factor_2' ; VF_4: Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i8, ptr %tmp0, align 1 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i8, ptr %tmp1, align 1 @@ -58,8 +58,8 @@ entry: ; VF_2-LABEL: Checking a loop in 'i16_factor_2' ; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load i16, ptr %tmp0, align 2 ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i16, ptr %tmp1, align 2 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp2, ptr %tmp0, align 2 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp3, ptr %tmp1, align 2 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp2, ptr %tmp0, align 2 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp3, ptr %tmp1, align 2 ; VF_4-LABEL: Checking a loop in 'i16_factor_2' ; VF_4: Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i16, ptr %tmp0, align 2 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, ptr %tmp1, align 2 @@ -99,8 +99,8 @@ entry: ; VF_2-LABEL: Checking a loop in 'i32_factor_2' ; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i32, ptr %tmp1, align 4 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp3, ptr %tmp1, align 4 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp3, ptr %tmp1, align 4 ; VF_4-LABEL: Checking a loop in 'i32_factor_2' ; VF_4: Found an 
estimated cost of 4 for VF 4 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, ptr %tmp1, align 4 @@ -140,23 +140,23 @@ entry: ; VF_2-LABEL: Checking a loop in 'i64_factor_2' ; VF_2: Found an estimated cost of 22 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 ; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp1, align 8 -; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp2, ptr %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp2, ptr %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8 ; VF_4-LABEL: Checking a loop in 'i64_factor_2' ; VF_4: Found an estimated cost of 44 for VF 4 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 ; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: %tmp3 = load i64, ptr %tmp1, align 8 -; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp2, ptr %tmp0, align 8 -; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp3, ptr %tmp1, align 8 +; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp2, ptr %tmp0, align 8 +; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp3, ptr %tmp1, align 8 ; VF_8-LABEL: Checking a loop in 'i64_factor_2' ; VF_8: Found an estimated cost of 88 for VF 8 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 ; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: %tmp3 = load i64, ptr %tmp1, align 8 -; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp2, ptr %tmp0, align 8 -; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For 
instruction: store i64 %tmp3, ptr %tmp1, align 8 +; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp2, ptr %tmp0, align 8 +; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp3, ptr %tmp1, align 8 ; VF_16-LABEL: Checking a loop in 'i64_factor_2' ; VF_16: Found an estimated cost of 176 for VF 16 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 ; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: %tmp3 = load i64, ptr %tmp1, align 8 -; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp2, ptr %tmp0, align 8 -; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp3, ptr %tmp1, align 8 +; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp2, ptr %tmp0, align 8 +; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp3, ptr %tmp1, align 8 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i64.2, ptr %data, i64 %i, i32 0 @@ -181,8 +181,8 @@ entry: ; VF_2-LABEL: Checking a loop in 'f16_factor_2' ; VF_2: Found an estimated cost of 6 for VF 2 For instruction: %tmp2 = load half, ptr %tmp0, align 2 ; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp3 = load half, ptr %tmp1, align 2 -; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store half %tmp2, ptr %tmp0, align 2 -; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store half %tmp3, ptr %tmp1, align 2 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store half %tmp2, ptr %tmp0, align 2 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store half %tmp3, ptr %tmp1, align 2 ; VF_4-LABEL: Checking a loop in 'f16_factor_2' ; VF_4: Found an estimated cost of 18 for VF 4 For instruction: %tmp2 = load half, ptr %tmp0, align 2 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For 
instruction: %tmp3 = load half, ptr %tmp1, align 2 @@ -263,23 +263,23 @@ entry: ; VF_2-LABEL: Checking a loop in 'f64_factor_2' ; VF_2: Found an estimated cost of 6 for VF 2 For instruction: %tmp2 = load double, ptr %tmp0, align 8 ; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp3 = load double, ptr %tmp1, align 8 -; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp2, ptr %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp3, ptr %tmp1, align 8 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store double %tmp2, ptr %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store double %tmp3, ptr %tmp1, align 8 ; VF_4-LABEL: Checking a loop in 'f64_factor_2' ; VF_4: Found an estimated cost of 12 for VF 4 For instruction: %tmp2 = load double, ptr %tmp0, align 8 ; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: %tmp3 = load double, ptr %tmp1, align 8 -; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp2, ptr %tmp0, align 8 -; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp3, ptr %tmp1, align 8 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store double %tmp2, ptr %tmp0, align 8 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store double %tmp3, ptr %tmp1, align 8 ; VF_8-LABEL: Checking a loop in 'f64_factor_2' ; VF_8: Found an estimated cost of 24 for VF 8 For instruction: %tmp2 = load double, ptr %tmp0, align 8 ; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: %tmp3 = load double, ptr %tmp1, align 8 -; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp2, ptr %tmp0, align 8 -; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp3, ptr %tmp1, align 8 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 
For instruction: store double %tmp2, ptr %tmp0, align 8 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store double %tmp3, ptr %tmp1, align 8 ; VF_16-LABEL: Checking a loop in 'f64_factor_2' ; VF_16: Found an estimated cost of 48 for VF 16 For instruction: %tmp2 = load double, ptr %tmp0, align 8 ; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: %tmp3 = load double, ptr %tmp1, align 8 -; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp2, ptr %tmp0, align 8 -; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp3, ptr %tmp1, align 8 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store double %tmp2, ptr %tmp0, align 8 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store double %tmp3, ptr %tmp1, align 8 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %f64.2, ptr %data, i64 %i, i32 0 @@ -309,30 +309,30 @@ entry: ; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i8, ptr %tmp0, align 1 ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp4 = load i8, ptr %tmp1, align 1 ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i8, ptr %tmp2, align 1 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp3, ptr %tmp0, align 1 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp4, ptr %tmp1, align 1 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp5, ptr %tmp2, align 1 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp3, ptr %tmp0, align 1 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp4, ptr %tmp1, align 1 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp5, ptr %tmp2, align 1 ; VF_4-LABEL: Checking a loop in 
'i8_factor_3' ; VF_4: Found an estimated cost of 24 for VF 4 For instruction: %tmp3 = load i8, ptr %tmp0, align 1 ; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp4 = load i8, ptr %tmp1, align 1 ; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp5 = load i8, ptr %tmp2, align 1 -; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp3, ptr %tmp0, align 1 -; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp4, ptr %tmp1, align 1 -; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp5, ptr %tmp2, align 1 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i8 %tmp3, ptr %tmp0, align 1 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i8 %tmp4, ptr %tmp1, align 1 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i8 %tmp5, ptr %tmp2, align 1 ; VF_8-LABEL: Checking a loop in 'i8_factor_3' ; VF_8: Found an estimated cost of 48 for VF 8 For instruction: %tmp3 = load i8, ptr %tmp0, align 1 ; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i8, ptr %tmp1, align 1 ; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i8, ptr %tmp2, align 1 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp3, ptr %tmp0, align 1 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp4, ptr %tmp1, align 1 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp5, ptr %tmp2, align 1 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i8 %tmp3, ptr %tmp0, align 1 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i8 %tmp4, ptr %tmp1, align 1 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i8 %tmp5, ptr %tmp2, align 1 ; VF_16-LABEL: Checking a loop in 'i8_factor_3' ; VF_16: 
Found an estimated cost of 96 for VF 16 For instruction: %tmp3 = load i8, ptr %tmp0, align 1 ; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i8, ptr %tmp1, align 1 ; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i8, ptr %tmp2, align 1 -; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp3, ptr %tmp0, align 1 -; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp4, ptr %tmp1, align 1 -; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp5, ptr %tmp2, align 1 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i8 %tmp3, ptr %tmp0, align 1 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i8 %tmp4, ptr %tmp1, align 1 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i8 %tmp5, ptr %tmp2, align 1 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i8.3, ptr %data, i64 %i, i32 0 @@ -361,30 +361,30 @@ entry: ; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i16, ptr %tmp0, align 2 ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp4 = load i16, ptr %tmp1, align 2 ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i16, ptr %tmp2, align 2 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp3, ptr %tmp0, align 2 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp4, ptr %tmp1, align 2 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp5, ptr %tmp2, align 2 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp3, ptr %tmp0, align 2 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp4, ptr %tmp1, align 2 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For 
instruction: store i16 %tmp5, ptr %tmp2, align 2 ; VF_4-LABEL: Checking a loop in 'i16_factor_3' ; VF_4: Found an estimated cost of 24 for VF 4 For instruction: %tmp3 = load i16, ptr %tmp0, align 2 ; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp4 = load i16, ptr %tmp1, align 2 ; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp5 = load i16, ptr %tmp2, align 2 -; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp3, ptr %tmp0, align 2 -; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp4, ptr %tmp1, align 2 -; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp5, ptr %tmp2, align 2 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i16 %tmp3, ptr %tmp0, align 2 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i16 %tmp4, ptr %tmp1, align 2 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i16 %tmp5, ptr %tmp2, align 2 ; VF_8-LABEL: Checking a loop in 'i16_factor_3' ; VF_8: Found an estimated cost of 48 for VF 8 For instruction: %tmp3 = load i16, ptr %tmp0, align 2 ; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i16, ptr %tmp1, align 2 ; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i16, ptr %tmp2, align 2 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp3, ptr %tmp0, align 2 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp4, ptr %tmp1, align 2 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp5, ptr %tmp2, align 2 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i16 %tmp3, ptr %tmp0, align 2 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i16 %tmp4, ptr %tmp1, align 2 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For 
instruction: store i16 %tmp5, ptr %tmp2, align 2 ; VF_16-LABEL: Checking a loop in 'i16_factor_3' ; VF_16: Found an estimated cost of 96 for VF 16 For instruction: %tmp3 = load i16, ptr %tmp0, align 2 ; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i16, ptr %tmp1, align 2 ; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i16, ptr %tmp2, align 2 -; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp3, ptr %tmp0, align 2 -; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp4, ptr %tmp1, align 2 -; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp5, ptr %tmp2, align 2 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i16 %tmp3, ptr %tmp0, align 2 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i16 %tmp4, ptr %tmp1, align 2 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i16 %tmp5, ptr %tmp2, align 2 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i16.3, ptr %data, i64 %i, i32 0 @@ -413,9 +413,9 @@ entry: ; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i32, ptr %tmp0, align 4 ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp4 = load i32, ptr %tmp1, align 4 ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i32, ptr %tmp2, align 4 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp3, ptr %tmp0, align 4 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp4, ptr %tmp1, align 4 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp5, ptr %tmp2, align 4 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp3, ptr %tmp0, align 4 +; VF_2-NEXT: Found an estimated cost of 4 for 
VF 2 For instruction: store i32 %tmp4, ptr %tmp1, align 4 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp5, ptr %tmp2, align 4 ; VF_4-LABEL: Checking a loop in 'i32_factor_3' ; VF_4: Found an estimated cost of 8 for VF 4 For instruction: %tmp3 = load i32, ptr %tmp0, align 4 ; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp4 = load i32, ptr %tmp1, align 4 @@ -427,16 +427,16 @@ entry: ; VF_8: Found an estimated cost of 48 for VF 8 For instruction: %tmp3 = load i32, ptr %tmp0, align 4 ; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i32, ptr %tmp1, align 4 ; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i32, ptr %tmp2, align 4 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp3, ptr %tmp0, align 4 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp4, ptr %tmp1, align 4 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp5, ptr %tmp2, align 4 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i32 %tmp3, ptr %tmp0, align 4 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i32 %tmp4, ptr %tmp1, align 4 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i32 %tmp5, ptr %tmp2, align 4 ; VF_16-LABEL: Checking a loop in 'i32_factor_3' ; VF_16: Found an estimated cost of 96 for VF 16 For instruction: %tmp3 = load i32, ptr %tmp0, align 4 ; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i32, ptr %tmp1, align 4 ; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i32, ptr %tmp2, align 4 -; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp3, ptr %tmp0, align 4 -; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp4, ptr %tmp1, align 4 -; VF_16-NEXT: Found 
an estimated cost of 96 for VF 16 For instruction: store i32 %tmp5, ptr %tmp2, align 4 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i32 %tmp3, ptr %tmp0, align 4 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i32 %tmp4, ptr %tmp1, align 4 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i32 %tmp5, ptr %tmp2, align 4 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i32.3, ptr %data, i64 %i, i32 0 @@ -465,30 +465,30 @@ entry: ; VF_2: Found an estimated cost of 22 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp0, align 8 ; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: %tmp4 = load i64, ptr %tmp1, align 8 ; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: %tmp5 = load i64, ptr %tmp2, align 8 -; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp3, ptr %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp4, ptr %tmp1, align 8 -; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp5, ptr %tmp2, align 8 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp3, ptr %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp4, ptr %tmp1, align 8 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp5, ptr %tmp2, align 8 ; VF_4-LABEL: Checking a loop in 'i64_factor_3' ; VF_4: Found an estimated cost of 44 for VF 4 For instruction: %tmp3 = load i64, ptr %tmp0, align 8 ; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: %tmp4 = load i64, ptr %tmp1, align 8 ; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: %tmp5 = load i64, ptr %tmp2, align 8 -; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp3, ptr %tmp0, align 8 -; VF_4-NEXT: Found 
an estimated cost of 44 for VF 4 For instruction: store i64 %tmp4, ptr %tmp1, align 8 -; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp5, ptr %tmp2, align 8 +; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp3, ptr %tmp0, align 8 +; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp4, ptr %tmp1, align 8 +; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp5, ptr %tmp2, align 8 ; VF_8-LABEL: Checking a loop in 'i64_factor_3' ; VF_8: Found an estimated cost of 88 for VF 8 For instruction: %tmp3 = load i64, ptr %tmp0, align 8 ; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: %tmp4 = load i64, ptr %tmp1, align 8 ; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: %tmp5 = load i64, ptr %tmp2, align 8 -; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp3, ptr %tmp0, align 8 -; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp4, ptr %tmp1, align 8 -; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp5, ptr %tmp2, align 8 +; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp3, ptr %tmp0, align 8 +; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp4, ptr %tmp1, align 8 +; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp5, ptr %tmp2, align 8 ; VF_16-LABEL: Checking a loop in 'i64_factor_3' ; VF_16: Found an estimated cost of 176 for VF 16 For instruction: %tmp3 = load i64, ptr %tmp0, align 8 ; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: %tmp4 = load i64, ptr %tmp1, align 8 ; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: %tmp5 = load i64, ptr %tmp2, align 8 -; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp3, ptr %tmp0, align 8 -; 
VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp4, ptr %tmp1, align 8 -; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp5, ptr %tmp2, align 8 +; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp3, ptr %tmp0, align 8 +; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp4, ptr %tmp1, align 8 +; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp5, ptr %tmp2, align 8 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i64.3, ptr %data, i64 %i, i32 0 @@ -517,9 +517,9 @@ entry: ; VF_2: Found an estimated cost of 6 for VF 2 For instruction: %tmp3 = load half, ptr %tmp0, align 2 ; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp4 = load half, ptr %tmp1, align 2 ; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp5 = load half, ptr %tmp2, align 2 -; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store half %tmp3, ptr %tmp0, align 2 -; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store half %tmp4, ptr %tmp1, align 2 -; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store half %tmp5, ptr %tmp2, align 2 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store half %tmp3, ptr %tmp0, align 2 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store half %tmp4, ptr %tmp1, align 2 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store half %tmp5, ptr %tmp2, align 2 ; VF_4-LABEL: Checking a loop in 'f16_factor_3' ; VF_4: Found an estimated cost of 28 for VF 4 For instruction: %tmp3 = load half, ptr %tmp0, align 2 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load half, ptr %tmp1, align 2 @@ -621,30 +621,30 @@ entry: ; VF_2: Found an estimated cost of 6 for VF 2 For instruction: %tmp3 = 
load double, ptr %tmp0, align 8 ; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp4 = load double, ptr %tmp1, align 8 ; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp5 = load double, ptr %tmp2, align 8 -; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp3, ptr %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp4, ptr %tmp1, align 8 -; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp5, ptr %tmp2, align 8 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store double %tmp3, ptr %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store double %tmp4, ptr %tmp1, align 8 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store double %tmp5, ptr %tmp2, align 8 ; VF_4-LABEL: Checking a loop in 'f64_factor_3' ; VF_4: Found an estimated cost of 12 for VF 4 For instruction: %tmp3 = load double, ptr %tmp0, align 8 ; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: %tmp4 = load double, ptr %tmp1, align 8 ; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: %tmp5 = load double, ptr %tmp2, align 8 -; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp3, ptr %tmp0, align 8 -; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp4, ptr %tmp1, align 8 -; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp5, ptr %tmp2, align 8 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store double %tmp3, ptr %tmp0, align 8 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store double %tmp4, ptr %tmp1, align 8 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store double %tmp5, ptr %tmp2, align 8 ; VF_8-LABEL: Checking a loop in 'f64_factor_3' ; VF_8: Found an estimated cost of 
24 for VF 8 For instruction: %tmp3 = load double, ptr %tmp0, align 8 ; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: %tmp4 = load double, ptr %tmp1, align 8 ; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: %tmp5 = load double, ptr %tmp2, align 8 -; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp3, ptr %tmp0, align 8 -; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp4, ptr %tmp1, align 8 -; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp5, ptr %tmp2, align 8 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store double %tmp3, ptr %tmp0, align 8 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store double %tmp4, ptr %tmp1, align 8 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store double %tmp5, ptr %tmp2, align 8 ; VF_16-LABEL: Checking a loop in 'f64_factor_3' ; VF_16: Found an estimated cost of 48 for VF 16 For instruction: %tmp3 = load double, ptr %tmp0, align 8 ; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: %tmp4 = load double, ptr %tmp1, align 8 ; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: %tmp5 = load double, ptr %tmp2, align 8 -; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp3, ptr %tmp0, align 8 -; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp4, ptr %tmp1, align 8 -; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp5, ptr %tmp2, align 8 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store double %tmp3, ptr %tmp0, align 8 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store double %tmp4, ptr %tmp1, align 8 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store double %tmp5, ptr %tmp2, align 8 for.body: %i = 
phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %f64.3, ptr %data, i64 %i, i32 0 @@ -677,37 +677,37 @@ entry: ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i8, ptr %tmp1, align 1 ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp6 = load i8, ptr %tmp2, align 1 ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp7 = load i8, ptr %tmp3, align 1 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp4, ptr %tmp0, align 1 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp5, ptr %tmp1, align 1 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp6, ptr %tmp2, align 1 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp7, ptr %tmp3, align 1 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp4, ptr %tmp0, align 1 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp5, ptr %tmp1, align 1 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp6, ptr %tmp2, align 1 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp7, ptr %tmp3, align 1 ; VF_4-LABEL: Checking a loop in 'i8_factor_4' ; VF_4: Found an estimated cost of 24 for VF 4 For instruction: %tmp4 = load i8, ptr %tmp0, align 1 ; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp5 = load i8, ptr %tmp1, align 1 ; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp6 = load i8, ptr %tmp2, align 1 ; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp7 = load i8, ptr %tmp3, align 1 -; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp4, ptr %tmp0, align 1 -; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp5, ptr %tmp1, align 1 -; VF_4-NEXT: Found an estimated cost 
of 24 for VF 4 For instruction: store i8 %tmp6, ptr %tmp2, align 1 -; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp7, ptr %tmp3, align 1 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i8 %tmp4, ptr %tmp0, align 1 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i8 %tmp5, ptr %tmp1, align 1 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i8 %tmp6, ptr %tmp2, align 1 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i8 %tmp7, ptr %tmp3, align 1 ; VF_8-LABEL: Checking a loop in 'i8_factor_4' ; VF_8: Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i8, ptr %tmp0, align 1 ; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i8, ptr %tmp1, align 1 ; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp6 = load i8, ptr %tmp2, align 1 ; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp7 = load i8, ptr %tmp3, align 1 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp4, ptr %tmp0, align 1 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp5, ptr %tmp1, align 1 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp6, ptr %tmp2, align 1 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp7, ptr %tmp3, align 1 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i8 %tmp4, ptr %tmp0, align 1 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i8 %tmp5, ptr %tmp1, align 1 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i8 %tmp6, ptr %tmp2, align 1 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i8 %tmp7, ptr %tmp3, align 1 ; VF_16-LABEL: Checking a loop in 'i8_factor_4' ; VF_16: Found an estimated cost of 96 for VF 16 For 
instruction: %tmp4 = load i8, ptr %tmp0, align 1 ; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i8, ptr %tmp1, align 1 ; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp6 = load i8, ptr %tmp2, align 1 ; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp7 = load i8, ptr %tmp3, align 1 -; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp4, ptr %tmp0, align 1 -; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp5, ptr %tmp1, align 1 -; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp6, ptr %tmp2, align 1 -; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp7, ptr %tmp3, align 1 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i8 %tmp4, ptr %tmp0, align 1 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i8 %tmp5, ptr %tmp1, align 1 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i8 %tmp6, ptr %tmp2, align 1 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i8 %tmp7, ptr %tmp3, align 1 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i8.4, ptr %data, i64 %i, i32 0 @@ -740,37 +740,37 @@ entry: ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i16, ptr %tmp1, align 2 ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp6 = load i16, ptr %tmp2, align 2 ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp7 = load i16, ptr %tmp3, align 2 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp4, ptr %tmp0, align 2 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp5, ptr %tmp1, align 2 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp6, ptr 
%tmp2, align 2 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp7, ptr %tmp3, align 2 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp4, ptr %tmp0, align 2 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp5, ptr %tmp1, align 2 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp6, ptr %tmp2, align 2 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp7, ptr %tmp3, align 2 ; VF_4-LABEL: Checking a loop in 'i16_factor_4' ; VF_4: Found an estimated cost of 24 for VF 4 For instruction: %tmp4 = load i16, ptr %tmp0, align 2 ; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp5 = load i16, ptr %tmp1, align 2 ; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp6 = load i16, ptr %tmp2, align 2 ; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp7 = load i16, ptr %tmp3, align 2 -; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp4, ptr %tmp0, align 2 -; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp5, ptr %tmp1, align 2 -; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp6, ptr %tmp2, align 2 -; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp7, ptr %tmp3, align 2 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i16 %tmp4, ptr %tmp0, align 2 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i16 %tmp5, ptr %tmp1, align 2 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i16 %tmp6, ptr %tmp2, align 2 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i16 %tmp7, ptr %tmp3, align 2 ; VF_8-LABEL: Checking a loop in 'i16_factor_4' ; VF_8: Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i16, ptr %tmp0, align 
2 ; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i16, ptr %tmp1, align 2 ; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp6 = load i16, ptr %tmp2, align 2 ; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp7 = load i16, ptr %tmp3, align 2 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp4, ptr %tmp0, align 2 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp5, ptr %tmp1, align 2 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp6, ptr %tmp2, align 2 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp7, ptr %tmp3, align 2 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i16 %tmp4, ptr %tmp0, align 2 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i16 %tmp5, ptr %tmp1, align 2 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i16 %tmp6, ptr %tmp2, align 2 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i16 %tmp7, ptr %tmp3, align 2 ; VF_16-LABEL: Checking a loop in 'i16_factor_4' ; VF_16: Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i16, ptr %tmp0, align 2 ; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i16, ptr %tmp1, align 2 ; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp6 = load i16, ptr %tmp2, align 2 ; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp7 = load i16, ptr %tmp3, align 2 -; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp4, ptr %tmp0, align 2 -; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp5, ptr %tmp1, align 2 -; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp6, ptr %tmp2, align 2 -; VF_16-NEXT: Found an 
estimated cost of 96 for VF 16 For instruction: store i16 %tmp7, ptr %tmp3, align 2 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i16 %tmp4, ptr %tmp0, align 2 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i16 %tmp5, ptr %tmp1, align 2 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i16 %tmp6, ptr %tmp2, align 2 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i16 %tmp7, ptr %tmp3, align 2 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i16.4, ptr %data, i64 %i, i32 0 @@ -803,10 +803,10 @@ entry: ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i32, ptr %tmp1, align 4 ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp6 = load i32, ptr %tmp2, align 4 ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp7 = load i32, ptr %tmp3, align 4 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp4, ptr %tmp0, align 4 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp5, ptr %tmp1, align 4 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp6, ptr %tmp2, align 4 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp7, ptr %tmp3, align 4 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp4, ptr %tmp0, align 4 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp5, ptr %tmp1, align 4 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp6, ptr %tmp2, align 4 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp7, ptr %tmp3, align 4 ; VF_4-LABEL: Checking a loop in 'i32_factor_4' ; VF_4: Found an estimated cost of 8 for VF 4 For instruction: %tmp4 = load i32, ptr %tmp0, align 4 ; VF_4-NEXT: Found 
an estimated cost of 8 for VF 4 For instruction: %tmp5 = load i32, ptr %tmp1, align 4 @@ -821,19 +821,19 @@ entry: ; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i32, ptr %tmp1, align 4 ; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp6 = load i32, ptr %tmp2, align 4 ; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp7 = load i32, ptr %tmp3, align 4 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp4, ptr %tmp0, align 4 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp5, ptr %tmp1, align 4 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp6, ptr %tmp2, align 4 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp7, ptr %tmp3, align 4 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i32 %tmp4, ptr %tmp0, align 4 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i32 %tmp5, ptr %tmp1, align 4 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i32 %tmp6, ptr %tmp2, align 4 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i32 %tmp7, ptr %tmp3, align 4 ; VF_16-LABEL: Checking a loop in 'i32_factor_4' ; VF_16: Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i32, ptr %tmp0, align 4 ; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i32, ptr %tmp1, align 4 ; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp6 = load i32, ptr %tmp2, align 4 ; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp7 = load i32, ptr %tmp3, align 4 -; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp4, ptr %tmp0, align 4 -; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp5, ptr %tmp1, align 4 -; VF_16-NEXT: Found an 
estimated cost of 96 for VF 16 For instruction: store i32 %tmp6, ptr %tmp2, align 4 -; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp7, ptr %tmp3, align 4 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i32 %tmp4, ptr %tmp0, align 4 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i32 %tmp5, ptr %tmp1, align 4 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i32 %tmp6, ptr %tmp2, align 4 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i32 %tmp7, ptr %tmp3, align 4 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i32.4, ptr %data, i64 %i, i32 0 @@ -866,37 +866,37 @@ entry: ; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: %tmp5 = load i64, ptr %tmp1, align 8 ; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: %tmp6 = load i64, ptr %tmp2, align 8 ; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: %tmp7 = load i64, ptr %tmp3, align 8 -; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp4, ptr %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp5, ptr %tmp1, align 8 -; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp6, ptr %tmp2, align 8 -; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp7, ptr %tmp3, align 8 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp4, ptr %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp5, ptr %tmp1, align 8 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp6, ptr %tmp2, align 8 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp7, ptr %tmp3, align 8 ; VF_4-LABEL: Checking a loop in 'i64_factor_4' ; VF_4: 
Found an estimated cost of 44 for VF 4 For instruction: %tmp4 = load i64, ptr %tmp0, align 8 ; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: %tmp5 = load i64, ptr %tmp1, align 8 ; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: %tmp6 = load i64, ptr %tmp2, align 8 ; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: %tmp7 = load i64, ptr %tmp3, align 8 -; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp4, ptr %tmp0, align 8 -; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp5, ptr %tmp1, align 8 -; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp6, ptr %tmp2, align 8 -; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp7, ptr %tmp3, align 8 +; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp4, ptr %tmp0, align 8 +; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp5, ptr %tmp1, align 8 +; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp6, ptr %tmp2, align 8 +; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp7, ptr %tmp3, align 8 ; VF_8-LABEL: Checking a loop in 'i64_factor_4' ; VF_8: Found an estimated cost of 88 for VF 8 For instruction: %tmp4 = load i64, ptr %tmp0, align 8 ; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: %tmp5 = load i64, ptr %tmp1, align 8 ; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: %tmp6 = load i64, ptr %tmp2, align 8 ; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: %tmp7 = load i64, ptr %tmp3, align 8 -; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp4, ptr %tmp0, align 8 -; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp5, ptr %tmp1, align 8 -; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For 
instruction: store i64 %tmp6, ptr %tmp2, align 8 -; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp7, ptr %tmp3, align 8 +; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp4, ptr %tmp0, align 8 +; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp5, ptr %tmp1, align 8 +; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp6, ptr %tmp2, align 8 +; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp7, ptr %tmp3, align 8 ; VF_16-LABEL: Checking a loop in 'i64_factor_4' ; VF_16: Found an estimated cost of 176 for VF 16 For instruction: %tmp4 = load i64, ptr %tmp0, align 8 ; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: %tmp5 = load i64, ptr %tmp1, align 8 ; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: %tmp6 = load i64, ptr %tmp2, align 8 ; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: %tmp7 = load i64, ptr %tmp3, align 8 -; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp4, ptr %tmp0, align 8 -; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp5, ptr %tmp1, align 8 -; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp6, ptr %tmp2, align 8 -; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp7, ptr %tmp3, align 8 +; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp4, ptr %tmp0, align 8 +; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp5, ptr %tmp1, align 8 +; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp6, ptr %tmp2, align 8 +; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp7, ptr %tmp3, align 8 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = 
getelementptr inbounds %i64.4, ptr %data, i64 %i, i32 0 @@ -1055,37 +1055,37 @@ entry: ; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp5 = load double, ptr %tmp1, align 8 ; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp6 = load double, ptr %tmp2, align 8 ; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp7 = load double, ptr %tmp3, align 8 -; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp4, ptr %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp5, ptr %tmp1, align 8 -; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp6, ptr %tmp2, align 8 -; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp7, ptr %tmp3, align 8 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store double %tmp4, ptr %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store double %tmp5, ptr %tmp1, align 8 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store double %tmp6, ptr %tmp2, align 8 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store double %tmp7, ptr %tmp3, align 8 ; VF_4-LABEL: Checking a loop in 'f64_factor_4' ; VF_4: Found an estimated cost of 12 for VF 4 For instruction: %tmp4 = load double, ptr %tmp0, align 8 ; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: %tmp5 = load double, ptr %tmp1, align 8 ; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: %tmp6 = load double, ptr %tmp2, align 8 ; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: %tmp7 = load double, ptr %tmp3, align 8 -; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp4, ptr %tmp0, align 8 -; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp5, ptr %tmp1, align 8 -; VF_4-NEXT: Found an 
estimated cost of 12 for VF 4 For instruction: store double %tmp6, ptr %tmp2, align 8 -; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp7, ptr %tmp3, align 8 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store double %tmp4, ptr %tmp0, align 8 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store double %tmp5, ptr %tmp1, align 8 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store double %tmp6, ptr %tmp2, align 8 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store double %tmp7, ptr %tmp3, align 8 ; VF_8-LABEL: Checking a loop in 'f64_factor_4' ; VF_8: Found an estimated cost of 24 for VF 8 For instruction: %tmp4 = load double, ptr %tmp0, align 8 ; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: %tmp5 = load double, ptr %tmp1, align 8 ; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: %tmp6 = load double, ptr %tmp2, align 8 ; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: %tmp7 = load double, ptr %tmp3, align 8 -; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp4, ptr %tmp0, align 8 -; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp5, ptr %tmp1, align 8 -; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp6, ptr %tmp2, align 8 -; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp7, ptr %tmp3, align 8 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store double %tmp4, ptr %tmp0, align 8 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store double %tmp5, ptr %tmp1, align 8 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store double %tmp6, ptr %tmp2, align 8 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store double %tmp7, ptr %tmp3, align 8 ; VF_16-LABEL: 
Checking a loop in 'f64_factor_4' ; VF_16: Found an estimated cost of 48 for VF 16 For instruction: %tmp4 = load double, ptr %tmp0, align 8 ; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: %tmp5 = load double, ptr %tmp1, align 8 ; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: %tmp6 = load double, ptr %tmp2, align 8 ; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: %tmp7 = load double, ptr %tmp3, align 8 -; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp4, ptr %tmp0, align 8 -; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp5, ptr %tmp1, align 8 -; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp6, ptr %tmp2, align 8 -; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp7, ptr %tmp3, align 8 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store double %tmp4, ptr %tmp0, align 8 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store double %tmp5, ptr %tmp1, align 8 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store double %tmp6, ptr %tmp2, align 8 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store double %tmp7, ptr %tmp3, align 8 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %f64.4, ptr %data, i64 %i, i32 0 From a9a3fb5b1a23e336a1656046ba1a36832e020d4e Mon Sep 17 00:00:00 2001 From: Bart Chrzaszcz Date: Thu, 2 Jan 2025 14:38:43 +0000 Subject: [PATCH 281/567] Update BUILD.bazel due to PR #121476 Breaks bazel builds due to missing dependency --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index f1192d069fa5f..e823af2f14712 100644 --- 
a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -8391,6 +8391,7 @@ cc_library( ":ArithDialect", ":ConversionPassIncGen", ":EmitCDialect", + ":EmitCTransforms", ":IR", ":SCFDialect", ":TransformUtils", From bb27d5e5c6b194a1440b8ac4e5ace68d0ee2a849 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Don=C3=A1t=20Nagy?= Date: Thu, 2 Jan 2025 15:51:03 +0100 Subject: [PATCH 282/567] [analyzer] Don't assume third iteration in loops (#119388) This commit ensures that if the loop condition is opaque (the analyzer cannot determine whether it's true or false) and there were at least two iterations, then the analyzer doesn't make the unjustified assumption that it can enter yet another iteration. Note that the presence of a loop suggests that the developer thought that two iterations can happen (otherwise an `if` would've been sufficient), but it does not imply that the developer expected three or four iterations -- and in fact there are many false positives where a loop iterates over a two-element (or three-element) data structure, but the analyzer cannot understand the loop condition and blindly assumes that there may be three or more iterations. (In particular, analyzing the FFMPEG project produces 100+ such false positives.) Moreover, this provides some performance improvements in the sense that the analyzer won't waste time on traversing the execution paths with 3 or 4 iterations in a loop (which are very similar to the paths with 2 iterations) and therefore will be able to traverse more branches elsewhere on the `ExplodedGraph`. This logic is disabled if the user enables the widen-loops analyzer option (which is disabled by default), because the "simulate one final iteration after the invalidation" execution path would be suppressed by the "exit the loop if the loop condition is opaque and there were at least two iterations" logic. 
If we want to support loop widening, we would need to create a follow-up commit which ensures that it "plays nicely" with this logic. --- clang/docs/ReleaseNotes.rst | 7 + .../Core/PathSensitive/CoreEngine.h | 8 + .../Core/PathSensitive/ExprEngine.h | 18 +- clang/lib/StaticAnalyzer/Core/CoreEngine.cpp | 27 ++- clang/lib/StaticAnalyzer/Core/ExprEngine.cpp | 58 ++++- clang/test/Analysis/loop-assumptions.c | 219 ++++++++++++++++++ clang/test/Analysis/loop-unrolling.cpp | 35 +-- clang/test/Analysis/misc-ps-region-store.m | 31 ++- 8 files changed, 362 insertions(+), 41 deletions(-) create mode 100644 clang/test/Analysis/loop-assumptions.c diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index e0aef1af2135c..aca07e2ba9cf2 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -1157,6 +1157,13 @@ New features Crash and bug fixes ^^^^^^^^^^^^^^^^^^^ +- In loops where the loop condition is opaque (i.e. the analyzer cannot + determine whether it's true or false), the analyzer will no longer assume + execution paths that perform more that two iterations. These unjustified + assumptions caused false positive reports (e.g. 100+ out-of-bounds reports in + the FFMPEG codebase) in loops where the programmer intended only two or three + steps but the analyzer wasn't able to understand that the loop is limited. + Improvements ^^^^^^^^^^^^ diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CoreEngine.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CoreEngine.h index a6d05a3ac67b4..80b79fd4e928f 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CoreEngine.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CoreEngine.h @@ -126,6 +126,14 @@ class CoreEngine { ExplodedNode *generateCallExitBeginNode(ExplodedNode *N, const ReturnStmt *RS); + /// Helper function called by `HandleBranch()`. 
If the currently handled + /// branch corresponds to a loop, this returns the number of already + /// completed iterations in that loop, otherwise the return value is + /// `std::nullopt`. Note that this counts _all_ earlier iterations, including + /// ones that were performed within an earlier iteration of an outer loop. + std::optional getCompletedIterationCount(const CFGBlock *B, + ExplodedNode *Pred) const; + public: /// Construct a CoreEngine object to analyze the provided CFG. CoreEngine(ExprEngine &exprengine, diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h index 8c7493e27fcaa..20c446e33ef9a 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h @@ -321,14 +321,14 @@ class ExprEngine { NodeBuilderWithSinks &nodeBuilder, ExplodedNode *Pred); - /// ProcessBranch - Called by CoreEngine. Used to generate successor - /// nodes by processing the 'effects' of a branch condition. - void processBranch(const Stmt *Condition, - NodeBuilderContext& BuilderCtx, - ExplodedNode *Pred, - ExplodedNodeSet &Dst, - const CFGBlock *DstT, - const CFGBlock *DstF); + /// ProcessBranch - Called by CoreEngine. Used to generate successor nodes by + /// processing the 'effects' of a branch condition. If the branch condition + /// is a loop condition, IterationsCompletedInLoop is the number of completed + /// iterations (otherwise it's std::nullopt). + void processBranch(const Stmt *Condition, NodeBuilderContext &BuilderCtx, + ExplodedNode *Pred, ExplodedNodeSet &Dst, + const CFGBlock *DstT, const CFGBlock *DstF, + std::optional IterationsCompletedInLoop); /// Called by CoreEngine. 
/// Used to generate successor nodes for temporary destructors depending @@ -588,6 +588,8 @@ class ExprEngine { void evalEagerlyAssumeBifurcation(ExplodedNodeSet &Dst, ExplodedNodeSet &Src, const Expr *Ex); + bool didEagerlyAssumeBifurcateAt(ProgramStateRef State, const Expr *Ex) const; + static std::pair getEagerlyAssumeBifurcationTags(); diff --git a/clang/lib/StaticAnalyzer/Core/CoreEngine.cpp b/clang/lib/StaticAnalyzer/Core/CoreEngine.cpp index 67b7d30853d9d..775a22e18c619 100644 --- a/clang/lib/StaticAnalyzer/Core/CoreEngine.cpp +++ b/clang/lib/StaticAnalyzer/Core/CoreEngine.cpp @@ -444,7 +444,8 @@ void CoreEngine::HandleBranch(const Stmt *Cond, const Stmt *Term, NodeBuilderContext Ctx(*this, B, Pred); ExplodedNodeSet Dst; ExprEng.processBranch(Cond, Ctx, Pred, Dst, *(B->succ_begin()), - *(B->succ_begin() + 1)); + *(B->succ_begin() + 1), + getCompletedIterationCount(B, Pred)); // Enqueue the new frontier onto the worklist. enqueue(Dst); } @@ -591,6 +592,30 @@ ExplodedNode *CoreEngine::generateCallExitBeginNode(ExplodedNode *N, return isNew ? Node : nullptr; } +std::optional +CoreEngine::getCompletedIterationCount(const CFGBlock *B, + ExplodedNode *Pred) const { + const LocationContext *LC = Pred->getLocationContext(); + BlockCounter Counter = WList->getBlockCounter(); + unsigned BlockCount = + Counter.getNumVisited(LC->getStackFrame(), B->getBlockID()); + + const Stmt *Term = B->getTerminatorStmt(); + if (isa(Term)) { + assert(BlockCount >= 1 && + "Block count of currently analyzed block must be >= 1"); + return BlockCount - 1; + } + if (isa(Term)) { + // In a do-while loop one iteration happens before the first evaluation of + // the loop condition, so we don't subtract one. + return BlockCount; + } + // ObjCForCollectionStmt is skipped intentionally because the current + // application of the iteration counts is not relevant for it. 
+ return std::nullopt; +} + void CoreEngine::enqueue(ExplodedNodeSet &Set) { for (const auto I : Set) WList->enqueue(I); diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp index db385e891e762..362a985b9174a 100644 --- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp +++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp @@ -2760,12 +2760,10 @@ assumeCondition(const Stmt *Condition, ExplodedNode *N) { return State->assume(V); } -void ExprEngine::processBranch(const Stmt *Condition, - NodeBuilderContext& BldCtx, - ExplodedNode *Pred, - ExplodedNodeSet &Dst, - const CFGBlock *DstT, - const CFGBlock *DstF) { +void ExprEngine::processBranch( + const Stmt *Condition, NodeBuilderContext &BldCtx, ExplodedNode *Pred, + ExplodedNodeSet &Dst, const CFGBlock *DstT, const CFGBlock *DstF, + std::optional IterationsCompletedInLoop) { assert((!Condition || !isa(Condition)) && "CXXBindTemporaryExprs are handled by processBindTemporary."); const LocationContext *LCtx = Pred->getLocationContext(); @@ -2808,8 +2806,35 @@ void ExprEngine::processBranch(const Stmt *Condition, if (StTrue && StFalse) assert(!isa(Condition)); - if (StTrue) - Builder.generateNode(StTrue, true, PredN); + if (StTrue) { + // If we are processing a loop condition where two iterations have + // already been completed and the false branch is also feasible, then + // don't assume a third iteration because it is a redundant execution + // path (unlikely to be different from earlier loop exits) and can cause + // false positives if e.g. the loop iterates over a two-element structure + // with an opaque condition. 
+ // + // The iteration count "2" is hardcoded because it's the natural limit: + // * the fact that the programmer wrote a loop (and not just an `if`) + // implies that they thought that the loop body might be executed twice; + // * however, there are situations where the programmer knows that there + // are at most two iterations but writes a loop that appears to be + // generic, because there is no special syntax for "loop with at most + // two iterations". (This pattern is common in FFMPEG and appears in + // many other projects as well.) + bool CompletedTwoIterations = IterationsCompletedInLoop.value_or(0) >= 2; + bool FalseAlsoFeasible = + StFalse || + didEagerlyAssumeBifurcateAt(PrevState, dyn_cast(Condition)); + bool SkipTrueBranch = CompletedTwoIterations && FalseAlsoFeasible; + + // FIXME: This "don't assume third iteration" heuristic partially + // conflicts with the widen-loop analysis option (which is off by + // default). If we intend to support and stabilize the loop widening, + // we must ensure that it 'plays nicely' with this logic. + if (!SkipTrueBranch || AMgr.options.ShouldWidenLoops) + Builder.generateNode(StTrue, true, PredN); + } if (StFalse) Builder.generateNode(StFalse, false, PredN); @@ -3731,6 +3756,12 @@ ExprEngine::getEagerlyAssumeBifurcationTags() { return std::make_pair(&TrueTag, &FalseTag); } +/// If the last EagerlyAssume attempt was successful (i.e. the true and false +/// cases were both feasible), this state trait stores the expression where it +/// happened; otherwise this holds nullptr. 
+REGISTER_TRAIT_WITH_PROGRAMSTATE(LastEagerlyAssumeExprIfSuccessful, + const Expr *) + void ExprEngine::evalEagerlyAssumeBifurcation(ExplodedNodeSet &Dst, ExplodedNodeSet &Src, const Expr *Ex) { @@ -3746,6 +3777,7 @@ void ExprEngine::evalEagerlyAssumeBifurcation(ExplodedNodeSet &Dst, } ProgramStateRef State = Pred->getState(); + State = State->set(nullptr); SVal V = State->getSVal(Ex, Pred->getLocationContext()); std::optional SEV = V.getAs(); if (SEV && SEV->isExpression()) { @@ -3753,6 +3785,11 @@ void ExprEngine::evalEagerlyAssumeBifurcation(ExplodedNodeSet &Dst, auto [StateTrue, StateFalse] = State->assume(*SEV); + if (StateTrue && StateFalse) { + StateTrue = StateTrue->set(Ex); + StateFalse = StateFalse->set(Ex); + } + // First assume that the condition is true. if (StateTrue) { SVal Val = svalBuilder.makeIntVal(1U, Ex->getType()); @@ -3770,6 +3807,11 @@ void ExprEngine::evalEagerlyAssumeBifurcation(ExplodedNodeSet &Dst, } } +bool ExprEngine::didEagerlyAssumeBifurcateAt(ProgramStateRef State, + const Expr *Ex) const { + return Ex && State->get() == Ex; +} + void ExprEngine::VisitGCCAsmStmt(const GCCAsmStmt *A, ExplodedNode *Pred, ExplodedNodeSet &Dst) { StmtNodeBuilder Bldr(Pred, Dst, *currBldrCtx); diff --git a/clang/test/Analysis/loop-assumptions.c b/clang/test/Analysis/loop-assumptions.c new file mode 100644 index 0000000000000..eb0ffdce722e0 --- /dev/null +++ b/clang/test/Analysis/loop-assumptions.c @@ -0,0 +1,219 @@ +// RUN: %clang_analyze_cc1 -analyzer-checker=debug.ExprInspection \ +// RUN: -verify=expected,eagerlyassume %s +// RUN: %clang_analyze_cc1 -analyzer-checker=debug.ExprInspection \ +// RUN: -analyzer-config eagerly-assume=false \ +// RUN: -verify=expected,noeagerlyassume %s + +// These tests validate the logic within `ExprEngine::processBranch` which +// ensures that in loops with opaque conditions we don't assume execution paths +// if the code does not imply that they are possible. 
+ +void clang_analyzer_numTimesReached(void); +void clang_analyzer_warnIfReached(void); +void clang_analyzer_dump(int); + +void clearCondition(void) { + // If the analyzer can definitely determine the value of the loop condition, + // then this corrective logic doesn't activate and the engine executes + // `-analyzer-max-loop` iterations (by default, 4). + for (int i = 0; i < 10; i++) + clang_analyzer_numTimesReached(); // expected-warning {{4}} + + clang_analyzer_warnIfReached(); // unreachable +} + +void opaqueCondition(int arg) { + // If the loop condition is opaque, don't assume more than two iterations, + // because the presence of a loop does not imply that the programmer thought + // that more than two iterations are possible. (It _does_ imply that two + // iterations may be possible at least in some cases, because otherwise an + // `if` would've been enough.) + for (int i = 0; i < arg; i++) + clang_analyzer_numTimesReached(); // expected-warning {{2}} + + clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}} +} + +int check(void); + +void opaqueConditionCall(int arg) { + // Same situation as `opaqueCondition()` but with a `while ()` loop. This + // is also an example for a situation where the programmer cannot easily + // insert an assertion to guide the analyzer and rule out more than two + // iterations (so the analyzer needs to proactively avoid those unjustified + // branches). + while (check()) + clang_analyzer_numTimesReached(); // expected-warning {{2}} + + clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}} +} + +void opaqueConditionDoWhile(int arg) { + // Same situation as `opaqueCondition()` but with a `do {} while ()` loop. + // This is tested separately because this loop type is a special case in the + // iteration count calculation. 
+ int i = 0; + do { + clang_analyzer_numTimesReached(); // expected-warning {{2}} + } while (i++ < arg); + + clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}} +} + +void dontRememberOldBifurcation(int arg) { + // In this (slightly contrived) test case the analyzer performs an assumption + // at the first iteration of the loop, but does not make any new assumptions + // in the subsequent iterations, so the analyzer should continue evaluating + // the loop. + // Previously this was mishandled in `eagerly-assume` mode (which is enabled + // by default), because the code remembered that there was a bifurcation on + // the first iteration of the loop and didn't realize that this is obsolete. + + // NOTE: The variable `i` is introduced to ensure that the iterations of the + // loop change the state -- otherwise the analyzer stops iterating because it + // returns to the same `ExplodedNode`. + int i = 0; + while (arg > 3) { + clang_analyzer_numTimesReached(); // expected-warning {{4}} + i++; + } + + clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}} +} + +void dontAssumeFourthIterartion(int arg) { + if (arg == 2) + return; + + // In this function the analyzer cannot leave the loop after exactly two + // iterations (because it knows that `arg != 2` at that point), so it + // performs a third iteration, but it does not assume that a fourth iteration + // is also possible. + for (int i = 0; i < arg; i++) + clang_analyzer_numTimesReached(); // expected-warning {{3}} + + clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}} +} + +#define TRUE 1 +void shortCircuitInLoopCondition(int arg) { + // When the loop condition expression contains short-circuiting operators, it + // performs "inner" bifurcations for those operators and only considers the + // last (rightmost) operand as the branch condition that is associated with + // the loop itself (as its loop condition). 
+ // This means that assumptions taken in the left-hand side of a short-circuiting + // operator are not recognized as "opaque" loop condition, so the loop in + // this test case is allowed to finish four iterations. + // FIXME: This corner case is responsible for at least one out-of-bounds + // false positive on the ffmpeg codebase. Eventually we should properly + // recognize the full syntactical loop condition expression as "the loop + // condition", but this will be complicated to implement. + for (int i = 0; i < arg && TRUE; i++) { + clang_analyzer_numTimesReached(); // expected-warning {{4}} + } + clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}} +} + +void shortCircuitInLoopConditionRHS(int arg) { + // Unlike `shortCircuitInLoopCondition()`, this case is handled properly + // because the analyzer thinks that the right hand side of the `&&` is the + // loop condition. + for (int i = 0; TRUE && i < arg; i++) { + clang_analyzer_numTimesReached(); // expected-warning {{2}} + } + clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}} +} + +void eagerlyAssumeInSubexpression(int arg) { + // The `EagerlyAssume` logic is another complication that can "split the + // state" within the loop condition, but before the `processBranch()` call + // which is (in theory) responsible for evaluating the loop condition. + // The current implementation partially compensates this by noticing the + // cases where the loop condition is targeted by `EagerlyAssume`, but does + // not handle the (fortunately rare) case when `EagerlyAssume` hits a + // sub-expression of the loop condition (as in this contrived test case). + // FIXME: I don't know a real-world example for this inconsistency, but it + // would be good to eliminate it eventually. 
+ for (int i = 0; (i >= arg) - 1; i++) { + clang_analyzer_numTimesReached(); // eagerlyassume-warning {{4}} noeagerlyassume-warning {{2}} + } + clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}} +} + +void calledTwice(int arg, int isFirstCall) { + // This function is called twice (with two different unknown 'arg' values) to + // check the iteration count handling in this situation. + for (int i = 0; i < arg; i++) { + if (isFirstCall) { + clang_analyzer_numTimesReached(); // expected-warning {{2}} + } else { + clang_analyzer_numTimesReached(); // expected-warning {{2}} + } + } +} + +void caller(int arg, int arg2) { + // Entry point for `calledTwice()`. + calledTwice(arg, 1); + calledTwice(arg2, 0); +} + +void innerLoopClearCondition(void) { + // A "control group" test case for the behavior of an inner loop. Notice that + // although the (default) value of `-analyzer-max-loop` is 4, we only see 3 iterations + // of the inner loop, because `-analyzer-max-loop` limits the number of + // evaluations of _the loop condition of the inner loop_ and in addition to + // the 3 evaluations before the 3 iterations, there is also a step where it + // evaluates to false (in the first iteration of the outer loop). 
+ for (int outer = 0; outer < 2; outer++) { + int limit = 0; + if (outer) + limit = 10; + clang_analyzer_dump(limit); // expected-warning {{0}} expected-warning {{10}} + for (int i = 0; i < limit; i++) { + clang_analyzer_numTimesReached(); // expected-warning {{3}} + } + } +} + +void innerLoopOpaqueCondition(int arg) { + // In this test case the engine doesn't assume a second iteration within the + // inner loop (in the second iteration of the outer loop, when the limit is + // opaque) because `CoreEngine::getCompletedIterationCount()` is based on the + // `BlockCount` values queried from the `BlockCounter` which count _all_ + // evaluations of a given `CFGBlock` (in our case, the loop condition) and + // not just the evaluations within the current iteration of the outer loop. + // FIXME: This inaccurate iteration count could in theory cause some false + // negatives, although I think this would be unusual in practice, as the + // small default value of `-analyzer-max-loop` means that this is only + // relevant if the analyzer can deduce that the inner loop performs 0 or 1 + // iterations within the first iteration of the outer loop (and then the + // condition of the inner loop is opaque within the second iteration of the + // outer loop). + for (int outer = 0; outer < 2; outer++) { + int limit = 0; + if (outer) + limit = arg; + clang_analyzer_dump(limit); // expected-warning {{0}} expected-warning {{reg_$}} + for (int i = 0; i < limit; i++) { + clang_analyzer_numTimesReached(); // expected-warning {{1}} + } + } +} + +void onlyLoopConditions(int arg) { + // This "don't assume third iteration" logic only examines the conditions of + // loop statements and does not affect the analysis of code that implements + // similar behavior with different language features like if + break, goto, + // recursive functions, ... + int i = 0; + while (1) { + clang_analyzer_numTimesReached(); // expected-warning {{4}} + + // This is not a loop condition. 
+ if (i++ > arg) + break; + } + + clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}} +} diff --git a/clang/test/Analysis/loop-unrolling.cpp b/clang/test/Analysis/loop-unrolling.cpp index 66a828abfb513..bf05a7739ce48 100644 --- a/clang/test/Analysis/loop-unrolling.cpp +++ b/clang/test/Analysis/loop-unrolling.cpp @@ -63,7 +63,7 @@ int simple_no_unroll1() { int a[9]; int k = 42; for (int i = 0; i < 9; i++) { - clang_analyzer_numTimesReached(); // expected-warning {{4}} + clang_analyzer_numTimesReached(); // expected-warning {{2}} a[i] = 42; foo(i); } @@ -76,7 +76,7 @@ int simple_no_unroll2() { int k = 42; int i; for (i = 0; i < 9; i++) { - clang_analyzer_numTimesReached(); // expected-warning {{4}} + clang_analyzer_numTimesReached(); // expected-warning {{2}} a[i] = 42; i += getNum(); } @@ -309,9 +309,9 @@ int nested_inner_unrolled() { int k = 42; int j = 0; for (int i = 0; i < getNum(); i++) { - clang_analyzer_numTimesReached(); // expected-warning {{4}} + clang_analyzer_numTimesReached(); // expected-warning {{2}} for (j = 0; j < 8; ++j) { - clang_analyzer_numTimesReached(); // expected-warning {{32}} + clang_analyzer_numTimesReached(); // expected-warning {{16}} a[j] = 22; } a[i] = 42; @@ -346,11 +346,7 @@ int simple_known_bound_loop() { int simple_unknown_bound_loop() { for (int i = 2; i < getNum(); i++) { -#ifdef DFS - clang_analyzer_numTimesReached(); // expected-warning {{16}} -#else clang_analyzer_numTimesReached(); // expected-warning {{8}} -#endif } return 0; } @@ -368,11 +364,7 @@ int nested_inlined_unroll1() { int nested_inlined_no_unroll1() { int k; for (int i = 0; i < 9; i++) { -#ifdef DFS - clang_analyzer_numTimesReached(); // expected-warning {{18}} -#else - clang_analyzer_numTimesReached(); // expected-warning {{14}} -#endif + clang_analyzer_numTimesReached(); // expected-warning {{10}} k = simple_unknown_bound_loop(); // reevaluation without inlining, splits the state as well } int a = 22 / k; // no-warning @@ -475,9 +467,13 @@ int 
num_steps_over_limit2() { int num_steps_on_limit3() { for (int i = 0; i < getNum(); i++) { - clang_analyzer_numTimesReached(); // expected-warning {{4}} + clang_analyzer_numTimesReached(); // expected-warning {{2}} for (int j = 0; j < 32; j++) { - clang_analyzer_numTimesReached(); // expected-warning {{128}} + // Here the loop unrollig logic calculates with four potential iterations + // in the outer loop where it cannot determine the iteration count in + // advance; but after two loops the analyzer conservatively assumes that + // the (still opaque) loop condition is false. + clang_analyzer_numTimesReached(); // expected-warning {{64}} } } return 0; @@ -493,6 +489,15 @@ int num_steps_over_limit3() { return 0; } +int num_steps_on_limit4() { + for (int i = 0; i < 4; i++) { + clang_analyzer_numTimesReached(); // expected-warning {{4}} + for (int j = 0; j < 32; j++) { + clang_analyzer_numTimesReached(); // expected-warning {{128}} + } + } + return 0; +} void pr34943() { for (int i = 0; i < 6L; ++i) { diff --git a/clang/test/Analysis/misc-ps-region-store.m b/clang/test/Analysis/misc-ps-region-store.m index 668b5ffd7001a..a882e7eb0dc90 100644 --- a/clang/test/Analysis/misc-ps-region-store.m +++ b/clang/test/Analysis/misc-ps-region-store.m @@ -910,13 +910,13 @@ void pr6302(id x, Class y) { //===----------------------------------------------------------------------===// // Specially handle global variables that are declared constant. In the -// example below, this forces the loop to take exactly 2 iterations. +// example below, this forces the loop to take exactly 1 iteration. 
//===----------------------------------------------------------------------===// -const int pr6288_L_N = 2; +const int pr6288_L_N = 1; void pr6288_(void) { - int x[2]; - int *px[2]; + int x[1]; + int *px[1]; int i; for (i = 0; i < pr6288_L_N; i++) px[i] = &x[i]; @@ -924,8 +924,8 @@ void pr6288_(void) { } void pr6288_pos(int z) { - int x[2]; - int *px[2]; + int x[1]; + int *px[1]; int i; for (i = 0; i < z; i++) px[i] = &x[i]; // expected-warning{{Access out-of-bound array element (buffer overflow)}} @@ -933,15 +933,28 @@ void pr6288_pos(int z) { } void pr6288_b(void) { - const int L_N = 2; - int x[2]; - int *px[2]; + const int L_N = 1; + int x[1]; + int *px[1]; int i; for (i = 0; i < L_N; i++) px[i] = &x[i]; *(px[0]) = 0; // no-warning } +void pr6288_no_third_iter(int z) { + int x[2]; + int *px[2]; + int i; + // If the loop condition is opaque, we assume that there may be two + // iterations (becasuse otherwise the loop could be replaced by an if); but + // we do not assume that there may be a third iteration. Therefore, + // unlike 'pr6288_pos', this testcase does not produce an out-of-bounds error. + for (i = 0; i < z; i++) + px[i] = &x[i]; + *(px[0]) = 0; // expected-warning{{Dereference of undefined pointer value}} +} + // A bug in RemoveDeadBindings was causing instance variable bindings to get // prematurely pruned from the state. 
@interface Rdar7817800 { From 4a890c2c605640f48ecbaefebda8f3a42043ff3d Mon Sep 17 00:00:00 2001 From: klensy Date: Thu, 2 Jan 2025 18:18:20 +0300 Subject: [PATCH 283/567] [llvm][aarch64] fix copypaste typo (#120725) moved from #119881 --- .../lib/Target/AArch64/AArch64ISelLowering.cpp | 2 +- llvm/test/CodeGen/AArch64/vecreduce-add.ll | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 24e1ebd8421fb..070163a5fb297 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -18430,7 +18430,7 @@ static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG) { EVT VT = A.getValueType(); SDValue Op0 = A.getOperand(0); SDValue Op1 = A.getOperand(1); - if (Op0.getOpcode() != Op0.getOpcode() || + if (Op0.getOpcode() != Op1.getOpcode() || (Op0.getOpcode() != ISD::ZERO_EXTEND && Op0.getOpcode() != ISD::SIGN_EXTEND)) return SDValue(); diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll index 8473f45f6c803..5d6b523f1549a 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll @@ -72,6 +72,24 @@ entry: ret i64 %z } +define i64 @add_v4i32_v4i64_zsext(<4 x i32> %xi) { +; CHECK-LABEL: add_v4i32_v4i64_zsext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v1.2d, v0.2s, #0 +; CHECK-NEXT: saddw2 v0.2d, v1.2d, v0.4s +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %x = shufflevector <4 x i32> %xi, <4 x i32> %xi, <2 x i32> + %y = shufflevector <4 x i32> %xi, <4 x i32> %xi, <2 x i32> + %xx = zext <2 x i32> %x to <2 x i64> + %yy = sext <2 x i32> %y to <2 x i64> + %zz = add <2 x i64> %xx, %yy + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %zz) + ret i64 %z +} + define i64 @add_v2i32_v2i64_zext(<2 x i32> %x) { ; CHECK-LABEL: add_v2i32_v2i64_zext: ; 
CHECK: // %bb.0: // %entry From 62d0aff3eb934439acac47348e2385f0751a1444 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20G=C3=B3rny?= Date: Thu, 2 Jan 2025 15:43:45 +0000 Subject: [PATCH 284/567] [cmake] Extend zstd.dll finding logic from MSVC to Clang (#121437) Extend the special logic for finding `zstd.dll` in `Findzstd` to apply to all MSVC-compatible configurations such as Clang targeting MSVC. Fixes #121345 --- llvm/cmake/modules/Findzstd.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/cmake/modules/Findzstd.cmake b/llvm/cmake/modules/Findzstd.cmake index 86b6d48b6ec6b..f6ca5d1ebe546 100644 --- a/llvm/cmake/modules/Findzstd.cmake +++ b/llvm/cmake/modules/Findzstd.cmake @@ -10,7 +10,7 @@ # zstd::libzstd_shared # zstd::libzstd_static -if(MSVC) +if(MSVC OR "${CMAKE_CXX_SIMULATE_ID}" STREQUAL "MSVC") set(zstd_STATIC_LIBRARY_SUFFIX "_static\\${CMAKE_STATIC_LIBRARY_SUFFIX}$") else() set(zstd_STATIC_LIBRARY_SUFFIX "\\${CMAKE_STATIC_LIBRARY_SUFFIX}$") @@ -33,7 +33,7 @@ if(zstd_FOUND) set(zstd_STATIC_LIBRARY "${zstd_LIBRARY}") elseif (NOT TARGET zstd::libzstd_shared) add_library(zstd::libzstd_shared SHARED IMPORTED) - if(MSVC) + if(MSVC OR "${CMAKE_CXX_SIMULATE_ID}" STREQUAL "MSVC") include(GNUInstallDirs) # For CMAKE_INSTALL_LIBDIR and friends. # IMPORTED_LOCATION is the path to the DLL and IMPORTED_IMPLIB is the "library". get_filename_component(zstd_DIRNAME "${zstd_LIBRARY}" DIRECTORY) From 8ab88f11a12aaecb46f7b0eb5c13e7802258c1e1 Mon Sep 17 00:00:00 2001 From: Jonas Paulsson Date: Thu, 2 Jan 2025 17:03:33 +0100 Subject: [PATCH 285/567] [emacs] Add noext as an attribute in llvm-mode.el (#121444) The NoExt attribute was introduced with #100757, to exist alongside with signext and zeroext. This patch adds "noext" as an attribute to llvm-mode.el to get the proper highlighting of the keyword. 
--- llvm/utils/emacs/llvm-mode.el | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/emacs/llvm-mode.el b/llvm/utils/emacs/llvm-mode.el index dab37833ff63a..660d0718f098c 100644 --- a/llvm/utils/emacs/llvm-mode.el +++ b/llvm/utils/emacs/llvm-mode.el @@ -32,7 +32,7 @@ `(,(regexp-opt '("alwaysinline" "argmemonly" "allocsize" "builtin" "cold" "convergent" "dereferenceable" "dereferenceable_or_null" "hot" "immarg" "inaccessiblememonly" "inaccessiblemem_or_argmemonly" "inalloca" "inlinehint" "jumptable" "minsize" "mustprogress" "naked" "nobuiltin" "nonnull" "nocapture" - "nocallback" "nocf_check" "noduplicate" "nofree" "noimplicitfloat" "noinline" "nomerge" "nonlazybind" "noprofile" "noredzone" "noreturn" + "nocallback" "nocf_check" "noduplicate" "noext" "nofree" "noimplicitfloat" "noinline" "nomerge" "nonlazybind" "noprofile" "noredzone" "noreturn" "norecurse" "nosync" "noundef" "nounwind" "nosanitize_bounds" "nosanitize_coverage" "null_pointer_is_valid" "optdebug" "optforfuzzing" "optnone" "optsize" "preallocated" "readnone" "readonly" "returned" "returns_twice" "shadowcallstack" "signext" "speculatable" "speculative_load_hardening" "ssp" "sspreq" "sspstrong" "safestack" "sanitize_address" "sanitize_hwaddress" "sanitize_memtag" "sanitize_thread" "sanitize_memory" "strictfp" "swifterror" "uwtable" "vscale_range" "willreturn" "writeonly" "zeroext") 'symbols) . font-lock-constant-face) From 11e482c4a32be6a315e5bf2ae7599cf10eb84836 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 2 Jan 2025 23:04:44 +0700 Subject: [PATCH 286/567] RegAllocGreedy: Add dummy priority advisor for writing MIR tests (#121207) I regularly struggle reproducing failures in greedy due to changes in priority when resuming the allocation from MIR vs. a complete compilation starting at IR. That is, the fix in e0919b189bf2df4f97f22ba40260ab5153988b14 did not really fix the problem of the instruction distance mattering. 
Add a way to bypass all of the priority heuristics for MIR tests, by prioritizing only by virtual register number. Could also give this a more specific name, like PrioritizeLowVirtRegNumber --- llvm/lib/CodeGen/RegAllocGreedy.cpp | 6 +++ llvm/lib/CodeGen/RegAllocPriorityAdvisor.cpp | 35 +++++++++++- llvm/lib/CodeGen/RegAllocPriorityAdvisor.h | 14 ++++- .../dummy-regalloc-priority-advisor.mir | 54 +++++++++++++++++++ 4 files changed, 107 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/dummy-regalloc-priority-advisor.mir diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index 4fa2bc76b38b4..95a7801c372f7 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -376,6 +376,12 @@ unsigned DefaultPriorityAdvisor::getPriority(const LiveInterval &LI) const { return Prio; } +unsigned DummyPriorityAdvisor::getPriority(const LiveInterval &LI) const { + // Prioritize by virtual register number, lowest first. 
+ Register Reg = LI.reg(); + return ~Reg.virtRegIndex(); +} + const LiveInterval *RAGreedy::dequeue() { return dequeue(Queue); } const LiveInterval *RAGreedy::dequeue(PQueue &CurQueue) { diff --git a/llvm/lib/CodeGen/RegAllocPriorityAdvisor.cpp b/llvm/lib/CodeGen/RegAllocPriorityAdvisor.cpp index 0650aaff56ea0..4525b8fc5a383 100644 --- a/llvm/lib/CodeGen/RegAllocPriorityAdvisor.cpp +++ b/llvm/lib/CodeGen/RegAllocPriorityAdvisor.cpp @@ -30,7 +30,10 @@ static cl::opt Mode( clEnumValN(RegAllocPriorityAdvisorAnalysis::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocPriorityAdvisorAnalysis::AdvisorMode::Development, - "development", "for training"))); + "development", "for training"), + clEnumValN( + RegAllocPriorityAdvisorAnalysis::AdvisorMode::Dummy, "dummy", + "prioritize low virtual register numbers for test and debug"))); char RegAllocPriorityAdvisorAnalysis::ID = 0; INITIALIZE_PASS(RegAllocPriorityAdvisorAnalysis, "regalloc-priority", @@ -67,6 +70,31 @@ class DefaultPriorityAdvisorAnalysis final } const bool NotAsRequested; }; + +class DummyPriorityAdvisorAnalysis final + : public RegAllocPriorityAdvisorAnalysis { +public: + DummyPriorityAdvisorAnalysis() + : RegAllocPriorityAdvisorAnalysis(AdvisorMode::Dummy) {} + + // support for isa<> and dyn_cast. 
+ static bool classof(const RegAllocPriorityAdvisorAnalysis *R) { + return R->getAdvisorMode() == AdvisorMode::Dummy; + } + +private: + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + RegAllocPriorityAdvisorAnalysis::getAnalysisUsage(AU); + } + + std::unique_ptr + getAdvisor(const MachineFunction &MF, const RAGreedy &RA) override { + return std::make_unique( + MF, RA, &getAnalysis().getSI()); + } +}; + } // namespace template <> Pass *llvm::callDefaultCtor() { @@ -75,6 +103,9 @@ template <> Pass *llvm::callDefaultCtor() { case RegAllocPriorityAdvisorAnalysis::AdvisorMode::Default: Ret = new DefaultPriorityAdvisorAnalysis(/*NotAsRequested*/ false); break; + case RegAllocPriorityAdvisorAnalysis::AdvisorMode::Dummy: + Ret = new DummyPriorityAdvisorAnalysis(); + break; case RegAllocPriorityAdvisorAnalysis::AdvisorMode::Development: #if defined(LLVM_HAVE_TFLITE) Ret = createDevelopmentModePriorityAdvisor(); @@ -97,6 +128,8 @@ StringRef RegAllocPriorityAdvisorAnalysis::getPassName() const { return "Release mode Regalloc Priority Advisor"; case AdvisorMode::Development: return "Development mode Regalloc Priority Advisor"; + case AdvisorMode::Dummy: + return "Dummy Regalloc Priority Advisor"; } llvm_unreachable("Unknown advisor kind"); } diff --git a/llvm/lib/CodeGen/RegAllocPriorityAdvisor.h b/llvm/lib/CodeGen/RegAllocPriorityAdvisor.h index 1e9fa967214cc..32e4598b71539 100644 --- a/llvm/lib/CodeGen/RegAllocPriorityAdvisor.h +++ b/llvm/lib/CodeGen/RegAllocPriorityAdvisor.h @@ -56,9 +56,21 @@ class DefaultPriorityAdvisor : public RegAllocPriorityAdvisor { unsigned getPriority(const LiveInterval &LI) const override; }; +/// Stupid priority advisor which just enqueues in virtual register number +/// order, for debug purposes only. 
+class DummyPriorityAdvisor : public RegAllocPriorityAdvisor { +public: + DummyPriorityAdvisor(const MachineFunction &MF, const RAGreedy &RA, + SlotIndexes *const Indexes) + : RegAllocPriorityAdvisor(MF, RA, Indexes) {} + +private: + unsigned getPriority(const LiveInterval &LI) const override; +}; + class RegAllocPriorityAdvisorAnalysis : public ImmutablePass { public: - enum class AdvisorMode : int { Default, Release, Development }; + enum class AdvisorMode : int { Default, Release, Development, Dummy }; RegAllocPriorityAdvisorAnalysis(AdvisorMode Mode) : ImmutablePass(ID), Mode(Mode){}; diff --git a/llvm/test/CodeGen/AMDGPU/dummy-regalloc-priority-advisor.mir b/llvm/test/CodeGen/AMDGPU/dummy-regalloc-priority-advisor.mir new file mode 100644 index 0000000000000..5c7c07632f0d5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/dummy-regalloc-priority-advisor.mir @@ -0,0 +1,54 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -start-before=greedy,2 -stress-regalloc=4 -stop-after=virtregrewriter,2 -regalloc-enable-priority-advisor=default -o - %s | FileCheck -check-prefixes=CHECK,DEFAULT %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -start-before=greedy,2 -stress-regalloc=4 -stop-after=virtregrewriter,2 -regalloc-enable-priority-advisor=dummy -o - %s | FileCheck -check-prefixes=CHECK,DUMMY %s + +# Check that the regalloc-enable-priority-advisor=dummy option works +# and the result is different from the default. 
Ordinarily %1 would be +# prioritized higher than %0 due to the register class priority + +--- +name: foo +tracksRegLiveness: true +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vreg_128 } + - { id: 2, class: vgpr_32 } +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; DEFAULT-LABEL: name: foo + ; DEFAULT: liveins: $vgpr0, $vgpr1 + ; DEFAULT-NEXT: {{ $}} + ; DEFAULT-NEXT: SI_SPILL_V128_SAVE $vgpr1_vgpr2_vgpr3_vgpr4, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5) + ; DEFAULT-NEXT: SI_SPILL_V32_SAVE $vgpr0, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) + ; DEFAULT-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 + ; DEFAULT-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) + ; DEFAULT-NEXT: renamable $vgpr3 = SI_SPILL_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) + ; DEFAULT-NEXT: renamable $vgpr3 = V_ADD_U32_e32 killed $vgpr2, killed $vgpr3, implicit $exec + ; DEFAULT-NEXT: SI_RETURN implicit $vgpr3, implicit $vgpr0, implicit $vgpr1 + ; + ; DUMMY-LABEL: name: foo + ; DUMMY: liveins: $vgpr0, $vgpr1 + ; DUMMY-NEXT: {{ $}} + ; DUMMY-NEXT: SI_SPILL_V128_SAVE $vgpr1_vgpr2_vgpr3_vgpr4, %stack.1, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.1, align 4, addrspace 5) + ; DUMMY-NEXT: SI_SPILL_V32_SAVE $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; DUMMY-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 + ; DUMMY-NEXT: renamable $vgpr2 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; DUMMY-NEXT: renamable $vgpr3_vgpr4_vgpr5_vgpr6 = SI_SPILL_V128_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: 
(load (s128) from %stack.1, align 4, addrspace 5) + ; DUMMY-NEXT: renamable $vgpr3 = V_ADD_U32_e32 killed $vgpr3, killed $vgpr2, implicit $exec + ; DUMMY-NEXT: SI_RETURN implicit $vgpr3, implicit $vgpr0, implicit $vgpr1 + undef %1.sub0:vreg_128 = COPY $vgpr1 + %0:vgpr_32 = COPY $vgpr0 + S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 + %2:vgpr_32 = V_ADD_U32_e32 %1.sub0, %0, implicit $exec + $vgpr3 = COPY %2 + SI_RETURN implicit $vgpr3, implicit $vgpr0, implicit $vgpr1 + +... + +# CHECK: {{.*}} From 40ac34c518985f4ff119d2e67a5a412cc951104a Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Thu, 2 Jan 2025 17:23:52 +0100 Subject: [PATCH 287/567] [libc++] Make __type_list variadic (#121117) This makes these lists signficiantly more readable. --- .../include/__type_traits/aligned_storage.h | 37 +++++++++---------- libcxx/include/__type_traits/make_signed.h | 22 +++++------ libcxx/include/__type_traits/make_unsigned.h | 22 +++++------ libcxx/include/__type_traits/type_list.h | 28 ++++++++------ 4 files changed, 53 insertions(+), 56 deletions(-) diff --git a/libcxx/include/__type_traits/aligned_storage.h b/libcxx/include/__type_traits/aligned_storage.h index 2e39afb7f8808..5cd1f587b988c 100644 --- a/libcxx/include/__type_traits/aligned_storage.h +++ b/libcxx/include/__type_traits/aligned_storage.h @@ -34,26 +34,23 @@ struct __struct_double4 { double __lx[4]; }; -// clang-format off -typedef __type_list<__align_type, - __type_list<__align_type, - __type_list<__align_type, - __type_list<__align_type, - __type_list<__align_type, - __type_list<__align_type, - __type_list<__align_type, - __type_list<__align_type<__struct_double>, - __type_list<__align_type<__struct_double4>, - __type_list<__align_type, - __nat - > > > > > > > > > > __all_types; -// clang-format on +using __all_types = + __type_list<__align_type, + __align_type, + __align_type, + __align_type, + __align_type, + __align_type, + __align_type, + __align_type<__struct_double>, + __align_type<__struct_double4>, + 
__align_type >; template struct __find_max_align; -template -struct __find_max_align<__type_list<_Hp, __nat>, _Len> : public integral_constant {}; +template +struct __find_max_align<__type_list<_Head>, _Len> : public integral_constant {}; template struct __select_align { @@ -65,9 +62,11 @@ struct __select_align { static const size_t value = _Len < __max ? __min : __max; }; -template -struct __find_max_align<__type_list<_Hp, _Tp>, _Len> - : public integral_constant::value>::value> {}; +template +struct __find_max_align<__type_list<_Head, _Tail...>, _Len> + : public integral_constant< + size_t, + __select_align<_Len, _Head::value, __find_max_align<__type_list<_Tail...>, _Len>::value>::value> {}; template ::value> struct _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_TEMPLATE_VIS aligned_storage { diff --git a/libcxx/include/__type_traits/make_signed.h b/libcxx/include/__type_traits/make_signed.h index 8070690b3a7a9..5c2739e674352 100644 --- a/libcxx/include/__type_traits/make_signed.h +++ b/libcxx/include/__type_traits/make_signed.h @@ -29,21 +29,17 @@ template using __make_signed_t = __make_signed(_Tp); #else -// clang-format off -typedef __type_list + , + __int128_t # endif - > > > > > __signed_types; -// clang-format on + >; template ::value || is_enum<_Tp>::value> struct __make_signed{}; diff --git a/libcxx/include/__type_traits/make_unsigned.h b/libcxx/include/__type_traits/make_unsigned.h index 562f7bab8a7fb..6c238685c2331 100644 --- a/libcxx/include/__type_traits/make_unsigned.h +++ b/libcxx/include/__type_traits/make_unsigned.h @@ -31,21 +31,17 @@ template using __make_unsigned_t = __make_unsigned(_Tp); #else -// clang-format off -typedef __type_list + , + __uint128_t # endif - > > > > > __unsigned_types; -// clang-format on + >; template ::value || is_enum<_Tp>::value> struct __make_unsigned{}; diff --git a/libcxx/include/__type_traits/type_list.h b/libcxx/include/__type_traits/type_list.h index b4898b36e2d90..34d78fc97c978 100644 --- 
a/libcxx/include/__type_traits/type_list.h +++ b/libcxx/include/__type_traits/type_list.h @@ -11,6 +11,7 @@ #include <__config> #include <__cstddef/size_t.h> +#include <__type_traits/enable_if.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -18,23 +19,28 @@ _LIBCPP_BEGIN_NAMESPACE_STD -template -struct __type_list { - typedef _Hp _Head; - typedef _Tp _Tail; +template +struct __type_list {}; + +template +struct __type_list_head; + +template +struct __type_list_head<__type_list<_Head, _Tail...> > { + using type _LIBCPP_NODEBUG = _Head; }; -template +template ::type)> struct __find_first; -template -struct __find_first<__type_list<_Hp, _Tp>, _Size, true> { - using type _LIBCPP_NODEBUG = _Hp; +template +struct __find_first<__type_list<_Head, _Tail...>, _Size, true> { + using type _LIBCPP_NODEBUG = _Head; }; -template -struct __find_first<__type_list<_Hp, _Tp>, _Size, false> { - using type _LIBCPP_NODEBUG = typename __find_first<_Tp, _Size>::type; +template +struct __find_first<__type_list<_Head, _Tail...>, _Size, false> { + using type _LIBCPP_NODEBUG = typename __find_first<__type_list<_Tail...>, _Size>::type; }; _LIBCPP_END_NAMESPACE_STD From 4075ddad7183e6f0b66e2c8cc7a03b461a8038e6 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Thu, 2 Jan 2025 17:30:48 +0100 Subject: [PATCH 288/567] [libc++] Run clang-tidy only once per header (#121436) There doesn't seem to be much of a reason to run clang-tidy twice per headers, and running it only once makes the test a few seconds faster. 
--- libcxx/.clang-tidy | 2 ++ libcxx/test/libcxx/clang_tidy.gen.py | 3 +-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/libcxx/.clang-tidy b/libcxx/.clang-tidy index f986e2100ca6b..ebbfab0379265 100644 --- a/libcxx/.clang-tidy +++ b/libcxx/.clang-tidy @@ -5,6 +5,8 @@ Checks: > bugprone-stringview-nullptr, bugprone-use-after-move, + libcpp-*, + llvm-include-order, llvm-namespace-comment, diff --git a/libcxx/test/libcxx/clang_tidy.gen.py b/libcxx/test/libcxx/clang_tidy.gen.py index 0db9c0d14b196..06f277e901d33 100644 --- a/libcxx/test/libcxx/clang_tidy.gen.py +++ b/libcxx/test/libcxx/clang_tidy.gen.py @@ -33,8 +33,7 @@ {lit_header_undeprecations.get(header, '')} // TODO: run clang-tidy with modules enabled once they are supported -// RUN: %{{clang-tidy}} %s --warnings-as-errors=* -header-filter=.* --checks='-*,libcpp-*' --load=%{{test-tools-dir}}/clang_tidy_checks/libcxx-tidy.plugin -- %{{compile_flags}} -fno-modules -// RUN: %{{clang-tidy}} %s --warnings-as-errors=* -header-filter=.* --config-file=%{{libcxx-dir}}/.clang-tidy -- -Wweak-vtables %{{compile_flags}} -fno-modules +// RUN: %{{clang-tidy}} %s --warnings-as-errors=* -header-filter=.* --config-file=%{{libcxx-dir}}/.clang-tidy --load=%{{test-tools-dir}}/clang_tidy_checks/libcxx-tidy.plugin -- -Wweak-vtables %{{compile_flags}} -fno-modules #include <{header}> """) From 7326e903d72ba390a6368ff3e9eb2ab2251a1b13 Mon Sep 17 00:00:00 2001 From: Brooks Davis Date: Thu, 2 Jan 2025 09:06:29 -0800 Subject: [PATCH 289/567] flang: fix backtrace build on FreeBSD (#120297) FreeBSD's libexecinfo defines backtrace with a size_t for the size argument and return type. This almost certainly doesn't make sense, but what's done is done so cast the output to allow compilation. 
Otherwise we get: .../flang/runtime/stop.cpp:165:13: error: non-constant-expression cannot be narrowed from type 'size_t' (aka 'unsigned long') to 'int' in initializer list [-Wc++11-narrowing] 165 | int nptrs{backtrace(buffer, MAX_CALL_STACK)}; | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ --- flang/runtime/stop.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flang/runtime/stop.cpp b/flang/runtime/stop.cpp index f8457e10566a2..a7be8a082e026 100644 --- a/flang/runtime/stop.cpp +++ b/flang/runtime/stop.cpp @@ -162,7 +162,7 @@ static void PrintBacktrace() { // TODO: Need to parse DWARF information to print function line numbers constexpr int MAX_CALL_STACK{999}; void *buffer[MAX_CALL_STACK]; - int nptrs{backtrace(buffer, MAX_CALL_STACK)}; + int nptrs{(int)backtrace(buffer, MAX_CALL_STACK)}; if (char **symbols{backtrace_symbols(buffer, nptrs)}) { for (int i = 0; i < nptrs; i++) { From cbff02b101c20ad6557d64c998d03dab5ee4aad7 Mon Sep 17 00:00:00 2001 From: hatoo Date: Fri, 3 Jan 2025 02:13:27 +0900 Subject: [PATCH 290/567] [mlir][emitc] Fix invalid syntax in example of emitc.return (#121112) A return type of `emitc.func` must be specified with `->` instead of `:`. I've verified the syntax using `mlir-translate --mlir-to-cpp`. --- mlir/include/mlir/Dialect/EmitC/IR/EmitC.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td index 729a573b71c97..744a0dc4770e6 100644 --- a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td +++ b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td @@ -727,7 +727,7 @@ def EmitC_ReturnOp : EmitC_Op<"return", [Pure, HasParent<"FuncOp">, Example: ```mlir - emitc.func @foo() : (i32) { + emitc.func @foo() -> (i32) { ... 
emitc.return %0 : i32 } From 5ed6229019de43df0ff4b3e73097781e0f1a6651 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 2 Jan 2025 12:46:54 +0000 Subject: [PATCH 291/567] [VectorCombine] Add scalarizeLoadExtract infinite loop test from #120984 regression scalarizeLoadExtract replaces instructions up the use list, which can result in the vectorcombine worklist adding users back to the worklist when they should really be erased first. --- .../X86/load-extractelement-scalarization.ll | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 llvm/test/Transforms/VectorCombine/X86/load-extractelement-scalarization.ll diff --git a/llvm/test/Transforms/VectorCombine/X86/load-extractelement-scalarization.ll b/llvm/test/Transforms/VectorCombine/X86/load-extractelement-scalarization.ll new file mode 100644 index 0000000000000..0acfeccb92ef7 --- /dev/null +++ b/llvm/test/Transforms/VectorCombine/X86/load-extractelement-scalarization.ll @@ -0,0 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s +; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s + +; infinite loop if we add the erased instructions to the work list in the wrong order. 
+define void @multiple_extract(ptr %p) { +; CHECK-LABEL: @multiple_extract( +; CHECK-NEXT: [[VP:%.*]] = load ptr, ptr [[P:%.*]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds <2 x i32>, ptr [[VP]], i32 0, i64 0 +; CHECK-NEXT: [[E0:%.*]] = load i32, ptr [[TMP1]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <2 x i32>, ptr [[VP]], i32 0, i64 1 +; CHECK-NEXT: [[E1:%.*]] = load i32, ptr [[TMP2]], align 4 +; CHECK-NEXT: store i32 [[E0]], ptr [[P]], align 4 +; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 4 +; CHECK-NEXT: store i32 [[E1]], ptr [[P1]], align 4 +; CHECK-NEXT: ret void +; + %vp = load ptr, ptr %p, align 8 + %v = load <2 x i32>, ptr %vp, align 16 + %e0 = extractelement <2 x i32> %v, i64 0 + %e1 = extractelement <2 x i32> %v, i64 1 + store i32 %e0, ptr %p, align 4 + %p1 = getelementptr inbounds nuw i8, ptr %p, i64 4 + store i32 %e1, ptr %p1, align 4 + ret void +} From f739aa4004165dc64d3a1f418d5ad3c84886f01a Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 2 Jan 2025 17:17:03 +0000 Subject: [PATCH 292/567] [VectorCombine] replaceValue - add "VC: Replacing" debug message to help the log show replacement for old/new. 
--- llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index dd109637552c4..8509a31766e35 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -128,6 +128,8 @@ class VectorCombine { bool shrinkType(Instruction &I); void replaceValue(Value &Old, Value &New) { + LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n'); + LLVM_DEBUG(dbgs() << " With: " << New << '\n'); Old.replaceAllUsesWith(&New); if (auto *NewI = dyn_cast(&New)) { New.takeName(&Old); From 1849244685bc42b07b1b14e3f62e15c535e74c39 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 2 Jan 2025 17:29:55 +0000 Subject: [PATCH 293/567] [CodeGen] Remove atEnd method from defusechain iterators (#120610) This was not used much and there are better ways of writing it. --- llvm/include/llvm/CodeGen/MachineRegisterInfo.h | 6 ------ llvm/lib/CodeGen/MachineRegisterInfo.cpp | 8 +++++--- llvm/lib/CodeGen/MachineTraceMetrics.cpp | 9 ++++----- llvm/lib/CodeGen/SwiftErrorValueTracking.cpp | 2 +- 4 files changed, 10 insertions(+), 15 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h index 5dc51aaed81c7..5ee3aef28a4fb 100644 --- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h @@ -1095,9 +1095,6 @@ class MachineRegisterInfo { return !operator==(x); } - /// atEnd - return true if this iterator is equal to reg_end() on the value. - bool atEnd() const { return Op == nullptr; } - // Iterator traversal: forward iteration only defusechain_iterator &operator++() { // Preincrement assert(Op && "Cannot increment end iterator!"); @@ -1203,9 +1200,6 @@ class MachineRegisterInfo { return !operator==(x); } - /// atEnd - return true if this iterator is equal to reg_end() on the value. 
- bool atEnd() const { return Op == nullptr; } - // Iterator traversal: forward iteration only defusechain_instr_iterator &operator++() { // Preincrement assert(Op && "Cannot increment end iterator!"); diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp index 6f636a161f500..394b99b85ddcc 100644 --- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp +++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp @@ -407,9 +407,11 @@ void MachineRegisterInfo::replaceRegWith(Register FromReg, Register ToReg) { MachineInstr *MachineRegisterInfo::getVRegDef(Register Reg) const { // Since we are in SSA form, we can use the first definition. def_instr_iterator I = def_instr_begin(Reg); - assert((I.atEnd() || std::next(I) == def_instr_end()) && - "getVRegDef assumes a single definition or no definition"); - return !I.atEnd() ? &*I : nullptr; + if (I == def_instr_end()) + return nullptr; + assert(std::next(I) == def_instr_end() && + "getVRegDef assumes at most one definition"); + return &*I; } /// getUniqueVRegDef - Return the unique machine instr that defines the diff --git a/llvm/lib/CodeGen/MachineTraceMetrics.cpp b/llvm/lib/CodeGen/MachineTraceMetrics.cpp index 6576f97bea25f..021c1a058c020 100644 --- a/llvm/lib/CodeGen/MachineTraceMetrics.cpp +++ b/llvm/lib/CodeGen/MachineTraceMetrics.cpp @@ -683,11 +683,10 @@ struct DataDep { DataDep(const MachineRegisterInfo *MRI, unsigned VirtReg, unsigned UseOp) : UseOp(UseOp) { assert(Register::isVirtualRegister(VirtReg)); - MachineRegisterInfo::def_iterator DefI = MRI->def_begin(VirtReg); - assert(!DefI.atEnd() && "Register has no defs"); - DefMI = DefI->getParent(); - DefOp = DefI.getOperandNo(); - assert((++DefI).atEnd() && "Register has multiple defs"); + MachineOperand *DefMO = MRI->getOneDef(VirtReg); + assert(DefMO && "Register does not have unique def"); + DefMI = DefMO->getParent(); + DefOp = DefMO->getOperandNo(); } }; diff --git a/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp 
b/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp index 74a94d6110f41..decffdc7dfe45 100644 --- a/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp +++ b/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp @@ -259,7 +259,7 @@ void SwiftErrorValueTracking::propagateVRegs() { for (const auto &Use : VRegUpwardsUse) { const MachineBasicBlock *UseBB = Use.first.first; Register VReg = Use.second; - if (!MRI.def_begin(VReg).atEnd()) + if (!MRI.def_empty(VReg)) continue; #ifdef EXPENSIVE_CHECKS From 5de7af4b9f05c7a9fb3775f45627b50aba47869b Mon Sep 17 00:00:00 2001 From: Jinsong Ji Date: Thu, 2 Jan 2025 12:32:42 -0500 Subject: [PATCH 294/567] [llvm][Support][Windows] Fix slash in path for remove_directories (#121448) Before 925471ed903dad871042d7ed0bab89ab6566a564 remove_directories supports path with slash (instead of backslash). The ILCreateFromPathW in new implementation requires backslash path, so the call to remove_directories will fail if the path contains slash. This is to normalize the path to make sure remove_directories still support path with slash as well. --- llvm/lib/Support/Windows/Path.inc | 4 +++- llvm/unittests/Support/Path.cpp | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Support/Windows/Path.inc b/llvm/lib/Support/Windows/Path.inc index 17db114caeb1e..5b311e7c475c5 100644 --- a/llvm/lib/Support/Windows/Path.inc +++ b/llvm/lib/Support/Windows/Path.inc @@ -1373,9 +1373,11 @@ std::error_code closeFile(file_t &F) { } std::error_code remove_directories(const Twine &path, bool IgnoreErrors) { + SmallString<128> NativePath; + llvm::sys::path::native(path, NativePath, path::Style::windows_backslash); // Convert to utf-16. 
SmallVector Path16; - std::error_code EC = widenPath(path, Path16); + std::error_code EC = widenPath(NativePath, Path16); if (EC && !IgnoreErrors) return EC; diff --git a/llvm/unittests/Support/Path.cpp b/llvm/unittests/Support/Path.cpp index 8dde2fb50160c..187f47d9cfe07 100644 --- a/llvm/unittests/Support/Path.cpp +++ b/llvm/unittests/Support/Path.cpp @@ -1326,6 +1326,9 @@ TEST_F(FileSystemTest, Remove) { ASSERT_NO_ERROR(fs::remove_directories("D:/footest")); + ASSERT_NO_ERROR(fs::remove_directories(Twine(BaseDir) + "/foo/bar/baz")); + ASSERT_FALSE(fs::exists(Twine(BaseDir) + "/foo/bar/baz")); + ASSERT_NO_ERROR(fs::remove_directories(BaseDir)); ASSERT_FALSE(fs::exists(BaseDir)); } From bca92b12588d63556b749b4627af0112cd2d05c6 Mon Sep 17 00:00:00 2001 From: Angus Lees Date: Fri, 3 Jan 2025 04:33:06 +1100 Subject: [PATCH 295/567] [bazel] Allow SupportTests to be built remotely and cached (#121375) `SupportTests` fails in the bazel macOS sandbox, because `FileSystemTest.permissions` expects to be able to modify file permissions on some otherwise protected files. Previously this test was marked `local` in bazel, which has additional undesirable effects such as skipping remote build and cache. Tighten the bazel tags to just `no-sandbox`. Note in particular, that this allows the test to build, execute, and cache remotely (if configured). Testing: - Verified this test fails (as expected) on macOS with no tags, and passes with `no-sandbox`. - Verified this test passes when executed remotely (using an Engflow RBE setup) with `no-sandbox`. 
--- utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel index 8a6950facbdf2..d576a9190d09b 100644 --- a/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel @@ -697,7 +697,7 @@ cc_test( ], linkstatic = 1, tags = [ - "local", # Not compatible with the sandbox on MacOS + "no-sandbox", # FileSystemTest.permissions not compatible with the sandbox on MacOS ], deps = [ "//llvm:AllTargetsCodeGens", From dd30aa83aa12e5b2b5e58cb72ec85070f725df34 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 2 Jan 2025 09:36:01 -0800 Subject: [PATCH 296/567] [RISCV][TTI] Simplify compound check for readability [nfc] (#121504) I misread this check earlier today on a review, so restructure it to be easier to quickly scan. --- llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 0abb270edcabc..909a64e974255 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -2558,8 +2558,10 @@ RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { TTI::MemCmpExpansionOptions Options; // TODO: Enable expansion when unaligned access is not supported after we fix // issues in ExpandMemcmp. 
- if (!(ST->enableUnalignedScalarMem() && - (ST->hasStdExtZbb() || ST->hasStdExtZbkb() || IsZeroCmp))) + if (!ST->enableUnalignedScalarMem()) + return Options; + + if (!ST->hasStdExtZbb() && !ST->hasStdExtZbkb() && !IsZeroCmp) return Options; Options.AllowOverlappingLoads = true; From 035e64c0ec02b237a266ebc672718037fdd53eb2 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 2 Jan 2025 18:18:55 +0000 Subject: [PATCH 297/567] [VectorCombine] eraseInstruction - ensure we reattempt to fold other users of an erased instruction's operands (REAPPLIED) As we're reducing the use count of the operands its more likely that they will now fold, as they were previously being prevented by a m_OneUse check, or the cost of retaining the extra instruction had been too high. This is necessary for some upcoming patches, although the only change so far is instruction ordering as it allows some SSE folds of 256/512-bit with 128-bit subvectors to occur earlier in foldShuffleToIdentity as the subvector concats are free. Reapplied with a fix for foldSingleElementStore/scalarizeLoadExtract which were replacing/removing memory operations - we need to ensure that the worklist is populated in the correct order so all users of the old memory operations are erased first, so there are no remaining users of the loads when its time to remove them as well. 
Pulled out of #120984 --- .../Transforms/Vectorize/VectorCombine.cpp | 19 +++++- .../VectorCombine/X86/concat-boolmasks.ll | 64 ++++++++++++++----- 2 files changed, 66 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 8509a31766e35..493ed95b1d22e 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -141,10 +141,17 @@ class VectorCombine { void eraseInstruction(Instruction &I) { LLVM_DEBUG(dbgs() << "VC: Erasing: " << I << '\n'); - for (Value *Op : I.operands()) - Worklist.pushValue(Op); + SmallVector Ops(I.operands()); Worklist.remove(&I); I.eraseFromParent(); + + // Push remaining users of the operands and then the operand itself - allows + // further folds that were hindered by OneUse limits. + for (Value *Op : Ops) + if (auto *OpI = dyn_cast(Op)) { + Worklist.pushUsersToWorkList(*OpI); + Worklist.pushValue(OpI); + } } }; } // namespace @@ -1337,6 +1344,10 @@ bool VectorCombine::foldSingleElementStore(Instruction &I) { MemoryLocation::get(SI), AA)) return false; + // Ensure we add the load back to the worklist BEFORE its users so they can + // erased in the correct order. + Worklist.push(Load); + if (ScalarizableIdx.isSafeWithFreeze()) ScalarizableIdx.freeze(Builder, *cast(Idx)); Value *GEP = Builder.CreateInBoundsGEP( @@ -1425,6 +1436,10 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { if (ScalarizedCost >= OriginalCost) return false; + // Ensure we add the load back to the worklist BEFORE its users so they can + // erased in the correct order. + Worklist.push(LI); + // Replace extracts with narrow scalar loads. 
for (User *U : LI->users()) { auto *EI = cast(U); diff --git a/llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll b/llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll index 057d9af314ba3..c3639baf8b650 100644 --- a/llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll +++ b/llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll @@ -80,13 +80,29 @@ define i64 @movmsk_i64_v8i32_v4i32(<4 x i32> %v0, <4 x i32> %v1) { } define i64 @movmsk_i64_v64i8_v16i8(<16 x i8> %v0, <16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) { -; CHECK-LABEL: @movmsk_i64_v64i8_v16i8( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[V3:%.*]], <16 x i8> [[V2:%.*]], <32 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP2]], <32 x i8> [[TMP1]], <64 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <64 x i8> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP4]] to i64 -; CHECK-NEXT: ret i64 [[OR]] +; SSE-LABEL: @movmsk_i64_v64i8_v16i8( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[V3:%.*]], <16 x i8> [[V2:%.*]], <32 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> [[TMP2]], <64 x i32> +; SSE-NEXT: [[TMP4:%.*]] = icmp slt <64 x i8> [[TMP3]], zeroinitializer +; SSE-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP4]] to i64 +; SSE-NEXT: ret i64 [[OR]] +; +; AVX2-LABEL: @movmsk_i64_v64i8_v16i8( +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[V3:%.*]], <16 x i8> [[V2:%.*]], <32 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP2]], <32 x i8> [[TMP1]], <64 x i32> +; AVX2-NEXT: [[TMP4:%.*]] = icmp slt <64 x i8> [[TMP3]], zeroinitializer +; AVX2-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP4]] to i64 +; 
AVX2-NEXT: ret i64 [[OR]] +; +; AVX512-LABEL: @movmsk_i64_v64i8_v16i8( +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[V3:%.*]], <16 x i8> [[V2:%.*]], <32 x i32> +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP2]], <32 x i8> [[TMP1]], <64 x i32> +; AVX512-NEXT: [[TMP4:%.*]] = icmp slt <64 x i8> [[TMP3]], zeroinitializer +; AVX512-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP4]] to i64 +; AVX512-NEXT: ret i64 [[OR]] ; %c0 = icmp slt <16 x i8> %v0, zeroinitializer %c1 = icmp slt <16 x i8> %v1, zeroinitializer @@ -110,14 +126,32 @@ define i64 @movmsk_i64_v64i8_v16i8(<16 x i8> %v0, <16 x i8> %v1, <16 x i8> %v2, } define i64 @movmsk_i64_v32i32_v4i32(<4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) { -; CHECK-LABEL: @movmsk_i64_v32i32_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V3:%.*]], <4 x i32> [[V2:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP1]], <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <16 x i32> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 -; CHECK-NEXT: [[OR:%.*]] = zext i16 [[TMP5]] to i64 -; CHECK-NEXT: ret i64 [[OR]] +; SSE-LABEL: @movmsk_i64_v32i32_v4i32( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V3:%.*]], <4 x i32> [[V2:%.*]], <8 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <16 x i32> +; SSE-NEXT: [[TMP4:%.*]] = icmp slt <16 x i32> [[TMP3]], zeroinitializer +; SSE-NEXT: [[TMP5:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 +; SSE-NEXT: [[OR:%.*]] = zext i16 [[TMP5]] to i64 +; SSE-NEXT: ret i64 [[OR]] +; +; AVX2-LABEL: @movmsk_i64_v32i32_v4i32( +; AVX2-NEXT: [[TMP1:%.*]] 
= shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V3:%.*]], <4 x i32> [[V2:%.*]], <8 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP1]], <16 x i32> +; AVX2-NEXT: [[TMP4:%.*]] = icmp slt <16 x i32> [[TMP3]], zeroinitializer +; AVX2-NEXT: [[TMP5:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 +; AVX2-NEXT: [[OR:%.*]] = zext i16 [[TMP5]] to i64 +; AVX2-NEXT: ret i64 [[OR]] +; +; AVX512-LABEL: @movmsk_i64_v32i32_v4i32( +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V3:%.*]], <4 x i32> [[V2:%.*]], <8 x i32> +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP1]], <16 x i32> +; AVX512-NEXT: [[TMP4:%.*]] = icmp slt <16 x i32> [[TMP3]], zeroinitializer +; AVX512-NEXT: [[TMP5:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 +; AVX512-NEXT: [[OR:%.*]] = zext i16 [[TMP5]] to i64 +; AVX512-NEXT: ret i64 [[OR]] ; %c0 = icmp slt <4 x i32> %v0, zeroinitializer %c1 = icmp slt <4 x i32> %v1, zeroinitializer From 5236e3dac59e16630a3730c84c2d3d65970a6db3 Mon Sep 17 00:00:00 2001 From: Dominik Adamski Date: Thu, 2 Jan 2025 20:18:55 +0100 Subject: [PATCH 298/567] [Flang][Alias analysis] Fix alias analysis for omp private allocatable item (#120243) Flang alias analysis crashes for omp private allocatable item. The issue is described here : https://github.com/llvm/llvm-project/issues/116954 . We know that private value can't alias with anything else unless it is POINTER or TARGET. That's why we can simplify alias analysis logic. 
--- .../lib/Optimizer/Analysis/AliasAnalysis.cpp | 36 ++++++------- ...lias-analysis-omp-private-allocatable.mlir | 50 +++++++++++++++++++ 2 files changed, 64 insertions(+), 22 deletions(-) create mode 100644 flang/test/Analysis/AliasAnalysis/alias-analysis-omp-private-allocatable.mlir diff --git a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp index 611f212269fb7..e33d8fa333e7a 100644 --- a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp +++ b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp @@ -505,30 +505,17 @@ getAttrsFromVariable(fir::FortranVariableOpInterface var) { } template -static Value getPrivateArg(omp::BlockArgOpenMPOpInterface &argIface, - OMPTypeOp &op, DeclTypeOp &declOp) { - Value privateArg; +static bool isPrivateArg(omp::BlockArgOpenMPOpInterface &argIface, + OMPTypeOp &op, DeclTypeOp &declOp) { if (!op.getPrivateSyms().has_value()) - return privateArg; + return false; for (auto [opSym, blockArg] : llvm::zip_equal(*op.getPrivateSyms(), argIface.getPrivateBlockArgs())) { if (blockArg == declOp.getMemref()) { - omp::PrivateClauseOp privateOp = - SymbolTable::lookupNearestSymbolFrom( - op, cast(opSym)); - privateOp.walk([&](omp::YieldOp yieldOp) { - // TODO Extend alias analysis if omp.yield points to - // block argument value - if (!yieldOp.getResults()[0].getDefiningOp()) - return; - llvm::TypeSwitch(yieldOp.getResults()[0].getDefiningOp()) - .template Case( - [&](auto declOp) { privateArg = declOp.getMemref(); }); - }); - return privateArg; + return true; } } - return privateArg; + return false; } AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, @@ -631,6 +618,7 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, breakFromLoop = true; }) .Case([&](auto op) { + bool isPrivateItem = false; if (omp::BlockArgOpenMPOpInterface argIface = dyn_cast(op->getParentOp())) { Value ompValArg; @@ -644,19 +632,18 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, 
omp::MapInfoOp mapInfo = llvm::cast(opArg.getDefiningOp()); ompValArg = mapInfo.getVarPtr(); - break; + return; } } // If given operation does not reflect mapping item, // check private clause - if (!ompValArg) - ompValArg = getPrivateArg(argIface, targetOp, op); + isPrivateItem = isPrivateArg(argIface, targetOp, op); }) .template Case( [&](auto privateOp) { - ompValArg = getPrivateArg(argIface, privateOp, op); + isPrivateItem = isPrivateArg(argIface, privateOp, op); }); if (ompValArg) { v = ompValArg; @@ -706,6 +693,11 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, } else { instantiationPoint = op; } + if (isPrivateItem) { + type = SourceKind::Allocate; + breakFromLoop = true; + return; + } // TODO: Look for the fortran attributes present on the operation // Track further through the operand v = op.getMemref(); diff --git a/flang/test/Analysis/AliasAnalysis/alias-analysis-omp-private-allocatable.mlir b/flang/test/Analysis/AliasAnalysis/alias-analysis-omp-private-allocatable.mlir new file mode 100644 index 0000000000000..5116622364fad --- /dev/null +++ b/flang/test/Analysis/AliasAnalysis/alias-analysis-omp-private-allocatable.mlir @@ -0,0 +1,50 @@ +// Use --mlir-disable-threading so that the AA queries are serialized +// as well as its diagnostic output. 
+// RUN: fir-opt %s -pass-pipeline='builtin.module(func.func(test-fir-alias-analysis))' -split-input-file --mlir-disable-threading 2>&1 | FileCheck %s + +// Fortran code before simplification: +// SUBROUTINE mysub(ns,ne) +// INTEGER :: n +// REAL(KIND=8), DIMENSION(:), allocatable :: ar1 +// real(kind=8), dimension(20) :: ar2 +// REAL(KIND=8), DIMENSION(20) :: d +// +//!$OMP parallel PRIVATE(ar1) +// d(1:1) = (/(DOT_PRODUCT(ar1(1:n), ar2(1:n)),n=1, 1)/) +//!$OMP END parallel +// END SUBROUTINE + +// CHECK-LABEL: Testing : "testPrivateAllocatable" +// CHECK: ar2#0 <-> ar1#0: NoAlias +// CHECK: ar2#1 <-> ar1#0: NoAlias +// CHECK: ar2#0 <-> ar1#1: NoAlias +// CHECK: ar2#1 <-> ar1#1: NoAlias + +omp.private {type = private} @_QFmysubEar1_private_ref_box_heap_Uxf64 : !fir.ref>>> alloc { +^bb0(%arg0: !fir.ref>>>): + %0 = fir.alloca !fir.box>> {bindc_name = "ar1", pinned, uniq_name = "_QFmysubEar1"} + %5:2 = hlfir.declare %0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFmysubEar1"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + omp.yield(%5#0 : !fir.ref>>>) +} dealloc { +^bb0(%arg0: !fir.ref>>>): + omp.yield +} +func.func @testPrivateAllocatable(%arg0: !fir.ref {fir.bindc_name = "ns"}, %arg1: !fir.ref {fir.bindc_name = "ne"}) { + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.alloca !fir.box>> {bindc_name = "ar1", uniq_name = "_QFmysubEar1"} + %2 = fir.zero_bits !fir.heap> + %c0 = arith.constant 0 : index + %3 = fir.shape %c0 : (index) -> !fir.shape<1> + %4 = fir.embox %2(%3) : (!fir.heap>, !fir.shape<1>) -> !fir.box>> + fir.store %4 to %1 : !fir.ref>>> + %5:2 = hlfir.declare %1 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFmysubEar1"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %c20 = arith.constant 20 : index + %6 = fir.alloca !fir.array<20xf64> {bindc_name = "ar2", uniq_name = "_QFmysubEar2"} + %7 = fir.shape %c20 : (index) -> !fir.shape<1> + %8:2 = hlfir.declare %6(%7) {uniq_name = "_QFmysubEar2", test.ptr="ar2" } : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, 
!fir.ref>) + omp.parallel private(@_QFmysubEar1_private_ref_box_heap_Uxf64 %5#0 -> %arg2 : !fir.ref>>>) { + %20:2 = hlfir.declare %arg2 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFmysubEar1", test.ptr = "ar1"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + omp.terminator + } + return +} From 5f5792aedb1f8088836ccd1c0a924c5e0bbf35db Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 2 Jan 2025 20:10:45 +0000 Subject: [PATCH 299/567] [VPlan] Use removeDeadRecipes in optimizeForVFAndUF (NFCI) Split off from https://github.com/llvm/llvm-project/pull/108378. --- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 89aab71905a29..8ac2bd5160c26 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -842,11 +842,11 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, VPInstruction::BranchOnCond, {Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))}, Term->getDebugLoc()); - SmallVector PossiblyDead(Term->operands()); Term->eraseFromParent(); - for (VPValue *Op : PossiblyDead) - recursivelyDeleteDeadRecipes(Op); ExitingVPBB->appendRecipe(BOC); + + VPlanTransforms::removeDeadRecipes(Plan); + Plan.setVF(BestVF); Plan.setUF(BestUF); // TODO: Further simplifications are possible From 3a423a10ff83684332195b5191b16f12c81985ba Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Thu, 2 Jan 2025 12:11:59 -0800 Subject: [PATCH 300/567] [MemProf][PGO] Prevent dropping of profile metadata during optimization (#121359) This patch fixes a couple of places where memprof-related metadata (!memprof and !callsite) were being dropped, and one place where PGO metadata (!prof) was being dropped. All were due to instances of combineMetadata() being invoked. 
That function drops all metadata not in the list provided by the client, and also drops any not in its switch statement. Memprof metadata needed a case in the combineMetadata switch statement. For now we simply keep the metadata of the instruction being kept, which doesn't retain all the profile information when two calls with memprof metadata are being combined, but at least retains some. For the memprof metadata being dropped during call CSE, add memprof and callsite metadata to the list of known ids in combineMetadataForCSE. Neither memprof nor regular prof metadata were in the list of known ids for the callsite in MemCpyOptimizer, which was added to combine AA metadata after optimization of byval arguments fed by memcpy instructions, and similar types of optimizations of memcpy uses. There is one other callsite of combineMetadata, but it is only invoked on load instructions, which do not carry these types of metadata. --- llvm/include/llvm/IR/Metadata.h | 2 + llvm/include/llvm/Transforms/Utils/Local.h | 5 ++ llvm/lib/Analysis/MemoryProfileInfo.cpp | 17 +++++++ .../Transforms/InstCombine/InstCombinePHI.cpp | 3 ++ .../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 12 +++-- llvm/lib/Transforms/Utils/Local.cpp | 17 ++++++- llvm/test/Transforms/MemCpyOpt/memcpy.ll | 18 +++++++ .../SimplifyCFG/merge-calls-memprof.ll | 51 +++++++++++++++++++ 8 files changed, 120 insertions(+), 5 deletions(-) create mode 100644 llvm/test/Transforms/SimplifyCFG/merge-calls-memprof.ll diff --git a/llvm/include/llvm/IR/Metadata.h b/llvm/include/llvm/IR/Metadata.h index 35580f3f38c61..df2384c5f6e69 100644 --- a/llvm/include/llvm/IR/Metadata.h +++ b/llvm/include/llvm/IR/Metadata.h @@ -1464,6 +1464,8 @@ class MDNode : public Metadata { static MDNode *getMergedProfMetadata(MDNode *A, MDNode *B, const Instruction *AInstr, const Instruction *BInstr); + static MDNode *getMergedMemProfMetadata(MDNode *A, MDNode *B); + static MDNode *getMergedCallsiteMetadata(MDNode *A, MDNode *B); }; /// Tuple of 
metadata. diff --git a/llvm/include/llvm/Transforms/Utils/Local.h b/llvm/include/llvm/Transforms/Utils/Local.h index bbf29e6f46b47..40c448593807b 100644 --- a/llvm/include/llvm/Transforms/Utils/Local.h +++ b/llvm/include/llvm/Transforms/Utils/Local.h @@ -412,6 +412,11 @@ Instruction *removeUnwindEdge(BasicBlock *BB, DomTreeUpdater *DTU = nullptr); bool removeUnreachableBlocks(Function &F, DomTreeUpdater *DTU = nullptr, MemorySSAUpdater *MSSAU = nullptr); +/// DO NOT CALL EXTERNALLY. +/// FIXME: https://github.com/llvm/llvm-project/issues/121495 +/// Once external callers of this function are removed, either inline into +/// combineMetadataForCSE, or internalize and remove KnownIDs parameter. +/// /// Combine the metadata of two instructions so that K can replace J. Some /// metadata kinds can only be kept if K does not move, meaning it dominated /// J in the original IR. diff --git a/llvm/lib/Analysis/MemoryProfileInfo.cpp b/llvm/lib/Analysis/MemoryProfileInfo.cpp index 1c3f589e84941..2f3c87a89f9f9 100644 --- a/llvm/lib/Analysis/MemoryProfileInfo.cpp +++ b/llvm/lib/Analysis/MemoryProfileInfo.cpp @@ -347,3 +347,20 @@ template <> uint64_t CallStack::back() const { return mdconst::dyn_extract(N->operands().back()) ->getZExtValue(); } + +MDNode *MDNode::getMergedMemProfMetadata(MDNode *A, MDNode *B) { + // TODO: Support more sophisticated merging, such as selecting the one with + // more bytes allocated, or implement support for carrying multiple allocation + // leaf contexts. For now, keep the first one. + if (A) + return A; + return B; +} + +MDNode *MDNode::getMergedCallsiteMetadata(MDNode *A, MDNode *B) { + // TODO: Support more sophisticated merging, which will require support for + // carrying multiple contexts. For now, keep the first one. 
+ if (A) + return A; + return B; +} diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp index 1fcf1c570adda..272a1942c3350 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -788,6 +788,9 @@ Instruction *InstCombinerImpl::foldPHIArgLoadIntoPHI(PHINode &PN) { BasicBlock *BB = std::get<0>(Incoming); Value *V = std::get<1>(Incoming); LoadInst *LI = cast(V); + // FIXME: https://github.com/llvm/llvm-project/issues/121495 + // Call combineMetadataForCSE instead, so that an explicit set of KnownIDs + // doesn't need to be maintained here. combineMetadata(NewLI, LI, KnownIDs, true); Value *NewInVal = LI->getOperand(0); if (NewInVal != InVal) diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index bb98b3d1c0725..5f7cb92d239bc 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -345,10 +345,14 @@ static bool writtenBetween(MemorySSA *MSSA, BatchAAResults &AA, static void combineAAMetadata(Instruction *ReplInst, Instruction *I) { // FIXME: MD_tbaa_struct and MD_mem_parallel_loop_access should also be // handled here, but combineMetadata doesn't support them yet - unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, - LLVMContext::MD_noalias, - LLVMContext::MD_invariant_group, - LLVMContext::MD_access_group}; + unsigned KnownIDs[] = { + LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, + LLVMContext::MD_noalias, LLVMContext::MD_invariant_group, + LLVMContext::MD_access_group, LLVMContext::MD_prof, + LLVMContext::MD_memprof, LLVMContext::MD_callsite}; + // FIXME: https://github.com/llvm/llvm-project/issues/121495 + // Use custom AA metadata combining handling instead of combineMetadata, which + // is meant for CSE and will drop any metadata not in the KnownIDs list. 
combineMetadata(ReplInst, I, KnownIDs, true); } diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index a3af96d5af026..1e4061cb0771e 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3308,6 +3308,9 @@ bool llvm::removeUnreachableBlocks(Function &F, DomTreeUpdater *DTU, return Changed; } +// FIXME: https://github.com/llvm/llvm-project/issues/121495 +// Once external callers of this function are removed, either inline into +// combineMetadataForCSE, or internalize and remove KnownIDs parameter. void llvm::combineMetadata(Instruction *K, const Instruction *J, ArrayRef KnownIDs, bool DoesKMove) { SmallVector, 4> Metadata; @@ -3320,6 +3323,10 @@ void llvm::combineMetadata(Instruction *K, const Instruction *J, switch (Kind) { default: + // FIXME: https://github.com/llvm/llvm-project/issues/121495 + // Change to removing only explicitly listed other metadata, and assert + // on unknown metadata, to avoid inadvertently dropping newly added + // metadata types. K->setMetadata(Kind, nullptr); // Remove unknown metadata break; case LLVMContext::MD_dbg: @@ -3379,6 +3386,12 @@ void llvm::combineMetadata(Instruction *K, const Instruction *J, K->setMetadata(Kind, MDNode::getMostGenericAlignmentOrDereferenceable(JMD, KMD)); break; + case LLVMContext::MD_memprof: + K->setMetadata(Kind, MDNode::getMergedMemProfMetadata(KMD, JMD)); + break; + case LLVMContext::MD_callsite: + K->setMetadata(Kind, MDNode::getMergedCallsiteMetadata(KMD, JMD)); + break; case LLVMContext::MD_preserve_access_index: // Preserve !preserve.access.index in K. 
break; @@ -3442,7 +3455,9 @@ void llvm::combineMetadataForCSE(Instruction *K, const Instruction *J, LLVMContext::MD_nontemporal, LLVMContext::MD_noundef, LLVMContext::MD_mmra, - LLVMContext::MD_noalias_addrspace}; + LLVMContext::MD_noalias_addrspace, + LLVMContext::MD_memprof, + LLVMContext::MD_callsite}; combineMetadata(K, J, KnownIDs, KDominatesJ); } diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy.ll b/llvm/test/Transforms/MemCpyOpt/memcpy.ll index 39b90adc74ef3..65d78f4199aa0 100644 --- a/llvm/test/Transforms/MemCpyOpt/memcpy.ll +++ b/llvm/test/Transforms/MemCpyOpt/memcpy.ll @@ -803,6 +803,19 @@ define void @byval_param_noalias_metadata(ptr align 4 byval(i32) %ptr) { ret void } +define void @byval_param_profile_metadata(ptr align 4 byval(i32) %ptr) { +; CHECK-LABEL: @byval_param_profile_metadata( +; CHECK-NEXT: store i32 1, ptr [[PTR2:%.*]], align 4 +; CHECK-NEXT: call void @f_byval(ptr byval(i32) align 4 [[PTR2]]), !prof [[PROF3:![0-9]+]], !memprof [[META4:![0-9]+]], !callsite [[META7:![0-9]+]] +; CHECK-NEXT: ret void +; + %tmp = alloca i32, align 4 + store i32 1, ptr %ptr + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %tmp, ptr align 4 %ptr, i64 4, i1 false) + call void @f_byval(ptr align 4 byval(i32) %tmp), !memprof !3, !callsite !6, !prof !7 + ret void +} + define void @memcpy_memory_none(ptr %p, ptr %p2, i64 %size) { ; CHECK-LABEL: @memcpy_memory_none( ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[P:%.*]], ptr [[P2:%.*]], i64 [[SIZE:%.*]], i1 false) #[[ATTR7:[0-9]+]] @@ -897,3 +910,8 @@ define void @memcpy_immut_escape_after(ptr align 4 noalias %val) { !0 = !{!0} !1 = !{!1, !0} !2 = !{!1} +!3 = !{!4} +!4 = !{!5, !"cold"} +!5 = !{i64 123, i64 456} +!6 = !{i64 123} +!7 = !{!"branch_weights", i32 10} diff --git a/llvm/test/Transforms/SimplifyCFG/merge-calls-memprof.ll b/llvm/test/Transforms/SimplifyCFG/merge-calls-memprof.ll new file mode 100644 index 0000000000000..10c6aeb26ba76 --- /dev/null +++ 
b/llvm/test/Transforms/SimplifyCFG/merge-calls-memprof.ll @@ -0,0 +1,51 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 + +;; Test to ensure that memprof related metadata is not dropped when +;; instructions are combined. Currently the metadata from the first instruction +;; is kept, which prevents full loss of profile context information. + +; RUN: opt < %s -passes=simplifycfg -S | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define dso_local noundef nonnull ptr @_Z4testb(i1 noundef zeroext %b) local_unnamed_addr #0 { +; CHECK-LABEL: define dso_local noundef nonnull ptr @_Z4testb( +; CHECK-SAME: i1 noundef zeroext [[B:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CALL:%.*]] = call noalias noundef nonnull dereferenceable(4) ptr @_Znwm(i64 noundef 4), !memprof [[META0:![0-9]+]], !callsite [[META3:![0-9]+]] +; CHECK-NEXT: ret ptr [[CALL]] +; +entry: + br i1 %b, label %if.then, label %if.else + +if.then: ; preds = %entry + %call = call noalias noundef nonnull dereferenceable(4) ptr @_Znwm(i64 noundef 4), !memprof !0, !callsite !3 + br label %if.end + +if.else: ; preds = %entry + %call1 = call noalias noundef nonnull dereferenceable(4) ptr @_Znwm(i64 noundef 4), !memprof !4, !callsite !7 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %x.0 = phi ptr [ %call, %if.then ], [ %call1, %if.else ] + ret ptr %x.0 +} + + +declare ptr @_Znwm(i64) nounwind readonly + +!0 = !{!1} +!1 = !{!2, !"notcold"} +!2 = !{i64 -852997907418798798, i64 -2101080423462424381, i64 5188446645037944434} +!3 = !{i64 -852997907418798798} +!4 = !{!5} +!5 = !{!6, !"cold"} +!6 = !{i64 123, i64 -2101080423462424381, i64 5188446645037944434} +!7 = !{i64 123} +;. 
+; CHECK: [[META0]] = !{[[META1:![0-9]+]]} +; CHECK: [[META1]] = !{[[META2:![0-9]+]], !"notcold"} +; CHECK: [[META2]] = !{i64 -852997907418798798, i64 -2101080423462424381, i64 5188446645037944434} +; CHECK: [[META3]] = !{i64 -852997907418798798} +;. From cd19f3f787b01481fd687834457686e16fffdbe6 Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Fri, 3 Jan 2025 05:18:33 +0900 Subject: [PATCH 301/567] [Driver][clang-linker-wrapper] Add initial support for OpenMP offloading to generic SPIR-V (#120145) This is the first of a series of patches to add support for OpenMP offloading to SPIR-V through liboffload with the first intended target being Intel GPUs. This patch implements the basic driver and `clang-linker-wrapper` work for JIT mode. There are still many missing pieces, so this is not yet usable. We introduce `spirv64-intel-unknown` as the only currently supported triple. The user-facing argument to enable offloading will be `-fopenmp -fopenmp-targets=spirv64-intel` Add a new `SPIRVOpenMPToolChain` toolchain based on the existing general SPIR-V toolchain which will call all the required SPIR-V tools (and eventually the SPIR-V backend) as well as add the corresponding device RTL as an argument to the linker. We can't get through the front end consistently yet, so it's difficult to add any LIT tests that execute any tools, but front end changes are planned very shortly, and then we can add those tests. 
--------- Signed-off-by: Sarnie, Nick --- clang/include/clang/Driver/Options.td | 2 + clang/lib/Driver/CMakeLists.txt | 1 + clang/lib/Driver/Driver.cpp | 12 ++-- clang/lib/Driver/ToolChains/CommonArgs.cpp | 9 ++- clang/lib/Driver/ToolChains/SPIRV.h | 2 +- clang/lib/Driver/ToolChains/SPIRVOpenMP.cpp | 34 ++++++++++ clang/lib/Driver/ToolChains/SPIRVOpenMP.h | 29 +++++++++ clang/lib/Frontend/CompilerInvocation.cpp | 1 + .../spirv-openmp/lib/libomptarget-spirv64.bc | 0 clang/test/Driver/spirv-openmp-toolchain.c | 64 +++++++++++++++++++ .../ClangLinkerWrapper.cpp | 5 +- 11 files changed, 149 insertions(+), 10 deletions(-) create mode 100644 clang/lib/Driver/ToolChains/SPIRVOpenMP.cpp create mode 100644 clang/lib/Driver/ToolChains/SPIRVOpenMP.h create mode 100644 clang/test/Driver/Inputs/spirv-openmp/lib/libomptarget-spirv64.bc create mode 100644 clang/test/Driver/spirv-openmp-toolchain.c diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index d922709db1778..523761f5e0d80 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1493,6 +1493,8 @@ def libomptarget_amdgcn_bc_path_EQ : Joined<["--"], "libomptarget-amdgcn-bc-path HelpText<"Path to libomptarget-amdgcn bitcode library">, Alias; def libomptarget_nvptx_bc_path_EQ : Joined<["--"], "libomptarget-nvptx-bc-path=">, Group, HelpText<"Path to libomptarget-nvptx bitcode library">; +def libomptarget_spirv_bc_path_EQ : Joined<["--"], "libomptarget-spirv-bc-path=">, Group, + HelpText<"Path to libomptarget-spirv bitcode library">; def dD : Flag<["-"], "dD">, Group, Visibility<[ClangOption, CC1Option]>, HelpText<"Print macro definitions in -E mode in addition to normal output">; def dI : Flag<["-"], "dI">, Group, Visibility<[ClangOption, CC1Option]>, diff --git a/clang/lib/Driver/CMakeLists.txt b/clang/lib/Driver/CMakeLists.txt index 4fd10bf671512..57d04c3fefa84 100644 --- a/clang/lib/Driver/CMakeLists.txt +++ b/clang/lib/Driver/CMakeLists.txt 
@@ -77,6 +77,7 @@ add_clang_library(clangDriver ToolChains/RISCVToolchain.cpp ToolChains/Solaris.cpp ToolChains/SPIRV.cpp + ToolChains/SPIRVOpenMP.cpp ToolChains/TCE.cpp ToolChains/UEFI.cpp ToolChains/VEToolchain.cpp diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index dc84c1b9d1cc4..bc5ce9f14ab69 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -43,6 +43,7 @@ #include "ToolChains/PS4CPU.h" #include "ToolChains/RISCVToolchain.h" #include "ToolChains/SPIRV.h" +#include "ToolChains/SPIRVOpenMP.h" #include "ToolChains/Solaris.h" #include "ToolChains/TCE.h" #include "ToolChains/UEFI.h" @@ -890,9 +891,9 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C, HostTC->getTriple()); // Attempt to deduce the offloading triple from the set of architectures. - // We can only correctly deduce NVPTX / AMDGPU triples currently. We need - // to temporarily create these toolchains so that we can access tools for - // inferring architectures. + // We can only correctly deduce NVPTX / AMDGPU triples currently. + // We need to temporarily create these toolchains so that we can access + // tools for inferring architectures. llvm::DenseSet Archs; if (NVPTXTriple) { auto TempTC = std::make_unique( @@ -962,7 +963,7 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C, const ToolChain *TC; // Device toolchains have to be selected differently. They pair host // and device in their implementation. 
- if (TT.isNVPTX() || TT.isAMDGCN()) { + if (TT.isNVPTX() || TT.isAMDGCN() || TT.isSPIRV()) { const ToolChain *HostTC = C.getSingleOffloadToolChain(); assert(HostTC && "Host toolchain should be always defined."); @@ -975,6 +976,9 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C, else if (TT.isAMDGCN()) DeviceTC = std::make_unique( *this, TT, *HostTC, C.getInputArgs()); + else if (TT.isSPIRV()) + DeviceTC = std::make_unique( + *this, TT, *HostTC, C.getInputArgs()); else assert(DeviceTC && "Device toolchain not defined."); } diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 8b9639061d543..60214c4d59cee 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -2839,10 +2839,13 @@ void tools::addOpenMPDeviceRTL(const Driver &D, LibraryPaths.emplace_back(LibPath); OptSpecifier LibomptargetBCPathOpt = - Triple.isAMDGCN() ? options::OPT_libomptarget_amdgpu_bc_path_EQ - : options::OPT_libomptarget_nvptx_bc_path_EQ; + Triple.isAMDGCN() ? options::OPT_libomptarget_amdgpu_bc_path_EQ + : Triple.isNVPTX() ? options::OPT_libomptarget_nvptx_bc_path_EQ + : options::OPT_libomptarget_spirv_bc_path_EQ; - StringRef ArchPrefix = Triple.isAMDGCN() ? "amdgpu" : "nvptx"; + StringRef ArchPrefix = Triple.isAMDGCN() ? "amdgpu" + : Triple.isNVPTX() ? 
"nvptx" + : "spirv64"; std::string LibOmpTargetName = ("libomptarget-" + ArchPrefix + ".bc").str(); // First check whether user specifies bc library diff --git a/clang/lib/Driver/ToolChains/SPIRV.h b/clang/lib/Driver/ToolChains/SPIRV.h index d59a8c76ed473..415f639bba3ec 100644 --- a/clang/lib/Driver/ToolChains/SPIRV.h +++ b/clang/lib/Driver/ToolChains/SPIRV.h @@ -52,7 +52,7 @@ class LLVM_LIBRARY_VISIBILITY Linker final : public Tool { namespace toolchains { -class LLVM_LIBRARY_VISIBILITY SPIRVToolChain final : public ToolChain { +class LLVM_LIBRARY_VISIBILITY SPIRVToolChain : public ToolChain { mutable std::unique_ptr Translator; public: diff --git a/clang/lib/Driver/ToolChains/SPIRVOpenMP.cpp b/clang/lib/Driver/ToolChains/SPIRVOpenMP.cpp new file mode 100644 index 0000000000000..1f27245e2839c --- /dev/null +++ b/clang/lib/Driver/ToolChains/SPIRVOpenMP.cpp @@ -0,0 +1,34 @@ +//==- SPIRVOpenMP.cpp - SPIR-V OpenMP Tool Implementations --------*- C++ -*==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//==------------------------------------------------------------------------==// +#include "SPIRVOpenMP.h" +#include "CommonArgs.h" + +using namespace clang::driver; +using namespace clang::driver::toolchains; +using namespace clang::driver::tools; +using namespace llvm::opt; + +namespace clang::driver::toolchains { +SPIRVOpenMPToolChain::SPIRVOpenMPToolChain(const Driver &D, + const llvm::Triple &Triple, + const ToolChain &HostToolchain, + const ArgList &Args) + : SPIRVToolChain(D, Triple, Args), HostTC(HostToolchain) {} + +void SPIRVOpenMPToolChain::addClangTargetOptions( + const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args, + Action::OffloadKind DeviceOffloadingKind) const { + + if (DeviceOffloadingKind != Action::OFK_OpenMP) + return; + + if (DriverArgs.hasArg(options::OPT_nogpulib)) + return; + addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args, "", getTriple(), HostTC); +} +} // namespace clang::driver::toolchains diff --git a/clang/lib/Driver/ToolChains/SPIRVOpenMP.h b/clang/lib/Driver/ToolChains/SPIRVOpenMP.h new file mode 100644 index 0000000000000..64404e2a28210 --- /dev/null +++ b/clang/lib/Driver/ToolChains/SPIRVOpenMP.h @@ -0,0 +1,29 @@ +//===--- SPIRVOpenMP.h - SPIR-V OpenMP Tool Implementations ------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_SPIRV_OPENMP_H +#define LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_SPIRV_OPENMP_H + +#include "SPIRV.h" +#include "clang/Driver/Tool.h" +#include "clang/Driver/ToolChain.h" + +namespace clang::driver::toolchains { +class LLVM_LIBRARY_VISIBILITY SPIRVOpenMPToolChain : public SPIRVToolChain { +public: + SPIRVOpenMPToolChain(const Driver &D, const llvm::Triple &Triple, + const ToolChain &HostTC, const llvm::opt::ArgList &Args); + + void addClangTargetOptions( + const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args, + Action::OffloadKind DeviceOffloadingKind) const override; + + const ToolChain &HostTC; +}; +} // namespace clang::driver::toolchains +#endif diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 348c56cc37da3..0ae6dce5dd40a 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -4263,6 +4263,7 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, if (TT.getArch() == llvm::Triple::UnknownArch || !(TT.getArch() == llvm::Triple::aarch64 || TT.isPPC() || + TT.getArch() == llvm::Triple::spirv64 || TT.getArch() == llvm::Triple::systemz || TT.getArch() == llvm::Triple::loongarch64 || TT.getArch() == llvm::Triple::nvptx || diff --git a/clang/test/Driver/Inputs/spirv-openmp/lib/libomptarget-spirv64.bc b/clang/test/Driver/Inputs/spirv-openmp/lib/libomptarget-spirv64.bc new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/spirv-openmp-toolchain.c b/clang/test/Driver/spirv-openmp-toolchain.c new file mode 100644 index 0000000000000..3eb1f22a03ed0 --- /dev/null +++ b/clang/test/Driver/spirv-openmp-toolchain.c @@ -0,0 +1,64 @@ +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=spirv64-intel 
\ +// RUN: --libomptarget-spirv-bc-path=%t/ -nogpulib %s 2>&1 \ +// RUN: | FileCheck %s + +// verify the tools invocations +// CHECK: "-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}}"-emit-llvm-bc"{{.*}}"-x" "c" +// CHECK: "-cc1" "-triple" "spirv64-intel" "-aux-triple" "x86_64-unknown-linux-gnu" +// CHECK: llvm-spirv{{.*}} +// CHECK: "-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}}"-emit-obj" +// CHECK: clang-linker-wrapper{{.*}} "-o" "a.out" + +// RUN: %clang -ccc-print-phases --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=spirv64-intel %s 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-PHASES %s + +// CHECK-PHASES: 0: input, "[[INPUT:.+]]", c, (host-openmp) +// CHECK-PHASES: 1: preprocessor, {0}, cpp-output, (host-openmp) +// CHECK-PHASES: 2: compiler, {1}, ir, (host-openmp) +// CHECK-PHASES: 3: input, "[[INPUT]]", c, (device-openmp) +// CHECK-PHASES: 4: preprocessor, {3}, cpp-output, (device-openmp) +// CHECK-PHASES: 5: compiler, {4}, ir, (device-openmp) +// CHECK-PHASES: 6: offload, "host-openmp (x86_64-unknown-linux-gnu)" {2}, "device-openmp (spirv64-intel)" {5}, ir +// CHECK-PHASES: 7: backend, {6}, assembler, (device-openmp) +// CHECK-PHASES: 8: assembler, {7}, object, (device-openmp) +// CHECK-PHASES: 9: offload, "device-openmp (spirv64-intel)" {8}, object +// CHECK-PHASES: 10: clang-offload-packager, {9}, image, (device-openmp) +// CHECK-PHASES: 11: offload, "host-openmp (x86_64-unknown-linux-gnu)" {2}, "device-openmp (x86_64-unknown-linux-gnu)" {10}, ir +// CHECK-PHASES: 12: backend, {11}, assembler, (host-openmp) +// CHECK-PHASES: 13: assembler, {12}, object, (host-openmp) +// CHECK-PHASES: 14: clang-linker-wrapper, {13}, image, (host-openmp) + +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -fopenmp -fopenmp-targets=spirv64-intel -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-BINDINGS +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -fopenmp -fopenmp-targets=spirv64-intel 
-nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-BINDINGS + +// CHECK-BINDINGS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[HOST_BC:.+]]" +// CHECK-BINDINGS: "spirv64-intel" - "clang", inputs: ["[[INPUT]]", "[[HOST_BC]]"], output: "[[DEVICE_TEMP_BC:.+]]" +// CHECK-BINDINGS: "spirv64-intel" - "SPIR-V::Translator", inputs: ["[[DEVICE_TEMP_BC]]"], output: "[[DEVICE_SPV:.+]]" +// CHECK-BINDINGS: "x86_64-unknown-linux-gnu" - "Offload::Packager", inputs: ["[[DEVICE_SPV]]"], output: "[[DEVICE_IMAGE:.+]]" +// CHECK-BINDINGS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[HOST_BC]]", "[[DEVICE_IMAGE]]"], output: "[[HOST_OBJ:.+]]" +// CHECK-BINDINGS: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[HOST_OBJ]]"], output: "a.out" + +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -save-temps -fopenmp -fopenmp-targets=spirv64-intel -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-BINDINGS-TEMPS +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -save-temps -fopenmp -fopenmp-targets=spirv64-intel %s 2>&1 | FileCheck %s --check-prefix=CHECK-BINDINGS-TEMPS +// CHECK-BINDINGS-TEMPS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[HOST_PP:.+]]" +// CHECK-BINDINGS-TEMPS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[HOST_PP]]"], output: "[[HOST_BC:.+]]" +// CHECK-BINDINGS-TEMPS: "spirv64-intel" - "clang", inputs: ["[[INPUT]]"], output: "[[DEVICE_PP:.+]]" +// CHECK-BINDINGS-TEMPS: "spirv64-intel" - "clang", inputs: ["[[DEVICE_PP]]", "[[HOST_BC]]"], output: "[[DEVICE_TEMP_BC:.+]]" +// CHECK-BINDINGS-TEMPS: "spirv64-intel" - "SPIR-V::Translator", inputs: ["[[DEVICE_TEMP_BC]]"], output: "[[DEVICE_ASM:.+]]" +// CHECK-BINDINGS-TEMPS: "spirv64-intel" - "SPIR-V::Translator", inputs: ["[[DEVICE_ASM]]"], output: "[[DEVICE_SPV:.+]]" +// CHECK-BINDINGS-TEMPS: "x86_64-unknown-linux-gnu" - "Offload::Packager", inputs: ["[[DEVICE_SPV]]"], output: 
"[[DEVICE_IMAGE:.+]]" +// CHECK-BINDINGS-TEMPS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[HOST_BC]]", "[[DEVICE_IMAGE]]"], output: "[[HOST_ASM:.+]]" +// CHECK-BINDINGS-TEMPS: "x86_64-unknown-linux-gnu" - "clang::as", inputs: ["[[HOST_ASM]]"], output: "[[HOST_OBJ:.+]]" +// CHECK-BINDINGS-TEMPS: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[HOST_OBJ]]"], output: "a.out" + +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -emit-llvm -S -fopenmp -fopenmp-targets=spirv64-intel -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-EMIT-LLVM-IR +// CHECK-EMIT-LLVM-IR: "-cc1" "-triple" "spirv64-intel"{{.*}}"-emit-llvm-bc" + +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=spirv64-intel \ +// RUN: --sysroot=%S/Inputs/spirv-openmp/ %s 2>&1 | FileCheck --check-prefix=CHECK-GPULIB %s +// CHECK-GPULIB: "-cc1" "-triple" "spirv64-intel"{{.*}}"-mlink-builtin-bitcode" "{{.*}}libomptarget-spirv64.bc" + +// RUN: not %clang -### --target=x86_64-unknown-linux-gnu -fopenmp --offload-arch=spirv64-intel \ +// RUN: --libomptarget-spirv-bc-path=%t/ -nogpulib %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-OFFLOAD-ARCH-ERROR +// CHECK-OFFLOAD-ARCH-ERROR: error: failed to deduce triple for target architecture 'spirv64-intel'; specify the triple using '-fopenmp-targets' and '-Xopenmp-target' instead diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index 4201f043944ed..9fba63b195bc1 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -504,14 +504,14 @@ Expected clang(ArrayRef InputFiles, const ArgList &Args) { {"-Xlinker", Args.MakeArgString("--plugin-opt=" + StringRef(Arg->getValue()))}); - if (!Triple.isNVPTX()) + if (!Triple.isNVPTX() && !Triple.isSPIRV()) CmdArgs.push_back("-Wl,--no-undefined"); for (StringRef InputFile : InputFiles) CmdArgs.push_back(InputFile); // If 
this is CPU offloading we copy the input libraries. - if (!Triple.isAMDGPU() && !Triple.isNVPTX()) { + if (!Triple.isAMDGPU() && !Triple.isNVPTX() && !Triple.isSPIRV()) { CmdArgs.push_back("-Wl,-Bsymbolic"); CmdArgs.push_back("-shared"); ArgStringList LinkerArgs; @@ -595,6 +595,7 @@ Expected linkDevice(ArrayRef InputFiles, case Triple::aarch64_be: case Triple::ppc64: case Triple::ppc64le: + case Triple::spirv64: case Triple::systemz: case Triple::loongarch64: return generic::clang(InputFiles, Args); From 4922350407127607a9e78fc6d19f3f6278b1e46b Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Thu, 2 Jan 2025 20:18:56 +0000 Subject: [PATCH 302/567] [gn build] Port cd19f3f787b0 --- llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn index 615c11b6b8d62..700c243864633 100644 --- a/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn @@ -94,6 +94,7 @@ static_library("Driver") { "ToolChains/PS4CPU.cpp", "ToolChains/RISCVToolchain.cpp", "ToolChains/SPIRV.cpp", + "ToolChains/SPIRVOpenMP.cpp", "ToolChains/Solaris.cpp", "ToolChains/TCE.cpp", "ToolChains/UEFI.cpp", From f03b100e93196ca1ecec20fde3fc48690b3dad7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=AE=87=E9=80=B8?= Date: Fri, 3 Jan 2025 07:08:58 +0900 Subject: [PATCH 303/567] [Cygwin] Fix global variable dll import (#121439) This PR is necessary for cygwin target of Rust. 
References: * https://github.com/rust-lang/llvm-project/commit/86657cc39f8e42ae73be810fb0703ddac0eeef94 * https://github.com/Berrysoft/llvm-project/commit/a807e9f077351d3c6a68f4abe74c94a039759a2e --- llvm/lib/Target/TargetMachine.cpp | 2 +- llvm/test/CodeGen/X86/mingw-refptr.ll | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp index c0985f3be91a5..d5365f3c04743 100644 --- a/llvm/lib/Target/TargetMachine.cpp +++ b/llvm/lib/Target/TargetMachine.cpp @@ -204,7 +204,7 @@ bool TargetMachine::shouldAssumeDSOLocal(const GlobalValue *GV) const { // don't assume the variables to be DSO local unless we actually know // that for sure. This only has to be done for variables; for functions // the linker can insert thunks for calling functions from another DLL. - if (TT.isWindowsGNUEnvironment() && GV->isDeclarationForLinker() && + if (TT.isOSCygMing() && GV->isDeclarationForLinker() && isa(GV)) return false; diff --git a/llvm/test/CodeGen/X86/mingw-refptr.ll b/llvm/test/CodeGen/X86/mingw-refptr.ll index 73f1a9880913c..82a90aba38654 100644 --- a/llvm/test/CodeGen/X86/mingw-refptr.ll +++ b/llvm/test/CodeGen/X86/mingw-refptr.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -mtriple=x86_64-w64-mingw32 | FileCheck %s -check-prefix=CHECK-X64 +; RUN: llc < %s -mtriple=x86_64-pc-cygwin | FileCheck %s -check-prefix=CHECK-X64 ; RUN: llc < %s -mtriple=i686-w64-mingw32 | FileCheck %s -check-prefix=CHECK-X86 ; RUN: llc < %s -mtriple=i686-w64-mingw32-none-elf | FileCheck %s -check-prefix=CHECK-X86-ELF From f6cb56902c6dcafede21eb6662910b6ff661fc0f Mon Sep 17 00:00:00 2001 From: Michael Kruse Date: Thu, 2 Jan 2025 23:22:20 +0100 Subject: [PATCH 304/567] [llvm-(min-)tblgen] Avoid redundant source compilation (#114494) All the sources of `llvm-min-tblgen` are also used for `llvm-tblgen`, with identical compilation flags. 
Reuse the object files of `llvm-min-tblgen` for `llvm-tblgen` by applying the usual source structure of an executable: One file per executable which named after the executable name containing the (in this case trivial) main function, which just calls the tblgen_main in TableGen.cpp. This should also clear up any confusion (including mine) of where each executable's main function is. While this slightly reduces build time, the main motivation is ccache. Using the hard_link option, building the object files for `llvm-tblgen` will result in a hard link to the same object file already used for `llvm-min-tblgen`. To signal the build system that the file is new, ccache will update the file's time stamp. Unfortunately, time stamps are shared between all hard-linked files s.t. this will indirectly also update the time stamps for the object files used for `llvm-tblgen`. At the next run, Ninja will recognize this time stamp discrepancy to the expected stamp recorded in `.ninja_log` and rebuild those object files for `llvm-min-tblgen`, which again will also update the stamp for the `llvm-tblgen`... . This is especially annoying for tablegen because it means Ninja will re-run all tablegenning in every build. I am using the hard_link option because it reduces the cost of having multiple build-trees of the LLVM sources and reduces the wear to the SSD they are stored on. 
--- .../{ => Basic}/ARMTargetDefEmitter.cpp | 0 .../utils/TableGen/{ => Basic}/Attributes.cpp | 0 llvm/utils/TableGen/Basic/CMakeLists.txt | 7 ++++++ .../TableGen/{ => Basic}/DirectiveEmitter.cpp | 0 .../TableGen/{ => Basic}/IntrinsicEmitter.cpp | 4 +-- .../{ => Basic}/RISCVTargetDefEmitter.cpp | 0 llvm/utils/TableGen/{ => Basic}/TableGen.cpp | 6 +++-- llvm/utils/TableGen/Basic/TableGen.h | 13 ++++++++++ llvm/utils/TableGen/{ => Basic}/VTEmitter.cpp | 0 llvm/utils/TableGen/CMakeLists.txt | 25 +++++++------------ llvm/utils/TableGen/llvm-min-tblgen.cpp | 18 +++++++++++++ llvm/utils/TableGen/llvm-tblgen.cpp | 18 +++++++++++++ 12 files changed, 71 insertions(+), 20 deletions(-) rename llvm/utils/TableGen/{ => Basic}/ARMTargetDefEmitter.cpp (100%) rename llvm/utils/TableGen/{ => Basic}/Attributes.cpp (100%) rename llvm/utils/TableGen/{ => Basic}/DirectiveEmitter.cpp (100%) rename llvm/utils/TableGen/{ => Basic}/IntrinsicEmitter.cpp (99%) rename llvm/utils/TableGen/{ => Basic}/RISCVTargetDefEmitter.cpp (100%) rename llvm/utils/TableGen/{ => Basic}/TableGen.cpp (94%) create mode 100644 llvm/utils/TableGen/Basic/TableGen.h rename llvm/utils/TableGen/{ => Basic}/VTEmitter.cpp (100%) create mode 100644 llvm/utils/TableGen/llvm-min-tblgen.cpp create mode 100644 llvm/utils/TableGen/llvm-tblgen.cpp diff --git a/llvm/utils/TableGen/ARMTargetDefEmitter.cpp b/llvm/utils/TableGen/Basic/ARMTargetDefEmitter.cpp similarity index 100% rename from llvm/utils/TableGen/ARMTargetDefEmitter.cpp rename to llvm/utils/TableGen/Basic/ARMTargetDefEmitter.cpp diff --git a/llvm/utils/TableGen/Attributes.cpp b/llvm/utils/TableGen/Basic/Attributes.cpp similarity index 100% rename from llvm/utils/TableGen/Attributes.cpp rename to llvm/utils/TableGen/Basic/Attributes.cpp diff --git a/llvm/utils/TableGen/Basic/CMakeLists.txt b/llvm/utils/TableGen/Basic/CMakeLists.txt index 41d737e8d418e..b058fba78eb05 100644 --- a/llvm/utils/TableGen/Basic/CMakeLists.txt +++ b/llvm/utils/TableGen/Basic/CMakeLists.txt 
@@ -9,8 +9,15 @@ set(LLVM_LINK_COMPONENTS ) add_llvm_library(LLVMTableGenBasic OBJECT EXCLUDE_FROM_ALL DISABLE_LLVM_LINK_LLVM_DYLIB + ARMTargetDefEmitter.cpp + Attributes.cpp CodeGenIntrinsics.cpp + DirectiveEmitter.cpp + IntrinsicEmitter.cpp + RISCVTargetDefEmitter.cpp SDNodeProperties.cpp + TableGen.cpp + VTEmitter.cpp ) # Users may include its headers as "Basic/*.h" diff --git a/llvm/utils/TableGen/DirectiveEmitter.cpp b/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp similarity index 100% rename from llvm/utils/TableGen/DirectiveEmitter.cpp rename to llvm/utils/TableGen/Basic/DirectiveEmitter.cpp diff --git a/llvm/utils/TableGen/IntrinsicEmitter.cpp b/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp similarity index 99% rename from llvm/utils/TableGen/IntrinsicEmitter.cpp rename to llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp index 093602c3da804..fc2b8908a35b8 100644 --- a/llvm/utils/TableGen/IntrinsicEmitter.cpp +++ b/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp @@ -10,8 +10,8 @@ // //===----------------------------------------------------------------------===// -#include "Basic/CodeGenIntrinsics.h" -#include "Basic/SequenceToOffsetTable.h" +#include "CodeGenIntrinsics.h" +#include "SequenceToOffsetTable.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" diff --git a/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp b/llvm/utils/TableGen/Basic/RISCVTargetDefEmitter.cpp similarity index 100% rename from llvm/utils/TableGen/RISCVTargetDefEmitter.cpp rename to llvm/utils/TableGen/Basic/RISCVTargetDefEmitter.cpp diff --git a/llvm/utils/TableGen/TableGen.cpp b/llvm/utils/TableGen/Basic/TableGen.cpp similarity index 94% rename from llvm/utils/TableGen/TableGen.cpp rename to llvm/utils/TableGen/Basic/TableGen.cpp index bea2a2e735dbe..80ac93f2b54fb 100644 --- a/llvm/utils/TableGen/TableGen.cpp +++ b/llvm/utils/TableGen/Basic/TableGen.cpp @@ -6,10 +6,12 @@ // 
//===----------------------------------------------------------------------===// // -// This file contains the main function for LLVM's TableGen. +// This file contains the global defintions (mostly command line parameters) +// shared between llvm-tblgen and llvm-min-tblgen. // //===----------------------------------------------------------------------===// +#include "TableGen.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/InitLLVM.h" @@ -74,7 +76,7 @@ static TableGen::Emitter::Opt X[] = { {"print-sets", printSets, "Print expanded sets for testing DAG exprs"}, }; -int main(int argc, char **argv) { +int tblgen_main(int argc, char **argv) { InitLLVM X(argc, argv); cl::ParseCommandLineOptions(argc, argv); diff --git a/llvm/utils/TableGen/Basic/TableGen.h b/llvm/utils/TableGen/Basic/TableGen.h new file mode 100644 index 0000000000000..630aea62fcf90 --- /dev/null +++ b/llvm/utils/TableGen/Basic/TableGen.h @@ -0,0 +1,13 @@ +//===- TableGen.h ---------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Shared entry point for llvm-tblgen and llvm-min-tblgen. 
+// +//===----------------------------------------------------------------------===// + +int tblgen_main(int argc, char **argv); diff --git a/llvm/utils/TableGen/VTEmitter.cpp b/llvm/utils/TableGen/Basic/VTEmitter.cpp similarity index 100% rename from llvm/utils/TableGen/VTEmitter.cpp rename to llvm/utils/TableGen/Basic/VTEmitter.cpp diff --git a/llvm/utils/TableGen/CMakeLists.txt b/llvm/utils/TableGen/CMakeLists.txt index ba1e4aa01b48d..e4b686803c976 100644 --- a/llvm/utils/TableGen/CMakeLists.txt +++ b/llvm/utils/TableGen/CMakeLists.txt @@ -5,20 +5,17 @@ add_subdirectory(Basic) # code needed by the backends. add_subdirectory(Common) -set(LLVM_LINK_COMPONENTS Support) - # llvm-min-tablegen only contains a subset of backends necessary to # build llvm/include. It must not depend on TableGenCommon, as # TableGenCommon depends on this already to generate things such as # ValueType definitions. +# Sources included in both, llvm-min-tblgen and llvm-tblgen, must be included +# into LLVMTableGenBasic to avoid redundant compilation and problems with build +# caches. +# At least one source file must be included directly to avoid CMake problems. +# E.g. CMake derives which linker to use from the types of sources added. 
add_tablegen(llvm-min-tblgen LLVM_HEADERS - TableGen.cpp - ARMTargetDefEmitter.cpp - Attributes.cpp - DirectiveEmitter.cpp - IntrinsicEmitter.cpp - RISCVTargetDefEmitter.cpp - VTEmitter.cpp + llvm-min-tblgen.cpp $ PARTIAL_SOURCES_INTENDED @@ -32,10 +29,8 @@ set(LLVM_LINK_COMPONENTS add_tablegen(llvm-tblgen LLVM DESTINATION "${LLVM_TOOLS_INSTALL_DIR}" EXPORT LLVM - ARMTargetDefEmitter.cpp AsmMatcherEmitter.cpp AsmWriterEmitter.cpp - Attributes.cpp CallingConvEmitter.cpp CodeEmitterGen.cpp CodeGenMapTable.cpp @@ -48,7 +43,6 @@ add_tablegen(llvm-tblgen LLVM DecoderEmitter.cpp DFAEmitter.cpp DFAPacketizerEmitter.cpp - DirectiveEmitter.cpp DisassemblerEmitter.cpp DXILEmitter.cpp ExegesisEmitter.cpp @@ -57,18 +51,15 @@ add_tablegen(llvm-tblgen LLVM GlobalISelEmitter.cpp InstrDocsEmitter.cpp InstrInfoEmitter.cpp - IntrinsicEmitter.cpp + llvm-tblgen.cpp MacroFusionPredicatorEmitter.cpp OptionParserEmitter.cpp OptionRSTEmitter.cpp PseudoLoweringEmitter.cpp RegisterBankEmitter.cpp RegisterInfoEmitter.cpp - RISCVTargetDefEmitter.cpp SearchableTableEmitter.cpp SubtargetEmitter.cpp - TableGen.cpp - VTEmitter.cpp WebAssemblyDisassemblerEmitter.cpp X86InstrMappingEmitter.cpp X86DisassemblerTables.cpp @@ -79,6 +70,8 @@ add_tablegen(llvm-tblgen LLVM $ $ + PARTIAL_SOURCES_INTENDED + DEPENDS intrinsics_gen # via llvm-min-tablegen ) diff --git a/llvm/utils/TableGen/llvm-min-tblgen.cpp b/llvm/utils/TableGen/llvm-min-tblgen.cpp new file mode 100644 index 0000000000000..79fce5c555f6e --- /dev/null +++ b/llvm/utils/TableGen/llvm-min-tblgen.cpp @@ -0,0 +1,18 @@ +//===- llvm-min-tblgen.cpp ------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the main function for LLVM's TableGen. +// +//===----------------------------------------------------------------------===// + +#include "Basic/TableGen.h" + +/// Command line parameters are shared between llvm-tblgen and llvm-min-tblgen. +/// The indirection to tblgen_main exists to ensure that the static variables +/// for the llvm::cl:: mechanism are linked into both executables. +int main(int argc, char **argv) { return tblgen_main(argc, argv); } diff --git a/llvm/utils/TableGen/llvm-tblgen.cpp b/llvm/utils/TableGen/llvm-tblgen.cpp new file mode 100644 index 0000000000000..a38382472a992 --- /dev/null +++ b/llvm/utils/TableGen/llvm-tblgen.cpp @@ -0,0 +1,18 @@ +//===- llvm-tblgen.cpp ----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the main function for LLVM's TableGen. +// +//===----------------------------------------------------------------------===// + +#include "Basic/TableGen.h" + +/// Command line parameters are shared between llvm-tblgen and llvm-min-tblgen. +/// The indirection to tblgen_main exists to ensure that the static variables +/// for the llvm::cl:: mechanism are linked into both executables. +int main(int argc, char **argv) { return tblgen_main(argc, argv); } From 06b6161d3fa9d69a07e9046dbdd2e230b257d948 Mon Sep 17 00:00:00 2001 From: Michael Kruse Date: Thu, 2 Jan 2025 23:27:57 +0100 Subject: [PATCH 305/567] Revert "[llvm-(min-)tblgen] Avoid redundant source compilation (#114494)" This reverts commit f6cb56902c6dcafede21eb6662910b6ff661fc0f. 
Buildbot failures such as https://lab.llvm.org/buildbot/#/builders/89/builds/13541: ``` /usr/bin/ld: utils/TableGen/Basic/CMakeFiles/obj.LLVMTableGenBasic.dir/ARMTargetDefEmitter.cpp.o: undefined reference to symbol '_ZN4llvm23EnableABIBreakingChecksE' /usr/bin/ld: /home/tcwg-buildbot/worker/flang-aarch64-libcxx/build/./lib/libLLVMSupport.so.20.0git: error adding symbols: DSO missing from command line ``` Going to investigate. --- .../{Basic => }/ARMTargetDefEmitter.cpp | 0 .../utils/TableGen/{Basic => }/Attributes.cpp | 0 llvm/utils/TableGen/Basic/CMakeLists.txt | 7 ------ llvm/utils/TableGen/Basic/TableGen.h | 13 ---------- llvm/utils/TableGen/CMakeLists.txt | 25 ++++++++++++------- .../TableGen/{Basic => }/DirectiveEmitter.cpp | 0 .../TableGen/{Basic => }/IntrinsicEmitter.cpp | 4 +-- .../{Basic => }/RISCVTargetDefEmitter.cpp | 0 llvm/utils/TableGen/{Basic => }/TableGen.cpp | 6 ++--- llvm/utils/TableGen/{Basic => }/VTEmitter.cpp | 0 llvm/utils/TableGen/llvm-min-tblgen.cpp | 18 ------------- llvm/utils/TableGen/llvm-tblgen.cpp | 18 ------------- 12 files changed, 20 insertions(+), 71 deletions(-) rename llvm/utils/TableGen/{Basic => }/ARMTargetDefEmitter.cpp (100%) rename llvm/utils/TableGen/{Basic => }/Attributes.cpp (100%) delete mode 100644 llvm/utils/TableGen/Basic/TableGen.h rename llvm/utils/TableGen/{Basic => }/DirectiveEmitter.cpp (100%) rename llvm/utils/TableGen/{Basic => }/IntrinsicEmitter.cpp (99%) rename llvm/utils/TableGen/{Basic => }/RISCVTargetDefEmitter.cpp (100%) rename llvm/utils/TableGen/{Basic => }/TableGen.cpp (94%) rename llvm/utils/TableGen/{Basic => }/VTEmitter.cpp (100%) delete mode 100644 llvm/utils/TableGen/llvm-min-tblgen.cpp delete mode 100644 llvm/utils/TableGen/llvm-tblgen.cpp diff --git a/llvm/utils/TableGen/Basic/ARMTargetDefEmitter.cpp b/llvm/utils/TableGen/ARMTargetDefEmitter.cpp similarity index 100% rename from llvm/utils/TableGen/Basic/ARMTargetDefEmitter.cpp rename to llvm/utils/TableGen/ARMTargetDefEmitter.cpp diff --git 
a/llvm/utils/TableGen/Basic/Attributes.cpp b/llvm/utils/TableGen/Attributes.cpp similarity index 100% rename from llvm/utils/TableGen/Basic/Attributes.cpp rename to llvm/utils/TableGen/Attributes.cpp diff --git a/llvm/utils/TableGen/Basic/CMakeLists.txt b/llvm/utils/TableGen/Basic/CMakeLists.txt index b058fba78eb05..41d737e8d418e 100644 --- a/llvm/utils/TableGen/Basic/CMakeLists.txt +++ b/llvm/utils/TableGen/Basic/CMakeLists.txt @@ -9,15 +9,8 @@ set(LLVM_LINK_COMPONENTS ) add_llvm_library(LLVMTableGenBasic OBJECT EXCLUDE_FROM_ALL DISABLE_LLVM_LINK_LLVM_DYLIB - ARMTargetDefEmitter.cpp - Attributes.cpp CodeGenIntrinsics.cpp - DirectiveEmitter.cpp - IntrinsicEmitter.cpp - RISCVTargetDefEmitter.cpp SDNodeProperties.cpp - TableGen.cpp - VTEmitter.cpp ) # Users may include its headers as "Basic/*.h" diff --git a/llvm/utils/TableGen/Basic/TableGen.h b/llvm/utils/TableGen/Basic/TableGen.h deleted file mode 100644 index 630aea62fcf90..0000000000000 --- a/llvm/utils/TableGen/Basic/TableGen.h +++ /dev/null @@ -1,13 +0,0 @@ -//===- TableGen.h ---------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Shared entry point for llvm-tblgen and llvm-min-tblgen. -// -//===----------------------------------------------------------------------===// - -int tblgen_main(int argc, char **argv); diff --git a/llvm/utils/TableGen/CMakeLists.txt b/llvm/utils/TableGen/CMakeLists.txt index e4b686803c976..ba1e4aa01b48d 100644 --- a/llvm/utils/TableGen/CMakeLists.txt +++ b/llvm/utils/TableGen/CMakeLists.txt @@ -5,17 +5,20 @@ add_subdirectory(Basic) # code needed by the backends. 
add_subdirectory(Common) +set(LLVM_LINK_COMPONENTS Support) + # llvm-min-tablegen only contains a subset of backends necessary to # build llvm/include. It must not depend on TableGenCommon, as # TableGenCommon depends on this already to generate things such as # ValueType definitions. -# Sources included in both, llvm-min-tblgen and llvm-tblgen, must be included -# into LLVMTableGenBasic to avoid redundant compilation and problems with build -# caches. -# At least one source file must be included directly to avoid CMake problems. -# E.g. CMake derives which linker to use from the types of sources added. add_tablegen(llvm-min-tblgen LLVM_HEADERS - llvm-min-tblgen.cpp + TableGen.cpp + ARMTargetDefEmitter.cpp + Attributes.cpp + DirectiveEmitter.cpp + IntrinsicEmitter.cpp + RISCVTargetDefEmitter.cpp + VTEmitter.cpp $ PARTIAL_SOURCES_INTENDED @@ -29,8 +32,10 @@ set(LLVM_LINK_COMPONENTS add_tablegen(llvm-tblgen LLVM DESTINATION "${LLVM_TOOLS_INSTALL_DIR}" EXPORT LLVM + ARMTargetDefEmitter.cpp AsmMatcherEmitter.cpp AsmWriterEmitter.cpp + Attributes.cpp CallingConvEmitter.cpp CodeEmitterGen.cpp CodeGenMapTable.cpp @@ -43,6 +48,7 @@ add_tablegen(llvm-tblgen LLVM DecoderEmitter.cpp DFAEmitter.cpp DFAPacketizerEmitter.cpp + DirectiveEmitter.cpp DisassemblerEmitter.cpp DXILEmitter.cpp ExegesisEmitter.cpp @@ -51,15 +57,18 @@ add_tablegen(llvm-tblgen LLVM GlobalISelEmitter.cpp InstrDocsEmitter.cpp InstrInfoEmitter.cpp - llvm-tblgen.cpp + IntrinsicEmitter.cpp MacroFusionPredicatorEmitter.cpp OptionParserEmitter.cpp OptionRSTEmitter.cpp PseudoLoweringEmitter.cpp RegisterBankEmitter.cpp RegisterInfoEmitter.cpp + RISCVTargetDefEmitter.cpp SearchableTableEmitter.cpp SubtargetEmitter.cpp + TableGen.cpp + VTEmitter.cpp WebAssemblyDisassemblerEmitter.cpp X86InstrMappingEmitter.cpp X86DisassemblerTables.cpp @@ -70,8 +79,6 @@ add_tablegen(llvm-tblgen LLVM $ $ - PARTIAL_SOURCES_INTENDED - DEPENDS intrinsics_gen # via llvm-min-tablegen ) diff --git 
a/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp b/llvm/utils/TableGen/DirectiveEmitter.cpp similarity index 100% rename from llvm/utils/TableGen/Basic/DirectiveEmitter.cpp rename to llvm/utils/TableGen/DirectiveEmitter.cpp diff --git a/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp b/llvm/utils/TableGen/IntrinsicEmitter.cpp similarity index 99% rename from llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp rename to llvm/utils/TableGen/IntrinsicEmitter.cpp index fc2b8908a35b8..093602c3da804 100644 --- a/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp +++ b/llvm/utils/TableGen/IntrinsicEmitter.cpp @@ -10,8 +10,8 @@ // //===----------------------------------------------------------------------===// -#include "CodeGenIntrinsics.h" -#include "SequenceToOffsetTable.h" +#include "Basic/CodeGenIntrinsics.h" +#include "Basic/SequenceToOffsetTable.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" diff --git a/llvm/utils/TableGen/Basic/RISCVTargetDefEmitter.cpp b/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp similarity index 100% rename from llvm/utils/TableGen/Basic/RISCVTargetDefEmitter.cpp rename to llvm/utils/TableGen/RISCVTargetDefEmitter.cpp diff --git a/llvm/utils/TableGen/Basic/TableGen.cpp b/llvm/utils/TableGen/TableGen.cpp similarity index 94% rename from llvm/utils/TableGen/Basic/TableGen.cpp rename to llvm/utils/TableGen/TableGen.cpp index 80ac93f2b54fb..bea2a2e735dbe 100644 --- a/llvm/utils/TableGen/Basic/TableGen.cpp +++ b/llvm/utils/TableGen/TableGen.cpp @@ -6,12 +6,10 @@ // //===----------------------------------------------------------------------===// // -// This file contains the global defintions (mostly command line parameters) -// shared between llvm-tblgen and llvm-min-tblgen. +// This file contains the main function for LLVM's TableGen. 
// //===----------------------------------------------------------------------===// -#include "TableGen.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/InitLLVM.h" @@ -76,7 +74,7 @@ static TableGen::Emitter::Opt X[] = { {"print-sets", printSets, "Print expanded sets for testing DAG exprs"}, }; -int tblgen_main(int argc, char **argv) { +int main(int argc, char **argv) { InitLLVM X(argc, argv); cl::ParseCommandLineOptions(argc, argv); diff --git a/llvm/utils/TableGen/Basic/VTEmitter.cpp b/llvm/utils/TableGen/VTEmitter.cpp similarity index 100% rename from llvm/utils/TableGen/Basic/VTEmitter.cpp rename to llvm/utils/TableGen/VTEmitter.cpp diff --git a/llvm/utils/TableGen/llvm-min-tblgen.cpp b/llvm/utils/TableGen/llvm-min-tblgen.cpp deleted file mode 100644 index 79fce5c555f6e..0000000000000 --- a/llvm/utils/TableGen/llvm-min-tblgen.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//===- llvm-min-tblgen.cpp ------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the main function for LLVM's TableGen. -// -//===----------------------------------------------------------------------===// - -#include "Basic/TableGen.h" - -/// Command line parameters are shared between llvm-tblgen and llvm-min-tblgen. -/// The indirection to tblgen_main exists to ensure that the static variables -/// for the llvm::cl:: mechanism are linked into both executables. 
-int main(int argc, char **argv) { return tblgen_main(argc, argv); } diff --git a/llvm/utils/TableGen/llvm-tblgen.cpp b/llvm/utils/TableGen/llvm-tblgen.cpp deleted file mode 100644 index a38382472a992..0000000000000 --- a/llvm/utils/TableGen/llvm-tblgen.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//===- llvm-tblgen.cpp ----------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the main function for LLVM's TableGen. -// -//===----------------------------------------------------------------------===// - -#include "Basic/TableGen.h" - -/// Command line parameters are shared between llvm-tblgen and llvm-min-tblgen. -/// The indirection to tblgen_main exists to ensure that the static variables -/// for the llvm::cl:: mechanism are linked into both executables. -int main(int argc, char **argv) { return tblgen_main(argc, argv); } From 4b577830033066cfd1b2acf4fcf39950678b27bd Mon Sep 17 00:00:00 2001 From: David CARLIER Date: Thu, 2 Jan 2025 22:30:39 +0000 Subject: [PATCH 306/567] [compiler-rt][rtsan] fopencookie support. 
(#120864) --- .../lib/rtsan/rtsan_interceptors_posix.cpp | 7 ++++++ .../tests/rtsan_test_interceptors_posix.cpp | 23 +++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp index 4e51f464b5730..072923ab35ae0 100644 --- a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp @@ -297,6 +297,12 @@ INTERCEPTOR(FILE *, fdopen, int fd, const char *mode) { return REAL(fdopen)(fd, mode); } +INTERCEPTOR(FILE *, fopencookie, void *cookie, const char *mode, + cookie_io_functions_t funcs) { + __rtsan_notify_intercepted_call("fopencookie"); + return REAL(fopencookie)(cookie, mode, funcs); +} + #if SANITIZER_INTERCEPT_OPEN_MEMSTREAM INTERCEPTOR(FILE *, open_memstream, char **buf, size_t *size) { __rtsan_notify_intercepted_call("open_memstream"); @@ -972,6 +978,7 @@ void __rtsan::InitializeInterceptors() { INTERCEPT_FUNCTION(fputs); INTERCEPT_FUNCTION(fdopen); INTERCEPT_FUNCTION(freopen); + INTERCEPT_FUNCTION(fopencookie); RTSAN_MAYBE_INTERCEPT_OPEN_MEMSTREAM; RTSAN_MAYBE_INTERCEPT_FMEMOPEN; INTERCEPT_FUNCTION(lseek); diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp index b052dd859dcdf..c9c4d7fc4e99e 100644 --- a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp @@ -353,6 +353,29 @@ TEST_F(RtsanFileTest, FopenDiesWhenRealtime) { ExpectNonRealtimeSurvival(Func); } +TEST_F(RtsanFileTest, FopenCookieDieWhenRealtime) { + FILE *f = fopen(GetTemporaryFilePath(), "w"); + EXPECT_THAT(f, Ne(nullptr)); + struct fholder { + FILE *fp; + size_t read; + } fh = {f, 0}; + auto CookieRead = [this](void *cookie, char *buf, size_t size) { + fholder *p = reinterpret_cast(cookie); + p->read = fread(static_cast(buf), 1, size, p->fp); + EXPECT_NE(0, p->read); + }; + 
cookie_io_functions_t funcs = {(cookie_read_function_t *)&CookieRead, nullptr, + nullptr, nullptr}; + auto Func = [&fh, &funcs]() { + FILE *f = fopencookie(&fh, "w", funcs); + EXPECT_THAT(f, Ne(nullptr)); + }; + + ExpectRealtimeDeath(Func, "fopencookie"); + ExpectNonRealtimeSurvival(Func); +} + #if SANITIZER_INTERCEPT_OPEN_MEMSTREAM TEST_F(RtsanFileTest, OpenMemstreamDiesWhenRealtime) { char *buffer; From c703b4645c79e889fd6a0f3f64f01f957d981aa4 Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Thu, 2 Jan 2025 14:40:15 -0800 Subject: [PATCH 307/567] [mlir][py] Enable loading only specified dialects during creation. (#121421) Gives option post as global list as well as arg to control which dialects are loaded during context creation. This enables setting either a good base set or skipping in individual cases. --- mlir/python/mlir/_mlir_libs/__init__.py | 42 +++++++++++++++++++++++-- mlir/python/mlir/ir.py | 6 +++- mlir/test/python/ir/dialects.py | 36 +++++++++++++++++++++ 3 files changed, 80 insertions(+), 4 deletions(-) diff --git a/mlir/python/mlir/_mlir_libs/__init__.py b/mlir/python/mlir/_mlir_libs/__init__.py index c5cb22c6dccb8..d021dde05dd87 100644 --- a/mlir/python/mlir/_mlir_libs/__init__.py +++ b/mlir/python/mlir/_mlir_libs/__init__.py @@ -58,6 +58,7 @@ def get_include_dirs() -> Sequence[str]: # needs. 
_dialect_registry = None +_load_on_create_dialects = None def get_dialect_registry(): @@ -71,6 +72,21 @@ def get_dialect_registry(): return _dialect_registry +def append_load_on_create_dialect(dialect: str): + global _load_on_create_dialects + if _load_on_create_dialects is None: + _load_on_create_dialects = [dialect] + else: + _load_on_create_dialects.append(dialect) + + +def get_load_on_create_dialects(): + global _load_on_create_dialects + if _load_on_create_dialects is None: + _load_on_create_dialects = [] + return _load_on_create_dialects + + def _site_initialize(): import importlib import itertools @@ -132,15 +148,35 @@ def process_initializer_module(module_name): break class Context(ir._BaseContext): - def __init__(self, *args, **kwargs): + def __init__(self, load_on_create_dialects=None, *args, **kwargs): super().__init__(*args, **kwargs) self.append_dialect_registry(get_dialect_registry()) for hook in post_init_hooks: hook(self) if not disable_multithreading: self.enable_multithreading(True) - if not disable_load_all_available_dialects: - self.load_all_available_dialects() + if load_on_create_dialects is not None: + logger.debug( + "Loading all dialects from load_on_create_dialects arg %r", + load_on_create_dialects, + ) + for dialect in load_on_create_dialects: + # This triggers loading the dialect into the context. + _ = self.dialects[dialect] + else: + if disable_load_all_available_dialects: + dialects = get_load_on_create_dialects() + if dialects: + logger.debug( + "Loading all dialects from global load_on_create_dialects %r", + dialects, + ) + for dialect in dialects: + # This triggers loading the dialect into the context. 
+ _ = self.dialects[dialect] + else: + logger.debug("Loading all available dialects") + self.load_all_available_dialects() if init_module: logger.debug( "Registering translations from initializer %r", init_module diff --git a/mlir/python/mlir/ir.py b/mlir/python/mlir/ir.py index 9a6ce462047ad..6f37266d5bf39 100644 --- a/mlir/python/mlir/ir.py +++ b/mlir/python/mlir/ir.py @@ -5,7 +5,11 @@ from ._mlir_libs._mlir.ir import * from ._mlir_libs._mlir.ir import _GlobalDebug from ._mlir_libs._mlir import register_type_caster, register_value_caster -from ._mlir_libs import get_dialect_registry +from ._mlir_libs import ( + get_dialect_registry, + append_load_on_create_dialect, + get_load_on_create_dialects, +) # Convenience decorator for registering user-friendly Attribute builders. diff --git a/mlir/test/python/ir/dialects.py b/mlir/test/python/ir/dialects.py index d59c6a6bc424e..5a2ed684d298b 100644 --- a/mlir/test/python/ir/dialects.py +++ b/mlir/test/python/ir/dialects.py @@ -121,3 +121,39 @@ def testAppendPrefixSearchPath(): sys.path.append(".") _cext.globals.append_dialect_search_prefix("custom_dialect") assert _cext.globals._check_dialect_module_loaded("custom") + + +# CHECK-LABEL: TEST: testDialectLoadOnCreate +@run +def testDialectLoadOnCreate(): + with Context(load_on_create_dialects=[]) as ctx: + ctx.emit_error_diagnostics = True + ctx.allow_unregistered_dialects = True + + def callback(d): + # CHECK: DIAGNOSTIC + # CHECK-SAME: op created with unregistered dialect + print(f"DIAGNOSTIC={d.message}") + return True + + handler = ctx.attach_diagnostic_handler(callback) + loc = Location.unknown(ctx) + try: + op = Operation.create("arith.addi", loc=loc) + ctx.allow_unregistered_dialects = False + op.verify() + except MLIRError as e: + pass + + with Context(load_on_create_dialects=["func"]) as ctx: + loc = Location.unknown(ctx) + fn = Operation.create("func.func", loc=loc) + + # TODO: This may require an update if a site wide policy is set. 
+ # CHECK: Load on create: [] + print(f"Load on create: {get_load_on_create_dialects()}") + append_load_on_create_dialect("func") + # CHECK: Load on create: + # CHECK-SAME: func + print(f"Load on create: {get_load_on_create_dialects()}") + print(get_load_on_create_dialects()) From 976f3a078bbac1889aa9e68e297f73f111a896d6 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 2 Jan 2025 15:05:52 -0800 Subject: [PATCH 308/567] [LLVM] Update RISCV maintainers (#121301) Add Philip Reames and myself as maintainers. I think between the two of us we do a lot of the patch reviews. --- llvm/Maintainers.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/llvm/Maintainers.md b/llvm/Maintainers.md index 6d0fda148ce87..fca00ca12f401 100644 --- a/llvm/Maintainers.md +++ b/llvm/Maintainers.md @@ -249,7 +249,11 @@ czhengsz@cn.ibm.com (email), [chenzheng1030](https://github.com/chenzheng1030) ( #### RISCV backend Alex Bradbury \ -asb@igalia.com (email), [asb](https://github.com/asb) (GitHub) +asb@igalia.com (email), [asb](https://github.com/asb) (GitHub) \ +Craig Topper \ +craig.topper@sifive.com (email), [topperc](https://github.com/topperc) (GitHub) \ +Philip Reames \ +listmail@philipreames.com (email), [preames](https://github.com/preames) (GitHub) #### Sparc backend From 2291d0aba927b885cf39150e59fde466a2524bb5 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Thu, 2 Jan 2025 16:28:55 -0800 Subject: [PATCH 309/567] [DAGCombiner] Turn `(neg (max x, (neg x)))` into `(min x, (neg x))` (#120666) This pattern was originally spotted in 429.mcf by @topperc. We already have a DAGCombiner pattern to turn `(neg (abs x))` into `(min x, (neg x))`. But in some cases `(neg (max x, (neg x)))` is formed by an expanded `abs` followed by a `neg` that is generated only after the `abs` expansion. This patch adds a separate pattern to match cases like this, as well as its inverse pattern: `(neg (min X, (neg X))) --> (max X, (neg X))`. 
This pattern is applicable to both signed and unsigned min/max. --- llvm/include/llvm/CodeGen/ISDOpcodes.h | 4 + llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 17 + .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 15 + llvm/test/CodeGen/RISCV/neg-abs.ll | 444 ++++++++++++++++++ llvm/test/CodeGen/RISCV/rvv/fixed-neg-abs.ll | 54 +++ 5 files changed, 534 insertions(+) create mode 100644 llvm/test/CodeGen/RISCV/rvv/fixed-neg-abs.ll diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index 69820aed2137b..604dc9419025b 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -1495,6 +1495,10 @@ inline bool isBitwiseLogicOp(unsigned Opcode) { return Opcode == ISD::AND || Opcode == ISD::OR || Opcode == ISD::XOR; } +/// Given a \p MinMaxOpc of ISD::(U|S)MIN or ISD::(U|S)MAX, returns +/// ISD::(U|S)MAX and ISD::(U|S)MIN, respectively. +NodeType getInverseMinMaxOpcode(unsigned MinMaxOpc); + /// Get underlying scalar opcode for VECREDUCE opcode. /// For example ISD::AND for ISD::VECREDUCE_AND. NodeType getVecReduceBaseOpcode(unsigned VecReduceOpcode); diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 6b2501591c81a..9ec3310b5219b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -3949,6 +3949,23 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { if (SDValue Result = TLI.expandABS(N1.getNode(), DAG, true)) return Result; + // Similar to the previous rule, but this time targeting an expanded abs. + // (sub 0, (max X, (sub 0, X))) --> (min X, (sub 0, X)) + // as well as + // (sub 0, (min X, (sub 0, X))) --> (max X, (sub 0, X)) + // Note that these two are applicable to both signed and unsigned min/max. 
+ SDValue X; + SDValue S0; + auto NegPat = m_AllOf(m_Neg(m_Deferred(X)), m_Value(S0)); + if (sd_match(N1, m_OneUse(m_AnyOf(m_SMax(m_Value(X), NegPat), + m_UMax(m_Value(X), NegPat), + m_SMin(m_Value(X), NegPat), + m_UMin(m_Value(X), NegPat))))) { + unsigned NewOpc = ISD::getInverseMinMaxOpcode(N1->getOpcode()); + if (hasOperation(NewOpc, VT)) + return DAG.getNode(NewOpc, DL, VT, X, S0); + } + // Fold neg(splat(neg(x)) -> splat(x) if (VT.isVector()) { SDValue N1S = DAG.getSplatValue(N1, true); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 10e8ba93359fb..0dfd0302ae543 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -430,6 +430,21 @@ bool ISD::matchBinaryPredicate( return true; } +ISD::NodeType ISD::getInverseMinMaxOpcode(unsigned MinMaxOpc) { + switch (MinMaxOpc) { + default: + llvm_unreachable("unrecognized opcode"); + case ISD::UMIN: + return ISD::UMAX; + case ISD::UMAX: + return ISD::UMIN; + case ISD::SMIN: + return ISD::SMAX; + case ISD::SMAX: + return ISD::SMIN; + } +} + ISD::NodeType ISD::getVecReduceBaseOpcode(unsigned VecReduceOpcode) { switch (VecReduceOpcode) { default: diff --git a/llvm/test/CodeGen/RISCV/neg-abs.ll b/llvm/test/CodeGen/RISCV/neg-abs.ll index 7d6a6d7ed4ce6..fe19a4fa8bbd8 100644 --- a/llvm/test/CodeGen/RISCV/neg-abs.ll +++ b/llvm/test/CodeGen/RISCV/neg-abs.ll @@ -258,3 +258,447 @@ define i64 @neg_abs64_multiuse(i64 %x, ptr %y) { %neg = sub nsw i64 0, %abs ret i64 %neg } + +define i32 @expanded_neg_abs32(i32 %x) { +; RV32I-LABEL: expanded_neg_abs32: +; RV32I: # %bb.0: +; RV32I-NEXT: neg a1, a0 +; RV32I-NEXT: blt a0, a1, .LBB6_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: .LBB6_2: +; RV32I-NEXT: neg a0, a1 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: expanded_neg_abs32: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: neg a1, a0 +; RV32ZBB-NEXT: min a0, a0, a1 +; RV32ZBB-NEXT: ret +; +; RV64I-LABEL: 
expanded_neg_abs32: +; RV64I: # %bb.0: +; RV64I-NEXT: sext.w a1, a0 +; RV64I-NEXT: negw a0, a0 +; RV64I-NEXT: blt a1, a0, .LBB6_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: .LBB6_2: +; RV64I-NEXT: negw a0, a0 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: expanded_neg_abs32: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: sext.w a1, a0 +; RV64ZBB-NEXT: negw a0, a0 +; RV64ZBB-NEXT: max a0, a0, a1 +; RV64ZBB-NEXT: negw a0, a0 +; RV64ZBB-NEXT: ret + %n = sub i32 0, %x + %t = call i32 @llvm.smax.i32(i32 %n, i32 %x) + %r = sub i32 0, %t + ret i32 %r +} + +define i32 @expanded_neg_abs32_unsigned(i32 %x) { +; RV32I-LABEL: expanded_neg_abs32_unsigned: +; RV32I: # %bb.0: +; RV32I-NEXT: neg a1, a0 +; RV32I-NEXT: bltu a0, a1, .LBB7_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: .LBB7_2: +; RV32I-NEXT: neg a0, a1 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: expanded_neg_abs32_unsigned: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: neg a1, a0 +; RV32ZBB-NEXT: minu a0, a0, a1 +; RV32ZBB-NEXT: ret +; +; RV64I-LABEL: expanded_neg_abs32_unsigned: +; RV64I: # %bb.0: +; RV64I-NEXT: sext.w a1, a0 +; RV64I-NEXT: negw a0, a0 +; RV64I-NEXT: bltu a1, a0, .LBB7_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: .LBB7_2: +; RV64I-NEXT: negw a0, a0 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: expanded_neg_abs32_unsigned: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: sext.w a1, a0 +; RV64ZBB-NEXT: negw a0, a0 +; RV64ZBB-NEXT: maxu a0, a0, a1 +; RV64ZBB-NEXT: negw a0, a0 +; RV64ZBB-NEXT: ret + %n = sub i32 0, %x + %t = call i32 @llvm.umax.i32(i32 %n, i32 %x) + %r = sub i32 0, %t + ret i32 %r +} + +define i64 @expanded_neg_abs64(i64 %x) { +; RV32I-LABEL: expanded_neg_abs64: +; RV32I: # %bb.0: +; RV32I-NEXT: snez a2, a0 +; RV32I-NEXT: neg a3, a1 +; RV32I-NEXT: sub a2, a3, a2 +; RV32I-NEXT: neg a3, a0 +; RV32I-NEXT: beq a2, a1, .LBB8_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: slt a4, a1, a2 +; RV32I-NEXT: beqz a4, .LBB8_3 +; RV32I-NEXT: j .LBB8_4 +; RV32I-NEXT: .LBB8_2: +; RV32I-NEXT: 
sltu a4, a0, a3 +; RV32I-NEXT: bnez a4, .LBB8_4 +; RV32I-NEXT: .LBB8_3: +; RV32I-NEXT: mv a2, a1 +; RV32I-NEXT: mv a3, a0 +; RV32I-NEXT: .LBB8_4: +; RV32I-NEXT: snez a0, a3 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: neg a1, a0 +; RV32I-NEXT: neg a0, a3 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: expanded_neg_abs64: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: snez a2, a0 +; RV32ZBB-NEXT: neg a3, a1 +; RV32ZBB-NEXT: sub a2, a3, a2 +; RV32ZBB-NEXT: neg a3, a0 +; RV32ZBB-NEXT: beq a2, a1, .LBB8_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: slt a4, a1, a2 +; RV32ZBB-NEXT: beqz a4, .LBB8_3 +; RV32ZBB-NEXT: j .LBB8_4 +; RV32ZBB-NEXT: .LBB8_2: +; RV32ZBB-NEXT: sltu a4, a0, a3 +; RV32ZBB-NEXT: bnez a4, .LBB8_4 +; RV32ZBB-NEXT: .LBB8_3: +; RV32ZBB-NEXT: mv a2, a1 +; RV32ZBB-NEXT: mv a3, a0 +; RV32ZBB-NEXT: .LBB8_4: +; RV32ZBB-NEXT: snez a0, a3 +; RV32ZBB-NEXT: add a0, a2, a0 +; RV32ZBB-NEXT: neg a1, a0 +; RV32ZBB-NEXT: neg a0, a3 +; RV32ZBB-NEXT: ret +; +; RV64I-LABEL: expanded_neg_abs64: +; RV64I: # %bb.0: +; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: blt a0, a1, .LBB8_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: .LBB8_2: +; RV64I-NEXT: neg a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: expanded_neg_abs64: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: neg a1, a0 +; RV64ZBB-NEXT: min a0, a0, a1 +; RV64ZBB-NEXT: ret + %n = sub i64 0, %x + %t = call i64 @llvm.smax.i64(i64 %n, i64 %x) + %r = sub i64 0, %t + ret i64 %r +} + +define i64 @expanded_neg_abs64_unsigned(i64 %x) { +; RV32I-LABEL: expanded_neg_abs64_unsigned: +; RV32I: # %bb.0: +; RV32I-NEXT: snez a2, a0 +; RV32I-NEXT: neg a3, a1 +; RV32I-NEXT: sub a2, a3, a2 +; RV32I-NEXT: neg a3, a0 +; RV32I-NEXT: beq a2, a1, .LBB9_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sltu a4, a1, a2 +; RV32I-NEXT: beqz a4, .LBB9_3 +; RV32I-NEXT: j .LBB9_4 +; RV32I-NEXT: .LBB9_2: +; RV32I-NEXT: sltu a4, a0, a3 +; RV32I-NEXT: bnez a4, .LBB9_4 +; RV32I-NEXT: .LBB9_3: +; RV32I-NEXT: mv a2, a1 +; RV32I-NEXT: mv a3, a0 +; RV32I-NEXT: .LBB9_4: +; 
RV32I-NEXT: snez a0, a3 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: neg a1, a0 +; RV32I-NEXT: neg a0, a3 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: expanded_neg_abs64_unsigned: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: snez a2, a0 +; RV32ZBB-NEXT: neg a3, a1 +; RV32ZBB-NEXT: sub a2, a3, a2 +; RV32ZBB-NEXT: neg a3, a0 +; RV32ZBB-NEXT: beq a2, a1, .LBB9_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: sltu a4, a1, a2 +; RV32ZBB-NEXT: beqz a4, .LBB9_3 +; RV32ZBB-NEXT: j .LBB9_4 +; RV32ZBB-NEXT: .LBB9_2: +; RV32ZBB-NEXT: sltu a4, a0, a3 +; RV32ZBB-NEXT: bnez a4, .LBB9_4 +; RV32ZBB-NEXT: .LBB9_3: +; RV32ZBB-NEXT: mv a2, a1 +; RV32ZBB-NEXT: mv a3, a0 +; RV32ZBB-NEXT: .LBB9_4: +; RV32ZBB-NEXT: snez a0, a3 +; RV32ZBB-NEXT: add a0, a2, a0 +; RV32ZBB-NEXT: neg a1, a0 +; RV32ZBB-NEXT: neg a0, a3 +; RV32ZBB-NEXT: ret +; +; RV64I-LABEL: expanded_neg_abs64_unsigned: +; RV64I: # %bb.0: +; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: bltu a0, a1, .LBB9_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: .LBB9_2: +; RV64I-NEXT: neg a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: expanded_neg_abs64_unsigned: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: neg a1, a0 +; RV64ZBB-NEXT: minu a0, a0, a1 +; RV64ZBB-NEXT: ret + %n = sub i64 0, %x + %t = call i64 @llvm.umax.i64(i64 %n, i64 %x) + %r = sub i64 0, %t + ret i64 %r +} + +define i32 @expanded_neg_inv_abs32(i32 %x) { +; RV32I-LABEL: expanded_neg_inv_abs32: +; RV32I: # %bb.0: +; RV32I-NEXT: neg a1, a0 +; RV32I-NEXT: blt a1, a0, .LBB10_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: .LBB10_2: +; RV32I-NEXT: neg a0, a1 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: expanded_neg_inv_abs32: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: neg a1, a0 +; RV32ZBB-NEXT: max a0, a0, a1 +; RV32ZBB-NEXT: ret +; +; RV64I-LABEL: expanded_neg_inv_abs32: +; RV64I: # %bb.0: +; RV64I-NEXT: sext.w a1, a0 +; RV64I-NEXT: negw a0, a0 +; RV64I-NEXT: blt a0, a1, .LBB10_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: .LBB10_2: +; RV64I-NEXT: negw a0, 
a0 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: expanded_neg_inv_abs32: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: sext.w a1, a0 +; RV64ZBB-NEXT: negw a0, a0 +; RV64ZBB-NEXT: min a0, a0, a1 +; RV64ZBB-NEXT: negw a0, a0 +; RV64ZBB-NEXT: ret + %n = sub i32 0, %x + %t = call i32 @llvm.smin.i32(i32 %n, i32 %x) + %r = sub i32 0, %t + ret i32 %r +} + +define i32 @expanded_neg_inv_abs32_unsigned(i32 %x) { +; RV32I-LABEL: expanded_neg_inv_abs32_unsigned: +; RV32I: # %bb.0: +; RV32I-NEXT: neg a1, a0 +; RV32I-NEXT: bltu a1, a0, .LBB11_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: .LBB11_2: +; RV32I-NEXT: neg a0, a1 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: expanded_neg_inv_abs32_unsigned: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: neg a1, a0 +; RV32ZBB-NEXT: maxu a0, a0, a1 +; RV32ZBB-NEXT: ret +; +; RV64I-LABEL: expanded_neg_inv_abs32_unsigned: +; RV64I: # %bb.0: +; RV64I-NEXT: sext.w a1, a0 +; RV64I-NEXT: negw a0, a0 +; RV64I-NEXT: bltu a0, a1, .LBB11_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: .LBB11_2: +; RV64I-NEXT: negw a0, a0 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: expanded_neg_inv_abs32_unsigned: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: sext.w a1, a0 +; RV64ZBB-NEXT: negw a0, a0 +; RV64ZBB-NEXT: minu a0, a0, a1 +; RV64ZBB-NEXT: negw a0, a0 +; RV64ZBB-NEXT: ret + %n = sub i32 0, %x + %t = call i32 @llvm.umin.i32(i32 %n, i32 %x) + %r = sub i32 0, %t + ret i32 %r +} + +define i64 @expanded_neg_inv_abs64(i64 %x) { +; RV32I-LABEL: expanded_neg_inv_abs64: +; RV32I: # %bb.0: +; RV32I-NEXT: snez a2, a0 +; RV32I-NEXT: neg a3, a1 +; RV32I-NEXT: sub a2, a3, a2 +; RV32I-NEXT: neg a3, a0 +; RV32I-NEXT: beq a2, a1, .LBB12_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: slt a4, a2, a1 +; RV32I-NEXT: beqz a4, .LBB12_3 +; RV32I-NEXT: j .LBB12_4 +; RV32I-NEXT: .LBB12_2: +; RV32I-NEXT: sltu a4, a3, a0 +; RV32I-NEXT: bnez a4, .LBB12_4 +; RV32I-NEXT: .LBB12_3: +; RV32I-NEXT: mv a2, a1 +; RV32I-NEXT: mv a3, a0 +; RV32I-NEXT: .LBB12_4: +; RV32I-NEXT: snez a0, a3 +; 
RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: neg a1, a0 +; RV32I-NEXT: neg a0, a3 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: expanded_neg_inv_abs64: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: snez a2, a0 +; RV32ZBB-NEXT: neg a3, a1 +; RV32ZBB-NEXT: sub a2, a3, a2 +; RV32ZBB-NEXT: neg a3, a0 +; RV32ZBB-NEXT: beq a2, a1, .LBB12_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: slt a4, a2, a1 +; RV32ZBB-NEXT: beqz a4, .LBB12_3 +; RV32ZBB-NEXT: j .LBB12_4 +; RV32ZBB-NEXT: .LBB12_2: +; RV32ZBB-NEXT: sltu a4, a3, a0 +; RV32ZBB-NEXT: bnez a4, .LBB12_4 +; RV32ZBB-NEXT: .LBB12_3: +; RV32ZBB-NEXT: mv a2, a1 +; RV32ZBB-NEXT: mv a3, a0 +; RV32ZBB-NEXT: .LBB12_4: +; RV32ZBB-NEXT: snez a0, a3 +; RV32ZBB-NEXT: add a0, a2, a0 +; RV32ZBB-NEXT: neg a1, a0 +; RV32ZBB-NEXT: neg a0, a3 +; RV32ZBB-NEXT: ret +; +; RV64I-LABEL: expanded_neg_inv_abs64: +; RV64I: # %bb.0: +; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: blt a1, a0, .LBB12_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: .LBB12_2: +; RV64I-NEXT: neg a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: expanded_neg_inv_abs64: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: neg a1, a0 +; RV64ZBB-NEXT: max a0, a0, a1 +; RV64ZBB-NEXT: ret + %n = sub i64 0, %x + %t = call i64 @llvm.smin.i64(i64 %n, i64 %x) + %r = sub i64 0, %t + ret i64 %r +} + +define i64 @expanded_neg_inv_abs64_unsigned(i64 %x) { +; RV32I-LABEL: expanded_neg_inv_abs64_unsigned: +; RV32I: # %bb.0: +; RV32I-NEXT: snez a2, a0 +; RV32I-NEXT: neg a3, a1 +; RV32I-NEXT: sub a2, a3, a2 +; RV32I-NEXT: neg a3, a0 +; RV32I-NEXT: beq a2, a1, .LBB13_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sltu a4, a2, a1 +; RV32I-NEXT: beqz a4, .LBB13_3 +; RV32I-NEXT: j .LBB13_4 +; RV32I-NEXT: .LBB13_2: +; RV32I-NEXT: sltu a4, a3, a0 +; RV32I-NEXT: bnez a4, .LBB13_4 +; RV32I-NEXT: .LBB13_3: +; RV32I-NEXT: mv a2, a1 +; RV32I-NEXT: mv a3, a0 +; RV32I-NEXT: .LBB13_4: +; RV32I-NEXT: snez a0, a3 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: neg a1, a0 +; RV32I-NEXT: neg a0, a3 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: 
expanded_neg_inv_abs64_unsigned: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: snez a2, a0 +; RV32ZBB-NEXT: neg a3, a1 +; RV32ZBB-NEXT: sub a2, a3, a2 +; RV32ZBB-NEXT: neg a3, a0 +; RV32ZBB-NEXT: beq a2, a1, .LBB13_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: sltu a4, a2, a1 +; RV32ZBB-NEXT: beqz a4, .LBB13_3 +; RV32ZBB-NEXT: j .LBB13_4 +; RV32ZBB-NEXT: .LBB13_2: +; RV32ZBB-NEXT: sltu a4, a3, a0 +; RV32ZBB-NEXT: bnez a4, .LBB13_4 +; RV32ZBB-NEXT: .LBB13_3: +; RV32ZBB-NEXT: mv a2, a1 +; RV32ZBB-NEXT: mv a3, a0 +; RV32ZBB-NEXT: .LBB13_4: +; RV32ZBB-NEXT: snez a0, a3 +; RV32ZBB-NEXT: add a0, a2, a0 +; RV32ZBB-NEXT: neg a1, a0 +; RV32ZBB-NEXT: neg a0, a3 +; RV32ZBB-NEXT: ret +; +; RV64I-LABEL: expanded_neg_inv_abs64_unsigned: +; RV64I: # %bb.0: +; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: bltu a1, a0, .LBB13_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: .LBB13_2: +; RV64I-NEXT: neg a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: expanded_neg_inv_abs64_unsigned: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: neg a1, a0 +; RV64ZBB-NEXT: maxu a0, a0, a1 +; RV64ZBB-NEXT: ret + %n = sub i64 0, %x + %t = call i64 @llvm.umin.i64(i64 %n, i64 %x) + %r = sub i64 0, %t + ret i64 %r +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-neg-abs.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-neg-abs.ll new file mode 100644 index 0000000000000..6f1efb6885dee --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-neg-abs.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s + +define <2 x i64> @expanded_fixed_neg_abs64(<2 x i64> %x) { +; CHECK-LABEL: expanded_fixed_neg_abs64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vrsub.vi v9, v8, 0 +; CHECK-NEXT: vmin.vv v8, v8, v9 +; CHECK-NEXT: ret + %t = sub <2 x i64> , %x + %t1 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %t, <2 x i64> %x) + %t2 = sub <2 x i64> , %t1 + ret 
<2 x i64> %t2 +} + +define <2 x i64> @expanded_fixed_neg_abs64_unsigned(<2 x i64> %x) { +; CHECK-LABEL: expanded_fixed_neg_abs64_unsigned: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vrsub.vi v9, v8, 0 +; CHECK-NEXT: vminu.vv v8, v8, v9 +; CHECK-NEXT: ret + %t = sub <2 x i64> , %x + %t1 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> %t, <2 x i64> %x) + %t2 = sub <2 x i64> , %t1 + ret <2 x i64> %t2 +} + +define <2 x i64> @expanded_fixed_neg_inv_abs64(<2 x i64> %x) { +; CHECK-LABEL: expanded_fixed_neg_inv_abs64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vrsub.vi v9, v8, 0 +; CHECK-NEXT: vmax.vv v8, v8, v9 +; CHECK-NEXT: ret + %t = sub <2 x i64> , %x + %t1 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %t, <2 x i64> %x) + %t2 = sub <2 x i64> , %t1 + ret <2 x i64> %t2 +} + +define <2 x i64> @expanded_fixed_neg_inv_abs64_unsigned(<2 x i64> %x) { +; CHECK-LABEL: expanded_fixed_neg_inv_abs64_unsigned: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vrsub.vi v9, v8, 0 +; CHECK-NEXT: vmaxu.vv v8, v8, v9 +; CHECK-NEXT: ret + %t = sub <2 x i64> , %x + %t1 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %t, <2 x i64> %x) + %t2 = sub <2 x i64> , %t1 + ret <2 x i64> %t2 +} From 3cac26f5419b68d37e1919001e1c46a765df294f Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Thu, 2 Jan 2025 16:29:34 -0800 Subject: [PATCH 310/567] [GISel] Combine `(neg (min/max x, (neg x)))` into `(max/min x, (neg x))` (#120998) This is the GISel version of #120666. Also supports both unsigned and signed version of min & max. 
--- .../llvm/CodeGen/GlobalISel/CombinerHelper.h | 3 + llvm/include/llvm/CodeGen/GlobalISel/Utils.h | 4 + .../include/llvm/Target/GlobalISel/Combine.td | 8 +- .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 28 ++ llvm/lib/CodeGen/GlobalISel/Utils.cpp | 15 + llvm/lib/Target/RISCV/RISCVCombine.td | 2 +- .../RISCV/GlobalISel/combine-neg-abs.ll | 457 ++++++++++++++++++ 7 files changed, 515 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/combine-neg-abs.ll diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 871456d2a55b5..94e36e412b0cf 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -864,6 +864,9 @@ class CombinerHelper { /// Combine select to integer min/max. bool matchSelectIMinMax(const MachineOperand &MO, BuildFnTy &MatchInfo) const; + /// Tranform (neg (min/max x, (neg x))) into (max/min x, (neg x)). + bool matchSimplifyNegMinMax(MachineInstr &MI, BuildFnTy &MatchInfo) const; + /// Combine selects. bool matchSelect(MachineInstr &MI, BuildFnTy &MatchInfo) const; diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h index cb5a4c14b364c..a35ecae5d18bf 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h @@ -171,6 +171,10 @@ void reportGISelWarning(MachineFunction &MF, const TargetPassConfig &TPC, MachineOptimizationRemarkEmitter &MORE, MachineOptimizationRemarkMissed &R); +/// Returns the inverse opcode of \p MinMaxOpc, which is a generic min/max +/// opcode like G_SMIN. +unsigned getInverseGMinMaxOpcode(unsigned MinMaxOpc); + /// If \p VReg is defined by a G_CONSTANT, return the corresponding value. 
std::optional getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI); diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 013c3a6ed83d8..8641eabbdd84c 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -1372,6 +1372,12 @@ def select_to_iminmax: GICombineRule< [{ return Helper.matchSelectIMinMax(${root}, ${info}); }]), (apply [{ Helper.applyBuildFnMO(${root}, ${info}); }])>; +def simplify_neg_minmax : GICombineRule< + (defs root:$root, build_fn_matchinfo:$matchinfo), + (match (wip_match_opcode G_SUB):$root, + [{ return Helper.matchSimplifyNegMinMax(*${root}, ${matchinfo}); }]), + (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>; + def match_selects : GICombineRule< (defs root:$root, build_fn_matchinfo:$matchinfo), (match (wip_match_opcode G_SELECT):$root, @@ -2008,7 +2014,7 @@ def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines, and_or_disjoint_mask, fma_combines, fold_binop_into_select, sub_add_reg, select_to_minmax, fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors, - combine_concat_vector, + simplify_neg_minmax, combine_concat_vector, sext_trunc, zext_trunc, prefer_sign_combines, shuffle_combines, combine_use_vector_truncate, merge_combines, overflow_combines]>; diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index c061c01d3c1b1..4e3aaf5da7198 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -7062,6 +7062,34 @@ bool CombinerHelper::matchSelectIMinMax(const MachineOperand &MO, } } +// (neg (min/max x, (neg x))) --> (max/min x, (neg x)) +bool CombinerHelper::matchSimplifyNegMinMax(MachineInstr &MI, + BuildFnTy &MatchInfo) const { + assert(MI.getOpcode() == TargetOpcode::G_SUB); + Register DestReg = MI.getOperand(0).getReg(); + LLT DestTy = 
MRI.getType(DestReg); + + Register X; + Register Sub0; + auto NegPattern = m_all_of(m_Neg(m_DeferredReg(X)), m_Reg(Sub0)); + if (mi_match(DestReg, MRI, + m_Neg(m_OneUse(m_any_of(m_GSMin(m_Reg(X), NegPattern), + m_GSMax(m_Reg(X), NegPattern), + m_GUMin(m_Reg(X), NegPattern), + m_GUMax(m_Reg(X), NegPattern)))))) { + MachineInstr *MinMaxMI = MRI.getVRegDef(MI.getOperand(2).getReg()); + unsigned NewOpc = getInverseGMinMaxOpcode(MinMaxMI->getOpcode()); + if (isLegal({NewOpc, {DestTy}})) { + MatchInfo = [=](MachineIRBuilder &B) { + B.buildInstr(NewOpc, {DestReg}, {X, Sub0}); + }; + return true; + } + } + + return false; +} + bool CombinerHelper::matchSelect(MachineInstr &MI, BuildFnTy &MatchInfo) const { GSelect *Select = cast(&MI); diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index 79382933a1f42..625d556e3ff5e 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -276,6 +276,21 @@ void llvm::reportGISelFailure(MachineFunction &MF, const TargetPassConfig &TPC, reportGISelFailure(MF, TPC, MORE, R); } +unsigned llvm::getInverseGMinMaxOpcode(unsigned MinMaxOpc) { + switch (MinMaxOpc) { + case TargetOpcode::G_SMIN: + return TargetOpcode::G_SMAX; + case TargetOpcode::G_SMAX: + return TargetOpcode::G_SMIN; + case TargetOpcode::G_UMIN: + return TargetOpcode::G_UMAX; + case TargetOpcode::G_UMAX: + return TargetOpcode::G_UMIN; + default: + llvm_unreachable("unrecognized opcode"); + } +} + std::optional llvm::getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI) { std::optional ValAndVReg = getIConstantVRegValWithLookThrough( diff --git a/llvm/lib/Target/RISCV/RISCVCombine.td b/llvm/lib/Target/RISCV/RISCVCombine.td index 030613a7d8904..995dd0c5d82eb 100644 --- a/llvm/lib/Target/RISCV/RISCVCombine.td +++ b/llvm/lib/Target/RISCV/RISCVCombine.td @@ -25,5 +25,5 @@ def RISCVPostLegalizerCombiner : GICombiner<"RISCVPostLegalizerCombinerImpl", [sub_to_add, combines_for_extload, 
redundant_and, identity_combines, shift_immed_chain, - commute_constant_to_rhs]> { + commute_constant_to_rhs, simplify_neg_minmax]> { } diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/combine-neg-abs.ll b/llvm/test/CodeGen/RISCV/GlobalISel/combine-neg-abs.ll new file mode 100644 index 0000000000000..6c848ecf0fffd --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/combine-neg-abs.ll @@ -0,0 +1,457 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32 -global-isel -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=RV32I +; RUN: llc -mtriple=riscv32 -global-isel -mattr=+zbb -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=RV32ZBB +; RUN: llc -mtriple=riscv64 -global-isel -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=RV64I +; RUN: llc -mtriple=riscv64 -global-isel -mattr=+zbb -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=RV64ZBB + +define i32 @expanded_neg_abs32(i32 %x) { +; RV32I-LABEL: expanded_neg_abs32: +; RV32I: # %bb.0: +; RV32I-NEXT: neg a1, a0 +; RV32I-NEXT: blt a0, a1, .LBB0_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: .LBB0_2: +; RV32I-NEXT: neg a0, a1 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: expanded_neg_abs32: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: neg a1, a0 +; RV32ZBB-NEXT: min a0, a0, a1 +; RV32ZBB-NEXT: ret +; +; RV64I-LABEL: expanded_neg_abs32: +; RV64I: # %bb.0: +; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: sext.w a2, a1 +; RV64I-NEXT: sext.w a3, a0 +; RV64I-NEXT: blt a3, a2, .LBB0_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: .LBB0_2: +; RV64I-NEXT: neg a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: expanded_neg_abs32: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: negw a1, a0 +; RV64ZBB-NEXT: sext.w a0, a0 +; RV64ZBB-NEXT: max a0, a1, a0 +; RV64ZBB-NEXT: neg a0, a0 +; RV64ZBB-NEXT: ret + %n = sub i32 0, %x + %t = call i32 @llvm.smax.i32(i32 %n, i32 %x) + %r = 
sub i32 0, %t + ret i32 %r +} + +define i32 @expanded_neg_abs32_unsigned(i32 %x) { +; RV32I-LABEL: expanded_neg_abs32_unsigned: +; RV32I: # %bb.0: +; RV32I-NEXT: neg a1, a0 +; RV32I-NEXT: bltu a0, a1, .LBB1_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: .LBB1_2: +; RV32I-NEXT: neg a0, a1 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: expanded_neg_abs32_unsigned: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: neg a1, a0 +; RV32ZBB-NEXT: minu a0, a0, a1 +; RV32ZBB-NEXT: ret +; +; RV64I-LABEL: expanded_neg_abs32_unsigned: +; RV64I: # %bb.0: +; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: sext.w a2, a1 +; RV64I-NEXT: sext.w a3, a0 +; RV64I-NEXT: bltu a3, a2, .LBB1_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: .LBB1_2: +; RV64I-NEXT: neg a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: expanded_neg_abs32_unsigned: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: negw a1, a0 +; RV64ZBB-NEXT: sext.w a0, a0 +; RV64ZBB-NEXT: maxu a0, a1, a0 +; RV64ZBB-NEXT: neg a0, a0 +; RV64ZBB-NEXT: ret + %n = sub i32 0, %x + %t = call i32 @llvm.umax.i32(i32 %n, i32 %x) + %r = sub i32 0, %t + ret i32 %r +} + +define i64 @expanded_neg_abs64(i64 %x) { +; RV32I-LABEL: expanded_neg_abs64: +; RV32I: # %bb.0: +; RV32I-NEXT: snez a2, a0 +; RV32I-NEXT: neg a3, a1 +; RV32I-NEXT: sub a2, a3, a2 +; RV32I-NEXT: neg a3, a0 +; RV32I-NEXT: beq a2, a1, .LBB2_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: slt a4, a1, a2 +; RV32I-NEXT: beqz a4, .LBB2_3 +; RV32I-NEXT: j .LBB2_4 +; RV32I-NEXT: .LBB2_2: +; RV32I-NEXT: sltu a4, a0, a3 +; RV32I-NEXT: bnez a4, .LBB2_4 +; RV32I-NEXT: .LBB2_3: +; RV32I-NEXT: mv a3, a0 +; RV32I-NEXT: mv a2, a1 +; RV32I-NEXT: .LBB2_4: +; RV32I-NEXT: neg a0, a3 +; RV32I-NEXT: snez a1, a3 +; RV32I-NEXT: neg a2, a2 +; RV32I-NEXT: sub a1, a2, a1 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: expanded_neg_abs64: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: snez a2, a0 +; RV32ZBB-NEXT: neg a3, a1 +; RV32ZBB-NEXT: sub a2, a3, a2 +; RV32ZBB-NEXT: neg a3, a0 +; RV32ZBB-NEXT: beq a2, a1, .LBB2_2 +; 
RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: slt a4, a1, a2 +; RV32ZBB-NEXT: beqz a4, .LBB2_3 +; RV32ZBB-NEXT: j .LBB2_4 +; RV32ZBB-NEXT: .LBB2_2: +; RV32ZBB-NEXT: sltu a4, a0, a3 +; RV32ZBB-NEXT: bnez a4, .LBB2_4 +; RV32ZBB-NEXT: .LBB2_3: +; RV32ZBB-NEXT: mv a3, a0 +; RV32ZBB-NEXT: mv a2, a1 +; RV32ZBB-NEXT: .LBB2_4: +; RV32ZBB-NEXT: neg a0, a3 +; RV32ZBB-NEXT: snez a1, a3 +; RV32ZBB-NEXT: neg a2, a2 +; RV32ZBB-NEXT: sub a1, a2, a1 +; RV32ZBB-NEXT: ret +; +; RV64I-LABEL: expanded_neg_abs64: +; RV64I: # %bb.0: +; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: blt a0, a1, .LBB2_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: .LBB2_2: +; RV64I-NEXT: neg a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: expanded_neg_abs64: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: neg a1, a0 +; RV64ZBB-NEXT: min a0, a0, a1 +; RV64ZBB-NEXT: ret + %n = sub i64 0, %x + %t = call i64 @llvm.smax.i64(i64 %n, i64 %x) + %r = sub i64 0, %t + ret i64 %r +} + +define i64 @expanded_neg_abs64_unsigned(i64 %x) { +; RV32I-LABEL: expanded_neg_abs64_unsigned: +; RV32I: # %bb.0: +; RV32I-NEXT: snez a2, a0 +; RV32I-NEXT: neg a3, a1 +; RV32I-NEXT: sub a2, a3, a2 +; RV32I-NEXT: neg a3, a0 +; RV32I-NEXT: beq a2, a1, .LBB3_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sltu a4, a1, a2 +; RV32I-NEXT: beqz a4, .LBB3_3 +; RV32I-NEXT: j .LBB3_4 +; RV32I-NEXT: .LBB3_2: +; RV32I-NEXT: sltu a4, a0, a3 +; RV32I-NEXT: bnez a4, .LBB3_4 +; RV32I-NEXT: .LBB3_3: +; RV32I-NEXT: mv a3, a0 +; RV32I-NEXT: mv a2, a1 +; RV32I-NEXT: .LBB3_4: +; RV32I-NEXT: neg a0, a3 +; RV32I-NEXT: snez a1, a3 +; RV32I-NEXT: neg a2, a2 +; RV32I-NEXT: sub a1, a2, a1 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: expanded_neg_abs64_unsigned: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: snez a2, a0 +; RV32ZBB-NEXT: neg a3, a1 +; RV32ZBB-NEXT: sub a2, a3, a2 +; RV32ZBB-NEXT: neg a3, a0 +; RV32ZBB-NEXT: beq a2, a1, .LBB3_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: sltu a4, a1, a2 +; RV32ZBB-NEXT: beqz a4, .LBB3_3 +; RV32ZBB-NEXT: j .LBB3_4 +; RV32ZBB-NEXT: .LBB3_2: +; 
RV32ZBB-NEXT: sltu a4, a0, a3 +; RV32ZBB-NEXT: bnez a4, .LBB3_4 +; RV32ZBB-NEXT: .LBB3_3: +; RV32ZBB-NEXT: mv a3, a0 +; RV32ZBB-NEXT: mv a2, a1 +; RV32ZBB-NEXT: .LBB3_4: +; RV32ZBB-NEXT: neg a0, a3 +; RV32ZBB-NEXT: snez a1, a3 +; RV32ZBB-NEXT: neg a2, a2 +; RV32ZBB-NEXT: sub a1, a2, a1 +; RV32ZBB-NEXT: ret +; +; RV64I-LABEL: expanded_neg_abs64_unsigned: +; RV64I: # %bb.0: +; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: bltu a0, a1, .LBB3_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: .LBB3_2: +; RV64I-NEXT: neg a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: expanded_neg_abs64_unsigned: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: neg a1, a0 +; RV64ZBB-NEXT: minu a0, a0, a1 +; RV64ZBB-NEXT: ret + %n = sub i64 0, %x + %t = call i64 @llvm.umax.i64(i64 %n, i64 %x) + %r = sub i64 0, %t + ret i64 %r +} + +define i32 @expanded_neg_inv_abs32(i32 %x) { +; RV32I-LABEL: expanded_neg_inv_abs32: +; RV32I: # %bb.0: +; RV32I-NEXT: neg a1, a0 +; RV32I-NEXT: blt a1, a0, .LBB4_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: .LBB4_2: +; RV32I-NEXT: neg a0, a1 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: expanded_neg_inv_abs32: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: neg a1, a0 +; RV32ZBB-NEXT: max a0, a0, a1 +; RV32ZBB-NEXT: ret +; +; RV64I-LABEL: expanded_neg_inv_abs32: +; RV64I: # %bb.0: +; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: sext.w a2, a1 +; RV64I-NEXT: sext.w a3, a0 +; RV64I-NEXT: blt a2, a3, .LBB4_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: .LBB4_2: +; RV64I-NEXT: neg a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: expanded_neg_inv_abs32: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: negw a1, a0 +; RV64ZBB-NEXT: sext.w a0, a0 +; RV64ZBB-NEXT: min a0, a1, a0 +; RV64ZBB-NEXT: neg a0, a0 +; RV64ZBB-NEXT: ret + %n = sub i32 0, %x + %t = call i32 @llvm.smin.i32(i32 %n, i32 %x) + %r = sub i32 0, %t + ret i32 %r +} + +define i32 @expanded_neg_inv_abs32_unsigned(i32 %x) { +; RV32I-LABEL: expanded_neg_inv_abs32_unsigned: +; RV32I: # %bb.0: +; RV32I-NEXT: 
neg a1, a0 +; RV32I-NEXT: bltu a1, a0, .LBB5_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: .LBB5_2: +; RV32I-NEXT: neg a0, a1 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: expanded_neg_inv_abs32_unsigned: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: neg a1, a0 +; RV32ZBB-NEXT: maxu a0, a0, a1 +; RV32ZBB-NEXT: ret +; +; RV64I-LABEL: expanded_neg_inv_abs32_unsigned: +; RV64I: # %bb.0: +; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: sext.w a2, a1 +; RV64I-NEXT: sext.w a3, a0 +; RV64I-NEXT: bltu a2, a3, .LBB5_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: .LBB5_2: +; RV64I-NEXT: neg a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: expanded_neg_inv_abs32_unsigned: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: negw a1, a0 +; RV64ZBB-NEXT: sext.w a0, a0 +; RV64ZBB-NEXT: minu a0, a1, a0 +; RV64ZBB-NEXT: neg a0, a0 +; RV64ZBB-NEXT: ret + %n = sub i32 0, %x + %t = call i32 @llvm.umin.i32(i32 %n, i32 %x) + %r = sub i32 0, %t + ret i32 %r +} + +define i64 @expanded_neg_inv_abs64(i64 %x) { +; RV32I-LABEL: expanded_neg_inv_abs64: +; RV32I: # %bb.0: +; RV32I-NEXT: snez a2, a0 +; RV32I-NEXT: neg a3, a1 +; RV32I-NEXT: sub a2, a3, a2 +; RV32I-NEXT: neg a3, a0 +; RV32I-NEXT: beq a2, a1, .LBB6_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: slt a4, a2, a1 +; RV32I-NEXT: beqz a4, .LBB6_3 +; RV32I-NEXT: j .LBB6_4 +; RV32I-NEXT: .LBB6_2: +; RV32I-NEXT: sltu a4, a3, a0 +; RV32I-NEXT: bnez a4, .LBB6_4 +; RV32I-NEXT: .LBB6_3: +; RV32I-NEXT: mv a3, a0 +; RV32I-NEXT: mv a2, a1 +; RV32I-NEXT: .LBB6_4: +; RV32I-NEXT: neg a0, a3 +; RV32I-NEXT: snez a1, a3 +; RV32I-NEXT: neg a2, a2 +; RV32I-NEXT: sub a1, a2, a1 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: expanded_neg_inv_abs64: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: snez a2, a0 +; RV32ZBB-NEXT: neg a3, a1 +; RV32ZBB-NEXT: sub a2, a3, a2 +; RV32ZBB-NEXT: neg a3, a0 +; RV32ZBB-NEXT: beq a2, a1, .LBB6_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: slt a4, a2, a1 +; RV32ZBB-NEXT: beqz a4, .LBB6_3 +; RV32ZBB-NEXT: j .LBB6_4 +; RV32ZBB-NEXT: .LBB6_2: +; 
RV32ZBB-NEXT: sltu a4, a3, a0 +; RV32ZBB-NEXT: bnez a4, .LBB6_4 +; RV32ZBB-NEXT: .LBB6_3: +; RV32ZBB-NEXT: mv a3, a0 +; RV32ZBB-NEXT: mv a2, a1 +; RV32ZBB-NEXT: .LBB6_4: +; RV32ZBB-NEXT: neg a0, a3 +; RV32ZBB-NEXT: snez a1, a3 +; RV32ZBB-NEXT: neg a2, a2 +; RV32ZBB-NEXT: sub a1, a2, a1 +; RV32ZBB-NEXT: ret +; +; RV64I-LABEL: expanded_neg_inv_abs64: +; RV64I: # %bb.0: +; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: blt a1, a0, .LBB6_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: .LBB6_2: +; RV64I-NEXT: neg a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: expanded_neg_inv_abs64: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: neg a1, a0 +; RV64ZBB-NEXT: max a0, a0, a1 +; RV64ZBB-NEXT: ret + %n = sub i64 0, %x + %t = call i64 @llvm.smin.i64(i64 %n, i64 %x) + %r = sub i64 0, %t + ret i64 %r +} + +define i64 @expanded_neg_inv_abs64_unsigned(i64 %x) { +; RV32I-LABEL: expanded_neg_inv_abs64_unsigned: +; RV32I: # %bb.0: +; RV32I-NEXT: snez a2, a0 +; RV32I-NEXT: neg a3, a1 +; RV32I-NEXT: sub a2, a3, a2 +; RV32I-NEXT: neg a3, a0 +; RV32I-NEXT: beq a2, a1, .LBB7_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sltu a4, a2, a1 +; RV32I-NEXT: beqz a4, .LBB7_3 +; RV32I-NEXT: j .LBB7_4 +; RV32I-NEXT: .LBB7_2: +; RV32I-NEXT: sltu a4, a3, a0 +; RV32I-NEXT: bnez a4, .LBB7_4 +; RV32I-NEXT: .LBB7_3: +; RV32I-NEXT: mv a3, a0 +; RV32I-NEXT: mv a2, a1 +; RV32I-NEXT: .LBB7_4: +; RV32I-NEXT: neg a0, a3 +; RV32I-NEXT: snez a1, a3 +; RV32I-NEXT: neg a2, a2 +; RV32I-NEXT: sub a1, a2, a1 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: expanded_neg_inv_abs64_unsigned: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: snez a2, a0 +; RV32ZBB-NEXT: neg a3, a1 +; RV32ZBB-NEXT: sub a2, a3, a2 +; RV32ZBB-NEXT: neg a3, a0 +; RV32ZBB-NEXT: beq a2, a1, .LBB7_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: sltu a4, a2, a1 +; RV32ZBB-NEXT: beqz a4, .LBB7_3 +; RV32ZBB-NEXT: j .LBB7_4 +; RV32ZBB-NEXT: .LBB7_2: +; RV32ZBB-NEXT: sltu a4, a3, a0 +; RV32ZBB-NEXT: bnez a4, .LBB7_4 +; RV32ZBB-NEXT: .LBB7_3: +; RV32ZBB-NEXT: mv a3, a0 +; 
RV32ZBB-NEXT: mv a2, a1 +; RV32ZBB-NEXT: .LBB7_4: +; RV32ZBB-NEXT: neg a0, a3 +; RV32ZBB-NEXT: snez a1, a3 +; RV32ZBB-NEXT: neg a2, a2 +; RV32ZBB-NEXT: sub a1, a2, a1 +; RV32ZBB-NEXT: ret +; +; RV64I-LABEL: expanded_neg_inv_abs64_unsigned: +; RV64I: # %bb.0: +; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: bltu a1, a0, .LBB7_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: .LBB7_2: +; RV64I-NEXT: neg a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: expanded_neg_inv_abs64_unsigned: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: neg a1, a0 +; RV64ZBB-NEXT: maxu a0, a0, a1 +; RV64ZBB-NEXT: ret + %n = sub i64 0, %x + %t = call i64 @llvm.umin.i64(i64 %n, i64 %x) + %r = sub i64 0, %t + ret i64 %r +} From 4010e0c45b87e4d073c407cae787e96d4808ad36 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Thu, 2 Jan 2025 16:57:31 -0800 Subject: [PATCH 311/567] [libc] Use __attribute__((__noreturn__)) for _Noreturn in C < 11 (#121252) When in modes like C99, the _Noreturn keyword is not available in C. But GNU-compatible compilers have a `noreturn` attribute with the same effect on function declarations. --- libc/include/__llvm-libc-common.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/libc/include/__llvm-libc-common.h b/libc/include/__llvm-libc-common.h index d54ee7b9f91f3..d9d70aff771c0 100644 --- a/libc/include/__llvm-libc-common.h +++ b/libc/include/__llvm-libc-common.h @@ -52,6 +52,16 @@ #undef __restrict #define __restrict restrict // C99 and above support the restrict keyword. +#undef _Noreturn +#if __STDC_VERSION__ >= 201112L +// In C11 and later, _Noreturn is a keyword. +#elif defined(__GNUC__) +// GNU-compatible compilers have an equivalent attribute. 
+#define _Noreturn __attribute__((__noreturn__)) +#else +#define _Noreturn +#endif + #undef __NOEXCEPT #ifdef __GNUC__ #define __NOEXCEPT __attribute__((__nothrow__)) From 4b17a8b10ebb69d3bd30ee7714b5ca24f7e944dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Thu, 2 Jan 2025 17:02:45 -0800 Subject: [PATCH 312/567] [flang][cuda] Add operation to sync global descriptor (#121520) Introduce cuf.sync_descriptor to be used to sync device global descriptor after pointer association. Also move CUFCommon so it can be used in FIRBuilder lib as well. --- .../{Transforms => Builder}/CUFCommon.h | 0 .../flang/Optimizer/Dialect/CUF/CUFOps.td | 11 +++++++++++ flang/lib/Lower/Allocatable.cpp | 19 +++++++++++++++++++ flang/lib/Optimizer/Builder/CMakeLists.txt | 1 + .../{Transforms => Builder}/CUFCommon.cpp | 2 +- flang/lib/Optimizer/Transforms/CMakeLists.txt | 1 - .../Transforms/CUFAddConstructor.cpp | 2 +- .../Optimizer/Transforms/CUFDeviceGlobal.cpp | 2 +- .../Optimizer/Transforms/CUFOpConversion.cpp | 2 +- .../Transforms/SimplifyIntrinsics.cpp | 2 +- flang/test/Lower/CUDA/cuda-pointer-sync.cuf | 17 +++++++++++++++++ 11 files changed, 53 insertions(+), 6 deletions(-) rename flang/include/flang/Optimizer/{Transforms => Builder}/CUFCommon.h (100%) rename flang/lib/Optimizer/{Transforms => Builder}/CUFCommon.cpp (97%) create mode 100644 flang/test/Lower/CUDA/cuda-pointer-sync.cuf diff --git a/flang/include/flang/Optimizer/Transforms/CUFCommon.h b/flang/include/flang/Optimizer/Builder/CUFCommon.h similarity index 100% rename from flang/include/flang/Optimizer/Transforms/CUFCommon.h rename to flang/include/flang/Optimizer/Builder/CUFCommon.h diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td index d06587c57d44b..9a31ffa2e9471 100644 --- 
a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td +++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td @@ -140,6 +140,17 @@ def cuf_DeallocateOp : cuf_Op<"deallocate", let hasVerifier = 1; } +def cuf_SyncDescriptorOp : cuf_Op<"sync_descriptor", []> { + let summary = + "Synchronize the host and device descriptor of a Fortran pointer"; + + let arguments = (ins SymbolRefAttr:$globalName); + + let assemblyFormat = [{ + $globalName attr-dict + }]; +} + def cuf_DataTransferOp : cuf_Op<"data_transfer", []> { let summary = "Represent a data transfer between host and device memory"; diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp index fb8380ac7e8c5..4c64870675816 100644 --- a/flang/lib/Lower/Allocatable.cpp +++ b/flang/lib/Lower/Allocatable.cpp @@ -22,12 +22,14 @@ #include "flang/Lower/PFTBuilder.h" #include "flang/Lower/Runtime.h" #include "flang/Lower/StatementContext.h" +#include "flang/Optimizer/Builder/CUFCommon.h" #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Builder/Runtime/RTBuilder.h" #include "flang/Optimizer/Builder/Todo.h" #include "flang/Optimizer/Dialect/CUF/CUFOps.h" #include "flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/Dialect/FIROpsSupport.h" +#include "flang/Optimizer/HLFIR/HLFIROps.h" #include "flang/Optimizer/Support/FatalError.h" #include "flang/Optimizer/Support/InternalNames.h" #include "flang/Parser/parse-tree.h" @@ -1086,6 +1088,22 @@ bool Fortran::lower::isArraySectionWithoutVectorSubscript( !Fortran::evaluate::HasVectorSubscript(expr); } +static void genCUFPointerSync(const mlir::Value box, + fir::FirOpBuilder &builder) { + if (auto declareOp = box.getDefiningOp()) { + if (auto addrOfOp = declareOp.getMemref().getDefiningOp()) { + auto mod = addrOfOp->getParentOfType(); + if (auto globalOp = + mod.lookupSymbol(addrOfOp.getSymbol())) { + if (cuf::isRegisteredDeviceGlobal(globalOp)) { + builder.create(box.getLoc(), + addrOfOp.getSymbol()); + } + } + } + } +} + void 
Fortran::lower::associateMutableBox( Fortran::lower::AbstractConverter &converter, mlir::Location loc, const fir::MutableBoxValue &box, const Fortran::lower::SomeExpr &source, @@ -1098,6 +1116,7 @@ void Fortran::lower::associateMutableBox( if (converter.getLoweringOptions().getLowerToHighLevelFIR()) { fir::ExtendedValue rhs = converter.genExprAddr(loc, source, stmtCtx); fir::factory::associateMutableBox(builder, loc, box, rhs, lbounds); + genCUFPointerSync(box.getAddr(), builder); return; } // The right hand side is not be evaluated into a temp. Array sections can diff --git a/flang/lib/Optimizer/Builder/CMakeLists.txt b/flang/lib/Optimizer/Builder/CMakeLists.txt index 05164d41a4cb5..a824d70fdb5c7 100644 --- a/flang/lib/Optimizer/Builder/CMakeLists.txt +++ b/flang/lib/Optimizer/Builder/CMakeLists.txt @@ -5,6 +5,7 @@ add_flang_library(FIRBuilder BoxValue.cpp Character.cpp Complex.cpp + CUFCommon.cpp DoLoopHelper.cpp FIRBuilder.cpp HLFIRTools.cpp diff --git a/flang/lib/Optimizer/Transforms/CUFCommon.cpp b/flang/lib/Optimizer/Builder/CUFCommon.cpp similarity index 97% rename from flang/lib/Optimizer/Transforms/CUFCommon.cpp rename to flang/lib/Optimizer/Builder/CUFCommon.cpp index bbe33217e8f45..81a8a90ce394e 100644 --- a/flang/lib/Optimizer/Transforms/CUFCommon.cpp +++ b/flang/lib/Optimizer/Builder/CUFCommon.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "flang/Optimizer/Transforms/CUFCommon.h" +#include "flang/Optimizer/Builder/CUFCommon.h" #include "flang/Optimizer/Dialect/CUF/CUFOps.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/LLVMIR/NVVMDialect.h" diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt index 9eafa4ec234bd..d20d3bc4108ce 100644 --- a/flang/lib/Optimizer/Transforms/CMakeLists.txt +++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt @@ -9,7 +9,6 @@ add_flang_library(FIRTransforms CompilerGeneratedNames.cpp 
ConstantArgumentGlobalisation.cpp ControlFlowConverter.cpp - CUFCommon.cpp CUFAddConstructor.cpp CUFDeviceGlobal.cpp CUFOpConversion.cpp diff --git a/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp b/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp index 9591f48c5d417..97551595db039 100644 --- a/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp +++ b/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "flang/Optimizer/Builder/BoxValue.h" +#include "flang/Optimizer/Builder/CUFCommon.h" #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Builder/Runtime/RTBuilder.h" #include "flang/Optimizer/Builder/Todo.h" @@ -19,7 +20,6 @@ #include "flang/Optimizer/Dialect/FIROpsSupport.h" #include "flang/Optimizer/Dialect/FIRType.h" #include "flang/Optimizer/Support/DataLayout.h" -#include "flang/Optimizer/Transforms/CUFCommon.h" #include "flang/Runtime/CUDA/registration.h" #include "flang/Runtime/entry-names.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" diff --git a/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp b/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp index 07cc1f3b4b51c..2e6c272fa9089 100644 --- a/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp +++ b/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp @@ -7,12 +7,12 @@ //===----------------------------------------------------------------------===// #include "flang/Common/Fortran.h" +#include "flang/Optimizer/Builder/CUFCommon.h" #include "flang/Optimizer/Dialect/CUF/CUFOps.h" #include "flang/Optimizer/Dialect/FIRDialect.h" #include "flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/HLFIR/HLFIROps.h" #include "flang/Optimizer/Support/InternalNames.h" -#include "flang/Optimizer/Transforms/CUFCommon.h" #include "flang/Runtime/CUDA/common.h" #include "flang/Runtime/allocatable.h" #include "mlir/Dialect/LLVMIR/NVVMDialect.h" diff --git 
a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp index de5c51556eecf..fb0ef24654644 100644 --- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp @@ -8,6 +8,7 @@ #include "flang/Optimizer/Transforms/CUFOpConversion.h" #include "flang/Common/Fortran.h" +#include "flang/Optimizer/Builder/CUFCommon.h" #include "flang/Optimizer/Builder/Runtime/RTBuilder.h" #include "flang/Optimizer/CodeGen/TypeConverter.h" #include "flang/Optimizer/Dialect/CUF/CUFOps.h" @@ -15,7 +16,6 @@ #include "flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/HLFIR/HLFIROps.h" #include "flang/Optimizer/Support/DataLayout.h" -#include "flang/Optimizer/Transforms/CUFCommon.h" #include "flang/Runtime/CUDA/allocatable.h" #include "flang/Runtime/CUDA/common.h" #include "flang/Runtime/CUDA/descriptor.h" diff --git a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp index d3567f453fceb..fa6a7b23624e8 100644 --- a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp +++ b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp @@ -24,6 +24,7 @@ #include "flang/Common/Fortran.h" #include "flang/Optimizer/Builder/BoxValue.h" +#include "flang/Optimizer/Builder/CUFCommon.h" #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Builder/LowLevelIntrinsics.h" #include "flang/Optimizer/Builder/Todo.h" @@ -31,7 +32,6 @@ #include "flang/Optimizer/Dialect/FIRType.h" #include "flang/Optimizer/Dialect/Support/FIRContext.h" #include "flang/Optimizer/HLFIR/HLFIRDialect.h" -#include "flang/Optimizer/Transforms/CUFCommon.h" #include "flang/Optimizer/Transforms/Passes.h" #include "flang/Optimizer/Transforms/Utils.h" #include "flang/Runtime/entry-names.h" diff --git a/flang/test/Lower/CUDA/cuda-pointer-sync.cuf b/flang/test/Lower/CUDA/cuda-pointer-sync.cuf new file mode 100644 index 0000000000000..e17869b2d6357 --- 
/dev/null +++ b/flang/test/Lower/CUDA/cuda-pointer-sync.cuf @@ -0,0 +1,17 @@ +! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s + +module devptr + real, device, pointer, dimension(:) :: dev_ptr +end module + +use devptr +real, device, target, dimension(4) :: a_dev +a_dev = 42.0 +dev_ptr => a_dev +end + +! CHECK: fir.global @_QMdevptrEdev_ptr {data_attr = #cuf.cuda} : !fir.box>> +! CHECK-LABEL: func.func @_QQmain() +! CHECK: fir.embox +! CHECK: fir.store +! CHECK: cuf.sync_descriptor @_QMdevptrEdev_ptr From 6dcd2b035da34fa53693b401139a419adb7342db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Thu, 2 Jan 2025 17:02:59 -0800 Subject: [PATCH 313/567] [flang][cuda] Convert cuf.sync_descriptor to runtime call (#121524) Convert the op to a new entry point in the runtime `CUFSyncGlobalDescriptor` --- flang/include/flang/Runtime/CUDA/descriptor.h | 4 ++ .../Optimizer/Transforms/CUFOpConversion.cpp | 42 ++++++++++++++++++- flang/runtime/CUDA/descriptor.cpp | 7 ++++ flang/test/Fir/CUDA/cuda-sync-desc.mlir | 20 +++++++++ 4 files changed, 72 insertions(+), 1 deletion(-) create mode 100644 flang/test/Fir/CUDA/cuda-sync-desc.mlir diff --git a/flang/include/flang/Runtime/CUDA/descriptor.h b/flang/include/flang/Runtime/CUDA/descriptor.h index 55878aaac57fb..0ee7feca10e44 100644 --- a/flang/include/flang/Runtime/CUDA/descriptor.h +++ b/flang/include/flang/Runtime/CUDA/descriptor.h @@ -33,6 +33,10 @@ void *RTDECL(CUFGetDeviceAddress)( void RTDECL(CUFDescriptorSync)(Descriptor *dst, const Descriptor *src, const char *sourceFile = nullptr, int sourceLine = 0); +/// Get the device address of registered with the \p hostPtr and sync them. 
+void RTDECL(CUFSyncGlobalDescriptor)( + void *hostPtr, const char *sourceFile = nullptr, int sourceLine = 0); + } // extern "C" } // namespace Fortran::runtime::cuda diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp index fb0ef24654644..f08f9e412b885 100644 --- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp @@ -788,6 +788,45 @@ struct CUFLaunchOpConversion const mlir::SymbolTable &symTab; }; +struct CUFSyncDescriptorOpConversion + : public mlir::OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + CUFSyncDescriptorOpConversion(mlir::MLIRContext *context, + const mlir::SymbolTable &symTab) + : OpRewritePattern(context), symTab{symTab} {} + + mlir::LogicalResult + matchAndRewrite(cuf::SyncDescriptorOp op, + mlir::PatternRewriter &rewriter) const override { + auto mod = op->getParentOfType(); + fir::FirOpBuilder builder(rewriter, mod); + mlir::Location loc = op.getLoc(); + + auto globalOp = mod.lookupSymbol(op.getGlobalName()); + if (!globalOp) + return mlir::failure(); + + auto hostAddr = builder.create( + loc, fir::ReferenceType::get(globalOp.getType()), op.getGlobalName()); + mlir::func::FuncOp callee = + fir::runtime::getRuntimeFunc(loc, + builder); + auto fTy = callee.getFunctionType(); + mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc); + mlir::Value sourceLine = + fir::factory::locationToLineNo(builder, loc, fTy.getInput(2)); + llvm::SmallVector args{fir::runtime::createArguments( + builder, loc, fTy, hostAddr, sourceFile, sourceLine)}; + builder.create(loc, callee, args); + op.erase(); + return mlir::success(); + } + +private: + const mlir::SymbolTable &symTab; +}; + class CUFOpConversion : public fir::impl::CUFOpConversionBase { public: void runOnOperation() override { @@ -851,7 +890,8 @@ void cuf::populateCUFToFIRConversionPatterns( CUFFreeOpConversion>(patterns.getContext()); 
patterns.insert(patterns.getContext(), symtab, &dl, &converter); - patterns.insert(patterns.getContext(), symtab); + patterns.insert( + patterns.getContext(), symtab); } void cuf::populateFIRCUFConversionPatterns(const mlir::SymbolTable &symtab, diff --git a/flang/runtime/CUDA/descriptor.cpp b/flang/runtime/CUDA/descriptor.cpp index 391c47e84241d..947eeb66aa3d6 100644 --- a/flang/runtime/CUDA/descriptor.cpp +++ b/flang/runtime/CUDA/descriptor.cpp @@ -46,6 +46,13 @@ void RTDEF(CUFDescriptorSync)(Descriptor *dst, const Descriptor *src, (void *)dst, (const void *)src, count, cudaMemcpyHostToDevice)); } +void RTDEF(CUFSyncGlobalDescriptor)( + void *hostPtr, const char *sourceFile, int sourceLine) { + void *devAddr{RTNAME(CUFGetDeviceAddress)(hostPtr, sourceFile, sourceLine)}; + RTNAME(CUFDescriptorSync) + ((Descriptor *)devAddr, (Descriptor *)hostPtr, sourceFile, sourceLine); +} + RT_EXT_API_GROUP_END } } // namespace Fortran::runtime::cuda diff --git a/flang/test/Fir/CUDA/cuda-sync-desc.mlir b/flang/test/Fir/CUDA/cuda-sync-desc.mlir new file mode 100644 index 0000000000000..20b317f34a7f2 --- /dev/null +++ b/flang/test/Fir/CUDA/cuda-sync-desc.mlir @@ -0,0 +1,20 @@ +// RUN: fir-opt --cuf-convert %s | FileCheck %s + +module attributes {dlti.dl_spec = #dlti.dl_spec : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, f80 = dense<128> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, i64 = dense<64> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, !llvm.ptr<270> = dense<32> : vector<4xi64>, f64 = dense<64> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, "dlti.endianness" = "little", "dlti.stack_alignment" = 128 : i64>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = 
"flang version 20.0.0 (git@github.com:clementval/llvm-project.git f37e52237791f58438790c77edeb8de08f692987)", llvm.target_triple = "x86_64-unknown-linux-gnu"} { + fir.global @_QMdevptrEdev_ptr {data_attr = #cuf.cuda} : !fir.box>> { + %0 = fir.zero_bits !fir.ptr> + %c0 = arith.constant 0 : index + %1 = fir.shape %c0 : (index) -> !fir.shape<1> + %2 = fir.embox %0(%1) {allocator_idx = 2 : i32} : (!fir.ptr>, !fir.shape<1>) -> !fir.box>> + fir.has_value %2 : !fir.box>> + } + func.func @_QQmain() { + cuf.sync_descriptor @_QMdevptrEdev_ptr + return + } +} + +// CHECK-LABEL: func.func @_QQmain() +// CHECK: %[[HOST_ADDR:.*]] = fir.address_of(@_QMdevptrEdev_ptr) : !fir.ref>>> +// CHECK: %[[HOST_ADDR_PTR:.*]] = fir.convert %[[HOST_ADDR]] : (!fir.ref>>>) -> !fir.llvm_ptr +// CHECK: fir.call @_FortranACUFSyncGlobalDescriptor(%[[HOST_ADDR_PTR]], %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.ref, i32) From 532a2691bc015fafdd356c10b17c466fe28c49b1 Mon Sep 17 00:00:00 2001 From: Sudharsan Veeravalli Date: Fri, 3 Jan 2025 06:33:27 +0530 Subject: [PATCH 314/567] [RISCV] Add Qualcomm uC Xqcicli (Conditional Load Immediate) extension (#121292) This extension adds 12 instructions that conditionally load an immediate value. The current spec can be found at: https://github.com/quic/riscv-unified-db/releases/latest This patch adds assembler only support. 
--- .../Driver/print-supported-extensions-riscv.c | 1 + llvm/docs/RISCVUsage.rst | 3 + llvm/docs/ReleaseNotes.md | 2 + .../RISCV/Disassembler/RISCVDisassembler.cpp | 3 + llvm/lib/Target/RISCV/RISCVFeatures.td | 8 + llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td | 27 ++ llvm/lib/TargetParser/RISCVISAInfo.cpp | 3 +- llvm/test/CodeGen/RISCV/attributes.ll | 2 + llvm/test/MC/RISCV/xqcicli-invalid.s | 232 ++++++++++++++++++ llvm/test/MC/RISCV/xqcicli-valid.s | 59 +++++ .../TargetParser/RISCVISAInfoTest.cpp | 4 +- 11 files changed, 342 insertions(+), 2 deletions(-) create mode 100644 llvm/test/MC/RISCV/xqcicli-invalid.s create mode 100644 llvm/test/MC/RISCV/xqcicli-valid.s diff --git a/clang/test/Driver/print-supported-extensions-riscv.c b/clang/test/Driver/print-supported-extensions-riscv.c index 8e46690cce5a6..395501eb85ccc 100644 --- a/clang/test/Driver/print-supported-extensions-riscv.c +++ b/clang/test/Driver/print-supported-extensions-riscv.c @@ -190,6 +190,7 @@ // CHECK-NEXT: svukte 0.3 'Svukte' (Address-Independent Latency of User-Mode Faults to Supervisor Addresses) // CHECK-NEXT: xqcia 0.2 'Xqcia' (Qualcomm uC Arithmetic Extension) // CHECK-NEXT: xqciac 0.2 'Xqciac' (Qualcomm uC Load-Store Address Calculation Extension) +// CHECK-NEXT: xqcicli 0.2 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension) // CHECK-NEXT: xqcics 0.2 'Xqcics' (Qualcomm uC Conditional Select Extension) // CHECK-NEXT: xqcicsr 0.2 'Xqcicsr' (Qualcomm uC CSR Extension) // CHECK-NEXT: xqcilsm 0.2 'Xqcilsm' (Qualcomm uC Load Store Multiple Extension) diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index 22600f5720553..eaaad6c516818 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -432,6 +432,9 @@ The current vendor extensions supported are: ``experimental-Xqciac`` LLVM implements `version 0.2 of the Qualcomm uC Load-Store Address Calculation extension specification `__ by Qualcomm. 
All instructions are prefixed with `qc.` as described in the specification. These instructions are only available for riscv32. +``experimental-Xqcicli`` + LLVM implements `version 0.2 of the Qualcomm uC Conditional Load Immediate extension specification `__ by Qualcomm. All instructions are prefixed with `qc.` as described in the specification. These instructions are only available for riscv32. + ``experimental-Xqcics`` LLVM implements `version 0.2 of the Qualcomm uC Conditional Select extension specification `__ by Qualcomm. All instructions are prefixed with `qc.` as described in the specification. These instructions are only available for riscv32. diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index 99a93b0467602..be62a7e8696b4 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -230,6 +230,8 @@ Changes to the RISC-V Backend extension. * Adds experimental assembler support for the Qualcomm uC 'Xqcilsm` (Load Store Multiple) extension. +* Adds experimental assembler support for the Qualcomm uC 'Xqcicli` (Conditional Load Immediate) + extension. 
Changes to the WebAssembly Backend ---------------------------------- diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index 57443d3f38e3c..30122831767f6 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -695,6 +695,9 @@ DecodeStatus RISCVDisassembler::getInstruction32(MCInst &MI, uint64_t &Size, TRY_TO_DECODE_FEATURE( RISCV::FeatureVendorXqciac, DecoderTableXqciac32, "Qualcomm uC Load-Store Address Calculation custom opcode table"); + TRY_TO_DECODE_FEATURE( + RISCV::FeatureVendorXqcicli, DecoderTableXqcicli32, + "Qualcomm uC Conditional Load Immediate custom opcode table"); TRY_TO_DECODE(true, DecoderTable32, "RISCV32 table"); return MCDisassembler::Fail; diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 916b140c5bde7..3885b95a8937a 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1282,6 +1282,14 @@ def HasVendorXqciac AssemblerPredicate<(all_of FeatureVendorXqciac), "'Xqciac' (Qualcomm uC Load-Store Address Calculation Extension)">; +def FeatureVendorXqcicli + : RISCVExperimentalExtension<0, 2, + "Qualcomm uC Conditional Load Immediate Extension">; +def HasVendorXqcicli + : Predicate<"Subtarget->hasVendorXqcicli()">, + AssemblerPredicate<(all_of FeatureVendorXqcicli), + "'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension)">; + //===----------------------------------------------------------------------===// // LLVM specific features and extensions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index ca73fbccd9d2d..5e6722cb4995e 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -139,6 +139,17 @@ class 
QCIStoreMultiple funct2, DAGOperand InTyRs2, string opcodestr> let Inst{31-25} = {funct2, imm{6-2}}; } +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +class QCILICC funct3, bits<2> funct2, DAGOperand InTyRs2, string opcodestr> + : RVInstRBase { + let Constraints = "$rd = $rd_wb"; + bits<5> simm; + + let Inst{31-25} = {simm, funct2}; +} + //===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// @@ -243,6 +254,22 @@ let Predicates = [HasVendorXqcilsm, IsRV32], DecoderNamespace = "Xqcilsm" in { def QC_LWMI : QCILoadMultiple<0b01, uimm5nonzero, "qc.lwmi">; } // Predicates = [HasVendorXqcilsm, IsRV32], DecoderNamespace = "Xqcilsm" +let Predicates = [HasVendorXqcicli, IsRV32], DecoderNamespace = "Xqcicli" in { + def QC_LIEQ : QCILICC<0b000, 0b01, GPRNoX0, "qc.lieq">; + def QC_LINE : QCILICC<0b001, 0b01, GPRNoX0, "qc.line">; + def QC_LILT : QCILICC<0b100, 0b01, GPRNoX0, "qc.lilt">; + def QC_LIGE : QCILICC<0b101, 0b01, GPRNoX0, "qc.lige">; + def QC_LILTU : QCILICC<0b110, 0b01, GPRNoX0, "qc.liltu">; + def QC_LIGEU : QCILICC<0b111, 0b01, GPRNoX0, "qc.ligeu">; + + def QC_LIEQI : QCILICC<0b000, 0b11, simm5, "qc.lieqi">; + def QC_LINEI : QCILICC<0b001, 0b11, simm5, "qc.linei">; + def QC_LILTI : QCILICC<0b100, 0b11, simm5, "qc.lilti">; + def QC_LIGEI : QCILICC<0b101, 0b11, simm5, "qc.ligei">; + def QC_LILTUI : QCILICC<0b110, 0b11, uimm5, "qc.liltui">; + def QC_LIGEUI : QCILICC<0b111, 0b11, uimm5, "qc.ligeui">; +} // Predicates = [HasVendorXqcicli, IsRV32], DecoderNamespace = "Xqcicli" + //===----------------------------------------------------------------------===// // Aliases //===----------------------------------------------------------------------===// diff --git a/llvm/lib/TargetParser/RISCVISAInfo.cpp b/llvm/lib/TargetParser/RISCVISAInfo.cpp index e4e459a77b5f8..4f403e9fb6f57 100644 --- a/llvm/lib/TargetParser/RISCVISAInfo.cpp +++ 
b/llvm/lib/TargetParser/RISCVISAInfo.cpp @@ -742,7 +742,8 @@ Error RISCVISAInfo::checkDependency() { bool HasZvl = MinVLen != 0; bool HasZcmt = Exts.count("zcmt") != 0; static constexpr StringLiteral XqciExts[] = { - {"xqcia"}, {"xqciac"}, {"xqcics"}, {"xqcicsr"}, {"xqcilsm"}, {"xqcisls"}}; + {"xqcia"}, {"xqciac"}, {"xqcicli"}, {"xqcics"}, + {"xqcicsr"}, {"xqcilsm"}, {"xqcisls"}}; if (HasI && HasE) return getIncompatibleError("i", "e"); diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll index 3f2b2c9470783..bcf945470d85b 100644 --- a/llvm/test/CodeGen/RISCV/attributes.ll +++ b/llvm/test/CodeGen/RISCV/attributes.ll @@ -83,6 +83,7 @@ ; RUN: llc -mtriple=riscv32 -mattr=+xwchc %s -o - | FileCheck --check-prefix=RV32XWCHC %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcia %s -o - | FileCheck --check-prefix=RV32XQCIA %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqciac %s -o - | FileCheck --check-prefix=RV32XQCIAC %s +; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicli %s -o - | FileCheck --check-prefix=RV32XQCICLI %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcics %s -o - | FileCheck --check-prefix=RV32XQCICS %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicsr %s -o - | FileCheck --check-prefix=RV32XQCICSR %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcilsm %s -o - | FileCheck --check-prefix=RV32XQCILSM %s @@ -393,6 +394,7 @@ ; RV32XWCHC: .attribute 5, "rv32i2p1_xwchc2p2" ; RV32XQCIA: .attribute 5, "rv32i2p1_xqcia0p2" ; RV32XQCIAC: .attribute 5, "rv32i2p1_zca1p0_xqciac0p2" +; RV32XQCICLI: .attribute 5, "rv32i2p1_xqcicli0p2" ; RV32XQCICS: .attribute 5, "rv32i2p1_xqcics0p2" ; RV32XQCICSR: .attribute 5, "rv32i2p1_xqcicsr0p2" ; RV32XQCILSM: .attribute 5, "rv32i2p1_xqcilsm0p2" diff --git a/llvm/test/MC/RISCV/xqcicli-invalid.s b/llvm/test/MC/RISCV/xqcicli-invalid.s new file mode 100644 index 0000000000000..7ee92ec4cbc01 --- /dev/null +++ b/llvm/test/MC/RISCV/xqcicli-invalid.s @@ -0,0 +1,232 @@ 
+# Xqcicli - Qualcomm uC Conditional Load Immediate Instructions +# RUN: not llvm-mc -triple riscv32 -mattr=+experimental-xqcicli < %s 2>&1 \ +# RUN: | FileCheck -check-prefixes=CHECK,CHECK-PLUS %s +# RUN: not llvm-mc -triple riscv32 -mattr=-experimental-xqcicli < %s 2>&1 \ +# RUN: | FileCheck -check-prefixes=CHECK,CHECK-MINUS %s + +# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction +qc.lieq x0, x4, x6, 10 + +# CHECK: :[[@LINE+1]]:13: error: invalid operand for instruction +qc.lieq x2, x0, x6, 10 + +# CHECK: :[[@LINE+1]]:17: error: invalid operand for instruction +qc.lieq x2, x4, x0, 10 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.lieq x2, x4, x6 + +# CHECK-PLUS: :[[@LINE+1]]:21: error: immediate must be an integer in the range [-16, 15] +qc.lieq x2, x4, x6, 40 + +# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension) +qc.lieq x2, x4, x6, 10 + + +# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction +qc.lige x0, x8, x20, 2 + +# CHECK: :[[@LINE+1]]:13: error: invalid operand for instruction +qc.lige x4, x0, x20, 2 + +# CHECK: :[[@LINE+1]]:17: error: invalid operand for instruction +qc.lige x4, x8, x0, 2 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.lige x4, x8, x20 + +# CHECK-PLUS: :[[@LINE+1]]:22: error: immediate must be an integer in the range [-16, 15] +qc.lige x4, x8, x20, -18 + +# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension) +qc.lige x4, x8, x20, 2 + + +# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction +qc.lilt x0, x9, x10, 3 + +# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction +qc.lilt x19, x0, x10, 3 + +# CHECK: :[[@LINE+1]]:18: error: invalid operand for instruction +qc.lilt x19, x9, x0, 3 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.lilt x19, x9, x10 + +# 
CHECK-PLUS: :[[@LINE+1]]:23: error: immediate must be an integer in the range [-16, 15] +qc.lilt x19, x9, x10, 39 + +# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension) +qc.lilt x19, x9, x10, 3 + + +# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction +qc.line x0, x14, x6, 10 + +# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction +qc.line x18, x0, x6, 10 + +# CHECK: :[[@LINE+1]]:19: error: invalid operand for instruction +qc.line x18, x14, x0, 10 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.line x18, x14, x6 + +# CHECK-PLUS: :[[@LINE+1]]:23: error: immediate must be an integer in the range [-16, 15] +qc.line x18, x14, x6, 100 + +# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension) +qc.line x18, x14, x6, 10 + + +# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction +qc.ligeu x0, x4, x6, 10 + +# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction +qc.ligeu x2, x0, x6, 10 + +# CHECK: :[[@LINE+1]]:18: error: invalid operand for instruction +qc.ligeu x2, x4, x0, 10 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.ligeu x2, x4, x6 + +# CHECK-PLUS: :[[@LINE+1]]:22: error: immediate must be an integer in the range [-16, 15] +qc.ligeu x2, x4, x6, 70 + +# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension) +qc.ligeu x2, x4, x6, 10 + + +# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction +qc.liltu x0, x19, x12, 13 + +# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction +qc.liltu x1, x0, x12, 13 + +# CHECK: :[[@LINE+1]]:19: error: invalid operand for instruction +qc.liltu x1, x19, x0, 13 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.liltu x1, x19, x12 + +# CHECK-PLUS: :[[@LINE+1]]:24: error: 
immediate must be an integer in the range [-16, 15] +qc.liltu x1, x19, x12, 73 + +# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension) +qc.liltu x1, x19, x12, 13 + + +# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction +qc.lieqi x0, x1, 15, 12 + +# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction +qc.lieqi x7, x0, 15, 12 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.lieqi x7, x1, 15 + +# CHECK-PLUS: :[[@LINE+1]]:18: error: immediate must be an integer in the range [-16, 15] +qc.lieqi x7, x1, 25, 12 + +# CHECK-PLUS: :[[@LINE+1]]:22: error: immediate must be an integer in the range [-16, 15] +qc.lieqi x7, x1, 15, -22 + +# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension) +qc.lieqi x7, x1, 15, 12 + + +# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction +qc.ligei x0, x11, -4, 9 + +# CHECK: :[[@LINE+1]]:15: error: invalid operand for instruction +qc.ligei x17, x0, -4, 9 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.ligei x17, x11, -4 + +# CHECK-PLUS: :[[@LINE+1]]:20: error: immediate must be an integer in the range [-16, 15] +qc.ligei x17, x11, -24, 9 + +# CHECK-PLUS: :[[@LINE+1]]:24: error: immediate must be an integer in the range [-16, 15] +qc.ligei x17, x11, -4, 59 + +# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension) +qc.ligei x17, x11, -4, 9 + + +# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction +qc.lilti x0, x11, -14, 2 + +# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction +qc.lilti x9, x0, -14, 2 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.lilti x9, x11, -14 + +# CHECK-PLUS: :[[@LINE+1]]:19: error: immediate must be an integer in the range [-16, 15] +qc.lilti x9, x11, 
-84, 2 + +# CHECK-PLUS: :[[@LINE+1]]:24: error: immediate must be an integer in the range [-16, 15] +qc.lilti x9, x11, -14, 52 + +# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension) +qc.lilti x9, x11, -14, 2 + + +# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction +qc.linei x0, x1, 10, 12 + +# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction +qc.linei x5, x0, 10, 12 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.linei x5, x1, 10 + +# CHECK-PLUS: :[[@LINE+1]]:18: error: immediate must be an integer in the range [-16, 15] +qc.linei x5, x1, 130, 12 + +# CHECK-PLUS: :[[@LINE+1]]:22: error: immediate must be an integer in the range [-16, 15] +qc.linei x5, x1, 10, 124 + +# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension) +qc.linei x5, x1, 10, 12 + + +# CHECK: :[[@LINE+1]]:11: error: invalid operand for instruction +qc.ligeui x0, x12, 7, -12 + +# CHECK: :[[@LINE+1]]:15: error: invalid operand for instruction +qc.ligeui x2, x0, 7, -12 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.ligeui x2, x12, 7 + +# CHECK-PLUS: :[[@LINE+1]]:20: error: immediate must be an integer in the range [0, 31] +qc.ligeui x2, x12, -7, -12 + +# CHECK-PLUS: :[[@LINE+1]]:23: error: immediate must be an integer in the range [-16, 15] +qc.ligeui x2, x12, 7, -17 + +# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension) +qc.ligeui x2, x12, 7, -12 + + +# CHECK: :[[@LINE+1]]:11: error: invalid operand for instruction +qc.liltui x0, x25, 31, 12 + +# CHECK: :[[@LINE+1]]:15: error: invalid operand for instruction +qc.liltui x3, x0, 31, 12 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.liltui x3, x25, 31 + +# CHECK-PLUS: :[[@LINE+1]]:20: error: immediate must be an 
integer in the range [0, 31] +qc.liltui x3, x25, 32, 12 + +# CHECK-PLUS: :[[@LINE+1]]:24: error: immediate must be an integer in the range [-16, 15] +qc.liltui x3, x25, 31, 112 + +# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension) +qc.liltui x3, x25, 31, 12 diff --git a/llvm/test/MC/RISCV/xqcicli-valid.s b/llvm/test/MC/RISCV/xqcicli-valid.s new file mode 100644 index 0000000000000..404bfdf7bce26 --- /dev/null +++ b/llvm/test/MC/RISCV/xqcicli-valid.s @@ -0,0 +1,59 @@ +# Xqcicli - Qualcomm uC Conditional Load Immediate Extension +# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcicli -riscv-no-aliases -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s +# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcicli < %s \ +# RUN: | llvm-objdump --mattr=+experimental-xqcicli -M no-aliases --no-print-imm-hex -d - \ +# RUN: | FileCheck -check-prefix=CHECK-INST %s +# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcicli -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s +# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcicli < %s \ +# RUN: | llvm-objdump --mattr=+experimental-xqcicli --no-print-imm-hex -d - \ +# RUN: | FileCheck -check-prefix=CHECK-INST %s + +# CHECK-INST: qc.lieq sp, tp, t1, 10 +# CHECK-ENC: encoding: [0x5b,0x01,0x62,0x52] +qc.lieq x2, x4, x6, 10 + +# CHECK-INST: qc.lieqi t2, ra, 15, 12 +# CHECK-ENC: encoding: [0xdb,0x83,0xf0,0x66] +qc.lieqi x7, x1, 15, 12 + +# CHECK-INST: qc.lige tp, s0, s4, 2 +# CHECK-ENC: encoding: [0x5b,0x52,0x44,0x13] +qc.lige x4, x8, x20, 2 + +# CHECK-INST: qc.ligei a7, a1, -4, 9 +# CHECK-ENC: encoding: [0xdb,0xd8,0xc5,0x4f] +qc.ligei x17, x11, -4, 9 + +# CHECK-INST: qc.ligeu sp, tp, t1, 10 +# CHECK-ENC: encoding: [0x5b,0x71,0x62,0x52] +qc.ligeu x2, x4, x6, 10 + +# CHECK-INST: qc.ligeui sp, a2, 7, -12 +# CHECK-ENC: encoding: [0x5b,0x71,0x76,0xa6] 
+qc.ligeui x2, x12, 7, -12 + +# CHECK-INST: qc.lilt s3, s1, a0, 3 +# CHECK-ENC: encoding: [0xdb,0xc9,0xa4,0x1a] +qc.lilt x19, x9, x10, 3 + +# CHECK-INST: qc.lilti s1, a1, -14, 2 +# CHECK-ENC: encoding: [0xdb,0xc4,0x25,0x17] +qc.lilti x9, x11, -14, 2 + +# CHECK-INST: qc.liltu ra, s3, a2, 13 +# CHECK-ENC: encoding: [0xdb,0xe0,0xc9,0x6a] +qc.liltu x1, x19, x12, 13 + +# CHECK-INST: qc.liltui gp, s9, 31, 12 +# CHECK-ENC: encoding: [0xdb,0xe1,0xfc,0x67] +qc.liltui x3, x25, 31, 12 + +# CHECK-INST: qc.line s2, a4, t1, 10 +# CHECK-ENC: encoding: [0x5b,0x19,0x67,0x52] +qc.line x18, x14, x6, 10 + +# CHECK-INST: qc.linei t0, ra, 10, 12 +# CHECK-ENC: encoding: [0xdb,0x92,0xa0,0x66] +qc.linei x5, x1, 10, 12 diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp index 176cf82ac34b1..f631f26cf482e 100644 --- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp +++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp @@ -655,7 +655,8 @@ TEST(ParseArchString, RejectsConflictingExtensions) { for (StringRef Input : {"rv64i_xqcisls0p2", "rv64i_xqcia0p2", "rv64i_xqciac0p2", - "rv64i_xqcicsr0p2", "rv64i_xqcilsm0p2", "rv64i_xqcics0p2"}) { + "rv64i_xqcicsr0p2", "rv64i_xqcilsm0p2", "rv64i_xqcics0p2", + "rv64i_xqcicli0p2"}) { EXPECT_THAT( toString(RISCVISAInfo::parseArchString(Input, true).takeError()), ::testing::EndsWith(" is only supported for 'rv32'")); @@ -1114,6 +1115,7 @@ Experimental extensions svukte 0.3 xqcia 0.2 xqciac 0.2 + xqcicli 0.2 xqcics 0.2 xqcicsr 0.2 xqcilsm 0.2 From 8e404509cc130d95f09f255649a87446ca81b187 Mon Sep 17 00:00:00 2001 From: ZhaoQi Date: Fri, 3 Jan 2025 09:07:44 +0800 Subject: [PATCH 315/567] [JITLink][RISCV] Add feature relax for addsub tests. NFC (#121204) R_RISCV_{ADD*/SUB*} relocations are kept only when feature relax enabled. So it is better to add relax to the test, so that relocs can be reserved for processing by the jitlink. That's what this test really wants to test. 
--- .../JITLink/RISCV/{riscv_reloc_add.s => ELF_reloc_add.s} | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) rename llvm/test/ExecutionEngine/JITLink/RISCV/{riscv_reloc_add.s => ELF_reloc_add.s} (82%) diff --git a/llvm/test/ExecutionEngine/JITLink/RISCV/riscv_reloc_add.s b/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_reloc_add.s similarity index 82% rename from llvm/test/ExecutionEngine/JITLink/RISCV/riscv_reloc_add.s rename to llvm/test/ExecutionEngine/JITLink/RISCV/ELF_reloc_add.s index 13689b6d8a026..01f9e7eb5653d 100644 --- a/llvm/test/ExecutionEngine/JITLink/RISCV/riscv_reloc_add.s +++ b/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_reloc_add.s @@ -1,6 +1,8 @@ # RUN: rm -rf %t && mkdir -p %t -# RUN: llvm-mc -triple=riscv64 -filetype=obj -o %t/riscv64_reloc_add.o %s -# RUN: llvm-mc -triple=riscv32 -filetype=obj -o %t/riscv32_reloc_add.o %s +# RUN: llvm-mc -triple=riscv64 -mattr=+relax -filetype=obj \ +# RUN: -o %t/riscv64_reloc_add.o %s +# RUN: llvm-mc -triple=riscv32 -mattr=+relax -filetype=obj \ +# RUN: -o %t/riscv32_reloc_add.o %s # RUN: llvm-jitlink -noexec -check %s %t/riscv64_reloc_add.o \ # RUN: -slab-allocate=1Mb -slab-address=0x1000 -slab-page-size=0x1000 # RUN: llvm-jitlink -noexec -check %s %t/riscv32_reloc_add.o \ From 3792b36234b6c87d728f0a905543e284bf961460 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 2 Jan 2025 17:08:18 -0800 Subject: [PATCH 316/567] [lld][WebAssembly] Replace config-> with ctx.arg. Change the global variable reference to a member access of another variable `ctx`. In the future, we may pass through `ctx` to functions to eliminate global variables. 
Pull Request: https://github.com/llvm/llvm-project/pull/119835 --- lld/wasm/Config.h | 10 +- lld/wasm/Driver.cpp | 306 ++++++++++++++++----------------- lld/wasm/InputChunks.cpp | 8 +- lld/wasm/InputChunks.h | 4 +- lld/wasm/InputElement.h | 4 +- lld/wasm/InputFiles.cpp | 14 +- lld/wasm/InputFiles.h | 2 +- lld/wasm/LTO.cpp | 80 ++++----- lld/wasm/MapFile.cpp | 6 +- lld/wasm/MarkLive.cpp | 10 +- lld/wasm/OutputSections.cpp | 6 +- lld/wasm/Relocations.cpp | 20 +-- lld/wasm/SymbolTable.cpp | 28 +-- lld/wasm/Symbols.cpp | 6 +- lld/wasm/Symbols.h | 2 +- lld/wasm/SyntheticSections.cpp | 56 +++--- lld/wasm/SyntheticSections.h | 14 +- lld/wasm/Writer.cpp | 182 ++++++++++---------- 18 files changed, 374 insertions(+), 384 deletions(-) diff --git a/lld/wasm/Config.h b/lld/wasm/Config.h index 0c2ba3eebffc4..1fa6c42d9cd86 100644 --- a/lld/wasm/Config.h +++ b/lld/wasm/Config.h @@ -126,17 +126,9 @@ struct Config { llvm::SmallVector buildIdVector; }; -struct ConfigWrapper { - Config c; - Config *operator->() { return &c; } -}; - -// The only instance of Configuration struct. -extern ConfigWrapper config; - // The Ctx object hold all other (non-configuration) global state. 
struct Ctx { - Config &arg; + Config arg; llvm::SmallVector objectFiles; llvm::SmallVector stubFiles; diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp index 02471950fb519..c3a74dde6480e 100644 --- a/lld/wasm/Driver.cpp +++ b/lld/wasm/Driver.cpp @@ -44,17 +44,16 @@ using namespace llvm::sys; using namespace llvm::wasm; namespace lld::wasm { -ConfigWrapper config; Ctx ctx; void errorOrWarn(const llvm::Twine &msg) { - if (config->noinhibitExec) + if (ctx.arg.noinhibitExec) warn(msg); else error(msg); } -Ctx::Ctx() : arg(config.c) {} +Ctx::Ctx() {} void Ctx::reset() { arg.~Config(); @@ -268,7 +267,7 @@ opt::InputArgList WasmOptTable::parse(ArrayRef argv) { static void readImportFile(StringRef filename) { if (std::optional buf = readFile(filename)) for (StringRef sym : args::getLines(*buf)) - config->allowUndefinedSymbols.insert(sym); + ctx.arg.allowUndefinedSymbols.insert(sym); } // Returns slices of MB by parsing MB as an archive file. @@ -345,7 +344,7 @@ void LinkerDriver::addFile(StringRef path) { case file_magic::bitcode: case file_magic::wasm_object: { auto obj = createObjectFile(mbref, "", 0, inLib); - if (config->isStatic && isa(obj)) { + if (ctx.arg.isStatic && isa(obj)) { error("attempted static link of dynamic object " + path); break; } @@ -364,7 +363,7 @@ void LinkerDriver::addFile(StringRef path) { } static std::optional findFromSearchPaths(StringRef path) { - for (StringRef dir : config->searchPaths) + for (StringRef dir : ctx.arg.searchPaths) if (std::optional s = findFile(dir, path)) return s; return std::nullopt; @@ -373,8 +372,8 @@ static std::optional findFromSearchPaths(StringRef path) { // This is for -l. We'll look for lib.a from // search paths. 
static std::optional searchLibraryBaseName(StringRef name) { - for (StringRef dir : config->searchPaths) { - if (!config->isStatic) + for (StringRef dir : ctx.arg.searchPaths) { + if (!ctx.arg.isStatic) if (std::optional s = findFile(dir, "lib" + name + ".so")) return s; if (std::optional s = findFile(dir, "lib" + name + ".a")) @@ -408,10 +407,10 @@ void LinkerDriver::createFiles(opt::InputArgList &args) { addFile(arg->getValue()); break; case OPT_Bstatic: - config->isStatic = true; + ctx.arg.isStatic = true; break; case OPT_Bdynamic: - config->isStatic = false; + ctx.arg.isStatic = false; break; case OPT_whole_archive: inWholeArchive = true; @@ -527,99 +526,98 @@ getBuildId(opt::InputArgList &args) { // Initializes Config members by the command line options. static void readConfigs(opt::InputArgList &args) { - config->allowMultipleDefinition = + ctx.arg.allowMultipleDefinition = hasZOption(args, "muldefs") || args.hasFlag(OPT_allow_multiple_definition, OPT_no_allow_multiple_definition, false); - config->bsymbolic = args.hasArg(OPT_Bsymbolic); - config->checkFeatures = + ctx.arg.bsymbolic = args.hasArg(OPT_Bsymbolic); + ctx.arg.checkFeatures = args.hasFlag(OPT_check_features, OPT_no_check_features, true); - config->compressRelocations = args.hasArg(OPT_compress_relocations); - config->demangle = args.hasFlag(OPT_demangle, OPT_no_demangle, true); - config->disableVerify = args.hasArg(OPT_disable_verify); - config->emitRelocs = args.hasArg(OPT_emit_relocs); - config->experimentalPic = args.hasArg(OPT_experimental_pic); - config->entry = getEntry(args); - config->exportAll = args.hasArg(OPT_export_all); - config->exportTable = args.hasArg(OPT_export_table); - config->growableTable = args.hasArg(OPT_growable_table); - config->noinhibitExec = args.hasArg(OPT_noinhibit_exec); + ctx.arg.compressRelocations = args.hasArg(OPT_compress_relocations); + ctx.arg.demangle = args.hasFlag(OPT_demangle, OPT_no_demangle, true); + ctx.arg.disableVerify = 
args.hasArg(OPT_disable_verify); + ctx.arg.emitRelocs = args.hasArg(OPT_emit_relocs); + ctx.arg.experimentalPic = args.hasArg(OPT_experimental_pic); + ctx.arg.entry = getEntry(args); + ctx.arg.exportAll = args.hasArg(OPT_export_all); + ctx.arg.exportTable = args.hasArg(OPT_export_table); + ctx.arg.growableTable = args.hasArg(OPT_growable_table); + ctx.arg.noinhibitExec = args.hasArg(OPT_noinhibit_exec); if (args.hasArg(OPT_import_memory_with_name)) { - config->memoryImport = + ctx.arg.memoryImport = args.getLastArgValue(OPT_import_memory_with_name).split(","); } else if (args.hasArg(OPT_import_memory)) { - config->memoryImport = + ctx.arg.memoryImport = std::pair(defaultModule, memoryName); } else { - config->memoryImport = + ctx.arg.memoryImport = std::optional>(); } if (args.hasArg(OPT_export_memory_with_name)) { - config->memoryExport = - args.getLastArgValue(OPT_export_memory_with_name); + ctx.arg.memoryExport = args.getLastArgValue(OPT_export_memory_with_name); } else if (args.hasArg(OPT_export_memory)) { - config->memoryExport = memoryName; + ctx.arg.memoryExport = memoryName; } else { - config->memoryExport = std::optional(); + ctx.arg.memoryExport = std::optional(); } - config->sharedMemory = args.hasArg(OPT_shared_memory); - config->soName = args.getLastArgValue(OPT_soname); - config->importTable = args.hasArg(OPT_import_table); - config->importUndefined = args.hasArg(OPT_import_undefined); - config->ltoo = args::getInteger(args, OPT_lto_O, 2); - if (config->ltoo > 3) - error("invalid optimization level for LTO: " + Twine(config->ltoo)); + ctx.arg.sharedMemory = args.hasArg(OPT_shared_memory); + ctx.arg.soName = args.getLastArgValue(OPT_soname); + ctx.arg.importTable = args.hasArg(OPT_import_table); + ctx.arg.importUndefined = args.hasArg(OPT_import_undefined); + ctx.arg.ltoo = args::getInteger(args, OPT_lto_O, 2); + if (ctx.arg.ltoo > 3) + error("invalid optimization level for LTO: " + Twine(ctx.arg.ltoo)); unsigned ltoCgo = - args::getInteger(args, 
OPT_lto_CGO, args::getCGOptLevel(config->ltoo)); + args::getInteger(args, OPT_lto_CGO, args::getCGOptLevel(ctx.arg.ltoo)); if (auto level = CodeGenOpt::getLevel(ltoCgo)) - config->ltoCgo = *level; + ctx.arg.ltoCgo = *level; else error("invalid codegen optimization level for LTO: " + Twine(ltoCgo)); - config->ltoPartitions = args::getInteger(args, OPT_lto_partitions, 1); - config->ltoObjPath = args.getLastArgValue(OPT_lto_obj_path_eq); - config->ltoDebugPassManager = args.hasArg(OPT_lto_debug_pass_manager); - config->mapFile = args.getLastArgValue(OPT_Map); - config->optimize = args::getInteger(args, OPT_O, 1); - config->outputFile = args.getLastArgValue(OPT_o); - config->relocatable = args.hasArg(OPT_relocatable); - config->gcSections = - args.hasFlag(OPT_gc_sections, OPT_no_gc_sections, !config->relocatable); + ctx.arg.ltoPartitions = args::getInteger(args, OPT_lto_partitions, 1); + ctx.arg.ltoObjPath = args.getLastArgValue(OPT_lto_obj_path_eq); + ctx.arg.ltoDebugPassManager = args.hasArg(OPT_lto_debug_pass_manager); + ctx.arg.mapFile = args.getLastArgValue(OPT_Map); + ctx.arg.optimize = args::getInteger(args, OPT_O, 1); + ctx.arg.outputFile = args.getLastArgValue(OPT_o); + ctx.arg.relocatable = args.hasArg(OPT_relocatable); + ctx.arg.gcSections = + args.hasFlag(OPT_gc_sections, OPT_no_gc_sections, !ctx.arg.relocatable); for (auto *arg : args.filtered(OPT_keep_section)) - config->keepSections.insert(arg->getValue()); - config->mergeDataSegments = + ctx.arg.keepSections.insert(arg->getValue()); + ctx.arg.mergeDataSegments = args.hasFlag(OPT_merge_data_segments, OPT_no_merge_data_segments, - !config->relocatable); - config->pie = args.hasFlag(OPT_pie, OPT_no_pie, false); - config->printGcSections = + !ctx.arg.relocatable); + ctx.arg.pie = args.hasFlag(OPT_pie, OPT_no_pie, false); + ctx.arg.printGcSections = args.hasFlag(OPT_print_gc_sections, OPT_no_print_gc_sections, false); - config->saveTemps = args.hasArg(OPT_save_temps); - config->searchPaths = 
args::getStrings(args, OPT_library_path); - config->shared = args.hasArg(OPT_shared); - config->shlibSigCheck = !args.hasArg(OPT_no_shlib_sigcheck); - config->stripAll = args.hasArg(OPT_strip_all); - config->stripDebug = args.hasArg(OPT_strip_debug); - config->stackFirst = args.hasArg(OPT_stack_first); - config->trace = args.hasArg(OPT_trace); - config->thinLTOCacheDir = args.getLastArgValue(OPT_thinlto_cache_dir); - config->thinLTOCachePolicy = CHECK( + ctx.arg.saveTemps = args.hasArg(OPT_save_temps); + ctx.arg.searchPaths = args::getStrings(args, OPT_library_path); + ctx.arg.shared = args.hasArg(OPT_shared); + ctx.arg.shlibSigCheck = !args.hasArg(OPT_no_shlib_sigcheck); + ctx.arg.stripAll = args.hasArg(OPT_strip_all); + ctx.arg.stripDebug = args.hasArg(OPT_strip_debug); + ctx.arg.stackFirst = args.hasArg(OPT_stack_first); + ctx.arg.trace = args.hasArg(OPT_trace); + ctx.arg.thinLTOCacheDir = args.getLastArgValue(OPT_thinlto_cache_dir); + ctx.arg.thinLTOCachePolicy = CHECK( parseCachePruningPolicy(args.getLastArgValue(OPT_thinlto_cache_policy)), "--thinlto-cache-policy: invalid cache policy"); - config->thinLTOEmitImportsFiles = args.hasArg(OPT_thinlto_emit_imports_files); - config->thinLTOEmitIndexFiles = args.hasArg(OPT_thinlto_emit_index_files) || + ctx.arg.thinLTOEmitImportsFiles = args.hasArg(OPT_thinlto_emit_imports_files); + ctx.arg.thinLTOEmitIndexFiles = args.hasArg(OPT_thinlto_emit_index_files) || args.hasArg(OPT_thinlto_index_only) || args.hasArg(OPT_thinlto_index_only_eq); - config->thinLTOIndexOnly = args.hasArg(OPT_thinlto_index_only) || + ctx.arg.thinLTOIndexOnly = args.hasArg(OPT_thinlto_index_only) || args.hasArg(OPT_thinlto_index_only_eq); - config->thinLTOIndexOnlyArg = args.getLastArgValue(OPT_thinlto_index_only_eq); - config->thinLTOObjectSuffixReplace = + ctx.arg.thinLTOIndexOnlyArg = args.getLastArgValue(OPT_thinlto_index_only_eq); + ctx.arg.thinLTOObjectSuffixReplace = getOldNewOptions(args, OPT_thinlto_object_suffix_replace_eq); - 
std::tie(config->thinLTOPrefixReplaceOld, config->thinLTOPrefixReplaceNew, - config->thinLTOPrefixReplaceNativeObject) = + std::tie(ctx.arg.thinLTOPrefixReplaceOld, ctx.arg.thinLTOPrefixReplaceNew, + ctx.arg.thinLTOPrefixReplaceNativeObject) = getOldNewOptionsExtra(args, OPT_thinlto_prefix_replace_eq); - if (config->thinLTOEmitIndexFiles && !config->thinLTOIndexOnly) { + if (ctx.arg.thinLTOEmitIndexFiles && !ctx.arg.thinLTOIndexOnly) { if (args.hasArg(OPT_thinlto_object_suffix_replace_eq)) error("--thinlto-object-suffix-replace is not supported with " "--thinlto-emit-index-files"); @@ -627,45 +625,45 @@ static void readConfigs(opt::InputArgList &args) { error("--thinlto-prefix-replace is not supported with " "--thinlto-emit-index-files"); } - if (!config->thinLTOPrefixReplaceNativeObject.empty() && - config->thinLTOIndexOnlyArg.empty()) { + if (!ctx.arg.thinLTOPrefixReplaceNativeObject.empty() && + ctx.arg.thinLTOIndexOnlyArg.empty()) { error("--thinlto-prefix-replace=old_dir;new_dir;obj_dir must be used with " "--thinlto-index-only="); } - config->unresolvedSymbols = getUnresolvedSymbolPolicy(args); - config->whyExtract = args.getLastArgValue(OPT_why_extract); + ctx.arg.unresolvedSymbols = getUnresolvedSymbolPolicy(args); + ctx.arg.whyExtract = args.getLastArgValue(OPT_why_extract); errorHandler().verbose = args.hasArg(OPT_verbose); LLVM_DEBUG(errorHandler().verbose = true); - config->tableBase = args::getInteger(args, OPT_table_base, 0); - config->globalBase = args::getInteger(args, OPT_global_base, 0); - config->initialHeap = args::getInteger(args, OPT_initial_heap, 0); - config->initialMemory = args::getInteger(args, OPT_initial_memory, 0); - config->maxMemory = args::getInteger(args, OPT_max_memory, 0); - config->noGrowableMemory = args.hasArg(OPT_no_growable_memory); - config->zStackSize = + ctx.arg.tableBase = args::getInteger(args, OPT_table_base, 0); + ctx.arg.globalBase = args::getInteger(args, OPT_global_base, 0); + ctx.arg.initialHeap = 
args::getInteger(args, OPT_initial_heap, 0); + ctx.arg.initialMemory = args::getInteger(args, OPT_initial_memory, 0); + ctx.arg.maxMemory = args::getInteger(args, OPT_max_memory, 0); + ctx.arg.noGrowableMemory = args.hasArg(OPT_no_growable_memory); + ctx.arg.zStackSize = args::getZOptionValue(args, OPT_z, "stack-size", WasmPageSize); // -Bdynamic by default if -pie or -shared is specified. - if (config->pie || config->shared) - config->isStatic = false; + if (ctx.arg.pie || ctx.arg.shared) + ctx.arg.isStatic = false; - if (config->maxMemory != 0 && config->noGrowableMemory) { + if (ctx.arg.maxMemory != 0 && ctx.arg.noGrowableMemory) { // Erroring out here is simpler than defining precedence rules. error("--max-memory is incompatible with --no-growable-memory"); } // Default value of exportDynamic depends on `-shared` - config->exportDynamic = - args.hasFlag(OPT_export_dynamic, OPT_no_export_dynamic, config->shared); + ctx.arg.exportDynamic = + args.hasFlag(OPT_export_dynamic, OPT_no_export_dynamic, ctx.arg.shared); // Parse wasm32/64. 
if (auto *arg = args.getLastArg(OPT_m)) { StringRef s = arg->getValue(); if (s == "wasm32") - config->is64 = false; + ctx.arg.is64 = false; else if (s == "wasm64") - config->is64 = true; + ctx.arg.is64 = true; else error("invalid target architecture: " + s); } @@ -679,36 +677,36 @@ static void readConfigs(opt::InputArgList &args) { error(arg->getSpelling() + ": expected a positive integer, but got '" + arg->getValue() + "'"); parallel::strategy = hardware_concurrency(threads); - config->thinLTOJobs = v; + ctx.arg.thinLTOJobs = v; } if (auto *arg = args.getLastArg(OPT_thinlto_jobs)) - config->thinLTOJobs = arg->getValue(); + ctx.arg.thinLTOJobs = arg->getValue(); if (auto *arg = args.getLastArg(OPT_features)) { - config->features = + ctx.arg.features = std::optional>(std::vector()); for (StringRef s : arg->getValues()) - config->features->push_back(std::string(s)); + ctx.arg.features->push_back(std::string(s)); } if (auto *arg = args.getLastArg(OPT_extra_features)) { - config->extraFeatures = + ctx.arg.extraFeatures = std::optional>(std::vector()); for (StringRef s : arg->getValues()) - config->extraFeatures->push_back(std::string(s)); + ctx.arg.extraFeatures->push_back(std::string(s)); } // Legacy --allow-undefined flag which is equivalent to // --unresolve-symbols=ignore + --import-undefined if (args.hasArg(OPT_allow_undefined)) { - config->importUndefined = true; - config->unresolvedSymbols = UnresolvedPolicy::Ignore; + ctx.arg.importUndefined = true; + ctx.arg.unresolvedSymbols = UnresolvedPolicy::Ignore; } if (args.hasArg(OPT_print_map)) - config->mapFile = "-"; + ctx.arg.mapFile = "-"; - std::tie(config->buildId, config->buildIdVector) = getBuildId(args); + std::tie(ctx.arg.buildId, ctx.arg.buildIdVector) = getBuildId(args); } // Some Config members do not directly correspond to any particular @@ -716,86 +714,86 @@ static void readConfigs(opt::InputArgList &args) { // This function initialize such members. See Config.h for the details // of these values. 
static void setConfigs() { - ctx.isPic = config->pie || config->shared; + ctx.isPic = ctx.arg.pie || ctx.arg.shared; if (ctx.isPic) { - if (config->exportTable) + if (ctx.arg.exportTable) error("-shared/-pie is incompatible with --export-table"); - config->importTable = true; + ctx.arg.importTable = true; } else { // Default table base. Defaults to 1, reserving 0 for the NULL function // pointer. - if (!config->tableBase) - config->tableBase = 1; + if (!ctx.arg.tableBase) + ctx.arg.tableBase = 1; // The default offset for static/global data, for when --global-base is // not specified on the command line. The precise value of 1024 is // somewhat arbitrary, and pre-dates wasm-ld (Its the value that // emscripten used prior to wasm-ld). - if (!config->globalBase && !config->relocatable && !config->stackFirst) - config->globalBase = 1024; + if (!ctx.arg.globalBase && !ctx.arg.relocatable && !ctx.arg.stackFirst) + ctx.arg.globalBase = 1024; } - if (config->relocatable) { - if (config->exportTable) + if (ctx.arg.relocatable) { + if (ctx.arg.exportTable) error("--relocatable is incompatible with --export-table"); - if (config->growableTable) + if (ctx.arg.growableTable) error("--relocatable is incompatible with --growable-table"); // Ignore any --import-table, as it's redundant. - config->importTable = true; + ctx.arg.importTable = true; } - if (config->shared) { - if (config->memoryExport.has_value()) { + if (ctx.arg.shared) { + if (ctx.arg.memoryExport.has_value()) { error("--export-memory is incompatible with --shared"); } - if (!config->memoryImport.has_value()) { - config->memoryImport = - std::pair(defaultModule, memoryName); + if (!ctx.arg.memoryImport.has_value()) { + ctx.arg.memoryImport = std::pair( + defaultModule, memoryName); } } // If neither export-memory nor import-memory is specified, default to // exporting memory under its default name. 
- if (!config->memoryExport.has_value() && !config->memoryImport.has_value()) { - config->memoryExport = memoryName; + if (!ctx.arg.memoryExport.has_value() && !ctx.arg.memoryImport.has_value()) { + ctx.arg.memoryExport = memoryName; } } // Some command line options or some combinations of them are not allowed. // This function checks for such errors. static void checkOptions(opt::InputArgList &args) { - if (!config->stripDebug && !config->stripAll && config->compressRelocations) + if (!ctx.arg.stripDebug && !ctx.arg.stripAll && ctx.arg.compressRelocations) error("--compress-relocations is incompatible with output debug" " information. Please pass --strip-debug or --strip-all"); - if (config->ltoPartitions == 0) + if (ctx.arg.ltoPartitions == 0) error("--lto-partitions: number of threads must be > 0"); - if (!get_threadpool_strategy(config->thinLTOJobs)) - error("--thinlto-jobs: invalid job count: " + config->thinLTOJobs); + if (!get_threadpool_strategy(ctx.arg.thinLTOJobs)) + error("--thinlto-jobs: invalid job count: " + ctx.arg.thinLTOJobs); - if (config->pie && config->shared) + if (ctx.arg.pie && ctx.arg.shared) error("-shared and -pie may not be used together"); - if (config->outputFile.empty() && !config->thinLTOIndexOnly) + if (ctx.arg.outputFile.empty() && !ctx.arg.thinLTOIndexOnly) error("no output file specified"); - if (config->importTable && config->exportTable) + if (ctx.arg.importTable && ctx.arg.exportTable) error("--import-table and --export-table may not be used together"); - if (config->relocatable) { - if (!config->entry.empty()) + if (ctx.arg.relocatable) { + if (!ctx.arg.entry.empty()) error("entry point specified for relocatable output file"); - if (config->gcSections) + if (ctx.arg.gcSections) error("-r and --gc-sections may not be used together"); - if (config->compressRelocations) + if (ctx.arg.compressRelocations) error("-r -and --compress-relocations may not be used together"); if (args.hasArg(OPT_undefined)) error("-r -and --undefined 
may not be used together"); - if (config->pie) + if (ctx.arg.pie) error("-r and -pie may not be used together"); - if (config->sharedMemory) + if (ctx.arg.sharedMemory) error("-r and --shared-memory may not be used together"); - if (config->globalBase) + if (ctx.arg.globalBase) error("-r and --global-base may not by used together"); } @@ -804,31 +802,31 @@ static void checkOptions(opt::InputArgList &args) { // mode, to give anyone using them a heads-up that they will be changing. // // Also, warn about flags which request explicit exports. - if (!config->experimentalPic) { + if (!ctx.arg.experimentalPic) { // -shared will change meaning when Module Linking is implemented. - if (config->shared) { + if (ctx.arg.shared) { warn("creating shared libraries, with -shared, is not yet stable"); } // -pie will change meaning when Module Linking is implemented. - if (config->pie) { + if (ctx.arg.pie) { warn("creating PIEs, with -pie, is not yet stable"); } - if (config->unresolvedSymbols == UnresolvedPolicy::ImportDynamic) { + if (ctx.arg.unresolvedSymbols == UnresolvedPolicy::ImportDynamic) { warn("dynamic imports are not yet stable " "(--unresolved-symbols=import-dynamic)"); } } - if (config->bsymbolic && !config->shared) { + if (ctx.arg.bsymbolic && !ctx.arg.shared) { warn("-Bsymbolic is only meaningful when combined with -shared"); } if (ctx.isPic) { - if (config->globalBase) + if (ctx.arg.globalBase) error("--global-base may not be used with -shared/-pie"); - if (config->tableBase) + if (ctx.arg.tableBase) error("--table-base may not be used with -shared/-pie"); } } @@ -851,7 +849,7 @@ static Symbol *handleUndefined(StringRef name, const char *option) { if (auto *lazySym = dyn_cast(sym)) { lazySym->extract(); - if (!config->whyExtract.empty()) + if (!ctx.arg.whyExtract.empty()) ctx.whyExtractRecords.emplace_back(option, sym->getFile(), *sym); } @@ -861,20 +859,20 @@ static Symbol *handleUndefined(StringRef name, const char *option) { static void handleLibcall(StringRef 
name) { Symbol *sym = symtab->find(name); if (sym && sym->isLazy() && isa(sym->getFile())) { - if (!config->whyExtract.empty()) + if (!ctx.arg.whyExtract.empty()) ctx.whyExtractRecords.emplace_back("", sym->getFile(), *sym); cast(sym)->extract(); } } static void writeWhyExtract() { - if (config->whyExtract.empty()) + if (ctx.arg.whyExtract.empty()) return; std::error_code ec; - raw_fd_ostream os(config->whyExtract, ec, sys::fs::OF_None); + raw_fd_ostream os(ctx.arg.whyExtract, ec, sys::fs::OF_None); if (ec) { - error("cannot open --why-extract= file " + config->whyExtract + ": " + + error("cannot open --why-extract= file " + ctx.arg.whyExtract + ": " + ec.message()); return; } @@ -905,14 +903,14 @@ static UndefinedGlobal * createUndefinedGlobal(StringRef name, llvm::wasm::WasmGlobalType *type) { auto *sym = cast(symtab->addUndefinedGlobal( name, std::nullopt, std::nullopt, WASM_SYMBOL_UNDEFINED, nullptr, type)); - config->allowUndefinedSymbols.insert(sym->getName()); + ctx.arg.allowUndefinedSymbols.insert(sym->getName()); sym->isUsedInRegularObj = true; return sym; } static InputGlobal *createGlobal(StringRef name, bool isMutable) { llvm::wasm::WasmGlobal wasmGlobal; - bool is64 = config->is64.value_or(false); + bool is64 = ctx.arg.is64.value_or(false); wasmGlobal.Type = {uint8_t(is64 ? 
WASM_TYPE_I64 : WASM_TYPE_I32), isMutable}; wasmGlobal.InitExpr = intConst(0, is64); wasmGlobal.SymbolName = name; @@ -931,7 +929,7 @@ static GlobalSymbol *createOptionalGlobal(StringRef name, bool isMutable) { // Create ABI-defined synthetic symbols static void createSyntheticSymbols() { - if (config->relocatable) + if (ctx.arg.relocatable) return; static WasmSignature nullSignature = {{}, {}}; @@ -947,11 +945,11 @@ static void createSyntheticSymbols() { "__wasm_call_ctors", WASM_SYMBOL_VISIBILITY_HIDDEN, make(nullSignature, "__wasm_call_ctors")); - bool is64 = config->is64.value_or(false); + bool is64 = ctx.arg.is64.value_or(false); if (ctx.isPic) { WasmSym::stackPointer = - createUndefinedGlobal("__stack_pointer", config->is64.value_or(false) + createUndefinedGlobal("__stack_pointer", ctx.arg.is64.value_or(false) ? &mutableGlobalTypeI64 : &mutableGlobalTypeI32); // For PIC code, we import two global variables (__memory_base and @@ -970,7 +968,7 @@ static void createSyntheticSymbols() { WasmSym::stackPointer->markLive(); } - if (config->sharedMemory) { + if (ctx.arg.sharedMemory) { WasmSym::tlsBase = createGlobalVariable("__tls_base", true); WasmSym::tlsSize = createGlobalVariable("__tls_size", false); WasmSym::tlsAlign = createGlobalVariable("__tls_align", false); @@ -983,12 +981,12 @@ static void createSyntheticSymbols() { } static void createOptionalSymbols() { - if (config->relocatable) + if (ctx.arg.relocatable) return; WasmSym::dsoHandle = symtab->addOptionalDataSymbol("__dso_handle"); - if (!config->shared) + if (!ctx.arg.shared) WasmSym::dataEnd = symtab->addOptionalDataSymbol("__data_end"); if (!ctx.isPic) { @@ -1010,7 +1008,7 @@ static void createOptionalSymbols() { // // __tls_size and __tls_align are not needed in this case since they are only // needed for __wasm_init_tls (which we do not create in this case). 
- if (!config->sharedMemory) + if (!ctx.arg.sharedMemory) WasmSym::tlsBase = createOptionalGlobal("__tls_base", false); } @@ -1035,7 +1033,7 @@ static void processStubLibrariesPreLTO() { // extracted during processStubLibraries, which is too late since // LTO has already being performed at that point. if (needed->isLazy() && isa(needed->getFile())) { - if (!config->whyExtract.empty()) + if (!ctx.arg.whyExtract.empty()) ctx.whyExtractRecords.emplace_back(toString(stub_file), needed->getFile(), *needed); cast(needed)->extract(); @@ -1079,7 +1077,7 @@ static bool addStubSymbolDeps(const StubFile *stub_file, Symbol *sym, if (auto *lazy = dyn_cast(needed)) { depsAdded = true; lazy->extract(); - if (!config->whyExtract.empty()) + if (!ctx.arg.whyExtract.empty()) ctx.whyExtractRecords.emplace_back(toString(stub_file), sym->getFile(), *sym); } diff --git a/lld/wasm/InputChunks.cpp b/lld/wasm/InputChunks.cpp index 9383dcaeb4f55..ccdc92f5c8d71 100644 --- a/lld/wasm/InputChunks.cpp +++ b/lld/wasm/InputChunks.cpp @@ -67,7 +67,7 @@ uint32_t InputChunk::getSize() const { return ms->builder.getSize(); if (const auto *f = dyn_cast(this)) { - if (config->compressRelocations && f->file) { + if (ctx.arg.compressRelocations && f->file) { return f->getCompressedSize(); } } @@ -84,7 +84,7 @@ uint32_t InputChunk::getInputSize() const { // Copy this input chunk to an mmap'ed output file and apply relocations. void InputChunk::writeTo(uint8_t *buf) const { if (const auto *f = dyn_cast(this)) { - if (file && config->compressRelocations) + if (file && ctx.arg.compressRelocations) return f->writeCompressed(buf); } else if (const auto *ms = dyn_cast(this)) { ms->builder.write(buf + outSecOff); @@ -269,7 +269,7 @@ static unsigned getRelocWidth(const WasmRelocation &rel, uint64_t value) { // This function only computes the final output size. It must be called // before getSize() is used to calculate of layout of the code section. 
void InputFunction::calculateSize() { - if (!file || !config->compressRelocations) + if (!file || !ctx.arg.compressRelocations) return; LLVM_DEBUG(dbgs() << "calculateSize: " << name << "\n"); @@ -365,7 +365,7 @@ bool InputChunk::generateRelocationCode(raw_ostream &os) const { LLVM_DEBUG(dbgs() << "generating runtime relocations: " << name << " count=" << relocations.size() << "\n"); - bool is64 = config->is64.value_or(false); + bool is64 = ctx.arg.is64.value_or(false); bool generated = false; unsigned opcode_ptr_const = is64 ? WASM_OPCODE_I64_CONST : WASM_OPCODE_I32_CONST; diff --git a/lld/wasm/InputChunks.h b/lld/wasm/InputChunks.h index d6769bcf5c823..f545449e1246f 100644 --- a/lld/wasm/InputChunks.h +++ b/lld/wasm/InputChunks.h @@ -112,7 +112,7 @@ class InputChunk { InputChunk(ObjFile *f, Kind k, StringRef name, uint32_t alignment = 0, uint32_t flags = 0) : name(name), file(f), alignment(alignment), flags(flags), sectionKind(k), - live(!config->gcSections), discarded(false) {} + live(!ctx.arg.gcSections), discarded(false) {} ArrayRef data() const { return rawData; } uint64_t getTombstone() const; @@ -156,7 +156,7 @@ class SyntheticMergedChunk; // be found by looking at the next one). 
struct SectionPiece { SectionPiece(size_t off, uint32_t hash, bool live) - : inputOff(off), live(live || !config->gcSections), hash(hash >> 1) {} + : inputOff(off), live(live || !ctx.arg.gcSections), hash(hash >> 1) {} uint32_t inputOff; uint32_t live : 1; diff --git a/lld/wasm/InputElement.h b/lld/wasm/InputElement.h index 10dc2a3e4a826..c2a24c8ff5f4e 100644 --- a/lld/wasm/InputElement.h +++ b/lld/wasm/InputElement.h @@ -24,7 +24,7 @@ namespace wasm { class InputElement { protected: InputElement(StringRef name, ObjFile *f) - : file(f), live(!config->gcSections), name(name) {} + : file(f), live(!ctx.arg.gcSections), name(name) {} public: StringRef getName() const { return name; } @@ -65,7 +65,7 @@ class InputGlobal : public InputElement { const WasmInitExpr &getInitExpr() const { return initExpr; } void setPointerValue(uint64_t value) { - initExpr = intConst(value, config->is64.value_or(false)); + initExpr = intConst(value, ctx.arg.is64.value_or(false)); } private: diff --git a/lld/wasm/InputFiles.cpp b/lld/wasm/InputFiles.cpp index 221f02aa1c157..614cddddd1b19 100644 --- a/lld/wasm/InputFiles.cpp +++ b/lld/wasm/InputFiles.cpp @@ -47,7 +47,7 @@ std::string toString(const wasm::InputFile *file) { namespace wasm { std::string replaceThinLTOSuffix(StringRef path) { - auto [suffix, repl] = config->thinLTOObjectSuffixReplace; + auto [suffix, repl] = ctx.arg.thinLTOObjectSuffixReplace; if (path.consume_back(suffix)) return (path + repl).str(); return std::string(path); @@ -55,10 +55,10 @@ std::string replaceThinLTOSuffix(StringRef path) { void InputFile::checkArch(Triple::ArchType arch) const { bool is64 = arch == Triple::wasm64; - if (is64 && !config->is64) { + if (is64 && !ctx.arg.is64) { fatal(toString(this) + ": must specify -mwasm64 to process wasm64 object files"); - } else if (config->is64.value_or(false) != is64) { + } else if (ctx.arg.is64.value_or(false) != is64) { fatal(toString(this) + ": wasm32 object file can't be linked in wasm64 mode"); } @@ -169,7 +169,7 
@@ uint64_t ObjFile::calcNewValue(const WasmRelocation &reloc, uint64_t tombstone, uint32_t index = getFunctionSymbol(reloc.Index)->getTableIndex(); if (reloc.Type == R_WASM_TABLE_INDEX_REL_SLEB || reloc.Type == R_WASM_TABLE_INDEX_REL_SLEB64) - index -= config->tableBase; + index -= ctx.arg.tableBase; return index; } case R_WASM_MEMORY_ADDR_LEB: @@ -360,7 +360,7 @@ void ObjFile::addLegacyIndirectFunctionTableIfNeeded( } static bool shouldMerge(const WasmSection &sec) { - if (config->optimize == 0) + if (ctx.arg.optimize == 0) return false; // Sadly we don't have section attributes yet for custom sections, so we // currently go by the name alone. @@ -383,7 +383,7 @@ static bool shouldMerge(const WasmSegment &seg) { // On a regular link we don't merge sections if -O0 (default is -O1). This // sometimes makes the linker significantly faster, although the output will // be bigger. - if (config->optimize == 0) + if (ctx.arg.optimize == 0) return false; // A mergeable section with size 0 is useless because they don't have @@ -845,7 +845,7 @@ BitcodeFile::BitcodeFile(MemoryBufferRef m, StringRef archiveName, this->archiveName = std::string(archiveName); std::string path = mb.getBufferIdentifier().str(); - if (config->thinLTOIndexOnly) + if (ctx.arg.thinLTOIndexOnly) path = replaceThinLTOSuffix(mb.getBufferIdentifier()); // ThinLTO assumes that all MemoryBufferRefs given to it have a unique diff --git a/lld/wasm/InputFiles.h b/lld/wasm/InputFiles.h index 1b1de98d2d17a..fd7fcb13f4426 100644 --- a/lld/wasm/InputFiles.h +++ b/lld/wasm/InputFiles.h @@ -73,7 +73,7 @@ class InputFile { protected: InputFile(Kind k, MemoryBufferRef m) - : mb(m), fileKind(k), live(!config->gcSections) {} + : mb(m), fileKind(k), live(!ctx.arg.gcSections) {} void checkArch(llvm::Triple::ArchType arch) const; diff --git a/lld/wasm/LTO.cpp b/lld/wasm/LTO.cpp index d9fff748bdb65..b9bd48acd6dc1 100644 --- a/lld/wasm/LTO.cpp +++ b/lld/wasm/LTO.cpp @@ -44,8 +44,8 @@ using namespace lld::wasm; using 
namespace lld; static std::string getThinLTOOutputFile(StringRef modulePath) { - return lto::getThinLTOOutputFile(modulePath, config->thinLTOPrefixReplaceOld, - config->thinLTOPrefixReplaceNew); + return lto::getThinLTOOutputFile(modulePath, ctx.arg.thinLTOPrefixReplaceOld, + ctx.arg.thinLTOPrefixReplaceNew); } static lto::Config createConfig() { @@ -56,23 +56,23 @@ static lto::Config createConfig() { c.Options.FunctionSections = true; c.Options.DataSections = true; - c.DisableVerify = config->disableVerify; + c.DisableVerify = ctx.arg.disableVerify; c.DiagHandler = diagnosticHandler; - c.OptLevel = config->ltoo; + c.OptLevel = ctx.arg.ltoo; c.MAttrs = getMAttrs(); - c.CGOptLevel = config->ltoCgo; - c.DebugPassManager = config->ltoDebugPassManager; - c.AlwaysEmitRegularLTOObj = !config->ltoObjPath.empty(); + c.CGOptLevel = ctx.arg.ltoCgo; + c.DebugPassManager = ctx.arg.ltoDebugPassManager; + c.AlwaysEmitRegularLTOObj = !ctx.arg.ltoObjPath.empty(); - if (config->relocatable) + if (ctx.arg.relocatable) c.RelocModel = std::nullopt; else if (ctx.isPic) c.RelocModel = Reloc::PIC_; else c.RelocModel = Reloc::Static; - if (config->saveTemps) - checkError(c.addSaveTemps(config->outputFile.str() + ".", + if (ctx.arg.saveTemps) + checkError(c.addSaveTemps(ctx.arg.outputFile.str() + ".", /*UseInputModulePath*/ true)); return c; } @@ -81,27 +81,27 @@ namespace lld::wasm { BitcodeCompiler::BitcodeCompiler() { // Initialize indexFile. - if (!config->thinLTOIndexOnlyArg.empty()) - indexFile = openFile(config->thinLTOIndexOnlyArg); + if (!ctx.arg.thinLTOIndexOnlyArg.empty()) + indexFile = openFile(ctx.arg.thinLTOIndexOnlyArg); // Initialize ltoObj. 
lto::ThinBackend backend; auto onIndexWrite = [&](StringRef s) { thinIndices.erase(s); }; - if (config->thinLTOIndexOnly) { + if (ctx.arg.thinLTOIndexOnly) { backend = lto::createWriteIndexesThinBackend( - llvm::hardware_concurrency(config->thinLTOJobs), - std::string(config->thinLTOPrefixReplaceOld), - std::string(config->thinLTOPrefixReplaceNew), - std::string(config->thinLTOPrefixReplaceNativeObject), - config->thinLTOEmitImportsFiles, indexFile.get(), onIndexWrite); + llvm::hardware_concurrency(ctx.arg.thinLTOJobs), + std::string(ctx.arg.thinLTOPrefixReplaceOld), + std::string(ctx.arg.thinLTOPrefixReplaceNew), + std::string(ctx.arg.thinLTOPrefixReplaceNativeObject), + ctx.arg.thinLTOEmitImportsFiles, indexFile.get(), onIndexWrite); } else { backend = lto::createInProcessThinBackend( - llvm::heavyweight_hardware_concurrency(config->thinLTOJobs), - onIndexWrite, config->thinLTOEmitIndexFiles, - config->thinLTOEmitImportsFiles); + llvm::heavyweight_hardware_concurrency(ctx.arg.thinLTOJobs), + onIndexWrite, ctx.arg.thinLTOEmitIndexFiles, + ctx.arg.thinLTOEmitImportsFiles); } ltoObj = std::make_unique(createConfig(), backend, - config->ltoPartitions); + ctx.arg.ltoPartitions); } BitcodeCompiler::~BitcodeCompiler() = default; @@ -123,7 +123,7 @@ void BitcodeCompiler::add(BitcodeFile &f) { ArrayRef syms = f.getSymbols(); std::vector resols(syms.size()); - if (config->thinLTOEmitIndexFiles) { + if (ctx.arg.thinLTOEmitIndexFiles) { thinIndices.insert(obj.getName()); } @@ -139,7 +139,7 @@ void BitcodeCompiler::add(BitcodeFile &f) { // Once IRObjectFile is fixed to report only one symbol this hack can // be removed. 
r.Prevailing = !objSym.isUndefined() && sym->getFile() == &f; - r.VisibleToRegularObj = config->relocatable || sym->isUsedInRegularObj || + r.VisibleToRegularObj = ctx.arg.relocatable || sym->isUsedInRegularObj || sym->isNoStrip() || (r.Prevailing && sym->isExported()); if (r.Prevailing) @@ -175,7 +175,7 @@ static void thinLTOCreateEmptyIndexFiles() { ModuleSummaryIndex m(/*HaveGVs*/ false); m.setSkipModuleByDistributedBackend(); writeIndexToFile(m, *os); - if (config->thinLTOEmitImportsFiles) + if (ctx.arg.thinLTOEmitImportsFiles) openFile(path + ".imports"); } } @@ -191,8 +191,8 @@ std::vector BitcodeCompiler::compile() { // to cache native object files for ThinLTO incremental builds. If a path was // specified, configure LTO to use it as the cache directory. FileCache cache; - if (!config->thinLTOCacheDir.empty()) - cache = check(localCache("ThinLTO", "Thin", config->thinLTOCacheDir, + if (!ctx.arg.thinLTOCacheDir.empty()) + cache = check(localCache("ThinLTO", "Thin", ctx.arg.thinLTOCacheDir, [&](size_t task, const Twine &moduleName, std::unique_ptr mb) { files[task] = std::move(mb); @@ -210,16 +210,16 @@ std::vector BitcodeCompiler::compile() { for (StringRef s : thinIndices) { std::string path(s); openFile(path + ".thinlto.bc"); - if (config->thinLTOEmitImportsFiles) + if (ctx.arg.thinLTOEmitImportsFiles) openFile(path + ".imports"); } - if (config->thinLTOEmitIndexFiles) + if (ctx.arg.thinLTOEmitIndexFiles) thinLTOCreateEmptyIndexFiles(); - if (config->thinLTOIndexOnly) { - if (!config->ltoObjPath.empty()) - saveBuffer(buf[0].second, config->ltoObjPath); + if (ctx.arg.thinLTOIndexOnly) { + if (!ctx.arg.ltoObjPath.empty()) + saveBuffer(buf[0].second, ctx.arg.ltoObjPath); // ThinLTO with index only option is required to generate only the index // files. 
After that, we exit from linker and ThinLTO backend runs in a @@ -229,8 +229,8 @@ std::vector BitcodeCompiler::compile() { return {}; } - if (!config->thinLTOCacheDir.empty()) - pruneCache(config->thinLTOCacheDir, config->thinLTOCachePolicy, files); + if (!ctx.arg.thinLTOCacheDir.empty()) + pruneCache(ctx.arg.thinLTOCacheDir, ctx.arg.thinLTOCachePolicy, files); std::vector ret; for (unsigned i = 0; i != maxTasks; ++i) { @@ -239,7 +239,7 @@ std::vector BitcodeCompiler::compile() { if (objBuf.empty()) continue; ret.emplace_back(objBuf.data(), objBuf.size()); - if (!config->saveTemps) + if (!ctx.arg.saveTemps) continue; // If the input bitcode file is path/to/x.o and -o specifies a.out, the @@ -248,7 +248,7 @@ std::vector BitcodeCompiler::compile() { StringRef ltoObjName; if (bitcodeFilePath == "ld-temp.o") { ltoObjName = - saver().save(Twine(config->outputFile) + ".lto" + + saver().save(Twine(ctx.arg.outputFile) + ".lto" + (i == 0 ? Twine("") : Twine('.') + Twine(i)) + ".o"); } else { StringRef directory = sys::path::parent_path(bitcodeFilePath); @@ -258,7 +258,7 @@ std::vector BitcodeCompiler::compile() { StringRef baseName = bitcodeFilePath.ends_with(")") ? sys::path::filename(bitcodeFilePath) : sys::path::stem(bitcodeFilePath); - StringRef outputFileBaseName = sys::path::filename(config->outputFile); + StringRef outputFileBaseName = sys::path::filename(ctx.arg.outputFile); SmallString<256> path; sys::path::append(path, directory, outputFileBaseName + ".lto." 
+ baseName + ".o"); @@ -268,10 +268,10 @@ std::vector BitcodeCompiler::compile() { saveBuffer(objBuf, ltoObjName); } - if (!config->ltoObjPath.empty()) { - saveBuffer(buf[0].second, config->ltoObjPath); + if (!ctx.arg.ltoObjPath.empty()) { + saveBuffer(buf[0].second, ctx.arg.ltoObjPath); for (unsigned i = 1; i != maxTasks; ++i) - saveBuffer(buf[i].second, config->ltoObjPath + Twine(i)); + saveBuffer(buf[i].second, ctx.arg.ltoObjPath + Twine(i)); } for (std::unique_ptr &file : files) diff --git a/lld/wasm/MapFile.cpp b/lld/wasm/MapFile.cpp index c96b64cb64838..d8487e48b8c6b 100644 --- a/lld/wasm/MapFile.cpp +++ b/lld/wasm/MapFile.cpp @@ -103,14 +103,14 @@ getSymbolStrings(ArrayRef syms) { } void lld::wasm::writeMapFile(ArrayRef outputSections) { - if (config->mapFile.empty()) + if (ctx.arg.mapFile.empty()) return; // Open a map file for writing. std::error_code ec; - raw_fd_ostream os(config->mapFile, ec, sys::fs::OF_None); + raw_fd_ostream os(ctx.arg.mapFile, ec, sys::fs::OF_None); if (ec) { - error("cannot open " + config->mapFile + ": " + ec.message()); + error("cannot open " + ctx.arg.mapFile + ": " + ec.message()); return; } diff --git a/lld/wasm/MarkLive.cpp b/lld/wasm/MarkLive.cpp index 1b99f03747fb0..13c7a3d894fe3 100644 --- a/lld/wasm/MarkLive.cpp +++ b/lld/wasm/MarkLive.cpp @@ -106,8 +106,8 @@ void MarkLive::enqueueRetainedSegments(const ObjFile *file) { void MarkLive::run() { // Add GC root symbols. - if (!config->entry.empty()) - enqueue(symtab->find(config->entry)); + if (!ctx.arg.entry.empty()) + enqueue(symtab->find(ctx.arg.entry)); // We need to preserve any no-strip or exported symbol for (Symbol *sym : symtab->symbols()) @@ -166,7 +166,7 @@ void MarkLive::mark() { } void markLive() { - if (!config->gcSections) + if (!ctx.arg.gcSections) return; LLVM_DEBUG(dbgs() << "markLive\n"); @@ -175,7 +175,7 @@ void markLive() { marker.run(); // Report garbage-collected sections. 
- if (config->printGcSections) { + if (ctx.arg.printGcSections) { for (const ObjFile *obj : ctx.objectFiles) { for (InputChunk *c : obj->functions) if (!c->live) @@ -207,7 +207,7 @@ void markLive() { bool MarkLive::isCallCtorsLive() { // In a reloctable link, we don't call `__wasm_call_ctors`. - if (config->relocatable) + if (ctx.arg.relocatable) return false; // In Emscripten-style PIC, we call `__wasm_call_ctors` which calls diff --git a/lld/wasm/OutputSections.cpp b/lld/wasm/OutputSections.cpp index e4f75829ec4c3..95f7ecc29de6b 100644 --- a/lld/wasm/OutputSections.cpp +++ b/lld/wasm/OutputSections.cpp @@ -105,13 +105,13 @@ void DataSection::finalizeContents() { }); #endif - assert((config->sharedMemory || !ctx.isPic || config->extendedConst || + assert((ctx.arg.sharedMemory || !ctx.isPic || ctx.arg.extendedConst || activeCount <= 1) && "output segments should have been combined by now"); writeUleb128(os, segmentCount, "data segment count"); bodySize = dataSectionHeader.size(); - bool is64 = config->is64.value_or(false); + bool is64 = ctx.arg.is64.value_or(false); for (OutputSegment *segment : segments) { if (!segment->requiredInBinary()) @@ -121,7 +121,7 @@ void DataSection::finalizeContents() { if (segment->initFlags & WASM_DATA_SEGMENT_HAS_MEMINDEX) writeUleb128(os, 0, "memory index"); if ((segment->initFlags & WASM_DATA_SEGMENT_IS_PASSIVE) == 0) { - if (ctx.isPic && config->extendedConst) { + if (ctx.isPic && ctx.arg.extendedConst) { writeU8(os, WASM_OPCODE_GLOBAL_GET, "global get"); writeUleb128(os, WasmSym::memoryBase->getGlobalIndex(), "literal (global index)"); diff --git a/lld/wasm/Relocations.cpp b/lld/wasm/Relocations.cpp index 45ad32701616a..745dfde76ab70 100644 --- a/lld/wasm/Relocations.cpp +++ b/lld/wasm/Relocations.cpp @@ -22,13 +22,13 @@ static bool requiresGOTAccess(const Symbol *sym) { if (sym->isShared()) return true; if (!ctx.isPic && - config->unresolvedSymbols != UnresolvedPolicy::ImportDynamic) + ctx.arg.unresolvedSymbols != 
UnresolvedPolicy::ImportDynamic) return false; if (sym->isHidden() || sym->isLocal()) return false; // With `-Bsymbolic` (or when building an executable) as don't need to use // the GOT for symbols that are defined within the current module. - if (sym->isDefined() && (!config->shared || config->bsymbolic)) + if (sym->isDefined() && (!ctx.arg.shared || ctx.arg.bsymbolic)) return false; return true; } @@ -38,15 +38,15 @@ static bool allowUndefined(const Symbol* sym) { // link time. if (sym->isImported()) return true; - if (isa(sym) && config->importUndefined) + if (isa(sym) && ctx.arg.importUndefined) return true; - return config->allowUndefinedSymbols.count(sym->getName()) != 0; + return ctx.arg.allowUndefinedSymbols.count(sym->getName()) != 0; } static void reportUndefined(ObjFile *file, Symbol *sym) { if (!allowUndefined(sym)) { - switch (config->unresolvedSymbols) { + switch (ctx.arg.unresolvedSymbols) { case UnresolvedPolicy::ReportError: error(toString(file) + ": undefined symbol: " + toString(*sym)); break; @@ -63,8 +63,8 @@ static void reportUndefined(ObjFile *file, Symbol *sym) { if (auto *f = dyn_cast(sym)) { if (!f->stubFunction && - config->unresolvedSymbols != UnresolvedPolicy::ImportDynamic && - !config->importUndefined) { + ctx.arg.unresolvedSymbols != UnresolvedPolicy::ImportDynamic && + !ctx.arg.importUndefined) { f->stubFunction = symtab->createUndefinedStub(*f->getSignature()); f->stubFunction->markLive(); // Mark the function itself as a stub which prevents it from being @@ -125,7 +125,7 @@ void scanRelocations(InputChunk *chunk) { // In single-threaded builds TLS is lowered away and TLS data can be // merged with normal data and allowing TLS relocation in non-TLS // segments. 
- if (config->sharedMemory) { + if (ctx.arg.sharedMemory) { if (!sym->isTLS()) { error(toString(file) + ": relocation " + relocTypeToString(reloc.Type) + @@ -146,7 +146,7 @@ void scanRelocations(InputChunk *chunk) { if (ctx.isPic || (sym->isUndefined() && - config->unresolvedSymbols == UnresolvedPolicy::ImportDynamic)) { + ctx.arg.unresolvedSymbols == UnresolvedPolicy::ImportDynamic)) { switch (reloc.Type) { case R_WASM_TABLE_INDEX_SLEB: case R_WASM_TABLE_INDEX_SLEB64: @@ -173,7 +173,7 @@ void scanRelocations(InputChunk *chunk) { } } - if (!config->relocatable && sym->isUndefined()) { + if (!ctx.arg.relocatable && sym->isUndefined()) { switch (reloc.Type) { case R_WASM_TABLE_INDEX_REL_SLEB: case R_WASM_TABLE_INDEX_REL_SLEB64: diff --git a/lld/wasm/SymbolTable.cpp b/lld/wasm/SymbolTable.cpp index 4cbf44b4d0398..f57359083d242 100644 --- a/lld/wasm/SymbolTable.cpp +++ b/lld/wasm/SymbolTable.cpp @@ -53,7 +53,7 @@ void SymbolTable::addFile(InputFile *file, StringRef symName) { return; } - if (config->trace) + if (ctx.arg.trace) message(toString(file)); // LLVM bitcode file @@ -125,7 +125,7 @@ std::pair SymbolTable::insertName(StringRef name) { sym->canInline = true; sym->traced = trace; sym->forceExport = false; - sym->referenced = !config->gcSections; + sym->referenced = !ctx.arg.gcSections; symVector.emplace_back(sym); return {sym, true}; } @@ -235,7 +235,7 @@ DefinedFunction *SymbolTable::addSyntheticFunction(StringRef name, DefinedData *SymbolTable::addOptionalDataSymbol(StringRef name, uint64_t value) { Symbol *s = find(name); - if (!s && (config->exportAll || config->exportedSymbols.count(name) != 0)) + if (!s && (ctx.arg.exportAll || ctx.arg.exportedSymbols.count(name) != 0)) s = insertName(name).first; else if (!s || s->isDefined()) return nullptr; @@ -317,7 +317,7 @@ static bool shouldReplace(const Symbol *existing, InputFile *newFile, } // Neither symbol is week. They conflict. 
- if (config->allowMultipleDefinition) + if (ctx.arg.allowMultipleDefinition) return false; errorOrWarn("duplicate symbol: " + toString(*existing) + "\n>>> defined in " + @@ -387,7 +387,7 @@ Symbol *SymbolTable::addSharedFunction(StringRef name, uint32_t flags, checkSig = ud->isCalledDirectly; if (checkSig && !signatureMatches(existingFunction, sig)) { - if (config->shlibSigCheck) { + if (ctx.arg.shlibSigCheck) { reportFunctionSignatureMismatch(name, existingFunction, sig, file); } else { // With --no-shlib-sigcheck we ignore the signature of the function as @@ -637,7 +637,7 @@ Symbol *SymbolTable::addUndefinedFunction(StringRef name, lazy->signature = sig; } else { lazy->extract(); - if (!config->whyExtract.empty()) + if (!ctx.arg.whyExtract.empty()) ctx.whyExtractRecords.emplace_back(toString(file), s->getFile(), *s); } } else { @@ -652,7 +652,7 @@ Symbol *SymbolTable::addUndefinedFunction(StringRef name, if (isCalledDirectly && !signatureMatches(existingFunction, sig)) { if (existingFunction->isShared()) { // Special handling for when the existing function is a shared symbol - if (config->shlibSigCheck) { + if (ctx.arg.shlibSigCheck) { reportFunctionSignatureMismatch(name, existingFunction, sig, file); } else { existingFunction->signature = sig; @@ -788,12 +788,12 @@ TableSymbol *SymbolTable::createUndefinedIndirectFunctionTable(StringRef name) { WasmTableType *type = make(); type->ElemType = ValType::FUNCREF; type->Limits = limits; - uint32_t flags = config->exportTable ? 0 : WASM_SYMBOL_VISIBILITY_HIDDEN; + uint32_t flags = ctx.arg.exportTable ? 
0 : WASM_SYMBOL_VISIBILITY_HIDDEN; flags |= WASM_SYMBOL_UNDEFINED; Symbol *sym = addUndefinedTable(name, name, defaultModule, flags, nullptr, type); sym->markLive(); - sym->forceExport = config->exportTable; + sym->forceExport = ctx.arg.exportTable; return cast(sym); } @@ -803,10 +803,10 @@ TableSymbol *SymbolTable::createDefinedIndirectFunctionTable(StringRef name) { WasmTableType type{ValType::FUNCREF, limits}; WasmTable desc{invalidIndex, type, name}; InputTable *table = make(desc, nullptr); - uint32_t flags = config->exportTable ? 0 : WASM_SYMBOL_VISIBILITY_HIDDEN; + uint32_t flags = ctx.arg.exportTable ? 0 : WASM_SYMBOL_VISIBILITY_HIDDEN; TableSymbol *sym = addSyntheticTable(name, flags, table); sym->markLive(); - sym->forceExport = config->exportTable; + sym->forceExport = ctx.arg.exportTable; return sym; } @@ -830,7 +830,7 @@ TableSymbol *SymbolTable::resolveIndirectFunctionTable(bool required) { } } - if (config->importTable) { + if (ctx.arg.importTable) { if (existing) { existing->importModule = defaultModule; existing->importName = functionTableName; @@ -838,7 +838,7 @@ TableSymbol *SymbolTable::resolveIndirectFunctionTable(bool required) { } if (required) return createUndefinedIndirectFunctionTable(functionTableName); - } else if ((existing && existing->isLive()) || config->exportTable || + } else if ((existing && existing->isLive()) || ctx.arg.exportTable || required) { // A defined table is required. Either because the user request an exported // table or because the table symbol is already live. 
The existing table is @@ -885,7 +885,7 @@ void SymbolTable::addLazy(StringRef name, InputFile *file) { LLVM_DEBUG(dbgs() << "replacing existing undefined\n"); const InputFile *oldFile = s->getFile(); LazySymbol(name, 0, file).extract(); - if (!config->whyExtract.empty()) + if (!ctx.arg.whyExtract.empty()) ctx.whyExtractRecords.emplace_back(toString(oldFile), s->getFile(), *s); } diff --git a/lld/wasm/Symbols.cpp b/lld/wasm/Symbols.cpp index e62e7bec609f5..a687fd6d6c4ef 100644 --- a/lld/wasm/Symbols.cpp +++ b/lld/wasm/Symbols.cpp @@ -35,7 +35,7 @@ std::string maybeDemangleSymbol(StringRef name) { // `main` in the case where we need to pass it arguments. if (name == "__main_argc_argv") return "main"; - if (wasm::config->demangle) + if (wasm::ctx.arg.demangle) return demangle(name); return name.str(); } @@ -235,10 +235,10 @@ bool Symbol::isExported() const { // Shared libraries must export all weakly defined symbols // in case they contain the version that will be chosen by // the dynamic linker. 
- if (config->shared && isLive() && isWeak() && !isHidden()) + if (ctx.arg.shared && isLive() && isWeak() && !isHidden()) return true; - if (config->exportAll || (config->exportDynamic && !isHidden())) + if (ctx.arg.exportAll || (ctx.arg.exportDynamic && !isHidden())) return true; return isExportedExplicit(); diff --git a/lld/wasm/Symbols.h b/lld/wasm/Symbols.h index 80b658773bd20..b409fffc50a6c 100644 --- a/lld/wasm/Symbols.h +++ b/lld/wasm/Symbols.h @@ -139,7 +139,7 @@ class Symbol { protected: Symbol(StringRef name, Kind k, uint32_t flags, InputFile *f) - : name(name), file(f), symbolKind(k), referenced(!config->gcSections), + : name(name), file(f), symbolKind(k), referenced(!ctx.arg.gcSections), requiresGOT(false), isUsedInRegularObj(false), forceExport(false), forceImport(false), canInline(false), traced(false), isStub(false), flags(flags) {} diff --git a/lld/wasm/SyntheticSections.cpp b/lld/wasm/SyntheticSections.cpp index 6b32d12ebeb45..715fba1ee6da5 100644 --- a/lld/wasm/SyntheticSections.cpp +++ b/lld/wasm/SyntheticSections.cpp @@ -55,7 +55,7 @@ class SubSection { bool DylinkSection::isNeeded() const { return ctx.isPic || - config->unresolvedSymbols == UnresolvedPolicy::ImportDynamic || + ctx.arg.unresolvedSymbols == UnresolvedPolicy::ImportDynamic || !ctx.sharedFiles.empty(); } @@ -162,7 +162,7 @@ void TypeSection::writeBody() { uint32_t ImportSection::getNumImports() const { assert(isSealed); uint32_t numImports = importedSymbols.size() + gotSymbols.size(); - if (config->memoryImport.has_value()) + if (ctx.arg.memoryImport.has_value()) ++numImports; return numImports; } @@ -232,20 +232,20 @@ void ImportSection::writeBody() { writeUleb128(os, getNumImports(), "import count"); - bool is64 = config->is64.value_or(false); + bool is64 = ctx.arg.is64.value_or(false); - if (config->memoryImport) { + if (ctx.arg.memoryImport) { WasmImport import; - import.Module = config->memoryImport->first; - import.Field = config->memoryImport->second; + import.Module = 
ctx.arg.memoryImport->first; + import.Field = ctx.arg.memoryImport->second; import.Kind = WASM_EXTERNAL_MEMORY; import.Memory.Flags = 0; import.Memory.Minimum = out.memorySec->numMemoryPages; - if (out.memorySec->maxMemoryPages != 0 || config->sharedMemory) { + if (out.memorySec->maxMemoryPages != 0 || ctx.arg.sharedMemory) { import.Memory.Flags |= WASM_LIMITS_FLAG_HAS_MAX; import.Memory.Maximum = out.memorySec->maxMemoryPages; } - if (config->sharedMemory) + if (ctx.arg.sharedMemory) import.Memory.Flags |= WASM_LIMITS_FLAG_IS_SHARED; if (is64) import.Memory.Flags |= WASM_LIMITS_FLAG_IS_64; @@ -351,14 +351,14 @@ void TableSection::assignIndexes() { void MemorySection::writeBody() { raw_ostream &os = bodyOutputStream; - bool hasMax = maxMemoryPages != 0 || config->sharedMemory; + bool hasMax = maxMemoryPages != 0 || ctx.arg.sharedMemory; writeUleb128(os, 1, "memory count"); unsigned flags = 0; if (hasMax) flags |= WASM_LIMITS_FLAG_HAS_MAX; - if (config->sharedMemory) + if (ctx.arg.sharedMemory) flags |= WASM_LIMITS_FLAG_IS_SHARED; - if (config->is64.value_or(false)) + if (ctx.arg.is64.value_or(false)) flags |= WASM_LIMITS_FLAG_IS_64; writeUleb128(os, flags, "memory limits flags"); writeUleb128(os, numMemoryPages, "initial pages"); @@ -415,8 +415,8 @@ void GlobalSection::addInternalGOTEntry(Symbol *sym) { } void GlobalSection::generateRelocationCode(raw_ostream &os, bool TLS) const { - assert(!config->extendedConst); - bool is64 = config->is64.value_or(false); + assert(!ctx.arg.extendedConst); + bool is64 = ctx.arg.is64.value_or(false); unsigned opcode_ptr_const = is64 ? WASM_OPCODE_I64_CONST : WASM_OPCODE_I32_CONST; unsigned opcode_ptr_add = is64 ? WASM_OPCODE_I64_ADD @@ -466,7 +466,7 @@ void GlobalSection::writeBody() { writeGlobalType(os, g->getType()); writeInitExpr(os, g->getInitExpr()); } - bool is64 = config->is64.value_or(false); + bool is64 = ctx.arg.is64.value_or(false); uint8_t itype = is64 ? 
WASM_TYPE_I64 : WASM_TYPE_I32; for (const Symbol *sym : internalGotSymbols) { bool mutable_ = false; @@ -474,11 +474,11 @@ void GlobalSection::writeBody() { // In the case of dynamic linking, unless we have 'extended-const' // available, these global must to be mutable since they get updated to // the correct runtime value during `__wasm_apply_global_relocs`. - if (!config->extendedConst && ctx.isPic && !sym->isTLS()) + if (!ctx.arg.extendedConst && ctx.isPic && !sym->isTLS()) mutable_ = true; // With multi-theadeding any TLS globals must be mutable since they get // set during `__wasm_apply_global_tls_relocs` - if (config->sharedMemory && sym->isTLS()) + if (ctx.arg.sharedMemory && sym->isTLS()) mutable_ = true; } WasmGlobalType type{itype, mutable_}; @@ -487,7 +487,7 @@ void GlobalSection::writeBody() { bool useExtendedConst = false; uint32_t globalIdx; int64_t offset; - if (config->extendedConst && ctx.isPic) { + if (ctx.arg.extendedConst && ctx.isPic) { if (auto *d = dyn_cast(sym)) { if (!sym->isTLS()) { globalIdx = WasmSym::memoryBase->getGlobalIndex(); @@ -518,7 +518,7 @@ void GlobalSection::writeBody() { // In the sharedMemory case TLS globals are set during // `__wasm_apply_global_tls_relocs`, but in the non-shared case // we know the absolute value at link time. - initExpr = intConst(d->getVA(/*absolute=*/!config->sharedMemory), is64); + initExpr = intConst(d->getVA(/*absolute=*/!ctx.arg.sharedMemory), is64); else if (auto *f = dyn_cast(sym)) initExpr = intConst(f->isStub ? 0 : f->getTableIndex(), is64); else { @@ -566,7 +566,7 @@ void ElemSection::addEntry(FunctionSymbol *sym) { // They only exist so that the calls to missing functions can validate. 
if (sym->hasTableIndex() || sym->isStub) return; - sym->setTableIndex(config->tableBase + indirectFunctions.size()); + sym->setTableIndex(ctx.arg.tableBase + indirectFunctions.size()); indirectFunctions.emplace_back(sym); } @@ -589,8 +589,8 @@ void ElemSection::writeBody() { initExpr.Inst.Opcode = WASM_OPCODE_GLOBAL_GET; initExpr.Inst.Value.Global = WasmSym::tableBase->getGlobalIndex(); } else { - bool is64 = config->is64.value_or(false); - initExpr = intConst(config->tableBase, is64); + bool is64 = ctx.arg.is64.value_or(false); + initExpr = intConst(ctx.arg.tableBase, is64); } writeInitExpr(os, initExpr); @@ -602,7 +602,7 @@ void ElemSection::writeBody() { } writeUleb128(os, indirectFunctions.size(), "elem count"); - uint32_t tableIndex = config->tableBase; + uint32_t tableIndex = ctx.arg.tableBase; for (const FunctionSymbol *sym : indirectFunctions) { assert(sym->getTableIndex() == tableIndex); (void) tableIndex; @@ -622,7 +622,7 @@ void DataCountSection::writeBody() { } bool DataCountSection::isNeeded() const { - return numSegments && config->sharedMemory; + return numSegments && ctx.arg.sharedMemory; } void LinkingSection::writeBody() { @@ -786,9 +786,9 @@ unsigned NameSection::numNamedDataSegments() const { void NameSection::writeBody() { { SubSection sub(WASM_NAMES_MODULE); - StringRef moduleName = config->soName; - if (config->soName.empty()) - moduleName = llvm::sys::path::filename(config->outputFile); + StringRef moduleName = ctx.arg.soName; + if (ctx.arg.soName.empty()) + moduleName = llvm::sys::path::filename(ctx.arg.outputFile); writeStr(sub.os, moduleName, "module name"); sub.writeTo(bodyOutputStream); } @@ -917,14 +917,14 @@ void RelocSection::writeBody() { } static size_t getHashSize() { - switch (config->buildId) { + switch (ctx.arg.buildId) { case BuildIdKind::Fast: case BuildIdKind::Uuid: return 16; case BuildIdKind::Sha1: return 20; case BuildIdKind::Hexstring: - return config->buildIdVector.size(); + return ctx.arg.buildIdVector.size(); case 
BuildIdKind::None: return 0; } diff --git a/lld/wasm/SyntheticSections.h b/lld/wasm/SyntheticSections.h index 10183e93d2a28..068fbed11f4a7 100644 --- a/lld/wasm/SyntheticSections.h +++ b/lld/wasm/SyntheticSections.h @@ -228,7 +228,7 @@ class MemorySection : public SyntheticSection { public: MemorySection() : SyntheticSection(llvm::wasm::WASM_SEC_MEMORY) {} - bool isNeeded() const override { return !config->memoryImport.has_value(); } + bool isNeeded() const override { return !ctx.arg.memoryImport.has_value(); } void writeBody() override; uint64_t numMemoryPages = 0; @@ -286,7 +286,7 @@ class GlobalSection : public SyntheticSection { // transform a `global.get` to an `i32.const`. void addInternalGOTEntry(Symbol *sym); bool needsRelocations() { - if (config->extendedConst) + if (ctx.arg.extendedConst) return false; return llvm::any_of(internalGotSymbols, [=](Symbol *sym) { return !sym->isTLS(); }); @@ -354,7 +354,7 @@ class LinkingSection : public SyntheticSection { : SyntheticSection(llvm::wasm::WASM_SEC_CUSTOM, "linking"), initFunctions(initFunctions), dataSegments(dataSegments) {} bool isNeeded() const override { - return config->relocatable || config->emitRelocs; + return ctx.arg.relocatable || ctx.arg.emitRelocs; } void writeBody() override; void addToSymtab(Symbol *sym); @@ -373,7 +373,7 @@ class NameSection : public SyntheticSection { : SyntheticSection(llvm::wasm::WASM_SEC_CUSTOM, "name"), segments(segments) {} bool isNeeded() const override { - if (config->stripAll && !config->keepSections.count(name)) + if (ctx.arg.stripAll && !ctx.arg.keepSections.count(name)) return false; return numNames() > 0; } @@ -396,7 +396,7 @@ class ProducersSection : public SyntheticSection { ProducersSection() : SyntheticSection(llvm::wasm::WASM_SEC_CUSTOM, "producers") {} bool isNeeded() const override { - if (config->stripAll && !config->keepSections.count(name)) + if (ctx.arg.stripAll && !ctx.arg.keepSections.count(name)) return false; return fieldCount() > 0; } @@ -417,7 
+417,7 @@ class TargetFeaturesSection : public SyntheticSection { TargetFeaturesSection() : SyntheticSection(llvm::wasm::WASM_SEC_CUSTOM, "target_features") {} bool isNeeded() const override { - if (config->stripAll && !config->keepSections.count(name)) + if (ctx.arg.stripAll && !ctx.arg.keepSections.count(name)) return false; return features.size() > 0; } @@ -443,7 +443,7 @@ class BuildIdSection : public SyntheticSection { BuildIdSection(); void writeBody() override; bool isNeeded() const override { - return config->buildId != BuildIdKind::None; + return ctx.arg.buildId != BuildIdKind::None; } void writeBuildId(llvm::ArrayRef buf); void writeTo(uint8_t *buf) override { diff --git a/lld/wasm/Writer.cpp b/lld/wasm/Writer.cpp index aeac1a51824f5..76e38f548157c 100644 --- a/lld/wasm/Writer.cpp +++ b/lld/wasm/Writer.cpp @@ -132,7 +132,7 @@ class Writer { void Writer::calculateCustomSections() { log("calculateCustomSections"); - bool stripDebug = config->stripDebug || config->stripAll; + bool stripDebug = ctx.arg.stripDebug || ctx.arg.stripAll; for (ObjFile *file : ctx.objectFiles) { for (InputChunk *section : file->customSections) { // Exclude COMDAT sections that are not selected for inclusion @@ -172,7 +172,7 @@ void Writer::createCustomSections() { LLVM_DEBUG(dbgs() << "createCustomSection: " << name << "\n"); OutputSection *sec = make(std::string(name), pair.second); - if (config->relocatable || config->emitRelocs) { + if (ctx.arg.relocatable || ctx.arg.emitRelocs) { auto *sym = make(sec); out.linkingSec->addToSymtab(sym); sec->sectionSym = sym; @@ -282,8 +282,8 @@ static void makeUUID(unsigned version, llvm::ArrayRef fileHash, void Writer::writeBuildId() { if (!out.buildIdSec->isNeeded()) return; - if (config->buildId == BuildIdKind::Hexstring) { - out.buildIdSec->writeBuildId(config->buildIdVector); + if (ctx.arg.buildId == BuildIdKind::Hexstring) { + out.buildIdSec->writeBuildId(ctx.arg.buildIdVector); return; } @@ -292,7 +292,7 @@ void Writer::writeBuildId() { 
std::vector buildId(hashSize); llvm::ArrayRef buf{buffer->getBufferStart(), size_t(fileSize)}; - switch (config->buildId) { + switch (ctx.arg.buildId) { case BuildIdKind::Fast: { std::vector fileHash(8); computeHash(fileHash, buf, [](uint8_t *dest, ArrayRef arr) { @@ -324,9 +324,9 @@ static void setGlobalPtr(DefinedGlobal *g, uint64_t memoryPtr) { // to each of the input data sections as well as the explicit stack region. // The default memory layout is as follows, from low to high. // -// - initialized data (starting at config->globalBase) +// - initialized data (starting at ctx.arg.globalBase) // - BSS data (not currently implemented in llvm) -// - explicit stack (config->ZStackSize) +// - explicit stack (ctx.arg.ZStackSize) // - heap start / unallocated // // The --stack-first option means that stack is placed before any static data. @@ -337,33 +337,33 @@ void Writer::layoutMemory() { uint64_t memoryPtr = 0; auto placeStack = [&]() { - if (config->relocatable || ctx.isPic) + if (ctx.arg.relocatable || ctx.isPic) return; memoryPtr = alignTo(memoryPtr, stackAlignment); if (WasmSym::stackLow) WasmSym::stackLow->setVA(memoryPtr); - if (config->zStackSize != alignTo(config->zStackSize, stackAlignment)) + if (ctx.arg.zStackSize != alignTo(ctx.arg.zStackSize, stackAlignment)) error("stack size must be " + Twine(stackAlignment) + "-byte aligned"); - log("mem: stack size = " + Twine(config->zStackSize)); + log("mem: stack size = " + Twine(ctx.arg.zStackSize)); log("mem: stack base = " + Twine(memoryPtr)); - memoryPtr += config->zStackSize; + memoryPtr += ctx.arg.zStackSize; setGlobalPtr(cast(WasmSym::stackPointer), memoryPtr); if (WasmSym::stackHigh) WasmSym::stackHigh->setVA(memoryPtr); log("mem: stack top = " + Twine(memoryPtr)); }; - if (config->stackFirst) { + if (ctx.arg.stackFirst) { placeStack(); - if (config->globalBase) { - if (config->globalBase < memoryPtr) { + if (ctx.arg.globalBase) { + if (ctx.arg.globalBase < memoryPtr) { error("--global-base cannot be 
less than stack size when --stack-first is used"); return; } - memoryPtr = config->globalBase; + memoryPtr = ctx.arg.globalBase; } } else { - memoryPtr = config->globalBase; + memoryPtr = ctx.arg.globalBase; } log("mem: global base = " + Twine(memoryPtr)); @@ -385,7 +385,7 @@ void Writer::layoutMemory() { log(formatv("mem: {0,-15} offset={1,-8} size={2,-8} align={3}", seg->name, memoryPtr, seg->size, seg->alignment)); - if (!config->relocatable && seg->isTLS()) { + if (!ctx.arg.relocatable && seg->isTLS()) { if (WasmSym::tlsSize) { auto *tlsSize = cast(WasmSym::tlsSize); setGlobalPtr(tlsSize, seg->size); @@ -394,7 +394,7 @@ void Writer::layoutMemory() { auto *tlsAlign = cast(WasmSym::tlsAlign); setGlobalPtr(tlsAlign, int64_t{1} << seg->alignment); } - if (!config->sharedMemory && WasmSym::tlsBase) { + if (!ctx.arg.sharedMemory && WasmSym::tlsBase) { auto *tlsBase = cast(WasmSym::tlsBase); setGlobalPtr(tlsBase, memoryPtr); } @@ -404,7 +404,7 @@ void Writer::layoutMemory() { } // Make space for the memory initialization flag - if (config->sharedMemory && hasPassiveInitializedSegments()) { + if (ctx.arg.sharedMemory && hasPassiveInitializedSegments()) { memoryPtr = alignTo(memoryPtr, 4); WasmSym::initMemoryFlag = symtab->addSyntheticDataSymbol( "__wasm_init_memory_flag", WASM_SYMBOL_VISIBILITY_HIDDEN); @@ -423,7 +423,7 @@ void Writer::layoutMemory() { if (ctx.isPic) out.dylinkSec->memSize = staticDataSize; - if (!config->stackFirst) + if (!ctx.arg.stackFirst) placeStack(); if (WasmSym::heapBase) { @@ -438,31 +438,31 @@ void Writer::layoutMemory() { } uint64_t maxMemorySetting = 1ULL << 32; - if (config->is64.value_or(false)) { + if (ctx.arg.is64.value_or(false)) { // TODO: Update once we decide on a reasonable limit here: // https://github.com/WebAssembly/memory64/issues/33 maxMemorySetting = 1ULL << 34; } - if (config->initialHeap != 0) { - if (config->initialHeap != alignTo(config->initialHeap, WasmPageSize)) + if (ctx.arg.initialHeap != 0) { + if 
(ctx.arg.initialHeap != alignTo(ctx.arg.initialHeap, WasmPageSize)) error("initial heap must be " + Twine(WasmPageSize) + "-byte aligned"); uint64_t maxInitialHeap = maxMemorySetting - memoryPtr; - if (config->initialHeap > maxInitialHeap) + if (ctx.arg.initialHeap > maxInitialHeap) error("initial heap too large, cannot be greater than " + Twine(maxInitialHeap)); - memoryPtr += config->initialHeap; + memoryPtr += ctx.arg.initialHeap; } - if (config->initialMemory != 0) { - if (config->initialMemory != alignTo(config->initialMemory, WasmPageSize)) + if (ctx.arg.initialMemory != 0) { + if (ctx.arg.initialMemory != alignTo(ctx.arg.initialMemory, WasmPageSize)) error("initial memory must be " + Twine(WasmPageSize) + "-byte aligned"); - if (memoryPtr > config->initialMemory) + if (memoryPtr > ctx.arg.initialMemory) error("initial memory too small, " + Twine(memoryPtr) + " bytes needed"); - if (config->initialMemory > maxMemorySetting) + if (ctx.arg.initialMemory > maxMemorySetting) error("initial memory too large, cannot be greater than " + Twine(maxMemorySetting)); - memoryPtr = config->initialMemory; + memoryPtr = ctx.arg.initialMemory; } memoryPtr = alignTo(memoryPtr, WasmPageSize); @@ -479,23 +479,23 @@ void Writer::layoutMemory() { } uint64_t maxMemory = 0; - if (config->maxMemory != 0) { - if (config->maxMemory != alignTo(config->maxMemory, WasmPageSize)) + if (ctx.arg.maxMemory != 0) { + if (ctx.arg.maxMemory != alignTo(ctx.arg.maxMemory, WasmPageSize)) error("maximum memory must be " + Twine(WasmPageSize) + "-byte aligned"); - if (memoryPtr > config->maxMemory) + if (memoryPtr > ctx.arg.maxMemory) error("maximum memory too small, " + Twine(memoryPtr) + " bytes needed"); - if (config->maxMemory > maxMemorySetting) + if (ctx.arg.maxMemory > maxMemorySetting) error("maximum memory too large, cannot be greater than " + Twine(maxMemorySetting)); - maxMemory = config->maxMemory; - } else if (config->noGrowableMemory) { + maxMemory = ctx.arg.maxMemory; + } else if 
(ctx.arg.noGrowableMemory) { maxMemory = memoryPtr; } // If no maxMemory config was supplied but we are building with // shared memory, we need to pick a sensible upper limit. - if (config->sharedMemory && maxMemory == 0) { + if (ctx.arg.sharedMemory && maxMemory == 0) { if (ctx.isPic) maxMemory = maxMemorySetting; else @@ -552,7 +552,7 @@ void Writer::addSections() { createCustomSections(); addSection(out.linkingSec); - if (config->emitRelocs || config->relocatable) { + if (ctx.arg.emitRelocs || ctx.arg.relocatable) { createRelocSections(); } @@ -583,18 +583,18 @@ void Writer::populateTargetFeatures() { allowed.insert("mutable-globals"); } - if (config->extraFeatures.has_value()) { - auto &extraFeatures = *config->extraFeatures; + if (ctx.arg.extraFeatures.has_value()) { + auto &extraFeatures = *ctx.arg.extraFeatures; allowed.insert(extraFeatures.begin(), extraFeatures.end()); } // Only infer used features if user did not specify features - bool inferFeatures = !config->features.has_value(); + bool inferFeatures = !ctx.arg.features.has_value(); if (!inferFeatures) { - auto &explicitFeatures = *config->features; + auto &explicitFeatures = *ctx.arg.features; allowed.insert(explicitFeatures.begin(), explicitFeatures.end()); - if (!config->checkFeatures) + if (!ctx.arg.checkFeatures) goto done; } @@ -626,10 +626,10 @@ void Writer::populateTargetFeatures() { for (const auto &key : used.keys()) allowed.insert(std::string(key)); - if (!config->checkFeatures) + if (!ctx.arg.checkFeatures) goto done; - if (config->sharedMemory) { + if (ctx.arg.sharedMemory) { if (disallowed.count("shared-mem")) error("--shared-memory is disallowed by " + disallowed["shared-mem"] + " because it was not compiled with 'atomics' or 'bulk-memory' " @@ -679,19 +679,19 @@ void Writer::populateTargetFeatures() { // instruction, then we can also avoid including the segments. 
// Finally, if we are emitting relocations, they may refer to locations within // the bss segments, so these segments need to exist in the binary. - if (config->emitRelocs || - (config->memoryImport.has_value() && !allowed.count("bulk-memory"))) + if (ctx.arg.emitRelocs || + (ctx.arg.memoryImport.has_value() && !allowed.count("bulk-memory"))) ctx.emitBssSegments = true; if (allowed.count("extended-const")) - config->extendedConst = true; + ctx.arg.extendedConst = true; for (auto &feature : allowed) log("Allowed feature: " + feature); } void Writer::checkImportExportTargetFeatures() { - if (config->relocatable || !config->checkFeatures) + if (ctx.arg.relocatable || !ctx.arg.checkFeatures) return; if (out.targetFeaturesSec->features.count("mutable-globals") == 0) { @@ -727,14 +727,14 @@ static bool shouldImport(Symbol *sym) { // When a symbol is weakly defined in a shared library we need to allow // it to be overridden by another module so need to both import // and export the symbol. - if (config->shared && sym->isWeak() && !sym->isUndefined() && + if (ctx.arg.shared && sym->isWeak() && !sym->isUndefined() && !sym->isHidden()) return true; if (sym->isShared()) return true; if (!sym->isUndefined()) return false; - if (sym->isWeak() && !config->relocatable && !ctx.isPic) + if (sym->isWeak() && !ctx.arg.relocatable && !ctx.isPic) return false; // In PIC mode we only need to import functions when they are called directly. 
@@ -745,10 +745,10 @@ static bool shouldImport(Symbol *sym) { return false; } - if (ctx.isPic || config->relocatable || config->importUndefined || - config->unresolvedSymbols == UnresolvedPolicy::ImportDynamic) + if (ctx.isPic || ctx.arg.relocatable || ctx.arg.importUndefined || + ctx.arg.unresolvedSymbols == UnresolvedPolicy::ImportDynamic) return true; - if (config->allowUndefinedSymbols.count(sym->getName()) != 0) + if (ctx.arg.allowUndefinedSymbols.count(sym->getName()) != 0) return true; return sym->isImported(); @@ -773,12 +773,12 @@ void Writer::calculateImports() { } void Writer::calculateExports() { - if (config->relocatable) + if (ctx.arg.relocatable) return; - if (!config->relocatable && config->memoryExport.has_value()) { + if (!ctx.arg.relocatable && ctx.arg.memoryExport.has_value()) { out.exportSec->exports.push_back( - WasmExport{*config->memoryExport, WASM_EXTERNAL_MEMORY, 0}); + WasmExport{*ctx.arg.memoryExport, WASM_EXTERNAL_MEMORY, 0}); } unsigned globalIndex = @@ -827,7 +827,7 @@ void Writer::calculateExports() { } void Writer::populateSymtab() { - if (!config->relocatable && !config->emitRelocs) + if (!ctx.arg.relocatable && !ctx.arg.emitRelocs) return; for (Symbol *sym : symtab->symbols()) @@ -931,13 +931,13 @@ static void finalizeIndirectFunctionTable() { out.importSec->addImport(WasmSym::indirectFunctionTable); } - uint32_t tableSize = config->tableBase + out.elemSec->numEntries(); + uint32_t tableSize = ctx.arg.tableBase + out.elemSec->numEntries(); WasmLimits limits = {0, tableSize, 0}; - if (WasmSym::indirectFunctionTable->isDefined() && !config->growableTable) { + if (WasmSym::indirectFunctionTable->isDefined() && !ctx.arg.growableTable) { limits.Flags |= WASM_LIMITS_FLAG_HAS_MAX; limits.Maximum = limits.Minimum; } - if (config->is64.value_or(false)) + if (ctx.arg.is64.value_or(false)) limits.Flags |= WASM_LIMITS_FLAG_IS_64; WasmSym::indirectFunctionTable->setLimits(limits); } @@ -1001,7 +1001,7 @@ static StringRef 
getOutputDataSegmentName(const InputChunk &seg) { // symbols are be relative to single __tls_base. if (seg.isTLS()) return ".tdata"; - if (!config->mergeDataSegments) + if (!ctx.arg.mergeDataSegments) return seg.name; if (seg.name.starts_with(".text.")) return ".text"; @@ -1017,9 +1017,9 @@ static StringRef getOutputDataSegmentName(const InputChunk &seg) { OutputSegment *Writer::createOutputSegment(StringRef name) { LLVM_DEBUG(dbgs() << "new segment: " << name << "\n"); OutputSegment *s = make(name); - if (config->sharedMemory) + if (ctx.arg.sharedMemory) s->initFlags = WASM_DATA_SEGMENT_IS_PASSIVE; - if (!config->relocatable && name.starts_with(".bss")) + if (!ctx.arg.relocatable && name.starts_with(".bss")) s->isBss = true; segments.push_back(s); return s; @@ -1035,7 +1035,7 @@ void Writer::createOutputSegments() { // When running in relocatable mode we can't merge segments that are part // of comdat groups since the ultimate linker needs to be able exclude or // include them individually. - if (config->relocatable && !segment->getComdatName().empty()) { + if (ctx.arg.relocatable && !segment->getComdatName().empty()) { s = createOutputSegment(name); } else { if (segmentMap.count(name) == 0) @@ -1075,8 +1075,8 @@ void Writer::combineOutputSegments() { // combines all data segments into a single .data segment. // This restriction does not apply when the extended const extension is // available: https://github.com/WebAssembly/extended-const - assert(!config->extendedConst); - assert(ctx.isPic && !config->sharedMemory); + assert(!ctx.arg.extendedConst); + assert(ctx.isPic && !ctx.arg.sharedMemory); if (segments.size() <= 1) return; OutputSegment *combined = make(".data"); @@ -1117,7 +1117,7 @@ static void createFunction(DefinedFunction *func, StringRef bodyContent) { bool Writer::needsPassiveInitialization(const OutputSegment *segment) { // If bulk memory features is supported then we can perform bss initialization // (via memory.fill) during `__wasm_init_memory`. 
- if (config->memoryImport.has_value() && !segment->requiredInBinary()) + if (ctx.arg.memoryImport.has_value() && !segment->requiredInBinary()) return true; return segment->initFlags & WASM_DATA_SEGMENT_IS_PASSIVE; } @@ -1129,7 +1129,7 @@ bool Writer::hasPassiveInitializedSegments() { } void Writer::createSyntheticInitFunctions() { - if (config->relocatable) + if (ctx.arg.relocatable) return; static WasmSignature nullSignature = {{}, {}}; @@ -1146,14 +1146,14 @@ void Writer::createSyntheticInitFunctions() { "__wasm_init_memory", WASM_SYMBOL_VISIBILITY_HIDDEN, make(nullSignature, "__wasm_init_memory")); WasmSym::initMemory->markLive(); - if (config->sharedMemory) { + if (ctx.arg.sharedMemory) { // This global is assigned during __wasm_init_memory in the shared memory // case. WasmSym::tlsBase->markLive(); } } - if (config->sharedMemory) { + if (ctx.arg.sharedMemory) { if (out.globalSec->needsTLSRelocations()) { WasmSym::applyGlobalTLSRelocs = symtab->addSyntheticFunction( "__wasm_apply_global_tls_relocs", WASM_SYMBOL_VISIBILITY_HIDDEN, @@ -1203,11 +1203,11 @@ void Writer::createInitMemoryFunction() { assert(WasmSym::initMemory); assert(hasPassiveInitializedSegments()); uint64_t flagAddress; - if (config->sharedMemory) { + if (ctx.arg.sharedMemory) { assert(WasmSym::initMemoryFlag); flagAddress = WasmSym::initMemoryFlag->getVA(); } - bool is64 = config->is64.value_or(false); + bool is64 = ctx.arg.is64.value_or(false); std::string bodyContent; { raw_string_ostream os(bodyContent); @@ -1271,7 +1271,7 @@ void Writer::createInitMemoryFunction() { } }; - if (config->sharedMemory) { + if (ctx.arg.sharedMemory) { // With PIC code we cache the flag address in local 0 if (ctx.isPic) { writeUleb128(os, 1, "num local decls"); @@ -1334,7 +1334,7 @@ void Writer::createInitMemoryFunction() { // When we initialize the TLS segment we also set the `__tls_base` // global. This allows the runtime to use this static copy of the // TLS data for the first/main thread. 
- if (config->sharedMemory && s->isTLS()) { + if (ctx.arg.sharedMemory && s->isTLS()) { if (ctx.isPic) { // Cache the result of the addionion in local 0 writeU8(os, WASM_OPCODE_LOCAL_TEE, "local.tee"); @@ -1368,7 +1368,7 @@ void Writer::createInitMemoryFunction() { } } - if (config->sharedMemory) { + if (ctx.arg.sharedMemory) { // Set flag to 2 to mark end of initialization writeGetFlagAddress(); writeI32Const(os, 2, "flag value"); @@ -1407,7 +1407,7 @@ void Writer::createInitMemoryFunction() { if (needsPassiveInitialization(s) && !s->isBss) { // The TLS region should not be dropped since its is needed // during the initialization of each thread (__wasm_init_tls). - if (config->sharedMemory && s->isTLS()) + if (ctx.arg.sharedMemory && s->isTLS()) continue; // data.drop instruction writeU8(os, WASM_OPCODE_MISC_PREFIX, "bulk-memory prefix"); @@ -1460,7 +1460,7 @@ void Writer::createApplyDataRelocationsFunction() { writeUleb128(os, 0, "num locals"); bool generated = false; for (const OutputSegment *seg : segments) - if (!config->sharedMemory || !seg->isTLS()) + if (!ctx.arg.sharedMemory || !seg->isTLS()) for (const InputChunk *inSeg : seg->inputSegments) generated |= inSeg->generateRelocationCode(os); @@ -1656,7 +1656,7 @@ void Writer::createInitTLSFunction() { // This is then used either when creating the output linking section or to // synthesize the "__wasm_call_ctors" function. void Writer::calculateInitFunctions() { - if (!config->relocatable && !WasmSym::callCtors->isLive()) + if (!ctx.arg.relocatable && !WasmSym::callCtors->isLive()) return; for (ObjFile *file : ctx.objectFiles) { @@ -1708,7 +1708,7 @@ void Writer::run() { // For PIC code the table base is assigned dynamically by the loader. // For non-PIC, we start at 1 so that accessing table index 0 always traps. 
if (!ctx.isPic && WasmSym::definedTableBase) - WasmSym::definedTableBase->setVA(config->tableBase); + WasmSym::definedTableBase->setVA(ctx.arg.tableBase); log("-- createOutputSegments"); createOutputSegments(); @@ -1717,7 +1717,7 @@ void Writer::run() { log("-- layoutMemory"); layoutMemory(); - if (!config->relocatable) { + if (!ctx.arg.relocatable) { // Create linker synthesized __start_SECNAME/__stop_SECNAME symbols // This has to be done after memory layout is performed. for (const OutputSegment *seg : segments) { @@ -1725,7 +1725,7 @@ void Writer::run() { } } - for (auto &pair : config->exportedSymbols) { + for (auto &pair : ctx.arg.exportedSymbols) { Symbol *sym = symtab->find(pair.first()); if (sym && sym->isDefined()) sym->forceExport = true; @@ -1733,12 +1733,12 @@ void Writer::run() { // Delay reporting errors about explicit exports until after // addStartStopSymbols which can create optional symbols. - for (auto &name : config->requiredExports) { + for (auto &name : ctx.arg.requiredExports) { Symbol *sym = symtab->find(name); if (!sym || !sym->isDefined()) { - if (config->unresolvedSymbols == UnresolvedPolicy::ReportError) + if (ctx.arg.unresolvedSymbols == UnresolvedPolicy::ReportError) error(Twine("symbol exported via --export not found: ") + name); - if (config->unresolvedSymbols == UnresolvedPolicy::Warn) + if (ctx.arg.unresolvedSymbols == UnresolvedPolicy::Warn) warn(Twine("symbol exported via --export not found: ") + name); } } @@ -1750,7 +1750,7 @@ void Writer::run() { // `__memory_base` import. Unless we support the extended const expression we // can't do addition inside the constant expression, so we much combine the // segments into a single one that can live at `__memory_base`. - if (ctx.isPic && !config->extendedConst && !config->sharedMemory) { + if (ctx.isPic && !ctx.arg.extendedConst && !ctx.arg.sharedMemory) { // In shared memory mode all data segments are passive and initialized // via __wasm_init_memory. 
log("-- combineOutputSegments"); @@ -1774,7 +1774,7 @@ void Writer::run() { log("-- calculateInitFunctions"); calculateInitFunctions(); - if (!config->relocatable) { + if (!ctx.arg.relocatable) { // Create linker synthesized functions if (WasmSym::applyGlobalRelocs) createApplyGlobalRelocationsFunction(); @@ -1793,7 +1793,7 @@ void Writer::run() { // If the input contains a call to `__wasm_call_ctors`, either in one of // the input objects or an explicit export from the command-line, we // assume ctors and dtors are taken care of already. - if (!config->relocatable && !ctx.isPic && + if (!ctx.arg.relocatable && !ctx.isPic && !WasmSym::callCtors->isUsedInRegularObj && !WasmSym::callCtors->isExported()) { log("-- createCommandExportWrappers"); @@ -1861,14 +1861,14 @@ void Writer::run() { // Open a result file. void Writer::openFile() { - log("writing: " + config->outputFile); + log("writing: " + ctx.arg.outputFile); Expected> bufferOrErr = - FileOutputBuffer::create(config->outputFile, fileSize, + FileOutputBuffer::create(ctx.arg.outputFile, fileSize, FileOutputBuffer::F_executable); if (!bufferOrErr) - error("failed to open " + config->outputFile + ": " + + error("failed to open " + ctx.arg.outputFile + ": " + toString(bufferOrErr.takeError())); else buffer = std::move(*bufferOrErr); From 7531672712b0fb517f1818d512fbdfa6feed4232 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Thu, 2 Jan 2025 17:37:44 -0800 Subject: [PATCH 317/567] [flang][cuda][NFC] Remove unused variable (#121533) Failed buildbot after https://github.com/llvm/llvm-project/pull/121524 --- flang/lib/Optimizer/Transforms/CUFOpConversion.cpp | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp index f08f9e412b885..8c525fc6daff5 
100644 --- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp @@ -792,10 +792,6 @@ struct CUFSyncDescriptorOpConversion : public mlir::OpRewritePattern { using OpRewritePattern::OpRewritePattern; - CUFSyncDescriptorOpConversion(mlir::MLIRContext *context, - const mlir::SymbolTable &symTab) - : OpRewritePattern(context), symTab{symTab} {} - mlir::LogicalResult matchAndRewrite(cuf::SyncDescriptorOp op, mlir::PatternRewriter &rewriter) const override { @@ -822,9 +818,6 @@ struct CUFSyncDescriptorOpConversion op.erase(); return mlir::success(); } - -private: - const mlir::SymbolTable &symTab; }; class CUFOpConversion : public fir::impl::CUFOpConversionBase { @@ -887,11 +880,11 @@ void cuf::populateCUFToFIRConversionPatterns( const mlir::SymbolTable &symtab, mlir::RewritePatternSet &patterns) { patterns.insert(patterns.getContext(), &dl, &converter); patterns.insert(patterns.getContext()); + CUFFreeOpConversion, CUFSyncDescriptorOpConversion>( + patterns.getContext()); patterns.insert(patterns.getContext(), symtab, &dl, &converter); - patterns.insert( - patterns.getContext(), symtab); + patterns.insert(patterns.getContext(), symtab); } void cuf::populateFIRCUFConversionPatterns(const mlir::SymbolTable &symtab, From c1ecc0d168ad122d858dd5fec475da391f97e959 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alejandro=20=C3=81lvarez=20Ayll=C3=B3n?= Date: Fri, 3 Jan 2025 02:43:53 +0100 Subject: [PATCH 318/567] [clang] Allow generating module interfaces with parsing errors (#121485) Fixes a regression introduced in commit da00c60dae0040185dc45039c4397f6e746548e9 This functionality was originally added in commit 5834996fefc937d6211dc8c8a5b200068753391a Co-authored-by: Tomasz Kaminski --- clang/include/clang/Serialization/ASTWriter.h | 13 ++++++---- clang/lib/Frontend/FrontendActions.cpp | 6 +++-- clang/lib/Serialization/GeneratePCH.cpp | 5 ++-- clang/test/Modules/pcm-with-errors.cpp | 26 +++++++++++++++++++ 4 files changed, 41 
insertions(+), 9 deletions(-) create mode 100644 clang/test/Modules/pcm-with-errors.cpp diff --git a/clang/include/clang/Serialization/ASTWriter.h b/clang/include/clang/Serialization/ASTWriter.h index cb972f0106402..adb7cce522a80 100644 --- a/clang/include/clang/Serialization/ASTWriter.h +++ b/clang/include/clang/Serialization/ASTWriter.h @@ -997,13 +997,15 @@ class CXX20ModulesGenerator : public PCHGenerator { virtual Module *getEmittingModule(ASTContext &Ctx) override; CXX20ModulesGenerator(Preprocessor &PP, InMemoryModuleCache &ModuleCache, - StringRef OutputFile, bool GeneratingReducedBMI); + StringRef OutputFile, bool GeneratingReducedBMI, + bool AllowASTWithErrors); public: CXX20ModulesGenerator(Preprocessor &PP, InMemoryModuleCache &ModuleCache, - StringRef OutputFile) + StringRef OutputFile, bool AllowASTWithErrors = false) : CXX20ModulesGenerator(PP, ModuleCache, OutputFile, - /*GeneratingReducedBMI=*/false) {} + /*GeneratingReducedBMI=*/false, + AllowASTWithErrors) {} void HandleTranslationUnit(ASTContext &Ctx) override; }; @@ -1013,9 +1015,10 @@ class ReducedBMIGenerator : public CXX20ModulesGenerator { public: ReducedBMIGenerator(Preprocessor &PP, InMemoryModuleCache &ModuleCache, - StringRef OutputFile) + StringRef OutputFile, bool AllowASTWithErrors = false) : CXX20ModulesGenerator(PP, ModuleCache, OutputFile, - /*GeneratingReducedBMI=*/true) {} + /*GeneratingReducedBMI=*/true, + AllowASTWithErrors) {} }; /// If we can elide the definition of \param D in reduced BMI. 
diff --git a/clang/lib/Frontend/FrontendActions.cpp b/clang/lib/Frontend/FrontendActions.cpp index e943f143d4c15..30dfa5481d070 100644 --- a/clang/lib/Frontend/FrontendActions.cpp +++ b/clang/lib/Frontend/FrontendActions.cpp @@ -279,12 +279,14 @@ GenerateModuleInterfaceAction::CreateASTConsumer(CompilerInstance &CI, !CI.getFrontendOpts().ModuleOutputPath.empty()) { Consumers.push_back(std::make_unique( CI.getPreprocessor(), CI.getModuleCache(), - CI.getFrontendOpts().ModuleOutputPath)); + CI.getFrontendOpts().ModuleOutputPath, + +CI.getFrontendOpts().AllowPCMWithCompilerErrors)); } Consumers.push_back(std::make_unique( CI.getPreprocessor(), CI.getModuleCache(), - CI.getFrontendOpts().OutputFile)); + CI.getFrontendOpts().OutputFile, + +CI.getFrontendOpts().AllowPCMWithCompilerErrors)); return std::make_unique(std::move(Consumers)); } diff --git a/clang/lib/Serialization/GeneratePCH.cpp b/clang/lib/Serialization/GeneratePCH.cpp index 7a8a951b34f25..a3189bb40b191 100644 --- a/clang/lib/Serialization/GeneratePCH.cpp +++ b/clang/lib/Serialization/GeneratePCH.cpp @@ -102,12 +102,13 @@ void PCHGenerator::anchor() {} CXX20ModulesGenerator::CXX20ModulesGenerator(Preprocessor &PP, InMemoryModuleCache &ModuleCache, StringRef OutputFile, - bool GeneratingReducedBMI) + bool GeneratingReducedBMI, + bool AllowASTWithErrors) : PCHGenerator( PP, ModuleCache, OutputFile, llvm::StringRef(), std::make_shared(), /*Extensions=*/ArrayRef>(), - /*AllowASTWithErrors*/ false, /*IncludeTimestamps=*/false, + AllowASTWithErrors, /*IncludeTimestamps=*/false, /*BuildingImplicitModule=*/false, /*ShouldCacheASTInMemory=*/false, GeneratingReducedBMI) {} diff --git a/clang/test/Modules/pcm-with-errors.cpp b/clang/test/Modules/pcm-with-errors.cpp new file mode 100644 index 0000000000000..1bbc3865ee3ee --- /dev/null +++ b/clang/test/Modules/pcm-with-errors.cpp @@ -0,0 +1,26 @@ +// RUN: rm -rf %t +// RUN: split-file %s %t +// RUN: cd %t + +// RUN: %clang_cc1 -std=c++23 m.cppm -emit-module-interface -o 
m.pcm -fallow-pcm-with-compiler-errors -verify +// RUN: %clang_cc1 -std=c++23 main.cpp -fmodule-file=m=m.pcm -verify -fallow-pcm-with-compiler-errors -verify + +// RUN: %clang_cc1 -std=c++23 m.cppm -fmodules-reduced-bmi -emit-module-interface -o m.pcm -fallow-pcm-with-compiler-errors -verify +// RUN: %clang_cc1 -std=c++23 main.cpp -fmodule-file=m=m.pcm -verify -fallow-pcm-with-compiler-errors -verify + +//--- m.cppm +export module m; + +export int f() { + return 0; +} + +export struct Foo { + __Int bar; // expected-error {{unknown type name '__Int'}} +}; + +//--- main.cpp +// expected-no-diagnostics +import m; // ok + +static_assert(__is_same(decltype(f), int())); // ok From e8cf41311fe6940e096d3c9e8a43338b47cb8b2a Mon Sep 17 00:00:00 2001 From: gulfemsavrun Date: Thu, 2 Jan 2025 18:34:02 -0800 Subject: [PATCH 319/567] Revert "[compiler-rt][rtsan] fopencookie support." (#121537) Reverts llvm/llvm-project#120864 because it broke building compiler-rt on Mac. https://luci-milo.appspot.com/ui/p/fuchsia/builders/toolchain.ci/clang-mac-arm64/b8726812736235038609/overview --- .../lib/rtsan/rtsan_interceptors_posix.cpp | 7 ------ .../tests/rtsan_test_interceptors_posix.cpp | 23 ------------------- 2 files changed, 30 deletions(-) diff --git a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp index 072923ab35ae0..4e51f464b5730 100644 --- a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp @@ -297,12 +297,6 @@ INTERCEPTOR(FILE *, fdopen, int fd, const char *mode) { return REAL(fdopen)(fd, mode); } -INTERCEPTOR(FILE *, fopencookie, void *cookie, const char *mode, - cookie_io_functions_t funcs) { - __rtsan_notify_intercepted_call("fopencookie"); - return REAL(fopencookie)(cookie, mode, funcs); -} - #if SANITIZER_INTERCEPT_OPEN_MEMSTREAM INTERCEPTOR(FILE *, open_memstream, char **buf, size_t *size) { __rtsan_notify_intercepted_call("open_memstream"); @@ -978,7 +972,6 
@@ void __rtsan::InitializeInterceptors() { INTERCEPT_FUNCTION(fputs); INTERCEPT_FUNCTION(fdopen); INTERCEPT_FUNCTION(freopen); - INTERCEPT_FUNCTION(fopencookie); RTSAN_MAYBE_INTERCEPT_OPEN_MEMSTREAM; RTSAN_MAYBE_INTERCEPT_FMEMOPEN; INTERCEPT_FUNCTION(lseek); diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp index c9c4d7fc4e99e..b052dd859dcdf 100644 --- a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp @@ -353,29 +353,6 @@ TEST_F(RtsanFileTest, FopenDiesWhenRealtime) { ExpectNonRealtimeSurvival(Func); } -TEST_F(RtsanFileTest, FopenCookieDieWhenRealtime) { - FILE *f = fopen(GetTemporaryFilePath(), "w"); - EXPECT_THAT(f, Ne(nullptr)); - struct fholder { - FILE *fp; - size_t read; - } fh = {f, 0}; - auto CookieRead = [this](void *cookie, char *buf, size_t size) { - fholder *p = reinterpret_cast(cookie); - p->read = fread(static_cast(buf), 1, size, p->fp); - EXPECT_NE(0, p->read); - }; - cookie_io_functions_t funcs = {(cookie_read_function_t *)&CookieRead, nullptr, - nullptr, nullptr}; - auto Func = [&fh, &funcs]() { - FILE *f = fopencookie(&fh, "w", funcs); - EXPECT_THAT(f, Ne(nullptr)); - }; - - ExpectRealtimeDeath(Func, "fopencookie"); - ExpectNonRealtimeSurvival(Func); -} - #if SANITIZER_INTERCEPT_OPEN_MEMSTREAM TEST_F(RtsanFileTest, OpenMemstreamDiesWhenRealtime) { char *buffer; From 510a5c7fc25b2a3c33679480131cd60049747dd1 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 2 Jan 2025 18:46:44 -0800 Subject: [PATCH 320/567] [ELF] Fix .gnu.version crash when .dynsym is discarded Fix #88650 In addition, delete the unneeded comment. 
https://sourceware.org/gnu-gabi/program-loading-and-dynamic-linking.txt --- lld/ELF/SyntheticSections.cpp | 5 ++- .../ELF/linkerscript/discard-section-dynsym.s | 33 +++++++++++++++---- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index baa7a083404fe..10cbfe19b3b0a 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -3798,9 +3798,8 @@ VersionTableSection::VersionTableSection(Ctx &ctx) } void VersionTableSection::finalizeContents() { - // At the moment of june 2016 GNU docs does not mention that sh_link field - // should be set, but Sun docs do. Also readelf relies on this field. - getParent()->link = getPartition(ctx).dynSymTab->getParent()->sectionIndex; + if (OutputSection *osec = getPartition(ctx).dynSymTab->getParent()) + getParent()->link = osec->sectionIndex; } size_t VersionTableSection::getSize() const { diff --git a/lld/test/ELF/linkerscript/discard-section-dynsym.s b/lld/test/ELF/linkerscript/discard-section-dynsym.s index 7c7c9c29cee84..f5d483dca86ec 100644 --- a/lld/test/ELF/linkerscript/discard-section-dynsym.s +++ b/lld/test/ELF/linkerscript/discard-section-dynsym.s @@ -1,24 +1,43 @@ # REQUIRES: aarch64 ## We allow discarding .dynsym, check we don't crash. 
-# RUN: llvm-mc -filetype=obj -triple=aarch64 %s -o %t.o +# RUN: rm -rf %t && split-file %s %t && cd %t +# RUN: llvm-mc -filetype=obj -triple=aarch64 a.s -o a.o +# RUN: llvm-mc -filetype=obj -triple=aarch64 c.s -o c.o +# RUN: ld.lld -shared --version-script=c.ver c.o -o c.so -# RUN: echo 'SECTIONS { /DISCARD/ : { *(.dynsym) } }' > %t.lds -# RUN: ld.lld -shared -T %t.lds %t.o -o %t.so -# RUN: llvm-readelf -r %t.so | FileCheck %s +# RUN: echo 'SECTIONS { /DISCARD/ : { *(.dynsym) } }' > 1.lds +# RUN: ld.lld -shared -T 1.lds a.o c.so -o out1.so +# RUN: llvm-readelf -Sr out1.so | FileCheck %s --check-prefixes=CHECK,CHECK1 -# RUN: echo 'SECTIONS { /DISCARD/ : { *(.dynsym .dynstr) } }' > %t.lds -# RUN: ld.lld -shared -T %t.lds %t.o -o %t.so -# RUN: llvm-readelf -r %t.so | FileCheck %s +# RUN: echo 'SECTIONS { /DISCARD/ : { *(.dynsym .dynstr) } }' > 2.lds +# RUN: ld.lld -shared -T 2.lds a.o c.so -o out2.so +# RUN: llvm-readelf -Sr out2.so | FileCheck %s --check-prefixes=CHECK,CHECK2 + +# CHECK: [Nr] Name Type Address Off Size ES Flg Lk Inf Al +# CHECK-NEXT: [ 0] NULL 0000000000000000 000000 000000 00 0 0 0 +# CHECK-NEXT: [ 1] .gnu.version VERSYM 0000000000000000 {{.*}} 000006 02 A 0 0 2 +# CHECK1-NEXT: [ 2] .gnu.version_r VERNEED 0000000000000008 {{.*}} 000020 00 A 5 1 4 +# CHECK2-NEXT: [ 2] .gnu.version_r VERNEED 0000000000000008 {{.*}} 000020 00 A 0 1 4 +# CHECK1: [ 5] .dynstr STRTAB # CHECK: contains 2 entries: # CHECK: R_AARCH64_RELATIVE [[#]] # CHECK-NEXT: R_AARCH64_GLOB_DAT 0{{$}} +#--- a.s adrp x9, :got:var ldr x9, [x9, :got_lo12:var] + bl __libc_start_main .data .align 8 foo: .quad foo + +#--- c.s +.globl __libc_start_main +__libc_start_main: + +#--- c.ver +GLIBC_2.34 { __libc_start_main; }; From 9df375e5eae726c5a90ada70f9535a5e22e90214 Mon Sep 17 00:00:00 2001 From: YAMAMOTO Takashi Date: Fri, 3 Jan 2025 11:53:21 +0900 Subject: [PATCH 321/567] [lld][WebAssembly] Fix non-pie dynamic-linking executable (#108146) The commit 22b7b84860d39da71964c9b329937f2ee1d875ba 
made the symbols provided by shared libraries "defined", and thus effectively made it impossible to generate non-pie dynamically linked executables using --unresolved-symbols=import-dynamic. This commit, based on https://github.com/llvm/llvm-project/pull/109249, fixes it by checking sym->isShared() explictly. (as a bonus, you don't need to rely on --unresolved-symbols=import-dynamic anymore.) Fixes https://github.com/llvm/llvm-project/issues/107387 --- lld/test/wasm/dylink-non-pie.s | 38 ++++++++++++++++++++++++++++++++++ lld/wasm/Relocations.cpp | 2 +- 2 files changed, 39 insertions(+), 1 deletion(-) create mode 100755 lld/test/wasm/dylink-non-pie.s diff --git a/lld/test/wasm/dylink-non-pie.s b/lld/test/wasm/dylink-non-pie.s new file mode 100755 index 0000000000000..3157b8c32120f --- /dev/null +++ b/lld/test/wasm/dylink-non-pie.s @@ -0,0 +1,38 @@ +# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.lib.o %p/Inputs/ret32.s +# RUN: wasm-ld -m wasm32 --experimental-pic -shared --no-entry %t.lib.o -o %t.lib.so +# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s +# RUN: wasm-ld -m wasm32 -Bdynamic %t.o %t.lib.so -o %t.wasm +# RUN: obj2yaml %t.wasm | FileCheck %s +# RUN: llvm-objdump -d --no-show-raw-insn --no-leading-addr %t.wasm | FileCheck %s --check-prefixes DIS + + .functype ret32 (f32) -> (i32) + .globl _start +_start: + .functype _start () -> () + i32.const f_p + drop + end_function + + .section .data.f_p,"",@ +f_p: + .int32 ret32 + .size f_p, 4 + +# CHECK: Sections: +# CHECK-NEXT: - Type: CUSTOM +# CHECK-NEXT: Name: dylink.0 + +# non-pie executable doesn't import __memory_base +# CHECK: - Type: IMPORT +# CHECK-NOT: Field: __memory_base + +# CHECK: - Type: EXPORT +# CHECK: - Name: __wasm_apply_data_relocs +# CHECK-NEXT: Kind: FUNCTION + +# DIS: <__wasm_apply_data_relocs>: +# DIS-EMPTY: +# DIS-NEXT: i32.const 1024 +# DIS-NEXT: global.get 0 +# DIS-NEXT: i32.store 0 +# DIS-NEXT: end diff --git a/lld/wasm/Relocations.cpp 
b/lld/wasm/Relocations.cpp index 745dfde76ab70..52888ad25034e 100644 --- a/lld/wasm/Relocations.cpp +++ b/lld/wasm/Relocations.cpp @@ -144,7 +144,7 @@ void scanRelocations(InputChunk *chunk) { break; } - if (ctx.isPic || + if (ctx.isPic || sym->isShared() || (sym->isUndefined() && ctx.arg.unresolvedSymbols == UnresolvedPolicy::ImportDynamic)) { switch (reloc.Type) { From e4372c4454c963c9f52dbf2a10229797f3f1e6fc Mon Sep 17 00:00:00 2001 From: ZhaoQi Date: Fri, 3 Jan 2025 11:23:44 +0800 Subject: [PATCH 322/567] [LoongArch] Pre-commit tests for tls-desc scheduling. NFC (#121538) Code sequence for tls-desc in large code model is not expected to be scheduled according to psABI 2.30. A later commit will fix it. --- .../LoongArch/psabi-restricted-scheduling.ll | 75 ++++++++++++++++++- 1 file changed, 74 insertions(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll index c7de3dcf2ecfd..1773b8e014997 100644 --- a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll +++ b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll @@ -1,4 +1,3 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=medium --relocation-model=pic --post-RA-scheduler=0 < %s \ ; RUN: | FileCheck %s --check-prefix=MEDIUM_NO_SCH ; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=medium --relocation-model=pic --post-RA-scheduler=1 < %s \ @@ -7,6 +6,14 @@ ; RUN: | FileCheck %s --check-prefix=LARGE_NO_SCH ; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=large --relocation-model=pic --post-RA-scheduler=1 < %s \ ; RUN: | FileCheck %s --check-prefix=LARGE_SCH +; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=medium --relocation-model=pic --enable-tlsdesc \ +; RUN: --post-RA-scheduler=0 < %s | FileCheck %s --check-prefix=MEDIUMDESC_NO_SCH +; RUN: llc --mtriple=loongarch64 
-mattr=+d --code-model=medium --relocation-model=pic --enable-tlsdesc \ +; RUN: --post-RA-scheduler=1 < %s | FileCheck %s --check-prefix=MEDIUMDESC_SCH +; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=large --relocation-model=pic --enable-tlsdesc \ +; RUN: --post-RA-scheduler=0 < %s | FileCheck %s --check-prefix=LARGEDESC_NO_SCH +; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=large --relocation-model=pic --enable-tlsdesc \ +; RUN: --post-RA-scheduler=1 < %s | FileCheck %s --check-prefix=LARGEDESC_SCH @g = dso_local global i64 zeroinitializer, align 4 @G = global i64 zeroinitializer, align 4 @@ -194,3 +201,69 @@ define void @foo() nounwind { %v_ie = load volatile i64, ptr @ie ret void } + +define void @baz() nounwind { +; MEDIUMDESC_NO_SCH-LABEL: baz: +; MEDIUMDESC_NO_SCH: # %bb.0: +; MEDIUMDESC_NO_SCH-NEXT: addi.d $sp, $sp, -16 +; MEDIUMDESC_NO_SCH-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; MEDIUMDESC_NO_SCH-NEXT: pcalau12i $a0, %desc_pc_hi20(gd) +; MEDIUMDESC_NO_SCH-NEXT: addi.d $a0, $a0, %desc_pc_lo12(gd) +; MEDIUMDESC_NO_SCH-NEXT: ld.d $ra, $a0, %desc_ld(gd) +; MEDIUMDESC_NO_SCH-NEXT: jirl $ra, $ra, %desc_call(gd) +; MEDIUMDESC_NO_SCH-NEXT: add.d $a0, $a0, $tp +; MEDIUMDESC_NO_SCH-NEXT: ld.d $zero, $a0, 0 +; MEDIUMDESC_NO_SCH-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; MEDIUMDESC_NO_SCH-NEXT: addi.d $sp, $sp, 16 +; MEDIUMDESC_NO_SCH-NEXT: ret +; +; MEDIUMDESC_SCH-LABEL: baz: +; MEDIUMDESC_SCH: # %bb.0: +; MEDIUMDESC_SCH-NEXT: addi.d $sp, $sp, -16 +; MEDIUMDESC_SCH-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; MEDIUMDESC_SCH-NEXT: pcalau12i $a0, %desc_pc_hi20(gd) +; MEDIUMDESC_SCH-NEXT: addi.d $a0, $a0, %desc_pc_lo12(gd) +; MEDIUMDESC_SCH-NEXT: ld.d $ra, $a0, %desc_ld(gd) +; MEDIUMDESC_SCH-NEXT: jirl $ra, $ra, %desc_call(gd) +; MEDIUMDESC_SCH-NEXT: add.d $a0, $a0, $tp +; MEDIUMDESC_SCH-NEXT: ld.d $zero, $a0, 0 +; MEDIUMDESC_SCH-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; MEDIUMDESC_SCH-NEXT: addi.d $sp, $sp, 16 +; 
MEDIUMDESC_SCH-NEXT: ret +; +; LARGEDESC_NO_SCH-LABEL: baz: +; LARGEDESC_NO_SCH: # %bb.0: +; LARGEDESC_NO_SCH-NEXT: addi.d $sp, $sp, -16 +; LARGEDESC_NO_SCH-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LARGEDESC_NO_SCH-NEXT: pcalau12i $a0, %desc_pc_hi20(gd) +; LARGEDESC_NO_SCH-NEXT: addi.d $a1, $zero, %desc_pc_lo12(gd) +; LARGEDESC_NO_SCH-NEXT: lu32i.d $a1, %desc64_pc_lo20(gd) +; LARGEDESC_NO_SCH-NEXT: lu52i.d $a1, $a1, %desc64_pc_hi12(gd) +; LARGEDESC_NO_SCH-NEXT: add.d $a0, $a0, $a1 +; LARGEDESC_NO_SCH-NEXT: ld.d $ra, $a0, %desc_ld(gd) +; LARGEDESC_NO_SCH-NEXT: jirl $ra, $ra, %desc_call(gd) +; LARGEDESC_NO_SCH-NEXT: add.d $a0, $a0, $tp +; LARGEDESC_NO_SCH-NEXT: ld.d $zero, $a0, 0 +; LARGEDESC_NO_SCH-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LARGEDESC_NO_SCH-NEXT: addi.d $sp, $sp, 16 +; LARGEDESC_NO_SCH-NEXT: ret +; +; LARGEDESC_SCH-LABEL: baz: +; LARGEDESC_SCH: # %bb.0: +; LARGEDESC_SCH-NEXT: addi.d $sp, $sp, -16 +; LARGEDESC_SCH-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LARGEDESC_SCH-NEXT: addi.d $a1, $zero, %desc_pc_lo12(gd) +; LARGEDESC_SCH-NEXT: pcalau12i $a0, %desc_pc_hi20(gd) +; LARGEDESC_SCH-NEXT: lu32i.d $a1, %desc64_pc_lo20(gd) +; LARGEDESC_SCH-NEXT: lu52i.d $a1, $a1, %desc64_pc_hi12(gd) +; LARGEDESC_SCH-NEXT: add.d $a0, $a0, $a1 +; LARGEDESC_SCH-NEXT: ld.d $ra, $a0, %desc_ld(gd) +; LARGEDESC_SCH-NEXT: jirl $ra, $ra, %desc_call(gd) +; LARGEDESC_SCH-NEXT: add.d $a0, $a0, $tp +; LARGEDESC_SCH-NEXT: ld.d $zero, $a0, 0 +; LARGEDESC_SCH-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LARGEDESC_SCH-NEXT: addi.d $sp, $sp, 16 +; LARGEDESC_SCH-NEXT: ret + %v_gd = load volatile i64, ptr @gd + ret void +} From 56e944bede9654127cc210506a6cccdd43cd96e7 Mon Sep 17 00:00:00 2001 From: Alex MacLean Date: Thu, 2 Jan 2025 20:13:18 -0800 Subject: [PATCH 323/567] [NFC] add anonymous namespace to a couple classes (#121511) This ensures these classes are visible only to the appropriate translation unit and allows for more optimizations. 
--- llvm/lib/IR/SafepointIRVerifier.cpp | 2 ++ .../Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp | 2 ++ 2 files changed, 4 insertions(+) diff --git a/llvm/lib/IR/SafepointIRVerifier.cpp b/llvm/lib/IR/SafepointIRVerifier.cpp index ed99d05975c23..d32852b796c20 100644 --- a/llvm/lib/IR/SafepointIRVerifier.cpp +++ b/llvm/lib/IR/SafepointIRVerifier.cpp @@ -289,6 +289,7 @@ static void PrintValueSet(raw_ostream &OS, IteratorTy Begin, IteratorTy End) { using AvailableValueSet = DenseSet; +namespace { /// State we compute and track per basic block. struct BasicBlockState { // Set of values available coming in, before the phi nodes @@ -305,6 +306,7 @@ struct BasicBlockState { // contribute to AvailableOut. bool Cleared = false; }; +} // namespace /// A given derived pointer can have multiple base pointers through phi/selects. /// This type indicates when the base pointer is exclusively constant diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index 45ee2d472a11b..12ae6740e055e 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -181,6 +181,7 @@ static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT) { /// the bit indexes (Mask) needed by a masked compare. If we're matching a chain /// of 'and' ops, then we also need to capture the fact that we saw an /// "and X, 1", so that's an extra return value for that case. 
+namespace { struct MaskOps { Value *Root = nullptr; APInt Mask; @@ -190,6 +191,7 @@ struct MaskOps { MaskOps(unsigned BitWidth, bool MatchAnds) : Mask(APInt::getZero(BitWidth)), MatchAndChain(MatchAnds) {} }; +} // namespace /// This is a recursive helper for foldAnyOrAllBitsSet() that walks through a /// chain of 'and' or 'or' instructions looking for shift ops of a common source From b6c06d1a8d9b359e7319312a2a7654f0e7c6690c Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Fri, 3 Jan 2025 15:18:35 +1100 Subject: [PATCH 324/567] [ORC] Fix bug in source file name finding in DebuggerSupportPlugin. The debug section map was using MachO section names (with the "__" prefix), but DWARFContext expects section names with the object format prefix stripped off. This was preventing DWARFContext from accessing the debug_str section, resulting in bogus source name strings. --- .../Orc/Debugging/DebuggerSupportPlugin.cpp | 26 +- .../x86-64/MachO-check-dwarf-filename.s | 315 ++++++++++++++++++ 2 files changed, 333 insertions(+), 8 deletions(-) create mode 100644 llvm/test/ExecutionEngine/JITLink/x86-64/MachO-check-dwarf-filename.s diff --git a/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp b/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp index c08e52e943c92..0d9a912e25606 100644 --- a/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp @@ -148,7 +148,7 @@ class MachODebugObjectSynthesizer : public MachODebugObjectSynthesizerBase { DSec.BuilderSec->align = Log2_64(SR.getFirstBlock()->getAlignment()); StringRef SectionData(SR.getFirstBlock()->getContent().data(), SR.getFirstBlock()->getSize()); - DebugSectionMap[SecName] = + DebugSectionMap[SecName.drop_front(2)] = // drop "__" prefix. 
MemoryBuffer::getMemBuffer(SectionData, G.getName(), false); if (SecName == "__debug_line") DebugLineSectionData = SectionData; @@ -167,11 +167,10 @@ class MachODebugObjectSynthesizer : public MachODebugObjectSynthesizerBase { DebugLineSectionData, G.getEndianness() == llvm::endianness::little, G.getPointerSize()); uint64_t Offset = 0; - DWARFDebugLine::LineTable LineTable; + DWARFDebugLine::Prologue P; // Try to parse line data. Consume error on failure. - if (auto Err = LineTable.parse(DebugLineData, &Offset, *DWARFCtx, nullptr, - consumeError)) { + if (auto Err = P.parse(DebugLineData, &Offset, consumeError, *DWARFCtx)) { handleAllErrors(std::move(Err), [&](ErrorInfoBase &EIB) { LLVM_DEBUG({ dbgs() << "Cannot parse line table for \"" << G.getName() << "\": "; @@ -180,15 +179,26 @@ class MachODebugObjectSynthesizer : public MachODebugObjectSynthesizerBase { }); }); } else { - if (!LineTable.Prologue.FileNames.empty()) - FileName = *dwarf::toString(LineTable.Prologue.FileNames[0].Name); + for (auto &FN : P.FileNames) + if ((FileName = dwarf::toString(FN.Name))) { + LLVM_DEBUG({ + dbgs() << "Using FileName = \"" << *FileName + << "\" from DWARF line table\n"; + }); + break; + } } } // If no line table (or unable to use) then use graph name. // FIXME: There are probably other debug sections we should look in first. - if (!FileName) - FileName = StringRef(G.getName()); + if (!FileName) { + LLVM_DEBUG({ + dbgs() << "Could not find source name from DWARF line table. 
" + "Using FileName = \"\"\n"; + }); + FileName = ""; + } Builder.addSymbol("", MachO::N_SO, 0, 0, 0); Builder.addSymbol(*FileName, MachO::N_SO, 0, 0, 0); diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO-check-dwarf-filename.s b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO-check-dwarf-filename.s new file mode 100644 index 0000000000000..058ef55fd1e3c --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO-check-dwarf-filename.s @@ -0,0 +1,315 @@ +# RUN: llvm-mc -triple=x86_64-apple-macosx10.9 -filetype=obj -o %t.o %s +# RUN: llvm-jitlink -debug-only=orc -noexec -debugger-support %t.o 2>&1 | \ +# RUN: FileCheck %s +# +# Test that source file names can be indentified from DWARF line tables. + +# CHECK: Using FileName = "check-dwarf-filename.c" from DWARF line table + + .section __TEXT,__text,regular,pure_instructions + .build_version macos, 15, 0 sdk_version 15, 0 + .globl _main ## -- Begin function main + .p2align 4, 0x90 +_main: ## @main +Lfunc_begin0: + .file 0 "/Users/lhames/Projects/scratch" "check-dwarf-filename.c" md5 0x331a6c7ae0cfcd2896eca60ac6f5703e + .loc 0 1 0 ## check-dwarf-filename.c:1:0 + .cfi_startproc +## %bb.0: + ##DEBUG_VALUE: main:argc <- $edi + ##DEBUG_VALUE: main:argv <- $rsi + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + movq %rsp, %rbp + .cfi_def_cfa_register %rbp +Ltmp0: + .loc 0 2 3 prologue_end ## check-dwarf-filename.c:2:3 + xorl %eax, %eax + .loc 0 2 3 epilogue_begin is_stmt 0 ## check-dwarf-filename.c:2:3 + popq %rbp + retq +Ltmp1: +Lfunc_end0: + .cfi_endproc + ## -- End function + .section __DWARF,__debug_abbrev,regular,debug +Lsection_abbrev: + .byte 1 ## Abbreviation Code + .byte 17 ## DW_TAG_compile_unit + .byte 1 ## DW_CHILDREN_yes + .byte 37 ## DW_AT_producer + .byte 37 ## DW_FORM_strx1 + .byte 19 ## DW_AT_language + .byte 5 ## DW_FORM_data2 + .byte 3 ## DW_AT_name + .byte 37 ## DW_FORM_strx1 + .ascii "\202|" ## DW_AT_LLVM_sysroot + .byte 37 ## DW_FORM_strx1 + .ascii "\357\177" ## 
DW_AT_APPLE_sdk + .byte 37 ## DW_FORM_strx1 + .byte 114 ## DW_AT_str_offsets_base + .byte 23 ## DW_FORM_sec_offset + .byte 16 ## DW_AT_stmt_list + .byte 23 ## DW_FORM_sec_offset + .byte 27 ## DW_AT_comp_dir + .byte 37 ## DW_FORM_strx1 + .ascii "\341\177" ## DW_AT_APPLE_optimized + .byte 25 ## DW_FORM_flag_present + .byte 17 ## DW_AT_low_pc + .byte 27 ## DW_FORM_addrx + .byte 18 ## DW_AT_high_pc + .byte 6 ## DW_FORM_data4 + .byte 115 ## DW_AT_addr_base + .byte 23 ## DW_FORM_sec_offset + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 2 ## Abbreviation Code + .byte 46 ## DW_TAG_subprogram + .byte 1 ## DW_CHILDREN_yes + .byte 17 ## DW_AT_low_pc + .byte 27 ## DW_FORM_addrx + .byte 18 ## DW_AT_high_pc + .byte 6 ## DW_FORM_data4 + .byte 64 ## DW_AT_frame_base + .byte 24 ## DW_FORM_exprloc + .byte 122 ## DW_AT_call_all_calls + .byte 25 ## DW_FORM_flag_present + .byte 3 ## DW_AT_name + .byte 37 ## DW_FORM_strx1 + .byte 58 ## DW_AT_decl_file + .byte 11 ## DW_FORM_data1 + .byte 59 ## DW_AT_decl_line + .byte 11 ## DW_FORM_data1 + .byte 39 ## DW_AT_prototyped + .byte 25 ## DW_FORM_flag_present + .byte 73 ## DW_AT_type + .byte 19 ## DW_FORM_ref4 + .byte 63 ## DW_AT_external + .byte 25 ## DW_FORM_flag_present + .ascii "\341\177" ## DW_AT_APPLE_optimized + .byte 25 ## DW_FORM_flag_present + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 3 ## Abbreviation Code + .byte 5 ## DW_TAG_formal_parameter + .byte 0 ## DW_CHILDREN_no + .byte 2 ## DW_AT_location + .byte 24 ## DW_FORM_exprloc + .byte 3 ## DW_AT_name + .byte 37 ## DW_FORM_strx1 + .byte 58 ## DW_AT_decl_file + .byte 11 ## DW_FORM_data1 + .byte 59 ## DW_AT_decl_line + .byte 11 ## DW_FORM_data1 + .byte 73 ## DW_AT_type + .byte 19 ## DW_FORM_ref4 + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 4 ## Abbreviation Code + .byte 36 ## DW_TAG_base_type + .byte 0 ## DW_CHILDREN_no + .byte 3 ## DW_AT_name + .byte 37 ## DW_FORM_strx1 + .byte 62 ## DW_AT_encoding + .byte 11 ## DW_FORM_data1 + .byte 11 ## DW_AT_byte_size + .byte 11 ## 
DW_FORM_data1 + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 5 ## Abbreviation Code + .byte 15 ## DW_TAG_pointer_type + .byte 0 ## DW_CHILDREN_no + .byte 73 ## DW_AT_type + .byte 19 ## DW_FORM_ref4 + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 0 ## EOM(3) + .section __DWARF,__debug_info,regular,debug +Lsection_info: +Lcu_begin0: +.set Lset0, Ldebug_info_end0-Ldebug_info_start0 ## Length of Unit + .long Lset0 +Ldebug_info_start0: + .short 5 ## DWARF version number + .byte 1 ## DWARF Unit Type + .byte 8 ## Address Size (in bytes) +.set Lset1, Lsection_abbrev-Lsection_abbrev ## Offset Into Abbrev. Section + .long Lset1 + .byte 1 ## Abbrev [1] 0xc:0x50 DW_TAG_compile_unit + .byte 0 ## DW_AT_producer + .short 29 ## DW_AT_language + .byte 1 ## DW_AT_name + .byte 2 ## DW_AT_LLVM_sysroot + .byte 3 ## DW_AT_APPLE_sdk +.set Lset2, Lstr_offsets_base0-Lsection_str_off ## DW_AT_str_offsets_base + .long Lset2 +.set Lset3, Lline_table_start0-Lsection_line ## DW_AT_stmt_list + .long Lset3 + .byte 4 ## DW_AT_comp_dir + ## DW_AT_APPLE_optimized + .byte 0 ## DW_AT_low_pc +.set Lset4, Lfunc_end0-Lfunc_begin0 ## DW_AT_high_pc + .long Lset4 +.set Lset5, Laddr_table_base0-Lsection_info0 ## DW_AT_addr_base + .long Lset5 + .byte 2 ## Abbrev [2] 0x25:0x24 DW_TAG_subprogram + .byte 0 ## DW_AT_low_pc +.set Lset6, Lfunc_end0-Lfunc_begin0 ## DW_AT_high_pc + .long Lset6 + .byte 1 ## DW_AT_frame_base + .byte 86 + ## DW_AT_call_all_calls + .byte 5 ## DW_AT_name + .byte 0 ## DW_AT_decl_file + .byte 1 ## DW_AT_decl_line + ## DW_AT_prototyped + .long 73 ## DW_AT_type + ## DW_AT_external + ## DW_AT_APPLE_optimized + .byte 3 ## Abbrev [3] 0x34:0xa DW_TAG_formal_parameter + .byte 1 ## DW_AT_location + .byte 85 + .byte 7 ## DW_AT_name + .byte 0 ## DW_AT_decl_file + .byte 1 ## DW_AT_decl_line + .long 73 ## DW_AT_type + .byte 3 ## Abbrev [3] 0x3e:0xa DW_TAG_formal_parameter + .byte 1 ## DW_AT_location + .byte 84 + .byte 8 ## DW_AT_name + .byte 0 ## DW_AT_decl_file + .byte 1 ## DW_AT_decl_line + .long 77 
## DW_AT_type + .byte 0 ## End Of Children Mark + .byte 4 ## Abbrev [4] 0x49:0x4 DW_TAG_base_type + .byte 6 ## DW_AT_name + .byte 5 ## DW_AT_encoding + .byte 4 ## DW_AT_byte_size + .byte 5 ## Abbrev [5] 0x4d:0x5 DW_TAG_pointer_type + .long 82 ## DW_AT_type + .byte 5 ## Abbrev [5] 0x52:0x5 DW_TAG_pointer_type + .long 87 ## DW_AT_type + .byte 4 ## Abbrev [4] 0x57:0x4 DW_TAG_base_type + .byte 9 ## DW_AT_name + .byte 6 ## DW_AT_encoding + .byte 1 ## DW_AT_byte_size + .byte 0 ## End Of Children Mark +Ldebug_info_end0: + .section __DWARF,__debug_str_offs,regular,debug +Lsection_str_off: + .long 44 ## Length of String Offsets Set + .short 5 + .short 0 +Lstr_offsets_base0: + .section __DWARF,__debug_str,regular,debug +Linfo_string: + .asciz "Apple clang version 16.0.0 (clang-1600.0.26.3)" ## string offset=0 + .asciz "check-dwarf-filename.c" ## string offset=47 + .asciz "/Library/Developer/CommandLineTools/SDKs/MacOSX15.0.sdk" ## string offset=70 + .asciz "MacOSX15.0.sdk" ## string offset=126 + .asciz "/Users/lhames/Projects/scratch" ## string offset=141 + .asciz "main" ## string offset=172 + .asciz "int" ## string offset=177 + .asciz "argc" ## string offset=181 + .asciz "argv" ## string offset=186 + .asciz "char" ## string offset=191 + .section __DWARF,__debug_str_offs,regular,debug + .long 0 + .long 47 + .long 70 + .long 126 + .long 141 + .long 172 + .long 177 + .long 181 + .long 186 + .long 191 + .section __DWARF,__debug_addr,regular,debug +Lsection_info0: +.set Lset7, Ldebug_addr_end0-Ldebug_addr_start0 ## Length of contribution + .long Lset7 +Ldebug_addr_start0: + .short 5 ## DWARF version number + .byte 8 ## Address size + .byte 0 ## Segment selector size +Laddr_table_base0: + .quad Lfunc_begin0 +Ldebug_addr_end0: + .section __DWARF,__debug_names,regular,debug +Ldebug_names_begin: +.set Lset8, Lnames_end0-Lnames_start0 ## Header: unit length + .long Lset8 +Lnames_start0: + .short 5 ## Header: version + .short 0 ## Header: padding + .long 1 ## Header: compilation unit 
count + .long 0 ## Header: local type unit count + .long 0 ## Header: foreign type unit count + .long 3 ## Header: bucket count + .long 3 ## Header: name count +.set Lset9, Lnames_abbrev_end0-Lnames_abbrev_start0 ## Header: abbreviation table size + .long Lset9 + .long 8 ## Header: augmentation string size + .ascii "LLVM0700" ## Header: augmentation string +.set Lset10, Lcu_begin0-Lsection_info ## Compilation unit 0 + .long Lset10 + .long 0 ## Bucket 0 + .long 1 ## Bucket 1 + .long 2 ## Bucket 2 + .long 2090499946 ## Hash in Bucket 1 + .long 193495088 ## Hash in Bucket 2 + .long 2090147939 ## Hash in Bucket 2 + .long 172 ## String in Bucket 1: main + .long 177 ## String in Bucket 2: int + .long 191 ## String in Bucket 2: char +.set Lset11, Lnames0-Lnames_entries0 ## Offset in Bucket 1 + .long Lset11 +.set Lset12, Lnames1-Lnames_entries0 ## Offset in Bucket 2 + .long Lset12 +.set Lset13, Lnames2-Lnames_entries0 ## Offset in Bucket 2 + .long Lset13 +Lnames_abbrev_start0: + .ascii "\230." ## Abbrev code + .byte 46 ## DW_TAG_subprogram + .byte 3 ## DW_IDX_die_offset + .byte 19 ## DW_FORM_ref4 + .byte 4 ## DW_IDX_parent + .byte 25 ## DW_FORM_flag_present + .byte 0 ## End of abbrev + .byte 0 ## End of abbrev + .ascii "\230$" ## Abbrev code + .byte 36 ## DW_TAG_base_type + .byte 3 ## DW_IDX_die_offset + .byte 19 ## DW_FORM_ref4 + .byte 4 ## DW_IDX_parent + .byte 25 ## DW_FORM_flag_present + .byte 0 ## End of abbrev + .byte 0 ## End of abbrev + .byte 0 ## End of abbrev list +Lnames_abbrev_end0: +Lnames_entries0: +Lnames0: +L1: + .ascii "\230." 
## Abbreviation code + .long 37 ## DW_IDX_die_offset + .byte 0 ## DW_IDX_parent + ## End of list: main +Lnames1: +L0: + .ascii "\230$" ## Abbreviation code + .long 73 ## DW_IDX_die_offset + .byte 0 ## DW_IDX_parent + ## End of list: int +Lnames2: +L2: + .ascii "\230$" ## Abbreviation code + .long 87 ## DW_IDX_die_offset + .byte 0 ## DW_IDX_parent + ## End of list: char + .p2align 2, 0x0 +Lnames_end0: +.subsections_via_symbols + .section __DWARF,__debug_line,regular,debug +Lsection_line: +Lline_table_start0: From 1c997feff16860ab6b21c5c03dc7ca65f300967f Mon Sep 17 00:00:00 2001 From: dmasloff <74042473+dmasloff@users.noreply.github.com> Date: Fri, 3 Jan 2025 08:52:01 +0300 Subject: [PATCH 325/567] [clang-format] Add option WrapNamespaceBodyWithNewlines (#106145) It wraps the body of namespace with additional newlines, turning this code: ``` namespace N { int function(); } ``` into the following: ``` namespace N { int function(); } ``` --------- Co-authored-by: Owen Pan --- clang/docs/ClangFormatStyleOptions.rst | 39 ++++++ clang/docs/ReleaseNotes.rst | 1 + clang/include/clang/Format/Format.h | 36 +++++- clang/lib/Format/Format.cpp | 15 +++ clang/lib/Format/UnwrappedLineFormatter.cpp | 17 +++ clang/unittests/Format/ConfigParseTest.cpp | 7 ++ clang/unittests/Format/FormatTest.cpp | 130 ++++++++++++++++++++ 7 files changed, 244 insertions(+), 1 deletion(-) diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index d9b3f666df03c..7bfaee4e2d35b 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ -6843,6 +6843,45 @@ the configuration (without a prefix: ``Auto``). For example: BOOST_PP_STRINGIZE +.. _WrapNamespaceBodyWithEmptyLines: + +**WrapNamespaceBodyWithEmptyLines** (``WrapNamespaceBodyWithEmptyLinesStyle``) :versionbadge:`clang-format 20` :ref:`¶ ` + Wrap namespace body with empty lines. 
+ + Possible values: + + * ``WNBWELS_Never`` (in configuration: ``Never``) + Remove all empty lines at the beginning and the end of namespace body. + + .. code-block:: c++ + + namespace N1 { + namespace N2 + function(); + } + } + + * ``WNBWELS_Always`` (in configuration: ``Always``) + Always have at least one empty line at the beginning and the end of + namespace body except that the number of empty lines between consecutive + nested namespace definitions is not increased. + + .. code-block:: c++ + + namespace N1 { + namespace N2 { + + function(); + + } + } + + * ``WNBWELS_Leave`` (in configuration: ``Leave``) + Keep existing newlines at the beginning and the end of namespace body. + ``MaxEmptyLinesToKeep`` still applies. + + + .. END_FORMAT_STYLE_OPTIONS Adding additional style options diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index aca07e2ba9cf2..2789a24ebf273 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -1127,6 +1127,7 @@ clang-format - Adds ``AllowShortNamespacesOnASingleLine`` option. - Adds ``VariableTemplates`` option. - Adds support for bash globstar in ``.clang-format-ignore``. +- Adds ``WrapNamespaceBodyWithEmptyLines`` option. libclang -------- diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index bb34f2d33ac15..9b7a633e0a146 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -5143,6 +5143,39 @@ struct FormatStyle { /// \version 11 std::vector WhitespaceSensitiveMacros; + /// Different styles for wrapping namespace body with empty lines. + enum WrapNamespaceBodyWithEmptyLinesStyle : int8_t { + /// Remove all empty lines at the beginning and the end of namespace body. 
+ /// \code + /// namespace N1 { + /// namespace N2 + /// function(); + /// } + /// } + /// \endcode + WNBWELS_Never, + /// Always have at least one empty line at the beginning and the end of + /// namespace body except that the number of empty lines between consecutive + /// nested namespace definitions is not increased. + /// \code + /// namespace N1 { + /// namespace N2 { + /// + /// function(); + /// + /// } + /// } + /// \endcode + WNBWELS_Always, + /// Keep existing newlines at the beginning and the end of namespace body. + /// ``MaxEmptyLinesToKeep`` still applies. + WNBWELS_Leave + }; + + /// Wrap namespace body with empty lines. + /// \version 20 + WrapNamespaceBodyWithEmptyLinesStyle WrapNamespaceBodyWithEmptyLines; + bool operator==(const FormatStyle &R) const { return AccessModifierOffset == R.AccessModifierOffset && AlignAfterOpenBracket == R.AlignAfterOpenBracket && @@ -5326,7 +5359,8 @@ struct FormatStyle { UseTab == R.UseTab && VariableTemplates == R.VariableTemplates && VerilogBreakBetweenInstancePorts == R.VerilogBreakBetweenInstancePorts && - WhitespaceSensitiveMacros == R.WhitespaceSensitiveMacros; + WhitespaceSensitiveMacros == R.WhitespaceSensitiveMacros && + WrapNamespaceBodyWithEmptyLines == R.WrapNamespaceBodyWithEmptyLines; } std::optional GetLanguageStyle(LanguageKind Language) const; diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index a5657f2d910f6..e51d7ac2e5b6c 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -839,6 +839,18 @@ template <> struct ScalarEnumerationTraits { } }; +template <> +struct ScalarEnumerationTraits< + FormatStyle::WrapNamespaceBodyWithEmptyLinesStyle> { + static void + enumeration(IO &IO, + FormatStyle::WrapNamespaceBodyWithEmptyLinesStyle &Value) { + IO.enumCase(Value, "Never", FormatStyle::WNBWELS_Never); + IO.enumCase(Value, "Always", FormatStyle::WNBWELS_Always); + IO.enumCase(Value, "Leave", FormatStyle::WNBWELS_Leave); + } +}; + template <> struct 
MappingTraits { static void mapping(IO &IO, FormatStyle &Style) { // When reading, read the language first, we need it for getPredefinedStyle. @@ -1171,6 +1183,8 @@ template <> struct MappingTraits { Style.VerilogBreakBetweenInstancePorts); IO.mapOptional("WhitespaceSensitiveMacros", Style.WhitespaceSensitiveMacros); + IO.mapOptional("WrapNamespaceBodyWithEmptyLines", + Style.WrapNamespaceBodyWithEmptyLines); // If AlwaysBreakAfterDefinitionReturnType was specified but // BreakAfterReturnType was not, initialize the latter from the former for @@ -1639,6 +1653,7 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) { LLVMStyle.WhitespaceSensitiveMacros.push_back("NS_SWIFT_NAME"); LLVMStyle.WhitespaceSensitiveMacros.push_back("PP_STRINGIZE"); LLVMStyle.WhitespaceSensitiveMacros.push_back("STRINGIZE"); + LLVMStyle.WrapNamespaceBodyWithEmptyLines = FormatStyle::WNBWELS_Leave; LLVMStyle.PenaltyBreakAssignment = prec::Assignment; LLVMStyle.PenaltyBreakBeforeFirstCallParameter = 19; diff --git a/clang/lib/Format/UnwrappedLineFormatter.cpp b/clang/lib/Format/UnwrappedLineFormatter.cpp index 803c600cec44d..bc6766a47f5c7 100644 --- a/clang/lib/Format/UnwrappedLineFormatter.cpp +++ b/clang/lib/Format/UnwrappedLineFormatter.cpp @@ -1584,6 +1584,23 @@ static auto computeNewlines(const AnnotatedLine &Line, Newlines = 1; } + if (Style.WrapNamespaceBodyWithEmptyLines != FormatStyle::WNBWELS_Leave) { + // Modify empty lines after TT_NamespaceLBrace. + if (PreviousLine && PreviousLine->endsWith(TT_NamespaceLBrace)) { + if (Style.WrapNamespaceBodyWithEmptyLines == FormatStyle::WNBWELS_Never) + Newlines = 1; + else if (!Line.startsWithNamespace()) + Newlines = std::max(Newlines, 2u); + } + // Modify empty lines before TT_NamespaceRBrace. 
+ if (Line.startsWith(TT_NamespaceRBrace)) { + if (Style.WrapNamespaceBodyWithEmptyLines == FormatStyle::WNBWELS_Never) + Newlines = 1; + else if (!PreviousLine->startsWith(TT_NamespaceRBrace)) + Newlines = std::max(Newlines, 2u); + } + } + // Insert or remove empty line before access specifiers. if (PreviousLine && RootToken.isAccessSpecifier()) { switch (Style.EmptyLineBeforeAccessModifier) { diff --git a/clang/unittests/Format/ConfigParseTest.cpp b/clang/unittests/Format/ConfigParseTest.cpp index b249bf073aa45..9c38dbbc51f0a 100644 --- a/clang/unittests/Format/ConfigParseTest.cpp +++ b/clang/unittests/Format/ConfigParseTest.cpp @@ -865,6 +865,13 @@ TEST(ConfigParseTest, ParsesConfiguration) { CHECK_PARSE("SortUsingDeclarations: true", SortUsingDeclarations, FormatStyle::SUD_LexicographicNumeric); + CHECK_PARSE("WrapNamespaceBodyWithEmptyLines: Never", + WrapNamespaceBodyWithEmptyLines, FormatStyle::WNBWELS_Never); + CHECK_PARSE("WrapNamespaceBodyWithEmptyLines: Always", + WrapNamespaceBodyWithEmptyLines, FormatStyle::WNBWELS_Always); + CHECK_PARSE("WrapNamespaceBodyWithEmptyLines: Leave", + WrapNamespaceBodyWithEmptyLines, FormatStyle::WNBWELS_Leave); + // FIXME: This is required because parsing a configuration simply overwrites // the first N elements of the list instead of resetting it. Style.ForEachMacros.clear(); diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 22b6f7e1b62e2..44b9dba249890 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -28427,6 +28427,136 @@ TEST_F(FormatTest, ShortNamespacesOption) { Style); } +TEST_F(FormatTest, WrapNamespaceBodyWithEmptyLinesNever) { + auto Style = getLLVMStyle(); + Style.FixNamespaceComments = false; + Style.MaxEmptyLinesToKeep = 2; + Style.WrapNamespaceBodyWithEmptyLines = FormatStyle::WNBWELS_Never; + + // Empty namespace. + verifyFormat("namespace N {}", Style); + + // Single namespace. 
+ verifyFormat("namespace N {\n" + "int f1(int a) { return 2 * a; }\n" + "}", + "namespace N {\n" + "\n" + "\n" + "int f1(int a) { return 2 * a; }\n" + "\n" + "\n" + "}", + Style); + + // Nested namespace. + verifyFormat("namespace N1 {\n" + "namespace N2 {\n" + "int a = 1;\n" + "}\n" + "}", + "namespace N1 {\n" + "\n" + "\n" + "namespace N2 {\n" + "\n" + "int a = 1;\n" + "\n" + "}\n" + "\n" + "\n" + "}", + Style); + + Style.CompactNamespaces = true; + + verifyFormat("namespace N1 { namespace N2 {\n" + "int a = 1;\n" + "}}", + "namespace N1 { namespace N2 {\n" + "\n" + "\n" + "int a = 1;\n" + "\n" + "\n" + "}}", + Style); +} + +TEST_F(FormatTest, WrapNamespaceBodyWithEmptyLinesAlways) { + auto Style = getLLVMStyle(); + Style.FixNamespaceComments = false; + Style.MaxEmptyLinesToKeep = 2; + Style.WrapNamespaceBodyWithEmptyLines = FormatStyle::WNBWELS_Always; + + // Empty namespace. + verifyFormat("namespace N {}", Style); + + // Single namespace. + verifyFormat("namespace N {\n" + "\n" + "int f1(int a) { return 2 * a; }\n" + "\n" + "}", + "namespace N {\n" + "int f1(int a) { return 2 * a; }\n" + "}", + Style); + + // Nested namespace. 
+ verifyFormat("namespace N1 {\n" + "namespace N2 {\n" + "\n" + "int a = 1;\n" + "\n" + "}\n" + "}", + "namespace N1 {\n" + "namespace N2 {\n" + "int a = 1;\n" + "}\n" + "}", + Style); + + verifyFormat("namespace N1 {\n" + "\n" + "namespace N2 {\n" + "\n" + "\n" + "int a = 1;\n" + "\n" + "\n" + "}\n" + "\n" + "}", + "namespace N1 {\n" + "\n" + "namespace N2 {\n" + "\n" + "\n" + "\n" + "int a = 1;\n" + "\n" + "\n" + "\n" + "}\n" + "\n" + "}", + Style); + + Style.CompactNamespaces = true; + + verifyFormat("namespace N1 { namespace N2 {\n" + "\n" + "int a = 1;\n" + "\n" + "}}", + "namespace N1 { namespace N2 {\n" + "int a = 1;\n" + "}}", + Style); +} + } // namespace } // namespace test } // namespace format From 72db3f989e499c8c5d585d3624cd563600cd2396 Mon Sep 17 00:00:00 2001 From: Pengcheng Wang Date: Fri, 3 Jan 2025 14:05:02 +0800 Subject: [PATCH 326/567] [RISCV] Allow tail memcmp expansion (#121460) This optimization was introduced by #70469. Like AArch64, we allow tail expansions for 3 on RV32 and 3/5/6 on RV64. This can simplify the comparison and reduce the number of blocks. 
--- .../Target/RISCV/RISCVTargetTransformInfo.cpp | 7 +- llvm/test/CodeGen/RISCV/memcmp-optsize.ll | 234 ++++++++---------- llvm/test/CodeGen/RISCV/memcmp.ll | 234 ++++++++---------- 3 files changed, 201 insertions(+), 274 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 909a64e974255..850d6244affa5 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -2567,9 +2567,12 @@ RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { Options.AllowOverlappingLoads = true; Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); Options.NumLoadsPerBlock = Options.MaxNumLoads; - if (ST->is64Bit()) + if (ST->is64Bit()) { Options.LoadSizes = {8, 4, 2, 1}; - else + Options.AllowedTailExpansions = {3, 5, 6}; + } else { Options.LoadSizes = {4, 2, 1}; + Options.AllowedTailExpansions = {3}; + } return Options; } diff --git a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll index d529ae6ecd0ab..b9a27b9d0c9e7 100644 --- a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll +++ b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll @@ -2449,82 +2449,72 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind optsize { ; ; CHECK-UNALIGNED-RV32-ZBB-LABEL: memcmp_size_3: ; CHECK-UNALIGNED-RV32-ZBB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV32-ZBB-NEXT: lh a2, 0(a0) -; CHECK-UNALIGNED-RV32-ZBB-NEXT: lh a3, 0(a1) -; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: srli a2, a2, 16 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: srli a3, a3, 16 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB24_2 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.1: # %loadbb1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: lbu a0, 2(a0) -; CHECK-UNALIGNED-RV32-ZBB-NEXT: lbu a1, 2(a1) -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sub a0, a0, a1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret -; 
CHECK-UNALIGNED-RV32-ZBB-NEXT: .LBB24_2: # %res_block -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a2, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: neg a0, a0 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: ori a0, a0, 1 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lbu a2, 2(a0) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lhu a0, 0(a0) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lbu a3, 2(a1) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lhu a1, 0(a1) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: slli a2, a2, 16 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: or a0, a0, a2 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: slli a3, a3, 16 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: or a1, a1, a3 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a2, a1, a0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a0, a1 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sub a0, a2, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_3: ; CHECK-UNALIGNED-RV64-ZBB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lh a2, 0(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lh a3, 0(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 48 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a3, a3, 48 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB24_2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.1: # %loadbb1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a0, 2(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a1, 2(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB24_2: # %res_block -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a2, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: neg a0, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: ori a0, a0, 1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a2, 2(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lhu a0, 0(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a3, 2(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lhu a1, 0(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a2, a2, 16 +; 
CHECK-UNALIGNED-RV64-ZBB-NEXT: or a0, a0, a2 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a3, a3, 16 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a1, a1, a3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a0, a0, 32 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a1, a1, 32 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a1, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a0, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a2, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_3: ; CHECK-UNALIGNED-RV32-ZBKB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lh a2, 0(a0) -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lh a3, 0(a1) -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: srli a2, a2, 16 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: srli a3, a3, 16 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB24_2 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.1: # %loadbb1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lhu a2, 0(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lbu a0, 2(a0) +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lhu a3, 0(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lbu a1, 2(a1) -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sub a0, a0, a1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: .LBB24_2: # %res_block -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a2, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: neg a0, a0 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ori a0, a0, 1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: pack a0, a2, a0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: pack a1, a3, a1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a2, a1, a0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a0, a1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sub a0, a2, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_3: ; CHECK-UNALIGNED-RV64-ZBKB: # %bb.0: # %entry -; 
CHECK-UNALIGNED-RV64-ZBKB-NEXT: lh a2, 0(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lh a3, 0(a1) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 48 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a3, a3, 48 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB24_2 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.1: # %loadbb1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a0, 2(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a1, 2(a1) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB24_2: # %res_block -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a2, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: neg a0, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ori a0, a0, 1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a2, 2(a0) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a0, 0(a0) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a3, 2(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a1, 0(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: slli a2, a2, 16 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: or a0, a0, a2 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: slli a3, a3, 16 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: or a1, a1, a3 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a0, a0, 32 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a1, a1, 32 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a1, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a0, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a2, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_3: @@ -2845,22 +2835,19 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind optsize { ; ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_5: ; CHECK-UNALIGNED-RV64-ZBB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a2, 0(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a3, 0(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; 
CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a3, a3, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB26_2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.1: # %loadbb1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a0, 4(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a1, 4(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB26_2: # %res_block -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a2, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: neg a0, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: ori a0, a0, 1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a2, 4(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lwu a0, 0(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a3, 4(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lwu a1, 0(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a2, a2, 32 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a0, a0, a2 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a3, a3, 32 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a1, a1, a3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a1, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a0, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a2, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_5: @@ -2883,22 +2870,17 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind optsize { ; ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_5: ; CHECK-UNALIGNED-RV64-ZBKB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a2, 0(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a3, 0(a1) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 32 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a3, a3, 32 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB26_2 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.1: # %loadbb1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a2, 0(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a0, 4(a0) +; 
CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a1, 4(a1) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB26_2: # %res_block -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a2, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: neg a0, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ori a0, a0, 1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a1, a3, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a1, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a0, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a2, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_5: @@ -3052,28 +3034,19 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind optsize { ; ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_6: ; CHECK-UNALIGNED-RV64-ZBB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a2, 0(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a3, 0(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a3, a3, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB27_3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.1: # %loadbb1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lh a0, 4(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lh a1, 4(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 48 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a3, a3, 48 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB27_3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.2: -; CHECK-UNALIGNED-RV64-ZBB-NEXT: li a0, 0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB27_3: # %res_block -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a2, a3 -; 
CHECK-UNALIGNED-RV64-ZBB-NEXT: neg a0, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: ori a0, a0, 1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lhu a2, 4(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lwu a0, 0(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lhu a3, 4(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lwu a1, 0(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a2, a2, 32 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a0, a0, a2 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a3, a3, 32 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a1, a1, a3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a1, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a0, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a2, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_6: @@ -3102,28 +3075,17 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind optsize { ; ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_6: ; CHECK-UNALIGNED-RV64-ZBKB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a2, 0(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a3, 0(a1) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 32 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a3, a3, 32 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB27_3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.1: # %loadbb1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lh a0, 4(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lh a1, 4(a1) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 48 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a3, a3, 48 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB27_3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.2: -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: li a0, 0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB27_3: # %res_block -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a2, a3 -; 
CHECK-UNALIGNED-RV64-ZBKB-NEXT: neg a0, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ori a0, a0, 1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a2, 0(a0) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a0, 4(a0) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a3, 0(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a1, 4(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a1, a3, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a1, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a0, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a2, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_6: diff --git a/llvm/test/CodeGen/RISCV/memcmp.ll b/llvm/test/CodeGen/RISCV/memcmp.ll index 860c3a94abc0a..629a9298ee469 100644 --- a/llvm/test/CodeGen/RISCV/memcmp.ll +++ b/llvm/test/CodeGen/RISCV/memcmp.ll @@ -3145,82 +3145,72 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind { ; ; CHECK-UNALIGNED-RV32-ZBB-LABEL: memcmp_size_3: ; CHECK-UNALIGNED-RV32-ZBB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV32-ZBB-NEXT: lh a2, 0(a0) -; CHECK-UNALIGNED-RV32-ZBB-NEXT: lh a3, 0(a1) -; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: srli a2, a2, 16 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: srli a3, a3, 16 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB24_2 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.1: # %loadbb1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: lbu a0, 2(a0) -; CHECK-UNALIGNED-RV32-ZBB-NEXT: lbu a1, 2(a1) -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sub a0, a0, a1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV32-ZBB-NEXT: .LBB24_2: # %res_block -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a2, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: neg a0, a0 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: ori a0, a0, 1 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lbu a2, 2(a0) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lhu a0, 0(a0) +; 
CHECK-UNALIGNED-RV32-ZBB-NEXT: lbu a3, 2(a1) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lhu a1, 0(a1) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: slli a2, a2, 16 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: or a0, a0, a2 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: slli a3, a3, 16 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: or a1, a1, a3 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a2, a1, a0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a0, a1 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sub a0, a2, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_3: ; CHECK-UNALIGNED-RV64-ZBB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lh a2, 0(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lh a3, 0(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 48 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a3, a3, 48 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB24_2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.1: # %loadbb1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a0, 2(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a1, 2(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB24_2: # %res_block -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a2, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: neg a0, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: ori a0, a0, 1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a2, 2(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lhu a0, 0(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a3, 2(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lhu a1, 0(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a2, a2, 16 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a0, a0, a2 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a3, a3, 16 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a1, a1, a3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a0, a0, 32 +; 
CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a1, a1, 32 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a1, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a0, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a2, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_3: ; CHECK-UNALIGNED-RV32-ZBKB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lh a2, 0(a0) -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lh a3, 0(a1) -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: srli a2, a2, 16 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: srli a3, a3, 16 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB24_2 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.1: # %loadbb1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lhu a2, 0(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lbu a0, 2(a0) +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lhu a3, 0(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lbu a1, 2(a1) -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sub a0, a0, a1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: .LBB24_2: # %res_block -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a2, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: neg a0, a0 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ori a0, a0, 1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: pack a0, a2, a0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: pack a1, a3, a1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a2, a1, a0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a0, a1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sub a0, a2, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_3: ; CHECK-UNALIGNED-RV64-ZBKB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lh a2, 0(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lh a3, 0(a1) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 48 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a3, a3, 48 
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB24_2 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.1: # %loadbb1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a0, 2(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a1, 2(a1) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB24_2: # %res_block -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a2, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: neg a0, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ori a0, a0, 1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a2, 2(a0) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a0, 0(a0) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a3, 2(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a1, 0(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: slli a2, a2, 16 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: or a0, a0, a2 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: slli a3, a3, 16 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: or a1, a1, a3 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a0, a0, 32 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a1, a1, 32 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a1, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a0, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a2, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_3: @@ -3541,22 +3531,19 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind { ; ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_5: ; CHECK-UNALIGNED-RV64-ZBB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a2, 0(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a3, 0(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a3, a3, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB26_2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.1: # %loadbb1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a0, 4(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a1, 4(a1) 
-; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB26_2: # %res_block -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a2, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: neg a0, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: ori a0, a0, 1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a2, 4(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lwu a0, 0(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a3, 4(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lwu a1, 0(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a2, a2, 32 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a0, a0, a2 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a3, a3, 32 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a1, a1, a3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a1, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a0, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a2, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_5: @@ -3579,22 +3566,17 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind { ; ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_5: ; CHECK-UNALIGNED-RV64-ZBKB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a2, 0(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a3, 0(a1) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 32 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a3, a3, 32 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB26_2 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.1: # %loadbb1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a2, 0(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a0, 4(a0) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a1, 4(a1) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB26_2: # %res_block -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a2, a3 -; 
CHECK-UNALIGNED-RV64-ZBKB-NEXT: neg a0, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ori a0, a0, 1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a1, a3, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a1, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a0, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a2, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_5: @@ -3748,28 +3730,19 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind { ; ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_6: ; CHECK-UNALIGNED-RV64-ZBB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a2, 0(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a3, 0(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a3, a3, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB27_3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.1: # %loadbb1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lh a0, 4(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lh a1, 4(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 48 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a3, a3, 48 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB27_3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.2: -; CHECK-UNALIGNED-RV64-ZBB-NEXT: li a0, 0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB27_3: # %res_block -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a2, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: neg a0, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: ori a0, a0, 1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lhu a2, 4(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lwu a0, 0(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lhu a3, 4(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lwu a1, 0(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a2, 
a2, 32 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a0, a0, a2 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a3, a3, 32 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a1, a1, a3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a1, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a0, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a2, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_6: @@ -3798,28 +3771,17 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind { ; ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_6: ; CHECK-UNALIGNED-RV64-ZBKB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a2, 0(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a3, 0(a1) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 32 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a3, a3, 32 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB27_3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.1: # %loadbb1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lh a0, 4(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lh a1, 4(a1) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 48 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a3, a3, 48 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB27_3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.2: -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: li a0, 0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB27_3: # %res_block -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a2, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: neg a0, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ori a0, a0, 1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a2, 0(a0) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a0, 4(a0) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a3, 0(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a1, 4(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a0, a2, a0 +; 
CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a1, a3, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a1, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a0, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a2, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_6: From 93a68a5188b6aa940f51d8ce0317299409f828ae Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Fri, 3 Jan 2025 17:29:43 +1100 Subject: [PATCH 327/567] [ORC] Testcase requires asserts as it depends on debugging output. --- .../ExecutionEngine/JITLink/x86-64/MachO-check-dwarf-filename.s | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO-check-dwarf-filename.s b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO-check-dwarf-filename.s index 058ef55fd1e3c..a2eee21a0761d 100644 --- a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO-check-dwarf-filename.s +++ b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO-check-dwarf-filename.s @@ -2,6 +2,8 @@ # RUN: llvm-jitlink -debug-only=orc -noexec -debugger-support %t.o 2>&1 | \ # RUN: FileCheck %s # +# REQUIRES: asserts +# # Test that source file names can be indentified from DWARF line tables. # CHECK: Using FileName = "check-dwarf-filename.c" from DWARF line table From febe1a9d286df495ca342011b3134823eee37557 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Fri, 3 Jan 2025 15:47:43 +1100 Subject: [PATCH 328/567] [ORC] Use structured binding to improve readability. NFC. 
--- llvm/include/llvm/ExecutionEngine/Orc/MachOBuilder.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/ExecutionEngine/Orc/MachOBuilder.h b/llvm/include/llvm/ExecutionEngine/Orc/MachOBuilder.h index 6ffd286c365ac..8e29f219774b3 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/MachOBuilder.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/MachOBuilder.h @@ -460,8 +460,8 @@ template class MachOBuilder { return; StrTab.resize(Strings.size()); - for (auto &KV : Strings) - StrTab[KV.second] = {KV.first, 0}; + for (auto &[Str, Idx] : Strings) + StrTab[Idx] = {Str, 0}; size_t Offset = 0; for (auto &Elem : StrTab) { Elem.Offset = Offset; From 30b73ed7bd8934c32e4bd5430bccf52a226deabd Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Fri, 3 Jan 2025 17:15:33 +1100 Subject: [PATCH 329/567] [ORC][MachO] Avoid another race condition in MachOPlatform bootstrap. Similar to a9e75b1d4d1: During MachOPlatform bootstrap we need to defer actions until essential platform functionality has been loaded, but the platform itself may be loaded under a concurrent dispatcher so we have to guard against the deferred actions vector being accessed concurrently. This fixes a probablistic failure in the ORC runtime regression tests on Darwin/x86-64 that was spotted after edca1d9bad2 (which turned on concurrent linking by default in llvm-jitlink). 
--- llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp index 0e8349711e6fe..9f324c7048c63 100644 --- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp @@ -937,6 +937,12 @@ Error MachOPlatform::MachOPlatformPlugin::bootstrapPipelineEnd( jitlink::LinkGraph &G) { std::lock_guard Lock(MP.Bootstrap.load()->Mutex); assert(MP.Bootstrap && "DeferredAAs reset before bootstrap completed"); + + // Transfer any allocation actions to DeferredAAs. + std::move(G.allocActions().begin(), G.allocActions().end(), + std::back_inserter(MP.Bootstrap.load()->DeferredAAs)); + G.allocActions().clear(); + --MP.Bootstrap.load()->ActiveGraphs; // Notify Bootstrap->CV while holding the mutex because the mutex is // also keeping Bootstrap->CV alive. @@ -1397,10 +1403,6 @@ Error MachOPlatform::MachOPlatformPlugin::registerObjectPlatformSections( SPSExecutorAddrRange, SPSExecutorAddrRange>>, SPSSequence>>; - shared::AllocActions &allocActions = LLVM_LIKELY(!InBootstrapPhase) - ? G.allocActions() - : MP.Bootstrap.load()->DeferredAAs; - ExecutorAddr HeaderAddr; { std::lock_guard Lock(MP.PlatformMutex); @@ -1410,7 +1412,7 @@ Error MachOPlatform::MachOPlatformPlugin::registerObjectPlatformSections( assert(I->second && "Null header registered for JD"); HeaderAddr = I->second; } - allocActions.push_back( + G.allocActions().push_back( {cantFail( WrapperFunctionCall::Create( MP.RegisterObjectPlatformSections.Addr, HeaderAddr, UnwindInfo, From 82fecab85ae2d72ffac0e44749d99f12d6f71cc0 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 2 Jan 2025 23:01:28 -0800 Subject: [PATCH 330/567] [gcov] Bump default version to 11.1 The gcov version is set to 11.1 (compatible with gcov 9) even if `-Xclang -coverage-version=` specified version is less than 11.1. 
Therefore, we can drop producer support for version < 11.1. --- clang/include/clang/Basic/CodeGenOptions.h | 2 +- clang/lib/Basic/CodeGenOptions.cpp | 2 - clang/lib/Frontend/CompilerInvocation.cpp | 3 +- clang/test/CodeGen/code-coverage.c | 26 +++----- .../Inputs/instrprof-gcov-exceptions.cpp.gcov | 1 - ...rprof-gcov-multiple-bbs-single-line.c.gcov | 1 - .../instrprof-gcov-one-line-function.c.gcov | 1 - .../Inputs/instrprof-gcov-switch1.c.gcov | 1 - .../Inputs/instrprof-gcov-switch2.c.gcov | 1 - .../instrprof-shared-lib_in-loop.c.gcov | 1 - .../Inputs/instrprof-shared-main.c.gcov | 1 - .../profile/gcov-__gcov_flush-terminate.c | 1 - .../Instrumentation/GCOVProfiling.cpp | 64 +++++++------------ .../Transforms/GCOVProfiling/exit-block.ll | 2 +- llvm/test/Transforms/GCOVProfiling/version.ll | 8 +-- 15 files changed, 41 insertions(+), 74 deletions(-) diff --git a/clang/include/clang/Basic/CodeGenOptions.h b/clang/include/clang/Basic/CodeGenOptions.h index 8097c9ef772bc..c555fb3b72d64 100644 --- a/clang/include/clang/Basic/CodeGenOptions.h +++ b/clang/include/clang/Basic/CodeGenOptions.h @@ -186,7 +186,7 @@ class CodeGenOptions : public CodeGenOptionsBase { std::string ProfileExcludeFiles; /// The version string to put into coverage files. - char CoverageVersion[4]; + char CoverageVersion[4] = {'0', '0', '0', '0'}; /// Enable additional debugging information. 
std::string DebugPass; diff --git a/clang/lib/Basic/CodeGenOptions.cpp b/clang/lib/Basic/CodeGenOptions.cpp index 79d715305ef20..95e65ba9266f5 100644 --- a/clang/lib/Basic/CodeGenOptions.cpp +++ b/clang/lib/Basic/CodeGenOptions.cpp @@ -17,7 +17,6 @@ CodeGenOptions::CodeGenOptions() { #include "clang/Basic/CodeGenOptions.def" RelocationModel = llvm::Reloc::PIC_; - memcpy(CoverageVersion, "408*", 4); } void CodeGenOptions::resetNonModularOptions(StringRef ModuleFormat) { @@ -54,7 +53,6 @@ void CodeGenOptions::resetNonModularOptions(StringRef ModuleFormat) { } RelocationModel = llvm::Reloc::PIC_; - memcpy(CoverageVersion, "408*", 4); } } // end namespace clang diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 0ae6dce5dd40a..36dc45bde11ab 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -1691,7 +1691,7 @@ void CompilerInvocationBase::GenerateCodeGenArgs(const CodeGenOptions &Opts, } } - if (memcmp(Opts.CoverageVersion, "408*", 4) != 0) + if (memcmp(Opts.CoverageVersion, "0000", 4)) GenerateArg(Consumer, OPT_coverage_version_EQ, StringRef(Opts.CoverageVersion, 4)); @@ -2007,7 +2007,6 @@ bool CompilerInvocation::ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, } else if (Args.hasArg(OPT_fmemory_profile)) Opts.MemoryProfileOutput = MemProfileBasename; - memcpy(Opts.CoverageVersion, "408*", 4); if (Opts.CoverageNotesFile.size() || Opts.CoverageDataFile.size()) { if (Args.hasArg(OPT_coverage_version_EQ)) { StringRef CoverageVersion = Args.getLastArgValue(OPT_coverage_version_EQ); diff --git a/clang/test/CodeGen/code-coverage.c b/clang/test/CodeGen/code-coverage.c index 4e3364df21785..5fa62360c9b56 100644 --- a/clang/test/CodeGen/code-coverage.c +++ b/clang/test/CodeGen/code-coverage.c @@ -3,18 +3,14 @@ /// 4.7 enables cfg_checksum. /// 4.8 (default, compatible with gcov 7) emits the exit block the second. 
// RUN: rm -rf %t && mkdir %t && cd %t -// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -disable-red-zone -coverage-data-file=/dev/null -coverage-version='304*' %s -o - | \ -// RUN: FileCheck --check-prefixes=CHECK,CHECK-CTOR-INIT,304 %s -// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -disable-red-zone -coverage-data-file=/dev/null -coverage-version='407*' %s -o - | \ -// RUN: FileCheck --check-prefixes=CHECK,CHECK-CTOR-INIT,407 %s +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -disable-red-zone -coverage-data-file=/dev/null -coverage-version='B21*' %s -o - | \ +// RUN: FileCheck --check-prefixes=CHECK,CHECK-CTOR-INIT,1210 %s // RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -disable-red-zone -coverage-data-file=/dev/null %s -o - | \ -// RUN: FileCheck --check-prefixes=CHECK,CHECK-CTOR-INIT,408 %s -// RUN: %clang_cc1 -triple powerpc64-ibm-aix -emit-llvm -disable-red-zone -coverage-data-file=/dev/null -coverage-version='304*' %s -o - | \ -// RUN: FileCheck --check-prefixes=CHECK,CHECK-RT-INIT,304 %s -// RUN: %clang_cc1 -triple powerpc64-ibm-aix -emit-llvm -disable-red-zone -coverage-data-file=/dev/null -coverage-version='407*' %s -o - | \ -// RUN: FileCheck --check-prefixes=CHECK,CHECK-RT-INIT,407 %s +// RUN: FileCheck --check-prefixes=CHECK,CHECK-CTOR-INIT,1110 %s +// RUN: %clang_cc1 -triple powerpc64-ibm-aix -emit-llvm -disable-red-zone -coverage-data-file=/dev/null -coverage-version='B21*' %s -o - | \ +// RUN: FileCheck --check-prefixes=CHECK,CHECK-RT-INIT,1210 %s // RUN: %clang_cc1 -triple powerpc64-ibm-aix -emit-llvm -disable-red-zone -coverage-data-file=/dev/null %s -o - | \ -// RUN: FileCheck --check-prefixes=CHECK,CHECK-RT-INIT,408 %s +// RUN: FileCheck --check-prefixes=CHECK,CHECK-RT-INIT,1110 %s // RUN: %clang_cc1 -emit-llvm -disable-red-zone -coverage-notes-file=aaa.gcno -coverage-data-file=bbb.gcda -debug-info-kind=limited -dwarf-version=4 %s -o - | FileCheck %s --check-prefix GCOV_FILE_INFO @@ -48,12 
+44,10 @@ int test2(int b) { // CHECK-SAME: [%emit_function_args_ty { i32 0, i32 {{[-0-9]+}}, i32 {{[-0-9]+}} }, %emit_function_args_ty { i32 1, i32 {{[-0-9]+}}, i32 {{[-0-9]+}} }] // CHECK: @__llvm_internal_gcov_emit_file_info = internal unnamed_addr constant [1 x %file_info] -/// 0x3330342a '3' '0' '4' '*' -// 304-SAME: i32 858797098 -/// 0x3430372a '4' '0' '7' '*' -// 407-SAME: i32 875575082 -/// 0x3430382a '4' '0' '8' '*' -// 408-SAME: i32 875575338 +/// 0x4231312a 'B' '1' '1' '*' +// 1110-SAME: i32 1110520106 +/// 0x4232312a 'B' '2' '1' '*' +// 1210-SAME: i32 1110585642 // Check for gcov initialization function pointers. // CHECK-RT-INIT: @__llvm_covinit_functions = private constant { ptr, ptr } { ptr @__llvm_gcov_writeout, ptr @__llvm_gcov_reset }, section "__llvm_covinit" diff --git a/compiler-rt/test/profile/Inputs/instrprof-gcov-exceptions.cpp.gcov b/compiler-rt/test/profile/Inputs/instrprof-gcov-exceptions.cpp.gcov index aa202763fd564..233fd142444a5 100644 --- a/compiler-rt/test/profile/Inputs/instrprof-gcov-exceptions.cpp.gcov +++ b/compiler-rt/test/profile/Inputs/instrprof-gcov-exceptions.cpp.gcov @@ -2,7 +2,6 @@ // CHECK-NEXT: -: 0:Graph:instrprof-gcov-exceptions.gcno // CHECK-NEXT: -: 0:Data:instrprof-gcov-exceptions.gcda // CHECK-NEXT: -: 0:Runs:1 -// CHECK-NEXT: -: 0:Programs:1 // CHECK-NEXT: -: 1:#include // CHECK-NEXT: -: 2: // CHECK-NEXT: 1: 3:void asd(std::string i) { diff --git a/compiler-rt/test/profile/Inputs/instrprof-gcov-multiple-bbs-single-line.c.gcov b/compiler-rt/test/profile/Inputs/instrprof-gcov-multiple-bbs-single-line.c.gcov index 9297073d21ef8..a25632d475b34 100644 --- a/compiler-rt/test/profile/Inputs/instrprof-gcov-multiple-bbs-single-line.c.gcov +++ b/compiler-rt/test/profile/Inputs/instrprof-gcov-multiple-bbs-single-line.c.gcov @@ -2,7 +2,6 @@ // CHECK-NEXT: -: 0:Graph:instrprof-gcov-multiple-bbs-single-line.gcno // CHECK-NEXT: -: 0:Data:instrprof-gcov-multiple-bbs-single-line.gcda // CHECK-NEXT: -: 0:Runs:1 -// CHECK-NEXT: -: 
0:Programs:1 // CHECK-NEXT:function main called 1 returned 100% blocks executed 77% // CHECK-NEXT: 1: 1:int main(void) // CHECK-NEXT: -: 2:{ diff --git a/compiler-rt/test/profile/Inputs/instrprof-gcov-one-line-function.c.gcov b/compiler-rt/test/profile/Inputs/instrprof-gcov-one-line-function.c.gcov index 5a570a04742df..4dc68177e0b75 100644 --- a/compiler-rt/test/profile/Inputs/instrprof-gcov-one-line-function.c.gcov +++ b/compiler-rt/test/profile/Inputs/instrprof-gcov-one-line-function.c.gcov @@ -2,7 +2,6 @@ // CHECK-NEXT: -: 0:Graph:instrprof-gcov-one-line-function.gcno // CHECK-NEXT: -: 0:Data:instrprof-gcov-one-line-function.gcda // CHECK-NEXT: -: 0:Runs:1 -// CHECK-NEXT: -: 0:Programs:1 // CHECK-NEXT: 1: 1:void foo() { } // CHECK-NEXT: -: 2: // CHECK-NEXT: 1: 3:void bar() { } diff --git a/compiler-rt/test/profile/Inputs/instrprof-gcov-switch1.c.gcov b/compiler-rt/test/profile/Inputs/instrprof-gcov-switch1.c.gcov index 741dff59954bc..2b4d67f9abbef 100644 --- a/compiler-rt/test/profile/Inputs/instrprof-gcov-switch1.c.gcov +++ b/compiler-rt/test/profile/Inputs/instrprof-gcov-switch1.c.gcov @@ -2,7 +2,6 @@ // CHECK-NEXT: -: 0:Graph:instrprof-gcov-switch1.gcno // CHECK-NEXT: -: 0:Data:instrprof-gcov-switch1.gcda // CHECK-NEXT: -: 0:Runs:1 -// CHECK-NEXT: -: 0:Programs:1 // CHECK-NEXT: 1: 1:int main(void) // CHECK-NEXT: -: 2:{ // CHECK-NEXT: 1: 3: int i = 22; diff --git a/compiler-rt/test/profile/Inputs/instrprof-gcov-switch2.c.gcov b/compiler-rt/test/profile/Inputs/instrprof-gcov-switch2.c.gcov index c931365ddf484..f9501e0c870b2 100644 --- a/compiler-rt/test/profile/Inputs/instrprof-gcov-switch2.c.gcov +++ b/compiler-rt/test/profile/Inputs/instrprof-gcov-switch2.c.gcov @@ -2,7 +2,6 @@ // CHECK-NEXT: -: 0:Graph:instrprof-gcov-switch2.gcno // CHECK-NEXT: -: 0:Data:instrprof-gcov-switch2.gcda // CHECK-NEXT: -: 0:Runs:1 -// CHECK-NEXT: -: 0:Programs:1 // CHECK-NEXT: 1: 1:int main(void) // CHECK-NEXT: -: 2:{ // CHECK-NEXT: 1: 3: int i = 22; diff --git 
a/compiler-rt/test/profile/Inputs/instrprof-shared-lib_in-loop.c.gcov b/compiler-rt/test/profile/Inputs/instrprof-shared-lib_in-loop.c.gcov index 69350471312e3..d75a222977a0c 100644 --- a/compiler-rt/test/profile/Inputs/instrprof-shared-lib_in-loop.c.gcov +++ b/compiler-rt/test/profile/Inputs/instrprof-shared-lib_in-loop.c.gcov @@ -2,7 +2,6 @@ // CHECK-NEXT: -: 0:Graph:instrprof-shared-lib.gcno // CHECK-NEXT: -: 0:Data:instrprof-shared-lib.gcda // CHECK-NEXT: -: 0:Runs:1 -// CHECK-NEXT: -: 0:Programs:1 // CHECK-NEXT: -: 1:int g1 = 0; // CHECK-NEXT: -: 2:int g2 = 1; // CHECK-NEXT: -: 3: diff --git a/compiler-rt/test/profile/Inputs/instrprof-shared-main.c.gcov b/compiler-rt/test/profile/Inputs/instrprof-shared-main.c.gcov index a31a60238809a..24facb5e1a380 100644 --- a/compiler-rt/test/profile/Inputs/instrprof-shared-main.c.gcov +++ b/compiler-rt/test/profile/Inputs/instrprof-shared-main.c.gcov @@ -2,7 +2,6 @@ // CHECK-NEXT: -: 0:Graph:instrprof-shared-main.gcno // CHECK-NEXT: -: 0:Data:instrprof-shared-main.gcda // CHECK-NEXT: -: 0:Runs:1 -// CHECK-NEXT: -: 0:Programs:1 // CHECK-NEXT: -: 1:extern int g1, g2; // CHECK-NEXT: -: 2:extern void foo(int n); // CHECK-NEXT: -: 3: diff --git a/compiler-rt/test/profile/gcov-__gcov_flush-terminate.c b/compiler-rt/test/profile/gcov-__gcov_flush-terminate.c index ca13a0896a7b2..96cf4296524d1 100644 --- a/compiler-rt/test/profile/gcov-__gcov_flush-terminate.c +++ b/compiler-rt/test/profile/gcov-__gcov_flush-terminate.c @@ -8,7 +8,6 @@ // RUN: llvm-cov gcov -t gcov-__gcov_flush-terminate.gcda | FileCheck %s // CHECK: -: 0:Runs:1 -// CHECK-NEXT: -: 0:Programs:1 void __gcov_dump(void); void __gcov_reset(void); diff --git a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp index f9be7f933d31e..6e86ffdc80275 100644 --- a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -61,7 +61,7 @@ enum : uint32_t { }; 
static cl::opt DefaultGCOVVersion("default-gcov-version", - cl::init("408*"), cl::Hidden, + cl::init("0000"), cl::Hidden, cl::ValueRequired); static cl::opt AtomicCounter("gcov-atomic-counter", cl::Hidden, @@ -154,6 +154,7 @@ class GCOVProfiler { GCOVOptions Options; llvm::endianness Endian; raw_ostream *os; + int Version = 0; // Checksum, produced by hash of EdgeDestinations SmallVector FileChecksums; @@ -334,12 +335,9 @@ namespace { : GCOVRecord(P), SP(SP), EndLine(EndLine), Ident(Ident), Version(Version), EntryBlock(P, 0), ReturnBlock(P, 1) { LLVM_DEBUG(dbgs() << "Function: " << getFunctionName(SP) << "\n"); - bool ExitBlockBeforeBody = Version >= 48; - uint32_t i = ExitBlockBeforeBody ? 2 : 1; + uint32_t i = 2; for (BasicBlock &BB : *F) Blocks.insert(std::make_pair(&BB, GCOVBlock(P, i++))); - if (!ExitBlockBeforeBody) - ReturnBlock.Number = i; std::string FunctionNameAndLine; raw_string_ostream FNLOS(FunctionNameAndLine); @@ -363,44 +361,28 @@ namespace { void writeOut(uint32_t CfgChecksum) { write(GCOV_TAG_FUNCTION); SmallString<128> Filename = getFilename(SP); - uint32_t BlockLen = - 2 + (Version >= 47) + wordsOfString(getFunctionName(SP)); - if (Version < 80) - BlockLen += wordsOfString(Filename) + 1; - else - BlockLen += 1 + wordsOfString(Filename) + 3 + (Version >= 90); + uint32_t BlockLen = 3 + wordsOfString(getFunctionName(SP)); + BlockLen += 1 + wordsOfString(Filename) + 4; write(BlockLen); write(Ident); write(FuncChecksum); - if (Version >= 47) - write(CfgChecksum); + write(CfgChecksum); writeString(getFunctionName(SP)); - if (Version < 80) { - writeString(Filename); - write(SP->getLine()); - } else { - write(SP->isArtificial()); // artificial - writeString(Filename); - write(SP->getLine()); // start_line - write(0); // start_column - // EndLine is the last line with !dbg. It is not the } line as in GCC, - // but good enough. 
- write(EndLine); - if (Version >= 90) - write(0); // end_column - } + + write(SP->isArtificial()); // artificial + writeString(Filename); + write(SP->getLine()); // start_line + write(0); // start_column + // EndLine is the last line with !dbg. It is not the } line as in GCC, + // but good enough. + write(EndLine); + write(0); // end_column // Emit count of blocks. write(GCOV_TAG_BLOCKS); - if (Version < 80) { - write(Blocks.size() + 2); - for (int i = Blocks.size() + 2; i; --i) - write(0); - } else { - write(1); - write(Blocks.size() + 2); - } + write(1); + write(Blocks.size() + 2); LLVM_DEBUG(dbgs() << (Blocks.size() + 1) << " blocks\n"); // Emit edges between blocks. @@ -767,7 +749,6 @@ bool GCOVProfiler::emitProfileNotes( function_ref GetBFI, function_ref GetBPI, function_ref GetTLI) { - int Version; { uint8_t c3 = Options.Version[0]; uint8_t c2 = Options.Version[1]; @@ -775,6 +756,11 @@ bool GCOVProfiler::emitProfileNotes( Version = c3 >= 'A' ? (c3 - 'A') * 100 + (c2 - '0') * 10 + c1 - '0' : (c3 - '0') * 10 + c1 - '0'; } + // Emit .gcno files that are compatible with GCC 11.1. 
+ if (Version < 111) { + Version = 111; + memcpy(Options.Version, "B11*", 4); + } bool EmitGCDA = Options.EmitData; for (unsigned i = 0, e = CUNode->getNumOperands(); i != e; ++i) { @@ -973,10 +959,8 @@ bool GCOVProfiler::emitProfileNotes( out.write(Tmp, 4); } write(Stamp); - if (Version >= 90) - writeString(""); // unuseful current_working_directory - if (Version >= 80) - write(0); // unuseful has_unexecuted_blocks + writeString("."); // unuseful current_working_directory + write(0); // unuseful has_unexecuted_blocks for (auto &Func : Funcs) Func->writeOut(Stamp); diff --git a/llvm/test/Transforms/GCOVProfiling/exit-block.ll b/llvm/test/Transforms/GCOVProfiling/exit-block.ll index 50c4dc4665c95..567e22222f580 100644 --- a/llvm/test/Transforms/GCOVProfiling/exit-block.ll +++ b/llvm/test/Transforms/GCOVProfiling/exit-block.ll @@ -9,7 +9,7 @@ ; But we can optionally emit it last, to match GCC<4.8 (r189778). ; RUN: opt -passes=insert-gcov-profiling -default-gcov-version='407*' -disable-output %t/2 -; RUN: llvm-cov gcov -n -dump %t/exit-block.gcno 2>&1 | FileCheck --check-prefixes=CHECK,EXIT-LAST %s +; RUN: llvm-cov gcov -n -dump %t/exit-block.gcno 2>&1 | FileCheck --check-prefixes=CHECK,EXIT-SECOND %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/GCOVProfiling/version.ll b/llvm/test/Transforms/GCOVProfiling/version.ll index bfac2557da0b1..4751bc1bd6dc7 100644 --- a/llvm/test/Transforms/GCOVProfiling/version.ll +++ b/llvm/test/Transforms/GCOVProfiling/version.ll @@ -5,16 +5,16 @@ ; RUN: cat %t/little.txt %s %t/version.txt > %t/2 ; RUN: opt -passes=insert-gcov-profiling -disable-output < %t/2 -; RUN: head -c8 %t/version.gcno | grep '^oncg.804' +; RUN: head -c8 %t/version.gcno | grep '^oncg.11B' ; RUN: rm %t/version.gcno ; RUN: not opt -passes=insert-gcov-profiling -default-gcov-version=asdfasdf -disable-output < %t/2 -; RUN: opt -passes=insert-gcov-profiling 
-default-gcov-version='402*' -disable-output < %t/2 -; RUN: head -c8 %t/version.gcno | grep '^oncg.204' +; RUN: opt -passes=insert-gcov-profiling -default-gcov-version='B21*' -disable-output < %t/2 +; RUN: head -c8 %t/version.gcno | grep '^oncg.12B' ; RUN: rm %t/version.gcno ; RUN: cat %t/big.txt %s %t/version.txt > %t/big.ll ; RUN: opt -passes=insert-gcov-profiling -disable-output < %t/big.ll -; RUN: head -c8 %t/version.gcno | grep '^gcno408.' +; RUN: head -c8 %t/version.gcno | grep '^gcnoB11.' define void @test() !dbg !5 { ret void, !dbg !8 From 3ef78188d0d39cd00429f77f1b300be9bdf85770 Mon Sep 17 00:00:00 2001 From: Pengcheng Wang Date: Fri, 3 Jan 2025 16:41:18 +0800 Subject: [PATCH 331/567] [PowerPC] Use `RegisterClassInfo::getRegPressureSetLimit` (#120383) `RegisterClassInfo::getRegPressureSetLimit` is a wrapper of `TargetRegisterInfo::getRegPressureSetLimit` with some logics to adjust the limit by removing reserved registers. It seems that we shouldn't use `TargetRegisterInfo::getRegPressureSetLimit` directly, just like the comment "This limit must be adjusted dynamically for reserved registers" said. Separate from https://github.com/llvm/llvm-project/pull/118787 --- llvm/lib/Target/PowerPC/PPCInstrInfo.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 44f6db5061e21..fa45a7fb7fabe 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -643,8 +643,8 @@ bool PPCInstrInfo::shouldReduceRegisterPressure( }; // For now we only care about float and double type fma. - unsigned VSSRCLimit = TRI->getRegPressureSetLimit( - *MBB->getParent(), PPC::RegisterPressureSets::VSSRC); + unsigned VSSRCLimit = + RegClassInfo->getRegPressureSetLimit(PPC::RegisterPressureSets::VSSRC); // Only reduce register pressure when pressure is high. 
return GetMBBPressure(MBB)[PPC::RegisterPressureSets::VSSRC] > From 27f30029741ecf023baece7b3dde1ff9011ffefc Mon Sep 17 00:00:00 2001 From: Michael Kruse Date: Fri, 3 Jan 2025 00:34:24 +0100 Subject: [PATCH 332/567] [llvm-(min-)tblgen] Avoid redundant source compilation (#114494) All the sources of `llvm-min-tblgen` are also used for `llvm-tblgen`, with identical compilation flags. Reuse the object files of `llvm-min-tblgen` for `llvm-tblgen` by applying the usual source structure of an executable: One file per executable which named after the executable name containing the (in this case trivial) main function, which just calls the tblgen_main in TableGen.cpp. This should also clear up any confusion (including mine) of where each executable's main function is. While this slightly reduces build time, the main motivation is ccache. Using the hard_link option, building the object files for `llvm-tblgen` will result in a hard link to the same object file already used for `llvm-min-tblgen`. To signal the build system that the file is new, ccache will update the file's time stamp. Unfortunately, time stamps are shared between all hard-linked files s.t. this will indirectly also update the time stamps for the object files used for `llvm-tblgen`. At the next run, Ninja will recognize this time stamp discrepancy to the expected stamp recorded in `.ninja_log` and rebuild those object files for `llvm-min-tblgen`, which again will also update the stamp for the `llvm-tblgen`... . This is especially annoying for tablegen because it means Ninja will re-run all tablegenning in every build. I am using the hard_link option because it reduces the cost of having multiple build-trees of the LLVM sources and reduces the wear to the SSD they are stored on. 
--- .../{ => Basic}/ARMTargetDefEmitter.cpp | 0 .../utils/TableGen/{ => Basic}/Attributes.cpp | 0 llvm/utils/TableGen/Basic/CMakeLists.txt | 7 ++++++ .../TableGen/{ => Basic}/DirectiveEmitter.cpp | 0 .../TableGen/{ => Basic}/IntrinsicEmitter.cpp | 4 ++-- .../{ => Basic}/RISCVTargetDefEmitter.cpp | 0 llvm/utils/TableGen/{ => Basic}/TableGen.cpp | 6 +++-- llvm/utils/TableGen/Basic/TableGen.h | 13 +++++++++++ llvm/utils/TableGen/{ => Basic}/VTEmitter.cpp | 0 llvm/utils/TableGen/CMakeLists.txt | 23 ++++++++----------- llvm/utils/TableGen/llvm-min-tblgen.cpp | 18 +++++++++++++++ llvm/utils/TableGen/llvm-tblgen.cpp | 18 +++++++++++++++ 12 files changed, 71 insertions(+), 18 deletions(-) rename llvm/utils/TableGen/{ => Basic}/ARMTargetDefEmitter.cpp (100%) rename llvm/utils/TableGen/{ => Basic}/Attributes.cpp (100%) rename llvm/utils/TableGen/{ => Basic}/DirectiveEmitter.cpp (100%) rename llvm/utils/TableGen/{ => Basic}/IntrinsicEmitter.cpp (99%) rename llvm/utils/TableGen/{ => Basic}/RISCVTargetDefEmitter.cpp (100%) rename llvm/utils/TableGen/{ => Basic}/TableGen.cpp (94%) create mode 100644 llvm/utils/TableGen/Basic/TableGen.h rename llvm/utils/TableGen/{ => Basic}/VTEmitter.cpp (100%) create mode 100644 llvm/utils/TableGen/llvm-min-tblgen.cpp create mode 100644 llvm/utils/TableGen/llvm-tblgen.cpp diff --git a/llvm/utils/TableGen/ARMTargetDefEmitter.cpp b/llvm/utils/TableGen/Basic/ARMTargetDefEmitter.cpp similarity index 100% rename from llvm/utils/TableGen/ARMTargetDefEmitter.cpp rename to llvm/utils/TableGen/Basic/ARMTargetDefEmitter.cpp diff --git a/llvm/utils/TableGen/Attributes.cpp b/llvm/utils/TableGen/Basic/Attributes.cpp similarity index 100% rename from llvm/utils/TableGen/Attributes.cpp rename to llvm/utils/TableGen/Basic/Attributes.cpp diff --git a/llvm/utils/TableGen/Basic/CMakeLists.txt b/llvm/utils/TableGen/Basic/CMakeLists.txt index 41d737e8d418e..b058fba78eb05 100644 --- a/llvm/utils/TableGen/Basic/CMakeLists.txt +++ 
b/llvm/utils/TableGen/Basic/CMakeLists.txt @@ -9,8 +9,15 @@ set(LLVM_LINK_COMPONENTS ) add_llvm_library(LLVMTableGenBasic OBJECT EXCLUDE_FROM_ALL DISABLE_LLVM_LINK_LLVM_DYLIB + ARMTargetDefEmitter.cpp + Attributes.cpp CodeGenIntrinsics.cpp + DirectiveEmitter.cpp + IntrinsicEmitter.cpp + RISCVTargetDefEmitter.cpp SDNodeProperties.cpp + TableGen.cpp + VTEmitter.cpp ) # Users may include its headers as "Basic/*.h" diff --git a/llvm/utils/TableGen/DirectiveEmitter.cpp b/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp similarity index 100% rename from llvm/utils/TableGen/DirectiveEmitter.cpp rename to llvm/utils/TableGen/Basic/DirectiveEmitter.cpp diff --git a/llvm/utils/TableGen/IntrinsicEmitter.cpp b/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp similarity index 99% rename from llvm/utils/TableGen/IntrinsicEmitter.cpp rename to llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp index 093602c3da804..fc2b8908a35b8 100644 --- a/llvm/utils/TableGen/IntrinsicEmitter.cpp +++ b/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp @@ -10,8 +10,8 @@ // //===----------------------------------------------------------------------===// -#include "Basic/CodeGenIntrinsics.h" -#include "Basic/SequenceToOffsetTable.h" +#include "CodeGenIntrinsics.h" +#include "SequenceToOffsetTable.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" diff --git a/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp b/llvm/utils/TableGen/Basic/RISCVTargetDefEmitter.cpp similarity index 100% rename from llvm/utils/TableGen/RISCVTargetDefEmitter.cpp rename to llvm/utils/TableGen/Basic/RISCVTargetDefEmitter.cpp diff --git a/llvm/utils/TableGen/TableGen.cpp b/llvm/utils/TableGen/Basic/TableGen.cpp similarity index 94% rename from llvm/utils/TableGen/TableGen.cpp rename to llvm/utils/TableGen/Basic/TableGen.cpp index bea2a2e735dbe..80ac93f2b54fb 100644 --- a/llvm/utils/TableGen/TableGen.cpp +++ b/llvm/utils/TableGen/Basic/TableGen.cpp @@ -6,10 +6,12 @@ // 
//===----------------------------------------------------------------------===// // -// This file contains the main function for LLVM's TableGen. +// This file contains the global defintions (mostly command line parameters) +// shared between llvm-tblgen and llvm-min-tblgen. // //===----------------------------------------------------------------------===// +#include "TableGen.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/InitLLVM.h" @@ -74,7 +76,7 @@ static TableGen::Emitter::Opt X[] = { {"print-sets", printSets, "Print expanded sets for testing DAG exprs"}, }; -int main(int argc, char **argv) { +int tblgen_main(int argc, char **argv) { InitLLVM X(argc, argv); cl::ParseCommandLineOptions(argc, argv); diff --git a/llvm/utils/TableGen/Basic/TableGen.h b/llvm/utils/TableGen/Basic/TableGen.h new file mode 100644 index 0000000000000..630aea62fcf90 --- /dev/null +++ b/llvm/utils/TableGen/Basic/TableGen.h @@ -0,0 +1,13 @@ +//===- TableGen.h ---------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Shared entry point for llvm-tblgen and llvm-min-tblgen. 
+// +//===----------------------------------------------------------------------===// + +int tblgen_main(int argc, char **argv); diff --git a/llvm/utils/TableGen/VTEmitter.cpp b/llvm/utils/TableGen/Basic/VTEmitter.cpp similarity index 100% rename from llvm/utils/TableGen/VTEmitter.cpp rename to llvm/utils/TableGen/Basic/VTEmitter.cpp diff --git a/llvm/utils/TableGen/CMakeLists.txt b/llvm/utils/TableGen/CMakeLists.txt index ba1e4aa01b48d..96a74c6fd89f7 100644 --- a/llvm/utils/TableGen/CMakeLists.txt +++ b/llvm/utils/TableGen/CMakeLists.txt @@ -11,14 +11,13 @@ set(LLVM_LINK_COMPONENTS Support) # build llvm/include. It must not depend on TableGenCommon, as # TableGenCommon depends on this already to generate things such as # ValueType definitions. +# Sources included in both, llvm-min-tblgen and llvm-tblgen, must be included +# into LLVMTableGenBasic to avoid redundant compilation and problems with build +# caches. +# At least one source file must be included directly to avoid CMake problems. +# E.g. CMake derives which linker to use from the types of sources added. 
add_tablegen(llvm-min-tblgen LLVM_HEADERS - TableGen.cpp - ARMTargetDefEmitter.cpp - Attributes.cpp - DirectiveEmitter.cpp - IntrinsicEmitter.cpp - RISCVTargetDefEmitter.cpp - VTEmitter.cpp + llvm-min-tblgen.cpp $ PARTIAL_SOURCES_INTENDED @@ -32,10 +31,8 @@ set(LLVM_LINK_COMPONENTS add_tablegen(llvm-tblgen LLVM DESTINATION "${LLVM_TOOLS_INSTALL_DIR}" EXPORT LLVM - ARMTargetDefEmitter.cpp AsmMatcherEmitter.cpp AsmWriterEmitter.cpp - Attributes.cpp CallingConvEmitter.cpp CodeEmitterGen.cpp CodeGenMapTable.cpp @@ -48,7 +45,6 @@ add_tablegen(llvm-tblgen LLVM DecoderEmitter.cpp DFAEmitter.cpp DFAPacketizerEmitter.cpp - DirectiveEmitter.cpp DisassemblerEmitter.cpp DXILEmitter.cpp ExegesisEmitter.cpp @@ -57,18 +53,15 @@ add_tablegen(llvm-tblgen LLVM GlobalISelEmitter.cpp InstrDocsEmitter.cpp InstrInfoEmitter.cpp - IntrinsicEmitter.cpp + llvm-tblgen.cpp MacroFusionPredicatorEmitter.cpp OptionParserEmitter.cpp OptionRSTEmitter.cpp PseudoLoweringEmitter.cpp RegisterBankEmitter.cpp RegisterInfoEmitter.cpp - RISCVTargetDefEmitter.cpp SearchableTableEmitter.cpp SubtargetEmitter.cpp - TableGen.cpp - VTEmitter.cpp WebAssemblyDisassemblerEmitter.cpp X86InstrMappingEmitter.cpp X86DisassemblerTables.cpp @@ -79,6 +72,8 @@ add_tablegen(llvm-tblgen LLVM $ $ + PARTIAL_SOURCES_INTENDED + DEPENDS intrinsics_gen # via llvm-min-tablegen ) diff --git a/llvm/utils/TableGen/llvm-min-tblgen.cpp b/llvm/utils/TableGen/llvm-min-tblgen.cpp new file mode 100644 index 0000000000000..79fce5c555f6e --- /dev/null +++ b/llvm/utils/TableGen/llvm-min-tblgen.cpp @@ -0,0 +1,18 @@ +//===- llvm-min-tblgen.cpp ------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the main function for LLVM's TableGen. +// +//===----------------------------------------------------------------------===// + +#include "Basic/TableGen.h" + +/// Command line parameters are shared between llvm-tblgen and llvm-min-tblgen. +/// The indirection to tblgen_main exists to ensure that the static variables +/// for the llvm::cl:: mechanism are linked into both executables. +int main(int argc, char **argv) { return tblgen_main(argc, argv); } diff --git a/llvm/utils/TableGen/llvm-tblgen.cpp b/llvm/utils/TableGen/llvm-tblgen.cpp new file mode 100644 index 0000000000000..a38382472a992 --- /dev/null +++ b/llvm/utils/TableGen/llvm-tblgen.cpp @@ -0,0 +1,18 @@ +//===- llvm-tblgen.cpp ----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the main function for LLVM's TableGen. +// +//===----------------------------------------------------------------------===// + +#include "Basic/TableGen.h" + +/// Command line parameters are shared between llvm-tblgen and llvm-min-tblgen. +/// The indirection to tblgen_main exists to ensure that the static variables +/// for the llvm::cl:: mechanism are linked into both executables. 
+int main(int argc, char **argv) { return tblgen_main(argc, argv); } From 67ff11ea5b2d2d51fa634361dd88c6dc9429706a Mon Sep 17 00:00:00 2001 From: ZhaoQi Date: Fri, 3 Jan 2025 16:43:39 +0800 Subject: [PATCH 333/567] [LoongArch] Avoid scheduling tls-desc code sequence in large code model (#121541) --- llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp | 11 ++++++++++- .../CodeGen/LoongArch/psabi-restricted-scheduling.ll | 2 +- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp index 7d0e4f9d58a16..54aeda2836400 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp @@ -406,6 +406,11 @@ bool LoongArchInstrInfo::isSchedulingBoundary(const MachineInstr &MI, // lu32i.d $a1, %ie64_pc_lo20(s) // lu52i.d $a1, $a1, %ie64_pc_hi12(s) // + // * pcalau12i $a0, %desc_pc_hi20(s) + // addi.d $a1, $zero, %desc_pc_lo12(s) + // lu32i.d $a1, %desc64_pc_lo20(s) + // lu52i.d $a1, $a1, %desc64_pc_hi12(s) + // // For simplicity, only pcalau12i and lu52i.d are marked as scheduling // boundaries, and the instructions between them are guaranteed to be // ordered according to data dependencies. 
@@ -430,12 +435,16 @@ bool LoongArchInstrInfo::isSchedulingBoundary(const MachineInstr &MI, if (MO0 == LoongArchII::MO_IE_PC_HI && MO1 == LoongArchII::MO_IE_PC_LO && MO2 == LoongArchII::MO_IE_PC64_LO) return true; + if (MO0 == LoongArchII::MO_DESC_PC_HI && + MO1 == LoongArchII::MO_DESC_PC_LO && + MO2 == LoongArchII::MO_DESC64_PC_LO) + return true; break; } case LoongArch::LU52I_D: { auto MO = MI.getOperand(2).getTargetFlags(); if (MO == LoongArchII::MO_PCREL64_HI || MO == LoongArchII::MO_GOT_PC64_HI || - MO == LoongArchII::MO_IE_PC64_HI) + MO == LoongArchII::MO_IE_PC64_HI || MO == LoongArchII::MO_DESC64_PC_HI) return true; break; } diff --git a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll index 1773b8e014997..3390f7fe14ae6 100644 --- a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll +++ b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll @@ -252,8 +252,8 @@ define void @baz() nounwind { ; LARGEDESC_SCH: # %bb.0: ; LARGEDESC_SCH-NEXT: addi.d $sp, $sp, -16 ; LARGEDESC_SCH-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill -; LARGEDESC_SCH-NEXT: addi.d $a1, $zero, %desc_pc_lo12(gd) ; LARGEDESC_SCH-NEXT: pcalau12i $a0, %desc_pc_hi20(gd) +; LARGEDESC_SCH-NEXT: addi.d $a1, $zero, %desc_pc_lo12(gd) ; LARGEDESC_SCH-NEXT: lu32i.d $a1, %desc64_pc_lo20(gd) ; LARGEDESC_SCH-NEXT: lu52i.d $a1, $a1, %desc64_pc_hi12(gd) ; LARGEDESC_SCH-NEXT: add.d $a0, $a0, $a1 From 8b23ebb498bc67f03571b1d429771b28868b8932 Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Fri, 3 Jan 2025 03:55:58 -0500 Subject: [PATCH 334/567] [AMDGPU][True16[MC] true16 for v_max3/min3_num_f16 (#121510) V_MAX3/MIN3_NUM_F16 are alias GFX12 instructions with V_MAX3/MIN3_F16 in GFX11 and they should be updated together. This fix a bug introduced in https://github.com/llvm/llvm-project/pull/113603 such that only V_MAX3/MIN3_F16 are replaced in true16 format. 
Also added GFX12 runlines for CodeGen test --- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 4 +- llvm/test/CodeGen/AMDGPU/fmax3.ll | 134 ++++++++++++ llvm/test/CodeGen/AMDGPU/fmin3.ll | 200 ++++++++++++++++++ llvm/test/MC/AMDGPU/gfx12_asm_vop3.s | 144 +++++++------ llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s | 8 +- llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s | 168 +++++++-------- llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s | 152 ++++++------- .../Disassembler/AMDGPU/gfx12_dasm_vop3.txt | 162 +++++++++++--- .../AMDGPU/gfx12_dasm_vop3_dpp16.txt | 200 +++++++++++++++--- .../AMDGPU/gfx12_dasm_vop3_dpp8.txt | 200 +++++++++++++++--- 10 files changed, 1056 insertions(+), 316 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 22e457674c07a..d00c810859e3b 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -1578,8 +1578,8 @@ def : MinimumMaximumByMinimum3Maximum3; defm V_MIN3_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x229, "V_MIN3_F32", "v_min3_num_f32">; defm V_MAX3_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x22a, "V_MAX3_F32", "v_max3_num_f32">; -defm V_MIN3_NUM_F16 : VOP3_Realtriple_with_name_gfx12<0x22b, "V_MIN3_F16", "v_min3_num_f16">; -defm V_MAX3_NUM_F16 : VOP3_Realtriple_with_name_gfx12<0x22c, "V_MAX3_F16", "v_max3_num_f16">; +defm V_MIN3_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x22b, "v_min3_num_f16", "V_MIN3_F16", "v_min3_f16">; +defm V_MAX3_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x22c, "v_max3_num_f16", "V_MAX3_F16", "v_max3_f16">; defm V_MINIMUM3_F32 : VOP3Only_Realtriple_gfx12<0x22d>; defm V_MAXIMUM3_F32 : VOP3Only_Realtriple_gfx12<0x22e>; defm V_MINIMUM3_F16 : VOP3Only_Realtriple_t16_gfx12<0x22f>; diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll index 4b3f0dbbaea98..fbcdbed338e60 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll @@ -3,6 +3,7 @@ ; RUN: 
llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmax3_olt_0_f32: @@ -124,6 +125,36 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_max3_f32 v0, v0, v1, v2 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_fmax3_olt_0_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s10, -1 +; GFX12-NEXT: s_mov_b32 s11, 0x31016000 +; GFX12-NEXT: s_mov_b32 s14, s10 +; GFX12-NEXT: s_mov_b32 s15, s11 +; GFX12-NEXT: s_mov_b32 s18, s10 +; GFX12-NEXT: s_mov_b32 s19, s11 +; GFX12-NEXT: s_mov_b32 s22, s10 +; GFX12-NEXT: s_mov_b32 s23, s11 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s12, s2 +; GFX12-NEXT: s_mov_b32 s13, s3 +; GFX12-NEXT: s_mov_b32 s16, s4 +; GFX12-NEXT: s_mov_b32 s17, s5 +; GFX12-NEXT: s_mov_b32 s20, s6 +; GFX12-NEXT: s_mov_b32 s21, s7 +; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s0 +; GFX12-NEXT: s_mov_b32 s9, s1 +; GFX12-NEXT: v_max3_num_f32 
v0, v0, v1, v2 +; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null +; GFX12-NEXT: s_endpgm %a = load volatile float, ptr addrspace(1) %aptr, align 4 %b = load volatile float, ptr addrspace(1) %bptr, align 4 %c = load volatile float, ptr addrspace(1) %cptr, align 4 @@ -254,6 +285,36 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_max3_f32 v0, v2, v0, v1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_fmax3_olt_1_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s10, -1 +; GFX12-NEXT: s_mov_b32 s11, 0x31016000 +; GFX12-NEXT: s_mov_b32 s14, s10 +; GFX12-NEXT: s_mov_b32 s15, s11 +; GFX12-NEXT: s_mov_b32 s18, s10 +; GFX12-NEXT: s_mov_b32 s19, s11 +; GFX12-NEXT: s_mov_b32 s22, s10 +; GFX12-NEXT: s_mov_b32 s23, s11 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s12, s2 +; GFX12-NEXT: s_mov_b32 s13, s3 +; GFX12-NEXT: s_mov_b32 s16, s4 +; GFX12-NEXT: s_mov_b32 s17, s5 +; GFX12-NEXT: s_mov_b32 s20, s6 +; GFX12-NEXT: s_mov_b32 s21, s7 +; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s0 +; GFX12-NEXT: s_mov_b32 s9, s1 +; GFX12-NEXT: v_max3_num_f32 v0, v2, v0, v1 +; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null +; GFX12-NEXT: s_endpgm %a = load volatile float, ptr addrspace(1) %aptr, align 4 %b = load volatile float, ptr addrspace(1) %bptr, align 4 %c = load volatile float, ptr addrspace(1) %cptr, align 4 @@ -391,6 +452,36 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_max3_f16 v0, v0, v1, v2 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX11-NEXT: s_endpgm +; +; 
GFX12-LABEL: test_fmax3_olt_0_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s10, -1 +; GFX12-NEXT: s_mov_b32 s11, 0x31016000 +; GFX12-NEXT: s_mov_b32 s14, s10 +; GFX12-NEXT: s_mov_b32 s15, s11 +; GFX12-NEXT: s_mov_b32 s18, s10 +; GFX12-NEXT: s_mov_b32 s19, s11 +; GFX12-NEXT: s_mov_b32 s22, s10 +; GFX12-NEXT: s_mov_b32 s23, s11 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s12, s2 +; GFX12-NEXT: s_mov_b32 s13, s3 +; GFX12-NEXT: s_mov_b32 s16, s4 +; GFX12-NEXT: s_mov_b32 s17, s5 +; GFX12-NEXT: s_mov_b32 s20, s6 +; GFX12-NEXT: s_mov_b32 s21, s7 +; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s0 +; GFX12-NEXT: s_mov_b32 s9, s1 +; GFX12-NEXT: v_max3_num_f16 v0, v0, v1, v2 +; GFX12-NEXT: buffer_store_b16 v0, off, s[8:11], null +; GFX12-NEXT: s_endpgm %a = load volatile half, ptr addrspace(1) %aptr, align 2 %b = load volatile half, ptr addrspace(1) %bptr, align 2 %c = load volatile half, ptr addrspace(1) %cptr, align 2 @@ -529,6 +620,36 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_max3_f16 v0, v2, v0, v1 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_fmax3_olt_1_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s10, -1 +; GFX12-NEXT: s_mov_b32 s11, 0x31016000 +; GFX12-NEXT: s_mov_b32 s14, s10 +; GFX12-NEXT: s_mov_b32 s15, s11 +; GFX12-NEXT: s_mov_b32 s18, s10 +; GFX12-NEXT: s_mov_b32 s19, s11 +; GFX12-NEXT: s_mov_b32 s22, s10 +; GFX12-NEXT: s_mov_b32 s23, s11 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s12, s2 +; GFX12-NEXT: s_mov_b32 s13, s3 +; GFX12-NEXT: 
s_mov_b32 s16, s4 +; GFX12-NEXT: s_mov_b32 s17, s5 +; GFX12-NEXT: s_mov_b32 s20, s6 +; GFX12-NEXT: s_mov_b32 s21, s7 +; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s0 +; GFX12-NEXT: s_mov_b32 s9, s1 +; GFX12-NEXT: v_max3_num_f16 v0, v2, v0, v1 +; GFX12-NEXT: buffer_store_b16 v0, off, s[8:11], null +; GFX12-NEXT: s_endpgm %a = load volatile half, ptr addrspace(1) %aptr, align 2 %b = load volatile half, ptr addrspace(1) %bptr, align 2 %c = load volatile half, ptr addrspace(1) %cptr, align 2 @@ -594,6 +715,19 @@ define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, < ; GFX11-NEXT: v_pk_max_f16 v0, v2, v0 ; GFX11-NEXT: v_pk_max_f16 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: no_fmax3_v2f16: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v0, v2, v0 +; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b) %max1 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %c, <2 x half> %max) diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll index 38b712e044df9..269fd52df5c49 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll @@ -3,6 +3,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope 
-check-prefixes=VI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmin3_olt_0_f32: @@ -124,6 +125,36 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_min3_f32 v0, v0, v1, v2 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_fmin3_olt_0_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s10, -1 +; GFX12-NEXT: s_mov_b32 s11, 0x31016000 +; GFX12-NEXT: s_mov_b32 s14, s10 +; GFX12-NEXT: s_mov_b32 s15, s11 +; GFX12-NEXT: s_mov_b32 s18, s10 +; GFX12-NEXT: s_mov_b32 s19, s11 +; GFX12-NEXT: s_mov_b32 s22, s10 +; GFX12-NEXT: s_mov_b32 s23, s11 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s12, s2 +; GFX12-NEXT: s_mov_b32 s13, s3 +; GFX12-NEXT: s_mov_b32 s16, s4 +; GFX12-NEXT: s_mov_b32 s17, s5 +; GFX12-NEXT: s_mov_b32 s20, s6 +; GFX12-NEXT: s_mov_b32 s21, s7 +; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s0 +; GFX12-NEXT: s_mov_b32 s9, s1 +; GFX12-NEXT: v_min3_num_f32 v0, v0, v1, v2 +; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null +; GFX12-NEXT: s_endpgm %a = load volatile 
float, ptr addrspace(1) %aptr, align 4 %b = load volatile float, ptr addrspace(1) %bptr, align 4 %c = load volatile float, ptr addrspace(1) %cptr, align 4 @@ -254,6 +285,36 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_min3_f32 v0, v2, v0, v1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_fmin3_olt_1_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s10, -1 +; GFX12-NEXT: s_mov_b32 s11, 0x31016000 +; GFX12-NEXT: s_mov_b32 s14, s10 +; GFX12-NEXT: s_mov_b32 s15, s11 +; GFX12-NEXT: s_mov_b32 s18, s10 +; GFX12-NEXT: s_mov_b32 s19, s11 +; GFX12-NEXT: s_mov_b32 s22, s10 +; GFX12-NEXT: s_mov_b32 s23, s11 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s12, s2 +; GFX12-NEXT: s_mov_b32 s13, s3 +; GFX12-NEXT: s_mov_b32 s16, s4 +; GFX12-NEXT: s_mov_b32 s17, s5 +; GFX12-NEXT: s_mov_b32 s20, s6 +; GFX12-NEXT: s_mov_b32 s21, s7 +; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s0 +; GFX12-NEXT: s_mov_b32 s9, s1 +; GFX12-NEXT: v_min3_num_f32 v0, v2, v0, v1 +; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null +; GFX12-NEXT: s_endpgm %a = load volatile float, ptr addrspace(1) %aptr, align 4 %b = load volatile float, ptr addrspace(1) %bptr, align 4 %c = load volatile float, ptr addrspace(1) %cptr, align 4 @@ -391,6 +452,36 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_min3_f16 v0, v0, v1, v2 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_fmin3_olt_0_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; 
GFX12-NEXT: s_mov_b32 s10, -1 +; GFX12-NEXT: s_mov_b32 s11, 0x31016000 +; GFX12-NEXT: s_mov_b32 s14, s10 +; GFX12-NEXT: s_mov_b32 s15, s11 +; GFX12-NEXT: s_mov_b32 s18, s10 +; GFX12-NEXT: s_mov_b32 s19, s11 +; GFX12-NEXT: s_mov_b32 s22, s10 +; GFX12-NEXT: s_mov_b32 s23, s11 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s12, s2 +; GFX12-NEXT: s_mov_b32 s13, s3 +; GFX12-NEXT: s_mov_b32 s16, s4 +; GFX12-NEXT: s_mov_b32 s17, s5 +; GFX12-NEXT: s_mov_b32 s20, s6 +; GFX12-NEXT: s_mov_b32 s21, s7 +; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s0 +; GFX12-NEXT: s_mov_b32 s9, s1 +; GFX12-NEXT: v_min3_num_f16 v0, v0, v1, v2 +; GFX12-NEXT: buffer_store_b16 v0, off, s[8:11], null +; GFX12-NEXT: s_endpgm %a = load volatile half, ptr addrspace(1) %aptr, align 2 %b = load volatile half, ptr addrspace(1) %bptr, align 2 %c = load volatile half, ptr addrspace(1) %cptr, align 2 @@ -529,6 +620,36 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_min3_f16 v0, v2, v0, v1 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_fmin3_olt_1_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s10, -1 +; GFX12-NEXT: s_mov_b32 s11, 0x31016000 +; GFX12-NEXT: s_mov_b32 s14, s10 +; GFX12-NEXT: s_mov_b32 s15, s11 +; GFX12-NEXT: s_mov_b32 s18, s10 +; GFX12-NEXT: s_mov_b32 s19, s11 +; GFX12-NEXT: s_mov_b32 s22, s10 +; GFX12-NEXT: s_mov_b32 s23, s11 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s12, s2 +; GFX12-NEXT: s_mov_b32 s13, s3 +; GFX12-NEXT: s_mov_b32 s16, s4 +; GFX12-NEXT: s_mov_b32 s17, s5 +; GFX12-NEXT: s_mov_b32 s20, s6 +; GFX12-NEXT: 
s_mov_b32 s21, s7 +; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s0 +; GFX12-NEXT: s_mov_b32 s9, s1 +; GFX12-NEXT: v_min3_num_f16 v0, v2, v0, v1 +; GFX12-NEXT: buffer_store_b16 v0, off, s[8:11], null +; GFX12-NEXT: s_endpgm %a = load volatile half, ptr addrspace(1) %aptr, align 2 %b = load volatile half, ptr addrspace(1) %bptr, align 2 %c = load volatile half, ptr addrspace(1) %cptr, align 2 @@ -594,6 +715,19 @@ define <2 x half> @no_fmin3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, < ; GFX11-NEXT: v_pk_min_f16 v0, v2, v0 ; GFX11-NEXT: v_pk_min_f16 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: no_fmin3_v2f16: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v0, v2, v0 +; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %min = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b) %min1 = call <2 x half> @llvm.minnum.v2f16(<2 x half> %c, <2 x half> %min) @@ -734,6 +868,39 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_fmin3_olt_0_f64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s10, -1 +; GFX12-NEXT: s_mov_b32 s11, 0x31016000 +; GFX12-NEXT: s_mov_b32 s14, s10 +; 
GFX12-NEXT: s_mov_b32 s15, s11 +; GFX12-NEXT: s_mov_b32 s18, s10 +; GFX12-NEXT: s_mov_b32 s19, s11 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s12, s2 +; GFX12-NEXT: s_mov_b32 s13, s3 +; GFX12-NEXT: s_mov_b32 s16, s4 +; GFX12-NEXT: s_mov_b32 s17, s5 +; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_mov_b32 s12, s6 +; GFX12-NEXT: s_mov_b32 s13, s7 +; GFX12-NEXT: s_mov_b32 s8, s0 +; GFX12-NEXT: buffer_load_b64 v[4:5], off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_mov_b32 s9, s1 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null +; GFX12-NEXT: s_endpgm %a = load volatile double, ptr addrspace(1) %aptr, align 4 %b = load volatile double, ptr addrspace(1) %bptr, align 4 %c = load volatile double, ptr addrspace(1) %cptr, align 4 @@ -877,6 +1044,39 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_fmin3_olt_1_f64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s10, -1 +; GFX12-NEXT: s_mov_b32 s11, 0x31016000 +; GFX12-NEXT: s_mov_b32 s14, s10 +; GFX12-NEXT: s_mov_b32 s15, s11 +; GFX12-NEXT: s_mov_b32 s18, s10 +; GFX12-NEXT: s_mov_b32 s19, s11 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s12, s2 +; GFX12-NEXT: s_mov_b32 s13, s3 +; 
GFX12-NEXT: s_mov_b32 s16, s4 +; GFX12-NEXT: s_mov_b32 s17, s5 +; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_mov_b32 s12, s6 +; GFX12-NEXT: s_mov_b32 s13, s7 +; GFX12-NEXT: s_mov_b32 s8, s0 +; GFX12-NEXT: buffer_load_b64 v[4:5], off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_mov_b32 s9, s1 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[2:3], v[0:1] +; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null +; GFX12-NEXT: s_endpgm %a = load volatile double, ptr addrspace(1) %aptr, align 4 %b = load volatile double, ptr addrspace(1) %bptr, align 4 %c = load volatile double, ptr addrspace(1) %cptr, align 4 diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s index 0309b2e8e517e..5674d26327201 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s @@ -3164,50 +3164,62 @@ v_mad_co_u64_u32 v[5:6], ttmp[14:15], src_scc, vcc_lo, src_scc v_mad_co_u64_u32 v[254:255], null, 0xaf123456, vcc_hi, 0.5 clamp // GFX12: v_mad_co_u64_u32 v[254:255], null, 0xaf123456, vcc_hi, 0.5 clamp ; encoding: [0xfe,0xfc,0xfe,0xd6,0xff,0xd6,0xc0,0x03,0x56,0x34,0x12,0xaf] -v_max3_num_f16 v5, v1, v2, s3 -// GFX12: v_max3_num_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x00] +v_max3_num_f16 v5.l, v1.l, v2.l, s3 +// GFX12: v_max3_num_f16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x00] -v_max3_num_f16 v5, v255, s2, s105 -// GFX12: v_max3_num_f16 v5, 
v255, s2, s105 ; encoding: [0x05,0x00,0x2c,0xd6,0xff,0x05,0xa4,0x01] +v_max3_num_f16 v5.l, v255.l, s2, s105 +// GFX12: v_max3_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x2c,0xd6,0xff,0x05,0xa4,0x01] -v_max3_num_f16 v5, s1, v255, exec_hi -// GFX12: v_max3_num_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0xfe,0xff,0x01] +v_max3_num_f16 v5.l, s1, v255.l, exec_hi +// GFX12: v_max3_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0xfe,0xff,0x01] -v_max3_num_f16 v5, s105, s105, exec_lo -// GFX12: v_max3_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2c,0xd6,0x69,0xd2,0xf8,0x01] +v_max3_num_f16 v5.l, s105, s105, exec_lo +// GFX12: v_max3_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2c,0xd6,0x69,0xd2,0xf8,0x01] -v_max3_num_f16 v5, vcc_lo, ttmp15, v3 -// GFX12: v_max3_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x2c,0xd6,0x6a,0xf6,0x0c,0x04] +v_max3_num_f16 v5.l, vcc_lo, ttmp15, v3.l +// GFX12: v_max3_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x2c,0xd6,0x6a,0xf6,0x0c,0x04] -v_max3_num_f16 v5, vcc_hi, 0xfe0b, v255 -// GFX12: v_max3_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +v_max3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l +// GFX12: v_max3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] -v_max3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| -// GFX12: v_max3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2c,0xd6,0x7b,0xfa,0xed,0xe1] +v_max3_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX12: v_max3_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2c,0xd6,0x7b,0xfa,0xed,0xe1] -v_max3_num_f16 v5, m0, 0.5, m0 -// GFX12: v_max3_num_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x2c,0xd6,0x7d,0xe0,0xf5,0x01] +v_max3_num_f16 v5.l, m0, 0.5, m0 +// GFX12: v_max3_num_f16 v5.l, m0, 0.5, m0 ; encoding: 
[0x05,0x00,0x2c,0xd6,0x7d,0xe0,0xf5,0x01] -v_max3_num_f16 v5, |exec_lo|, -1, vcc_hi -// GFX12: v_max3_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2c,0xd6,0x7e,0x82,0xad,0x01] +v_max3_num_f16 v5.l, |exec_lo|, -1, vcc_hi +// GFX12: v_max3_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2c,0xd6,0x7e,0x82,0xad,0x01] -v_max3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] -// GFX12: v_max3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] ; encoding: [0x05,0x7d,0x2c,0xd6,0x7f,0xf8,0xa8,0xa1] +v_max3_num_f16 v5.h, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] +// GFX12: v_max3_num_f16 v5.h, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] ; encoding: [0x05,0x7d,0x2c,0xd6,0x7f,0xf8,0xa8,0xa1] -v_max3_num_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[0,0,0,0] -// GFX12: v_max3_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x2c,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +v_max3_num_f16 v5.l, null, exec_lo, -|0xfe0b| op_sel:[0,0,0,0] +// GFX12: v_max3_num_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x2c,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] -v_max3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] -// GFX12: v_max3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2c,0xd6,0xc1,0xfe,0xf4,0xc3] +v_max3_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] +// GFX12: v_max3_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2c,0xd6,0xc1,0xfe,0xf4,0xc3] -v_max3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] -// GFX12: v_max3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2c,0xd6,0xf0,0xfa,0xc0,0x43] +v_max3_num_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] +// GFX12: v_max3_num_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2c,0xd6,0xf0,0xfa,0xc0,0x43] -v_max3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] -// GFX12: v_max3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: 
[0x05,0x22,0x2c,0xd6,0xfd,0xd4,0x04,0x23] +v_max3_num_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] +// GFX12: v_max3_num_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2c,0xd6,0xfd,0xd4,0x04,0x23] -v_max3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp -// GFX12: v_max3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2c,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +v_max3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp +// GFX12: v_max3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2c,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] + +v_max3_num_f16 v5.l, v255.h, s2, s105 +// GFX12: v_max3_num_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x2c,0xd6,0xff,0x05,0xa4,0x01] + +v_max3_num_f16 v5.l, s1, v255.h, exec_hi +// GFX12: v_max3_num_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2c,0xd6,0x01,0xfe,0xff,0x01] + +v_max3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h +// GFX12: v_max3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +v_max3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null clamp +// GFX12: v_max3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2c,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] v_max3_num_f32 v5, v1, v2, s3 // GFX12: v_max3_num_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x2a,0xd6,0x01,0x05,0x0e,0x00] @@ -4142,50 +4154,62 @@ v_med3_u32 v5, src_scc, vcc_lo, -1 v_med3_u32 v255, 0xaf123456, vcc_hi, null // GFX12: v_med3_u32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x21,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] -v_min3_num_f16 v5, v1, v2, s3 -// GFX12: v_min3_num_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x00] +v_min3_num_f16 v5.l, v1.l, v2.l, s3 +// GFX12: v_min3_num_f16 v5.l, v1.l, v2.l, s3 ; 
encoding: [0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x00] + +v_min3_num_f16 v5.l, v255.l, s2, s105 +// GFX12: v_min3_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x2b,0xd6,0xff,0x05,0xa4,0x01] + +v_min3_num_f16 v5.l, s1, v255.l, exec_hi +// GFX12: v_min3_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0xfe,0xff,0x01] + +v_min3_num_f16 v5.l, s105, s105, exec_lo +// GFX12: v_min3_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2b,0xd6,0x69,0xd2,0xf8,0x01] + +v_min3_num_f16 v5.l, vcc_lo, ttmp15, v3.l +// GFX12: v_min3_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x2b,0xd6,0x6a,0xf6,0x0c,0x04] -v_min3_num_f16 v5, v255, s2, s105 -// GFX12: v_min3_num_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x2b,0xd6,0xff,0x05,0xa4,0x01] +v_min3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l +// GFX12: v_min3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] -v_min3_num_f16 v5, s1, v255, exec_hi -// GFX12: v_min3_num_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0xfe,0xff,0x01] +v_min3_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX12: v_min3_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2b,0xd6,0x7b,0xfa,0xed,0xe1] -v_min3_num_f16 v5, s105, s105, exec_lo -// GFX12: v_min3_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2b,0xd6,0x69,0xd2,0xf8,0x01] +v_min3_num_f16 v5.l, m0, 0.5, m0 +// GFX12: v_min3_num_f16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x2b,0xd6,0x7d,0xe0,0xf5,0x01] -v_min3_num_f16 v5, vcc_lo, ttmp15, v3 -// GFX12: v_min3_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x2b,0xd6,0x6a,0xf6,0x0c,0x04] +v_min3_num_f16 v5.l, |exec_lo|, -1, vcc_hi +// GFX12: v_min3_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2b,0xd6,0x7e,0x82,0xad,0x01] -v_min3_num_f16 v5, vcc_hi, 0xfe0b, v255 -// GFX12: v_min3_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: 
[0x05,0x00,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +v_min3_num_f16 v5.h, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] +// GFX12: v_min3_num_f16 v5.h, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] ; encoding: [0x05,0x7d,0x2b,0xd6,0x7f,0xf8,0xa8,0xa1] -v_min3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| -// GFX12: v_min3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2b,0xd6,0x7b,0xfa,0xed,0xe1] +v_min3_num_f16 v5.l, null, exec_lo, -|0xfe0b| op_sel:[0,0,0,0] +// GFX12: v_min3_num_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x2b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] -v_min3_num_f16 v5, m0, 0.5, m0 -// GFX12: v_min3_num_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x2b,0xd6,0x7d,0xe0,0xf5,0x01] +v_min3_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] +// GFX12: v_min3_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2b,0xd6,0xc1,0xfe,0xf4,0xc3] -v_min3_num_f16 v5, |exec_lo|, -1, vcc_hi -// GFX12: v_min3_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2b,0xd6,0x7e,0x82,0xad,0x01] +v_min3_num_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] +// GFX12: v_min3_num_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2b,0xd6,0xf0,0xfa,0xc0,0x43] -v_min3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] -// GFX12: v_min3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] ; encoding: [0x05,0x7d,0x2b,0xd6,0x7f,0xf8,0xa8,0xa1] +v_min3_num_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] +// GFX12: v_min3_num_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2b,0xd6,0xfd,0xd4,0x04,0x23] -v_min3_num_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[0,0,0,0] -// GFX12: v_min3_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x2b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +v_min3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp +// GFX12: v_min3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; 
encoding: [0xff,0xc3,0x2b,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] -v_min3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] -// GFX12: v_min3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2b,0xd6,0xc1,0xfe,0xf4,0xc3] +v_min3_num_f16 v5.l, v255.h, s2, s105 +// GFX12: v_min3_num_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x2b,0xd6,0xff,0x05,0xa4,0x01] -v_min3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] -// GFX12: v_min3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2b,0xd6,0xf0,0xfa,0xc0,0x43] +v_min3_num_f16 v5.l, s1, v255.h, exec_hi +// GFX12: v_min3_num_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2b,0xd6,0x01,0xfe,0xff,0x01] -v_min3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] -// GFX12: v_min3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2b,0xd6,0xfd,0xd4,0x04,0x23] +v_min3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h +// GFX12: v_min3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] -v_min3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp -// GFX12: v_min3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2b,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +v_min3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null clamp +// GFX12: v_min3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2b,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] v_min3_num_f32 v5, v1, v2, s3 // GFX12: v_min3_num_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x29,0xd6,0x01,0x05,0x0e,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s index 59cb1a479450f..ee4561fad367c 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s @@ -6,11 +6,11 @@ v_min3_f32 v5, v1, v2, v3 v_max3_f32 
v5, v1, v2, v3 // GFX12: v_max3_num_f32 v5, v1, v2, v3 ; encoding: [0x05,0x00,0x2a,0xd6,0x01,0x05,0x0e,0x04] -v_min3_f16 v5, v1, v2, v3 -// GFX12: v_min3_num_f16 v5, v1, v2, v3 ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x04] +v_min3_f16 v5.l, v1.l, v2.l, v3.l +// GFX12: v_min3_num_f16 v5.l, v1.l, v2.l, v3.l ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x04] -v_max3_f16 v5, v1, v2, v3 -// GFX12: v_max3_num_f16 v5, v1, v2, v3 ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x04] +v_max3_f16 v5.l, v1.l, v2.l, v3.l +// GFX12: v_max3_num_f16 v5.l, v1.l, v2.l, v3.l ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x04] v_med3_f32 v5, v1, v2, v3 // GFX12: v_med3_num_f32 v5, v1, v2, v3 ; encoding: [0x05,0x00,0x31,0xd6,0x01,0x05,0x0e,0x04] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s index b769324d5412f..0fa344f7e73a3 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s @@ -2480,53 +2480,53 @@ v_mad_u32_u24_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bou v_mad_u32_u24_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_mad_u32_u24_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x80,0x0b,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] -v_max3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] -// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] -v_max3_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] -// GFX12: v_max3_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff] +v_max3_num_f16_e64_dpp v5.l, v1.l, s2, v3.l quad_perm:[3,2,1,0] +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, s2, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff] -v_max3_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0] -// GFX12: v_max3_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff] +v_max3_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l quad_perm:[3,2,1,0] +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff] -v_max3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] -// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] +v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] -v_max3_num_f16_e64_dpp v5, v1, v2, v3 row_mirror -// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] +v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] -v_max3_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror -// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] +v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, 
v255.l row_half_mirror +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] -v_max3_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 -// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] +v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] -v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 -// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] -v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 -// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] -v_max3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 -// GFX12: v_max3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2c,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] +v_max3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 +// GFX12: v_max3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2c,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] 
-v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 -// GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2c,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2c,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] -v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 -// GFX12: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +v_max3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 +// GFX12: v_max3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] -v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +v_max3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] -v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] +v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] -v_max3_num_f16_e64_dpp v5, 
v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13] +v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13] -v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30] +v_max3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_max3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30] v_max3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] // GFX12: v_max3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] @@ -3515,53 +3515,53 @@ v_med3_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ v_med3_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_med3_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x21,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] -v_min3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] -// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] -v_min3_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] -// GFX12: v_min3_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff] +v_min3_num_f16_e64_dpp v5.l, v1.l, s2, v3.l quad_perm:[3,2,1,0] +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, s2, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff] -v_min3_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0] -// GFX12: v_min3_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff] +v_min3_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l quad_perm:[3,2,1,0] +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff] -v_min3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] -// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] +v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] -v_min3_num_f16_e64_dpp v5, v1, v2, v3 row_mirror -// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] +v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror +// 
GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] -v_min3_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror -// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] +v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] -v_min3_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 -// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] +v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] -v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 -// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] -v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 -// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] -v_min3_num_f16_e64_dpp v5, |v1|, 
v2, -ttmp15 row_shr:15 -// GFX12: v_min3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] +v_min3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 +// GFX12: v_min3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] -v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 -// GFX12: v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] -v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 -// GFX12: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +v_min3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 +// GFX12: v_min3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] -v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +v_min3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] -v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_min3_num_f16_e64_dpp 
v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] +v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] -v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13] +v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13] -v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30] +v_min3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_min3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30] v_min3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] // GFX12: v_min3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x29,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] @@ -5302,20 +5302,20 @@ v_mad_u32_u16_e64_dpp v5, v1, v2, 0.5 op_sel:[1,0,0,0] row_xmask:0 row_mask:0x1 v_mad_u32_u16_e64_dpp v255, v255, v255, 
src_scc op_sel:[0,1,0,0] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 // GFX12: v_mad_u32_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,1,0,0] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x90,0x59,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x0d,0x30] -v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf -// GFX12: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +v_max3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf +// GFX12: v_max3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] -v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +v_max3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_max3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] -v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] +v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; 
encoding: [0x05,0x15,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] -v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 -// GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13] +v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13] -v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 -// GFX12: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30] +v_max3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX12: v_max3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30] v_max3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf // GFX12: v_max3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x4d,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] @@ -5392,20 +5392,20 @@ v_med3_u16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x v_med3_u16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 // GFX12: v_med3_u16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] 
row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x40,0x51,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x0d,0x30] -v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf -// GFX12: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +v_min3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf +// GFX12: v_min3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] -v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +v_min3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_min3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] -v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] +v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] -v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 
bank_mask:0x3 -// GFX12: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13] +v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13] -v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 -// GFX12: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30] +v_min3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX12: v_min3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30] v_min3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf // GFX12: v_min3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x4a,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s index f76dd26623144..657663f4353ba 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s @@ -1545,47 +1545,47 @@ v_mad_u32_u24_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_mad_u32_u24_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_mad_u32_u24_e64_dpp v255, v255, v255, src_scc clamp 
dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x80,0x0b,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] -v_max3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] -v_max3_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_max3_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05] +v_max3_num_f16_e64_dpp v5.l, v1.l, s2, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, s2, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05] -v_max3_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_max3_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05] +v_max3_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05] -v_max3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] -v_max3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x2c,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] -v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] -v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] -v_max3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_max3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2c,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] +v_max3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_max3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2c,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] -v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2c,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2c,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] -v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +v_max3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_max3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] -v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +v_max3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] -v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] +v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] -v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x2c,0xd6,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] +v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x2c,0xd6,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] -v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, 
-|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] -// GFX12: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x2c,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] +v_max3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_max3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x2c,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] v_max3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_max3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] @@ -2340,47 +2340,47 @@ v_med3_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_med3_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_med3_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x21,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] -v_min3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] -v_min3_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_min3_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05] +v_min3_num_f16_e64_dpp v5.l, v1.l, s2, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, s2, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05] -v_min3_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_min3_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0x05,0x00,0x2b,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05] +v_min3_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05] -v_min3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] -v_min3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] -v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] -v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x2b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] -v_min3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_min3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] +v_min3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_min3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] -v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] -v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +v_min3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_min3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] -v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +v_min3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] -v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_min3_num_f16_e64_dpp 
v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] +v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] -v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x2b,0xd6,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] +v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x2b,0xd6,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] -v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] -// GFX12: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x2b,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] +v_min3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_min3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x2b,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] v_min3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_min3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x29,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] @@ -3571,20 +3571,20 @@ v_mad_u32_u16_e64_dpp v5, v1, v2, 0.5 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] v_mad_u32_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,1,0,0] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 // GFX12: v_mad_u32_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,1,0,0] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x90,0x59,0xd6,0xea,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] -v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| 
op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +v_max3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_max3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] -v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +v_max3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_max3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] -v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] +v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] -v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2c,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] +v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x26,0x2c,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] -v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 -// GFX12: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] +v_max3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX12: v_max3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] v_max3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_max3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x4d,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] @@ -3661,20 +3661,20 @@ v_med3_u16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] v_med3_u16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 // GFX12: v_med3_u16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x40,0x51,0xd6,0xea,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] -v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +v_min3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_min3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] -v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null 
op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +v_min3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_min3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] -v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] +v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] -v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2b,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] +v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2b,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] -v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 -// GFX12: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] +v_min3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX12: v_min3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0xff,0xc7,0x2b,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] v_min3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_min3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x4a,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt index 4c2060ad44b8a..58696613e852f 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt @@ -3509,49 +3509,100 @@ # GFX12: v_mad_co_u64_u32 v[254:255], null, 0xaf123456, vcc_hi, 0.5 clamp ; encoding: [0xfe,0xfc,0xfe,0xd6,0xff,0xd6,0xc0,0x03,0x56,0x34,0x12,0xaf] 0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x00 -# GFX12: v_max3_num_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x00] +# W32-REAL16: v_max3_num_f16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x00] +# W32-FAKE16: v_max3_num_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x00] +# W64-REAL16: v_max3_num_f16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x00] +# W64-FAKE16: v_max3_num_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x00] 0x05,0x00,0x2c,0xd6,0xff,0x05,0xa4,0x01 -# GFX12: v_max3_num_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x2c,0xd6,0xff,0x05,0xa4,0x01] +# W32-REAL16: v_max3_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x2c,0xd6,0xff,0x05,0xa4,0x01] +# W32-FAKE16: v_max3_num_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x2c,0xd6,0xff,0x05,0xa4,0x01] +# W64-REAL16: v_max3_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x2c,0xd6,0xff,0x05,0xa4,0x01] +# W64-FAKE16: v_max3_num_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x2c,0xd6,0xff,0x05,0xa4,0x01] 0x05,0x00,0x2c,0xd6,0x01,0xfe,0xff,0x01 -# GFX12: v_max3_num_f16 v5, s1, v255, exec_hi ; encoding: 
[0x05,0x00,0x2c,0xd6,0x01,0xfe,0xff,0x01] +# W32-REAL16: v_max3_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0xfe,0xff,0x01] +# W32-FAKE16: v_max3_num_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0xfe,0xff,0x01] +# W64-REAL16: v_max3_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0xfe,0xff,0x01] +# W64-FAKE16: v_max3_num_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0xfe,0xff,0x01] 0x05,0x00,0x2c,0xd6,0x69,0xd2,0xf8,0x01 -# GFX12: v_max3_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2c,0xd6,0x69,0xd2,0xf8,0x01] +# W32-REAL16: v_max3_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2c,0xd6,0x69,0xd2,0xf8,0x01] +# W32-FAKE16: v_max3_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2c,0xd6,0x69,0xd2,0xf8,0x01] +# W64-REAL16: v_max3_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2c,0xd6,0x69,0xd2,0xf8,0x01] +# W64-FAKE16: v_max3_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2c,0xd6,0x69,0xd2,0xf8,0x01] 0x05,0x00,0x2c,0xd6,0x6a,0xf6,0x0c,0x04 -# GFX12: v_max3_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x2c,0xd6,0x6a,0xf6,0x0c,0x04] +# W32-REAL16: v_max3_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x2c,0xd6,0x6a,0xf6,0x0c,0x04] +# W32-FAKE16: v_max3_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x2c,0xd6,0x6a,0xf6,0x0c,0x04] +# W64-REAL16: v_max3_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x2c,0xd6,0x6a,0xf6,0x0c,0x04] +# W64-FAKE16: v_max3_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x2c,0xd6,0x6a,0xf6,0x0c,0x04] 0x05,0x00,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00 -# GFX12: v_max3_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_max3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_max3_num_f16 v5, vcc_hi, 0xfe0b, 
v255 ; encoding: [0x05,0x00,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_max3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_max3_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] 0x05,0x07,0x2c,0xd6,0x7b,0xfa,0xed,0xe1 -# GFX12: v_max3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2c,0xd6,0x7b,0xfa,0xed,0xe1] +# W32-REAL16: v_max3_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2c,0xd6,0x7b,0xfa,0xed,0xe1] +# W32-FAKE16: v_max3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2c,0xd6,0x7b,0xfa,0xed,0xe1] +# W64-REAL16: v_max3_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2c,0xd6,0x7b,0xfa,0xed,0xe1] +# W64-FAKE16: v_max3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2c,0xd6,0x7b,0xfa,0xed,0xe1] 0x05,0x00,0x2c,0xd6,0x7d,0xe0,0xf5,0x01 -# GFX12: v_max3_num_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x2c,0xd6,0x7d,0xe0,0xf5,0x01] +# W32-REAL16: v_max3_num_f16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x2c,0xd6,0x7d,0xe0,0xf5,0x01] +# W32-FAKE16: v_max3_num_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x2c,0xd6,0x7d,0xe0,0xf5,0x01] +# W64-REAL16: v_max3_num_f16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x2c,0xd6,0x7d,0xe0,0xf5,0x01] +# W64-FAKE16: v_max3_num_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x2c,0xd6,0x7d,0xe0,0xf5,0x01] 0x05,0x01,0x2c,0xd6,0x7e,0x82,0xad,0x01 -# GFX12: v_max3_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2c,0xd6,0x7e,0x82,0xad,0x01] +# W32-REAL16: v_max3_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2c,0xd6,0x7e,0x82,0xad,0x01] +# W32-FAKE16: v_max3_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2c,0xd6,0x7e,0x82,0xad,0x01] +# W64-REAL16: v_max3_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: 
[0x05,0x01,0x2c,0xd6,0x7e,0x82,0xad,0x01] +# W64-FAKE16: v_max3_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2c,0xd6,0x7e,0x82,0xad,0x01] 0x05,0x05,0x2c,0xd6,0x7f,0xf8,0xa8,0xa1 -# GFX12: v_max3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2c,0xd6,0x7f,0xf8,0xa8,0xa1] +# W32-REAL16: v_max3_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2c,0xd6,0x7f,0xf8,0xa8,0xa1] +# W32-FAKE16: v_max3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2c,0xd6,0x7f,0xf8,0xa8,0xa1] +# W64-REAL16: v_max3_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2c,0xd6,0x7f,0xf8,0xa8,0xa1] +# W64-FAKE16: v_max3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2c,0xd6,0x7f,0xf8,0xa8,0xa1] 0x05,0x7c,0x2c,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00 -# GFX12: v_max3_num_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x2c,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_max3_num_f16 v5.h, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x2c,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_max3_num_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x2c,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_max3_num_f16 v5.h, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x2c,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_max3_num_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x2c,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] 0x05,0x0e,0x2c,0xd6,0xc1,0xfe,0xf4,0xc3 -# GFX12: v_max3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2c,0xd6,0xc1,0xfe,0xf4,0xc3] +# W32-REAL16: v_max3_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2c,0xd6,0xc1,0xfe,0xf4,0xc3] +# W32-FAKE16: v_max3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: 
[0x05,0x0e,0x2c,0xd6,0xc1,0xfe,0xf4,0xc3] +# W64-REAL16: v_max3_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2c,0xd6,0xc1,0xfe,0xf4,0xc3] +# W64-FAKE16: v_max3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2c,0xd6,0xc1,0xfe,0xf4,0xc3] 0x05,0x10,0x2c,0xd6,0xf0,0xfa,0xc0,0x43 -# GFX12: v_max3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2c,0xd6,0xf0,0xfa,0xc0,0x43] +# W32-REAL16: v_max3_num_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2c,0xd6,0xf0,0xfa,0xc0,0x43] +# W32-FAKE16: v_max3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2c,0xd6,0xf0,0xfa,0xc0,0x43] +# W64-REAL16: v_max3_num_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2c,0xd6,0xf0,0xfa,0xc0,0x43] +# W64-FAKE16: v_max3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2c,0xd6,0xf0,0xfa,0xc0,0x43] 0x05,0x22,0x2c,0xd6,0xfd,0xd4,0x04,0x23 -# GFX12: v_max3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2c,0xd6,0xfd,0xd4,0x04,0x23] +# W32-REAL16: v_max3_num_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2c,0xd6,0xfd,0xd4,0x04,0x23] +# W32-FAKE16: v_max3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2c,0xd6,0xfd,0xd4,0x04,0x23] +# W64-REAL16: v_max3_num_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2c,0xd6,0xfd,0xd4,0x04,0x23] +# W64-FAKE16: v_max3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2c,0xd6,0xfd,0xd4,0x04,0x23] 0xff,0xc3,0x2c,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00 -# GFX12: v_max3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2c,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_max3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2c,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] 
+# W32-FAKE16: v_max3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2c,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_max3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2c,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_max3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2c,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] + +0x05,0x20,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00 +# W32-REAL16: v_max3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_max3_num_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_max3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_max3_num_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] 0x05,0x00,0x2a,0xd6,0x01,0x05,0x0e,0x00 # GFX12: v_max3_num_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x2a,0xd6,0x01,0x05,0x0e,0x00] @@ -4886,49 +4937,100 @@ # GFX12: v_med3_u32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x21,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] 0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x00 -# GFX12: v_min3_num_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x00] +# W32-REAL16: v_min3_num_f16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x00] +# W32-FAKE16: v_min3_num_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x00] +# W64-REAL16: v_min3_num_f16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x00] +# W64-FAKE16: v_min3_num_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x00] 
0x05,0x00,0x2b,0xd6,0xff,0x05,0xa4,0x01 -# GFX12: v_min3_num_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x2b,0xd6,0xff,0x05,0xa4,0x01] +# W32-REAL16: v_min3_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x2b,0xd6,0xff,0x05,0xa4,0x01] +# W32-FAKE16: v_min3_num_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x2b,0xd6,0xff,0x05,0xa4,0x01] +# W64-REAL16: v_min3_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x2b,0xd6,0xff,0x05,0xa4,0x01] +# W64-FAKE16: v_min3_num_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x2b,0xd6,0xff,0x05,0xa4,0x01] 0x05,0x00,0x2b,0xd6,0x01,0xfe,0xff,0x01 -# GFX12: v_min3_num_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0xfe,0xff,0x01] +# W32-REAL16: v_min3_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0xfe,0xff,0x01] +# W32-FAKE16: v_min3_num_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0xfe,0xff,0x01] +# W64-REAL16: v_min3_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0xfe,0xff,0x01] +# W64-FAKE16: v_min3_num_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0xfe,0xff,0x01] 0x05,0x00,0x2b,0xd6,0x69,0xd2,0xf8,0x01 -# GFX12: v_min3_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2b,0xd6,0x69,0xd2,0xf8,0x01] +# W32-REAL16: v_min3_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2b,0xd6,0x69,0xd2,0xf8,0x01] +# W32-FAKE16: v_min3_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2b,0xd6,0x69,0xd2,0xf8,0x01] +# W64-REAL16: v_min3_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2b,0xd6,0x69,0xd2,0xf8,0x01] +# W64-FAKE16: v_min3_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2b,0xd6,0x69,0xd2,0xf8,0x01] 0x05,0x00,0x2b,0xd6,0x6a,0xf6,0x0c,0x04 -# GFX12: v_min3_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x2b,0xd6,0x6a,0xf6,0x0c,0x04] +# W32-REAL16: v_min3_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x2b,0xd6,0x6a,0xf6,0x0c,0x04] +# W32-FAKE16: v_min3_num_f16 v5, 
vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x2b,0xd6,0x6a,0xf6,0x0c,0x04] +# W64-REAL16: v_min3_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x2b,0xd6,0x6a,0xf6,0x0c,0x04] +# W64-FAKE16: v_min3_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x2b,0xd6,0x6a,0xf6,0x0c,0x04] 0x05,0x00,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00 -# GFX12: v_min3_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_min3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_min3_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_min3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_min3_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] 0x05,0x07,0x2b,0xd6,0x7b,0xfa,0xed,0xe1 -# GFX12: v_min3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2b,0xd6,0x7b,0xfa,0xed,0xe1] +# W32-REAL16: v_min3_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2b,0xd6,0x7b,0xfa,0xed,0xe1] +# W32-FAKE16: v_min3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2b,0xd6,0x7b,0xfa,0xed,0xe1] +# W64-REAL16: v_min3_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2b,0xd6,0x7b,0xfa,0xed,0xe1] +# W64-FAKE16: v_min3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2b,0xd6,0x7b,0xfa,0xed,0xe1] 0x05,0x00,0x2b,0xd6,0x7d,0xe0,0xf5,0x01 -# GFX12: v_min3_num_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x2b,0xd6,0x7d,0xe0,0xf5,0x01] +# W32-REAL16: v_min3_num_f16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x2b,0xd6,0x7d,0xe0,0xf5,0x01] +# W32-FAKE16: v_min3_num_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x2b,0xd6,0x7d,0xe0,0xf5,0x01] +# W64-REAL16: 
v_min3_num_f16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x2b,0xd6,0x7d,0xe0,0xf5,0x01] +# W64-FAKE16: v_min3_num_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x2b,0xd6,0x7d,0xe0,0xf5,0x01] 0x05,0x01,0x2b,0xd6,0x7e,0x82,0xad,0x01 -# GFX12: v_min3_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2b,0xd6,0x7e,0x82,0xad,0x01] +# W32-REAL16: v_min3_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2b,0xd6,0x7e,0x82,0xad,0x01] +# W32-FAKE16: v_min3_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2b,0xd6,0x7e,0x82,0xad,0x01] +# W64-REAL16: v_min3_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2b,0xd6,0x7e,0x82,0xad,0x01] +# W64-FAKE16: v_min3_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2b,0xd6,0x7e,0x82,0xad,0x01] 0x05,0x05,0x2b,0xd6,0x7f,0xf8,0xa8,0xa1 -# GFX12: v_min3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2b,0xd6,0x7f,0xf8,0xa8,0xa1] +# W32-REAL16: v_min3_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2b,0xd6,0x7f,0xf8,0xa8,0xa1] +# W32-FAKE16: v_min3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2b,0xd6,0x7f,0xf8,0xa8,0xa1] +# W64-REAL16: v_min3_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2b,0xd6,0x7f,0xf8,0xa8,0xa1] +# W64-FAKE16: v_min3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2b,0xd6,0x7f,0xf8,0xa8,0xa1] 0x05,0x7c,0x2b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00 -# GFX12: v_min3_num_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x2b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_min3_num_f16 v5.h, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x2b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_min3_num_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x2b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_min3_num_f16 v5.h, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: 
[0x05,0x7c,0x2b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_min3_num_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x2b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] 0x05,0x0e,0x2b,0xd6,0xc1,0xfe,0xf4,0xc3 -# GFX12: v_min3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2b,0xd6,0xc1,0xfe,0xf4,0xc3] +# W32-REAL16: v_min3_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2b,0xd6,0xc1,0xfe,0xf4,0xc3] +# W32-FAKE16: v_min3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2b,0xd6,0xc1,0xfe,0xf4,0xc3] +# W64-REAL16: v_min3_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2b,0xd6,0xc1,0xfe,0xf4,0xc3] +# W64-FAKE16: v_min3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2b,0xd6,0xc1,0xfe,0xf4,0xc3] 0x05,0x10,0x2b,0xd6,0xf0,0xfa,0xc0,0x43 -# GFX12: v_min3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2b,0xd6,0xf0,0xfa,0xc0,0x43] +# W32-REAL16: v_min3_num_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2b,0xd6,0xf0,0xfa,0xc0,0x43] +# W32-FAKE16: v_min3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2b,0xd6,0xf0,0xfa,0xc0,0x43] +# W64-REAL16: v_min3_num_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2b,0xd6,0xf0,0xfa,0xc0,0x43] +# W64-FAKE16: v_min3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2b,0xd6,0xf0,0xfa,0xc0,0x43] 0x05,0x22,0x2b,0xd6,0xfd,0xd4,0x04,0x23 -# GFX12: v_min3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2b,0xd6,0xfd,0xd4,0x04,0x23] +# W32-REAL16: v_min3_num_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2b,0xd6,0xfd,0xd4,0x04,0x23] +# W32-FAKE16: v_min3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2b,0xd6,0xfd,0xd4,0x04,0x23] +# W64-REAL16: v_min3_num_f16 
v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2b,0xd6,0xfd,0xd4,0x04,0x23] +# W64-FAKE16: v_min3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2b,0xd6,0xfd,0xd4,0x04,0x23] 0xff,0xc3,0x2b,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00 -# GFX12: v_min3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2b,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_min3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2b,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_min3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2b,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_min3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2b,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_min3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2b,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] + +0x05,0x20,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00 +# W32-REAL16: v_min3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_min3_num_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_min3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_min3_num_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] 0x05,0x00,0x29,0xd6,0x01,0x05,0x0e,0x00 # GFX12: v_min3_num_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x29,0xd6,0x01,0x05,0x0e,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt 
b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt index c64fe39d32558..83370defe6349 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt @@ -5199,49 +5199,119 @@ # GFX12: v_mad_u32_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,1,0,0] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x90,0x59,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x0d,0x30] 0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff -# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] 0x05,0x00,0x2c,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff -# GFX12: v_max3_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, s3, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff] +# 
W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, s3, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff] 0x05,0x00,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff -# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] 0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff -# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0x2c,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff -# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff] 0x05,0x00,0x2c,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff -# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x2c,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff] 0x05,0x00,0x2c,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff -# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] 0x05,0x00,0x2c,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff -# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] 0x05,0x01,0x2c,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff -# GFX12: v_max3_num_f16_e64_dpp v5, |v1|, v2, -m0 
row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2c,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2c,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, |v1|, v2, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2c,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2c,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, |v1|, v2, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2c,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff] 0x05,0x02,0x2c,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff -# GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2c,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2c,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2c,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2c,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2c,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] 0x05,0x7c,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff -# GFX12: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +# W32-REAL16: 
v_max3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] 0x05,0x0b,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff -# GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] 0x05,0x15,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01 -# GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0x05,0x15,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] 0x05,0x26,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13 -# GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13] 0xff,0xc7,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30 -# GFX12: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] 
clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30] +# W32-REAL16: v_max3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_max3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30] + +0x05,0x78,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff +# W32-REAL16: v_max3_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +0x05,0x20,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff +# W32-REAL16: 
v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] + +0x05,0x0a,0x2c,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01 +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x2c,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x2c,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x2c,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x2c,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01] + +0x05,0x13,0x2c,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13 +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x2c,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0x05,0x13,0x2c,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x2c,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x2c,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13] + 0x05,0x00,0x4d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff # W32-REAL16: v_max3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] @@ -5964,49 +6034,119 @@ # W64-FAKE16: v_med3_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x40,0x51,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x0d,0x30] 0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff -# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] 0x05,0x00,0x2b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff -# GFX12: v_min3_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, s3, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, s3, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff] 0x05,0x00,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff -# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] 0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff -# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff] +# W32-REAL16: 
v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0x2b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff -# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff] 0x05,0x00,0x2b,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff -# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff] +# W32-FAKE16: 
v_min3_num_f16_e64_dpp v5, v1, v2, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff] 0x05,0x00,0x2b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff -# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] 0x05,0x00,0x2b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff -# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +# W64-REAL16: 
v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] 0x05,0x01,0x2b,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff -# GFX12: v_min3_num_f16_e64_dpp v5, |v1|, v2, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2b,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2b,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, |v1|, v2, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2b,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2b,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, |v1|, v2, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2b,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff] 0x05,0x02,0x2b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff -# GFX12: v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +# 
W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] 0x05,0x7c,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff -# GFX12: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] 0x05,0x0b,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff -# GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x0b,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] 0x05,0x15,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01 -# GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] 0x05,0x26,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13 -# GFX12: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 
row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13] 0xff,0xc7,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30 -# GFX12: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30] +# W32-REAL16: v_min3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_min3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30] + +0x05,0x78,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff +# W32-REAL16: v_min3_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.h, 
v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +0x05,0x20,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] + +0x05,0x0a,0x2b,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01 +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x2b,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x2b,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x2b,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0x05,0x0a,0x2b,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01] + +0x05,0x13,0x2b,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13 +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x2b,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x2b,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x2b,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x2b,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13] + 0x05,0x00,0x4a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff # W32-REAL16: v_min3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt index 9ed20c70c17a2..2a25e1eefae49 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt @@ -3240,49 +3240,119 @@ # GFX12: v_mad_u32_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,1,0,0] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x90,0x59,0xd6,0xea,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] 0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05 -# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] 0x05,0x00,0x2c,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05 -# GFX12: v_max3_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, s3, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, s3, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05] 0x05,0x00,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05 -# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] 0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0x2c,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05 -# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] 0x05,0x00,0x2c,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05 -# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] 0x05,0x00,0x2c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05 -# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] 0x05,0x00,0x2c,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05 -# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x2c,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] 0x05,0x01,0x2c,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05 -# GFX12: v_max3_num_f16_e64_dpp v5, |v1|, v2, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2c,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2c,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, |v1|, v2, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2c,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2c,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, |v1|, v2, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2c,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05] 0x05,0x02,0x2c,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05 -# GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2c,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2c,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2c,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2c,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0x05,0x02,0x2c,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] 0x05,0x7c,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05 -# GFX12: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] 0x05,0x0b,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05 -# GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] 
0x05,0x15,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05 -# GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] 0x05,0x26,0x2c,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05 -# GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2c,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2c,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2c,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2c,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2c,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] 0xff,0xc7,0x2c,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00 -# GFX12: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| 
op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] +# W32-REAL16: v_max3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] +# W64-REAL16: v_max3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] + +0x05,0x78,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05 +# W32-REAL16: v_max3_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +0x05,0x20,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05 +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +0x05,0x0a,0x2c,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05 +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x2c,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x2c,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x2c,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x2c,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05] + +0x05,0x13,0x2c,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05 +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x2c,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x2c,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x2c,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x2c,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05] + 
0x05,0x00,0x4d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05 # W32-REAL16: v_max3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] @@ -3981,49 +4051,119 @@ # W64-FAKE16: v_med3_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x40,0x51,0xd6,0xea,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] 0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05 -# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] 0x05,0x00,0x2b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05 -# GFX12: v_min3_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, s3, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, s3, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x2b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05] 0x05,0x00,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05 -# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] 0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0x2b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05 -# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] 0x05,0x00,0x2b,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05 -# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] 0x05,0x00,0x2b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05 -# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] 0x05,0x00,0x2b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05 -# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] 0x05,0x01,0x2b,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05 -# GFX12: v_min3_num_f16_e64_dpp v5, |v1|, v2, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2b,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2b,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, |v1|, v2, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2b,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2b,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, |v1|, v2, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2b,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05] 0x05,0x02,0x2b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05 -# GFX12: 
v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] 0x05,0x7c,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05 -# GFX12: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] 0x05,0x0b,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05 -# GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +# W32-REAL16: v_min3_num_f16_e64_dpp 
v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] 0x05,0x15,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05 -# GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] 0x05,0x26,0x2b,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05 -# GFX12: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2b,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x26,0x2b,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2b,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2b,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2b,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] 0xff,0xc7,0x2b,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00 -# GFX12: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] +# W32-REAL16: v_min3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] +# W64-REAL16: v_min3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] + +0x05,0x78,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05 +# W32-REAL16: v_min3_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x78,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +0x05,0x20,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05 +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +0x05,0x0a,0x2b,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05 +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x2b,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x2b,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x2b,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x2b,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05] + +0x05,0x13,0x2b,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05 +# W32-REAL16: 
v_min3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x2b,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x2b,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x2b,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x2b,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05] + 0x05,0x00,0x4a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05 # W32-REAL16: v_min3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] From 9f6a1ddb43133328c90edfa29ccd4c714b289cb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= Date: Fri, 3 Jan 2025 09:09:23 +0000 Subject: [PATCH 335/567] [mlir][tensor] Introduce `FoldTensorCastUnPackOp` (#121393) This patch specializes `FoldTensorCastProducerOp` for `tensor::UnPackOp` by introducing a dedicated pattern: `FoldTensorCastUnPackOp`. This mirrors a similar update made for `tensor::PackOp` in #114559. Below is the updated rationale tailored to `tensor::UnPackOp`. ISSUE DESCRIPTION Currently, `FoldTensorCastProducerOp` incorrectly folds the following: ```mlir %cast = tensor.cast %dest : tensor<1x1x8x1xi32> to tensor<1x1x?x1xi32> // Note: `%c8` and `?`. %unpack = tensor.unpack %cast inner_dims_pos = [0, 1] inner_tiles = [%c8, 1] into %res : tensor<1x1x?x1xi32> -> tensor<7x?xi32> ``` as: ```mlir // Note: `%c8` and `8`. 
%unpack = tensor.unpack %cast inner_dims_pos = [0, 1] inner_tiles = [%c8, 1] into %res : tensor<1x1x8x1xi32> -> tensor<7x?xi32> ``` This triggers an Op verification failure because the folder does not update the inner tile sizes in the unpack Op. This patch addresses the issue by ensuring proper handling of inner tile sizes. ADDITIONAL CHANGES * invalid.mlir: Fixed a typo. * TensorOps.cpp: * Removed unnecessary `(void)tileSize`. * Added comments following the discussion in PR #115772. * Made minor updates to `FoldTensorCastPackOp` for consistency with the newly introduced `FoldTensorCastUnPackOp`. * Tensor/canonicalize.mlir: Ensured consistent usage of `test_attr` (e.g., replaced mixed use of `test_attr` and `some_attr`). --- mlir/lib/Dialect/Tensor/IR/TensorOps.cpp | 123 +++++++++++++++++---- mlir/test/Dialect/Tensor/canonicalize.mlir | 25 ++++- mlir/test/Dialect/Tensor/invalid.mlir | 2 +- 3 files changed, 123 insertions(+), 27 deletions(-) diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp index f79c774ceb3e9..24a1d55315319 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp @@ -4795,6 +4795,44 @@ static SmallVector getNewOperands(DestinationStyleOpInterface op, return newOperands; } +// Given the (potentially) updated packed type, `newPackedTy`, generates an +// updated mixed-tile-sizes attribute. A tile size is updated only +// when: +// * a dim from newPackedTy is static, and +// * the corresponding size from mixedTiles is still dynamic. +// Otherwise, the original tile size is preserved. +// Note - packed-type-dim and mixed-tile-size should always match! 
+static SmallVector +getNewMixedTileSizes(PatternRewriter &rewriter, Type newPackedTy, + SmallVector mixedTiles) { + SmallVector newMixedTileSizes; + for (auto it : llvm::zip(cast(newPackedTy) + .getShape() + .take_back(mixedTiles.size()), + mixedTiles)) { + int64_t shape = std::get<0>(it); + if (shape == ShapedType::kDynamic) { + newMixedTileSizes.push_back(std::get<1>(it)); + continue; + } + + // If the current result dim is static, update the dynamic mixed-size + // (provided the original value is dynamic). + OpFoldResult tile = std::get<1>(it); + if (Attribute attr = llvm::dyn_cast_if_present(tile)) { + // Already a constant + newMixedTileSizes.push_back(tile); + } else { + assert(getConstantIntValue(tile).value() == shape && + "tile size and dim size don't match!"); + newMixedTileSizes.push_back( + (rewriter.getIntegerAttr(rewriter.getIndexType(), shape))); + } + } + + return newMixedTileSizes; +} + /// Folds a tensor.cast op into a consuming tensor::PackOp op if the /// `tensor.cast` has source that is more static than the consuming op. /// @@ -4821,31 +4859,13 @@ struct FoldTensorCastPackOp : public OpRewritePattern { SmallVector newOperands = getNewOperands(op, newResultTypes); // Get the updated mixed-tile-sizes attribute. 
- SmallVector newMixedTileSizes; - for (auto it : llvm::zip(cast(newResultTypes[0]) - .getShape() - .take_back(op.getMixedTiles().size()), - op.getMixedTiles())) { - int64_t shape = std::get<0>(it); - if (shape == ShapedType::kDynamic) { - newMixedTileSizes.push_back(std::get<1>(it)); - continue; - } - - if (Attribute attr = - llvm::dyn_cast_if_present(std::get<1>(it))) { - // Already a constant - newMixedTileSizes.push_back(std::get<1>(it)); - } else { - int64_t tileSize = getConstantIntValue(std::get<1>(it)).value(); - assert(tileSize == shape && "tile size and dim size don't match!"); - (void)tileSize; - newMixedTileSizes.push_back( - (rewriter.getIntegerAttr(rewriter.getIndexType(), shape))); - } - } + SmallVector newMixedTileSizes = + getNewMixedTileSizes(rewriter, newResultTypes[0], op.getMixedTiles()); // Clone op. + // TODO: Strictly speaking, discardable attributes should be _discarded_ at + // this point. However, in practice, we use them for things that we'd like + // to preserve. Implement a better abstraction. PackOp newOp = rewriter.create( op.getLoc(), newOperands[0], newOperands[1], op.getInnerDimsPos(), newMixedTileSizes, op.getPaddingValue(), op.getOuterDimsPerm()); @@ -4865,6 +4885,59 @@ struct FoldTensorCastPackOp : public OpRewritePattern { } }; +/// Folds a tensor.cast op into a consuming tensor::UnPackOp op if the +/// `tensor.cast` has source that is more static than the consuming op. +/// +/// Example: +/// ```mlir +/// %1 = tensor.cast %0 : tensor<1x1x8x1xi32> to tensor<1x1x?x1xi32> +/// %2 = tensor.unpack %1 ... : tensor<1x1x?x1xi32> -> tensor<7x?xi32> +/// ``` +/// +/// folds into: +/// +/// ```mlir +/// %2 = tensor.unpack %0 ... 
tensor<1x1x8x1xi32> -> tensor<7x?xi32> +/// ``` +struct FoldTensorCastUnPackOp : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(UnPackOp op, + PatternRewriter &rewriter) const override { + if (!foldTensorCastPrecondition(op)) + return failure(); + + SmallVector newResultTypes(op->getResultTypes()); + SmallVector newOperands = getNewOperands(op, newResultTypes); + Value sourceTensor = newOperands[0]; + + // Get the updated mixed-tile-sizes attribute. + SmallVector newMixedTileSizes = getNewMixedTileSizes( + rewriter, sourceTensor.getType(), op.getMixedTiles()); + + // Clone op. + // TODO: Strictly speaking, discardable attributes should be _discarded_ at + // this point. However, in practice, we use them for things that we'd like + // to preserve. Implement a better abstraction. + UnPackOp newOp = rewriter.create( + op.getLoc(), sourceTensor, newOperands[1], op.getInnerDimsPos(), + newMixedTileSizes, op.getOuterDimsPerm()); + newOp->setDiscardableAttrs(op->getDiscardableAttrDictionary()); + + // Replace op. + Value oldResult = op.getResult(); + Value newResult = newOp.getResult(); + Value replacement = (newResult.getType() != oldResult.getType()) + ? rewriter.create( + op->getLoc(), oldResult.getType(), newResult) + : newResult; + + rewriter.replaceOp(op, {replacement}); + + return success(); + } +}; + /// Folds a tensor.cast op into a consuming DestinationStyleOpInterface op if /// the `tensor.cast` has source that is more static than the consuming op. /// @@ -4890,7 +4963,8 @@ struct FoldTensorCastProducerOp PatternRewriter &rewriter) const override { // Reject tensor::PackOp - there's dedicated pattern for that instead. 
- if (!foldTensorCastPrecondition(op) || dyn_cast(*op)) + if (!foldTensorCastPrecondition(op) || + isa(*op)) return failure(); SmallVector newResultTypes(op->getResultTypes()); @@ -4923,6 +4997,7 @@ struct FoldTensorCastProducerOp void TensorDialect::getCanonicalizationPatterns( RewritePatternSet &results) const { results.add(getContext()); + results.add(getContext()); results.add(getContext()); } diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir index e8fc4ce834e18..01d14871072cd 100644 --- a/mlir/test/Dialect/Tensor/canonicalize.mlir +++ b/mlir/test/Dialect/Tensor/canonicalize.mlir @@ -2786,6 +2786,7 @@ func.func @fold_cast_multiple_results(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2x %0:2 = test.destination_style_op ins(%cast : tensor) outs(%cast_0 : tensor) -> tensor, index return %0#1 : index } + // ----- // CHECK-LABEL: func.func @fold_cast_pack_dynamic_tile_size @@ -2794,7 +2795,7 @@ func.func @fold_cast_multiple_results(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2x // CHECK-SAME: %[[PAD:.*]]: i32) -> tensor<1x1x8x1xi32> { // CHECK: %[[PACK:.*]] = tensor.pack %[[SRC]] padding_value(%[[PAD]] : i32) // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %[[DEST]] -// CHECK-SAME: some_attr +// CHECK-SAME: test_attr // CHECK-SAME: : tensor<7x?xi32> -> tensor<1x1x8x1xi32> // CHECK: return %[[PACK]] : tensor<1x1x8x1xi32> func.func @fold_cast_pack_dynamic_tile_size( @@ -2807,13 +2808,33 @@ func.func @fold_cast_pack_dynamic_tile_size( %pack = tensor.pack %src padding_value(%pad : i32) inner_dims_pos = [0, 1] inner_tiles = [%c8, 1] - into %cast {some_attr} : tensor<7x?xi32> -> tensor<1x1x?x1xi32> + into %cast {test_attr} : tensor<7x?xi32> -> tensor<1x1x?x1xi32> %res = tensor.cast %pack : tensor<1x1x?x1xi32> to tensor<1x1x8x1xi32> return %res : tensor<1x1x8x1xi32> } // ----- +// CHECK-LABEL: func.func @fold_cast_unpack_dynamic_tile_size( +// CHECK-SAME: %[[SRC:.*]]: tensor<1x1x8x1xi32>, +// CHECK-SAME: 
%[[DEST:.*]]: tensor<7x?xi32>) -> tensor<7x?xi32> { +// CHECK: %[[RES:.*]] = tensor.unpack %[[SRC]] inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %[[DEST]] {test_attr} : tensor<1x1x8x1xi32> -> tensor<7x?xi32> +// CHECK: return %[[RES]] : tensor<7x?xi32> +func.func @fold_cast_unpack_dynamic_tile_size( + %src: tensor<1x1x8x1xi32>, + %res: tensor<7x?xi32>) -> tensor<7x?xi32> { + + %cast = tensor.cast %src : tensor<1x1x8x1xi32> to tensor<1x1x?x1xi32> + %c8 = arith.constant 8 : index + %unpack = tensor.unpack %cast + inner_dims_pos = [0, 1] + inner_tiles = [%c8, 1] + into %res {test_attr} : tensor<1x1x?x1xi32> -> tensor<7x?xi32> + return %unpack : tensor<7x?xi32> +} + +// ----- + // CHECK-LABEL: func.func @pack_dont_drop_attributes( // CHECK: tensor.pack {{.*}} {test_attr} func.func @pack_dont_drop_attributes(%arg0: tensor, %arg1: tensor<128x?x100x16x1xf16>) -> tensor<128x?x100x16x1xf16> { diff --git a/mlir/test/Dialect/Tensor/invalid.mlir b/mlir/test/Dialect/Tensor/invalid.mlir index 83cb4b9d4ab24..1de3e281bc462 100644 --- a/mlir/test/Dialect/Tensor/invalid.mlir +++ b/mlir/test/Dialect/Tensor/invalid.mlir @@ -699,7 +699,7 @@ func.func @pack_invalid_output_rank(%input: tensor<256x128xf32>, %output: tensor // ----- -func.func @pack_invalid_output_rank(%input: tensor<256x128xf32>, %output: tensor<64x32x16xf32>) -> tensor<256x128xf32> { +func.func @unpack_invalid_output_rank(%input: tensor<256x128xf32>, %output: tensor<64x32x16xf32>) -> tensor<256x128xf32> { // expected-error@+1 {{packed rank != (unpacked rank + num tiling factors), got 3 != 4}} %0 = tensor.unpack %output inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %input : tensor<64x32x16xf32> -> tensor<256x128xf32> return %0 : tensor<256x128xf32> From 258256821753504836f797e38d83a8e88daa424d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= Date: Fri, 3 Jan 2025 09:11:38 +0000 Subject: [PATCH 336/567] [mlir] Add missing patterns to `linalg.decompose_pack_unpack` TD Op (#121400) This PR is a 
follow-up to #116373 and #116439, where a Transform Dialect (TD) operation was introduced to collect patterns for decomposing tensor.pack. The second patch renamed the patterns and the TD Op. Originally, adding patterns for `tensor.unpack` was marked as a TODO, which this PR addresses. No new tests are introduced in this PR. Instead, existing tests from: * "decompose-tensor-unpack.mlir" are reused. To achieve this: * The test is updated to use the TD operation `transform.apply_patterns.linalg.decompose_pack_unpack` instead of the flag `--test-linalg-transform-patterns="test-decompose-tensor-unpack"`, avoiding artificial tests created solely for the TD Op. * The TD sequence is saved to a new file, "decompose_unpack.mlir", and preloaded using the option. --- mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp | 2 +- .../Dialect/Linalg/decompose-tensor-unpack-tile.mlir | 5 ++++- .../test/Dialect/Linalg/decompose-tensor-unpack.mlir | 4 +++- mlir/test/Dialect/Linalg/td/decompose-unpack.mlir | 12 ++++++++++++ 4 files changed, 20 insertions(+), 3 deletions(-) create mode 100644 mlir/test/Dialect/Linalg/td/decompose-unpack.mlir diff --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp index 60cf897b00de3..50593b08ad74b 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp @@ -1656,8 +1656,8 @@ void linalg::populateDecomposeConvolutionPatterns(RewritePatternSet &patterns, } void linalg::populateDecomposePackUnpackPatterns(RewritePatternSet &patterns) { - // TODO: Add and test patterns for tensor.unpack patterns.add(patterns.getContext()); + patterns.add(patterns.getContext()); } void linalg::populateDecomposePadPatterns(RewritePatternSet &patterns) { diff --git a/mlir/test/Dialect/Linalg/decompose-tensor-unpack-tile.mlir b/mlir/test/Dialect/Linalg/decompose-tensor-unpack-tile.mlir index 6d9709caf7093..0dbdf470bbfc9 100644 --- 
a/mlir/test/Dialect/Linalg/decompose-tensor-unpack-tile.mlir +++ b/mlir/test/Dialect/Linalg/decompose-tensor-unpack-tile.mlir @@ -1,4 +1,7 @@ -// RUN: mlir-opt -split-input-file --transform-interpreter --canonicalize --test-linalg-transform-patterns="test-decompose-tensor-unpack" %s | FileCheck %s +// RUN: mlir-opt -split-input-file -transform-interpreter --canonicalize \ +// RUN: -transform-preload-library='transform-library-paths=%p/td/decompose-unpack.mlir' \ +// RUN: -transform-interpreter=entry-point=decompose_unpack \ +// RUN: -transform-interpreter %s | FileCheck %s func.func @KCRSsr_to_KCRS(%arg0: tensor<1x1x4x8x8x32xf32>, %arg1: tensor<1x1x128x64xf32>) -> tensor<1x1x128x64xf32> { %0 = tensor.unpack %arg0 inner_dims_pos = [3, 2] inner_tiles = [8, 32] into %arg1 : tensor<1x1x4x8x8x32xf32> -> tensor<1x1x128x64xf32> diff --git a/mlir/test/Dialect/Linalg/decompose-tensor-unpack.mlir b/mlir/test/Dialect/Linalg/decompose-tensor-unpack.mlir index bd60504f53345..ba1f214952562 100644 --- a/mlir/test/Dialect/Linalg/decompose-tensor-unpack.mlir +++ b/mlir/test/Dialect/Linalg/decompose-tensor-unpack.mlir @@ -1,4 +1,6 @@ -// RUN: mlir-opt -split-input-file --test-linalg-transform-patterns="test-decompose-tensor-unpack" %s | FileCheck %s +// RUN: mlir-opt -split-input-file \ +// RUN: -transform-preload-library='transform-library-paths=%p/td/decompose-unpack.mlir' \ +// RUN: -transform-interpreter=entry-point=decompose_unpack %s | FileCheck %s func.func @simple_KCRSsr_to_KCRS(%arg0: tensor<1x1x1x1x8x32xf32>, %arg1: tensor<1x1x32x8xf32>) -> tensor<1x1x32x8xf32> { %0 = tensor.unpack %arg0 inner_dims_pos = [3, 2] inner_tiles = [8, 32] into %arg1 : tensor<1x1x1x1x8x32xf32> -> tensor<1x1x32x8xf32> diff --git a/mlir/test/Dialect/Linalg/td/decompose-unpack.mlir b/mlir/test/Dialect/Linalg/td/decompose-unpack.mlir new file mode 100644 index 0000000000000..11243634262e0 --- /dev/null +++ b/mlir/test/Dialect/Linalg/td/decompose-unpack.mlir @@ -0,0 +1,12 @@ +module @transforms 
attributes { transform.with_named_sequence } { + transform.named_sequence @decompose_unpack(%module: !transform.any_op {transform.readonly}) { + %pack = transform.structured.match ops{["tensor.unpack"]} in %module : (!transform.any_op) -> !transform.any_op + + %1 = transform.get_parent_op %pack {isolated_from_above} : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %1 { + transform.apply_patterns.linalg.decompose_pack_unpack + } : !transform.any_op + + transform.yield + } +} From 2fae5bdea7c2016d4086aa7ecf3c5d0592ce95c8 Mon Sep 17 00:00:00 2001 From: Shao-Ce SUN Date: Fri, 3 Jan 2025 17:25:42 +0800 Subject: [PATCH 337/567] [RISCV] Add support of Sdext,Sdtrig extentions (#120936) `Sdext` and `Sdtrig` are RISC-V extensions related to debugging. The full specification can be found at https://github.com/riscv/riscv-debug-spec/releases/download/1.0.0-rc4/riscv-debug-specification.pdf --- .../Driver/print-supported-extensions-riscv.c | 2 + .../test/Preprocessor/riscv-target-features.c | 18 ++++++++ llvm/docs/RISCVUsage.rst | 3 ++ llvm/docs/ReleaseNotes.md | 1 + llvm/lib/Target/RISCV/RISCVFeatures.td | 4 ++ llvm/lib/Target/RISCV/RISCVSystemOperands.td | 3 ++ llvm/test/CodeGen/RISCV/attributes.ll | 4 ++ llvm/test/CodeGen/RISCV/features-info.ll | 2 + llvm/test/MC/RISCV/attribute-arch.s | 6 +++ llvm/test/MC/RISCV/machine-csr-names.s | 42 +++++++++++++++++++ .../TargetParser/RISCVISAInfoTest.cpp | 2 + 11 files changed, 87 insertions(+) diff --git a/clang/test/Driver/print-supported-extensions-riscv.c b/clang/test/Driver/print-supported-extensions-riscv.c index 395501eb85ccc..f08ff00c9cbeb 100644 --- a/clang/test/Driver/print-supported-extensions-riscv.c +++ b/clang/test/Driver/print-supported-extensions-riscv.c @@ -185,6 +185,8 @@ // CHECK-NEXT: zalasr 0.1 'Zalasr' (Load-Acquire and Store-Release Instructions) // CHECK-NEXT: zvbc32e 0.7 'Zvbc32e' (Vector Carryless Multiplication with 32-bits elements) // CHECK-NEXT: zvkgs 0.7 'Zvkgs' (Vector-Scalar GCM 
instructions for Cryptography) +// CHECK-NEXT: sdext 1.0 'Sdext' (External debugger) +// CHECK-NEXT: sdtrig 1.0 'Sdtrig' (Debugger triggers) // CHECK-NEXT: smctr 1.0 'Smctr' (Control Transfer Records Machine Level) // CHECK-NEXT: ssctr 1.0 'Ssctr' (Control Transfer Records Supervisor Level) // CHECK-NEXT: svukte 0.3 'Svukte' (Address-Independent Latency of User-Mode Faults to Supervisor Addresses) diff --git a/clang/test/Preprocessor/riscv-target-features.c b/clang/test/Preprocessor/riscv-target-features.c index e376821a5517c..c219771135275 100644 --- a/clang/test/Preprocessor/riscv-target-features.c +++ b/clang/test/Preprocessor/riscv-target-features.c @@ -182,6 +182,8 @@ // Experimental extensions +// CHECK-NOT: __riscv_sdext{{.*$}} +// CHECK-NOT: __riscv_sdtrig{{.*$}} // CHECK-NOT: __riscv_smctr{{.*$}} // CHECK-NOT: __riscv_smmpm{{.*$}} // CHECK-NOT: __riscv_smnpm{{.*$}} @@ -1795,6 +1797,22 @@ // RUN: -o - | FileCheck --check-prefix=CHECK-SUPM-EXT %s // CHECK-SUPM-EXT: __riscv_supm 1000000{{$}} +// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: -march=rv32i_sdext1p0 -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-SDEXT-EXT %s +// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: -march=rv64i_sdext1p0 -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-SDEXT-EXT %s +// CHECK-SDEXT-EXT: __riscv_sdext 1000000{{$}} + +// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: -march=rv32i_sdtrig1p0 -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-SDTRIG-EXT %s +// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: -march=rv64i_sdtrig1p0 -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-SDTRIG-EXT %s +// CHECK-SDTRIG-EXT: __riscv_sdtrig 1000000{{$}} + // RUN: %clang --target=riscv32 -menable-experimental-extensions \ // RUN: -march=rv32i_smctr1p0 -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-SMCTR-EXT %s diff --git 
a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index eaaad6c516818..835b910ec452d 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -326,6 +326,9 @@ The primary goal of experimental support is to assist in the process of ratifica ``experimental-zvbc32e``, ``experimental-zvkgs`` LLVM implements the `0.7 release specification `__. +``experimental-sdext``, ``experimental-sdtrig`` + LLVM implements the `1.0-rc4 specification `__. + ``experimental-smctr``, ``experimental-ssctr`` LLVM implements the `1.0-rc3 specification `__. diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index be62a7e8696b4..11ee9864e5174 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -232,6 +232,7 @@ Changes to the RISC-V Backend extension. * Adds experimental assembler support for the Qualcomm uC 'Xqcicli` (Conditional Load Immediate) extension. +* Added ``Sdext`` and ``Sdtrig`` extensions. Changes to the WebAssembly Backend ---------------------------------- diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 3885b95a8937a..0074be35798ac 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -844,6 +844,10 @@ def HasStdExtH : Predicate<"Subtarget->hasStdExtH()">, // Supervisor extensions +def FeatureStdExtSdext : RISCVExperimentalExtension<1, 0, "External debugger">; + +def FeatureStdExtSdtrig : RISCVExperimentalExtension<1, 0, "Debugger triggers">; + def FeatureStdExtShgatpa : RISCVExtension<1, 0, "SvNNx4 mode supported for all modes supported by satp, as well as Bare">; diff --git a/llvm/lib/Target/RISCV/RISCVSystemOperands.td b/llvm/lib/Target/RISCV/RISCVSystemOperands.td index d85b4a9cf77b3..72275daa1b8d1 100644 --- a/llvm/lib/Target/RISCV/RISCVSystemOperands.td +++ b/llvm/lib/Target/RISCV/RISCVSystemOperands.td @@ -323,7 +323,10 @@ def : SysReg<"tselect", 0x7A0>; def : SysReg<"tdata1", 0x7A1>; def : SysReg<"tdata2", 0x7A2>; def : 
SysReg<"tdata3", 0x7A3>; +def : SysReg<"tinfo", 0x7A4>; +def : SysReg<"tcontrol", 0x7A5>; def : SysReg<"mcontext", 0x7A8>; +def : SysReg<"mscontext", 0x7AA>; //===----------------------------------------------------------------------===// // Debug Mode Registers diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll index bcf945470d85b..7e55e0590ec59 100644 --- a/llvm/test/CodeGen/RISCV/attributes.ll +++ b/llvm/test/CodeGen/RISCV/attributes.ll @@ -296,6 +296,8 @@ ; RUN: llc -mtriple=riscv64 -mattr=+supm %s -o - | FileCheck --check-prefix=RV64SUPM %s ; RUN: llc -mtriple=riscv64 -mattr=+experimental-smctr %s -o - | FileCheck --check-prefix=RV64SMCTR %s ; RUN: llc -mtriple=riscv64 -mattr=+experimental-ssctr %s -o - | FileCheck --check-prefix=RV64SSCTR %s +; RUN: llc -mtriple=riscv64 -mattr=+experimental-sdext %s -o - | FileCheck --check-prefix=RV64SDEXT %s +; RUN: llc -mtriple=riscv64 -mattr=+experimental-sdtrig %s -o - | FileCheck --check-prefix=RV64SDTRIG %s ; Tests for profile features. ; RUN: llc -mtriple=riscv32 -mattr=+rvi20u32 %s -o - | FileCheck --check-prefix=RVI20U32 %s @@ -605,6 +607,8 @@ ; RV64SUPM: .attribute 5, "rv64i2p1_supm1p0" ; RV64SMCTR: .attribute 5, "rv64i2p1_smctr1p0_sscsrind1p0" ; RV64SSCTR: .attribute 5, "rv64i2p1_sscsrind1p0_ssctr1p0" +; RV64SDEXT: .attribute 5, "rv64i2p1_sdext1p0" +; RV64SDTRIG: .attribute 5, "rv64i2p1_sdtrig1p0" ; RVI20U32: .attribute 5, "rv32i2p1" ; RVI20U64: .attribute 5, "rv64i2p1" diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll index 99db90c5fa925..70fbda47a14a1 100644 --- a/llvm/test/CodeGen/RISCV/features-info.ll +++ b/llvm/test/CodeGen/RISCV/features-info.ll @@ -15,6 +15,8 @@ ; CHECK: e - 'E' (Embedded Instruction Set with 16 GPRs). ; CHECK: experimental - Experimental intrinsics. ; CHECK: experimental-rvm23u32 - RISC-V experimental-rvm23u32 profile. +; CHECK: experimental-sdext - 'Sdext' (External debugger). 
+; CHECK: experimental-sdtrig - 'Sdtrig' (Debugger triggers). ; CHECK: experimental-smctr - 'Smctr' (Control Transfer Records Machine Level). ; CHECK: experimental-ssctr - 'Ssctr' (Control Transfer Records Supervisor Level). ; CHECK: experimental-svukte - 'Svukte' (Address-Independent Latency of User-Mode Faults to Supervisor Addresses). diff --git a/llvm/test/MC/RISCV/attribute-arch.s b/llvm/test/MC/RISCV/attribute-arch.s index 6ffaa62d50dcf..4e77a53bd706c 100644 --- a/llvm/test/MC/RISCV/attribute-arch.s +++ b/llvm/test/MC/RISCV/attribute-arch.s @@ -467,3 +467,9 @@ .attribute arch, "rv32i_ssctr1p0" # CHECK: attribute 5, "rv32i2p1_sscsrind1p0_ssctr1p0" + +.attribute arch, "rv32i_sdext1p0" +# CHECK: attribute 5, "rv32i2p1_sdext1p0" + +.attribute arch, "rv32i_sdtrig1p0" +# CHECK: attribute 5, "rv32i2p1_sdtrig1p0" diff --git a/llvm/test/MC/RISCV/machine-csr-names.s b/llvm/test/MC/RISCV/machine-csr-names.s index 07b948a78e6c2..ba2a79f1f6aa1 100644 --- a/llvm/test/MC/RISCV/machine-csr-names.s +++ b/llvm/test/MC/RISCV/machine-csr-names.s @@ -1419,6 +1419,34 @@ csrrs t1, tdata3, zero # uimm12 csrrs t2, 0x7A3, zero +# tinfo +# name +# CHECK-INST: csrrs t1, tinfo, zero +# CHECK-ENC: encoding: [0x73,0x23,0x40,0x7a] +# CHECK-INST-ALIAS: csrr t1, tinfo +# uimm12 +# CHECK-INST: csrrs t2, tinfo, zero +# CHECK-ENC: encoding: [0xf3,0x23,0x40,0x7a] +# CHECK-INST-ALIAS: csrr t2, tinfo +# name +csrrs t1, tinfo, zero +# uimm12 +csrrs t2, 0x7A4, zero + +# tcontrol +# name +# CHECK-INST: csrrs t1, tcontrol, zero +# CHECK-ENC: encoding: [0x73,0x23,0x50,0x7a] +# CHECK-INST-ALIAS: csrr t1, tcontrol +# uimm12 +# CHECK-INST: csrrs t2, tcontrol, zero +# CHECK-ENC: encoding: [0xf3,0x23,0x50,0x7a] +# CHECK-INST-ALIAS: csrr t2, tcontrol +# name +csrrs t1, tcontrol, zero +# uimm12 +csrrs t2, 0x7A5, zero + # mcontext # name # CHECK-INST: csrrs t1, mcontext, zero @@ -1433,6 +1461,20 @@ csrrs t1, mcontext, zero # uimm12 csrrs t2, 0x7A8, zero +# mscontext +# name +# CHECK-INST: csrrs t1, mscontext, 
zero +# CHECK-ENC: encoding: [0x73,0x23,0xa0,0x7a] +# CHECK-INST-ALIAS: csrr t1, mscontext +# uimm12 +# CHECK-INST: csrrs t2, mscontext, zero +# CHECK-ENC: encoding: [0xf3,0x23,0xa0,0x7a] +# CHECK-INST-ALIAS: csrr t2, mscontext +# name +csrrs t1, mscontext, zero +# uimm12 +csrrs t2, 0x7AA, zero + ####################### # Debug Mode Registers ######################## diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp index f631f26cf482e..3ea5afce56fa3 100644 --- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp +++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp @@ -1110,6 +1110,8 @@ Experimental extensions zalasr 0.1 zvbc32e 0.7 zvkgs 0.7 + sdext 1.0 + sdtrig 1.0 smctr 1.0 ssctr 1.0 svukte 0.3 From e4e47cef55886036651ff7f0dfd8475d3a158a4c Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Fri, 3 Jan 2025 10:35:01 +0100 Subject: [PATCH 338/567] [bazel] Fix the broken llvm-tblgen build for 27f30029741ecf023baece7b3dde1ff9011ffefc --- .../llvm-project-overlay/llvm/BUILD.bazel | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index 36e266d26fc3d..18ac78174856b 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -645,18 +645,20 @@ cc_binary( cc_binary( name = "llvm-min-tblgen", srcs = [ - "utils/TableGen/ARMTargetDefEmitter.cpp", - "utils/TableGen/Attributes.cpp", + "utils/TableGen/Basic/ARMTargetDefEmitter.cpp", + "utils/TableGen/Basic/Attributes.cpp", "utils/TableGen/Basic/CodeGenIntrinsics.cpp", "utils/TableGen/Basic/CodeGenIntrinsics.h", "utils/TableGen/Basic/SDNodeProperties.cpp", "utils/TableGen/Basic/SDNodeProperties.h", + "utils/TableGen/Basic/TableGen.h", + "utils/TableGen/Basic/TableGen.cpp", "utils/TableGen/Basic/SequenceToOffsetTable.h", - "utils/TableGen/DirectiveEmitter.cpp", - 
"utils/TableGen/IntrinsicEmitter.cpp", - "utils/TableGen/RISCVTargetDefEmitter.cpp", - "utils/TableGen/TableGen.cpp", - "utils/TableGen/VTEmitter.cpp", + "utils/TableGen/Basic/DirectiveEmitter.cpp", + "utils/TableGen/Basic/IntrinsicEmitter.cpp", + "utils/TableGen/Basic/RISCVTargetDefEmitter.cpp", + "utils/TableGen/Basic/VTEmitter.cpp", + "utils/TableGen/llvm-min-tblgen.cpp", ], copts = llvm_copts, stamp = 0, @@ -715,7 +717,10 @@ cc_binary( # regular dependency. "include/llvm/MC/*.h", ], - exclude = ["utils/TableGen/Common/GlobalISel/CodeExpander.cpp"], + exclude = [ + "utils/TableGen/Common/GlobalISel/CodeExpander.cpp", + "utils/TableGen/llvm-min-tblgen.cpp", + ], ) + [ "include/llvm/TargetParser/SubtargetFeature.h", ], From 366e836051adf5eb352b00828541197729e061e6 Mon Sep 17 00:00:00 2001 From: ShihPo Hung Date: Thu, 2 Jan 2025 22:43:22 -0800 Subject: [PATCH 339/567] [RISCV][NFC] precommit test for fcmp with f16 --- llvm/test/Analysis/CostModel/RISCV/rvv-cmp.ll | 254 +------ .../Analysis/CostModel/RISCV/rvv-fcmp-f16.ll | 677 ++++++++++++++++++ 2 files changed, 678 insertions(+), 253 deletions(-) create mode 100644 llvm/test/Analysis/CostModel/RISCV/rvv-fcmp-f16.ll diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-cmp.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-cmp.ll index 56f9e18c6c5a0..d1b230c35ff2d 100644 --- a/llvm/test/Analysis/CostModel/RISCV/rvv-cmp.ll +++ b/llvm/test/Analysis/CostModel/RISCV/rvv-cmp.ll @@ -875,15 +875,6 @@ define void @icmp_sle() { define void @fcmp_oeq() { ; CHECK-LABEL: 'fcmp_oeq' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp oeq <2 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp oeq <4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp oeq <8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp oeq <16 x half> undef, undef 
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp oeq undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp oeq undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp oeq undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp oeq undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp oeq undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fcmp oeq <2 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = fcmp oeq <4 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = fcmp oeq <8 x float> undef, undef @@ -902,16 +893,7 @@ define void @fcmp_oeq() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64 = fcmp oeq undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; - %v2f16 = fcmp oeq <2 x half> undef, undef - %v4f16 = fcmp oeq <4 x half> undef, undef - %v8f16 = fcmp oeq <8 x half> undef, undef - %v16f16 = fcmp oeq <16 x half> undef, undef - %nxv1f16 = fcmp oeq undef, undef - %nxv2f16 = fcmp oeq undef, undef - %nxv4f16 = fcmp oeq undef, undef - %nxv8f16 = fcmp oeq undef, undef - %nxv16f16 = fcmp oeq undef, undef %v2f32 = fcmp oeq <2 x float> undef, undef %v4f32 = fcmp oeq <4 x float> undef, undef @@ -938,15 +920,6 @@ define void @fcmp_oeq() { define void @fcmp_one() { ; CHECK-LABEL: 'fcmp_one' -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = fcmp one <2 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = fcmp one <4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16 = fcmp one <8 x half> undef, undef -; 
CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v16f16 = fcmp one <16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16 = fcmp one undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16 = fcmp one undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16 = fcmp one undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv8f16 = fcmp one undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv16f16 = fcmp one undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32 = fcmp one <2 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32 = fcmp one <4 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v8f32 = fcmp one <8 x float> undef, undef @@ -965,16 +938,7 @@ define void @fcmp_one() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv8f64 = fcmp one undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; - %v2f16 = fcmp one <2 x half> undef, undef - %v4f16 = fcmp one <4 x half> undef, undef - %v8f16 = fcmp one <8 x half> undef, undef - %v16f16 = fcmp one <16 x half> undef, undef - %nxv1f16 = fcmp one undef, undef - %nxv2f16 = fcmp one undef, undef - %nxv4f16 = fcmp one undef, undef - %nxv8f16 = fcmp one undef, undef - %nxv16f16 = fcmp one undef, undef %v2f32 = fcmp one <2 x float> undef, undef %v4f32 = fcmp one <4 x float> undef, undef @@ -1001,15 +965,6 @@ define void @fcmp_one() { define void @fcmp_olt() { ; CHECK-LABEL: 'fcmp_olt' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp olt <2 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp olt <4 x half> undef, undef -; 
CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp olt <8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp olt <16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp olt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp olt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp olt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp olt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp olt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fcmp olt <2 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = fcmp olt <4 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = fcmp olt <8 x float> undef, undef @@ -1028,16 +983,7 @@ define void @fcmp_olt() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64 = fcmp olt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; - %v2f16 = fcmp olt <2 x half> undef, undef - %v4f16 = fcmp olt <4 x half> undef, undef - %v8f16 = fcmp olt <8 x half> undef, undef - %v16f16 = fcmp olt <16 x half> undef, undef - %nxv1f16 = fcmp olt undef, undef - %nxv2f16 = fcmp olt undef, undef - %nxv4f16 = fcmp olt undef, undef - %nxv8f16 = fcmp olt undef, undef - %nxv16f16 = fcmp olt undef, undef %v2f32 = fcmp olt <2 x float> undef, undef %v4f32 = fcmp olt <4 x float> undef, undef @@ -1064,15 +1010,6 @@ define void @fcmp_olt() { define void @fcmp_ole() { ; CHECK-LABEL: 'fcmp_ole' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp ole <2 x half> undef, undef -; 
CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ole <4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ole <8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp ole <16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ole undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ole undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ole undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp ole undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp ole undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fcmp ole <2 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = fcmp ole <4 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = fcmp ole <8 x float> undef, undef @@ -1091,16 +1028,7 @@ define void @fcmp_ole() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64 = fcmp ole undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; - %v2f16 = fcmp ole <2 x half> undef, undef - %v4f16 = fcmp ole <4 x half> undef, undef - %v8f16 = fcmp ole <8 x half> undef, undef - %v16f16 = fcmp ole <16 x half> undef, undef - %nxv1f16 = fcmp ole undef, undef - %nxv2f16 = fcmp ole undef, undef - %nxv4f16 = fcmp ole undef, undef - %nxv8f16 = fcmp ole undef, undef - %nxv16f16 = fcmp ole undef, undef %v2f32 = fcmp ole <2 x float> undef, undef %v4f32 = fcmp ole <4 x float> undef, undef @@ -1127,15 +1055,6 @@ define void @fcmp_ole() { define void @fcmp_ogt() { ; CHECK-LABEL: 'fcmp_ogt' 
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp ogt <2 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ogt <4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ogt <8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp ogt <16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ogt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ogt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ogt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp ogt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp ogt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fcmp ogt <2 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = fcmp ogt <4 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = fcmp ogt <8 x float> undef, undef @@ -1154,16 +1073,7 @@ define void @fcmp_ogt() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64 = fcmp ogt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; - %v2f16 = fcmp ogt <2 x half> undef, undef - %v4f16 = fcmp ogt <4 x half> undef, undef - %v8f16 = fcmp ogt <8 x half> undef, undef - %v16f16 = fcmp ogt <16 x half> undef, undef - %nxv1f16 = fcmp ogt undef, undef - %nxv2f16 = fcmp ogt undef, undef - %nxv4f16 = fcmp ogt undef, undef - %nxv8f16 = fcmp ogt undef, undef - %nxv16f16 = fcmp ogt undef, undef %v2f32 = fcmp ogt <2 x float> undef, undef %v4f32 = fcmp ogt <4 x 
float> undef, undef @@ -1190,15 +1100,6 @@ define void @fcmp_ogt() { define void @fcmp_oge() { ; CHECK-LABEL: 'fcmp_oge' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp oge <2 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp oge <4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp oge <8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp oge <16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp oge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp oge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp oge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp oge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp oge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fcmp oge <2 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = fcmp oge <4 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = fcmp oge <8 x float> undef, undef @@ -1217,16 +1118,7 @@ define void @fcmp_oge() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64 = fcmp oge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; - %v2f16 = fcmp oge <2 x half> undef, undef - %v4f16 = fcmp oge <4 x half> undef, undef - %v8f16 = fcmp oge <8 x half> undef, undef - %v16f16 = fcmp oge <16 x half> undef, undef - %nxv1f16 = fcmp oge undef, undef - %nxv2f16 = fcmp oge undef, undef - %nxv4f16 = fcmp oge undef, undef - %nxv8f16 = fcmp 
oge undef, undef - %nxv16f16 = fcmp oge undef, undef %v2f32 = fcmp oge <2 x float> undef, undef %v4f32 = fcmp oge <4 x float> undef, undef @@ -1253,15 +1145,6 @@ define void @fcmp_oge() { define void @fcmp_ueq() { ; CHECK-LABEL: 'fcmp_ueq' -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = fcmp ueq <2 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = fcmp ueq <4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16 = fcmp ueq <8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v16f16 = fcmp ueq <16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16 = fcmp ueq undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16 = fcmp ueq undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16 = fcmp ueq undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv8f16 = fcmp ueq undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv16f16 = fcmp ueq undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32 = fcmp ueq <2 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32 = fcmp ueq <4 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v8f32 = fcmp ueq <8 x float> undef, undef @@ -1280,16 +1163,7 @@ define void @fcmp_ueq() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv8f64 = fcmp ueq undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; - %v2f16 = fcmp ueq <2 x half> undef, undef - %v4f16 = fcmp ueq <4 x half> undef, undef - %v8f16 = fcmp ueq <8 x half> undef, undef - %v16f16 = fcmp ueq <16 x half> undef, undef - 
%nxv1f16 = fcmp ueq undef, undef - %nxv2f16 = fcmp ueq undef, undef - %nxv4f16 = fcmp ueq undef, undef - %nxv8f16 = fcmp ueq undef, undef - %nxv16f16 = fcmp ueq undef, undef %v2f32 = fcmp ueq <2 x float> undef, undef %v4f32 = fcmp ueq <4 x float> undef, undef @@ -1316,15 +1190,6 @@ define void @fcmp_ueq() { define void @fcmp_une() { ; CHECK-LABEL: 'fcmp_une' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp une <2 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp une <4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp une <8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp une <16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp une undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp une undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp une undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp une undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp une undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fcmp une <2 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = fcmp une <4 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = fcmp une <8 x float> undef, undef @@ -1343,16 +1208,7 @@ define void @fcmp_une() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64 = fcmp une undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; - %v2f16 = fcmp une <2 x half> undef, undef - %v4f16 = fcmp une 
<4 x half> undef, undef - %v8f16 = fcmp une <8 x half> undef, undef - %v16f16 = fcmp une <16 x half> undef, undef - %nxv1f16 = fcmp une undef, undef - %nxv2f16 = fcmp une undef, undef - %nxv4f16 = fcmp une undef, undef - %nxv8f16 = fcmp une undef, undef - %nxv16f16 = fcmp une undef, undef %v2f32 = fcmp une <2 x float> undef, undef %v4f32 = fcmp une <4 x float> undef, undef @@ -1379,15 +1235,6 @@ define void @fcmp_une() { define void @fcmp_ult() { ; CHECK-LABEL: 'fcmp_ult' -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ult <2 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fcmp ult <4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = fcmp ult <8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16 = fcmp ult <16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16 = fcmp ult undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16 = fcmp ult undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16 = fcmp ult undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16 = fcmp ult undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16 = fcmp ult undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fcmp ult <2 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fcmp ult <4 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f32 = fcmp ult <8 x float> undef, undef @@ -1406,16 +1253,7 @@ define void @fcmp_ult() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv8f64 = fcmp ult undef, undef ; CHECK-NEXT: Cost Model: 
Found an estimated cost of 0 for instruction: ret void ; - %v2f16 = fcmp ult <2 x half> undef, undef - %v4f16 = fcmp ult <4 x half> undef, undef - %v8f16 = fcmp ult <8 x half> undef, undef - %v16f16 = fcmp ult <16 x half> undef, undef - %nxv1f16 = fcmp ult undef, undef - %nxv2f16 = fcmp ult undef, undef - %nxv4f16 = fcmp ult undef, undef - %nxv8f16 = fcmp ult undef, undef - %nxv16f16 = fcmp ult undef, undef %v2f32 = fcmp ult <2 x float> undef, undef %v4f32 = fcmp ult <4 x float> undef, undef @@ -1442,15 +1280,6 @@ define void @fcmp_ult() { define void @fcmp_ule() { ; CHECK-LABEL: 'fcmp_ule' -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ule <2 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fcmp ule <4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = fcmp ule <8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16 = fcmp ule <16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16 = fcmp ule undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16 = fcmp ule undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16 = fcmp ule undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16 = fcmp ule undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16 = fcmp ule undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fcmp ule <2 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fcmp ule <4 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f32 = fcmp ule <8 x float> undef, undef @@ -1469,16 +1298,7 @@ define void @fcmp_ule() { ; CHECK-NEXT: 
Cost Model: Found an estimated cost of 9 for instruction: %nxv8f64 = fcmp ule undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; - %v2f16 = fcmp ule <2 x half> undef, undef - %v4f16 = fcmp ule <4 x half> undef, undef - %v8f16 = fcmp ule <8 x half> undef, undef - %v16f16 = fcmp ule <16 x half> undef, undef - %nxv1f16 = fcmp ule undef, undef - %nxv2f16 = fcmp ule undef, undef - %nxv4f16 = fcmp ule undef, undef - %nxv8f16 = fcmp ule undef, undef - %nxv16f16 = fcmp ule undef, undef %v2f32 = fcmp ule <2 x float> undef, undef %v4f32 = fcmp ule <4 x float> undef, undef @@ -1505,15 +1325,6 @@ define void @fcmp_ule() { define void @fcmp_ugt() { ; CHECK-LABEL: 'fcmp_ugt' -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ugt <2 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fcmp ugt <4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = fcmp ugt <8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16 = fcmp ugt <16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16 = fcmp ugt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16 = fcmp ugt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16 = fcmp ugt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16 = fcmp ugt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16 = fcmp ugt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fcmp ugt <2 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fcmp ugt <4 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %v8f32 = fcmp ugt <8 x float> undef, undef @@ -1532,16 +1343,7 @@ define void @fcmp_ugt() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv8f64 = fcmp ugt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; - %v2f16 = fcmp ugt <2 x half> undef, undef - %v4f16 = fcmp ugt <4 x half> undef, undef - %v8f16 = fcmp ugt <8 x half> undef, undef - %v16f16 = fcmp ugt <16 x half> undef, undef - %nxv1f16 = fcmp ugt undef, undef - %nxv2f16 = fcmp ugt undef, undef - %nxv4f16 = fcmp ugt undef, undef - %nxv8f16 = fcmp ugt undef, undef - %nxv16f16 = fcmp ugt undef, undef %v2f32 = fcmp ugt <2 x float> undef, undef %v4f32 = fcmp ugt <4 x float> undef, undef @@ -1568,15 +1370,6 @@ define void @fcmp_ugt() { define void @fcmp_uge() { ; CHECK-LABEL: 'fcmp_uge' -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp uge <2 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fcmp uge <4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = fcmp uge <8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16 = fcmp uge <16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16 = fcmp uge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16 = fcmp uge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16 = fcmp uge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16 = fcmp uge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16 = fcmp uge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fcmp uge <2 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 
for instruction: %v4f32 = fcmp uge <4 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f32 = fcmp uge <8 x float> undef, undef @@ -1595,16 +1388,7 @@ define void @fcmp_uge() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv8f64 = fcmp uge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; - %v2f16 = fcmp uge <2 x half> undef, undef - %v4f16 = fcmp uge <4 x half> undef, undef - %v8f16 = fcmp uge <8 x half> undef, undef - %v16f16 = fcmp uge <16 x half> undef, undef - %nxv1f16 = fcmp uge undef, undef - %nxv2f16 = fcmp uge undef, undef - %nxv4f16 = fcmp uge undef, undef - %nxv8f16 = fcmp uge undef, undef - %nxv16f16 = fcmp uge undef, undef %v2f32 = fcmp uge <2 x float> undef, undef %v4f32 = fcmp uge <4 x float> undef, undef @@ -1631,15 +1415,6 @@ define void @fcmp_uge() { define void @fcmp_true() { ; CHECK-LABEL: 'fcmp_true' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp true <2 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp true <4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp true <8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp true <16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp true undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp true undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp true undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp true undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp true undef, undef ; CHECK-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %v2f32 = fcmp true <2 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = fcmp true <4 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32 = fcmp true <8 x float> undef, undef @@ -1658,16 +1433,7 @@ define void @fcmp_true() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f64 = fcmp true undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; - %v2f16 = fcmp true <2 x half> undef, undef - %v4f16 = fcmp true <4 x half> undef, undef - %v8f16 = fcmp true <8 x half> undef, undef - %v16f16 = fcmp true <16 x half> undef, undef - %nxv1f16 = fcmp true undef, undef - %nxv2f16 = fcmp true undef, undef - %nxv4f16 = fcmp true undef, undef - %nxv8f16 = fcmp true undef, undef - %nxv16f16 = fcmp true undef, undef %v2f32 = fcmp true <2 x float> undef, undef %v4f32 = fcmp true <4 x float> undef, undef @@ -1694,15 +1460,6 @@ define void @fcmp_true() { define void @fcmp_false() { ; CHECK-LABEL: 'fcmp_false' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp false <2 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp false <4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp false <8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp false <16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp false undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp false undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp false undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp false 
undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp false undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fcmp false <2 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = fcmp false <4 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32 = fcmp false <8 x float> undef, undef @@ -1721,16 +1478,7 @@ define void @fcmp_false() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f64 = fcmp false undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; - %v2f16 = fcmp false <2 x half> undef, undef - %v4f16 = fcmp false <4 x half> undef, undef - %v8f16 = fcmp false <8 x half> undef, undef - %v16f16 = fcmp false <16 x half> undef, undef - - %nxv1f16 = fcmp false undef, undef - %nxv2f16 = fcmp false undef, undef - %nxv4f16 = fcmp false undef, undef - %nxv8f16 = fcmp false undef, undef - %nxv16f16 = fcmp false undef, undef + %v2f32 = fcmp false <2 x float> undef, undef %v4f32 = fcmp false <4 x float> undef, undef diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-fcmp-f16.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-fcmp-f16.ll new file mode 100644 index 0000000000000..8396e80ca3e80 --- /dev/null +++ b/llvm/test/Analysis/CostModel/RISCV/rvv-fcmp-f16.ll @@ -0,0 +1,677 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh -riscv-v-vector-bits-min=-1 < %s | FileCheck %s --check-prefix=NOF16 +; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh -riscv-v-vector-bits-min=-1 < %s | FileCheck %s --check-prefix=VFH +; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfhmin -riscv-v-vector-bits-min=-1 < %s | 
FileCheck %s --check-prefix=VFHMIN + +define void @fcmp_oeq() { +; NOF16-LABEL: 'fcmp_oeq' +; NOF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp oeq <2 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp oeq <4 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp oeq <8 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp oeq <16 x half> undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp oeq undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp oeq undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp oeq undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp oeq undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp oeq undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFH-LABEL: 'fcmp_oeq' +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp oeq <2 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp oeq <4 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp oeq <8 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp oeq <16 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp oeq undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp oeq undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp oeq undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp oeq undef, undef +; VFH-NEXT: 
Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp oeq undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFHMIN-LABEL: 'fcmp_oeq' +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp oeq <2 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp oeq <4 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp oeq <8 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp oeq <16 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp oeq undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp oeq undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp oeq undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp oeq undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp oeq undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %v2f16 = fcmp oeq <2 x half> undef, undef + %v4f16 = fcmp oeq <4 x half> undef, undef + %v8f16 = fcmp oeq <8 x half> undef, undef + %v16f16 = fcmp oeq <16 x half> undef, undef + %nxv1f16 = fcmp oeq undef, undef + %nxv2f16 = fcmp oeq undef, undef + %nxv4f16 = fcmp oeq undef, undef + %nxv8f16 = fcmp oeq undef, undef + %nxv16f16 = fcmp oeq undef, undef + ret void +} +define void @fcmp_one() { +; NOF16-LABEL: 'fcmp_one' +; NOF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp one <2 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp one <4 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 8 
for instruction: %v8f16 = fcmp one <8 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp one <16 x half> undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp one undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp one undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp one undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp one undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp one undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFH-LABEL: 'fcmp_one' +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = fcmp one <2 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = fcmp one <4 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16 = fcmp one <8 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v16f16 = fcmp one <16 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16 = fcmp one undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16 = fcmp one undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16 = fcmp one undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv8f16 = fcmp one undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv16f16 = fcmp one undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFHMIN-LABEL: 'fcmp_one' +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp one <2 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %v4f16 = fcmp one <4 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp one <8 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp one <16 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp one undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp one undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp one undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp one undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp one undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %v2f16 = fcmp one <2 x half> undef, undef + %v4f16 = fcmp one <4 x half> undef, undef + %v8f16 = fcmp one <8 x half> undef, undef + %v16f16 = fcmp one <16 x half> undef, undef + %nxv1f16 = fcmp one undef, undef + %nxv2f16 = fcmp one undef, undef + %nxv4f16 = fcmp one undef, undef + %nxv8f16 = fcmp one undef, undef + %nxv16f16 = fcmp one undef, undef + ret void +} +define void @fcmp_olt() { +; NOF16-LABEL: 'fcmp_olt' +; NOF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp olt <2 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp olt <4 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp olt <8 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp olt <16 x half> undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp olt undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp olt undef, undef +; NOF16-NEXT: Cost 
Model: Invalid cost for instruction: %nxv4f16 = fcmp olt undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp olt undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp olt undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFH-LABEL: 'fcmp_olt' +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp olt <2 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp olt <4 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp olt <8 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp olt <16 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp olt undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp olt undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp olt undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp olt undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp olt undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFHMIN-LABEL: 'fcmp_olt' +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp olt <2 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp olt <4 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp olt <8 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp olt <16 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = 
fcmp olt undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp olt undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp olt undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp olt undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp olt undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %v2f16 = fcmp olt <2 x half> undef, undef + %v4f16 = fcmp olt <4 x half> undef, undef + %v8f16 = fcmp olt <8 x half> undef, undef + %v16f16 = fcmp olt <16 x half> undef, undef + %nxv1f16 = fcmp olt undef, undef + %nxv2f16 = fcmp olt undef, undef + %nxv4f16 = fcmp olt undef, undef + %nxv8f16 = fcmp olt undef, undef + %nxv16f16 = fcmp olt undef, undef + ret void +} +define void @fcmp_ole() { +; NOF16-LABEL: 'fcmp_ole' +; NOF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ole <2 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp ole <4 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp ole <8 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp ole <16 x half> undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp ole undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp ole undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp ole undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp ole undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp ole undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFH-LABEL: 'fcmp_ole' +; VFH-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp ole <2 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ole <4 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ole <8 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp ole <16 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ole undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ole undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ole undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp ole undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp ole undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFHMIN-LABEL: 'fcmp_ole' +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp ole <2 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ole <4 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ole <8 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp ole <16 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ole undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ole undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ole undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp ole undef, undef +; VFHMIN-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp ole undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %v2f16 = fcmp ole <2 x half> undef, undef + %v4f16 = fcmp ole <4 x half> undef, undef + %v8f16 = fcmp ole <8 x half> undef, undef + %v16f16 = fcmp ole <16 x half> undef, undef + %nxv1f16 = fcmp ole undef, undef + %nxv2f16 = fcmp ole undef, undef + %nxv4f16 = fcmp ole undef, undef + %nxv8f16 = fcmp ole undef, undef + %nxv16f16 = fcmp ole undef, undef + ret void +} +define void @fcmp_ogt() { +; NOF16-LABEL: 'fcmp_ogt' +; NOF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ogt <2 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp ogt <4 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp ogt <8 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp ogt <16 x half> undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp ogt undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp ogt undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp ogt undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp ogt undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp ogt undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFH-LABEL: 'fcmp_ogt' +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp ogt <2 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ogt <4 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ogt <8 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost 
of 2 for instruction: %v16f16 = fcmp ogt <16 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ogt undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ogt undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ogt undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp ogt undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp ogt undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFHMIN-LABEL: 'fcmp_ogt' +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp ogt <2 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ogt <4 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ogt <8 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp ogt <16 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ogt undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ogt undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ogt undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp ogt undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp ogt undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %v2f16 = fcmp ogt <2 x half> undef, undef + %v4f16 = fcmp ogt <4 x half> undef, undef + %v8f16 = fcmp ogt <8 x half> undef, undef + %v16f16 = fcmp ogt <16 x half> undef, undef + %nxv1f16 = fcmp ogt undef, undef 
+ %nxv2f16 = fcmp ogt undef, undef + %nxv4f16 = fcmp ogt undef, undef + %nxv8f16 = fcmp ogt undef, undef + %nxv16f16 = fcmp ogt undef, undef + ret void +} +define void @fcmp_oge() { +; NOF16-LABEL: 'fcmp_oge' +; NOF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp oge <2 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp oge <4 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp oge <8 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp oge <16 x half> undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp oge undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp oge undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp oge undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp oge undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp oge undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFH-LABEL: 'fcmp_oge' +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp oge <2 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp oge <4 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp oge <8 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp oge <16 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp oge undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp oge undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp oge undef, undef 
+; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp oge undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp oge undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFHMIN-LABEL: 'fcmp_oge' +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp oge <2 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp oge <4 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp oge <8 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp oge <16 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp oge undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp oge undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp oge undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp oge undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp oge undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %v2f16 = fcmp oge <2 x half> undef, undef + %v4f16 = fcmp oge <4 x half> undef, undef + %v8f16 = fcmp oge <8 x half> undef, undef + %v16f16 = fcmp oge <16 x half> undef, undef + %nxv1f16 = fcmp oge undef, undef + %nxv2f16 = fcmp oge undef, undef + %nxv4f16 = fcmp oge undef, undef + %nxv8f16 = fcmp oge undef, undef + %nxv16f16 = fcmp oge undef, undef + ret void +} +define void @fcmp_ueq() { +; NOF16-LABEL: 'fcmp_ueq' +; NOF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ueq <2 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 
4 for instruction: %v4f16 = fcmp ueq <4 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp ueq <8 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp ueq <16 x half> undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp ueq undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp ueq undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp ueq undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp ueq undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp ueq undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFH-LABEL: 'fcmp_ueq' +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = fcmp ueq <2 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = fcmp ueq <4 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16 = fcmp ueq <8 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v16f16 = fcmp ueq <16 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16 = fcmp ueq undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16 = fcmp ueq undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16 = fcmp ueq undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv8f16 = fcmp ueq undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv16f16 = fcmp ueq undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFHMIN-LABEL: 'fcmp_ueq' +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %v2f16 = fcmp ueq <2 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ueq <4 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ueq <8 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp ueq <16 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ueq undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ueq undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ueq undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp ueq undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp ueq undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %v2f16 = fcmp ueq <2 x half> undef, undef + %v4f16 = fcmp ueq <4 x half> undef, undef + %v8f16 = fcmp ueq <8 x half> undef, undef + %v16f16 = fcmp ueq <16 x half> undef, undef + %nxv1f16 = fcmp ueq undef, undef + %nxv2f16 = fcmp ueq undef, undef + %nxv4f16 = fcmp ueq undef, undef + %nxv8f16 = fcmp ueq undef, undef + %nxv16f16 = fcmp ueq undef, undef + ret void +} +define void @fcmp_une() { +; NOF16-LABEL: 'fcmp_une' +; NOF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp une <2 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp une <4 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp une <8 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp une <16 x half> undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp une undef, 
undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp une undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp une undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp une undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp une undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFH-LABEL: 'fcmp_une' +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp une <2 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp une <4 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp une <8 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp une <16 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp une undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp une undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp une undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp une undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp une undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFHMIN-LABEL: 'fcmp_une' +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp une <2 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp une <4 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp une <8 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp 
une <16 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp une undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp une undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp une undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp une undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp une undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %v2f16 = fcmp une <2 x half> undef, undef + %v4f16 = fcmp une <4 x half> undef, undef + %v8f16 = fcmp une <8 x half> undef, undef + %v16f16 = fcmp une <16 x half> undef, undef + %nxv1f16 = fcmp une undef, undef + %nxv2f16 = fcmp une undef, undef + %nxv4f16 = fcmp une undef, undef + %nxv8f16 = fcmp une undef, undef + %nxv16f16 = fcmp une undef, undef + ret void +} +define void @fcmp_ult() { +; NOF16-LABEL: 'fcmp_ult' +; NOF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ult <2 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp ult <4 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp ult <8 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp ult <16 x half> undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp ult undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp ult undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp ult undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp ult undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp ult undef, undef +; 
NOF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFH-LABEL: 'fcmp_ult' +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ult <2 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fcmp ult <4 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = fcmp ult <8 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16 = fcmp ult <16 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16 = fcmp ult undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16 = fcmp ult undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16 = fcmp ult undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16 = fcmp ult undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16 = fcmp ult undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFHMIN-LABEL: 'fcmp_ult' +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp ult <2 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ult <4 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ult <8 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp ult <16 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ult undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ult undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ult undef, undef +; VFHMIN-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp ult undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp ult undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %v2f16 = fcmp ult <2 x half> undef, undef + %v4f16 = fcmp ult <4 x half> undef, undef + %v8f16 = fcmp ult <8 x half> undef, undef + %v16f16 = fcmp ult <16 x half> undef, undef + %nxv1f16 = fcmp ult undef, undef + %nxv2f16 = fcmp ult undef, undef + %nxv4f16 = fcmp ult undef, undef + %nxv8f16 = fcmp ult undef, undef + %nxv16f16 = fcmp ult undef, undef + ret void +} +define void @fcmp_ule() { +; NOF16-LABEL: 'fcmp_ule' +; NOF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ule <2 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp ule <4 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp ule <8 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp ule <16 x half> undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp ule undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp ule undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp ule undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp ule undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp ule undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFH-LABEL: 'fcmp_ule' +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ule <2 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fcmp ule <4 x half> undef, undef +; VFH-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %v8f16 = fcmp ule <8 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16 = fcmp ule <16 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16 = fcmp ule undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16 = fcmp ule undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16 = fcmp ule undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16 = fcmp ule undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16 = fcmp ule undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFHMIN-LABEL: 'fcmp_ule' +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp ule <2 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ule <4 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ule <8 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp ule <16 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ule undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ule undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ule undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp ule undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp ule undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %v2f16 = fcmp ule <2 x half> undef, undef + %v4f16 = fcmp ule <4 x half> undef, 
undef + %v8f16 = fcmp ule <8 x half> undef, undef + %v16f16 = fcmp ule <16 x half> undef, undef + %nxv1f16 = fcmp ule undef, undef + %nxv2f16 = fcmp ule undef, undef + %nxv4f16 = fcmp ule undef, undef + %nxv8f16 = fcmp ule undef, undef + %nxv16f16 = fcmp ule undef, undef + ret void +} +define void @fcmp_ugt() { +; NOF16-LABEL: 'fcmp_ugt' +; NOF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ugt <2 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp ugt <4 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp ugt <8 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp ugt <16 x half> undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp ugt undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp ugt undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp ugt undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp ugt undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp ugt undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFH-LABEL: 'fcmp_ugt' +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ugt <2 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fcmp ugt <4 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = fcmp ugt <8 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16 = fcmp ugt <16 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16 = fcmp ugt undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%nxv2f16 = fcmp ugt undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16 = fcmp ugt undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16 = fcmp ugt undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16 = fcmp ugt undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFHMIN-LABEL: 'fcmp_ugt' +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp ugt <2 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ugt <4 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ugt <8 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp ugt <16 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ugt undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ugt undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ugt undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp ugt undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp ugt undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %v2f16 = fcmp ugt <2 x half> undef, undef + %v4f16 = fcmp ugt <4 x half> undef, undef + %v8f16 = fcmp ugt <8 x half> undef, undef + %v16f16 = fcmp ugt <16 x half> undef, undef + %nxv1f16 = fcmp ugt undef, undef + %nxv2f16 = fcmp ugt undef, undef + %nxv4f16 = fcmp ugt undef, undef + %nxv8f16 = fcmp ugt undef, undef + %nxv16f16 = fcmp ugt undef, undef + ret void +} +define void @fcmp_uge() { +; NOF16-LABEL: 'fcmp_uge' +; NOF16-NEXT: Cost Model: Found 
an estimated cost of 2 for instruction: %v2f16 = fcmp uge <2 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp uge <4 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp uge <8 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp uge <16 x half> undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp uge undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp uge undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp uge undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp uge undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp uge undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFH-LABEL: 'fcmp_uge' +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp uge <2 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fcmp uge <4 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = fcmp uge <8 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16 = fcmp uge <16 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16 = fcmp uge undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16 = fcmp uge undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16 = fcmp uge undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16 = fcmp uge undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16 = fcmp uge undef, undef +; VFH-NEXT: Cost Model: Found an 
estimated cost of 0 for instruction: ret void +; +; VFHMIN-LABEL: 'fcmp_uge' +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp uge <2 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp uge <4 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp uge <8 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp uge <16 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp uge undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp uge undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp uge undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp uge undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp uge undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %v2f16 = fcmp uge <2 x half> undef, undef + %v4f16 = fcmp uge <4 x half> undef, undef + %v8f16 = fcmp uge <8 x half> undef, undef + %v16f16 = fcmp uge <16 x half> undef, undef + %nxv1f16 = fcmp uge undef, undef + %nxv2f16 = fcmp uge undef, undef + %nxv4f16 = fcmp uge undef, undef + %nxv8f16 = fcmp uge undef, undef + %nxv16f16 = fcmp uge undef, undef + ret void +} +define void @fcmp_true() { +; NOF16-LABEL: 'fcmp_true' +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %v2f16 = fcmp true <2 x half> undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %v4f16 = fcmp true <4 x half> undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %v8f16 = fcmp true <8 x half> undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %v16f16 = fcmp true <16 x half> undef, undef +; 
NOF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp true undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp true undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp true undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp true undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp true undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFH-LABEL: 'fcmp_true' +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp true <2 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp true <4 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp true <8 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp true <16 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp true undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp true undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp true undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp true undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp true undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFHMIN-LABEL: 'fcmp_true' +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp true <2 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp true <4 x half> undef, undef +; VFHMIN-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp true <8 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp true <16 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp true undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp true undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp true undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp true undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp true undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %v2f16 = fcmp true <2 x half> undef, undef + %v4f16 = fcmp true <4 x half> undef, undef + %v8f16 = fcmp true <8 x half> undef, undef + %v16f16 = fcmp true <16 x half> undef, undef + %nxv1f16 = fcmp true undef, undef + %nxv2f16 = fcmp true undef, undef + %nxv4f16 = fcmp true undef, undef + %nxv8f16 = fcmp true undef, undef + %nxv16f16 = fcmp true undef, undef + ret void +} +define void @fcmp_false() { +; NOF16-LABEL: 'fcmp_false' +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %v2f16 = fcmp false <2 x half> undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %v4f16 = fcmp false <4 x half> undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %v8f16 = fcmp false <8 x half> undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %v16f16 = fcmp false <16 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp false undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp false undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp false 
undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp false undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp false undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFH-LABEL: 'fcmp_false' +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp false <2 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp false <4 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp false <8 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp false <16 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp false undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp false undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp false undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp false undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp false undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFHMIN-LABEL: 'fcmp_false' +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp false <2 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp false <4 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp false <8 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp false <16 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%nxv1f16 = fcmp false undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp false undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp false undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp false undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp false undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %v2f16 = fcmp false <2 x half> undef, undef + %v4f16 = fcmp false <4 x half> undef, undef + %v8f16 = fcmp false <8 x half> undef, undef + %v16f16 = fcmp false <16 x half> undef, undef + %nxv1f16 = fcmp false undef, undef + %nxv2f16 = fcmp false undef, undef + %nxv4f16 = fcmp false undef, undef + %nxv8f16 = fcmp false undef, undef + %nxv16f16 = fcmp false undef, undef + ret void +} From ad192f9f20ad48188b80855c085a0ad7266e0056 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Fri, 3 Jan 2025 20:44:43 +1100 Subject: [PATCH 340/567] [ORC] Restrict check-dwarf-filename test to Darwin for now. This test is failing on Windows (see e.g. https://lab.llvm.org/buildbot/#/builders/146/builds/1983), probably due to incomplete debugger support there (the test registers debug info in-process, so non-Darwin builds shouldn't be expected to have the right symbols). 
--- .../ExecutionEngine/JITLink/x86-64/MachO-check-dwarf-filename.s | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO-check-dwarf-filename.s b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO-check-dwarf-filename.s index a2eee21a0761d..df44ce996ecad 100644 --- a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO-check-dwarf-filename.s +++ b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO-check-dwarf-filename.s @@ -2,7 +2,7 @@ # RUN: llvm-jitlink -debug-only=orc -noexec -debugger-support %t.o 2>&1 | \ # RUN: FileCheck %s # -# REQUIRES: asserts +# REQUIRES: asserts && system-darwin # # Test that source file names can be indentified from DWARF line tables. From cdad18319425a7bf93cc25b276a7961fe5b1168b Mon Sep 17 00:00:00 2001 From: Mariya Podchishchaeva Date: Fri, 3 Jan 2025 11:17:16 +0100 Subject: [PATCH 341/567] [clang] Fix #embed "fast path" (#121479) When a single #embed directive is used to initialize a char array, the case is optimized via swap of EmbedExpr to underlying StringLiteral, resulting in better performance in AST consumers. While browsing through the code, I realized that 7122b70cfc8e23a069410215c363da76d842bda4 which changed type of EmbedExpr made the "fast path" unreachable. This patch fixes this unfortunate situation. 
--- clang/lib/Sema/SemaInit.cpp | 9 ++------- clang/test/Analysis/embed.c | 2 +- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 5909457b04e66..0dd5f468cf60b 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -2030,13 +2030,8 @@ canInitializeArrayWithEmbedDataString(ArrayRef ExprList, if (InitType->isArrayType()) { const ArrayType *InitArrayType = InitType->getAsArrayTypeUnsafe(); - QualType InitElementTy = InitArrayType->getElementType(); - QualType EmbedExprElementTy = EE->getDataStringLiteral()->getType(); - const bool TypesMatch = - Context.typesAreCompatible(InitElementTy, EmbedExprElementTy) || - (InitElementTy->isCharType() && EmbedExprElementTy->isCharType()); - if (TypesMatch) - return true; + StringLiteral *SL = EE->getDataStringLiteral(); + return IsStringInit(SL, InitArrayType, Context) == SIF_None; } return false; } diff --git a/clang/test/Analysis/embed.c b/clang/test/Analysis/embed.c index 32f6c13032574..db8c270fb35de 100644 --- a/clang/test/Analysis/embed.c +++ b/clang/test/Analysis/embed.c @@ -8,5 +8,5 @@ int main() { #embed "embed.c" }; clang_analyzer_dump_ptr(SelfBytes); // expected-warning {{&Element{SelfBytes,0 S64b,unsigned char}}} - clang_analyzer_dump(SelfBytes[0]); // expected-warning {{Unknown}} FIXME: This should be the `/` character. + clang_analyzer_dump(SelfBytes[0]); // expected-warning {{47 U8b}} } From e3ec5a728674fd775bb85a7d159acdb4fa1d69c2 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 3 Jan 2025 10:29:07 +0000 Subject: [PATCH 342/567] [VectorCombine] foldShuffleOfBinops - fold shuffle(binop(shuffle(x),shuffle(z)),binop(shuffle(y),shuffle(w)) -> binop(shuffle(x,z),shuffle(y,w)) (#120984) Some patterns (in particular horizontal style patterns) can end up with shuffles straddling both sides of a binop/cmp. 
Where individually the folds aren't worth it, by merging the (oneuse) shuffles we can notably reduce the net instruction count and cost. One of the final steps towards finally addressing #34072 --- .../Transforms/Vectorize/VectorCombine.cpp | 34 +++- .../test/Transforms/PhaseOrdering/X86/hadd.ll | 187 ++++++------------ .../Transforms/PhaseOrdering/X86/pr50392.ll | 9 +- .../X86/extract-binop-inseltpoison.ll | 11 +- 4 files changed, 95 insertions(+), 146 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 493ed95b1d22e..9bca613593591 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -1743,6 +1743,36 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) { TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinResTy, OldMask, CostKind, 0, nullptr, {LHS, RHS}, &I); + // Handle shuffle(binop(shuffle(x),y),binop(z,shuffle(w))) style patterns + // where one use shuffles have gotten split across the binop/cmp. These + // often allow a major reduction in total cost that wouldn't happen as + // individual folds. + auto MergeInner = [&](Value *&Op, int Offset, MutableArrayRef Mask, + TTI::TargetCostKind CostKind) -> bool { + Value *InnerOp; + ArrayRef InnerMask; + if (match(Op, m_OneUse(m_Shuffle(m_Value(InnerOp), m_Undef(), + m_Mask(InnerMask)))) && + InnerOp->getType() == Op->getType() && + all_of(InnerMask, + [NumSrcElts](int M) { return M < (int)NumSrcElts; })) { + for (int &M : Mask) + if (Offset <= M && M < (int)(Offset + NumSrcElts)) { + M = InnerMask[M - Offset]; + M = 0 <= M ? 
M + Offset : M; + } + OldCost += TTI.getInstructionCost(cast(Op), CostKind); + Op = InnerOp; + return true; + } + return false; + }; + bool ReducedInstCount = false; + ReducedInstCount |= MergeInner(X, 0, NewMask0, CostKind); + ReducedInstCount |= MergeInner(Y, 0, NewMask1, CostKind); + ReducedInstCount |= MergeInner(Z, NumSrcElts, NewMask0, CostKind); + ReducedInstCount |= MergeInner(W, NumSrcElts, NewMask1, CostKind); + InstructionCost NewCost = TTI.getShuffleCost(SK0, BinOpTy, NewMask0, CostKind, 0, nullptr, {X, Z}) + TTI.getShuffleCost(SK1, BinOpTy, NewMask1, CostKind, 0, nullptr, {Y, W}); @@ -1763,8 +1793,8 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) { // If either shuffle will constant fold away, then fold for the same cost as // we will reduce the instruction count. - bool ReducedInstCount = (isa(X) && isa(Z)) || - (isa(Y) && isa(W)); + ReducedInstCount |= (isa(X) && isa(Z)) || + (isa(Y) && isa(W)); if (ReducedInstCount ? (NewCost > OldCost) : (NewCost >= OldCost)) return false; diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll index 798824bce4dac..67da29b6cee7d 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll @@ -78,30 +78,16 @@ define <8 x i16> @add_v8i16_u1234567(<8 x i16> %a, <8 x i16> %b) { ; SSE2-NEXT: ret <8 x i16> [[RESULT]] ; ; SSE4-LABEL: @add_v8i16_u1234567( -; SSE4-NEXT: [[SHIFT:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> -; SSE4-NEXT: [[TMP1:%.*]] = add <8 x i16> [[A]], [[SHIFT]] -; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> -; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> -; SSE4-NEXT: [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]] -; SSE4-NEXT: [[HADD32:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP4]], <8 x i32> -; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> 
poison, <8 x i32> -; SSE4-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <8 x i32> +; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> +; SSE4-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> ; SSE4-NEXT: [[TMP7:%.*]] = add <8 x i16> [[TMP5]], [[TMP6]] -; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <8 x i16> [[HADD32]], <8 x i16> [[TMP7]], <8 x i32> -; SSE4-NEXT: ret <8 x i16> [[RESULT]] +; SSE4-NEXT: ret <8 x i16> [[TMP7]] ; ; AVX-LABEL: @add_v8i16_u1234567( -; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> -; AVX-NEXT: [[TMP1:%.*]] = add <8 x i16> [[A]], [[SHIFT]] -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> -; AVX-NEXT: [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]] -; AVX-NEXT: [[HADD32:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP4]], <8 x i32> -; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <8 x i32> -; AVX-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <8 x i32> +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> +; AVX-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> ; AVX-NEXT: [[TMP7:%.*]] = add <8 x i16> [[TMP5]], [[TMP6]] -; AVX-NEXT: [[RESULT:%.*]] = shufflevector <8 x i16> [[HADD32]], <8 x i16> [[TMP7]], <8 x i32> -; AVX-NEXT: ret <8 x i16> [[RESULT]] +; AVX-NEXT: ret <8 x i16> [[TMP7]] ; %a0 = extractelement <8 x i16> %a, i32 0 %a1 = extractelement <8 x i16> %a, i32 1 @@ -172,13 +158,10 @@ define <4 x i32> @add_v4i32_0123(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @add_v4i32_u123(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: @add_v4i32_u123( -; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: 
[[TMP1:%.*]] = add <4 x i32> [[A]], [[SHIFT]] -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP4]], <4 x i32> -; CHECK-NEXT: ret <4 x i32> [[RESULT1]] +; CHECK-NEXT: ret <4 x i32> [[TMP4]] ; %a0 = extractelement <4 x i32> %a, i32 0 %a1 = extractelement <4 x i32> %a, i32 1 @@ -202,13 +185,10 @@ define <4 x i32> @add_v4i32_u123(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @add_v4i32_0u23(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: @add_v4i32_0u23( -; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[A]], [[SHIFT]] -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP4]], <4 x i32> -; CHECK-NEXT: ret <4 x i32> [[RESULT1]] +; CHECK-NEXT: ret <4 x i32> [[TMP4]] ; %a0 = extractelement <4 x i32> %a, i32 0 %a1 = extractelement <4 x i32> %a, i32 1 @@ -232,40 +212,28 @@ define <4 x i32> @add_v4i32_0u23(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @add_v4i32_01u3(<4 x i32> %a, <4 x i32> %b) { ; SSE2-LABEL: @add_v4i32_01u3( -; SSE2-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> 
[[A:%.*]], <4 x i32> poison, <4 x i32> -; SSE2-NEXT: [[TMP1:%.*]] = add <4 x i32> [[A]], [[SHIFT]] -; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B:%.*]], <4 x i32> -; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> +; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> ; SSE2-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]] -; SSE2-NEXT: [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP4]], <4 x i32> -; SSE2-NEXT: ret <4 x i32> [[RESULT1]] +; SSE2-NEXT: ret <4 x i32> [[TMP4]] ; ; SSE4-LABEL: @add_v4i32_01u3( -; SSE4-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <4 x i32> -; SSE4-NEXT: [[TMP1:%.*]] = add <4 x i32> [[SHIFT]], [[B]] -; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> -; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> +; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> +; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> ; SSE4-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]] -; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP1]], <4 x i32> -; SSE4-NEXT: ret <4 x i32> [[RESULT]] +; SSE4-NEXT: ret <4 x i32> [[TMP4]] ; ; AVX2-LABEL: @add_v4i32_01u3( -; AVX2-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <4 x i32> -; AVX2-NEXT: [[TMP1:%.*]] = add <4 x i32> [[SHIFT]], [[B]] -; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> -; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x 
i32> [[B]], <4 x i32> ; AVX2-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]] -; AVX2-NEXT: [[RESULT:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP1]], <4 x i32> -; AVX2-NEXT: ret <4 x i32> [[RESULT]] +; AVX2-NEXT: ret <4 x i32> [[TMP4]] ; ; AVX512-LABEL: @add_v4i32_01u3( -; AVX512-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> -; AVX512-NEXT: [[TMP1:%.*]] = add <4 x i32> [[A]], [[SHIFT]] -; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B:%.*]], <4 x i32> -; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> ; AVX512-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]] -; AVX512-NEXT: [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP4]], <4 x i32> -; AVX512-NEXT: ret <4 x i32> [[RESULT1]] +; AVX512-NEXT: ret <4 x i32> [[TMP4]] ; %a0 = extractelement <4 x i32> %a, i32 0 %a1 = extractelement <4 x i32> %a, i32 1 @@ -289,13 +257,10 @@ define <4 x i32> @add_v4i32_01u3(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @add_v4i32_012u(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: @add_v4i32_012u( -; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[A]], [[SHIFT]] -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B:%.*]], <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP4]], <4 x i32> 
-; CHECK-NEXT: ret <4 x i32> [[RESULT1]] +; CHECK-NEXT: ret <4 x i32> [[TMP4]] ; %a0 = extractelement <4 x i32> %a, i32 0 %a1 = extractelement <4 x i32> %a, i32 1 @@ -420,17 +385,14 @@ define <8 x i32> @add_v8i32_01234567(<8 x i32> %a, <8 x i32> %b) { define <8 x i32> @add_v8i32_01234u67(<8 x i32> %a, <8 x i32> %b) { ; SSE2-LABEL: @add_v8i32_01234u67( -; SSE2-NEXT: [[SHIFT:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <8 x i32> -; SSE2-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A]], [[SHIFT]] ; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <2 x i32> ; SSE2-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <2 x i32> ; SSE2-NEXT: [[TMP8:%.*]] = add <2 x i32> [[TMP5]], [[TMP6]] -; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> -; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B]], <8 x i32> +; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> ; SSE2-NEXT: [[TMP4:%.*]] = add <8 x i32> [[TMP2]], [[TMP3]] -; SSE2-NEXT: [[HADD4:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP1]], <8 x i32> ; SSE2-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <8 x i32> -; SSE2-NEXT: [[RESULT:%.*]] = shufflevector <8 x i32> [[HADD4]], <8 x i32> [[TMP7]], <8 x i32> +; SSE2-NEXT: [[RESULT:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP7]], <8 x i32> ; SSE2-NEXT: ret <8 x i32> [[RESULT]] ; ; SSE4-LABEL: @add_v8i32_01234u67( @@ -449,17 +411,10 @@ define <8 x i32> @add_v8i32_01234u67(<8 x i32> %a, <8 x i32> %b) { ; SSE4-NEXT: ret <8 x i32> [[RESULT]] ; ; AVX-LABEL: @add_v8i32_01234u67( -; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <8 x i32> -; AVX-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A]], [[SHIFT]] -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> 
[[B:%.*]], <8 x i32> -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> -; AVX-NEXT: [[TMP4:%.*]] = add <8 x i32> [[TMP2]], [[TMP3]] -; AVX-NEXT: [[HADD4:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP1]], <8 x i32> -; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> -; AVX-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> +; AVX-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> ; AVX-NEXT: [[TMP7:%.*]] = add <8 x i32> [[TMP5]], [[TMP6]] -; AVX-NEXT: [[RESULT:%.*]] = shufflevector <8 x i32> [[HADD4]], <8 x i32> [[TMP7]], <8 x i32> -; AVX-NEXT: ret <8 x i32> [[RESULT]] +; AVX-NEXT: ret <8 x i32> [[TMP7]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 @@ -530,13 +485,10 @@ define <4 x float> @add_v4f32_0123(<4 x float> %a, <4 x float> %b) { define <4 x float> @add_v4f32_u123(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @add_v4f32_u123( -; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]] -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[RESULT1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <4 x i32> -; CHECK-NEXT: ret <4 x float> [[RESULT1]] +; CHECK-NEXT: ret <4 x float> [[TMP4]] ; %a0 = extractelement <4 x float> %a, i32 0 %a1 = extractelement <4 x float> %a, i32 
1 @@ -599,22 +551,16 @@ define <4 x float> @add_v4f32_01u3(<4 x float> %a, <4 x float> %b) { ; SSE2-NEXT: ret <4 x float> [[RESULT1]] ; ; SSE4-LABEL: @add_v4f32_01u3( -; SSE4-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> -; SSE4-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[SHIFT]], [[B]] -; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> -; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> +; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> +; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> ; SSE4-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]] -; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP1]], <4 x i32> -; SSE4-NEXT: ret <4 x float> [[RESULT]] +; SSE4-NEXT: ret <4 x float> [[TMP4]] ; ; AVX2-LABEL: @add_v4f32_01u3( -; AVX2-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> -; AVX2-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[SHIFT]], [[B]] -; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> -; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> ; AVX2-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]] -; AVX2-NEXT: [[RESULT:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP1]], <4 x i32> -; AVX2-NEXT: ret <4 x float> [[RESULT]] +; AVX2-NEXT: ret <4 x float> [[TMP4]] ; ; AVX512-LABEL: @add_v4f32_01u3( ; AVX512-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> @@ -820,17 +766,10 @@ define <8 x float> @add_v8f32_012u4567(<8 x float> %a, <8 x float> %b) { ; 
SSE-NEXT: ret <8 x float> [[RESULT]] ; ; AVX-LABEL: @add_v8f32_012u4567( -; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <8 x i32> -; AVX-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A]], [[SHIFT]] -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B:%.*]], <8 x i32> -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> -; AVX-NEXT: [[TMP4:%.*]] = fadd <8 x float> [[TMP2]], [[TMP3]] -; AVX-NEXT: [[HADD5:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> [[TMP1]], <8 x i32> -; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> -; AVX-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> +; AVX-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> ; AVX-NEXT: [[TMP7:%.*]] = fadd <8 x float> [[TMP5]], [[TMP6]] -; AVX-NEXT: [[RESULT:%.*]] = shufflevector <8 x float> [[HADD5]], <8 x float> [[TMP7]], <8 x i32> -; AVX-NEXT: ret <8 x float> [[RESULT]] +; AVX-NEXT: ret <8 x float> [[TMP7]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -983,13 +922,10 @@ define <4 x double> @add_v4f64_u123(<4 x double> %a, <4 x double> %b) { ; SSE4-NEXT: ret <4 x double> [[RESULT]] ; ; AVX-LABEL: @add_v4f64_u123( -; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <4 x i32> -; AVX-NEXT: [[TMP1:%.*]] = fadd <4 x double> [[B]], [[SHIFT]] -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A:%.*]], <4 x i32> -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> [[A:%.*]], <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> ; AVX-NEXT: 
[[TMP4:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]] -; AVX-NEXT: [[RESULT:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP1]], <4 x i32> -; AVX-NEXT: ret <4 x double> [[RESULT]] +; AVX-NEXT: ret <4 x double> [[TMP4]] ; %a0 = extractelement <4 x double> %a, i32 0 %a1 = extractelement <4 x double> %a, i32 1 @@ -1034,13 +970,10 @@ define <4 x double> @add_v4f64_0u23(<4 x double> %a, <4 x double> %b) { ; SSE4-NEXT: ret <4 x double> [[RESULT]] ; ; AVX-LABEL: @add_v4f64_0u23( -; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <4 x i32> -; AVX-NEXT: [[TMP1:%.*]] = fadd <4 x double> [[B]], [[SHIFT]] -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> poison, <4 x i32> -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> ; AVX-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]] -; AVX-NEXT: [[RESULT:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP1]], <4 x i32> -; AVX-NEXT: ret <4 x double> [[RESULT]] +; AVX-NEXT: ret <4 x double> [[TMP4]] ; %a0 = extractelement <4 x double> %a, i32 0 %a1 = extractelement <4 x double> %a, i32 1 @@ -1085,13 +1018,10 @@ define <4 x double> @add_v4f64_01u3(<4 x double> %a, <4 x double> %b) { ; SSE4-NEXT: ret <4 x double> [[RESULT]] ; ; AVX-LABEL: @add_v4f64_01u3( -; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <4 x i32> -; AVX-NEXT: [[TMP1:%.*]] = fadd <4 x double> [[B]], [[SHIFT]] -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <4 x i32> -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; AVX-NEXT: 
[[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> ; AVX-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]] -; AVX-NEXT: [[RESULT:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP1]], <4 x i32> -; AVX-NEXT: ret <4 x double> [[RESULT]] +; AVX-NEXT: ret <4 x double> [[TMP4]] ; %a0 = extractelement <4 x double> %a, i32 0 %a1 = extractelement <4 x double> %a, i32 1 @@ -1136,13 +1066,10 @@ define <4 x double> @add_v4f64_012u(<4 x double> %a, <4 x double> %b) { ; SSE4-NEXT: ret <4 x double> [[RESULT]] ; ; AVX-LABEL: @add_v4f64_012u( -; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> poison, <4 x i32> -; AVX-NEXT: [[TMP1:%.*]] = fadd <4 x double> [[A]], [[SHIFT]] -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B:%.*]], <4 x i32> -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> ; AVX-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]] -; AVX-NEXT: [[RESULT:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP1]], <4 x i32> -; AVX-NEXT: ret <4 x double> [[RESULT]] +; AVX-NEXT: ret <4 x double> [[TMP4]] ; %a0 = extractelement <4 x double> %a, i32 0 %a1 = extractelement <4 x double> %a, i32 1 diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll index 4e1051d1991aa..d92df9741644b 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll @@ -31,13 +31,10 @@ define <4 x double> @PR50392(<4 x double> %a, <4 x double> %b) { ; SSE4-NEXT: ret <4 x double> [[SHUFFLE]] ; ; AVX-LABEL: @PR50392( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> -; AVX-NEXT: 
[[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> -; AVX-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] -; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> +; AVX-NEXT: [[B:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B1:%.*]], <4 x i32> +; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B1]], <4 x i32> ; AVX-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[B]], [[SHIFT]] -; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> [[TMP4]], <4 x i32> -; AVX-NEXT: ret <4 x double> [[SHUFFLE]] +; AVX-NEXT: ret <4 x double> [[TMP4]] ; %vecext = extractelement <4 x double> %a, i32 0 %vecext1 = extractelement <4 x double> %a, i32 1 diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll index 800f57646a3e1..6ef18e66d4211 100644 --- a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll +++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll @@ -468,15 +468,10 @@ define <4 x float> @PR34724(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: @PR34724( ; SSE-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 ; SSE-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 -; SSE-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> -; SSE-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]] -; SSE-NEXT: [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> -; SSE-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]] -; SSE-NEXT: [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> +; SSE-NEXT: [[SHIFT2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B1:%.*]], <4 x i32> +; SSE-NEXT: [[B:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B1]], <4 x i32> ; SSE-NEXT: [[TMP3:%.*]] = fadd <4 x float> 
[[SHIFT2]], [[B]] -; SSE-NEXT: [[V2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> -; SSE-NEXT: [[V3:%.*]] = shufflevector <4 x float> [[V2]], <4 x float> [[TMP3]], <4 x i32> -; SSE-NEXT: ret <4 x float> [[V3]] +; SSE-NEXT: ret <4 x float> [[TMP3]] ; ; AVX-LABEL: @PR34724( ; AVX-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 From d3eb65f15dfda454424125b2fa675378bd350889 Mon Sep 17 00:00:00 2001 From: Kaviya Rajendiran <67495422+kaviya2510@users.noreply.github.com> Date: Fri, 3 Jan 2025 16:22:38 +0530 Subject: [PATCH 343/567] [MLIR][OpenMP] Lowering aligned clause to LLVM IR for SIMD directive (#119536) This patch, - Added a translation support for aligned clause in SIMD directive by passing the alignment details to "llvm.assume" intrinsic. - Updated the insertion point for llvm.assume intrinsic call in "OMPIRBuilder.cpp". - Added a check in aligned clause MLIR lowering, to ensure that the alignment value must be a power of 2. --- clang/test/OpenMP/irbuilder_simd_aligned.cpp | 6 +- flang/lib/Lower/OpenMP/ClauseProcessor.cpp | 2 + llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 3 +- .../Frontend/OpenMPIRBuilderTest.cpp | 8 +-- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 23 +++++-- .../Target/LLVMIR/openmp-simd-aligned.mlir | 60 +++++++++++++++++++ mlir/test/Target/LLVMIR/openmp-todo.mlir | 12 ---- 7 files changed, 89 insertions(+), 25 deletions(-) create mode 100644 mlir/test/Target/LLVMIR/openmp-simd-aligned.mlir diff --git a/clang/test/OpenMP/irbuilder_simd_aligned.cpp b/clang/test/OpenMP/irbuilder_simd_aligned.cpp index 1c3dc49b717ed..721fde6d95495 100644 --- a/clang/test/OpenMP/irbuilder_simd_aligned.cpp +++ b/clang/test/OpenMP/irbuilder_simd_aligned.cpp @@ -70,8 +70,11 @@ void simple(float *a, float *b, int *c) { // CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]] // CHECK: for.end: // CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr 
[[TMP4]], i64 128) ] // CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[P]], align 8 +// CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP5]], i64 64) ] // CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [32 x i32], ptr [[D]], i64 0, i64 0 +// CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[ARRAYDECAY]], i64 16) ] // CHECK-NEXT: store i32 3, ptr [[I1]], align 4 // CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[AGG_CAPTURED]], i32 0, i32 0 // CHECK-NEXT: store ptr [[I1]], ptr [[TMP6]], align 8 @@ -82,9 +85,6 @@ void simple(float *a, float *b, int *c) { // CHECK-NEXT: [[DOTCOUNT:%.*]] = load i32, ptr [[DOTCOUNT_ADDR]], align 4 // CHECK-NEXT: br label [[OMP_LOOP_PREHEADER:%.*]] // CHECK: omp_loop.preheader: -// CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP4]], i64 128) ] -// CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP5]], i64 64) ] -// CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[ARRAYDECAY]], i64 16) ] // CHECK-NEXT: br label [[OMP_LOOP_HEADER:%.*]] // CHECK: omp_loop.header: // CHECK-NEXT: [[OMP_LOOP_IV:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER]] ], [ [[OMP_LOOP_NEXT:%.*]], [[OMP_LOOP_INC:%.*]] ] diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp index c4ab5e0033d04..fb8e007c7af57 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ -613,6 +613,8 @@ addAlignedClause(lower::AbstractConverter &converter, // Do not generate alignment assumption if alignment is less than or equal to // 0. 
if (alignment > 0) { + // alignment value must be power of 2 + assert((alignment & (alignment - 1)) == 0 && "alignment is not power of 2"); auto &objects = std::get(clause.t); if (!objects.empty()) genObjectList(objects, converter, alignedVars); diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 0d8dbbe3a8a71..8dbf2aa7e0a24 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -5302,10 +5302,11 @@ void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop, Loop *L = LI.getLoopFor(CanonicalLoop->getHeader()); if (AlignedVars.size()) { InsertPointTy IP = Builder.saveIP(); - Builder.SetInsertPoint(CanonicalLoop->getPreheader()->getTerminator()); for (auto &AlignedItem : AlignedVars) { Value *AlignedPtr = AlignedItem.first; Value *Alignment = AlignedItem.second; + Instruction *loadInst = dyn_cast(AlignedPtr); + Builder.SetInsertPoint(loadInst->getNextNode()); Builder.CreateAlignmentAssumption(F->getDataLayout(), AlignedPtr, Alignment); } diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp index d7ac108249118..9faae88b8dbc7 100644 --- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp +++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp @@ -1993,6 +1993,7 @@ TEST_F(OpenMPIRBuilderTest, ApplySimdCustomAligned) { OpenMPIRBuilder OMPBuilder(*M); IRBuilder<> Builder(BB); const int AlignmentValue = 32; + llvm::BasicBlock *sourceBlock = Builder.GetInsertBlock(); AllocaInst *Alloc1 = Builder.CreateAlloca(Builder.getPtrTy(), Builder.getInt64(1)); LoadInst *Load1 = Builder.CreateLoad(Alloc1->getAllocatedType(), Alloc1); @@ -2031,13 +2032,12 @@ TEST_F(OpenMPIRBuilderTest, ApplySimdCustomAligned) { // Check if number of assumption instructions is equal to number of aligned // variables - BasicBlock *LoopPreheader = CLI->getPreheader(); - size_t NumAssummptionCallsInPreheader = count_if( - *LoopPreheader, 
[](Instruction &I) { return isa(I); }); + size_t NumAssummptionCallsInPreheader = + count_if(*sourceBlock, [](Instruction &I) { return isa(I); }); EXPECT_EQ(NumAssummptionCallsInPreheader, AlignedVars.size()); // Check if variables are correctly aligned - for (Instruction &Instr : *LoopPreheader) { + for (Instruction &Instr : *sourceBlock) { if (!isa(Instr)) continue; AssumeInst *AssumeInstruction = cast(&Instr); diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 9a30266103b15..ce129417fc5b2 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -150,10 +150,6 @@ static LogicalResult checkImplementationStatus(Operation &op) { << " operation"; }; - auto checkAligned = [&todo](auto op, LogicalResult &result) { - if (!op.getAlignedVars().empty() || op.getAlignments()) - result = todo("aligned"); - }; auto checkAllocate = [&todo](auto op, LogicalResult &result) { if (!op.getAllocateVars().empty() || !op.getAllocatorVars().empty()) result = todo("allocate"); @@ -275,7 +271,6 @@ static LogicalResult checkImplementationStatus(Operation &op) { }) .Case([&](omp::ParallelOp op) { checkAllocate(op, result); }) .Case([&](omp::SimdOp op) { - checkAligned(op, result); checkLinear(op, result); checkNontemporal(op, result); checkPrivate(op, result); @@ -2302,6 +2297,24 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder, llvm::MapVector alignedVars; llvm::omp::OrderKind order = convertOrderKind(simdOp.getOrder()); + llvm::BasicBlock *sourceBlock = builder.GetInsertBlock(); + std::optional alignmentValues = simdOp.getAlignments(); + mlir::OperandRange operands = simdOp.getAlignedVars(); + for (size_t i = 0; i < operands.size(); ++i) { + llvm::Value *alignment = nullptr; + llvm::Value *llvmVal = moduleTranslation.lookupValue(operands[i]); + llvm::Type *ty = 
llvmVal->getType(); + if (auto intAttr = llvm::dyn_cast((*alignmentValues)[i])) { + alignment = builder.getInt64(intAttr.getInt()); + assert(ty->isPointerTy() && "Invalid type for aligned variable"); + assert(alignment && "Invalid alignment value"); + auto curInsert = builder.saveIP(); + builder.SetInsertPoint(sourceBlock->getTerminator()); + llvmVal = builder.CreateLoad(ty, llvmVal); + builder.restoreIP(curInsert); + alignedVars[llvmVal] = alignment; + } + } ompBuilder->applySimd(loopInfo, alignedVars, simdOp.getIfExpr() ? moduleTranslation.lookupValue(simdOp.getIfExpr()) diff --git a/mlir/test/Target/LLVMIR/openmp-simd-aligned.mlir b/mlir/test/Target/LLVMIR/openmp-simd-aligned.mlir new file mode 100644 index 0000000000000..234604e4b664a --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-simd-aligned.mlir @@ -0,0 +1,60 @@ +// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s + +//CHECK-LABEL: define void @_QPsimd_aligned_pointer() { +//CHECK: %[[A_PTR:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, i64 1, align 8 +//CHECK: %[[A_VAL:.*]] = load ptr, ptr %[[A_PTR]], align 8 +//CHECK: call void @llvm.assume(i1 true) [ "align"(ptr %[[A_VAL]], i64 256) ] +llvm.func @_QPsimd_aligned_pointer() { + %1 = llvm.mlir.constant(1 : i64) : i64 + %2 = llvm.alloca %1 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {bindc_name = "x"} : (i64) -> !llvm.ptr + %3 = llvm.alloca %1 x i32 {bindc_name = "i", pinned} : (i64) -> !llvm.ptr + %4 = llvm.mlir.constant(1 : i32) : i32 + %5 = llvm.mlir.constant(10 : i32) : i32 + %6 = llvm.mlir.constant(1 : i32) : i32 + omp.simd aligned(%2 : !llvm.ptr -> 256 : i64) { + omp.loop_nest (%arg0) : i32 = (%4) to (%5) inclusive step (%6) { + llvm.store %arg0, %3 : i32, !llvm.ptr + omp.yield + } + } + llvm.return +} + +//CHECK-LABEL: define void @_QPsimd_aligned_cptr() { +//CHECK: %[[A_CPTR:.*]] = alloca %_QM__fortran_builtinsT__builtin_c_ptr, i64 1, align 8 +//CHECK: %[[A_VAL:.*]] = load ptr, ptr %[[A_CPTR]], align 8 +//CHECK: call 
void @llvm.assume(i1 true) [ "align"(ptr %[[A_VAL]], i64 256) ] +llvm.func @_QPsimd_aligned_cptr() { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x !llvm.struct<"_QM__fortran_builtinsT__builtin_c_ptr", (i64)> {bindc_name = "a"} : (i64) -> !llvm.ptr + %2 = llvm.mlir.constant(1 : i64) : i64 + %3 = llvm.alloca %2 x i32 {bindc_name = "i", pinned} : (i64) -> !llvm.ptr + %4 = llvm.mlir.constant(1 : i32) : i32 + %5 = llvm.mlir.constant(10 : i32) : i32 + %6 = llvm.mlir.constant(1 : i32) : i32 + omp.simd aligned(%1 : !llvm.ptr -> 256 : i64) { + omp.loop_nest (%arg0) : i32 = (%4) to (%5) inclusive step (%6) { + llvm.store %arg0, %3 : i32, !llvm.ptr + omp.yield + } + } + llvm.return +} + +//CHECK-LABEL: define void @_QPsimd_aligned_allocatable() { +//CHECK: %[[A_ADDR:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, i64 1, align 8 +//CHECK: %[[A_VAL:.*]] = load ptr, ptr %[[A_ADDR]], align 8 +//CHECK: call void @llvm.assume(i1 true) [ "align"(ptr %[[A_VAL]], i64 256) ] +llvm.func @_QPsimd_aligned_allocatable() { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {bindc_name = "a"} : (i64) -> !llvm.ptr + %2 = llvm.mlir.constant(1 : i32) : i32 + %3 = llvm.mlir.constant(10 : i32) : i32 + %4 = llvm.mlir.constant(1 : i32) : i32 + omp.simd aligned(%1 : !llvm.ptr -> 256 : i64) { + omp.loop_nest (%arg0) : i32 = (%2) to (%3) inclusive step (%4) { + omp.yield + } + } + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir index 8f3e466cfbbeb..83a0990d63162 100644 --- a/mlir/test/Target/LLVMIR/openmp-todo.mlir +++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir @@ -127,18 +127,6 @@ llvm.func @sections_private(%x : !llvm.ptr) { llvm.return } -// ----- - -llvm.func @simd_aligned(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) { - // expected-error@below {{not yet implemented: Unhandled clause aligned in omp.simd 
operation}} - // expected-error@below {{LLVM Translation failed for operation: omp.simd}} - omp.simd aligned(%x : !llvm.ptr -> 32) { - omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) { - omp.yield - } - } - llvm.return -} // ----- From 2e41489d7b1498ec8a18b99e6d7db9e946f2d786 Mon Sep 17 00:00:00 2001 From: Younan Zhang Date: Fri, 3 Jan 2025 19:10:43 +0800 Subject: [PATCH 344/567] [Clang] Fix unexpanded packs in NTTP type constraints (#121296) In the case where a type-constraint on an NTTP contains a pack, we form a PackExpansionType to model it. However, there are a few places expecting it to be a non-pack expansion, and luckily only small changes could make them work. Fixes https://github.com/llvm/llvm-project/issues/88866 --- clang/docs/ReleaseNotes.rst | 1 + clang/lib/AST/ASTContext.cpp | 2 +- clang/lib/Sema/SemaTemplate.cpp | 16 ++++- clang/lib/Sema/SemaTemplateDeduction.cpp | 5 +- clang/test/SemaCXX/cxx2c-fold-exprs.cpp | 79 ++++++++++++++++++++++++ 5 files changed, 98 insertions(+), 5 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 2789a24ebf273..61d6aa2216cd0 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -886,6 +886,7 @@ Bug Fixes to C++ Support out of a module (which is the case e.g. in MSVC's implementation of ``std`` module). (#GH118218) - Fixed a pack expansion issue in checking unexpanded parameter sizes. (#GH17042) - Fixed a bug where captured structured bindings were modifiable inside non-mutable lambda (#GH95081) +- Clang now identifies unexpanded parameter packs within the type constraint on a non-type template parameter. 
(#GH88866) - Fixed an issue while resolving type of expression indexing into a pack of values of non-dependent type (#GH121242) Bug Fixes to AST Handling diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 8b4ae58e8427a..a9ecb4ee9c76b 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -6376,7 +6376,7 @@ ASTContext::getAutoType(QualType DeducedType, AutoTypeKeyword Keyword, } QualType ASTContext::getUnconstrainedType(QualType T) const { - QualType CanonT = T.getCanonicalType(); + QualType CanonT = T.getNonPackExpansionType().getCanonicalType(); // Remove a type-constraint from a top-level auto or decltype(auto). if (auto *AT = CanonT->getAs()) { diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 5e7a3c8484c88..20ec2fbeaa6a8 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -1228,7 +1228,7 @@ bool Sema::AttachTypeConstraint(AutoTypeLoc TL, NonTypeTemplateParmDecl *NewConstrainedParm, NonTypeTemplateParmDecl *OrigConstrainedParm, SourceLocation EllipsisLoc) { - if (NewConstrainedParm->getType() != TL.getType() || + if (NewConstrainedParm->getType().getNonPackExpansionType() != TL.getType() || TL.getAutoKeyword() != AutoTypeKeyword::Auto) { Diag(NewConstrainedParm->getTypeSourceInfo()->getTypeLoc().getBeginLoc(), diag::err_unsupported_placeholder_constraint) @@ -1530,9 +1530,19 @@ NamedDecl *Sema::ActOnNonTypeTemplateParameter(Scope *S, Declarator &D, Param->setAccess(AS_public); if (AutoTypeLoc TL = TInfo->getTypeLoc().getContainedAutoTypeLoc()) - if (TL.isConstrained()) - if (AttachTypeConstraint(TL, Param, Param, D.getEllipsisLoc())) + if (TL.isConstrained()) { + if (D.getEllipsisLoc().isInvalid() && + T->containsUnexpandedParameterPack()) { + assert(TL.getConceptReference()->getTemplateArgsAsWritten()); + for (auto &Loc : + TL.getConceptReference()->getTemplateArgsAsWritten()->arguments()) + Invalid |= DiagnoseUnexpandedParameterPack( + Loc, 
UnexpandedParameterPackContext::UPPC_TypeConstraint); + } + if (!Invalid && + AttachTypeConstraint(TL, Param, Param, D.getEllipsisLoc())) Invalid = true; + } if (Invalid) Param->setInvalidDecl(); diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index fad20b37a7d9a..1c1f6e30ab7b8 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -857,7 +857,10 @@ class PackDeductionScope { if (auto *NTTP = dyn_cast( TemplateParams->getParam(Index))) { if (!NTTP->isExpandedParameterPack()) - if (auto *Expansion = dyn_cast(NTTP->getType())) + // FIXME: CWG2982 suggests a type-constraint forms a non-deduced + // context, however it is not yet resolved. + if (auto *Expansion = dyn_cast( + S.Context.getUnconstrainedType(NTTP->getType()))) ExtraDeductions.push_back(Expansion->getPattern()); } // FIXME: Also collect the unexpanded packs in any type and template diff --git a/clang/test/SemaCXX/cxx2c-fold-exprs.cpp b/clang/test/SemaCXX/cxx2c-fold-exprs.cpp index 0674135aac483..48061439941f2 100644 --- a/clang/test/SemaCXX/cxx2c-fold-exprs.cpp +++ b/clang/test/SemaCXX/cxx2c-fold-exprs.cpp @@ -305,3 +305,82 @@ static_assert(__is_same_as(_Three_way_comparison_result_with_tuple_like, 0>::type, long)); } + +namespace GH88866 { + +template struct index_by; + +template +concept InitFunc = true; + +namespace ExpandsBoth { + +template auto... init> +struct LazyLitMatrix; // expected-note {{here}} + +template < + typename...Indices, + InitFunc> auto... init +> +struct LazyLitMatrix, init...> { +}; + +// FIXME: Explain why we didn't pick up the partial specialization - pack sizes don't match. +template struct LazyLitMatrix, 42>; +// expected-error@-1 {{instantiation of undefined template}} +template struct LazyLitMatrix, 42, 43>; + +} + +namespace ExpandsRespectively { + +template auto... init> +struct LazyLitMatrix; + +template < + typename...Indices, + InitFunc> auto... 
init +> +struct LazyLitMatrix, init...> { +}; + +template struct LazyLitMatrix, 42>; +template struct LazyLitMatrix, 42, 43>; + +} + +namespace TypeParameter { + +template ... init> +struct LazyLitMatrix; // expected-note {{here}} + +template < + typename...Indices, + InitFunc>... init +> +struct LazyLitMatrix, init...> { +}; + +// FIXME: Explain why we didn't pick up the partial specialization - pack sizes don't match. +template struct LazyLitMatrix, float>; +// expected-error@-1 {{instantiation of undefined template}} +template struct LazyLitMatrix, unsigned, float>; + +} + +namespace Invalid { + +template ... init> +struct LazyLitMatrix; + +template < + typename...Indices, + InitFunc> init + // expected-error@-1 {{unexpanded parameter pack 'Indices'}} +> +struct LazyLitMatrix, init> { +}; + +} + +} From 85849917f7ba19f6906f64726dc5e7101f8984ce Mon Sep 17 00:00:00 2001 From: David CARLIER Date: Fri, 3 Jan 2025 11:16:34 +0000 Subject: [PATCH 345/567] [compiler-rt][rtsan] Reland "fopencookie support." 
(#120864) (#121547) --- .../lib/rtsan/rtsan_interceptors_posix.cpp | 12 +++++++++ .../tests/rtsan_test_interceptors_posix.cpp | 25 +++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp index 4e51f464b5730..9f89ab6bf1fc7 100644 --- a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp @@ -297,6 +297,17 @@ INTERCEPTOR(FILE *, fdopen, int fd, const char *mode) { return REAL(fdopen)(fd, mode); } +#if SANITIZER_INTERCEPT_FOPENCOOKIE +INTERCEPTOR(FILE *, fopencookie, void *cookie, const char *mode, + cookie_io_functions_t funcs) { + __rtsan_notify_intercepted_call("fopencookie"); + return REAL(fopencookie)(cookie, mode, funcs); +} +#define RTSAN_MAYBE_INTERCEPT_FOPENCOOKIE INTERCEPT_FUNCTION(fopencookie) +#else +#define RTSAN_MAYBE_INTERCEPT_FOPENCOOKIE +#endif + #if SANITIZER_INTERCEPT_OPEN_MEMSTREAM INTERCEPTOR(FILE *, open_memstream, char **buf, size_t *size) { __rtsan_notify_intercepted_call("open_memstream"); @@ -972,6 +983,7 @@ void __rtsan::InitializeInterceptors() { INTERCEPT_FUNCTION(fputs); INTERCEPT_FUNCTION(fdopen); INTERCEPT_FUNCTION(freopen); + RTSAN_MAYBE_INTERCEPT_FOPENCOOKIE; RTSAN_MAYBE_INTERCEPT_OPEN_MEMSTREAM; RTSAN_MAYBE_INTERCEPT_FMEMOPEN; INTERCEPT_FUNCTION(lseek); diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp index b052dd859dcdf..5adbf0fb63de8 100644 --- a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp @@ -353,6 +353,31 @@ TEST_F(RtsanFileTest, FopenDiesWhenRealtime) { ExpectNonRealtimeSurvival(Func); } +#if SANITIZER_INTERCEPT_FOPENCOOKIE +TEST_F(RtsanFileTest, FopenCookieDieWhenRealtime) { + FILE *f = fopen(GetTemporaryFilePath(), "w"); + EXPECT_THAT(f, Ne(nullptr)); + struct fholder { + FILE *fp; + 
size_t read; + } fh = {f, 0}; + auto CookieRead = [this](void *cookie, char *buf, size_t size) { + fholder *p = reinterpret_cast(cookie); + p->read = fread(static_cast(buf), 1, size, p->fp); + EXPECT_NE(0, p->read); + }; + cookie_io_functions_t funcs = {(cookie_read_function_t *)&CookieRead, nullptr, + nullptr, nullptr}; + auto Func = [&fh, &funcs]() { + FILE *f = fopencookie(&fh, "w", funcs); + EXPECT_THAT(f, Ne(nullptr)); + }; + + ExpectRealtimeDeath(Func, "fopencookie"); + ExpectNonRealtimeSurvival(Func); +} +#endif + #if SANITIZER_INTERCEPT_OPEN_MEMSTREAM TEST_F(RtsanFileTest, OpenMemstreamDiesWhenRealtime) { char *buffer; From 579ced4f8266b273d15b2801067a828151a222ef Mon Sep 17 00:00:00 2001 From: Hugo Trachino Date: Fri, 3 Jan 2025 11:21:59 +0000 Subject: [PATCH 346/567] [MLIR][Python] Add structured.fuseop to python interpreter (#120601) Implements a python interface for structured.fuseOp allowing more freedom with inputs. --- .../mlir/dialects/transform/structured.py | 71 +++++++++++++++++++ .../dialects/transform_structured_ext.py | 36 ++++++++++ 2 files changed, 107 insertions(+) diff --git a/mlir/python/mlir/dialects/transform/structured.py b/mlir/python/mlir/dialects/transform/structured.py index 9121aa8e40237..bf40cc532065d 100644 --- a/mlir/python/mlir/dialects/transform/structured.py +++ b/mlir/python/mlir/dialects/transform/structured.py @@ -140,6 +140,77 @@ def __init__( ) +@_ods_cext.register_operation(_Dialect, replace=True) +class FuseOp(FuseOp): + """Specialization for FuseOp class.""" + + @overload + def __init__( + self, + loop_types: Union[Type, Sequence[Type]], + target: Union[Operation, Value, OpView], + *, + tile_sizes: Optional[Union[DynamicIndexList, ArrayAttr]] = None, + tile_interchange: OptionalIntList = None, + apply_cleanup: Optional[bool] = False, + loc=None, + ip=None, + ): + ... 
+ + @overload + def __init__( + self, + target: Union[Operation, Value, OpView], + *, + tile_sizes: Optional[Union[DynamicIndexList, ArrayAttr]] = None, + tile_interchange: OptionalIntList = None, + apply_cleanup: Optional[bool] = False, + loc=None, + ip=None, + ): + ... + + def __init__( + self, + loop_types_or_target: Union[Type, Sequence[Type], Operation, OpView, Value], + target_or_none: Optional[Union[Operation, Value, OpView]] = None, + *, + tile_sizes: Optional[Union[DynamicIndexList, ArrayAttr]] = None, + tile_interchange: OptionalIntList = None, + apply_cleanup: Optional[bool] = False, + loc=None, + ip=None, + ): + tile_sizes = tile_sizes if tile_sizes else [] + tile_interchange = tile_interchange if tile_interchange else [] + _, tile_sizes, _ = _dispatch_dynamic_index_list(tile_sizes) + _, tile_interchange, _ = _dispatch_dynamic_index_list(tile_interchange) + num_loops = sum(0 if v == 0 else 1 for v in tile_sizes) + + if isinstance(loop_types_or_target, (Operation, Value, OpView)): + loop_types = [transform.AnyOpType.get()] * num_loops + target = loop_types_or_target + assert target_or_none is None, "Cannot construct FuseOp with two targets." 
+ else: + loop_types = ( + ([loop_types_or_target] * num_loops) + if isinstance(loop_types_or_target, Type) + else loop_types_or_target + ) + target = target_or_none + super().__init__( + target.type, + loop_types, + target, + tile_sizes=tile_sizes, + tile_interchange=tile_interchange, + apply_cleanup=apply_cleanup, + loc=loc, + ip=ip, + ) + + @_ods_cext.register_operation(_Dialect, replace=True) class GeneralizeOp(GeneralizeOp): """Specialization for GeneralizeOp class.""" diff --git a/mlir/test/python/dialects/transform_structured_ext.py b/mlir/test/python/dialects/transform_structured_ext.py index fb4c75b533792..8785d6d360074 100644 --- a/mlir/test/python/dialects/transform_structured_ext.py +++ b/mlir/test/python/dialects/transform_structured_ext.py @@ -101,6 +101,42 @@ def testFuseIntoContainingOpCompact(target): # CHECK-SAME: (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) +@run +@create_sequence +def testFuseOpCompact(target): + structured.FuseOp( + target, tile_sizes=[4, 8], tile_interchange=[0, 1], apply_cleanup=True + ) + # CHECK-LABEL: TEST: testFuseOpCompact + # CHECK: transform.sequence + # CHECK: %{{.+}}, %{{.+}}:2 = transform.structured.fuse %{{.*}}[4, 8] + # CHECK-SAME: interchange [0, 1] apply_cleanup = true + # CHECK-SAME: (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + + +@run +@create_sequence +def testFuseOpNoArg(target): + structured.FuseOp(target) + # CHECK-LABEL: TEST: testFuseOpNoArg + # CHECK: transform.sequence + # CHECK: %{{.+}} = transform.structured.fuse %{{.*}} : + # CHECK-SAME: (!transform.any_op) -> !transform.any_op + + +@run +@create_sequence +def testFuseOpAttributes(target): + attr = DenseI64ArrayAttr.get([4, 8]) + ichange = DenseI64ArrayAttr.get([0, 1]) + structured.FuseOp(target, tile_sizes=attr, tile_interchange=ichange) + # CHECK-LABEL: TEST: testFuseOpAttributes + # CHECK: transform.sequence + # CHECK: %{{.+}}, %{{.+}}:2 = transform.structured.fuse 
%{{.*}}[4, 8] + # CHECK-SAME: interchange [0, 1] + # CHECK-SAME: (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + + @run @create_sequence def testGeneralize(target): From f87a9db8322643ccbc324e317a75b55903129b55 Mon Sep 17 00:00:00 2001 From: David Green Date: Fri, 3 Jan 2025 11:28:31 +0000 Subject: [PATCH 347/567] [ARM] Expand fp64 bf16 converts similarly to f32 This helps with +fp64 targets where the f64s are legal and not previously lowered. It can treat fpextends as a shift + cvt and fptrunc can use a libcall. --- llvm/lib/Target/ARM/ARMISelLowering.cpp | 2 + llvm/test/CodeGen/Thumb2/bf16-instructions.ll | 65 +++++++++++-------- 2 files changed, 41 insertions(+), 26 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 5ec2d8389c18e..2e517c21fc4a8 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -806,7 +806,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::BITCAST, MVT::bf16, Custom); } else { setOperationAction(ISD::BF16_TO_FP, MVT::f32, Expand); + setOperationAction(ISD::BF16_TO_FP, MVT::f64, Expand); setOperationAction(ISD::FP_TO_BF16, MVT::f32, Custom); + setOperationAction(ISD::FP_TO_BF16, MVT::f64, Custom); } for (MVT VT : MVT::fixedlen_vector_valuetypes()) { diff --git a/llvm/test/CodeGen/Thumb2/bf16-instructions.ll b/llvm/test/CodeGen/Thumb2/bf16-instructions.ll index 5de7afca25b84..786e35517fd7c 100644 --- a/llvm/test/CodeGen/Thumb2/bf16-instructions.ll +++ b/llvm/test/CodeGen/Thumb2/bf16-instructions.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple thumbv8.1m.main-none-eabi | FileCheck %s --check-prefixes=CHECK,CHECK-NOFP -; RUN: llc < %s -mtriple thumbv8.1m.main-none-eabihf -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-FP +; RUN: llc < %s -mtriple 
thumbv8.1m.main-none-eabihf -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-FP,CHECK-FPNO64 +; RUN: llc < %s -mtriple thumbv8.1m.main-none-eabihf -mattr=+fullfp16,+fp64 | FileCheck %s --check-prefixes=CHECK,CHECK-FP,CHECK-FP64 define bfloat @test_fadd(bfloat %a, bfloat %b) { ; CHECK-NOFP-LABEL: test_fadd: @@ -259,9 +260,8 @@ define void @test_truncstore64(double %a, ptr %b) { ; CHECK-FP-NEXT: .save {r4, lr} ; CHECK-FP-NEXT: push {r4, lr} ; CHECK-FP-NEXT: mov r4, r0 -; CHECK-FP-NEXT: vmov r0, r1, d0 -; CHECK-FP-NEXT: bl __aeabi_d2f -; CHECK-FP-NEXT: lsrs r0, r0, #16 +; CHECK-FP-NEXT: bl __truncdfbf2 +; CHECK-FP-NEXT: vmov r0, s0 ; CHECK-FP-NEXT: strh r0, [r4] ; CHECK-FP-NEXT: pop {r4, pc} %r = fptrunc double %a to bfloat @@ -312,15 +312,23 @@ define double @test_loadext64(ptr %a) { ; CHECK-NOFP-NEXT: bl __aeabi_f2d ; CHECK-NOFP-NEXT: pop {r7, pc} ; -; CHECK-FP-LABEL: test_loadext64: -; CHECK-FP: @ %bb.0: -; CHECK-FP-NEXT: .save {r7, lr} -; CHECK-FP-NEXT: push {r7, lr} -; CHECK-FP-NEXT: ldrh r0, [r0] -; CHECK-FP-NEXT: lsls r0, r0, #16 -; CHECK-FP-NEXT: bl __aeabi_f2d -; CHECK-FP-NEXT: vmov d0, r0, r1 -; CHECK-FP-NEXT: pop {r7, pc} +; CHECK-FPNO64-LABEL: test_loadext64: +; CHECK-FPNO64: @ %bb.0: +; CHECK-FPNO64-NEXT: .save {r7, lr} +; CHECK-FPNO64-NEXT: push {r7, lr} +; CHECK-FPNO64-NEXT: ldrh r0, [r0] +; CHECK-FPNO64-NEXT: lsls r0, r0, #16 +; CHECK-FPNO64-NEXT: bl __aeabi_f2d +; CHECK-FPNO64-NEXT: vmov d0, r0, r1 +; CHECK-FPNO64-NEXT: pop {r7, pc} +; +; CHECK-FP64-LABEL: test_loadext64: +; CHECK-FP64: @ %bb.0: +; CHECK-FP64-NEXT: ldrh r0, [r0] +; CHECK-FP64-NEXT: lsls r0, r0, #16 +; CHECK-FP64-NEXT: vmov s0, r0 +; CHECK-FP64-NEXT: vcvt.f64.f32 d0, s0 +; CHECK-FP64-NEXT: bx lr %r = load bfloat, ptr %a %d = fpext bfloat %r to double ret double %d @@ -1374,10 +1382,7 @@ define bfloat @test_fptrunc_double(double %a) { ; CHECK-FP: @ %bb.0: ; CHECK-FP-NEXT: .save {r7, lr} ; CHECK-FP-NEXT: push {r7, lr} -; CHECK-FP-NEXT: vmov r0, r1, d0 -; CHECK-FP-NEXT: bl 
__aeabi_d2f -; CHECK-FP-NEXT: lsrs r0, r0, #16 -; CHECK-FP-NEXT: vmov.f16 s0, r0 +; CHECK-FP-NEXT: bl __truncdfbf2 ; CHECK-FP-NEXT: vmov.f16 r0, s0 ; CHECK-FP-NEXT: vmov s0, r0 ; CHECK-FP-NEXT: pop {r7, pc} @@ -1410,15 +1415,23 @@ define double @test_fpext_double(bfloat %a) { ; CHECK-NOFP-NEXT: bl __aeabi_f2d ; CHECK-NOFP-NEXT: pop {r7, pc} ; -; CHECK-FP-LABEL: test_fpext_double: -; CHECK-FP: @ %bb.0: -; CHECK-FP-NEXT: .save {r7, lr} -; CHECK-FP-NEXT: push {r7, lr} -; CHECK-FP-NEXT: vmov r0, s0 -; CHECK-FP-NEXT: lsls r0, r0, #16 -; CHECK-FP-NEXT: bl __aeabi_f2d -; CHECK-FP-NEXT: vmov d0, r0, r1 -; CHECK-FP-NEXT: pop {r7, pc} +; CHECK-FPNO64-LABEL: test_fpext_double: +; CHECK-FPNO64: @ %bb.0: +; CHECK-FPNO64-NEXT: .save {r7, lr} +; CHECK-FPNO64-NEXT: push {r7, lr} +; CHECK-FPNO64-NEXT: vmov r0, s0 +; CHECK-FPNO64-NEXT: lsls r0, r0, #16 +; CHECK-FPNO64-NEXT: bl __aeabi_f2d +; CHECK-FPNO64-NEXT: vmov d0, r0, r1 +; CHECK-FPNO64-NEXT: pop {r7, pc} +; +; CHECK-FP64-LABEL: test_fpext_double: +; CHECK-FP64: @ %bb.0: +; CHECK-FP64-NEXT: vmov r0, s0 +; CHECK-FP64-NEXT: lsls r0, r0, #16 +; CHECK-FP64-NEXT: vmov s0, r0 +; CHECK-FP64-NEXT: vcvt.f64.f32 d0, s0 +; CHECK-FP64-NEXT: bx lr %r = fpext bfloat %a to double ret double %r } From 5cf138cfbaa8040100fed1d0d5e0a189759b24ab Mon Sep 17 00:00:00 2001 From: David Spickett Date: Fri, 3 Jan 2025 12:43:31 +0000 Subject: [PATCH 348/567] [llvm][JITLink][LoongArch] Fix bit extraction on 32 bit platforms This shifted `1UL` to make the mask. On 32 bit Linux UL is 32 bit, so if Hi+1 was >= 32 then you'd get the wrong result here. The other version of this uses 1ULL, but using the uint64_t typename here saves someone going to check what ULL means on different platforms. This fixes test failures seen on Linaro's 32 bit bots: https://lab.llvm.org/buildbot/#/builders/39/builds/3700 https://lab.llvm.org/buildbot/#/builders/122/builds/781 Though I cannot say exactly why this fixes them. 
Does not seem like the new code was triggering this problem, but somehow it must be. --- llvm/include/llvm/ExecutionEngine/JITLink/loongarch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/loongarch.h b/llvm/include/llvm/ExecutionEngine/JITLink/loongarch.h index d31c749bad1b1..1db4b82218109 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/loongarch.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/loongarch.h @@ -233,7 +233,7 @@ const char *getEdgeKindName(Edge::Kind K); // Returns extract bits Val[Hi:Lo]. inline uint32_t extractBits(uint64_t Val, unsigned Hi, unsigned Lo) { - return Hi == 63 ? Val >> Lo : (Val & (((1UL << (Hi + 1)) - 1))) >> Lo; + return Hi == 63 ? Val >> Lo : (Val & ((((uint64_t)1 << (Hi + 1)) - 1))) >> Lo; } /// Apply fixup expression for edge to block content. From cba9c6ac15b462e68cf76d496ba0f832a531db8b Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Fri, 3 Jan 2025 12:48:49 +0000 Subject: [PATCH 349/567] [mlir] Fix typo in parameter name annotation comment. Found by ClangTidyBugProne check. 
--- mlir/lib/Transforms/Utils/DialectConversion.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 255b0ba2559ee..2b006430d3817 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -1214,7 +1214,7 @@ LogicalResult ConversionPatternRewriterImpl::remapValues( ValueRange targetMat = buildUnresolvedMaterialization( MaterializationKind::Target, computeInsertPoint(repl), operandLoc, /*valueToMap=*/Value(), /*inputs=*/unpacked, - /*outputType=*/legalTypes, /*originalType=*/origType, + /*outputTypes=*/legalTypes, /*originalType=*/origType, currentTypeConverter); remapped.push_back(targetMat); continue; From b1c195cbd16adbc4dac8f4bc01b8a34e315d3e61 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Fri, 3 Jan 2025 07:59:54 -0500 Subject: [PATCH 350/567] [gn] port 27f30029741e (yet another tblgen reorg) --- .../gn/secondary/llvm/utils/TableGen/BUILD.gn | 21 +++---------------- .../llvm/utils/TableGen/Basic/BUILD.gn | 9 +++++++- .../llvm/utils/TableGen/Common/BUILD.gn | 2 +- 3 files changed, 12 insertions(+), 20 deletions(-) diff --git a/llvm/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn b/llvm/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn index ba52a97f39d85..e2daa1e9b73c2 100644 --- a/llvm/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn @@ -1,29 +1,13 @@ -source_set("llvm-min-tblgen-sources") { - sources = [ - "ARMTargetDefEmitter.cpp", - "Attributes.cpp", - "DirectiveEmitter.cpp", - "IntrinsicEmitter.cpp", - "RISCVTargetDefEmitter.cpp", - "TableGen.cpp", - "VTEmitter.cpp", - ] - deps = [ - "Basic", - "//llvm/lib/Support", - ] -} - executable("llvm-min-tblgen") { + sources = [ "llvm-min-tblgen.cpp" ] deps = [ - ":llvm-min-tblgen-sources", "Basic", + "//llvm/lib/Support", ] } executable("llvm-tblgen") { deps = [ - 
":llvm-min-tblgen-sources", "Basic", "Common", "//llvm/include/llvm/Config:llvm-config", @@ -55,6 +39,7 @@ executable("llvm-tblgen") { "GlobalISelEmitter.cpp", "InstrDocsEmitter.cpp", "InstrInfoEmitter.cpp", + "llvm-tblgen.cpp", "MacroFusionPredicatorEmitter.cpp", "OptionParserEmitter.cpp", "OptionRSTEmitter.cpp", diff --git a/llvm/utils/gn/secondary/llvm/utils/TableGen/Basic/BUILD.gn b/llvm/utils/gn/secondary/llvm/utils/TableGen/Basic/BUILD.gn index 2ebe393fa0fd9..ef6d6e44b6f8d 100644 --- a/llvm/utils/gn/secondary/llvm/utils/TableGen/Basic/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/utils/TableGen/Basic/BUILD.gn @@ -1,10 +1,17 @@ -static_library("Basic") { +source_set("Basic") { deps = [ "//llvm/lib/Support", "//llvm/lib/TableGen", ] sources = [ + "ARMTargetDefEmitter.cpp", + "Attributes.cpp", "CodeGenIntrinsics.cpp", + "DirectiveEmitter.cpp", + "IntrinsicEmitter.cpp", + "RISCVTargetDefEmitter.cpp", "SDNodeProperties.cpp", + "TableGen.cpp", + "VTEmitter.cpp", ] } diff --git a/llvm/utils/gn/secondary/llvm/utils/TableGen/Common/BUILD.gn b/llvm/utils/gn/secondary/llvm/utils/TableGen/Common/BUILD.gn index c46e7cb1dc8b7..31d0e1dade039 100644 --- a/llvm/utils/gn/secondary/llvm/utils/TableGen/Common/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/utils/TableGen/Common/BUILD.gn @@ -1,4 +1,4 @@ -static_library("Common") { +source_set("Common") { deps = [ "//llvm/include/llvm/CodeGen:GenVT", "//llvm/lib/CodeGenTypes", From d598829375634da42910e2624f181f6b843bdc8b Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Fri, 3 Jan 2025 08:05:10 -0500 Subject: [PATCH 351/567] [gn] make LLVMTableGenCommon a static_library again Else TableGenTests doesn't link. 
--- llvm/utils/gn/secondary/llvm/utils/TableGen/Common/BUILD.gn | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/utils/gn/secondary/llvm/utils/TableGen/Common/BUILD.gn b/llvm/utils/gn/secondary/llvm/utils/TableGen/Common/BUILD.gn index 31d0e1dade039..db11e56e550f9 100644 --- a/llvm/utils/gn/secondary/llvm/utils/TableGen/Common/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/utils/TableGen/Common/BUILD.gn @@ -1,4 +1,5 @@ -source_set("Common") { +static_library("Common") { + output_name = "LLVMTableGenCommon" deps = [ "//llvm/include/llvm/CodeGen:GenVT", "//llvm/lib/CodeGenTypes", From e53494c750246118c313b3cbf7479edb682f2208 Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Fri, 3 Jan 2025 13:07:49 +0000 Subject: [PATCH 352/567] [mlir] Fix 0 values passed to the wrong parameters. This was found by modernize-use-nullptr ClangTidy check, which suggested to pass nullptr instead of 0 to DIFileAttr. However it looks like the intention was to pass the two 0 values for line and scopeLine, and we should pass {} to DIFileAttr. Do that change. 
--- mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp index 28e8b81a05576..7490e8735f5fd 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp @@ -233,7 +233,7 @@ DIRecursiveTypeAttrInterface DISubprogramAttr::withRecId(DistinctAttr recId) { DIRecursiveTypeAttrInterface DISubprogramAttr::getRecSelf(DistinctAttr recId) { return DISubprogramAttr::get(recId.getContext(), recId, /*isRecSelf=*/true, - {}, {}, {}, {}, {}, 0, 0, {}, {}, {}, {}, {}); + {}, {}, {}, {}, {}, {}, 0, 0, {}, {}, {}, {}); } //===----------------------------------------------------------------------===// From 119fc720a19e047fee59d7f7446c911b158563e0 Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Fri, 3 Jan 2025 14:14:00 +0100 Subject: [PATCH 353/567] NFC, explicitly specify the -fopenmp lib in spirv-openmp-toolchain.c test Don't rely on the default `CLANG_DEFAULT_OPENMP_RUNTIME` env variable which is `libomp` by default. 
--- clang/test/Driver/spirv-openmp-toolchain.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/clang/test/Driver/spirv-openmp-toolchain.c b/clang/test/Driver/spirv-openmp-toolchain.c index 3eb1f22a03ed0..377b2d9be0b09 100644 --- a/clang/test/Driver/spirv-openmp-toolchain.c +++ b/clang/test/Driver/spirv-openmp-toolchain.c @@ -9,7 +9,7 @@ // CHECK: "-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}}"-emit-obj" // CHECK: clang-linker-wrapper{{.*}} "-o" "a.out" -// RUN: %clang -ccc-print-phases --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=spirv64-intel %s 2>&1 \ +// RUN: %clang -ccc-print-phases --target=x86_64-unknown-linux-gnu -fopenmp=libomp -fopenmp-targets=spirv64-intel %s 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-PHASES %s // CHECK-PHASES: 0: input, "[[INPUT:.+]]", c, (host-openmp) @@ -28,8 +28,8 @@ // CHECK-PHASES: 13: assembler, {12}, object, (host-openmp) // CHECK-PHASES: 14: clang-linker-wrapper, {13}, image, (host-openmp) -// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -fopenmp -fopenmp-targets=spirv64-intel -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-BINDINGS -// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -fopenmp -fopenmp-targets=spirv64-intel -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-BINDINGS +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -fopenmp=libomp -fopenmp-targets=spirv64-intel -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-BINDINGS +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -fopenmp=libomp -fopenmp-targets=spirv64-intel -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-BINDINGS // CHECK-BINDINGS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[HOST_BC:.+]]" // CHECK-BINDINGS: "spirv64-intel" - "clang", inputs: ["[[INPUT]]", "[[HOST_BC]]"], output: "[[DEVICE_TEMP_BC:.+]]" @@ -38,8 +38,8 @@ // CHECK-BINDINGS: 
"x86_64-unknown-linux-gnu" - "clang", inputs: ["[[HOST_BC]]", "[[DEVICE_IMAGE]]"], output: "[[HOST_OBJ:.+]]" // CHECK-BINDINGS: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[HOST_OBJ]]"], output: "a.out" -// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -save-temps -fopenmp -fopenmp-targets=spirv64-intel -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-BINDINGS-TEMPS -// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -save-temps -fopenmp -fopenmp-targets=spirv64-intel %s 2>&1 | FileCheck %s --check-prefix=CHECK-BINDINGS-TEMPS +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -save-temps -fopenmp=libomp -fopenmp-targets=spirv64-intel -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-BINDINGS-TEMPS +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -save-temps -fopenmp=libomp -fopenmp-targets=spirv64-intel %s 2>&1 | FileCheck %s --check-prefix=CHECK-BINDINGS-TEMPS // CHECK-BINDINGS-TEMPS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[HOST_PP:.+]]" // CHECK-BINDINGS-TEMPS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[HOST_PP]]"], output: "[[HOST_BC:.+]]" // CHECK-BINDINGS-TEMPS: "spirv64-intel" - "clang", inputs: ["[[INPUT]]"], output: "[[DEVICE_PP:.+]]" @@ -51,14 +51,14 @@ // CHECK-BINDINGS-TEMPS: "x86_64-unknown-linux-gnu" - "clang::as", inputs: ["[[HOST_ASM]]"], output: "[[HOST_OBJ:.+]]" // CHECK-BINDINGS-TEMPS: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[HOST_OBJ]]"], output: "a.out" -// RUN: %clang -### --target=x86_64-unknown-linux-gnu -emit-llvm -S -fopenmp -fopenmp-targets=spirv64-intel -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-EMIT-LLVM-IR +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -emit-llvm -S -fopenmp=libomp -fopenmp-targets=spirv64-intel -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-EMIT-LLVM-IR // CHECK-EMIT-LLVM-IR: "-cc1" "-triple" 
"spirv64-intel"{{.*}}"-emit-llvm-bc" -// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=spirv64-intel \ +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp -fopenmp-targets=spirv64-intel \ // RUN: --sysroot=%S/Inputs/spirv-openmp/ %s 2>&1 | FileCheck --check-prefix=CHECK-GPULIB %s // CHECK-GPULIB: "-cc1" "-triple" "spirv64-intel"{{.*}}"-mlink-builtin-bitcode" "{{.*}}libomptarget-spirv64.bc" -// RUN: not %clang -### --target=x86_64-unknown-linux-gnu -fopenmp --offload-arch=spirv64-intel \ +// RUN: not %clang -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=spirv64-intel \ // RUN: --libomptarget-spirv-bc-path=%t/ -nogpulib %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-OFFLOAD-ARCH-ERROR // CHECK-OFFLOAD-ARCH-ERROR: error: failed to deduce triple for target architecture 'spirv64-intel'; specify the triple using '-fopenmp-targets' and '-Xopenmp-target' instead From 6cd171dc3330a055a8d8a1ddff63631d42150b8a Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Fri, 3 Jan 2025 08:20:06 -0500 Subject: [PATCH 354/567] [lld/COFF] Support thin archives in /reproduce: files (#121512) This already worked without /wholearchive; now it works with it too. (Only for thin archives containing relative file names, matching the ELF and Mach-O ports.) --- lld/COFF/InputFiles.cpp | 8 ++++++++ lld/test/COFF/linkrepro-thin-archives.s | 23 +++++++++++++++++++++++ 2 files changed, 31 insertions(+) create mode 100644 lld/test/COFF/linkrepro-thin-archives.s diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp index e698f66b84f62..a94c984cfd487 100644 --- a/lld/COFF/InputFiles.cpp +++ b/lld/COFF/InputFiles.cpp @@ -149,11 +149,19 @@ std::vector lld::coff::getArchiveMembers(COFFLinkerContext &ctx, Archive *file) { std::vector v; Error err = Error::success(); + + // Thin archives refer to .o files, so --reproduces needs the .o files too. 
+ bool addToTar = file->isThin() && ctx.driver.tar; + for (const Archive::Child &c : file->children(err)) { MemoryBufferRef mbref = CHECK(c.getMemoryBufferRef(), file->getFileName() + ": could not get the buffer for a child of the archive"); + if (addToTar) { + ctx.driver.tar->append(relativeToRoot(check(c.getFullName())), + mbref.getBuffer()); + } v.push_back(mbref); } if (err) diff --git a/lld/test/COFF/linkrepro-thin-archives.s b/lld/test/COFF/linkrepro-thin-archives.s new file mode 100644 index 0000000000000..6fde36b84e0af --- /dev/null +++ b/lld/test/COFF/linkrepro-thin-archives.s @@ -0,0 +1,23 @@ +# REQUIRES: x86 + +# RUN: rm -rf %t.dir; split-file %s %t.dir + +# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-windows %t.dir/foo.s -o %t.dir/foo.obj +# RUN: cd %t.dir +# RUN: llvm-ar rcsT foo.lib foo.obj + +# RUN: lld-link foo.lib /out:/dev/null /reproduce:repro.tar \ +# RUN: /subsystem:console /machine:x64 +# RUN: tar tf repro.tar | FileCheck -DPATH='repro/%:t.dir' %s + +# RUN: lld-link /wholearchive foo.lib /out:/dev/null /reproduce:repro2.tar \ +# RUN: /subsystem:console /machine:x64 +# RUN: tar tf repro2.tar | FileCheck -DPATH='repro2/%:t.dir' %s + +# CHECK-DAG: [[PATH]]/foo.lib +# CHECK-DAG: [[PATH]]/foo.obj + +#--- foo.s +.globl mainCRTStartup +mainCRTStartup: + nop From e576c5bed79f8a9528391756c8475cc3a6276adf Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Fri, 3 Jan 2025 15:13:48 +0100 Subject: [PATCH 355/567] Fix an incorrect -show-graph command-line flag in COFF_comdat_weak_plus_strong.s test The flag -show-graph has been renamed to -show-graphs in 01bdd8cffcaf97636b5fb6ee4933e62c872528d3 --- .../JITLink/x86-64/COFF_comdat_weak_plus_strong.s | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_weak_plus_strong.s b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_weak_plus_strong.s index 2754855e428e0..01aac02f5286e 100644 --- 
a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_weak_plus_strong.s +++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_weak_plus_strong.s @@ -8,7 +8,7 @@ # # RUN: not llvm-jitlink -noexec %t/COFF_main.o %t/COFF_weak_1.o %t/COFF_strong.o \ # RUN: -slab-allocate 64Kb -slab-address 0xfff00000 \ -# RUN: -slab-page-size 4096 -show-graph 2>&1 | FileCheck %s +# RUN: -slab-page-size 4096 -show-graphs=".*" 2>&1 | FileCheck %s # # Check that a combination of comdat any definition and strong definition # generate duplicate definition error. From df859f90aab261918eee26382021e8455b532f7d Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Fri, 3 Jan 2025 08:36:34 -0600 Subject: [PATCH 356/567] [flang][OpenMP] Frontend support for NOTHING directive (#120606) Create OpenMPUtilityConstruct and put the two utility directives in it (error and nothing). Rename OpenMPErrorConstruct to OmpErrorDirective. --- .../FlangOmpReport/FlangOmpReportVisitor.cpp | 8 +++-- flang/include/flang/Parser/dump-parse-tree.h | 4 ++- flang/include/flang/Parser/parse-tree.h | 34 ++++++++++++++----- flang/lib/Lower/OpenMP/OpenMP.cpp | 4 +-- flang/lib/Parser/openmp-parsers.cpp | 18 +++++++--- flang/lib/Parser/unparse.cpp | 6 +++- flang/lib/Semantics/check-omp-structure.cpp | 4 +-- flang/lib/Semantics/check-omp-structure.h | 4 +-- flang/test/Lower/OpenMP/Todo/error.f90 | 2 +- flang/test/Parser/OpenMP/error-unparse.f90 | 6 ++-- flang/test/Parser/OpenMP/nothing.f90 | 13 +++++++ 11 files changed, 75 insertions(+), 28 deletions(-) create mode 100644 flang/test/Parser/OpenMP/nothing.f90 diff --git a/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp b/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp index 665b92be00898..231df63bbae92 100644 --- a/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp +++ b/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp @@ -90,6 +90,10 @@ SourcePosition OpenMPCounterVisitor::getLocation(const OpenMPConstruct &c) { const CharBlock 
&source{c.source}; return (parsing->allCooked().GetSourcePositionRange(source))->first; }, + [&](const OpenMPUtilityConstruct &c) -> SourcePosition { + const CharBlock &source{c.source}; + return (parsing->allCooked().GetSourcePositionRange(source))->first; + }, }, c.u); } @@ -143,8 +147,8 @@ std::string OpenMPCounterVisitor::getName(const OpenMPConstruct &c) { }, c.u); }, - [&](const OpenMPErrorConstruct &c) -> std::string { - const CharBlock &source{std::get<0>(c.t).source}; + [&](const OpenMPUtilityConstruct &c) -> std::string { + const CharBlock &source{c.source}; return normalize_construct_name(source.ToString()); }, [&](const OpenMPSectionConstruct &c) -> std::string { diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h index 7821d40a644a2..fa813727442f0 100644 --- a/flang/include/flang/Parser/dump-parse-tree.h +++ b/flang/include/flang/Parser/dump-parse-tree.h @@ -516,6 +516,8 @@ class ParseTreeDumper { #include "llvm/Frontend/OpenMP/OMP.inc" NODE(parser, OmpClauseList) NODE(parser, OmpCriticalDirective) + NODE(parser, OmpErrorDirective) + NODE(parser, OmpNothingDirective) NODE(parser, OmpDeclareTargetSpecifier) NODE(parser, OmpDeclareTargetWithClause) NODE(parser, OmpDeclareTargetWithList) @@ -662,7 +664,7 @@ class ParseTreeDumper { NODE(parser, OmpAtomicDefaultMemOrderClause) NODE_ENUM(common, OmpAtomicDefaultMemOrderType) NODE(parser, OpenMPDepobjConstruct) - NODE(parser, OpenMPErrorConstruct) + NODE(parser, OpenMPUtilityConstruct) NODE(parser, OpenMPFlushConstruct) NODE(parser, OpenMPLoopConstruct) NODE(parser, OpenMPExecutableAllocate) diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h index 2ef593b3e50da..9df7c6d5e39c3 100644 --- a/flang/include/flang/Parser/parse-tree.h +++ b/flang/include/flang/Parser/parse-tree.h @@ -4182,6 +4182,30 @@ struct OmpClauseList { // --- Directives and constructs +// Ref: [5.1:89-90], [5.2:216] +// +// nothing-directive -> +// 
NOTHING // since 5.1 +struct OmpNothingDirective { + using EmptyTrait = std::true_type; + COPY_AND_ASSIGN_BOILERPLATE(OmpNothingDirective); + CharBlock source; +}; + +// Ref: OpenMP [5.2:216-218] +// ERROR AT(compilation|execution) SEVERITY(fatal|warning) MESSAGE("msg-str) +struct OmpErrorDirective { + TUPLE_CLASS_BOILERPLATE(OmpErrorDirective); + CharBlock source; + std::tuple t; +}; + +struct OpenMPUtilityConstruct { + UNION_CLASS_BOILERPLATE(OpenMPUtilityConstruct); + CharBlock source; + std::variant u; +}; + // 2.7.2 SECTIONS // 2.11.2 PARALLEL SECTIONS struct OmpSectionsDirective { @@ -4506,14 +4530,6 @@ struct OpenMPDepobjConstruct { std::tuple t; }; -// Ref: OpenMP [5.2:216-218] -// ERROR AT(compilation|execution) SEVERITY(fatal|warning) MESSAGE("msg-str) -struct OpenMPErrorConstruct { - TUPLE_CLASS_BOILERPLATE(OpenMPErrorConstruct); - CharBlock source; - std::tuple t; -}; - // 2.17.8 flush -> FLUSH [memory-order-clause] [(variable-name-list)] struct OpenMPFlushConstruct { TUPLE_CLASS_BOILERPLATE(OpenMPFlushConstruct); @@ -4586,7 +4602,7 @@ struct OpenMPConstruct { UNION_CLASS_BOILERPLATE(OpenMPConstruct); std::variant u; diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index b07e89d201d19..fe6d82125a9e0 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -2907,8 +2907,8 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, - const parser::OpenMPErrorConstruct &) { - TODO(converter.getCurrentLocation(), "OpenMPErrorConstruct"); + const parser::OpenMPUtilityConstruct &) { + TODO(converter.getCurrentLocation(), "OpenMPUtilityConstruct"); } static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp index 
67385c03f66c8..0a0a29002de27 100644 --- a/flang/lib/Parser/openmp-parsers.cpp +++ b/flang/lib/Parser/openmp-parsers.cpp @@ -737,9 +737,20 @@ TYPE_PARSER( TYPE_PARSER(sourced(construct( many(maybe(","_tok) >> sourced(Parser{}))))) -// 2.1 (variable | /common-block | array-sections) +// 2.1 (variable | /common-block/ | array-sections) TYPE_PARSER(construct(nonemptyList(Parser{}))) +TYPE_PARSER(sourced(construct( + verbatim("ERROR"_tok), Parser{}))) + +TYPE_PARSER(sourced(construct("NOTHING" >> ok))) + +TYPE_PARSER(sourced(construct( + sourced(construct( + sourced(Parser{}))) || + sourced(construct( + sourced(Parser{})))))) + // Omp directives enclosing do loop TYPE_PARSER(sourced(construct(first( "DISTRIBUTE PARALLEL DO SIMD" >> @@ -1027,9 +1038,6 @@ TYPE_PARSER(sourced(construct(verbatim("CRITICAL"_tok), TYPE_PARSER(construct( Parser{}, block, Parser{})) -TYPE_PARSER(sourced(construct( - verbatim("ERROR"_tok), Parser{}))) - // 2.11.3 Executable Allocate directive TYPE_PARSER( sourced(construct(verbatim("ALLOCATE"_tok), @@ -1127,7 +1135,7 @@ TYPE_CONTEXT_PARSER("OpenMP construct"_en_US, // OpenMPStandaloneConstruct to resolve !$OMP ORDERED construct(Parser{}), construct(Parser{}), - construct(Parser{}), + construct(Parser{}), construct(Parser{}), construct(Parser{}), construct(Parser{}), diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp index 0a6af7435b4a2..4fe57f3e348d3 100644 --- a/flang/lib/Parser/unparse.cpp +++ b/flang/lib/Parser/unparse.cpp @@ -2710,11 +2710,15 @@ class UnparseVisitor { Walk(x.v); return false; } - void Unparse(const OpenMPErrorConstruct &x) { + void Unparse(const OmpErrorDirective &x) { Word("!$OMP ERROR "); Walk(x.t); Put("\n"); } + void Unparse(const OmpNothingDirective &x) { + Word("!$OMP NOTHING"); + Put("\n"); + } void Unparse(const OmpSectionsDirective &x) { switch (x.v) { case llvm::omp::Directive::OMPD_sections: diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp 
index 95b962f5daf57..3a928c8a0289b 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -1688,12 +1688,12 @@ void OmpStructureChecker::Leave(const parser::OpenMPDeclareTargetConstruct &x) { dirContext_.pop_back(); } -void OmpStructureChecker::Enter(const parser::OpenMPErrorConstruct &x) { +void OmpStructureChecker::Enter(const parser::OmpErrorDirective &x) { const auto &dir{std::get(x.t)}; PushContextAndClauseSets(dir.source, llvm::omp::Directive::OMPD_error); } -void OmpStructureChecker::Leave(const parser::OpenMPErrorConstruct &x) { +void OmpStructureChecker::Leave(const parser::OmpErrorDirective &x) { dirContext_.pop_back(); } diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h index 346a7bed9138f..2a4f6fbd618c3 100644 --- a/flang/lib/Semantics/check-omp-structure.h +++ b/flang/lib/Semantics/check-omp-structure.h @@ -102,8 +102,8 @@ class OmpStructureChecker void Enter(const parser::OmpDeclareTargetWithList &); void Enter(const parser::OmpDeclareTargetWithClause &); void Leave(const parser::OmpDeclareTargetWithClause &); - void Enter(const parser::OpenMPErrorConstruct &); - void Leave(const parser::OpenMPErrorConstruct &); + void Enter(const parser::OmpErrorDirective &); + void Leave(const parser::OmpErrorDirective &); void Enter(const parser::OpenMPExecutableAllocate &); void Leave(const parser::OpenMPExecutableAllocate &); void Enter(const parser::OpenMPAllocatorsConstruct &); diff --git a/flang/test/Lower/OpenMP/Todo/error.f90 b/flang/test/Lower/OpenMP/Todo/error.f90 index b97e2c20a0cdf..6d3bd892da47d 100644 --- a/flang/test/Lower/OpenMP/Todo/error.f90 +++ b/flang/test/Lower/OpenMP/Todo/error.f90 @@ -1,6 +1,6 @@ ! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -fopenmp-version=51 -o - %s 2>&1 | FileCheck %s -! CHECK: not yet implemented: OpenMPErrorConstruct +! 
CHECK: not yet implemented: OpenMPUtilityConstruct program p integer, allocatable :: x !$omp error at(compilation) severity(warning) message("an error") diff --git a/flang/test/Parser/OpenMP/error-unparse.f90 b/flang/test/Parser/OpenMP/error-unparse.f90 index fce5d8cf22863..4dd06b736da80 100644 --- a/flang/test/Parser/OpenMP/error-unparse.f90 +++ b/flang/test/Parser/OpenMP/error-unparse.f90 @@ -3,19 +3,19 @@ program main character(*), parameter :: message = "This is an error" !CHECK: !$OMP ERROR AT(COMPILATION) SEVERITY(WARNING) MESSAGE("some message here") - !PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPErrorConstruct + !PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPUtilityConstruct -> OmpErrorDirective !PARSE-TREE: OmpClauseList -> OmpClause -> At -> OmpAtClause -> ActionTime = Compilation !PARSE-TREE: OmpClause -> Severity -> OmpSeverityClause -> Severity = Warning !PARSE-TREE: OmpClause -> Message -> OmpMessageClause -> Expr -> LiteralConstant -> CharLiteralConstant !$omp error at(compilation) severity(warning) message("some message here") !CHECK: !$OMP ERROR AT(COMPILATION) SEVERITY(FATAL) MESSAGE(message) - !PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPErrorConstruct + !PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPUtilityConstruct -> OmpErrorDirective !PARSE-TREE: OmpClauseList -> OmpClause -> At -> OmpAtClause -> ActionTime = Compilation !PARSE-TREE: OmpClause -> Severity -> OmpSeverityClause -> Severity = Fatal !PARSE-TREE: OmpClause -> Message -> OmpMessageClause -> Expr -> Designator -> DataRef -> Name = 'message' !$omp error at(compilation) severity(fatal) message(message) !CHECK: !$OMP ERROR AT(EXECUTION) SEVERITY(FATAL) MESSAGE(message) - !PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPErrorConstruct + !PARSE-TREE: ExecutionPartConstruct -> 
ExecutableConstruct -> OpenMPConstruct -> OpenMPUtilityConstruct -> OmpErrorDirective !PARSE-TREE: OmpClauseList -> OmpClause -> At -> OmpAtClause -> ActionTime = Execution !PARSE-TREE: OmpClause -> Severity -> OmpSeverityClause -> Severity = Fatal !PARSE-TREE: OmpClause -> Message -> OmpMessageClause -> Expr -> Designator -> DataRef -> Name = 'message' diff --git a/flang/test/Parser/OpenMP/nothing.f90 b/flang/test/Parser/OpenMP/nothing.f90 new file mode 100644 index 0000000000000..80c0932087610 --- /dev/null +++ b/flang/test/Parser/OpenMP/nothing.f90 @@ -0,0 +1,13 @@ +!RUN: %flang_fc1 -fdebug-unparse -fopenmp -fopenmp-version=51 %s | FileCheck --ignore-case --check-prefix="UNPARSE" %s +!RUN: %flang_fc1 -fdebug-dump-parse-tree -fopenmp -fopenmp-version=51 %s | FileCheck --check-prefix="PARSE-TREE" %s + +subroutine f00 + !$omp nothing +end + +!UNPARSE: SUBROUTINE f00 +!UNPARSE: !$OMP NOTHING +!UNPARSE: END SUBROUTINE + +!PARSE-TREE: ExecutionPart -> Block +!PARSE-TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPUtilityConstruct -> OmpNothingDirective From 62b5cf041059a90215788a0bfefb8fc180fd0b5a Mon Sep 17 00:00:00 2001 From: Florian Mayer Date: Fri, 3 Jan 2025 06:37:45 -0800 Subject: [PATCH 357/567] [Vectorizer] precommit test for miscompilation (#120731) we generate GEPs that are out of bounds but mark them as "inbound" --- ...bounds-flags-for-reverse-vector-pointer.ll | 105 ++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll diff --git a/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll b/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll new file mode 100644 index 0000000000000..66bb9357750c8 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll @@ -0,0 +1,105 @@ +; NOTE: Assertions have 
been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +; RUN: opt -passes=loop-vectorize -force-vector-width=4 -S %s | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; FIXME: GEP flags on GEPs for reverse vector pointer need to be dropped when folding the tail. + +define i1 @fn(ptr %nno) #0 { +; CHECK-LABEL: define i1 @fn( +; CHECK-SAME: ptr [[NNO:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 10, [[INDEX]] +; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], splat (i64 10) +; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i64> [[VEC_IND]], splat (i64 1) +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw i32, ptr [[NNO]], i64 [[TMP22]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 -3 +; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> poison, 
<4 x i32> +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP6]], i32 4, <4 x i1> [[REVERSE]], <4 x i32> poison) +; CHECK-NEXT: [[REVERSE1:%.*]] = shufflevector <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shl <4 x i32> [[REVERSE1]], splat (i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = urem <4 x i32> [[TMP7]], splat (i32 10) +; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i1> [[TMP3]], splat (i1 true) +; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[TMP9]], <4 x i1> zeroinitializer +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[REVERSE1]], <4 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP11]] = or <4 x i32> [[PREDPHI]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP11]], <4 x i32> [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4) +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 12 +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP12]]) +; CHECK-NEXT: br i1 true, label [[FOR_END36:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -2, [[MIDDLE_BLOCK]] ], [ 10, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY20:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC35:%.*]] ] +; CHECK-NEXT: [[SUM_01:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUM_1:%.*]], [[FOR_INC35]] ] +; CHECK-NEXT: [[REM4:%.*]] = and i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[CMP21:%.*]] = icmp eq i64 [[REM4]], 0 +; CHECK-NEXT: [[GEP:%.*]] = 
getelementptr inbounds nuw i32, ptr [[NNO]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[GEP]], align 4 +; CHECK-NEXT: br i1 [[CMP21]], label [[IF_THEN22:%.*]], label [[FOR_INC35]] +; CHECK: if.then: +; CHECK-NEXT: [[MUL:%.*]] = shl i32 [[TMP15]], 1 +; CHECK-NEXT: [[REM27:%.*]] = urem i32 [[MUL]], 10 +; CHECK-NEXT: br label [[FOR_INC35]] +; CHECK: loop.latch: +; CHECK-NEXT: [[REM27_PN:%.*]] = phi i32 [ [[REM27]], [[IF_THEN22]] ], [ [[TMP15]], [[FOR_BODY20]] ] +; CHECK-NEXT: [[SUM_1]] = or i32 [[REM27_PN]], [[SUM_01]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 +; CHECK-NEXT: [[CMP19_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], 0 +; CHECK-NEXT: br i1 [[CMP19_NOT]], label [[FOR_END36]], label [[FOR_BODY20]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[SUM_1_LCSSA:%.*]] = phi i32 [ [[SUM_1]], [[FOR_INC35]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[CMP41:%.*]] = icmp eq i32 [[SUM_1_LCSSA]], 0 +; CHECK-NEXT: ret i1 [[CMP41]] +; +entry: + br label %loop.header + +loop.header: ; preds = %entry, %loop.latch + %iv = phi i64 [ 10, %entry ], [ %iv.next, %loop.latch ] + %sum.01 = phi i32 [ 0, %entry ], [ %sum.1, %loop.latch ] + %rem4 = and i64 %iv, 1 + %cmp21 = icmp eq i64 %rem4, 0 + %gep = getelementptr inbounds nuw i32, ptr %nno, i64 %iv + %0 = load i32, ptr %gep, align 4 + br i1 %cmp21, label %if.then, label %loop.latch + +if.then: ; preds = %loop.header + %mul = shl i32 %0, 1 + %rem27 = urem i32 %mul, 10 + br label %loop.latch + +loop.latch: ; preds = %loop.header, %if.then + %rem27.pn = phi i32 [ %rem27, %if.then ], [ %0, %loop.header ] + %sum.1 = or i32 %rem27.pn, %sum.01 + %iv.next = add nsw i64 %iv, -1 + %cmp19.not = icmp eq i64 %iv, 0 + br i1 %cmp19.not, label %exit, label %loop.header + +exit: ; preds = %loop.latch + %sum.1.lcssa = phi i32 [ %sum.1, %loop.latch ] + %cmp41 = icmp eq i32 %sum.1.lcssa, 0 + ret i1 %cmp41 +} + +attributes #0 = { "target-features"="+avx" } From 
9d6527bc12547e28b86d180b76fe934a96aa518e Mon Sep 17 00:00:00 2001 From: Acim Maravic Date: Fri, 3 Jan 2025 15:45:52 +0100 Subject: [PATCH 358/567] [CodeGen] Add MOTargetFlag4 to MachineMemOperand Flags (#120136) --- llvm/include/llvm/CodeGen/MachineMemOperand.h | 3 ++- llvm/lib/CodeGen/MachineOperand.cpp | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/llvm/include/llvm/CodeGen/MachineMemOperand.h b/llvm/include/llvm/CodeGen/MachineMemOperand.h index e2343abcc4ac1..2caa3bd30487a 100644 --- a/llvm/include/llvm/CodeGen/MachineMemOperand.h +++ b/llvm/include/llvm/CodeGen/MachineMemOperand.h @@ -152,8 +152,9 @@ class MachineMemOperand { MOTargetFlag1 = 1u << 6, MOTargetFlag2 = 1u << 7, MOTargetFlag3 = 1u << 8, + MOTargetFlag4 = 1u << 9, - LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ MOTargetFlag3) + LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ MOTargetFlag4) }; private: diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp index 3a9bdde28a2e7..5c9ca91e784e9 100644 --- a/llvm/lib/CodeGen/MachineOperand.cpp +++ b/llvm/lib/CodeGen/MachineOperand.cpp @@ -1170,6 +1170,9 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, if (getFlags() & MachineMemOperand::MOTargetFlag3) OS << '"' << getTargetMMOFlagName(*TII, MachineMemOperand::MOTargetFlag3) << "\" "; + if (getFlags() & MachineMemOperand::MOTargetFlag4) + OS << '"' << getTargetMMOFlagName(*TII, MachineMemOperand::MOTargetFlag4) + << "\" "; } else { if (getFlags() & MachineMemOperand::MOTargetFlag1) OS << "\"MOTargetFlag1\" "; @@ -1177,6 +1180,8 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, OS << "\"MOTargetFlag2\" "; if (getFlags() & MachineMemOperand::MOTargetFlag3) OS << "\"MOTargetFlag3\" "; + if (getFlags() & MachineMemOperand::MOTargetFlag4) + OS << "\"MOTargetFlag4\" "; } assert((isLoad() || isStore()) && From 3ace685105d3b50bca68328bf0c945af22d70f23 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Fri, 3 
Jan 2025 16:11:56 +0100 Subject: [PATCH 359/567] [mlir][Transforms] Support 1:N mappings in `ConversionValueMapping` (#116524) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit updates the internal `ConversionValueMapping` data structure in the dialect conversion driver to support 1:N replacements. This is the last major commit for adding 1:N support to the dialect conversion driver. Since #116470, the infrastructure already supports 1:N replacements. But the `ConversionValueMapping` still stored 1:1 value mappings. To that end, the driver inserted temporary argument materializations (converting N SSA values into 1 value). This is no longer the case. Argument materializations are now entirely gone. (They will be deleted from the type converter after some time, when we delete the old 1:N dialect conversion driver.) Note for LLVM integration: Replace all occurrences of `addArgumentMaterialization` (except for 1:N dialect conversion passes) with `addSourceMaterialization`. 
--------- Co-authored-by: Markus Böck --- .../lib/Optimizer/CodeGen/BoxedProcedure.cpp | 1 - mlir/docs/DialectConversion.md | 35 +- .../mlir/Transforms/DialectConversion.h | 18 +- .../Conversion/LLVMCommon/TypeConverter.cpp | 16 +- .../EmitC/Transforms/TypeConversions.cpp | 1 - .../Dialect/Linalg/Transforms/Detensorize.cpp | 1 - .../Quant/Transforms/StripFuncQuantTypes.cpp | 1 - .../Utils/SparseTensorDescriptor.cpp | 3 - .../Vector/Transforms/VectorLinearize.cpp | 1 - .../Transforms/Utils/DialectConversion.cpp | 477 +++++++++--------- mlir/test/Transforms/test-legalizer.mlir | 16 +- .../Func/TestDecomposeCallGraphTypes.cpp | 2 +- mlir/test/lib/Dialect/Test/TestPatterns.cpp | 9 - .../lib/Transforms/TestDialectConversion.cpp | 1 - 14 files changed, 268 insertions(+), 314 deletions(-) diff --git a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp index 1bb91d252529f..104ae7408b80c 100644 --- a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp +++ b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp @@ -172,7 +172,6 @@ class BoxprocTypeRewriter : public mlir::TypeConverter { addConversion([&](TypeDescType ty) { return TypeDescType::get(convertType(ty.getOfTy())); }); - addArgumentMaterialization(materializeProcedure); addSourceMaterialization(materializeProcedure); addTargetMaterialization(materializeProcedure); } diff --git a/mlir/docs/DialectConversion.md b/mlir/docs/DialectConversion.md index 3168f5e13c751..abacd5a82c61e 100644 --- a/mlir/docs/DialectConversion.md +++ b/mlir/docs/DialectConversion.md @@ -242,19 +242,6 @@ cannot. These materializations are used by the conversion framework to ensure type safety during the conversion process. There are several types of materializations depending on the situation. -* Argument Materialization - - - An argument materialization is used when converting the type of a block - argument during a [signature conversion](#region-signature-conversion). 
- The new block argument types are specified in a `SignatureConversion` - object. An original block argument can be converted into multiple - block arguments, which is not supported everywhere in the dialect - conversion. (E.g., adaptors support only a single replacement value for - each original value.) Therefore, an argument materialization is used to - convert potentially multiple new block arguments back into a single SSA - value. An argument materialization is also used when replacing an op - result with multiple values. - * Source Materialization - A source materialization is used when a value was replaced with a value @@ -343,17 +330,6 @@ class TypeConverter { /// Materialization functions must be provided when a type conversion may /// persist after the conversion has finished. - /// This method registers a materialization that will be called when - /// converting (potentially multiple) block arguments that were the result of - /// a signature conversion of a single block argument, to a single SSA value - /// with the old argument type. - template ::template arg_t<1>> - void addArgumentMaterialization(FnT &&callback) { - argumentMaterializations.emplace_back( - wrapMaterialization(std::forward(callback))); - } - /// This method registers a materialization that will be called when /// converting a replacement value back to its original source type. /// This is used when some uses of the original value persist beyond the main @@ -406,12 +382,11 @@ done explicitly via a conversion pattern. To convert the types of block arguments within a Region, a custom hook on the `ConversionPatternRewriter` must be invoked; `convertRegionTypes`. This hook uses a provided type converter to apply type conversions to all blocks of a -given region. As noted above, the conversions performed by this method use the -argument materialization hook on the `TypeConverter`. 
This hook also takes an -optional `TypeConverter::SignatureConversion` parameter that applies a custom -conversion to the entry block of the region. The types of the entry block -arguments are often tied semantically to the operation, e.g., -`func::FuncOp`, `AffineForOp`, etc. +given region. This hook also takes an optional +`TypeConverter::SignatureConversion` parameter that applies a custom conversion +to the entry block of the region. The types of the entry block arguments are +often tied semantically to the operation, e.g., `func::FuncOp`, `AffineForOp`, +etc. To convert the signature of just one given block, the `applySignatureConversion` hook can be used. diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h index 28150e886913e..9a6975dcf8dfa 100644 --- a/mlir/include/mlir/Transforms/DialectConversion.h +++ b/mlir/include/mlir/Transforms/DialectConversion.h @@ -181,6 +181,10 @@ class TypeConverter { /// converting (potentially multiple) block arguments that were the result of /// a signature conversion of a single block argument, to a single SSA value /// with the old block argument type. + /// + /// Note: Argument materializations are used only with the 1:N dialect + /// conversion driver. The 1:N dialect conversion driver will be removed soon + /// and so will be argument materializations. template >::template arg_t<1>> void addArgumentMaterialization(FnT &&callback) { @@ -880,15 +884,7 @@ class ConversionPatternRewriter final : public PatternRewriter { void replaceOp(Operation *op, Operation *newOp) override; /// Replace the given operation with the new value ranges. The number of op - /// results and value ranges must match. If an original SSA value is replaced - /// by multiple SSA values (i.e., a value range has more than 1 element), the - /// conversion driver will insert an argument materialization to convert the - /// N SSA values back into 1 SSA value of the original type. 
The given - /// operation is erased. - /// - /// Note: The argument materialization is a workaround until we have full 1:N - /// support in the dialect conversion. (It is going to disappear from both - /// `replaceOpWithMultiple` and `applySignatureConversion`.) + /// results and value ranges must match. The given operation is erased. void replaceOpWithMultiple(Operation *op, ArrayRef newValues); /// PatternRewriter hook for erasing a dead operation. The uses of this @@ -1285,8 +1281,8 @@ struct ConversionConfig { // represented at the moment. RewriterBase::Listener *listener = nullptr; - /// If set to "true", the dialect conversion attempts to build source/target/ - /// argument materializations through the type converter API in lieu of + /// If set to "true", the dialect conversion attempts to build source/target + /// materializations through the type converter API in lieu of /// "builtin.unrealized_conversion_cast ops". The conversion process fails if /// at least one materialization could not be built. /// diff --git a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp index 49e2d94328664..72799e42cf3fd 100644 --- a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp +++ b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp @@ -85,7 +85,7 @@ static Value unrankedMemRefMaterialization(OpBuilder &builder, UnrankedMemRefType resultType, ValueRange inputs, Location loc, const LLVMTypeConverter &converter) { - // An argument materialization must return a value of type + // A source materialization must return a value of type // `resultType`, so insert a cast from the memref descriptor type // (!llvm.struct) to the original memref type. 
Value packed = @@ -101,7 +101,7 @@ static Value rankedMemRefMaterialization(OpBuilder &builder, MemRefType resultType, ValueRange inputs, Location loc, const LLVMTypeConverter &converter) { - // An argument materialization must return a value of type `resultType`, + // A source materialization must return a value of type `resultType`, // so insert a cast from the memref descriptor type (!llvm.struct) to the // original memref type. Value packed = @@ -234,19 +234,9 @@ LLVMTypeConverter::LLVMTypeConverter(MLIRContext *ctx, .getResult(0); }); - // Argument materializations convert from the new block argument types + // Source materializations convert from the new block argument types // (multiple SSA values that make up a memref descriptor) back to the // original block argument type. - addArgumentMaterialization([&](OpBuilder &builder, - UnrankedMemRefType resultType, - ValueRange inputs, Location loc) { - return unrankedMemRefMaterialization(builder, resultType, inputs, loc, - *this); - }); - addArgumentMaterialization([&](OpBuilder &builder, MemRefType resultType, - ValueRange inputs, Location loc) { - return rankedMemRefMaterialization(builder, resultType, inputs, loc, *this); - }); addSourceMaterialization([&](OpBuilder &builder, UnrankedMemRefType resultType, ValueRange inputs, Location loc) { diff --git a/mlir/lib/Dialect/EmitC/Transforms/TypeConversions.cpp b/mlir/lib/Dialect/EmitC/Transforms/TypeConversions.cpp index 0b3a494794f3f..72c8fd0f32485 100644 --- a/mlir/lib/Dialect/EmitC/Transforms/TypeConversions.cpp +++ b/mlir/lib/Dialect/EmitC/Transforms/TypeConversions.cpp @@ -33,7 +33,6 @@ void mlir::populateEmitCSizeTTypeConversions(TypeConverter &converter) { converter.addSourceMaterialization(materializeAsUnrealizedCast); converter.addTargetMaterialization(materializeAsUnrealizedCast); - converter.addArgumentMaterialization(materializeAsUnrealizedCast); } /// Get an unsigned integer or size data type corresponding to \p ty. 
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Detensorize.cpp b/mlir/lib/Dialect/Linalg/Transforms/Detensorize.cpp index 0e651f4cee4c3..fc6671ef81175 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Detensorize.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Detensorize.cpp @@ -154,7 +154,6 @@ class DetensorizeTypeConverter : public TypeConverter { }); addSourceMaterialization(sourceMaterializationCallback); - addArgumentMaterialization(sourceMaterializationCallback); } }; diff --git a/mlir/lib/Dialect/Quant/Transforms/StripFuncQuantTypes.cpp b/mlir/lib/Dialect/Quant/Transforms/StripFuncQuantTypes.cpp index 6191272266283..71b88d1be1b05 100644 --- a/mlir/lib/Dialect/Quant/Transforms/StripFuncQuantTypes.cpp +++ b/mlir/lib/Dialect/Quant/Transforms/StripFuncQuantTypes.cpp @@ -56,7 +56,6 @@ class QuantizedTypeConverter : public TypeConverter { addConversion(convertQuantizedType); addConversion(convertTensorType); - addArgumentMaterialization(materializeConversion); addSourceMaterialization(materializeConversion); addTargetMaterialization(materializeConversion); } diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorDescriptor.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorDescriptor.cpp index 834e3634cc130..8bbb2cac5efdf 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorDescriptor.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorDescriptor.cpp @@ -69,9 +69,6 @@ SparseTensorTypeToBufferConverter::SparseTensorTypeToBufferConverter() { // Required by scf.for 1:N type conversion. addSourceMaterialization(materializeTuple); - - // Required as a workaround until we have full 1:N support. 
- addArgumentMaterialization(materializeTuple); } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp index 757631944f224..68535ae5a7a5c 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp @@ -481,7 +481,6 @@ void mlir::vector::populateVectorLinearizeTypeConversionsAndLegality( return builder.create(loc, type, inputs.front()); }; - typeConverter.addArgumentMaterialization(materializeCast); typeConverter.addSourceMaterialization(materializeCast); typeConverter.addTargetMaterialization(materializeCast); target.markUnknownOpDynamicallyLegal( diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 2b006430d3817..0c5520988eff3 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -11,6 +11,7 @@ #include "mlir/IR/Block.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/Dominance.h" #include "mlir/IR/IRMapping.h" #include "mlir/IR/Iterators.h" #include "mlir/Interfaces/FunctionInterfaces.h" @@ -53,6 +54,55 @@ static void logFailure(llvm::ScopedPrinter &os, StringRef fmt, Args &&...args) { }); } +/// Given two insertion points in the same block, choose the later one. +static OpBuilder::InsertPoint +chooseLaterInsertPointInBlock(OpBuilder::InsertPoint a, + OpBuilder::InsertPoint b) { + assert(a.getBlock() == b.getBlock() && "expected same block"); + Block *block = a.getBlock(); + if (a.getPoint() == block->begin()) + return b; + if (b.getPoint() == block->begin()) + return a; + if (a.getPoint()->isBeforeInBlock(&*b.getPoint())) + return b; + return a; +} + +/// Helper function that chooses the insertion point among the two given ones +/// that is later. 
+// TODO: Extend DominanceInfo API to work with block iterators.
+static OpBuilder::InsertPoint chooseLaterInsertPoint(OpBuilder::InsertPoint a,
+                                                     OpBuilder::InsertPoint b) {
+  // Case 1: Fast path: Same block. This is the most common case.
+  if (LLVM_LIKELY(a.getBlock() == b.getBlock()))
+    return chooseLaterInsertPointInBlock(a, b);
+
+  // Case 2: Different block, but same region.
+  if (a.getBlock()->getParent() == b.getBlock()->getParent()) {
+    DominanceInfo domInfo;
+    if (domInfo.properlyDominates(a.getBlock(), b.getBlock()))
+      return b;
+    if (domInfo.properlyDominates(b.getBlock(), a.getBlock()))
+      return a;
+    // Neither of the two blocks dominate each other.
+    llvm_unreachable("unable to find valid insertion point");
+  }
+
+  // Case 3: b's region contains a: choose a.
+  if (b.getBlock()->getParent()->findAncestorOpInRegion(
+          *a.getPoint()->getParentOp()))
+    return a;
+
+  // Case 4: a's region contains b: choose b.
+  if (a.getBlock()->getParent()->findAncestorOpInRegion(
+          *b.getPoint()->getParentOp()))
+    return b;
+
+  // Neither of the two operations contain each other.
+  llvm_unreachable("unable to find valid insertion point");
+}
+
 /// Helper function that computes an insertion point where the given value is
 /// defined and can be used without a dominance violation.
 static OpBuilder::InsertPoint computeInsertPoint(Value value) {
@@ -63,11 +113,38 @@ static OpBuilder::InsertPoint computeInsertPoint(Value value) {
   return OpBuilder::InsertPoint(insertBlock, insertPt);
 }
 
+/// Helper function that computes an insertion point where the given values are
+/// defined and can be used without a dominance violation.
+static OpBuilder::InsertPoint computeInsertPoint(ArrayRef vals) { + assert(!vals.empty() && "expected at least one value"); + OpBuilder::InsertPoint pt = computeInsertPoint(vals.front()); + for (Value v : vals.drop_front()) + pt = chooseLaterInsertPoint(pt, computeInsertPoint(v)); + return pt; +} + //===----------------------------------------------------------------------===// // ConversionValueMapping //===----------------------------------------------------------------------===// +/// A vector of SSA values, optimized for the most common case of a single +/// value. +using ValueVector = SmallVector; + namespace { + +/// Helper class to make it possible to use `ValueVector` as a key in DenseMap. +struct ValueVectorMapInfo { + static ValueVector getEmptyKey() { return ValueVector{}; } + static ValueVector getTombstoneKey() { return ValueVector{}; } + static ::llvm::hash_code getHashValue(const ValueVector &val) { + return ::llvm::hash_combine_range(val.begin(), val.end()); + } + static bool isEqual(const ValueVector &LHS, const ValueVector &RHS) { + return LHS == RHS; + } +}; + /// This class wraps a IRMapping to provide recursive lookup /// functionality, i.e. we will traverse if the mapped value also has a mapping. struct ConversionValueMapping { @@ -75,68 +152,129 @@ struct ConversionValueMapping { /// false positives. bool isMappedTo(Value value) const { return mappedTo.contains(value); } - /// Lookup the most recently mapped value with the desired type in the + /// Lookup the most recently mapped values with the desired types in the /// mapping. /// /// Special cases: - /// - If the desired type is "null", simply return the most recently mapped + /// - If the desired type range is empty, simply return the most recently + /// mapped values. + /// - If there is no mapping to the desired types, also return the most + /// recently mapped values. + /// - If there is no mapping for the given values at all, return the given /// value. 
- /// - If there is no mapping to the desired type, also return the most - /// recently mapped value. - /// - If there is no mapping for the given value at all, return the given - /// value. - Value lookupOrDefault(Value from, Type desiredType = nullptr) const; + ValueVector lookupOrDefault(Value from, TypeRange desiredTypes = {}) const; + + /// Lookup the given value within the map, or return an empty vector if the + /// value is not mapped. If it is mapped, this follows the same behavior + /// as `lookupOrDefault`. + ValueVector lookupOrNull(Value from, TypeRange desiredTypes = {}) const; - /// Lookup a mapped value within the map, or return null if a mapping does not - /// exist. If a mapping exists, this follows the same behavior of - /// `lookupOrDefault`. - Value lookupOrNull(Value from, Type desiredType = nullptr) const; + template + struct IsValueVector : std::is_same, ValueVector> {}; - /// Map a value to the one provided. - void map(Value oldVal, Value newVal) { + /// Map a value vector to the one provided. + template + std::enable_if_t::value && IsValueVector::value> + map(OldVal &&oldVal, NewVal &&newVal) { LLVM_DEBUG({ - for (Value it = newVal; it; it = mapping.lookupOrNull(it)) - assert(it != oldVal && "inserting cyclic mapping"); + ValueVector next(newVal); + while (true) { + assert(next != oldVal && "inserting cyclic mapping"); + auto it = mapping.find(next); + if (it == mapping.end()) + break; + next = it->second; + } }); - mapping.map(oldVal, newVal); - mappedTo.insert(newVal); + for (Value v : newVal) + mappedTo.insert(v); + + mapping[std::forward(oldVal)] = std::forward(newVal); + } + + /// Map a value vector or single value to the one provided. 
+ template + std::enable_if_t::value || + !IsValueVector::value> + map(OldVal &&oldVal, NewVal &&newVal) { + if constexpr (IsValueVector{}) { + map(std::forward(oldVal), ValueVector{newVal}); + } else if constexpr (IsValueVector{}) { + map(ValueVector{oldVal}, std::forward(newVal)); + } else { + map(ValueVector{oldVal}, ValueVector{newVal}); + } } - /// Drop the last mapping for the given value. - void erase(Value value) { mapping.erase(value); } + /// Drop the last mapping for the given values. + void erase(const ValueVector &value) { mapping.erase(value); } private: /// Current value mappings. - IRMapping mapping; + DenseMap mapping; /// All SSA values that are mapped to. May contain false positives. DenseSet mappedTo; }; } // namespace -Value ConversionValueMapping::lookupOrDefault(Value from, - Type desiredType) const { - // Try to find the deepest value that has the desired type. If there is no - // such value, simply return the deepest value. - Value desiredValue; +ValueVector +ConversionValueMapping::lookupOrDefault(Value from, + TypeRange desiredTypes) const { + // Try to find the deepest values that have the desired types. If there is no + // such mapping, simply return the deepest values. + ValueVector desiredValue; + ValueVector current{from}; do { - if (!desiredType || from.getType() == desiredType) - desiredValue = from; + // Store the current value if the types match. + if (TypeRange(current) == desiredTypes) + desiredValue = current; + + // If possible, Replace each value with (one or multiple) mapped values. + ValueVector next; + for (Value v : current) { + auto it = mapping.find({v}); + if (it != mapping.end()) { + llvm::append_range(next, it->second); + } else { + next.push_back(v); + } + } + if (next != current) { + // If at least one value was replaced, continue the lookup from there. 
+ current = std::move(next); + continue; + } - Value mappedValue = mapping.lookupOrNull(from); - if (!mappedValue) + // Otherwise: Check if there is a mapping for the entire vector. Such + // mappings are materializations. (N:M mapping are not supported for value + // replacements.) + // + // Note: From a correctness point of view, materializations do not have to + // be stored (and looked up) in the mapping. But for performance reasons, + // we choose to reuse existing IR (when possible) instead of creating it + // multiple times. + auto it = mapping.find(current); + if (it == mapping.end()) { + // No mapping found: The lookup stops here. break; - from = mappedValue; + } + current = it->second; } while (true); - // If the desired value was found use it, otherwise default to the leaf value. - return desiredValue ? desiredValue : from; + // If the desired values were found use them, otherwise default to the leaf + // values. + // Note: If `desiredTypes` is empty, this function always returns `current`. + return !desiredValue.empty() ? std::move(desiredValue) : std::move(current); } -Value ConversionValueMapping::lookupOrNull(Value from, Type desiredType) const { - Value result = lookupOrDefault(from, desiredType); - if (result == from || (desiredType && result.getType() != desiredType)) - return nullptr; +ValueVector ConversionValueMapping::lookupOrNull(Value from, + TypeRange desiredTypes) const { + ValueVector result = lookupOrDefault(from, desiredTypes); + TypeRange resultTypes(result); + if (result == ValueVector{from} || + (!desiredTypes.empty() && resultTypes != desiredTypes)) + return {}; return result; } @@ -651,10 +789,6 @@ class CreateOperationRewrite : public OperationRewrite { /// The type of materialization. enum MaterializationKind { - /// This materialization materializes a conversion for an illegal block - /// argument type, to the original one. - Argument, - /// This materialization materializes a conversion from an illegal type to a /// legal one. 
Target, @@ -673,7 +807,7 @@ class UnresolvedMaterializationRewrite : public OperationRewrite { UnrealizedConversionCastOp op, const TypeConverter *converter, MaterializationKind kind, Type originalType, - Value mappedValue); + ValueVector mappedValues); static bool classof(const IRRewrite *rewrite) { return rewrite->getKind() == Kind::UnresolvedMaterialization; @@ -708,9 +842,9 @@ class UnresolvedMaterializationRewrite : public OperationRewrite { /// materializations. Type originalType; - /// The value in the conversion value mapping that is being replaced by the + /// The values in the conversion value mapping that are being replaced by the /// results of this unresolved materialization. - Value mappedValue; + ValueVector mappedValues; }; } // namespace @@ -779,7 +913,7 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { LogicalResult remapValues(StringRef valueDiagTag, std::optional inputLoc, PatternRewriter &rewriter, ValueRange values, - SmallVector> &remapped); + SmallVector &remapped); /// Return "true" if the given operation is ignored, and does not need to be /// converted. @@ -820,39 +954,14 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// If a cast op was built, it can optionally be returned with the `castOp` /// output argument. /// - /// If `valueToMap` is set to a non-null Value, then that value is mapped to + /// If `valuesToMap` is set to a non-null Value, then that value is mapped to /// the results of the unresolved materialization in the conversion value /// mapping. 
ValueRange buildUnresolvedMaterialization( MaterializationKind kind, OpBuilder::InsertPoint ip, Location loc, - Value valueToMap, ValueRange inputs, TypeRange outputTypes, + ValueVector valuesToMap, ValueRange inputs, TypeRange outputTypes, Type originalType, const TypeConverter *converter, UnrealizedConversionCastOp *castOp = nullptr); - Value buildUnresolvedMaterialization( - MaterializationKind kind, OpBuilder::InsertPoint ip, Location loc, - Value valueToMap, ValueRange inputs, Type outputType, Type originalType, - const TypeConverter *converter, - UnrealizedConversionCastOp *castOp = nullptr) { - return buildUnresolvedMaterialization(kind, ip, loc, valueToMap, inputs, - TypeRange(outputType), originalType, - converter, castOp) - .front(); - } - - /// Build an N:1 materialization for the given original value that was - /// replaced with the given replacement values. - /// - /// This is a workaround around incomplete 1:N support in the dialect - /// conversion driver. The conversion mapping can store only 1:1 replacements - /// and the conversion patterns only support single Value replacements in the - /// adaptor, so N values must be converted back to a single value. This - /// function will be deleted when full 1:N support has been added. - /// - /// This function inserts an argument materialization back to the original - /// type. - void insertNTo1Materialization(OpBuilder::InsertPoint ip, Location loc, - ValueRange replacements, Value originalValue, - const TypeConverter *converter); /// Find a replacement value for the given SSA value in the conversion value /// mapping. The replacement value must have the same type as the given SSA @@ -862,16 +971,6 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { Value findOrBuildReplacementValue(Value value, const TypeConverter *converter); - /// Unpack an N:1 materialization and return the inputs of the - /// materialization. 
This function unpacks only those materializations that - /// were built with `insertNTo1Materialization`. - /// - /// This is a workaround around incomplete 1:N support in the dialect - /// conversion driver. It allows us to write 1:N conversion patterns while - /// 1:N support is still missing in the conversion value mapping. This - /// function will be deleted when full 1:N support has been added. - SmallVector unpackNTo1Materialization(Value value); - //===--------------------------------------------------------------------===// // Rewriter Notification Hooks //===--------------------------------------------------------------------===// @@ -1041,7 +1140,7 @@ void ReplaceBlockArgRewrite::commit(RewriterBase &rewriter) { }); } -void ReplaceBlockArgRewrite::rollback() { rewriterImpl.mapping.erase(arg); } +void ReplaceBlockArgRewrite::rollback() { rewriterImpl.mapping.erase({arg}); } void ReplaceOperationRewrite::commit(RewriterBase &rewriter) { auto *listener = @@ -1082,7 +1181,7 @@ void ReplaceOperationRewrite::commit(RewriterBase &rewriter) { void ReplaceOperationRewrite::rollback() { for (auto result : op->getResults()) - rewriterImpl.mapping.erase(result); + rewriterImpl.mapping.erase({result}); } void ReplaceOperationRewrite::cleanup(RewriterBase &rewriter) { @@ -1101,18 +1200,18 @@ void CreateOperationRewrite::rollback() { UnresolvedMaterializationRewrite::UnresolvedMaterializationRewrite( ConversionPatternRewriterImpl &rewriterImpl, UnrealizedConversionCastOp op, const TypeConverter *converter, MaterializationKind kind, Type originalType, - Value mappedValue) + ValueVector mappedValues) : OperationRewrite(Kind::UnresolvedMaterialization, rewriterImpl, op), converterAndKind(converter, kind), originalType(originalType), - mappedValue(mappedValue) { + mappedValues(std::move(mappedValues)) { assert((!originalType || kind == MaterializationKind::Target) && "original type is valid only for target materializations"); rewriterImpl.unresolvedMaterializations[op] = 
this; } void UnresolvedMaterializationRewrite::rollback() { - if (mappedValue) - rewriterImpl.mapping.erase(mappedValue); + if (!mappedValues.empty()) + rewriterImpl.mapping.erase(mappedValues); rewriterImpl.unresolvedMaterializations.erase(getOperation()); rewriterImpl.nTo1TempMaterializations.erase(getOperation()); op->erase(); @@ -1160,7 +1259,7 @@ void ConversionPatternRewriterImpl::undoRewrites(unsigned numRewritesToKeep) { LogicalResult ConversionPatternRewriterImpl::remapValues( StringRef valueDiagTag, std::optional inputLoc, PatternRewriter &rewriter, ValueRange values, - SmallVector> &remapped) { + SmallVector &remapped) { remapped.reserve(llvm::size(values)); for (const auto &it : llvm::enumerate(values)) { @@ -1168,18 +1267,11 @@ LogicalResult ConversionPatternRewriterImpl::remapValues( Type origType = operand.getType(); Location operandLoc = inputLoc ? *inputLoc : operand.getLoc(); - // Find the most recently mapped value. Unpack all temporary N:1 - // materializations. Such conversions are a workaround around missing - // 1:N support in the ConversionValueMapping. (The conversion patterns - // already support 1:N replacements.) - Value repl = mapping.lookupOrDefault(operand); - SmallVector unpacked = unpackNTo1Materialization(repl); - if (!currentTypeConverter) { // The current pattern does not have a type converter. I.e., it does not // distinguish between legal and illegal types. For each operand, simply - // pass through the most recently mapped value. - remapped.push_back(std::move(unpacked)); + // pass through the most recently mapped values. + remapped.push_back(mapping.lookupOrDefault(operand)); continue; } @@ -1192,51 +1284,28 @@ LogicalResult ConversionPatternRewriterImpl::remapValues( }); return failure(); } - // If a type is converted to 0 types, there is nothing to do. if (legalTypes.empty()) { remapped.push_back({}); continue; } - if (legalTypes.size() != 1) { - // TODO: This is a 1:N conversion. 
The conversion value mapping does not - // store such materializations yet. If the types of the most recently - // mapped values do not match, build a target materialization. - ValueRange unpackedRange(unpacked); - if (TypeRange(unpackedRange) == legalTypes) { - remapped.push_back(std::move(unpacked)); - continue; - } - - // Insert a target materialization if the current pattern expects - // different legalized types. - ValueRange targetMat = buildUnresolvedMaterialization( - MaterializationKind::Target, computeInsertPoint(repl), operandLoc, - /*valueToMap=*/Value(), /*inputs=*/unpacked, - /*outputTypes=*/legalTypes, /*originalType=*/origType, - currentTypeConverter); - remapped.push_back(targetMat); + ValueVector repl = mapping.lookupOrDefault(operand, legalTypes); + if (!repl.empty() && TypeRange(repl) == legalTypes) { + // Mapped values have the correct type or there is an existing + // materialization. Or the operand is not mapped at all and has the + // correct type. + remapped.push_back(std::move(repl)); continue; } - // Handle 1->1 type conversions. - Type desiredType = legalTypes.front(); - // Try to find a mapped value with the desired type. (Or the operand itself - // if the value is not mapped at all.) - Value newOperand = mapping.lookupOrDefault(operand, desiredType); - if (newOperand.getType() != desiredType) { - // If the looked up value's type does not have the desired type, it means - // that the value was replaced with a value of different type and no - // target materialization was created yet. - Value castValue = buildUnresolvedMaterialization( - MaterializationKind::Target, computeInsertPoint(newOperand), - operandLoc, /*valueToMap=*/newOperand, /*inputs=*/unpacked, - /*outputType=*/desiredType, /*originalType=*/origType, - currentTypeConverter); - newOperand = castValue; - } - remapped.push_back({newOperand}); + // Create a materialization for the most recently mapped values. 
+ repl = mapping.lookupOrDefault(operand); + ValueRange castValues = buildUnresolvedMaterialization( + MaterializationKind::Target, computeInsertPoint(repl), operandLoc, + /*valuesToMap=*/repl, /*inputs=*/repl, /*outputTypes=*/legalTypes, + /*originalType=*/origType, currentTypeConverter); + remapped.push_back(castValues); } return success(); } @@ -1353,7 +1422,7 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion( buildUnresolvedMaterialization( MaterializationKind::Source, OpBuilder::InsertPoint(newBlock, newBlock->begin()), origArg.getLoc(), - /*valueToMap=*/origArg, /*inputs=*/ValueRange(), + /*valuesToMap=*/{origArg}, /*inputs=*/ValueRange(), /*outputType=*/origArgType, /*originalType=*/Type(), converter); appendRewrite(block, origArg, converter); continue; @@ -1369,19 +1438,11 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion( continue; } - // This is a 1->1+ mapping. 1->N mappings are not fully supported in the - // dialect conversion. Therefore, we need an argument materialization to - // turn the replacement block arguments into a single SSA value that can be - // used as a replacement. + // This is a 1->1+ mapping. auto replArgs = newBlock->getArguments().slice(inputMap->inputNo, inputMap->size); - if (replArgs.size() == 1) { - mapping.map(origArg, replArgs.front()); - } else { - insertNTo1Materialization( - OpBuilder::InsertPoint(newBlock, newBlock->begin()), origArg.getLoc(), - /*replacements=*/replArgs, /*outputValue=*/origArg, converter); - } + ValueVector replArgVals = llvm::to_vector_of(replArgs); + mapping.map(origArg, std::move(replArgVals)); appendRewrite(block, origArg, converter); } @@ -1402,7 +1463,7 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion( /// of input operands. 
ValueRange ConversionPatternRewriterImpl::buildUnresolvedMaterialization( MaterializationKind kind, OpBuilder::InsertPoint ip, Location loc, - Value valueToMap, ValueRange inputs, TypeRange outputTypes, + ValueVector valuesToMap, ValueRange inputs, TypeRange outputTypes, Type originalType, const TypeConverter *converter, UnrealizedConversionCastOp *castOp) { assert((!originalType || kind == MaterializationKind::Target) && @@ -1410,10 +1471,8 @@ ValueRange ConversionPatternRewriterImpl::buildUnresolvedMaterialization( // Avoid materializing an unnecessary cast. if (TypeRange(inputs) == outputTypes) { - if (valueToMap) { - assert(inputs.size() == 1 && "1:N mapping is not supported"); - mapping.map(valueToMap, inputs.front()); - } + if (!valuesToMap.empty()) + mapping.map(std::move(valuesToMap), inputs); return inputs; } @@ -1423,37 +1482,21 @@ ValueRange ConversionPatternRewriterImpl::buildUnresolvedMaterialization( builder.setInsertionPoint(ip.getBlock(), ip.getPoint()); auto convertOp = builder.create(loc, outputTypes, inputs); - if (valueToMap) { - assert(outputTypes.size() == 1 && "1:N mapping is not supported"); - mapping.map(valueToMap, convertOp.getResult(0)); - } + if (!valuesToMap.empty()) + mapping.map(valuesToMap, convertOp.getResults()); if (castOp) *castOp = convertOp; - appendRewrite(convertOp, converter, kind, - originalType, valueToMap); + appendRewrite( + convertOp, converter, kind, originalType, std::move(valuesToMap)); return convertOp.getResults(); } -void ConversionPatternRewriterImpl::insertNTo1Materialization( - OpBuilder::InsertPoint ip, Location loc, ValueRange replacements, - Value originalValue, const TypeConverter *converter) { - // Insert argument materialization back to the original type. 
- Type originalType = originalValue.getType(); - UnrealizedConversionCastOp argCastOp; - buildUnresolvedMaterialization( - MaterializationKind::Argument, ip, loc, /*valueToMap=*/originalValue, - /*inputs=*/replacements, originalType, - /*originalType=*/Type(), converter, &argCastOp); - if (argCastOp) - nTo1TempMaterializations.insert(argCastOp); -} - Value ConversionPatternRewriterImpl::findOrBuildReplacementValue( Value value, const TypeConverter *converter) { // Find a replacement value with the same type. - Value repl = mapping.lookupOrNull(value, value.getType()); - if (repl) - return repl; + ValueVector repl = mapping.lookupOrNull(value, value.getType()); + if (!repl.empty()) + return repl.front(); // Check if the value is dead. No replacement value is needed in that case. // This is an approximate check that may have false negatives but does not @@ -1468,7 +1511,7 @@ Value ConversionPatternRewriterImpl::findOrBuildReplacementValue( // (regardless of the type) and build a source materialization to the // original type. repl = mapping.lookupOrNull(value); - if (!repl) { + if (repl.empty()) { // No replacement value is registered in the mapping. This means that the // value is dropped and no longer needed. (If the value were still needed, // a source materialization producing a replacement value "out of thin air" @@ -1476,36 +1519,29 @@ Value ConversionPatternRewriterImpl::findOrBuildReplacementValue( // `applySignatureConversion`.) return Value(); } - Value castValue = buildUnresolvedMaterialization( - MaterializationKind::Source, computeInsertPoint(repl), value.getLoc(), - /*valueToMap=*/value, /*inputs=*/repl, /*outputType=*/value.getType(), - /*originalType=*/Type(), converter); + + // Note: `computeInsertPoint` computes the "earliest" insertion point at + // which all values in `repl` are defined. It is important to emit the + // materialization at that location because the same materialization may be + // reused in a different context. 
(That's because materializations are cached + // in the conversion value mapping.) The insertion point of the + // materialization must be valid for all future users that may be created + // later in the conversion process. + // + // Note: Instead of creating new IR, `buildUnresolvedMaterialization` may + // return an already existing, cached materialization from the conversion + // value mapping. + Value castValue = + buildUnresolvedMaterialization(MaterializationKind::Source, + computeInsertPoint(repl), value.getLoc(), + /*valuesToMap=*/{value}, /*inputs=*/repl, + /*outputType=*/value.getType(), + /*originalType=*/Type(), converter) + .front(); mapping.map(value, castValue); return castValue; } -SmallVector -ConversionPatternRewriterImpl::unpackNTo1Materialization(Value value) { - // Unpack unrealized_conversion_cast ops that were inserted as a N:1 - // workaround. - auto castOp = value.getDefiningOp(); - if (!castOp) - return {value}; - if (!nTo1TempMaterializations.contains(castOp)) - return {value}; - assert(castOp->getNumResults() == 1 && "expected single result"); - - SmallVector result; - for (Value v : castOp.getOperands()) { - // Keep unpacking if possible. This is needed because during block - // signature conversions and 1:N op replacements, the driver may have - // inserted two materializations back-to-back: first an argument - // materialization, then a target materialization. - llvm::append_range(result, unpackNTo1Materialization(v)); - } - return result; -} - //===----------------------------------------------------------------------===// // Rewriter Notification Hooks @@ -1554,7 +1590,7 @@ void ConversionPatternRewriterImpl::notifyOpReplaced( // Materialize a replacement value "out of thin air". 
buildUnresolvedMaterialization( MaterializationKind::Source, computeInsertPoint(result), - result.getLoc(), /*valueToMap=*/result, /*inputs=*/ValueRange(), + result.getLoc(), /*valuesToMap=*/{result}, /*inputs=*/ValueRange(), /*outputType=*/result.getType(), /*originalType=*/Type(), currentTypeConverter); continue; @@ -1572,16 +1608,7 @@ void ConversionPatternRewriterImpl::notifyOpReplaced( // Remap result to replacement value. if (repl.empty()) continue; - - if (repl.size() == 1) { - // Single replacement value: replace directly. - mapping.map(result, repl.front()); - } else { - // Multiple replacement values: insert N:1 materialization. - insertNTo1Materialization(computeInsertPoint(result), result.getLoc(), - /*replacements=*/repl, /*outputValue=*/result, - currentTypeConverter); - } + mapping.map(result, repl); } appendRewrite(op, currentTypeConverter); @@ -1660,8 +1687,13 @@ void ConversionPatternRewriter::replaceOp(Operation *op, ValueRange newValues) { << "** Replace : '" << op->getName() << "'(" << op << ")\n"; }); SmallVector newVals; - for (size_t i = 0; i < newValues.size(); ++i) - newVals.push_back(newValues.slice(i, 1)); + for (size_t i = 0; i < newValues.size(); ++i) { + if (newValues[i]) { + newVals.push_back(newValues.slice(i, 1)); + } else { + newVals.push_back(ValueRange()); + } + } impl->notifyOpReplaced(op, newVals); } @@ -1733,7 +1765,7 @@ void ConversionPatternRewriter::replaceUsesOfBlockArgument(BlockArgument from, } Value ConversionPatternRewriter::getRemappedValue(Value key) { - SmallVector> remappedValues; + SmallVector remappedValues; if (failed(impl->remapValues("value", /*inputLoc=*/std::nullopt, *this, key, remappedValues))) return nullptr; @@ -1746,7 +1778,7 @@ ConversionPatternRewriter::getRemappedValues(ValueRange keys, SmallVectorImpl &results) { if (keys.empty()) return success(); - SmallVector> remapped; + SmallVector remapped; if (failed(impl->remapValues("value", /*inputLoc=*/std::nullopt, *this, keys, remapped))) return 
failure(); @@ -1872,7 +1904,7 @@ ConversionPattern::matchAndRewrite(Operation *op, getTypeConverter()); // Remap the operands of the operation. - SmallVector> remapped; + SmallVector remapped; if (failed(rewriterImpl.remapValues("operand", op->getLoc(), rewriter, op->getOperands(), remapped))) { return failure(); @@ -2625,19 +2657,6 @@ legalizeUnresolvedMaterialization(RewriterBase &rewriter, rewriter.setInsertionPoint(op); SmallVector newMaterialization; switch (rewrite->getMaterializationKind()) { - case MaterializationKind::Argument: { - // Try to materialize an argument conversion. - assert(op->getNumResults() == 1 && "expected single result"); - Value argMat = converter->materializeArgumentConversion( - rewriter, op->getLoc(), op.getResultTypes().front(), inputOperands); - if (argMat) { - newMaterialization.push_back(argMat); - break; - } - } - // If an argument materialization failed, fallback to trying a target - // materialization. - [[fallthrough]]; case MaterializationKind::Target: newMaterialization = converter->materializeTargetConversion( rewriter, op->getLoc(), op.getResultTypes(), inputOperands, diff --git a/mlir/test/Transforms/test-legalizer.mlir b/mlir/test/Transforms/test-legalizer.mlir index 297eb5acef21b..ae7d344b7167f 100644 --- a/mlir/test/Transforms/test-legalizer.mlir +++ b/mlir/test/Transforms/test-legalizer.mlir @@ -64,9 +64,6 @@ func.func @remap_call_1_to_1(%arg0: i64) { // Contents of the old block are moved to the new block. // CHECK-NEXT: notifyOperationInserted: test.return, was linked, exact position unknown -// The new block arguments are used in "test.return". -// CHECK-NEXT: notifyOperationModified: test.return - // The old block is erased. 
// CHECK-NEXT: notifyBlockErased @@ -390,8 +387,8 @@ func.func @caller() { // CHECK: %[[call:.*]]:2 = call @callee() : () -> (f16, f16) %0:2 = func.call @callee() : () -> (f32, i24) - // CHECK: %[[cast1:.*]] = "test.cast"() : () -> i24 - // CHECK: %[[cast0:.*]] = "test.cast"(%[[call]]#0, %[[call]]#1) : (f16, f16) -> f32 + // CHECK-DAG: %[[cast1:.*]] = "test.cast"() : () -> i24 + // CHECK-DAG: %[[cast0:.*]] = "test.cast"(%[[call]]#0, %[[call]]#1) : (f16, f16) -> f32 // CHECK: "test.some_user"(%[[cast0]], %[[cast1]]) : (f32, i24) -> () // expected-remark @below{{'test.some_user' is not legalizable}} "test.some_user"(%0#0, %0#1) : (f32, i24) -> () @@ -494,13 +491,8 @@ func.func @test_1_to_n_block_signature_conversion() { // CHECK-LABEL: func @test_multiple_1_to_n_replacement() // CHECK: %[[legal_op:.*]]:4 = "test.legal_op"() : () -> (f16, f16, f16, f16) -// TODO: There should be a single cast (i.e., a single target materialization). -// This is currently not possible due to 1:N limitations of the conversion -// mapping. Instead, we have 3 argument materializations. 
-// CHECK: %[[cast1:.*]] = "test.cast"(%[[legal_op]]#2, %[[legal_op]]#3) : (f16, f16) -> f16 -// CHECK: %[[cast2:.*]] = "test.cast"(%[[legal_op]]#0, %[[legal_op]]#1) : (f16, f16) -> f16 -// CHECK: %[[cast3:.*]] = "test.cast"(%[[cast2]], %[[cast1]]) : (f16, f16) -> f16 -// CHECK: "test.valid"(%[[cast3]]) : (f16) -> () +// CHECK: %[[cast:.*]] = "test.cast"(%[[legal_op]]#0, %[[legal_op]]#1, %[[legal_op]]#2, %[[legal_op]]#3) : (f16, f16, f16, f16) -> f16 +// CHECK: "test.valid"(%[[cast]]) : (f16) -> () func.func @test_multiple_1_to_n_replacement() { %0 = "test.multiple_1_to_n_replacement"() : () -> (f16) "test.invalid"(%0) : (f16) -> () diff --git a/mlir/test/lib/Dialect/Func/TestDecomposeCallGraphTypes.cpp b/mlir/test/lib/Dialect/Func/TestDecomposeCallGraphTypes.cpp index 09c5b4b2a0ad5..d0b62e71ab0cf 100644 --- a/mlir/test/lib/Dialect/Func/TestDecomposeCallGraphTypes.cpp +++ b/mlir/test/lib/Dialect/Func/TestDecomposeCallGraphTypes.cpp @@ -139,7 +139,7 @@ struct TestDecomposeCallGraphTypes tupleType.getFlattenedTypes(types); return success(); }); - typeConverter.addArgumentMaterialization(buildMakeTupleOp); + typeConverter.addSourceMaterialization(buildMakeTupleOp); typeConverter.addTargetMaterialization(buildDecomposeTuple); populateFunctionOpInterfaceTypeConversionPattern( diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp index 826c222990be4..5b7c36c9b97bf 100644 --- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp +++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp @@ -1264,14 +1264,6 @@ class TestMultiple1ToNReplacement : public ConversionPattern { // Replace test.multiple_1_to_n_replacement with test.step_1. Operation *repl1 = replaceWithDoubleResults(op, "test.step_1"); // Now replace test.step_1 with test.legal_op. - // TODO: Ideally, it should not be necessary to reset the insertion point - // here. Based on the API calls, it looks like test.step_1 is entirely - // erased. 
But that's not the case: an argument materialization will - // survive. And that argument materialization will be used by the users of - // `op`. If we don't reset the insertion point here, we get dominance - // errors. This will be fixed when we have 1:N support in the conversion - // value mapping. - rewriter.setInsertionPoint(repl1); replaceWithDoubleResults(repl1, "test.legal_op"); return success(); } @@ -1284,7 +1276,6 @@ struct TestTypeConverter : public TypeConverter { using TypeConverter::TypeConverter; TestTypeConverter() { addConversion(convertType); - addArgumentMaterialization(materializeCast); addSourceMaterialization(materializeCast); } diff --git a/mlir/test/lib/Transforms/TestDialectConversion.cpp b/mlir/test/lib/Transforms/TestDialectConversion.cpp index 2cc1fb5d39d78..a03bf0a1023d5 100644 --- a/mlir/test/lib/Transforms/TestDialectConversion.cpp +++ b/mlir/test/lib/Transforms/TestDialectConversion.cpp @@ -28,7 +28,6 @@ namespace { struct PDLLTypeConverter : public TypeConverter { PDLLTypeConverter() { addConversion(convertType); - addArgumentMaterialization(materializeCast); addSourceMaterialization(materializeCast); } From 68d265666e708bad1c63b419b6275aaba1a7dcd2 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Fri, 3 Jan 2025 16:15:27 +0100 Subject: [PATCH 360/567] [clang][NFC][docs] Fix typo in LanguageExtensions (#121576) --- clang/docs/LanguageExtensions.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index cc5f1d4ddf447..e020710c7aa4f 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -3641,7 +3641,7 @@ program location should be executed. It is expected to be used to implement `_ intrinsic. -The ``__builtin_allow_runtime_check()`` can be used within constrol structures +The ``__builtin_allow_runtime_check()`` can be used within control structures like ``if`` to guard expensive runtime checks. 
The return value is determined by the following compiler options and may differ per call site: From a4d92400a6db9566d84cb4b900149e36e117f452 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Fri, 3 Jan 2025 23:19:57 +0800 Subject: [PATCH 361/567] [InstCombine] Fix GEPNoWrapFlags propagation in `foldGEPOfPhi` (#121572) Closes https://github.com/llvm/llvm-project/issues/121459. --- .../InstCombine/InstructionCombining.cpp | 5 ++ .../test/Transforms/InstCombine/opaque-ptr.ll | 58 +++++++++++++++++++ 2 files changed, 63 insertions(+) diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 934156f04f7fd..f63de1f0d410e 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2782,6 +2782,7 @@ static Instruction *foldGEPOfPhi(GetElementPtrInst &GEP, PHINode *PN, // loop iteration). if (Op1 == &GEP) return nullptr; + GEPNoWrapFlags NW = Op1->getNoWrapFlags(); int DI = -1; @@ -2838,6 +2839,8 @@ static Instruction *foldGEPOfPhi(GetElementPtrInst &GEP, PHINode *PN, } } } + + NW &= Op2->getNoWrapFlags(); } // If not all GEPs are identical we'll have to create a new PHI node. @@ -2847,6 +2850,8 @@ static Instruction *foldGEPOfPhi(GetElementPtrInst &GEP, PHINode *PN, return nullptr; auto *NewGEP = cast(Op1->clone()); + NewGEP->setNoWrapFlags(NW); + if (DI == -1) { // All the GEPs feeding the PHI are identical. Clone one down into our // BB so that it can be merged with the current GEP. 
diff --git a/llvm/test/Transforms/InstCombine/opaque-ptr.ll b/llvm/test/Transforms/InstCombine/opaque-ptr.ll index bac51c82f36dd..b05274658e812 100644 --- a/llvm/test/Transforms/InstCombine/opaque-ptr.ll +++ b/llvm/test/Transforms/InstCombine/opaque-ptr.ll @@ -654,6 +654,64 @@ join: ret ptr %gep } +define ptr @gep_of_phi_of_gep_flags1(i1 %c, ptr %p) { +; CHECK-LABEL: @gep_of_phi_of_gep_flags1( +; CHECK-NEXT: br i1 [[C:%.*]], label [[IF:%.*]], label [[ELSE:%.*]] +; CHECK: if: +; CHECK-NEXT: br label [[JOIN:%.*]] +; CHECK: else: +; CHECK-NEXT: br label [[JOIN]] +; CHECK: join: +; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 4, [[IF]] ], [ 8, [[ELSE]] ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[TMP1]] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[TMP2]], i64 4 +; CHECK-NEXT: ret ptr [[GEP]] +; + br i1 %c, label %if, label %else + +if: + %gep1 = getelementptr inbounds i32, ptr %p, i64 1 + br label %join + +else: + %gep2 = getelementptr i32, ptr %p, i64 2 + br label %join + +join: + %phi = phi ptr [ %gep1, %if ], [ %gep2, %else ] + %gep = getelementptr i32, ptr %phi, i64 1 + ret ptr %gep +} + +define ptr @gep_of_phi_of_gep_flags2(i1 %c, ptr %p) { +; CHECK-LABEL: @gep_of_phi_of_gep_flags2( +; CHECK-NEXT: br i1 [[C:%.*]], label [[IF:%.*]], label [[ELSE:%.*]] +; CHECK: if: +; CHECK-NEXT: br label [[JOIN:%.*]] +; CHECK: else: +; CHECK-NEXT: br label [[JOIN]] +; CHECK: join: +; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 4, [[IF]] ], [ 8, [[ELSE]] ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr nuw i8, ptr [[P:%.*]], i64 [[TMP1]] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[TMP2]], i64 4 +; CHECK-NEXT: ret ptr [[GEP]] +; + br i1 %c, label %if, label %else + +if: + %gep1 = getelementptr nuw i32, ptr %p, i64 1 + br label %join + +else: + %gep2 = getelementptr nuw i32, ptr %p, i64 2 + br label %join + +join: + %phi = phi ptr [ %gep1, %if ], [ %gep2, %else ] + %gep = getelementptr i32, ptr %phi, i64 1 + ret ptr %gep +} + define ptr 
@gep_of_phi_of_gep_different_type(i1 %c, ptr %p) { ; CHECK-LABEL: @gep_of_phi_of_gep_different_type( ; CHECK-NEXT: br i1 [[C:%.*]], label [[IF:%.*]], label [[ELSE:%.*]] From adeff9f63a24f60b0bf240bf13e40bbf7c1dd0e8 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Fri, 3 Jan 2025 09:21:36 -0600 Subject: [PATCH 362/567] [flang][OpenMP] Allow utility constructs in specification part (#121509) Allow utility constructs (error and nothing) to appear in the specification part as well as the execution part. The exception is "ERROR AT(EXECUTION)" which should only be in the execution part. In case of ambiguity (the boundary between the specification and the execution part), utility constructs will be parsed as belonging to the specification part. In such cases move them to the execution part in the OpenMP canonicalization code. --- .../FlangOmpReport/FlangOmpReportVisitor.cpp | 14 +- flang/include/flang/Parser/parse-tree.h | 2 +- flang/lib/Lower/OpenMP/OpenMP.cpp | 4 + flang/lib/Parser/openmp-parsers.cpp | 4 +- flang/lib/Parser/unparse.cpp | 101 +++++------ flang/lib/Semantics/canonicalize-omp.cpp | 162 ++++++++++++++++++ flang/lib/Semantics/check-omp-structure.cpp | 19 +- flang/lib/Semantics/check-omp-structure.h | 8 +- flang/test/Parser/OpenMP/error-unparse.f90 | 18 +- flang/test/Parser/OpenMP/nothing.f90 | 100 +++++++++++ flang/test/Semantics/OpenMP/error.f90 | 8 + 11 files changed, 365 insertions(+), 75 deletions(-) create mode 100644 flang/test/Semantics/OpenMP/error.f90 diff --git a/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp b/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp index 231df63bbae92..c78dd7f14e503 100644 --- a/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp +++ b/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp @@ -106,10 +106,16 @@ std::string OpenMPCounterVisitor::getName(const OmpWrapperType &w) { return getName(*std::get(w)); } std::string OpenMPCounterVisitor::getName(const OpenMPDeclarativeConstruct &c) { - 
return std::visit( - [&](const auto &o) -> std::string { - const CharBlock &source{std::get(o.t).source}; - return normalize_construct_name(source.ToString()); + return std::visit( // + Fortran::common::visitors{ + [&](const OpenMPUtilityConstruct &o) -> std::string { + const CharBlock &source{o.source}; + return normalize_construct_name(source.ToString()); + }, + [&](const auto &o) -> std::string { + const CharBlock &source{std::get(o.t).source}; + return normalize_construct_name(source.ToString()); + }, }, c.u); } diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h index 9df7c6d5e39c3..b693e001e5e4b 100644 --- a/flang/include/flang/Parser/parse-tree.h +++ b/flang/include/flang/Parser/parse-tree.h @@ -4342,7 +4342,7 @@ struct OpenMPDeclarativeConstruct { std::variant + OpenMPRequiresConstruct, OpenMPUtilityConstruct> u; }; diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index fe6d82125a9e0..0a84162291573 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -2586,6 +2586,10 @@ static void genOMPDispatch(lower::AbstractConverter &converter, //===----------------------------------------------------------------------===// // OpenMPDeclarativeConstruct visitors //===----------------------------------------------------------------------===// +static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, + semantics::SemanticsContext &semaCtx, + lower::pft::Evaluation &eval, + const parser::OpenMPUtilityConstruct &); static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp index 0a0a29002de27..75bb64d06ed0f 100644 --- a/flang/lib/Parser/openmp-parsers.cpp +++ b/flang/lib/Parser/openmp-parsers.cpp @@ -1090,7 +1090,9 @@ TYPE_PARSER(startOmpLine >> construct( Parser{}) || construct( - Parser{})) / + Parser{}) || + construct( + 
Parser{})) / endOmpLine)) // Block Construct diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp index 4fe57f3e348d3..58820476c51bc 100644 --- a/flang/lib/Parser/unparse.cpp +++ b/flang/lib/Parser/unparse.cpp @@ -2631,81 +2631,64 @@ class UnparseVisitor { } } void Unparse(const OpenMPDeclareReductionConstruct &x) { + BeginOpenMP(); + Word("!$OMP DECLARE REDUCTION "); Put("("); Walk(std::get(x.t)), Put(" : "); Walk(std::get>(x.t), ","), Put(" : "); Walk(std::get(x.t)); Put(")"); Walk(std::get>(x.t)); + EndOpenMP(); } - bool Pre(const OpenMPDeclarativeConstruct &x) { + + void Unparse(const OpenMPDeclareMapperConstruct &z) { BeginOpenMP(); - Word("!$OMP "); - return common::visit( - common::visitors{ - [&](const OpenMPDeclarativeAllocate &z) { - Word("ALLOCATE ("); - Walk(std::get(z.t)); - Put(")"); - Walk(std::get(z.t)); - Put("\n"); - EndOpenMP(); - return false; - }, - [&](const OpenMPDeclareMapperConstruct &z) { - Word("DECLARE MAPPER ("); - const auto &spec{std::get(z.t)}; - if (auto mapname{std::get>(spec.t)}) { - Walk(mapname); - Put(":"); - } - Walk(std::get(spec.t)); - Put("::"); - Walk(std::get(spec.t)); - Put(")"); + Word("!$OMP DECLARE MAPPER ("); + const auto &spec{std::get(z.t)}; + if (auto mapname{std::get>(spec.t)}) { + Walk(mapname); + Put(":"); + } + Walk(std::get(spec.t)); + Put("::"); + Walk(std::get(spec.t)); + Put(")"); - Walk(std::get(z.t)); - Put("\n"); - return false; - }, - [&](const OpenMPDeclareReductionConstruct &) { - Word("DECLARE REDUCTION "); - return true; - }, - [&](const OpenMPDeclareSimdConstruct &y) { - Word("DECLARE SIMD "); - Walk("(", std::get>(y.t), ")"); - Walk(std::get(y.t)); - Put("\n"); - EndOpenMP(); - return false; - }, - [&](const OpenMPDeclareTargetConstruct &) { - Word("DECLARE TARGET "); - return true; - }, - [&](const OpenMPRequiresConstruct &y) { - Word("REQUIRES "); - Walk(std::get(y.t)); - Put("\n"); - EndOpenMP(); - return false; - }, - [&](const OpenMPThreadprivate &) { - 
Word("THREADPRIVATE ("); - return true; - }, - }, - x.u); + Walk(std::get(z.t)); + Put("\n"); + EndOpenMP(); + } + void Unparse(const OpenMPDeclareSimdConstruct &y) { + BeginOpenMP(); + Word("!$OMP DECLARE SIMD "); + Walk("(", std::get>(y.t), ")"); + Walk(std::get(y.t)); + Put("\n"); + EndOpenMP(); } - void Post(const OpenMPDeclarativeConstruct &) { + void Unparse(const OpenMPDeclareTargetConstruct &x) { + BeginOpenMP(); + Word("!$OMP DECLARE TARGET "); + Walk(std::get(x.t)); Put("\n"); EndOpenMP(); } - void Post(const OpenMPThreadprivate &) { + void Unparse(const OpenMPRequiresConstruct &y) { + BeginOpenMP(); + Word("!$OMP REQUIRES "); + Walk(std::get(y.t)); + Put("\n"); + EndOpenMP(); + } + void Unparse(const OpenMPThreadprivate &x) { + BeginOpenMP(); + Word("!$OMP THREADPRIVATE ("); + Walk(std::get(x.t)); Put(")\n"); EndOpenMP(); } + bool Pre(const OmpMessageClause &x) { Walk(x.v); return false; diff --git a/flang/lib/Semantics/canonicalize-omp.cpp b/flang/lib/Semantics/canonicalize-omp.cpp index 0481b3d41f501..5164f1dc6faab 100644 --- a/flang/lib/Semantics/canonicalize-omp.cpp +++ b/flang/lib/Semantics/canonicalize-omp.cpp @@ -50,6 +50,43 @@ class CanonicalizationOfOmp { void Post(parser::ExecutionPart &body) { RewriteOmpAllocations(body); } + // Pre-visit all constructs that have both a specification part and + // an execution part, and store the connection between the two. 
+ bool Pre(parser::BlockConstruct &x) { + auto *spec = &std::get(x.t).v; + auto *block = &std::get(x.t); + blockForSpec_.insert(std::make_pair(spec, block)); + return true; + } + bool Pre(parser::MainProgram &x) { + auto *spec = &std::get(x.t); + auto *block = &std::get(x.t).v; + blockForSpec_.insert(std::make_pair(spec, block)); + return true; + } + bool Pre(parser::FunctionSubprogram &x) { + auto *spec = &std::get(x.t); + auto *block = &std::get(x.t).v; + blockForSpec_.insert(std::make_pair(spec, block)); + return true; + } + bool Pre(parser::SubroutineSubprogram &x) { + auto *spec = &std::get(x.t); + auto *block = &std::get(x.t).v; + blockForSpec_.insert(std::make_pair(spec, block)); + return true; + } + bool Pre(parser::SeparateModuleSubprogram &x) { + auto *spec = &std::get(x.t); + auto *block = &std::get(x.t).v; + blockForSpec_.insert(std::make_pair(spec, block)); + return true; + } + + void Post(parser::SpecificationPart &spec) { + CanonicalizeUtilityConstructs(spec); + } + private: template T *GetConstructIf(parser::ExecutionPartConstruct &x) { if (auto *y{std::get_if(&x.u)}) { @@ -155,6 +192,131 @@ class CanonicalizationOfOmp { } } + // Canonicalization of utility constructs. + // + // This addresses the issue of utility constructs that appear at the + // boundary between the specification and the execution parts, e.g. + // subroutine foo + // integer :: x ! Specification + // !$omp nothing + // x = 1 ! Execution + // ... + // end + // + // Utility constructs (error and nothing) can appear in both the + // specification part and the execution part, except "error at(execution)", + // which cannot be present in the specification part (whereas any utility + // construct can be in the execution part). + // When a utility construct is at the boundary, it should preferably be + // parsed as an element of the execution part, but since the specification + // part is parsed first, the utility construct ends up belonging to the + // specification part. 
+ // + // To allow the likes of the following code to compile, move all utility + // construct that are at the end of the specification part to the beginning + // of the execution part. + // + // subroutine foo + // !$omp error at(execution) ! Initially parsed as declarative construct. + // ! Move it to the execution part. + // end + + void CanonicalizeUtilityConstructs(parser::SpecificationPart &spec) { + auto found = blockForSpec_.find(&spec); + if (found == blockForSpec_.end()) { + // There is no corresponding execution part, so there is nothing to do. + return; + } + parser::Block &block = *found->second; + + // There are two places where an OpenMP declarative construct can + // show up in the tuple in specification part: + // (1) in std::list, or + // (2) in std::list. + // The case (1) is only possible is the list (2) is empty. + + auto &omps = + std::get>(spec.t); + auto &decls = std::get>(spec.t); + + if (!decls.empty()) { + MoveUtilityConstructsFromDecls(decls, block); + } else { + MoveUtilityConstructsFromOmps(omps, block); + } + } + + void MoveUtilityConstructsFromDecls( + std::list &decls, parser::Block &block) { + // Find the trailing range of DeclarationConstructs that are OpenMP + // utility construct, that are to be moved to the execution part. + std::list::reverse_iterator rlast = [&]() { + for (auto rit = decls.rbegin(), rend = decls.rend(); rit != rend; ++rit) { + parser::DeclarationConstruct &dc = *rit; + if (!std::holds_alternative(dc.u)) { + return rit; + } + auto &sc = std::get(dc.u); + using OpenMPDeclarativeConstruct = + common::Indirection; + if (!std::holds_alternative(sc.u)) { + return rit; + } + // Got OpenMPDeclarativeConstruct. If it's not a utility construct + // then stop. 
+ auto &odc = std::get(sc.u).value(); + if (!std::holds_alternative(odc.u)) { + return rit; + } + } + return decls.rend(); + }(); + + std::transform(decls.rbegin(), rlast, std::front_inserter(block), + [](parser::DeclarationConstruct &dc) { + auto &sc = std::get(dc.u); + using OpenMPDeclarativeConstruct = + common::Indirection; + auto &oc = std::get(sc.u).value(); + auto &ut = std::get(oc.u); + + return parser::ExecutionPartConstruct(parser::ExecutableConstruct( + common::Indirection(parser::OpenMPConstruct(std::move(ut))))); + }); + + decls.erase(rlast.base(), decls.end()); + } + + void MoveUtilityConstructsFromOmps( + std::list &omps, + parser::Block &block) { + using OpenMPDeclarativeConstruct = parser::OpenMPDeclarativeConstruct; + // Find the trailing range of OpenMPDeclarativeConstruct that are OpenMP + // utility construct, that are to be moved to the execution part. + std::list::reverse_iterator rlast = [&]() { + for (auto rit = omps.rbegin(), rend = omps.rend(); rit != rend; ++rit) { + OpenMPDeclarativeConstruct &dc = *rit; + if (!std::holds_alternative(dc.u)) { + return rit; + } + } + return omps.rend(); + }(); + + std::transform(omps.rbegin(), rlast, std::front_inserter(block), + [](parser::OpenMPDeclarativeConstruct &dc) { + auto &ut = std::get(dc.u); + return parser::ExecutionPartConstruct(parser::ExecutableConstruct( + common::Indirection(parser::OpenMPConstruct(std::move(ut))))); + }); + + omps.erase(rlast.base(), omps.end()); + } + + // Mapping from the specification parts to the blocks that follow in the + // same construct. This is for converting utility constructs to executable + // constructs. 
+ std::map blockForSpec_; parser::Messages &messages_; }; diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index 3a928c8a0289b..4c6a408a9ef30 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -614,6 +614,14 @@ void OmpStructureChecker::Leave(const parser::OpenMPConstruct &) { deferredNonVariables_.clear(); } +void OmpStructureChecker::Enter(const parser::OpenMPDeclarativeConstruct &x) { + EnterDirectiveNest(DeclarativeNest); +} + +void OmpStructureChecker::Leave(const parser::OpenMPDeclarativeConstruct &x) { + ExitDirectiveNest(DeclarativeNest); +} + void OmpStructureChecker::Enter(const parser::OpenMPLoopConstruct &x) { loopStack_.push_back(&x); const auto &beginLoopDir{std::get(x.t)}; @@ -1697,6 +1705,16 @@ void OmpStructureChecker::Leave(const parser::OmpErrorDirective &x) { dirContext_.pop_back(); } +void OmpStructureChecker::Enter(const parser::OmpClause::At &x) { + CheckAllowedClause(llvm::omp::Clause::OMPC_at); + if (GetDirectiveNest(DeclarativeNest) > 0) { + if (x.v.v == parser::OmpAtClause::ActionTime::Execution) { + context_.Say(GetContext().clauseSource, + "The ERROR directive with AT(EXECUTION) cannot appear in the specification part"_err_en_US); + } + } +} + void OmpStructureChecker::Enter(const parser::OpenMPExecutableAllocate &x) { isPredefinedAllocator = true; const auto &dir{std::get(x.t)}; @@ -2856,7 +2874,6 @@ CHECK_SIMPLE_CLAUSE(Init, OMPC_init) CHECK_SIMPLE_CLAUSE(Use, OMPC_use) CHECK_SIMPLE_CLAUSE(Novariants, OMPC_novariants) CHECK_SIMPLE_CLAUSE(Nocontext, OMPC_nocontext) -CHECK_SIMPLE_CLAUSE(At, OMPC_at) CHECK_SIMPLE_CLAUSE(Severity, OMPC_severity) CHECK_SIMPLE_CLAUSE(Message, OMPC_message) CHECK_SIMPLE_CLAUSE(Filter, OMPC_filter) diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h index 2a4f6fbd618c3..f47c01c00499a 100644 --- a/flang/lib/Semantics/check-omp-structure.h +++ 
b/flang/lib/Semantics/check-omp-structure.h @@ -73,6 +73,9 @@ class OmpStructureChecker void Enter(const parser::OpenMPConstruct &); void Leave(const parser::OpenMPConstruct &); + void Enter(const parser::OpenMPDeclarativeConstruct &); + void Leave(const parser::OpenMPDeclarativeConstruct &); + void Enter(const parser::OpenMPLoopConstruct &); void Leave(const parser::OpenMPLoopConstruct &); void Enter(const parser::OmpEndLoopDirective &); @@ -270,11 +273,12 @@ class OmpStructureChecker const parser::Variable &, const parser::Expr &); inline void ErrIfNonScalarAssignmentStmt( const parser::Variable &, const parser::Expr &); - enum directiveNestType { + enum directiveNestType : int { SIMDNest, TargetBlockOnlyTeams, TargetNest, - LastType + DeclarativeNest, + LastType = DeclarativeNest, }; int directiveNest_[LastType + 1] = {0}; diff --git a/flang/test/Parser/OpenMP/error-unparse.f90 b/flang/test/Parser/OpenMP/error-unparse.f90 index 4dd06b736da80..2cb4e1a083a6c 100644 --- a/flang/test/Parser/OpenMP/error-unparse.f90 +++ b/flang/test/Parser/OpenMP/error-unparse.f90 @@ -1,23 +1,27 @@ -! RUN: %flang_fc1 -fopenmp-version=51 -fopenmp -fdebug-unparse-no-sema %s 2>&1 | FileCheck %s -! RUN: %flang_fc1 -fopenmp-version=51 -fopenmp -fdebug-dump-parse-tree-no-sema %s 2>&1 | FileCheck %s --check-prefix="PARSE-TREE" +! RUN: %flang_fc1 -fopenmp-version=51 -fopenmp -fdebug-unparse %s 2>&1 | FileCheck %s +! 
RUN: %flang_fc1 -fopenmp-version=51 -fopenmp -fdebug-dump-parse-tree %s 2>&1 | FileCheck %s --check-prefix="PARSE-TREE" program main character(*), parameter :: message = "This is an error" !CHECK: !$OMP ERROR AT(COMPILATION) SEVERITY(WARNING) MESSAGE("some message here") !PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPUtilityConstruct -> OmpErrorDirective !PARSE-TREE: OmpClauseList -> OmpClause -> At -> OmpAtClause -> ActionTime = Compilation !PARSE-TREE: OmpClause -> Severity -> OmpSeverityClause -> Severity = Warning - !PARSE-TREE: OmpClause -> Message -> OmpMessageClause -> Expr -> LiteralConstant -> CharLiteralConstant + !PARSE-TREE: OmpClause -> Message -> OmpMessageClause -> Expr = '"some message here"' + !PARSE-TREE: LiteralConstant -> CharLiteralConstant + !PARSE-TREE: string = 'some message here' !$omp error at(compilation) severity(warning) message("some message here") - !CHECK: !$OMP ERROR AT(COMPILATION) SEVERITY(FATAL) MESSAGE(message) + !CHECK: !$OMP ERROR AT(COMPILATION) SEVERITY(FATAL) MESSAGE("This is an error") !PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPUtilityConstruct -> OmpErrorDirective !PARSE-TREE: OmpClauseList -> OmpClause -> At -> OmpAtClause -> ActionTime = Compilation !PARSE-TREE: OmpClause -> Severity -> OmpSeverityClause -> Severity = Fatal - !PARSE-TREE: OmpClause -> Message -> OmpMessageClause -> Expr -> Designator -> DataRef -> Name = 'message' + !PARSE-TREE: OmpClause -> Message -> OmpMessageClause -> Expr = '"This is an error"' + !PARSE-TREE: Designator -> DataRef -> Name = 'message' !$omp error at(compilation) severity(fatal) message(message) - !CHECK: !$OMP ERROR AT(EXECUTION) SEVERITY(FATAL) MESSAGE(message) + !CHECK: !$OMP ERROR AT(EXECUTION) SEVERITY(FATAL) MESSAGE("This is an error") !PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPUtilityConstruct -> OmpErrorDirective !PARSE-TREE: OmpClauseList -> 
OmpClause -> At -> OmpAtClause -> ActionTime = Execution !PARSE-TREE: OmpClause -> Severity -> OmpSeverityClause -> Severity = Fatal - !PARSE-TREE: OmpClause -> Message -> OmpMessageClause -> Expr -> Designator -> DataRef -> Name = 'message' + !PARSE-TREE: OmpClause -> Message -> OmpMessageClause -> Expr = '"This is an error"' + !PARSE-TREE: Designator -> DataRef -> Name = 'message' !$omp error at(EXECUTION) severity(fatal) message(message) end program main diff --git a/flang/test/Parser/OpenMP/nothing.f90 b/flang/test/Parser/OpenMP/nothing.f90 index 80c0932087610..22558c493c444 100644 --- a/flang/test/Parser/OpenMP/nothing.f90 +++ b/flang/test/Parser/OpenMP/nothing.f90 @@ -11,3 +11,103 @@ subroutine f00 !PARSE-TREE: ExecutionPart -> Block !PARSE-TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPUtilityConstruct -> OmpNothingDirective + +subroutine f01 + block + import, none + integer :: x + !$omp nothing ! "nothing" in the execution part + x = x+1 + end block +end + +!UNPARSE: SUBROUTINE f01 +!UNPARSE: BLOCK +!UNPARSE: IMPORT, NONE +!UNPARSE: INTEGER x +!UNPARSE: !$OMP NOTHING +!UNPARSE: x=x+1_4 +!UNPARSE: END BLOCK +!UNPARSE: END SUBROUTINE + +!PARSE-TREE: BlockStmt -> +!PARSE-TREE: BlockSpecificationPart -> SpecificationPart +!PARSE-TREE: | ImportStmt +!PARSE-TREE: | ImplicitPart -> +!PARSE-TREE: | DeclarationConstruct -> SpecificationConstruct -> TypeDeclarationStmt +!PARSE-TREE: | | DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec -> +!PARSE-TREE: | | EntityDecl +!PARSE-TREE: | | | Name = 'x' +!PARSE-TREE: Block +!PARSE-TREE: | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPUtilityConstruct -> OmpNothingDirective +!PARSE-TREE: | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'x=x+1_4' +!PARSE-TREE: | | Variable = 'x' +!PARSE-TREE: | | | Designator -> DataRef -> Name = 'x' +!PARSE-TREE: | | Expr = 'x+1_4' +!PARSE-TREE: | | | Add +!PARSE-TREE: | | | | Expr = 
'x' +!PARSE-TREE: | | | | | Designator -> DataRef -> Name = 'x' +!PARSE-TREE: | | | | Expr = '1_4' +!PARSE-TREE: | | | | | LiteralConstant -> IntLiteralConstant = '1' +!PARSE-TREE: EndBlockStmt -> + +subroutine f02 + integer :: x + !$omp nothing +end + +!UNPARSE: SUBROUTINE f02 +!UNPARSE: INTEGER x +!UNPARSE: !$OMP NOTHING +!UNPARSE: END SUBROUTINE + +!PARSE-TREE: SpecificationPart +!PARSE-TREE: | ImplicitPart -> +!PARSE-TREE: | DeclarationConstruct -> SpecificationConstruct -> TypeDeclarationStmt +!PARSE-TREE: | | DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec -> +!PARSE-TREE: | | EntityDecl +!PARSE-TREE: | | | Name = 'x' +!PARSE-TREE: ExecutionPart -> Block +!PARSE-TREE: | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPUtilityConstruct -> OmpNothingDirective + +subroutine f03 + block + !$omp nothing ! "nothing" in the specification part + import, none + integer :: x + x = x+1 + end block +end + +!UNPARSE: SUBROUTINE f03 +!UNPARSE: BLOCK +!UNPARSE: !$OMP NOTHING +!UNPARSE: IMPORT, NONE +!UNPARSE: INTEGER x +!UNPARSE: x=x+1_4 +!UNPARSE: END BLOCK +!UNPARSE: END SUBROUTINE + +!PARSE-TREE: ExecutionPart -> Block +!PARSE-TREE: | ExecutionPartConstruct -> ExecutableConstruct -> BlockConstruct +!PARSE-TREE: | | BlockStmt -> +!PARSE-TREE: | | BlockSpecificationPart -> SpecificationPart +!PARSE-TREE: | | | OpenMPDeclarativeConstruct -> OpenMPUtilityConstruct -> OmpNothingDirective +!PARSE-TREE: | | | ImportStmt +!PARSE-TREE: | | | ImplicitPart -> +!PARSE-TREE: | | | DeclarationConstruct -> SpecificationConstruct -> TypeDeclarationStmt +!PARSE-TREE: | | | | DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec -> +!PARSE-TREE: | | | | EntityDecl +!PARSE-TREE: | | | | | Name = 'x' +!PARSE-TREE: | | Block +!PARSE-TREE: | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'x=x+1_4' +!PARSE-TREE: | | | | Variable = 'x' +!PARSE-TREE: | | | | | Designator -> DataRef -> Name = 'x' +!PARSE-TREE: | | | | 
Expr = 'x+1_4' +!PARSE-TREE: | | | | | Add +!PARSE-TREE: | | | | | | Expr = 'x' +!PARSE-TREE: | | | | | | | Designator -> DataRef -> Name = 'x' +!PARSE-TREE: | | | | | | Expr = '1_4' +!PARSE-TREE: | | | | | | | LiteralConstant -> IntLiteralConstant = '1' +!PARSE-TREE: | | EndBlockStmt -> +!PARSE-TREE: EndSubroutineStmt -> \ No newline at end of file diff --git a/flang/test/Semantics/OpenMP/error.f90 b/flang/test/Semantics/OpenMP/error.f90 new file mode 100644 index 0000000000000..067417a8cda3b --- /dev/null +++ b/flang/test/Semantics/OpenMP/error.f90 @@ -0,0 +1,8 @@ +!RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=51 + +subroutine f00(x) +!ERROR: The ERROR directive with AT(EXECUTION) cannot appear in the specification part + !$omp error at(execution) message("Haaa!") + integer :: x +end + From faa30be101e9ae2bdb58d2acb250341f1b13031c Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Fri, 3 Jan 2025 16:35:02 +0100 Subject: [PATCH 363/567] [mlir][Transforms] Fix build after #116524 (#121578) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix build errors after #116524. ``` error: call of overloaded ‘TypeRange(ValueVector&)’ is ambiguous ``` --- mlir/lib/Transforms/Utils/DialectConversion.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 0c5520988eff3..6c3863e4c7f66 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -227,7 +227,7 @@ ConversionValueMapping::lookupOrDefault(Value from, ValueVector current{from}; do { // Store the current value if the types match. - if (TypeRange(current) == desiredTypes) + if (TypeRange(ValueRange(current)) == desiredTypes) desiredValue = current; // If possible, Replace each value with (one or multiple) mapped values. 
@@ -271,9 +271,8 @@ ConversionValueMapping::lookupOrDefault(Value from, ValueVector ConversionValueMapping::lookupOrNull(Value from, TypeRange desiredTypes) const { ValueVector result = lookupOrDefault(from, desiredTypes); - TypeRange resultTypes(result); if (result == ValueVector{from} || - (!desiredTypes.empty() && resultTypes != desiredTypes)) + (!desiredTypes.empty() && TypeRange(ValueRange(result)) != desiredTypes)) return {}; return result; } @@ -1291,7 +1290,7 @@ LogicalResult ConversionPatternRewriterImpl::remapValues( } ValueVector repl = mapping.lookupOrDefault(operand, legalTypes); - if (!repl.empty() && TypeRange(repl) == legalTypes) { + if (!repl.empty() && TypeRange(ValueRange(repl)) == legalTypes) { // Mapped values have the correct type or there is an existing // materialization. Or the operand is not mapped at all and has the // correct type. From 5137c209f0c19668d06e48cc4293e4c01a77c964 Mon Sep 17 00:00:00 2001 From: agozillon Date: Fri, 3 Jan 2025 16:46:15 +0100 Subject: [PATCH 364/567] [Flang][OpenMP] Fix allocating arrays with size intrinisic (#119226) Attempt to address the following example from causing an assert or ICE: ``` subroutine test(a) implicit none integer :: i real(kind=real64), dimension(:) :: a real(kind=real64), dimension(size(a, 1)) :: b !$omp target map(tofrom: b) do i = 1, 10 b(i) = i end do !$omp end target end subroutine ``` Where we utilise a Fortran intrinsic (size) to calculate the size of allocatable arrays and then map it to device. 
--- flang/lib/Lower/OpenMP/OpenMP.cpp | 54 ++++++++++++++++--- .../Optimizer/OpenMP/MapInfoFinalization.cpp | 14 +++-- .../Lower/OpenMP/allocatable-array-bounds.f90 | 7 +-- flang/test/Lower/OpenMP/array-bounds.f90 | 2 +- .../OpenMP/derived-type-allocatable-map.f90 | 8 +-- .../local-intrinsic-sized-array-map.f90 | 32 +++++++++++ .../Transforms/omp-map-info-finalization.fir | 14 ++--- ...target-map-local-intrinisc-sized-param.f90 | 39 ++++++++++++++ 8 files changed, 143 insertions(+), 27 deletions(-) create mode 100644 flang/test/Lower/OpenMP/local-intrinsic-sized-array-map.f90 create mode 100644 offload/test/offloading/fortran/target-map-local-intrinisc-sized-param.f90 diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 0a84162291573..cd4b25a17722c 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -923,13 +923,24 @@ static void genBodyOfTargetOp( while (!valuesDefinedAbove.empty()) { for (mlir::Value val : valuesDefinedAbove) { mlir::Operation *valOp = val.getDefiningOp(); - if (mlir::isMemoryEffectFree(valOp)) { + assert(valOp != nullptr); + + // NOTE: We skip BoxDimsOp's as the lesser of two evils is to map the + // indices separately, as the alternative is to eventually map the Box, + // which comes with a fairly large overhead comparatively. We could be + // more robust about this and check using a BackwardsSlice to see if we + // run the risk of mapping a box. 
+ if (mlir::isMemoryEffectFree(valOp) && + !mlir::isa(valOp)) { mlir::Operation *clonedOp = valOp->clone(); entryBlock->push_front(clonedOp); - val.replaceUsesWithIf(clonedOp->getResult(0), - [entryBlock](mlir::OpOperand &use) { - return use.getOwner()->getBlock() == entryBlock; - }); + + auto replace = [entryBlock](mlir::OpOperand &use) { + return use.getOwner()->getBlock() == entryBlock; + }; + + valOp->getResults().replaceUsesWithIf(clonedOp->getResults(), replace); + valOp->replaceUsesWithIf(clonedOp, replace); } else { auto savedIP = firOpBuilder.getInsertionPoint(); firOpBuilder.setInsertionPointAfter(valOp); @@ -937,9 +948,36 @@ static void genBodyOfTargetOp( firOpBuilder.createTemporary(val.getLoc(), val.getType()); firOpBuilder.createStoreWithConvert(copyVal.getLoc(), val, copyVal); - llvm::SmallVector bounds; + lower::AddrAndBoundsInfo info = lower::getDataOperandBaseAddr( + firOpBuilder, val, /*isOptional=*/false, val.getLoc()); + llvm::SmallVector bounds = + Fortran::lower::genImplicitBoundsOps( + firOpBuilder, info, + hlfir::translateToExtendedValue(val.getLoc(), firOpBuilder, + hlfir::Entity{val}) + .first, + /*dataExvIsAssumedSize=*/false, val.getLoc()); + std::stringstream name; firOpBuilder.setInsertionPoint(targetOp); + + llvm::omp::OpenMPOffloadMappingFlags mapFlag = + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT; + mlir::omp::VariableCaptureKind captureKind = + mlir::omp::VariableCaptureKind::ByRef; + + mlir::Type eleType = copyVal.getType(); + if (auto refType = + mlir::dyn_cast(copyVal.getType())) + eleType = refType.getElementType(); + + if (fir::isa_trivial(eleType) || fir::isa_char(eleType)) { + captureKind = mlir::omp::VariableCaptureKind::ByCopy; + } else if (!fir::isa_builtin_cptr_type(eleType)) { + mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO; + } + mlir::Value mapOp = createMapInfoOp( firOpBuilder, copyVal.getLoc(), copyVal, /*varPtrPtr=*/mlir::Value{}, name.str(), bounds, @@ -947,8 +985,8 @@ static void 
genBodyOfTargetOp( /*membersIndex=*/mlir::ArrayAttr{}, static_cast< std::underlying_type_t>( - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT), - mlir::omp::VariableCaptureKind::ByCopy, copyVal.getType()); + mapFlag), + captureKind, copyVal.getType()); // Get the index of the first non-map argument before modifying mapVars, // then append an element to mapVars and an associated entry block diff --git a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp index ad7b806ae262a..e823443958714 100644 --- a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp +++ b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp @@ -162,13 +162,19 @@ class MapInfoFinalizationPass mlir::Value baseAddrAddr = builder.create( loc, descriptor, fir::BoxFieldAttr::base_addr); + mlir::Type underlyingVarType = + llvm::cast( + fir::unwrapRefType(baseAddrAddr.getType())) + .getElementType(); + if (auto seqType = llvm::dyn_cast(underlyingVarType)) + if (seqType.hasDynamicExtents()) + underlyingVarType = seqType.getEleTy(); + // Member of the descriptor pointing at the allocated data return builder.create( loc, baseAddrAddr.getType(), descriptor, - mlir::TypeAttr::get(llvm::cast( - fir::unwrapRefType(baseAddrAddr.getType())) - .getElementType()), - baseAddrAddr, /*members=*/mlir::SmallVector{}, + mlir::TypeAttr::get(underlyingVarType), baseAddrAddr, + /*members=*/mlir::SmallVector{}, /*membersIndex=*/mlir::ArrayAttr{}, bounds, builder.getIntegerAttr(builder.getIntegerType(64, false), mapType), builder.getAttr( diff --git a/flang/test/Lower/OpenMP/allocatable-array-bounds.f90 b/flang/test/Lower/OpenMP/allocatable-array-bounds.f90 index e162c5a2d6d69..e66b6f17d8858 100644 --- a/flang/test/Lower/OpenMP/allocatable-array-bounds.f90 +++ b/flang/test/Lower/OpenMP/allocatable-array-bounds.f90 @@ -23,7 +23,7 @@ !HOST: %[[BOX_3:.*]]:3 = fir.box_dims %[[LOAD_3]], %[[CONSTANT_3]] : (!fir.box>>, index) -> (index, index, index) !HOST: %[[BOUNDS_1:.*]] = 
omp.map.bounds lower_bound(%[[LB_1]] : index) upper_bound(%[[UB_1]] : index) extent(%[[BOX_3]]#1 : index) stride(%[[BOX_2]]#2 : index) start_idx(%[[BOX_1]]#0 : index) {stride_in_bytes = true} !HOST: %[[VAR_PTR_PTR:.*]] = fir.box_offset %[[DECLARE_1]]#1 base_addr : (!fir.ref>>>) -> !fir.llvm_ptr>> -!HOST: %[[MAP_INFO_MEMBER:.*]] = omp.map.info var_ptr(%[[DECLARE_1]]#1 : !fir.ref>>>, !fir.array) var_ptr_ptr(%[[VAR_PTR_PTR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS_1]]) -> !fir.llvm_ptr>> {name = ""} +!HOST: %[[MAP_INFO_MEMBER:.*]] = omp.map.info var_ptr(%[[DECLARE_1]]#1 : !fir.ref>>>, i32) var_ptr_ptr(%[[VAR_PTR_PTR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS_1]]) -> !fir.llvm_ptr>> {name = ""} !HOST: %[[MAP_INFO_1:.*]] = omp.map.info var_ptr(%[[DECLARE_1]]#1 : !fir.ref>>>, !fir.box>>) map_clauses(to) capture(ByRef) members(%[[MAP_INFO_MEMBER]] : [0] : !fir.llvm_ptr>>) -> !fir.ref>>> {name = "sp_read(2:5)"} !HOST: %[[LOAD_3:.*]] = fir.load %[[DECLARE_2]]#0 : !fir.ref>>> @@ -41,7 +41,7 @@ !HOST: %[[BOX_5:.*]]:3 = fir.box_dims %[[LOAD_5]], %[[CONSTANT_5]] : (!fir.box>>, index) -> (index, index, index) !HOST: %[[BOUNDS_2:.*]] = omp.map.bounds lower_bound(%[[LB_2]] : index) upper_bound(%[[UB_2]] : index) extent(%[[BOX_5]]#1 : index) stride(%[[BOX_4]]#2 : index) start_idx(%[[BOX_3]]#0 : index) {stride_in_bytes = true} !HOST: %[[VAR_PTR_PTR:.*]] = fir.box_offset %[[DECLARE_2]]#1 base_addr : (!fir.ref>>>) -> !fir.llvm_ptr>> -!HOST: %[[MAP_INFO_MEMBER:.*]] = omp.map.info var_ptr(%[[DECLARE_2]]#1 : !fir.ref>>>, !fir.array) var_ptr_ptr(%[[VAR_PTR_PTR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS_2]]) -> !fir.llvm_ptr>> {name = ""} +!HOST: %[[MAP_INFO_MEMBER:.*]] = omp.map.info var_ptr(%[[DECLARE_2]]#1 : !fir.ref>>>, i32) var_ptr_ptr(%[[VAR_PTR_PTR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS_2]]) -> !fir.llvm_ptr>> {name = ""} !HOST: %[[MAP_INFO_2:.*]] = 
omp.map.info var_ptr(%[[DECLARE_2]]#1 : !fir.ref>>>, !fir.box>>) map_clauses(to) capture(ByRef) members(%[[MAP_INFO_MEMBER]] : [0] : !fir.llvm_ptr>>) -> !fir.ref>>> {name = "sp_write(2:5)"} subroutine read_write_section() @@ -80,8 +80,9 @@ module assumed_allocatable_array_routines !HOST: %[[BOX_3:.*]]:3 = fir.box_dims %[[LOAD_3]], %[[CONSTANT_3]] : (!fir.box>>, index) -> (index, index, index) !HOST: %[[BOUNDS:.*]] = omp.map.bounds lower_bound(%[[LB]] : index) upper_bound(%[[UB]] : index) extent(%[[BOX_3]]#1 : index) stride(%[[BOX_2]]#2 : index) start_idx(%[[BOX_1]]#0 : index) {stride_in_bytes = true} !HOST: %[[VAR_PTR_PTR:.*]] = fir.box_offset %[[DECLARE]]#1 base_addr : (!fir.ref>>>) -> !fir.llvm_ptr>> -!HOST: %[[MAP_INFO_MEMBER:.*]] = omp.map.info var_ptr(%[[DECLARE]]#1 : !fir.ref>>>, !fir.array) var_ptr_ptr(%[[VAR_PTR_PTR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {name = ""} +!HOST: %[[MAP_INFO_MEMBER:.*]] = omp.map.info var_ptr(%[[DECLARE]]#1 : !fir.ref>>>, i32) var_ptr_ptr(%[[VAR_PTR_PTR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {name = ""} !HOST: %[[MAP_INFO:.*]] = omp.map.info var_ptr(%[[DECLARE]]#1 : !fir.ref>>>, !fir.box>>) map_clauses(to) capture(ByRef) members(%[[MAP_INFO_MEMBER]] : [0] : !fir.llvm_ptr>>) -> !fir.ref>>> {name = "arr_read_write(2:5)"} + subroutine assumed_shape_array(arr_read_write) integer, allocatable, intent(inout) :: arr_read_write(:) diff --git a/flang/test/Lower/OpenMP/array-bounds.f90 b/flang/test/Lower/OpenMP/array-bounds.f90 index 78fa81567ca54..479b6887a83f4 100644 --- a/flang/test/Lower/OpenMP/array-bounds.f90 +++ b/flang/test/Lower/OpenMP/array-bounds.f90 @@ -51,7 +51,7 @@ module assumed_array_routines !HOST: %[[DIMS1:.*]]:3 = fir.box_dims %[[ARG0_DECL]]#1, %[[C0_1]] : (!fir.box>, index) -> (index, index, index) !HOST: %[[BOUNDS:.*]] = omp.map.bounds lower_bound(%[[C3]] : index) upper_bound(%[[C4]] : index) 
extent(%[[DIMS1]]#1 : index) stride(%[[DIMS0]]#2 : index) start_idx(%[[C0]] : index) {stride_in_bytes = true} !HOST: %[[VAR_PTR_PTR:.*]] = fir.box_offset %0 base_addr : (!fir.ref>>) -> !fir.llvm_ptr>> -!HOST: %[[MAP_INFO_MEMBER:.*]] = omp.map.info var_ptr(%[[INTERMEDIATE_ALLOCA]] : !fir.ref>>, !fir.array) var_ptr_ptr(%[[VAR_PTR_PTR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {name = ""} +!HOST: %[[MAP_INFO_MEMBER:.*]] = omp.map.info var_ptr(%[[INTERMEDIATE_ALLOCA]] : !fir.ref>>, i32) var_ptr_ptr(%[[VAR_PTR_PTR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {name = ""} !HOST: %[[MAP:.*]] = omp.map.info var_ptr(%[[INTERMEDIATE_ALLOCA]] : !fir.ref>>, !fir.box>) map_clauses(to) capture(ByRef) members(%[[MAP_INFO_MEMBER]] : [0] : !fir.llvm_ptr>>) -> !fir.ref> {name = "arr_read_write(2:5)"} !HOST: omp.target map_entries(%[[MAP]] -> %{{.*}}, {{.*}} -> {{.*}}, %[[MAP_INFO_MEMBER]] -> %{{.*}} : !fir.ref>, !fir.ref, !fir.llvm_ptr>>) { subroutine assumed_shape_array(arr_read_write) diff --git a/flang/test/Lower/OpenMP/derived-type-allocatable-map.f90 b/flang/test/Lower/OpenMP/derived-type-allocatable-map.f90 index 47bcf2a7229ea..28a2b9b5b967b 100644 --- a/flang/test/Lower/OpenMP/derived-type-allocatable-map.f90 +++ b/flang/test/Lower/OpenMP/derived-type-allocatable-map.f90 @@ -6,7 +6,7 @@ !CHECK: %[[MEMBER_INDEX:.*]] = arith.constant 4 : index !CHECK: %[[MEMBER_COORD:.*]] = fir.coordinate_of %[[DECLARE]]#0, %[[MEMBER_INDEX]] : (!fir.ref>, index) -> !fir.ref>>> !CHECK: %[[MEMBER_BASE_ADDR:.*]] = fir.box_offset %[[MEMBER_COORD]] base_addr : (!fir.ref>>>) -> !fir.llvm_ptr>> -!CHECK: %[[MAP_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[MEMBER_COORD]] : !fir.ref>>>, !fir.array) var_ptr_ptr(%[[MEMBER_BASE_ADDR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {{.*}} +!CHECK: %[[MAP_MEMBER_BASE_ADDR:.*]] = omp.map.info 
var_ptr(%[[MEMBER_COORD]] : !fir.ref>>>, i32) var_ptr_ptr(%[[MEMBER_BASE_ADDR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {{.*}} !CHECK: %[[MAP_MEMBER_DESCRIPTOR:.*]] = omp.map.info var_ptr(%[[MEMBER_COORD]] : !fir.ref>>>, !fir.box>>) map_clauses(to) capture(ByRef) -> !fir.ref>>> {{.*}} !CHECK: %[[MAP_PARENT:.*]] = omp.map.info var_ptr(%[[DECLARE]]#1 : !fir.ref>, !fir.type<[[ONE_LAYER_TY]]>) map_clauses(tofrom) capture(ByRef) members(%[[MAP_MEMBER_DESCRIPTOR]], %[[MAP_MEMBER_BASE_ADDR]] : [4], [4, 0] : !fir.ref>>>, !fir.llvm_ptr>>) -> !fir.ref> {{{.*}} partial_map = true} !CHECK: omp.target map_entries(%[[MAP_PARENT]] -> %[[ARG0:.*]], %[[MAP_MEMBER_DESCRIPTOR]] -> %[[ARG1:.*]], %[[MAP_MEMBER_BASE_ADDR]] -> %[[ARG2:.*]] : !fir.ref>, !fir.ref>>>, !fir.llvm_ptr>>) { @@ -37,7 +37,7 @@ subroutine dtype_alloca_map_op_block() !CHECK: %[[MEMBER_INDEX:.*]] = arith.constant 4 : index !CHECK: %[[MEMBER_COORD:.*]] = fir.coordinate_of %[[LOAD_DTYPE]], %[[MEMBER_INDEX]] : (!fir.box>>, index) -> !fir.ref>>> !CHECK: %[[MEMBER_BASE_ADDR:.*]] = fir.box_offset %[[MEMBER_COORD]] base_addr : (!fir.ref>>>) -> !fir.llvm_ptr>> -!CHECK: %[[MAP_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[MEMBER_COORD]] : !fir.ref>>>, !fir.array) var_ptr_ptr(%[[MEMBER_BASE_ADDR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {{.*}} +!CHECK: %[[MAP_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[MEMBER_COORD]] : !fir.ref>>>, i32) var_ptr_ptr(%[[MEMBER_BASE_ADDR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {{.*}} !CHECK: %[[MAP_MEMBER_DESC:.*]] = omp.map.info var_ptr(%[[MEMBER_COORD]] : !fir.ref>>>, !fir.box>>) map_clauses(to) capture(ByRef) -> !fir.ref>>> {{.*}} !CHECK: %[[LOAD_DTYPE:.*]] = fir.load %[[DECLARE]]#0 : !fir.ref>>> !CHECK: %[[MEMBER_COORD:.*]] = arith.constant 5 : index @@ -78,7 +78,7 @@ subroutine alloca_dtype_op_block_add() !CHECK: 
%[[NESTED_MEMBER_INDEX:.*]] = arith.constant 2 : index !CHECK: %[[NESTED_MEMBER_COORD:.*]] = fir.coordinate_of %[[NESTED_DTYPE_COORD]], %[[NESTED_MEMBER_INDEX]] : (!fir.ref>, index) -> !fir.ref>>> !CHECK: %[[NESTED_MEMBER_BASE_ADDR:.*]] = fir.box_offset %[[NESTED_MEMBER_COORD]] base_addr : (!fir.ref>>>) -> !fir.llvm_ptr>> -!CHECK: %[[MAP_NESTED_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[NESTED_MEMBER_COORD]] : !fir.ref>>>, !fir.array) var_ptr_ptr(%[[NESTED_MEMBER_BASE_ADDR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {{.*}} +!CHECK: %[[MAP_NESTED_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[NESTED_MEMBER_COORD]] : !fir.ref>>>, i32) var_ptr_ptr(%[[NESTED_MEMBER_BASE_ADDR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {{.*}} !CHECK: %[[MAP_NESTED_MEMBER_COORD:.*]] = omp.map.info var_ptr(%[[NESTED_MEMBER_COORD]] : !fir.ref>>>, !fir.box>>) map_clauses(to) capture(ByRef) -> !fir.ref>>> {{.*}} !CHECK: %[[LOAD:.*]] = fir.load %[[DECLARE]]#0 : !fir.ref}>>>> !CHECK: %[[NESTED_DTYPE_INDEX:.*]] = arith.constant 6 : index @@ -128,7 +128,7 @@ subroutine alloca_nest_dype_map_op_block_add() !CHECK: %[[NESTED_MEMBER_INDEX:.*]] = arith.constant 2 : index !CHECK: %[[NESTED_MEMBER_COORD:.*]] = fir.coordinate_of %[[NESTED_DTYPE_COORD]], %[[NESTED_MEMBER_INDEX]] : (!fir.ref>, index) -> !fir.ref>>> !CHECK: %[[NESTED_MEMBER_BASE_ADDR:.*]] = fir.box_offset %[[NESTED_MEMBER_COORD]] base_addr : (!fir.ref>>>) -> !fir.llvm_ptr>> -!CHECK: %[[MAP_NESTED_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[NESTED_MEMBER_COORD]] : !fir.ref>>>, !fir.array) var_ptr_ptr(%[[NESTED_MEMBER_BASE_ADDR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {{.*}} +!CHECK: %[[MAP_NESTED_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[NESTED_MEMBER_COORD]] : !fir.ref>>>, i32) var_ptr_ptr(%[[NESTED_MEMBER_BASE_ADDR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) 
bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {{.*}} !CHECK: %[[MAP_NESTED_MEMBER_DESC:.*]] = omp.map.info var_ptr(%[[NESTED_MEMBER_COORD]] : !fir.ref>>>, !fir.box>>) map_clauses(to) capture(ByRef) -> !fir.ref>>> {{.*}} !CHECK: %[[MAP_PARENT:.*]] = omp.map.info var_ptr(%[[DECLARE]]#1 : !fir.ref>, !fir.type<[[REC_TY]]>) map_clauses(tofrom) capture(ByRef) members(%[[MAP_NESTED_MEMBER_DESC]], %[[MAP_NESTED_MEMBER_BASE_ADDR]] : [6, 2], [6, 2, 0] : !fir.ref>>>, !fir.llvm_ptr>>) -> !fir.ref> {{.*}} !CHECK: omp.target map_entries(%[[MAP_PARENT]] -> %[[ARG0:.*]], %[[MAP_NESTED_MEMBER_DESC]] -> %[[ARG1:.*]], %[[MAP_NESTED_MEMBER_BASE_ADDR]] -> %[[ARG2:.*]] : !fir.ref>, !fir.ref>>>, !fir.llvm_ptr>>) { diff --git a/flang/test/Lower/OpenMP/local-intrinsic-sized-array-map.f90 b/flang/test/Lower/OpenMP/local-intrinsic-sized-array-map.f90 new file mode 100644 index 0000000000000..ab2cdf380b783 --- /dev/null +++ b/flang/test/Lower/OpenMP/local-intrinsic-sized-array-map.f90 @@ -0,0 +1,32 @@ +!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s --check-prefixes="HLFIRDIALECT" + +!HLFIRDIALECT: func.func @_QPlocal_variable_intrinsic_size(%[[ARG0:.*]]: !fir.box> {fir.bindc_name = "a"}) { +!HLFIRDIALECT: %[[SZ_DATA:.*]] = fir.alloca index +!HLFIRDIALECT: %[[DECLARE:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope {{.*}} {uniq_name = "_QFlocal_variable_intrinsic_sizeEa"} : (!fir.box>, !fir.dscope) -> (!fir.box>, !fir.box>) +!HLFIRDIALECT: %[[DIMENSIONS:.*]]:3 = fir.box_dims %[[DECLARE]]#0, %{{.*}} : (!fir.box>, index) -> (index, index, index) +!HLFIRDIALECT: fir.store %[[DIMENSIONS]]#1 to %[[SZ_DATA]] : !fir.ref +!HLFIRDIALECT: %[[SIZE_SEL:.*]] = arith.select {{.*}}, {{.*}}, {{.*}} : index +!HLFIRDIALECT: %[[B_ALLOCA:.*]] = fir.alloca !fir.array, %[[SIZE_SEL]] {bindc_name = "b", uniq_name = "_QFlocal_variable_intrinsic_sizeEb"} +!HLFIRDIALECT: %[[B_SHAPE:.*]] = fir.shape %[[SIZE_SEL]] : (index) -> !fir.shape<1> +!HLFIRDIALECT: %[[B_DECLARE:.*]]:2 = hlfir.declare %[[B_ALLOCA]](%[[B_SHAPE]]) 
{uniq_name = "_QFlocal_variable_intrinsic_sizeEb"} : (!fir.ref>, !fir.shape<1>) -> (!fir.box>, !fir.ref>) +!HLFIRDIALECT: %[[BOUNDS:.*]] = omp.map.bounds lower_bound({{.*}} : index) upper_bound({{.*}} : index) extent({{.*}} : index) stride({{.*}} : index) start_idx({{.*}} : index) {stride_in_bytes = true} +!HLFIRDIALECT: %[[MAP_DATA_B:.*]] = omp.map.info var_ptr(%[[B_DECLARE]]#1 : !fir.ref>, f32) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "b"} +!HLFIRDIALECT: %[[MAP_DATA_SZ:.*]] = omp.map.info var_ptr(%[[SZ_DATA]] : !fir.ref, index) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !fir.ref {name = ""} +!HLFIRDIALECT: omp.target map_entries(%[[MAP_DATA_B]] -> %[[ARG1:.*]], %[[MAP_DATA_SZ]] -> %[[ARG2:.*]] : !fir.ref>, !fir.ref) { +!HLFIRDIALECT: %[[SZ_LD:.*]] = fir.load %[[ARG2]] : !fir.ref +!HLFIRDIALECT: %[[SZ_CONV:.*]] = fir.convert %[[SZ_LD]] : (index) -> i64 +!HLFIRDIALECT: %[[SZ_CONV2:.*]] = fir.convert %[[SZ_CONV]] : (i64) -> index +!HLFIRDIALECT: %[[SEL_SZ:.*]] = arith.cmpi sgt, %[[SZ_CONV2]], %{{.*}} : index +!HLFIRDIALECT: %[[SEL_SZ2:.*]] = arith.select %[[SEL_SZ]], %[[SZ_CONV2]], %{{.*}} : index +!HLFIRDIALECT: %[[SHAPE:.*]] = fir.shape %[[SEL_SZ2]] : (index) -> !fir.shape<1> +!HLFIRDIALECT: %{{.*}} = hlfir.declare %[[ARG1]](%[[SHAPE]]) {uniq_name = "_QFlocal_variable_intrinsic_sizeEb"} : (!fir.ref>, !fir.shape<1>) -> (!fir.box>, !fir.ref>) + +subroutine local_variable_intrinsic_size(a) + implicit none + real, dimension(:) :: a + real, dimension(size(a, 1)) :: b + +!$omp target map(tofrom: b) + b(5) = 5 +!$omp end target +end subroutine diff --git a/flang/test/Transforms/omp-map-info-finalization.fir b/flang/test/Transforms/omp-map-info-finalization.fir index 74b87152d5b05..19e6dcad068cd 100644 --- a/flang/test/Transforms/omp-map-info-finalization.fir +++ b/flang/test/Transforms/omp-map-info-finalization.fir @@ -35,7 +35,7 @@ func.func @test_descriptor_expansion_pass(%arg0: !fir.box>) { // CHECK: 
%[[DESC_PARENT_MAP:.*]] = omp.map.info var_ptr(%[[DECLARE2]]#1 : !fir.ref>>, !fir.box>) map_clauses(to) capture(ByRef) members(%[[DESC_MEMBER_MAP]] : [0] : !fir.llvm_ptr>) -> !fir.ref>> // CHECK: fir.store %[[DECLARE1]]#1 to %[[ALLOCA]] : !fir.ref>> // CHECK: %[[BASE_ADDR_OFF_2:.*]] = fir.box_offset %[[ALLOCA]] base_addr : (!fir.ref>>) -> !fir.llvm_ptr>> -// CHECK: %[[DESC_MEMBER_MAP_2:.*]] = omp.map.info var_ptr(%[[ALLOCA]] : !fir.ref>>, !fir.array) var_ptr_ptr(%[[BASE_ADDR_OFF_2]] : !fir.llvm_ptr>>) map_clauses(from) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {name = ""} +// CHECK: %[[DESC_MEMBER_MAP_2:.*]] = omp.map.info var_ptr(%[[ALLOCA]] : !fir.ref>>, i32) var_ptr_ptr(%[[BASE_ADDR_OFF_2]] : !fir.llvm_ptr>>) map_clauses(from) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {name = ""} // CHECK: %[[DESC_PARENT_MAP_2:.*]] = omp.map.info var_ptr(%[[ALLOCA]] : !fir.ref>>, !fir.box>) map_clauses(to) capture(ByRef) members(%[[DESC_MEMBER_MAP_2]] : [0] : !fir.llvm_ptr>>) -> !fir.ref> // CHECK: omp.target map_entries(%[[DESC_PARENT_MAP]] -> %[[ARG1:.*]], %[[DESC_PARENT_MAP_2]] -> %[[ARG2:.*]], %[[DESC_MEMBER_MAP]] -> %[[ARG3:.*]], %[[DESC_MEMBER_MAP_2]] -> %[[ARG4:.*]] : {{.*}}) { @@ -115,7 +115,7 @@ func.func @dtype_alloca_op_block_add(%arg0: !fir.ref>, index) -> !fir.ref>>> // CHECK: %[[MEMBER_BASE_ADDR:.*]] = fir.box_offset %[[MEMBER_COORD:.*]] base_addr : (!fir.ref>>>) -> !fir.llvm_ptr>> -// CHECK: %[[MAP_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[MEMBER_COORD]] : !fir.ref>>>, !fir.array) var_ptr_ptr(%[[MEMBER_BASE_ADDR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {{.*}} +// CHECK: %[[MAP_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[MEMBER_COORD]] : !fir.ref>>>, i32) var_ptr_ptr(%[[MEMBER_BASE_ADDR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {{.*}} // CHECK: %[[MAP_MEMBER_DESCRIPTOR:.*]] = omp.map.info var_ptr(%[[MEMBER_COORD]] : 
!fir.ref>>>, !fir.box>>) map_clauses(to) capture(ByRef) -> !fir.ref>>> {name = "one_l%array_j"} // CHECK: %[[MAP_MEMBER_PARENT:.*]] = omp.map.info var_ptr(%[[ALLOCA]]#0 : !fir.ref<[[REC_TY]]>>, [[REC_TY]]>) map_clauses(tofrom) capture(ByRef) members(%10, %9 : [4], [4, 0] : !fir.ref>>>, !fir.llvm_ptr>>) -> !fir.ref<[[REC_TY]]>> {{.*}} // CHECK: omp.target map_entries(%[[MAP_MEMBER_PARENT]] -> %[[ARG1:.*]], %[[MAP_MEMBER_DESCRIPTOR]] -> %[[ARG2:.*]], %[[MAP_MEMBER_BASE_ADDR]] -> %[[ARG3:.*]] : !fir.ref<[[REC_TY]]>>, !fir.ref>>>, !fir.llvm_ptr>>) { @@ -157,7 +157,7 @@ func.func @alloca_dtype_map_op_block_add(%arg0 : !fir.ref>>> // CHECK: %[[ALLOCATABLE_MEMBER_COORD:.*]] = fir.coordinate_of %[[LOAD_ALLOCA]], %{{.*}} : (!fir.box>>, index) -> !fir.ref>>> // CHECK: %[[ALLOCATABLE_MEMBER_BASE_ADDR:.*]] = fir.box_offset %[[ALLOCATABLE_MEMBER_COORD]] base_addr : (!fir.ref>>>) -> !fir.llvm_ptr>> -// CHECK: %[[MAP_ALLOCA_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[ALLOCATABLE_MEMBER_COORD]] : !fir.ref>>>, !fir.array) var_ptr_ptr(%[[ALLOCATABLE_MEMBER_BASE_ADDR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {{.*}} +// CHECK: %[[MAP_ALLOCA_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[ALLOCATABLE_MEMBER_COORD]] : !fir.ref>>>, i32) var_ptr_ptr(%[[ALLOCATABLE_MEMBER_BASE_ADDR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {{.*}} // CHECK: %[[MAP_ALLOCA_MEMBER_DESCRIPTOR:.*]] = omp.map.info var_ptr(%[[ALLOCATABLE_MEMBER_COORD]] : !fir.ref>>>, !fir.box>>) map_clauses(to) capture(ByRef) -> !fir.ref>>> {{.*}} // CHECK: %[[LOAD_ALLOCA2:.*]] = fir.load %[[ALLOCA]]#0 : !fir.ref>>> // CHECK: %[[REGULAR_MEMBER_COORD:.*]] = fir.coordinate_of %[[LOAD_ALLOCA2]], %{{.*}} : (!fir.box>>, index) -> !fir.ref @@ -208,7 +208,7 @@ func.func @alloca_dtype_map_op_block_add(%arg0 : !fir.ref>>, index) -> !fir.ref,array_k:!fir.box>>,k:i32}]]>> // CHECK: %[[NESTED_ALLOCA_MEMBER:.*]] = fir.coordinate_of 
%[[INTERMEDIATE_DTYPE_NESTED_MEMBER]], %{{.*}} : (!fir.ref>, index) -> !fir.ref>>> // CHECK: %[[NESTED_ALLOCA_MEMBER_BASE_ADDR:.*]] = fir.box_offset %[[NESTED_ALLOCA_MEMBER]] base_addr : (!fir.ref>>>) -> !fir.llvm_ptr>> -// CHECK: %[[MAP_NESTED_ALLOCA_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[NESTED_ALLOCA_MEMBER]] : !fir.ref>>>, !fir.array) var_ptr_ptr(%[[NESTED_ALLOCA_MEMBER_BASE_ADDR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {{.*}} +// CHECK: %[[MAP_NESTED_ALLOCA_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[NESTED_ALLOCA_MEMBER]] : !fir.ref>>>, i32) var_ptr_ptr(%[[NESTED_ALLOCA_MEMBER_BASE_ADDR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {{.*}} // CHECK: %[[MAP_NESTED_ALLOCA_MEMBER:.*]] = omp.map.info var_ptr(%[[NESTED_ALLOCA_MEMBER]] : !fir.ref>>>, !fir.box>>) map_clauses(to) capture(ByRef) -> !fir.ref>>> {{.*}} // CHECK: %[[ALLOCA_LOAD2:.*]] = fir.load %[[ALLOCA]]#0 : !fir.ref>>> // CHECK: %[[INTERMEDIATE_DTYPE_NESTED_MEMBER2:.*]] = fir.coordinate_of %[[ALLOCA_LOAD2]], %{{.*}} : (!fir.box>>, index) -> !fir.ref> @@ -252,7 +252,7 @@ func.func @alloca_dtype_map_op_block_add(%arg0 : !fir.ref>, index) -> !fir.ref,array_k:!fir.box>>,k:i32}]]>> // CHECK: %[[ALLOCATABLE_MEMBER:.*]] = fir.coordinate_of %[[NESTED_DTYPE_COORD]], %{{.*}} : (!fir.ref>, index) -> !fir.ref>>> // CHECK: %[[ALLOCATABLE_MEMBER_BASE_ADDR:.*]] = fir.box_offset %[[ALLOCATABLE_MEMBER]] base_addr : (!fir.ref>>>) -> !fir.llvm_ptr>> -// CHECK: %[[MAP_ALLOCATABLE_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[ALLOCATABLE_MEMBER]] : !fir.ref>>>, !fir.array) var_ptr_ptr(%[[ALLOCATABLE_MEMBER_BASE_ADDR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {{.*}} +// CHECK: %[[MAP_ALLOCATABLE_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[ALLOCATABLE_MEMBER]] : !fir.ref>>>, i32) var_ptr_ptr(%[[ALLOCATABLE_MEMBER_BASE_ADDR]] : !fir.llvm_ptr>>) 
map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {{.*}} // CHECK: %[[MAP_ALLOCATABLE_MEMBER_DESCRIPTOR:.*]] = omp.map.info var_ptr(%[[ALLOCATABLE_MEMBER]] : !fir.ref>>>, !fir.box>>) map_clauses(to) capture(ByRef) -> !fir.ref>>> {{.*}} // CHECK: %[[MAP_PARENT:.*]] = omp.map.info var_ptr(%[[ALLOCA]]#0 : !fir.ref>, !fir.type<[[REC_TY]]>) map_clauses(tofrom) capture(ByRef) members(%12, %11 : [6, 2], [6, 2, 0] : !fir.ref>>>, !fir.llvm_ptr>>) -> !fir.ref> {{.*}} // CHECK: omp.target map_entries(%[[MAP_PARENT]] -> %[[ARG1:.*]], %[[MAP_ALLOCATABLE_MEMBER_DESCRIPTOR]] -> %[[ARG2:.*]], %[[MAP_ALLOCATABLE_MEMBER_BASE_ADDR]] -> %[[ARG3:.*]] : !fir.ref>, !fir.ref>>>, !fir.llvm_ptr>>) { @@ -286,13 +286,13 @@ func.func @alloca_dtype_map_op_block_add(%arg0 : !fir.ref>) -> (!fir.ref>, !fir.ref>) // CHECK: %[[DESC_1:.*]] = fir.coordinate_of %[[DECLARE]]#0, %{{.*}} : (!fir.ref>, index) -> !fir.ref>>,vertexy:!fir.box>>}]]>>>>> // CHECK: %[[BASE_ADDR_1:.*]] = fir.box_offset %[[DESC_1]] base_addr : (!fir.ref>>>>) -> !fir.llvm_ptr>>> -// CHECK: %[[BASE_ADDR_MAP_1:.*]] = omp.map.info var_ptr(%[[DESC_1]] : !fir.ref>>>>, !fir.array>) var_ptr_ptr(%[[BASE_ADDR_1]] : !fir.llvm_ptr>>>) map_clauses(exit_release_or_enter_alloc) capture(ByRef) bounds(%{{.*}}) -> !fir.llvm_ptr>>> {{.*}} +// CHECK: %[[BASE_ADDR_MAP_1:.*]] = omp.map.info var_ptr(%[[DESC_1]] : !fir.ref>>>>, !fir.type<[[REC_TY2]]>) var_ptr_ptr(%[[BASE_ADDR_1]] : !fir.llvm_ptr>>>) map_clauses(exit_release_or_enter_alloc) capture(ByRef) bounds(%{{.*}}) -> !fir.llvm_ptr>>> {{.*}} // CHECK: %[[DESC_MAP_1:.*]] = omp.map.info var_ptr(%[[DESC_1]] : !fir.ref>>>>, !fir.box>>>) map_clauses(to) capture(ByRef) -> !fir.ref>>>> {{.*}} // CHECK: %[[DESC_LD_1:.*]] = fir.load %[[DESC_1]] : !fir.ref>>>> // CHECK: %[[MEMBER_ACCESS_1:.*]] = fir.coordinate_of %[[DESC_LD_1]], %{{.*}} : (!fir.box>>>, index) -> !fir.ref> // CHECK: %[[DESC_2:.*]] = fir.coordinate_of %[[MEMBER_ACCESS_1]], %{{.*}} : (!fir.ref>, index) -> !fir.ref>>> // 
CHECK: %[[BASE_ADDR_2:.*]] = fir.box_offset %[[DESC_2]] base_addr : (!fir.ref>>>) -> !fir.llvm_ptr>> -// CHECK: %[[BASE_ADDR_MAP_2:.*]] = omp.map.info var_ptr(%[[DESC_2]] : !fir.ref>>>, !fir.array) var_ptr_ptr(%[[BASE_ADDR_2]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%{{.*}}) -> !fir.llvm_ptr>> {{.*}} +// CHECK: %[[BASE_ADDR_MAP_2:.*]] = omp.map.info var_ptr(%[[DESC_2]] : !fir.ref>>>, i32) var_ptr_ptr(%[[BASE_ADDR_2]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%{{.*}}) -> !fir.llvm_ptr>> {{.*}} // CHECK: %[[DESC_MAP_2:.*]] = omp.map.info var_ptr(%[[DESC_2]] : !fir.ref>>>, !fir.box>>) map_clauses(to) capture(ByRef) -> !fir.ref>>> {{.*}} // CHECK: %[[TOP_PARENT_MAP:.*]] = omp.map.info var_ptr(%0#1 : !fir.ref>, !fir.type<[[REC_TY]]>) map_clauses(exit_release_or_enter_alloc) capture(ByRef) members(%6, %5, %14, %13 : [1], [1, 0], [1, 0, 2], [1, 0, 2, 0] : !fir.ref>>>>, !fir.llvm_ptr>>>, !fir.ref>>>, !fir.llvm_ptr>>) -> !fir.ref> {{{.*}} partial_map = true} // CHECK: omp.target map_entries(%[[TOP_PARENT_MAP]] -> %{{.*}}, %[[DESC_MAP_1]] -> %{{.*}}, %[[BASE_ADDR_MAP_1]] -> %{{.*}}, %[[DESC_MAP_2]] -> %{{.*}}, %[[BASE_ADDR_MAP_2]] -> %{{.*}} : !fir.ref>, !fir.ref>>>>, !fir.llvm_ptr>>>, !fir.ref>>>, !fir.llvm_ptr>>) { diff --git a/offload/test/offloading/fortran/target-map-local-intrinisc-sized-param.f90 b/offload/test/offloading/fortran/target-map-local-intrinisc-sized-param.f90 new file mode 100644 index 0000000000000..b4fded7b3c70a --- /dev/null +++ b/offload/test/offloading/fortran/target-map-local-intrinisc-sized-param.f90 @@ -0,0 +1,39 @@ +! Offloading test checking interaction of an local array +! sized utilising an input parameter and the size intrinsic +! when being mapped to device. +! REQUIRES: flang, amdgpu + +! 
RUN: %libomptarget-compile-fortran-run-and-check-generic +module mod + use iso_fortran_env, only: real64 + implicit none +contains + subroutine test(a) + implicit none + integer :: i + real(kind=real64), dimension(:) :: a + real(kind=real64), dimension(size(a, 1)) :: b + +!$omp target map(tofrom: b) + do i = 1, 10 + b(i) = i + end do +!$omp end target + + print *, b + end subroutine +end module mod + +program main + use mod + real(kind=real64), allocatable :: a(:) + allocate(a(10)) + + do i = 1, 10 + a(i) = i + end do + + call test(a) +end program main + +!CHECK: 1. 2. 3. 4. 5. 6. 7. 8. 9. 10. From b9482ceb97f7cf7cde707dd81a0149dc9958ae53 Mon Sep 17 00:00:00 2001 From: Slava Zakharin Date: Fri, 3 Jan 2025 08:17:52 -0800 Subject: [PATCH 365/567] [flang] Improve designate/elemental indices match in opt-bufferization. (#121371) This pattern appears in `tonto`: `rys1%w = rys1%w * ...`, where component `w` is a pointer. Due to the computations transforming the elemental's one-based indices to the array indices, the indices match check did not pass in opt-bufferization. This patch recognizes this indices adjusting pattern, and returns the one-based indices for the designator. 
--- .../Transforms/OptimizedBufferization.cpp | 76 ++++++++++++++++++- .../opt-bufferization-same-ptr-elemental.fir | 69 +++++++++++++++++ 2 files changed, 144 insertions(+), 1 deletion(-) create mode 100644 flang/test/HLFIR/opt-bufferization-same-ptr-elemental.fir diff --git a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp index bf3cf861e46f4..bfaabed013678 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp @@ -87,6 +87,13 @@ class ElementalAssignBufferization /// determines if the transformation can be applied to this elemental static std::optional findMatch(hlfir::ElementalOp elemental); + /// Returns the array indices for the given hlfir.designate. + /// It recognizes the computations used to transform the one-based indices + /// into the array's lb-based indices, and returns the one-based indices + /// in these cases. + static llvm::SmallVector + getDesignatorIndices(hlfir::DesignateOp designate); + public: using mlir::OpRewritePattern::OpRewritePattern; @@ -430,6 +437,73 @@ bool ArraySectionAnalyzer::isLess(mlir::Value v1, mlir::Value v2) { return false; } +llvm::SmallVector +ElementalAssignBufferization::getDesignatorIndices( + hlfir::DesignateOp designate) { + mlir::Value memref = designate.getMemref(); + + // If the object is a box, then the indices may be adjusted + // according to the box's lower bound(s). Scan through + // the computations to try to find the one-based indices. + if (mlir::isa(memref.getType())) { + // Look for the following pattern: + // %13 = fir.load %12 : !fir.ref + // %14:3 = fir.box_dims %13, %c0 : (!fir.box<...>, index) -> ... + // %17 = arith.subi %14#0, %c1 : index + // %18 = arith.addi %arg2, %17 : index + // %19 = hlfir.designate %13 (%18) : (!fir.box<...>, index) -> ... + // + // %arg2 is a one-based index. 
+ + auto isNormalizedLb = [memref](mlir::Value v, unsigned dim) { + // Return true, if v and dim are such that: + // %14:3 = fir.box_dims %13, %dim : (!fir.box<...>, index) -> ... + // %17 = arith.subi %14#0, %c1 : index + // %19 = hlfir.designate %13 (...) : (!fir.box<...>, index) -> ... + if (auto subOp = + mlir::dyn_cast_or_null(v.getDefiningOp())) { + auto cst = fir::getIntIfConstant(subOp.getRhs()); + if (!cst || *cst != 1) + return false; + if (auto dimsOp = mlir::dyn_cast_or_null( + subOp.getLhs().getDefiningOp())) { + if (memref != dimsOp.getVal() || + dimsOp.getResult(0) != subOp.getLhs()) + return false; + auto dimsOpDim = fir::getIntIfConstant(dimsOp.getDim()); + return dimsOpDim && dimsOpDim == dim; + } + } + return false; + }; + + llvm::SmallVector newIndices; + for (auto index : llvm::enumerate(designate.getIndices())) { + if (auto addOp = mlir::dyn_cast_or_null( + index.value().getDefiningOp())) { + for (unsigned opNum = 0; opNum < 2; ++opNum) + if (isNormalizedLb(addOp->getOperand(opNum), index.index())) { + newIndices.push_back(addOp->getOperand((opNum + 1) % 2)); + break; + } + + // If new one-based index was not added, exit early. + if (newIndices.size() <= index.index()) + break; + } + } + + // If any of the indices is not adjusted to the array's lb, + // then return the original designator indices. 
+ if (newIndices.size() != designate.getIndices().size()) + return designate.getIndices(); + + return newIndices; + } + + return designate.getIndices(); +} + std::optional ElementalAssignBufferization::findMatch(hlfir::ElementalOp elemental) { mlir::Operation::user_range users = elemental->getUsers(); @@ -557,7 +631,7 @@ ElementalAssignBufferization::findMatch(hlfir::ElementalOp elemental) { << " at " << elemental.getLoc() << "\n"); return std::nullopt; } - auto indices = designate.getIndices(); + auto indices = getDesignatorIndices(designate); auto elementalIndices = elemental.getIndices(); if (indices.size() == elementalIndices.size() && std::equal(indices.begin(), indices.end(), elementalIndices.begin(), diff --git a/flang/test/HLFIR/opt-bufferization-same-ptr-elemental.fir b/flang/test/HLFIR/opt-bufferization-same-ptr-elemental.fir new file mode 100644 index 0000000000000..ae91930d44eb1 --- /dev/null +++ b/flang/test/HLFIR/opt-bufferization-same-ptr-elemental.fir @@ -0,0 +1,69 @@ +// RUN: fir-opt --opt-bufferization %s | FileCheck %s + +// Verify that the hlfir.assign of hlfir.elemental is optimized +// into element-per-element assignment: +// subroutine test1(p) +// real, pointer :: p(:) +// p = p + 1.0 +// end subroutine test1 + +func.func @_QPtest1(%arg0: !fir.ref>>> {fir.bindc_name = "p"}) { + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 1.000000e+00 : f32 + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest1Ep"} : (!fir.ref>>>, !fir.dscope) -> (!fir.ref>>>, !fir.ref>>>) + %2 = fir.load %1#0 : !fir.ref>>> + %3:3 = fir.box_dims %2, %c0 : (!fir.box>>, index) -> (index, index, index) + %4 = fir.shape %3#1 : (index) -> !fir.shape<1> + %5 = hlfir.elemental %4 unordered : (!fir.shape<1>) -> !hlfir.expr { + ^bb0(%arg1: index): + %6 = arith.subi %3#0, %c1 : index + %7 = arith.addi %arg1, %6 : index + %8 = hlfir.designate %2 (%7) : 
(!fir.box>>, index) -> !fir.ref + %9 = fir.load %8 : !fir.ref + %10 = arith.addf %9, %cst fastmath : f32 + hlfir.yield_element %10 : f32 + } + hlfir.assign %5 to %2 : !hlfir.expr, !fir.box>> + hlfir.destroy %5 : !hlfir.expr + return +} +// CHECK-LABEL: func.func @_QPtest1( +// CHECK-NOT: hlfir.assign +// CHECK: hlfir.assign %{{.*}} to %{{.*}} : f32, !fir.ref +// CHECK-NOT: hlfir.assign + +// subroutine test2(p) +// real, pointer :: p(:,:) +// p = p + 1.0 +// end subroutine test2 +func.func @_QPtest2(%arg0: !fir.ref>>> {fir.bindc_name = "p"}) { + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 1.000000e+00 : f32 + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest2Ep"} : (!fir.ref>>>, !fir.dscope) -> (!fir.ref>>>, !fir.ref>>>) + %2 = fir.load %1#0 : !fir.ref>>> + %3:3 = fir.box_dims %2, %c0 : (!fir.box>>, index) -> (index, index, index) + %4:3 = fir.box_dims %2, %c1 : (!fir.box>>, index) -> (index, index, index) + %5 = fir.shape %3#1, %4#1 : (index, index) -> !fir.shape<2> + %6 = hlfir.elemental %5 unordered : (!fir.shape<2>) -> !hlfir.expr { + ^bb0(%arg1: index, %arg2: index): + %7 = arith.subi %3#0, %c1 : index + %8 = arith.addi %arg1, %7 : index + %9 = arith.subi %4#0, %c1 : index + %10 = arith.addi %arg2, %9 : index + %11 = hlfir.designate %2 (%8, %10) : (!fir.box>>, index, index) -> !fir.ref + %12 = fir.load %11 : !fir.ref + %13 = arith.addf %12, %cst fastmath : f32 + hlfir.yield_element %13 : f32 + } + hlfir.assign %6 to %2 : !hlfir.expr, !fir.box>> + hlfir.destroy %6 : !hlfir.expr + return +} +// CHECK-LABEL: func.func @_QPtest2( +// CHECK-NOT: hlfir.assign +// CHECK: hlfir.assign %{{.*}} to %{{.*}} : f32, !fir.ref +// CHECK-NOT: hlfir.assign From 3c700d131a35ce4b0063a4688dce4a0cb739ca83 Mon Sep 17 00:00:00 2001 From: Slava Zakharin Date: Fri, 3 Jan 2025 08:33:14 -0800 Subject: [PATCH 366/567] [flang] Extract hlfir.assign inlining from 
opt-bufferization. (#121544) Optimized bufferization can transform hlfir.assign into a loop nest doing element per element assignment, but it avoids doing so for RHS that is hlfir.expr. This is done to let ElementalAssignBufferization pattern to try to do a better job. This patch moves the hlfir.assign inlining after opt-bufferization, and enables it for hlfir.expr RHS. The hlfir.expr RHS cases are present in tonto, and this patch results in some nice improvements. Note that those cases are handled by other compilers also using array temporaries, so this patch seems to just get rid of the Assign runtime overhead/inefficiency. --- flang/include/flang/Optimizer/HLFIR/Passes.td | 4 + .../Optimizer/HLFIR/Transforms/CMakeLists.txt | 1 + .../HLFIR/Transforms/InlineHLFIRAssign.cpp | 152 ++++++++++++++++++ .../Transforms/OptimizedBufferization.cpp | 109 +------------ flang/lib/Optimizer/Passes/Pipelines.cpp | 2 + flang/test/Driver/mlir-pass-pipeline.f90 | 4 + flang/test/Fir/basic-program.fir | 4 + ...ble-assign.fir => inline-hlfir-assign.fir} | 57 ++++++- flang/test/HLFIR/maxloc-elemental.fir | 8 +- flang/test/HLFIR/minloc-elemental.fir | 16 +- .../HLFIR/opt-bufferization-eval_in_mem.fir | 7 +- flang/test/HLFIR/opt-bufferization.fir | 42 ----- 12 files changed, 228 insertions(+), 178 deletions(-) create mode 100644 flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp rename flang/test/HLFIR/{opt-variable-assign.fir => inline-hlfir-assign.fir} (84%) diff --git a/flang/include/flang/Optimizer/HLFIR/Passes.td b/flang/include/flang/Optimizer/HLFIR/Passes.td index ed49f5093c965..644f1e3c3af2b 100644 --- a/flang/include/flang/Optimizer/HLFIR/Passes.td +++ b/flang/include/flang/Optimizer/HLFIR/Passes.td @@ -49,4 +49,8 @@ def InlineElementals : Pass<"inline-elementals"> { let summary = "Inline chained hlfir.elemental operations"; } +def InlineHLFIRAssign : Pass<"inline-hlfir-assign"> { + let summary = "Inline hlfir.assign operations"; +} + #endif 
//FORTRAN_DIALECT_HLFIR_PASSES diff --git a/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt b/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt index d18df2ef49f10..25a532204dd05 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt +++ b/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt @@ -4,6 +4,7 @@ add_flang_library(HLFIRTransforms BufferizeHLFIR.cpp ConvertToFIR.cpp InlineElementals.cpp + InlineHLFIRAssign.cpp LowerHLFIRIntrinsics.cpp LowerHLFIROrderedAssignments.cpp ScheduleOrderedAssignments.cpp diff --git a/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp new file mode 100644 index 0000000000000..249976d5509b0 --- /dev/null +++ b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp @@ -0,0 +1,152 @@ +//===- InlineHLFIRAssign.cpp - Inline hlfir.assign ops --------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Transform hlfir.assign array operations into loop nests performing element +// per element assignments. The inlining is done for trivial data types always, +// though, we may add performance/code-size heuristics in future. 
+//===----------------------------------------------------------------------===// + +#include "flang/Optimizer/Analysis/AliasAnalysis.h" +#include "flang/Optimizer/Builder/FIRBuilder.h" +#include "flang/Optimizer/Builder/HLFIRTools.h" +#include "flang/Optimizer/HLFIR/HLFIROps.h" +#include "flang/Optimizer/HLFIR/Passes.h" +#include "flang/Optimizer/OpenMP/Passes.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +namespace hlfir { +#define GEN_PASS_DEF_INLINEHLFIRASSIGN +#include "flang/Optimizer/HLFIR/Passes.h.inc" +} // namespace hlfir + +#define DEBUG_TYPE "inline-hlfir-assign" + +namespace { +/// Expand hlfir.assign of array RHS to array LHS into a loop nest +/// of element-by-element assignments: +/// hlfir.assign %4 to %5 : !fir.ref>, +/// !fir.ref> +/// into: +/// fir.do_loop %arg1 = %c1 to %c3 step %c1 unordered { +/// fir.do_loop %arg2 = %c1 to %c3 step %c1 unordered { +/// %6 = hlfir.designate %4 (%arg2, %arg1) : +/// (!fir.ref>, index, index) -> !fir.ref +/// %7 = fir.load %6 : !fir.ref +/// %8 = hlfir.designate %5 (%arg2, %arg1) : +/// (!fir.ref>, index, index) -> !fir.ref +/// hlfir.assign %7 to %8 : f32, !fir.ref +/// } +/// } +/// +/// The transformation is correct only when LHS and RHS do not alias. +/// When RHS is an array expression, then there is no aliasing. +/// This transformation does not support runtime checking for +/// non-conforming LHS/RHS arrays' shapes currently. 
+class InlineHLFIRAssignConversion + : public mlir::OpRewritePattern { +public: + using mlir::OpRewritePattern::OpRewritePattern; + + llvm::LogicalResult + matchAndRewrite(hlfir::AssignOp assign, + mlir::PatternRewriter &rewriter) const override { + if (assign.isAllocatableAssignment()) + return rewriter.notifyMatchFailure(assign, + "AssignOp may imply allocation"); + + hlfir::Entity rhs{assign.getRhs()}; + + if (!rhs.isArray()) + return rewriter.notifyMatchFailure(assign, + "AssignOp's RHS is not an array"); + + mlir::Type rhsEleTy = rhs.getFortranElementType(); + if (!fir::isa_trivial(rhsEleTy)) + return rewriter.notifyMatchFailure( + assign, "AssignOp's RHS data type is not trivial"); + + hlfir::Entity lhs{assign.getLhs()}; + if (!lhs.isArray()) + return rewriter.notifyMatchFailure(assign, + "AssignOp's LHS is not an array"); + + mlir::Type lhsEleTy = lhs.getFortranElementType(); + if (!fir::isa_trivial(lhsEleTy)) + return rewriter.notifyMatchFailure( + assign, "AssignOp's LHS data type is not trivial"); + + if (lhsEleTy != rhsEleTy) + return rewriter.notifyMatchFailure(assign, + "RHS/LHS element types mismatch"); + + if (!mlir::isa(rhs.getType())) { + // If RHS is not an hlfir.expr, then we should prove that + // LHS and RHS do not alias. + // TODO: if they may alias, we can insert hlfir.as_expr for RHS, + // and proceed with the inlining. + fir::AliasAnalysis aliasAnalysis; + mlir::AliasResult aliasRes = aliasAnalysis.alias(lhs, rhs); + // TODO: use areIdenticalOrDisjointSlices() from + // OptimizedBufferization.cpp to check if we can still do the expansion. 
+ if (!aliasRes.isNo()) { + LLVM_DEBUG(llvm::dbgs() << "InlineHLFIRAssign:\n" + << "\tLHS: " << lhs << "\n" + << "\tRHS: " << rhs << "\n" + << "\tALIAS: " << aliasRes << "\n"); + return rewriter.notifyMatchFailure(assign, "RHS/LHS may alias"); + } + } + + mlir::Location loc = assign->getLoc(); + fir::FirOpBuilder builder(rewriter, assign.getOperation()); + builder.setInsertionPoint(assign); + rhs = hlfir::derefPointersAndAllocatables(loc, builder, rhs); + lhs = hlfir::derefPointersAndAllocatables(loc, builder, lhs); + mlir::Value shape = hlfir::genShape(loc, builder, lhs); + llvm::SmallVector extents = + hlfir::getIndexExtents(loc, builder, shape); + hlfir::LoopNest loopNest = + hlfir::genLoopNest(loc, builder, extents, /*isUnordered=*/true, + flangomp::shouldUseWorkshareLowering(assign)); + builder.setInsertionPointToStart(loopNest.body); + auto rhsArrayElement = + hlfir::getElementAt(loc, builder, rhs, loopNest.oneBasedIndices); + rhsArrayElement = hlfir::loadTrivialScalar(loc, builder, rhsArrayElement); + auto lhsArrayElement = + hlfir::getElementAt(loc, builder, lhs, loopNest.oneBasedIndices); + builder.create(loc, rhsArrayElement, lhsArrayElement); + rewriter.eraseOp(assign); + return mlir::success(); + } +}; + +class InlineHLFIRAssignPass + : public hlfir::impl::InlineHLFIRAssignBase { +public: + void runOnOperation() override { + mlir::MLIRContext *context = &getContext(); + + mlir::GreedyRewriteConfig config; + // Prevent the pattern driver from merging blocks. 
+ config.enableRegionSimplification = + mlir::GreedySimplifyRegionLevel::Disabled; + + mlir::RewritePatternSet patterns(context); + patterns.insert(context); + + if (mlir::failed(mlir::applyPatternsGreedily( + getOperation(), std::move(patterns), config))) { + mlir::emitError(getOperation()->getLoc(), + "failure in hlfir.assign inlining"); + signalPassFailure(); + } + } +}; +} // namespace diff --git a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp index bfaabed013678..0cfefc2d23ecb 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp @@ -772,108 +772,6 @@ llvm::LogicalResult BroadcastAssignBufferization::matchAndRewrite( return mlir::success(); } -/// Expand hlfir.assign of array RHS to array LHS into a loop nest -/// of element-by-element assignments: -/// hlfir.assign %4 to %5 : !fir.ref>, -/// !fir.ref> -/// into: -/// fir.do_loop %arg1 = %c1 to %c3 step %c1 unordered { -/// fir.do_loop %arg2 = %c1 to %c3 step %c1 unordered { -/// %6 = hlfir.designate %4 (%arg2, %arg1) : -/// (!fir.ref>, index, index) -> !fir.ref -/// %7 = fir.load %6 : !fir.ref -/// %8 = hlfir.designate %5 (%arg2, %arg1) : -/// (!fir.ref>, index, index) -> !fir.ref -/// hlfir.assign %7 to %8 : f32, !fir.ref -/// } -/// } -/// -/// The transformation is correct only when LHS and RHS do not alias. -/// This transformation does not support runtime checking for -/// non-conforming LHS/RHS arrays' shapes currently. 
-class VariableAssignBufferization - : public mlir::OpRewritePattern { -private: -public: - using mlir::OpRewritePattern::OpRewritePattern; - - llvm::LogicalResult - matchAndRewrite(hlfir::AssignOp assign, - mlir::PatternRewriter &rewriter) const override; -}; - -llvm::LogicalResult VariableAssignBufferization::matchAndRewrite( - hlfir::AssignOp assign, mlir::PatternRewriter &rewriter) const { - if (assign.isAllocatableAssignment()) - return rewriter.notifyMatchFailure(assign, "AssignOp may imply allocation"); - - hlfir::Entity rhs{assign.getRhs()}; - - // To avoid conflicts with ElementalAssignBufferization pattern, we avoid - // matching RHS when it is an `ExprType` defined by an `ElementalOp`; which is - // among the main criteria matched by ElementalAssignBufferization. - if (mlir::isa(rhs.getType()) && - mlir::isa(rhs.getDefiningOp())) - return rewriter.notifyMatchFailure( - assign, "RHS is an ExprType defined by ElementalOp"); - - if (!rhs.isArray()) - return rewriter.notifyMatchFailure(assign, - "AssignOp's RHS is not an array"); - - mlir::Type rhsEleTy = rhs.getFortranElementType(); - if (!fir::isa_trivial(rhsEleTy)) - return rewriter.notifyMatchFailure( - assign, "AssignOp's RHS data type is not trivial"); - - hlfir::Entity lhs{assign.getLhs()}; - if (!lhs.isArray()) - return rewriter.notifyMatchFailure(assign, - "AssignOp's LHS is not an array"); - - mlir::Type lhsEleTy = lhs.getFortranElementType(); - if (!fir::isa_trivial(lhsEleTy)) - return rewriter.notifyMatchFailure( - assign, "AssignOp's LHS data type is not trivial"); - - if (lhsEleTy != rhsEleTy) - return rewriter.notifyMatchFailure(assign, - "RHS/LHS element types mismatch"); - - fir::AliasAnalysis aliasAnalysis; - mlir::AliasResult aliasRes = aliasAnalysis.alias(lhs, rhs); - // TODO: use areIdenticalOrDisjointSlices() to check if - // we can still do the expansion. 
- if (!aliasRes.isNo()) { - LLVM_DEBUG(llvm::dbgs() << "VariableAssignBufferization:\n" - << "\tLHS: " << lhs << "\n" - << "\tRHS: " << rhs << "\n" - << "\tALIAS: " << aliasRes << "\n"); - return rewriter.notifyMatchFailure(assign, "RHS/LHS may alias"); - } - - mlir::Location loc = assign->getLoc(); - fir::FirOpBuilder builder(rewriter, assign.getOperation()); - builder.setInsertionPoint(assign); - rhs = hlfir::derefPointersAndAllocatables(loc, builder, rhs); - lhs = hlfir::derefPointersAndAllocatables(loc, builder, lhs); - mlir::Value shape = hlfir::genShape(loc, builder, lhs); - llvm::SmallVector extents = - hlfir::getIndexExtents(loc, builder, shape); - hlfir::LoopNest loopNest = - hlfir::genLoopNest(loc, builder, extents, /*isUnordered=*/true, - flangomp::shouldUseWorkshareLowering(assign)); - builder.setInsertionPointToStart(loopNest.body); - auto rhsArrayElement = - hlfir::getElementAt(loc, builder, rhs, loopNest.oneBasedIndices); - rhsArrayElement = hlfir::loadTrivialScalar(loc, builder, rhsArrayElement); - auto lhsArrayElement = - hlfir::getElementAt(loc, builder, lhs, loopNest.oneBasedIndices); - builder.create(loc, rhsArrayElement, lhsArrayElement); - rewriter.eraseOp(assign); - return mlir::success(); -} - using GenBodyFn = std::function &)>; @@ -1280,9 +1178,9 @@ class ReductionMaskConversion : public mlir::OpRewritePattern { loc, resultArr, builder.createBool(loc, false)); // Check all the users - the destroy is no longer required, and any assign - // can use resultArr directly so that VariableAssignBufferization in this - // pass can optimize the results. Other operations are replaces with an - // AsExpr for the temporary resultArr. + // can use resultArr directly so that InlineHLFIRAssign pass + // can optimize the results. Other operations are replaced with an AsExpr + // for the temporary resultArr. 
llvm::SmallVector destroys; llvm::SmallVector assigns; for (auto user : mloc->getUsers()) { @@ -1430,7 +1328,6 @@ class OptimizedBufferizationPass // This requires small code reordering in ElementalAssignBufferization. patterns.insert(context); patterns.insert(context); - patterns.insert(context); patterns.insert(context); patterns.insert>(context); patterns.insert>(context); diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp index 72803aa3793ce..20e4599587c4b 100644 --- a/flang/lib/Optimizer/Passes/Pipelines.cpp +++ b/flang/lib/Optimizer/Passes/Pipelines.cpp @@ -234,6 +234,8 @@ void createHLFIRToFIRPassPipeline(mlir::PassManager &pm, bool enableOpenMP, pm.addPass(mlir::createCSEPass()); addNestedPassToAllTopLevelOperations( pm, hlfir::createOptimizedBufferization); + addNestedPassToAllTopLevelOperations( + pm, hlfir::createInlineHLFIRAssign); } pm.addPass(hlfir::createLowerHLFIROrderedAssignments()); pm.addPass(hlfir::createLowerHLFIRIntrinsics()); diff --git a/flang/test/Driver/mlir-pass-pipeline.f90 b/flang/test/Driver/mlir-pass-pipeline.f90 index b30affe691b84..9655afce96d92 100644 --- a/flang/test/Driver/mlir-pass-pipeline.f90 +++ b/flang/test/Driver/mlir-pass-pipeline.f90 @@ -36,12 +36,16 @@ ! O2-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private'] ! O2-NEXT: 'fir.global' Pipeline ! O2-NEXT: OptimizedBufferization +! O2-NEXT: InlineHLFIRAssign ! O2-NEXT: 'func.func' Pipeline ! O2-NEXT: OptimizedBufferization +! O2-NEXT: InlineHLFIRAssign ! O2-NEXT: 'omp.declare_reduction' Pipeline ! O2-NEXT: OptimizedBufferization +! O2-NEXT: InlineHLFIRAssign ! O2-NEXT: 'omp.private' Pipeline ! O2-NEXT: OptimizedBufferization +! O2-NEXT: InlineHLFIRAssign ! ALL: LowerHLFIROrderedAssignments ! ALL-NEXT: LowerHLFIRIntrinsics ! 
ALL-NEXT: BufferizeHLFIR diff --git a/flang/test/Fir/basic-program.fir b/flang/test/Fir/basic-program.fir index d2788008c3893..620882ebbed2a 100644 --- a/flang/test/Fir/basic-program.fir +++ b/flang/test/Fir/basic-program.fir @@ -37,12 +37,16 @@ func.func @_QQmain() { // PASSES-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private'] // PASSES-NEXT: 'fir.global' Pipeline // PASSES-NEXT: OptimizedBufferization +// PASSES-NEXT: InlineHLFIRAssign // PASSES-NEXT: 'func.func' Pipeline // PASSES-NEXT: OptimizedBufferization +// PASSES-NEXT: InlineHLFIRAssign // PASSES-NEXT: 'omp.declare_reduction' Pipeline // PASSES-NEXT: OptimizedBufferization +// PASSES-NEXT: InlineHLFIRAssign // PASSES-NEXT: 'omp.private' Pipeline // PASSES-NEXT: OptimizedBufferization +// PASSES-NEXT: InlineHLFIRAssign // PASSES-NEXT: LowerHLFIROrderedAssignments // PASSES-NEXT: LowerHLFIRIntrinsics // PASSES-NEXT: BufferizeHLFIR diff --git a/flang/test/HLFIR/opt-variable-assign.fir b/flang/test/HLFIR/inline-hlfir-assign.fir similarity index 84% rename from flang/test/HLFIR/opt-variable-assign.fir rename to flang/test/HLFIR/inline-hlfir-assign.fir index 17124fa86af65..f834e7971e3d5 100644 --- a/flang/test/HLFIR/opt-variable-assign.fir +++ b/flang/test/HLFIR/inline-hlfir-assign.fir @@ -1,6 +1,5 @@ -// Test optimized bufferization for hlfir.assign of arrays -// variables: -// RUN: fir-opt --opt-bufferization %s | FileCheck %s +// Test inlining of hlfir.assign of arrays: +// RUN: fir-opt --inline-hlfir-assign %s | FileCheck %s // The two assigns come from the following source forms: // y(:,:) = x(:,:) @@ -302,3 +301,55 @@ func.func @_QPtest7(%arg0: !fir.ref>>> {f // CHECK-NOT: hlfir.assign // CHECK: hlfir.assign %{{.*}} to %{{.*}} : f32, !fir.ref // CHECK-NOT: hlfir.assign + + +// Test that VAR = EXPR assignment is inlined: +// subroutine test_expr_rhs(p1, p2) +// logical, pointer :: p1(:), p2(:) +// p1 = (p2) +// end subroutine test_expr_rhs +func.func 
@_QPtest_expr_rhs(%arg0: !fir.ref>>>> {fir.bindc_name = "p1"}, %arg1: !fir.ref>>>> {fir.bindc_name = "p2"}) { + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest_expr_rhsEp1"} : (!fir.ref>>>>, !fir.dscope) -> (!fir.ref>>>>, !fir.ref>>>>) + %2:2 = hlfir.declare %arg1 dummy_scope %0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest_expr_rhsEp2"} : (!fir.ref>>>>, !fir.dscope) -> (!fir.ref>>>>, !fir.ref>>>>) + %3 = fir.load %2#0 : !fir.ref>>>> + %4:3 = fir.box_dims %3, %c0 : (!fir.box>>>, index) -> (index, index, index) + %5 = fir.shape %4#1 : (index) -> !fir.shape<1> + %6 = hlfir.elemental %5 unordered : (!fir.shape<1>) -> !hlfir.expr> { + ^bb0(%arg2: index): + %8 = arith.subi %4#0, %c1 : index + %9 = arith.addi %arg2, %8 : index + %10 = hlfir.designate %3 (%9) : (!fir.box>>>, index) -> !fir.ref> + %11 = fir.load %10 : !fir.ref> + %12 = hlfir.no_reassoc %11 : !fir.logical<4> + hlfir.yield_element %12 : !fir.logical<4> + } + %7 = fir.load %1#0 : !fir.ref>>>> + hlfir.assign %6 to %7 : !hlfir.expr>, !fir.box>>> + hlfir.destroy %6 : !hlfir.expr> + return +} +// CHECK-LABEL: func.func @_QPtest_expr_rhs( +// CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>>>> {fir.bindc_name = "p1"}, +// CHECK-SAME: %[[VAL_1:.*]]: !fir.ref>>>> {fir.bindc_name = "p2"}) { +// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_4:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_4]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest_expr_rhsEp1"} : (!fir.ref>>>>, !fir.dscope) -> (!fir.ref>>>>, !fir.ref>>>>) +// CHECK: %[[VAL_10:.*]] = hlfir.elemental %{{.*}} unordered : (!fir.shape<1>) -> !hlfir.expr> { +// CHECK: } +// CHECK: %[[VAL_17:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref>>>> +// CHECK: %[[VAL_18:.*]]:3 = fir.box_dims 
%[[VAL_17]], %[[VAL_3]] : (!fir.box>>>, index) -> (index, index, index) +// CHECK: fir.do_loop %[[VAL_19:.*]] = %[[VAL_2]] to %[[VAL_18]]#1 step %[[VAL_2]] unordered { +// CHECK: %[[VAL_20:.*]] = hlfir.apply %[[VAL_10]], %[[VAL_19]] : (!hlfir.expr>, index) -> !fir.logical<4> +// CHECK: %[[VAL_21:.*]]:3 = fir.box_dims %[[VAL_17]], %[[VAL_3]] : (!fir.box>>>, index) -> (index, index, index) +// CHECK: %[[VAL_22:.*]] = arith.subi %[[VAL_21]]#0, %[[VAL_2]] : index +// CHECK: %[[VAL_23:.*]] = arith.addi %[[VAL_19]], %[[VAL_22]] : index +// CHECK: %[[VAL_24:.*]] = hlfir.designate %[[VAL_17]] (%[[VAL_23]]) : (!fir.box>>>, index) -> !fir.ref> +// CHECK: hlfir.assign %[[VAL_20]] to %[[VAL_24]] : !fir.logical<4>, !fir.ref> +// CHECK: } +// CHECK: hlfir.destroy %[[VAL_10]] : !hlfir.expr> +// CHECK: return +// CHECK: } diff --git a/flang/test/HLFIR/maxloc-elemental.fir b/flang/test/HLFIR/maxloc-elemental.fir index 497a58c9bd1d4..c9210a59f0340 100644 --- a/flang/test/HLFIR/maxloc-elemental.fir +++ b/flang/test/HLFIR/maxloc-elemental.fir @@ -68,13 +68,7 @@ func.func @_QPtest(%arg0: !fir.box> {fir.bindc_name = "array"} // CHECK-NEXT: } // CHECK-NEXT: fir.result %[[V18]] : i32 // CHECK-NEXT: } -// CHECK-NEXT: %[[BD:.*]]:3 = fir.box_dims %[[V2]]#0, %c0 : (!fir.box>, index) -> (index, index, index) -// CHECK-NEXT: fir.do_loop %arg3 = %c1 to %[[BD]]#1 step %c1 unordered { -// CHECK-NEXT: %[[V13:.*]] = hlfir.designate %[[RES]] (%arg3) : (!fir.ref>, index) -> !fir.ref -// CHECK-NEXT: %[[V14:.*]] = fir.load %[[V13]] : !fir.ref -// CHECK-NEXT: %[[V15:.*]] = hlfir.designate %[[V2]]#0 (%arg3) : (!fir.box>, index) -> !fir.ref -// CHECK-NEXT: hlfir.assign %[[V14]] to %[[V15]] : i32, !fir.ref -// CHECK-NEXT: } +// CHECK-NEXT: hlfir.assign %[[RES]] to %[[V2]]#0 : !fir.ref>, !fir.box> // CHECK-NEXT: return // CHECK-NEXT: } diff --git a/flang/test/HLFIR/minloc-elemental.fir b/flang/test/HLFIR/minloc-elemental.fir index 5fa482a7b904e..9453a335b4fbf 100644 --- 
a/flang/test/HLFIR/minloc-elemental.fir +++ b/flang/test/HLFIR/minloc-elemental.fir @@ -68,13 +68,7 @@ func.func @_QPtest(%arg0: !fir.box> {fir.bindc_name = "array"} // CHECK-NEXT: } // CHECK-NEXT: fir.result %[[V18]] : i32 // CHECK-NEXT: } -// CHECK-NEXT: %[[BD:.*]]:3 = fir.box_dims %[[V2]]#0, %c0 : (!fir.box>, index) -> (index, index, index) -// CHECK-NEXT: fir.do_loop %arg3 = %c1 to %[[BD]]#1 step %c1 unordered { -// CHECK-NEXT: %[[V13:.*]] = hlfir.designate %[[RES]] (%arg3) : (!fir.ref>, index) -> !fir.ref -// CHECK-NEXT: %[[V14:.*]] = fir.load %[[V13]] : !fir.ref -// CHECK-NEXT: %[[V15:.*]] = hlfir.designate %[[V2]]#0 (%arg3) : (!fir.box>, index) -> !fir.ref -// CHECK-NEXT: hlfir.assign %[[V14]] to %[[V15]] : i32, !fir.ref -// CHECK-NEXT: } +// CHECK-NEXT: hlfir.assign %[[RES]] to %[[V2]]#0 : !fir.ref>, !fir.box> // CHECK-NEXT: return // CHECK-NEXT: } @@ -147,13 +141,7 @@ func.func @_QPtest_kind2(%arg0: !fir.box> {fir.bindc_name = "a // CHECK-NEXT: } // CHECK-NEXT: fir.result %[[V18]] : i32 // CHECK-NEXT: } -// CHECK-NEXT: %[[BD:.*]]:3 = fir.box_dims %[[V2]]#0, %c0 : (!fir.box>, index) -> (index, index, index) -// CHECK-NEXT: fir.do_loop %arg3 = %c1 to %[[BD]]#1 step %c1 unordered { -// CHECK-NEXT: %[[V13:.*]] = hlfir.designate %[[RES]] (%arg3) : (!fir.ref>, index) -> !fir.ref -// CHECK-NEXT: %[[V14:.*]] = fir.load %[[V13]] : !fir.ref -// CHECK-NEXT: %[[V15:.*]] = hlfir.designate %[[V2]]#0 (%arg3) : (!fir.box>, index) -> !fir.ref -// CHECK-NEXT: hlfir.assign %[[V14]] to %[[V15]] : i16, !fir.ref -// CHECK-NEXT: } +// CHECK-NEXT: hlfir.assign %[[RES]] to %[[V2]]#0 : !fir.ref>, !fir.box> // CHECK-NEXT: return diff --git a/flang/test/HLFIR/opt-bufferization-eval_in_mem.fir b/flang/test/HLFIR/opt-bufferization-eval_in_mem.fir index 984c0bcbaddcc..ce669073dbb1b 100644 --- a/flang/test/HLFIR/opt-bufferization-eval_in_mem.fir +++ b/flang/test/HLFIR/opt-bufferization-eval_in_mem.fir @@ -48,7 +48,6 @@ func.func @_QPnegative_test_is_target(%arg0: !fir.ref> {fir.b } // 
CHECK-LABEL: func.func @_QPnegative_test_is_target( // CHECK-SAME: %[[VAL_0:.*]]: !fir.ref> {fir.bindc_name = "x", fir.target}) { -// CHECK: %[[VAL_1:.*]] = arith.constant 1 : index // CHECK: %[[VAL_2:.*]] = arith.constant false // CHECK: %[[VAL_3:.*]] = arith.constant 10 : index // CHECK: %[[VAL_4:.*]] = fir.alloca !fir.array<10xf32> @@ -57,11 +56,7 @@ func.func @_QPnegative_test_is_target(%arg0: !fir.ref> {fir.b // CHECK: %[[VAL_9:.*]] = fir.call @_QPfoo() fastmath : () -> !fir.array<10xf32> // CHECK: fir.save_result %[[VAL_9]] to %[[VAL_8]]#1{{.*}} // CHECK: %[[VAL_10:.*]] = hlfir.as_expr %[[VAL_8]]#0 move %[[VAL_2]] : (!fir.ref>, i1) -> !hlfir.expr<10xf32> -// CHECK: fir.do_loop %[[VAL_11:.*]] = %[[VAL_1]] to %[[VAL_3]] step %[[VAL_1]] unordered { -// CHECK: %[[VAL_12:.*]] = hlfir.apply %[[VAL_10]], %[[VAL_11]] : (!hlfir.expr<10xf32>, index) -> f32 -// CHECK: %[[VAL_13:.*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_11]]) : (!fir.ref>, index) -> !fir.ref -// CHECK: hlfir.assign %[[VAL_12]] to %[[VAL_13]] : f32, !fir.ref -// CHECK: } +// CHECK: hlfir.assign %[[VAL_10]] to %[[VAL_7]]#0 : !hlfir.expr<10xf32>, !fir.ref> // CHECK: hlfir.destroy %[[VAL_10]] : !hlfir.expr<10xf32> // CHECK: return // CHECK: } diff --git a/flang/test/HLFIR/opt-bufferization.fir b/flang/test/HLFIR/opt-bufferization.fir index 87afb3cc92453..faa8f4bcdb778 100644 --- a/flang/test/HLFIR/opt-bufferization.fir +++ b/flang/test/HLFIR/opt-bufferization.fir @@ -796,45 +796,3 @@ func.func @_QPddx(%arg0: !fir.box> {fir.bindc_name = "array" // CHECK: %[[VAL_61:.*]] = fir.load %[[VAL_26]]#1 : !fir.ref> // CHECK: return %[[VAL_61]] : !fir.array // CHECK: } - -// `hlfir.expr` bufferization (when the expresion is not the result of -// `hlfir.elemental`) -func.func @_QPfoo() { - %c1 = arith.constant 1 : index - %0 = fir.alloca !fir.array<1xi32> {bindc_name = "iavs", uniq_name = "_QFfooEiavs"} - %1 = fir.shape %c1 : (index) -> !fir.shape<1> - %2:2 = hlfir.declare %0(%1) {uniq_name = "_QFfooEiavs"} : 
(!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) - %3 = fir.alloca i32 {bindc_name = "iv", uniq_name = "_QFfooEiv"} - %4:2 = hlfir.declare %3 {uniq_name = "_QFfooEiv"} : (!fir.ref) -> (!fir.ref, !fir.ref) - %c10_i32 = arith.constant 10 : i32 - %6 = fir.convert %c10_i32 : (i32) -> index - %7 = fir.convert %c1 : (index) -> i32 - %8:2 = fir.do_loop %arg0 = %c1 to %6 step %c1 iter_args(%arg1 = %7) -> (index, i32) { - fir.store %arg1 to %4#1 : !fir.ref - %9 = fir.allocmem !fir.array<1xi32> {bindc_name = ".tmp.arrayctor", uniq_name = ""} - %10 = fir.shape %c1 : (index) -> !fir.shape<1> - %11:2 = hlfir.declare %9(%10) {uniq_name = ".tmp.arrayctor"} : (!fir.heap>, !fir.shape<1>) -> (!fir.heap>, !fir.heap>) - %12 = fir.load %4#0 : !fir.ref - %13 = hlfir.designate %11#0 (%c1) : (!fir.heap>, index) -> !fir.ref - hlfir.assign %12 to %13 : i32, !fir.ref - %true = arith.constant true - %14 = hlfir.as_expr %11#0 move %true : (!fir.heap>, i1) -> !hlfir.expr<1xi32> - hlfir.assign %14 to %2#0 : !hlfir.expr<1xi32>, !fir.ref> - hlfir.destroy %14 : !hlfir.expr<1xi32> - %15 = arith.addi %arg0, %c1 : index - %16 = fir.convert %c1 : (index) -> i32 - %17 = fir.load %4#1 : !fir.ref - %18 = arith.addi %17, %16 : i32 - fir.result %15, %18 : index, i32 - } - fir.store %8#1 to %4#1 : !fir.ref - return -} - -// CHECK-LABEL: func.func @_QPfoo -// CHECK: %[[C1:.*]] = arith.constant 1 : index -// CHECK: fir.do_loop {{.*}} { -// CHECK-NOT: hlfir.assign %{{.*}} to %{{.*}}#0 : !hlfir.expr<1xi32>, !fir.ref> -// CHECK: fir.do_loop %{{.*}} = %[[C1]] to %[[C1]] step %[[C1]] unordered { -// CHECK: } -// CHECK: } From 322f16e6246ada7cd53e71e927ee68273e819f78 Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Fri, 3 Jan 2025 11:43:07 -0500 Subject: [PATCH 367/567] [AMDGPU][True16][MC] true16 for v_sat_pk_u8_i16 (#120634) Support true16 format for v_sat_pk_u8_i16 in MC --- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 2 +- llvm/test/MC/AMDGPU/gfx11_asm_vop1.s | 69 +++++++++++-------- 
llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s | 65 +++++++++-------- llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s | 21 ++++-- llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s | 18 +++++ .../MC/AMDGPU/gfx11_asm_vop1_t16_promote.s | 21 ++++-- .../AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s | 59 ++++++++-------- .../MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s | 15 ++-- .../test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s | 63 +++++++++-------- llvm/test/MC/AMDGPU/gfx12_asm_vop1.s | 51 ++++++++++---- llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s | 6 ++ llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s | 6 ++ llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s | 18 +++++ .../MC/AMDGPU/gfx12_asm_vop1_t16_promote.s | 9 +++ .../test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s | 3 + .../AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s | 3 + .../MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s | 3 + .../Disassembler/AMDGPU/gfx11_dasm_vop1.txt | 55 +++++++++++---- .../AMDGPU/gfx11_dasm_vop1_dpp16.txt | 54 +++++++++++---- .../AMDGPU/gfx11_dasm_vop1_dpp8.txt | 17 ++++- .../gfx11_dasm_vop3_dpp16_from_vop1.txt | 46 +++++++++---- .../AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt | 10 ++- .../AMDGPU/gfx11_dasm_vop3_from_vop1.txt | 49 +++++++++---- .../AMDGPU/gfx12_dasm_vop1_dpp16.txt | 50 ++++++++++---- .../AMDGPU/gfx12_dasm_vop1_dpp8.txt | 13 +++- .../AMDGPU/gfx12_dasm_vop3_from_vop1.txt | 49 +++++++++---- .../gfx12_dasm_vop3_from_vop1_dpp16.txt | 46 +++++++++---- .../AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt | 10 ++- 28 files changed, 572 insertions(+), 259 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 1dd39be9e8d9c..bbb456ab739ab 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -1047,7 +1047,7 @@ defm V_RNDNE_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05e, "v_rndne_f1 defm V_FRACT_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05f, "v_fract_f16">; defm V_SIN_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x060, 
"v_sin_f16">; defm V_COS_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x061, "v_cos_f16">; -defm V_SAT_PK_U8_I16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x062, "v_sat_pk_u8_i16">; +defm V_SAT_PK_U8_I16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x062, "v_sat_pk_u8_i16">; defm V_CVT_NORM_I16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x063, "v_cvt_norm_i16_f16">; defm V_CVT_NORM_U16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x064, "v_cvt_norm_u16_f16">; diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s index 4e4dc6647daeb..4448720e6f79f 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s @@ -3236,50 +3236,59 @@ v_rsq_f64 v[5:6], src_scc v_rsq_f64 v[254:255], 0xaf123456 // GFX11: v_rsq_f64_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x62,0xfc,0x7f,0x56,0x34,0x12,0xaf] -v_sat_pk_u8_i16 v5, v1 -// GFX11: v_sat_pk_u8_i16_e32 v5, v1 ; encoding: [0x01,0xc5,0x0a,0x7e] +v_sat_pk_u8_i16 v5.l, v1 +// GFX11: v_sat_pk_u8_i16_e32 v5.l, v1 ; encoding: [0x01,0xc5,0x0a,0x7e] -v_sat_pk_u8_i16 v5, v255 -// GFX11: v_sat_pk_u8_i16_e32 v5, v255 ; encoding: [0xff,0xc5,0x0a,0x7e] +v_sat_pk_u8_i16 v5.l, v255 +// GFX11: v_sat_pk_u8_i16_e32 v5.l, v255 ; encoding: [0xff,0xc5,0x0a,0x7e] -v_sat_pk_u8_i16 v5, s1 -// GFX11: v_sat_pk_u8_i16_e32 v5, s1 ; encoding: [0x01,0xc4,0x0a,0x7e] +v_sat_pk_u8_i16 v5.l, s1 +// GFX11: v_sat_pk_u8_i16_e32 v5.l, s1 ; encoding: [0x01,0xc4,0x0a,0x7e] -v_sat_pk_u8_i16 v5, s105 -// GFX11: v_sat_pk_u8_i16_e32 v5, s105 ; encoding: [0x69,0xc4,0x0a,0x7e] +v_sat_pk_u8_i16 v5.l, s105 +// GFX11: v_sat_pk_u8_i16_e32 v5.l, s105 ; encoding: [0x69,0xc4,0x0a,0x7e] -v_sat_pk_u8_i16 v5, vcc_lo -// GFX11: v_sat_pk_u8_i16_e32 v5, vcc_lo ; encoding: [0x6a,0xc4,0x0a,0x7e] +v_sat_pk_u8_i16 v5.l, vcc_lo +// GFX11: v_sat_pk_u8_i16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xc4,0x0a,0x7e] -v_sat_pk_u8_i16 v5, vcc_hi -// GFX11: v_sat_pk_u8_i16_e32 v5, vcc_hi ; encoding: [0x6b,0xc4,0x0a,0x7e] 
+v_sat_pk_u8_i16 v5.l, vcc_hi +// GFX11: v_sat_pk_u8_i16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xc4,0x0a,0x7e] -v_sat_pk_u8_i16 v5, ttmp15 -// GFX11: v_sat_pk_u8_i16_e32 v5, ttmp15 ; encoding: [0x7b,0xc4,0x0a,0x7e] +v_sat_pk_u8_i16 v5.l, ttmp15 +// GFX11: v_sat_pk_u8_i16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xc4,0x0a,0x7e] -v_sat_pk_u8_i16 v5, m0 -// GFX11: v_sat_pk_u8_i16_e32 v5, m0 ; encoding: [0x7d,0xc4,0x0a,0x7e] +v_sat_pk_u8_i16 v5.l, m0 +// GFX11: v_sat_pk_u8_i16_e32 v5.l, m0 ; encoding: [0x7d,0xc4,0x0a,0x7e] -v_sat_pk_u8_i16 v5, exec_lo -// GFX11: v_sat_pk_u8_i16_e32 v5, exec_lo ; encoding: [0x7e,0xc4,0x0a,0x7e] +v_sat_pk_u8_i16 v5.l, exec_lo +// GFX11: v_sat_pk_u8_i16_e32 v5.l, exec_lo ; encoding: [0x7e,0xc4,0x0a,0x7e] -v_sat_pk_u8_i16 v5, exec_hi -// GFX11: v_sat_pk_u8_i16_e32 v5, exec_hi ; encoding: [0x7f,0xc4,0x0a,0x7e] +v_sat_pk_u8_i16 v5.l, exec_hi +// GFX11: v_sat_pk_u8_i16_e32 v5.l, exec_hi ; encoding: [0x7f,0xc4,0x0a,0x7e] -v_sat_pk_u8_i16 v5, null -// GFX11: v_sat_pk_u8_i16_e32 v5, null ; encoding: [0x7c,0xc4,0x0a,0x7e] +v_sat_pk_u8_i16 v5.l, null +// GFX11: v_sat_pk_u8_i16_e32 v5.l, null ; encoding: [0x7c,0xc4,0x0a,0x7e] -v_sat_pk_u8_i16 v5, -1 -// GFX11: v_sat_pk_u8_i16_e32 v5, -1 ; encoding: [0xc1,0xc4,0x0a,0x7e] +v_sat_pk_u8_i16 v5.l, -1 +// GFX11: v_sat_pk_u8_i16_e32 v5.l, -1 ; encoding: [0xc1,0xc4,0x0a,0x7e] -v_sat_pk_u8_i16 v5, 0.5 -// GFX11: v_sat_pk_u8_i16_e32 v5, 0.5 ; encoding: [0xf0,0xc4,0x0a,0x7e] +v_sat_pk_u8_i16 v5.l, 0.5 +// GFX11: v_sat_pk_u8_i16_e32 v5.l, 0.5 ; encoding: [0xf0,0xc4,0x0a,0x7e] -v_sat_pk_u8_i16 v5, src_scc -// GFX11: v_sat_pk_u8_i16_e32 v5, src_scc ; encoding: [0xfd,0xc4,0x0a,0x7e] +v_sat_pk_u8_i16 v5.l, src_scc +// GFX11: v_sat_pk_u8_i16_e32 v5.l, src_scc ; encoding: [0xfd,0xc4,0x0a,0x7e] -v_sat_pk_u8_i16 v127, 0xfe0b -// GFX11: v_sat_pk_u8_i16_e32 v127, 0xfe0b ; encoding: [0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +v_sat_pk_u8_i16 v127.l, 0xfe0b +// GFX11: v_sat_pk_u8_i16_e32 v127.l, 0xfe0b ; encoding: 
[0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +v_sat_pk_u8_i16 v127.l, 0.5 +// GFX11: v_sat_pk_u8_i16_e32 v127.l, 0.5 ; encoding: [0xf0,0xc4,0xfe,0x7e] + +v_sat_pk_u8_i16 v5.h, src_scc +// GFX11: v_sat_pk_u8_i16_e32 v5.h, src_scc ; encoding: [0xfd,0xc4,0x0a,0x7f] + +v_sat_pk_u8_i16 v127.h, 0xfe0b +// GFX11: v_sat_pk_u8_i16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xc4,0xfe,0x7f,0x0b,0xfe,0x00,0x00] v_sin_f16 v5, v1 // GFX11: v_sin_f16_e32 v5, v1 ; encoding: [0x01,0xc1,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s index 98e4b29b25666..da2a3615360a4 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s @@ -2522,47 +2522,56 @@ v_rsq_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_rsq_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_rsq_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x5c,0xfe,0x7f,0xff,0x6f,0x35,0x30] -v_sat_pk_u8_i16 v5, v1 quad_perm:[3,2,1,0] -// GFX11: v_sat_pk_u8_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff] +v_sat_pk_u8_i16 v5.l, v1 quad_perm:[3,2,1,0] +// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff] -v_sat_pk_u8_i16 v5, v1 quad_perm:[0,1,2,3] -// GFX11: v_sat_pk_u8_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff] +v_sat_pk_u8_i16 v5.l, v1 quad_perm:[0,1,2,3] +// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff] -v_sat_pk_u8_i16 v5, v1 row_mirror -// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff] +v_sat_pk_u8_i16 v5.l, v1 row_mirror +// GFX11: 
v_sat_pk_u8_i16_dpp v5.l, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff] -v_sat_pk_u8_i16 v5, v1 row_half_mirror -// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff] +v_sat_pk_u8_i16 v5.l, v1 row_half_mirror +// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff] -v_sat_pk_u8_i16 v5, v1 row_shl:1 -// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff] +v_sat_pk_u8_i16 v5.l, v1 row_shl:1 +// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff] -v_sat_pk_u8_i16 v5, v1 row_shl:15 -// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff] +v_sat_pk_u8_i16 v5.l, v1 row_shl:15 +// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff] -v_sat_pk_u8_i16 v5, v1 row_shr:1 -// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff] +v_sat_pk_u8_i16 v5.l, v1 row_shr:1 +// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff] -v_sat_pk_u8_i16 v5, v1 row_shr:15 -// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff] +v_sat_pk_u8_i16 v5.l, v1 row_shr:15 +// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff] -v_sat_pk_u8_i16 v5, v1 row_ror:1 -// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff] +v_sat_pk_u8_i16 v5.l, v1 
row_ror:1 +// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff] -v_sat_pk_u8_i16 v5, v1 row_ror:15 -// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff] +v_sat_pk_u8_i16 v5.l, v1 row_ror:15 +// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff] -v_sat_pk_u8_i16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff] +v_sat_pk_u8_i16 v5.l, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff] -v_sat_pk_u8_i16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01] +v_sat_pk_u8_i16 v5.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01] -v_sat_pk_u8_i16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x09,0x13] +v_sat_pk_u8_i16 v5.l, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x09,0x13] -v_sat_pk_u8_i16 v127, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_sat_pk_u8_i16_dpp v127, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x05,0x30] +v_sat_pk_u8_i16 v127.l, v255 row_xmask:15 row_mask:0x3 
bank_mask:0x0 fi:1 +// GFX11: v_sat_pk_u8_i16_dpp v127.l, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x05,0x30] + +v_sat_pk_u8_i16 v127.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_sat_pk_u8_i16_dpp v127.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0x01,0x5f,0x01,0x01] + +v_sat_pk_u8_i16 v5.h, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_sat_pk_u8_i16_dpp v5.h, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc4,0x0a,0x7f,0x01,0x60,0x09,0x13] + +v_sat_pk_u8_i16 v127.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_sat_pk_u8_i16_dpp v127.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7f,0xff,0x6f,0x05,0x30] v_sin_f16 v5, v1 quad_perm:[3,2,1,0] // GFX11: v_sin_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s index ab4606af2bb35..34cb2d097b7a7 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s @@ -587,14 +587,23 @@ v_rsq_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_rsq_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_rsq_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x5c,0xfe,0x7f,0xff,0x00,0x00,0x00] -v_sat_pk_u8_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_sat_pk_u8_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_sat_pk_u8_i16 v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_sat_pk_u8_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_sat_pk_u8_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_sat_pk_u8_i16 v5.l, v1 
dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_sat_pk_u8_i16 v127, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_sat_pk_u8_i16_dpp v127, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00] +v_sat_pk_u8_i16 v127.l, v255 dpp8:[0,0,0,0,0,0,0,0] +// GFX11: v_sat_pk_u8_i16_dpp v127.l, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00] + +v_sat_pk_u8_i16 v127.l, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_sat_pk_u8_i16_dpp v127.l, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0xfe,0x7e,0x01,0x77,0x39,0x05] + +v_sat_pk_u8_i16 v5.h, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_sat_pk_u8_i16_dpp v5.h, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc4,0x0a,0x7f,0x01,0x77,0x39,0x05] + +v_sat_pk_u8_i16 v127.h, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_sat_pk_u8_i16_dpp v127.h, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc4,0xfe,0x7f,0xff,0x00,0x00,0x00] v_sin_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s index 4ae91340386b6..9c5693de3d8b1 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s @@ -716,6 +716,24 @@ v_sat_pk_u8_i16_e32 v199, v5 dpp8:[7,6,5,4,3,2,1,0] v_sat_pk_u8_i16_e32 v199, v5 quad_perm:[3,2,1,0] // GFX11: :[[@LINE-1]]:30: error: invalid operand for instruction +v_sat_pk_u8_i16_e32 v199.h, v5.h +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_sat_pk_u8_i16_e32 v199.h, v5.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_sat_pk_u8_i16_e32 v199.h, v5.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_sat_pk_u8_i16_e32 v199.l, 
v5.l +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_sat_pk_u8_i16_e32 v199.l, v5.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_sat_pk_u8_i16_e32 v199.l, v5.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + v_sin_f16_e32 v128, 0xfe0b // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s index 1d441720280ca..fa6ab407f87c7 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s @@ -1802,14 +1802,23 @@ v_rsq_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_rsq_f16 v5, v199 quad_perm:[3,2,1,0] // GFX11: v_rsq_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] -v_sat_pk_u8_i16 v199, v5 -// GFX11: v_sat_pk_u8_i16_e64 v199, v5 ; encoding: [0xc7,0x00,0xe2,0xd5,0x05,0x01,0x00,0x00] +v_sat_pk_u8_i16 v199.h, v5 +// GFX11: v_sat_pk_u8_i16_e64 v199.h, v5 op_sel:[0,1] ; encoding: [0xc7,0x40,0xe2,0xd5,0x05,0x01,0x00,0x00] -v_sat_pk_u8_i16 v199, v5 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_sat_pk_u8_i16_e64_dpp v199, v5 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xc7,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x05,0x77,0x39,0x05] +v_sat_pk_u8_i16 v199.h, v5 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_sat_pk_u8_i16_e64_dpp v199.h, v5 op_sel:[0,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xc7,0x40,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x05,0x77,0x39,0x05] -v_sat_pk_u8_i16 v199, v5 quad_perm:[3,2,1,0] -// GFX11: v_sat_pk_u8_i16_e64_dpp v199, v5 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xc7,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x05,0x1b,0x00,0xff] +v_sat_pk_u8_i16 v199.h, v5 quad_perm:[3,2,1,0] +// GFX11: v_sat_pk_u8_i16_e64_dpp v199.h, v5 op_sel:[0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0xc7,0x40,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x05,0x1b,0x00,0xff] + +v_sat_pk_u8_i16 v199.l, v5 +// GFX11: v_sat_pk_u8_i16_e64 v199.l, v5 ; encoding: [0xc7,0x00,0xe2,0xd5,0x05,0x01,0x00,0x00] + +v_sat_pk_u8_i16 v199.l, v5 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_sat_pk_u8_i16_e64_dpp v199.l, v5 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xc7,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x05,0x77,0x39,0x05] + +v_sat_pk_u8_i16 v199.l, v5 quad_perm:[3,2,1,0] +// GFX11: v_sat_pk_u8_i16_e64_dpp v199.l, v5 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xc7,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x05,0x1b,0x00,0xff] v_sin_f16 v128, 0xfe0b // GFX11: v_sin_f16_e64 v128, 0xfe0b ; encoding: [0x80,0x00,0xe0,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s index f38ff6a2fdd7d..1bd1a5c5695bc 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s @@ -2644,47 +2644,50 @@ v_rsq_f32_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl v_rsq_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_rsq_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xae,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_sat_pk_u8_i16_e64_dpp v5.l, v1 quad_perm:[3,2,1,0] +// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] -// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_sat_pk_u8_i16_e64_dpp v5.l, v1 quad_perm:[0,1,2,3] +// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] -v_sat_pk_u8_i16_e64_dpp v5, v1 row_mirror -// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_mirror +// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] -v_sat_pk_u8_i16_e64_dpp v5, v1 row_half_mirror -// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_half_mirror +// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] -v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:1 -// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shl:1 +// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] -v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:15 -// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shl:15 +// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] -v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:1 -// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shr:1 +// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] -v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:15 -// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shr:15 +// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] -v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:1 -// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_ror:1 +// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] -v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:15 -// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_ror:15 +// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] -v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] -v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 
bank_mask:0x1 -// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] +v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] -v_sat_pk_u8_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13] +v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13] -v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] +v_sat_pk_u8_i16_e64_dpp v255.l, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_sat_pk_u8_i16_e64_dpp v255.l, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] + +v_sat_pk_u8_i16_e64_dpp v255.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: [0xff,0x40,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX11: v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s index 95407886ccba1..65af1c1829902 100644 --- 
a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s @@ -760,14 +760,17 @@ v_rsq_f32_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_rsq_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_rsq_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xae,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -v_sat_pk_u8_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_sat_pk_u8_i16_e64_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_sat_pk_u8_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_sat_pk_u8_i16_e64_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64_dpp v255.l, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_sat_pk_u8_i16_e64_dpp v255.l, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] + +v_sat_pk_u8_i16_e64_dpp v255.h, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: [0xff,0x40,0xe2,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git 
a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s index 3850f0254a7f1..1108887c26ed4 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s @@ -3190,50 +3190,53 @@ v_rsq_f64_e64 v[5:6], -|src_scc| mul:4 v_rsq_f64_e64 v[254:255], 0xaf123456 clamp div:2 // GFX11: v_rsq_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xb1,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] -v_sat_pk_u8_i16_e64 v5, v1 -// GFX11: v_sat_pk_u8_i16_e64 v5, v1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, v1 +// GFX11: v_sat_pk_u8_i16_e64 v5.l, v1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00] -v_sat_pk_u8_i16_e64 v5, v255 -// GFX11: v_sat_pk_u8_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, v255 +// GFX11: v_sat_pk_u8_i16_e64 v5.l, v255 ; encoding: [0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00] -v_sat_pk_u8_i16_e64 v5, s1 -// GFX11: v_sat_pk_u8_i16_e64 v5, s1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, s1 +// GFX11: v_sat_pk_u8_i16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00] -v_sat_pk_u8_i16_e64 v5, s105 -// GFX11: v_sat_pk_u8_i16_e64 v5, s105 ; encoding: [0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, s105 +// GFX11: v_sat_pk_u8_i16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00] -v_sat_pk_u8_i16_e64 v5, vcc_lo -// GFX11: v_sat_pk_u8_i16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, vcc_lo +// GFX11: v_sat_pk_u8_i16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00] -v_sat_pk_u8_i16_e64 v5, vcc_hi -// GFX11: v_sat_pk_u8_i16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, vcc_hi +// GFX11: v_sat_pk_u8_i16_e64 v5.l, vcc_hi ; encoding: 
[0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00] -v_sat_pk_u8_i16_e64 v5, ttmp15 -// GFX11: v_sat_pk_u8_i16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, ttmp15 +// GFX11: v_sat_pk_u8_i16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00] -v_sat_pk_u8_i16_e64 v5, m0 -// GFX11: v_sat_pk_u8_i16_e64 v5, m0 ; encoding: [0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, m0 +// GFX11: v_sat_pk_u8_i16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00] -v_sat_pk_u8_i16_e64 v5, exec_lo -// GFX11: v_sat_pk_u8_i16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, exec_lo +// GFX11: v_sat_pk_u8_i16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00] -v_sat_pk_u8_i16_e64 v5, exec_hi -// GFX11: v_sat_pk_u8_i16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, exec_hi +// GFX11: v_sat_pk_u8_i16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00] -v_sat_pk_u8_i16_e64 v5, null -// GFX11: v_sat_pk_u8_i16_e64 v5, null ; encoding: [0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, null +// GFX11: v_sat_pk_u8_i16_e64 v5.l, null ; encoding: [0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00] -v_sat_pk_u8_i16_e64 v5, -1 -// GFX11: v_sat_pk_u8_i16_e64 v5, -1 ; encoding: [0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, -1 +// GFX11: v_sat_pk_u8_i16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00] -v_sat_pk_u8_i16_e64 v5, 0.5 -// GFX11: v_sat_pk_u8_i16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, 0.5 +// GFX11: v_sat_pk_u8_i16_e64 v5.l, 0.5 ; encoding: [0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00] -v_sat_pk_u8_i16_e64 v5, src_scc -// GFX11: v_sat_pk_u8_i16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, src_scc +// GFX11: 
v_sat_pk_u8_i16_e64 v5.l, src_scc ; encoding: [0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00] -v_sat_pk_u8_i16_e64 v255, 0xfe0b -// GFX11: v_sat_pk_u8_i16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_sat_pk_u8_i16_e64 v255.l, 0xfe0b +// GFX11: v_sat_pk_u8_i16_e64 v255.l, 0xfe0b ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] + +v_sat_pk_u8_i16_e64 v255.h, 0xfe0b +// GFX11: [0xff,0x40,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] v_sin_f16_e64 v5, v1 // GFX11: v_sin_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s index edd3b916f4e5f..086356fbca25a 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s @@ -3301,49 +3301,70 @@ v_rsq_f64 v[254:255], 0xaf123456 // GFX12: v_rsq_f64_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x62,0xfc,0x7f,0x56,0x34,0x12,0xaf] v_sat_pk_u8_i16 v5, v1 -// GFX12: v_sat_pk_u8_i16_e32 v5, v1 ; encoding: [0x01,0xc5,0x0a,0x7e] +// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, v1 ; encoding: [0x01,0xc5,0x0a,0x7e] +// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, v1 ; encoding: [0x01,0xc5,0x0a,0x7e] v_sat_pk_u8_i16 v5, v255 -// GFX12: v_sat_pk_u8_i16_e32 v5, v255 ; encoding: [0xff,0xc5,0x0a,0x7e] +// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, v255 ; encoding: [0xff,0xc5,0x0a,0x7e] +// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, v255 ; encoding: [0xff,0xc5,0x0a,0x7e] v_sat_pk_u8_i16 v5, s1 -// GFX12: v_sat_pk_u8_i16_e32 v5, s1 ; encoding: [0x01,0xc4,0x0a,0x7e] +// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, s1 ; encoding: [0x01,0xc4,0x0a,0x7e] +// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, s1 ; encoding: [0x01,0xc4,0x0a,0x7e] v_sat_pk_u8_i16 v5, s105 -// GFX12: v_sat_pk_u8_i16_e32 v5, s105 ; encoding: [0x69,0xc4,0x0a,0x7e] +// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, s105 ; encoding: [0x69,0xc4,0x0a,0x7e] +// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, s105 ; encoding: [0x69,0xc4,0x0a,0x7e] 
v_sat_pk_u8_i16 v5, vcc_lo -// GFX12: v_sat_pk_u8_i16_e32 v5, vcc_lo ; encoding: [0x6a,0xc4,0x0a,0x7e] +// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, vcc_lo ; encoding: [0x6a,0xc4,0x0a,0x7e] +// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xc4,0x0a,0x7e] v_sat_pk_u8_i16 v5, vcc_hi -// GFX12: v_sat_pk_u8_i16_e32 v5, vcc_hi ; encoding: [0x6b,0xc4,0x0a,0x7e] +// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, vcc_hi ; encoding: [0x6b,0xc4,0x0a,0x7e] +// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xc4,0x0a,0x7e] v_sat_pk_u8_i16 v5, ttmp15 -// GFX12: v_sat_pk_u8_i16_e32 v5, ttmp15 ; encoding: [0x7b,0xc4,0x0a,0x7e] +// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, ttmp15 ; encoding: [0x7b,0xc4,0x0a,0x7e] +// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xc4,0x0a,0x7e] v_sat_pk_u8_i16 v5, m0 -// GFX12: v_sat_pk_u8_i16_e32 v5, m0 ; encoding: [0x7d,0xc4,0x0a,0x7e] +// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, m0 ; encoding: [0x7d,0xc4,0x0a,0x7e] +// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, m0 ; encoding: [0x7d,0xc4,0x0a,0x7e] v_sat_pk_u8_i16 v5, exec_lo -// GFX12: v_sat_pk_u8_i16_e32 v5, exec_lo ; encoding: [0x7e,0xc4,0x0a,0x7e] +// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, exec_lo ; encoding: [0x7e,0xc4,0x0a,0x7e] +// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, exec_lo ; encoding: [0x7e,0xc4,0x0a,0x7e] v_sat_pk_u8_i16 v5, exec_hi -// GFX12: v_sat_pk_u8_i16_e32 v5, exec_hi ; encoding: [0x7f,0xc4,0x0a,0x7e] +// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, exec_hi ; encoding: [0x7f,0xc4,0x0a,0x7e] +// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, exec_hi ; encoding: [0x7f,0xc4,0x0a,0x7e] v_sat_pk_u8_i16 v5, null -// GFX12: v_sat_pk_u8_i16_e32 v5, null ; encoding: [0x7c,0xc4,0x0a,0x7e] +// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, null ; encoding: [0x7c,0xc4,0x0a,0x7e] +// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, null ; encoding: [0x7c,0xc4,0x0a,0x7e] v_sat_pk_u8_i16 v5, -1 -// GFX12: v_sat_pk_u8_i16_e32 v5, -1 ; encoding: [0xc1,0xc4,0x0a,0x7e] +// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, -1 ; encoding: 
[0xc1,0xc4,0x0a,0x7e] +// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, -1 ; encoding: [0xc1,0xc4,0x0a,0x7e] v_sat_pk_u8_i16 v5, 0.5 -// GFX12: v_sat_pk_u8_i16_e32 v5, 0.5 ; encoding: [0xf0,0xc4,0x0a,0x7e] +// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, 0.5 ; encoding: [0xf0,0xc4,0x0a,0x7e] +// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, 0.5 ; encoding: [0xf0,0xc4,0x0a,0x7e] v_sat_pk_u8_i16 v5, src_scc -// GFX12: v_sat_pk_u8_i16_e32 v5, src_scc ; encoding: [0xfd,0xc4,0x0a,0x7e] +// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, src_scc ; encoding: [0xfd,0xc4,0x0a,0x7e] +// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, src_scc ; encoding: [0xfd,0xc4,0x0a,0x7e] v_sat_pk_u8_i16 v127, 0xfe0b -// GFX12: v_sat_pk_u8_i16_e32 v127, 0xfe0b ; encoding: [0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +// GFX12-ASM: v_sat_pk_u8_i16_e32 v127, 0xfe0b ; encoding: [0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +// GFX12-DIS: v_sat_pk_u8_i16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +v_sat_pk_u8_i16 v5.h, src_scc +// GFX12: v_sat_pk_u8_i16_e32 v5.h, src_scc ; encoding: [0xfd,0xc4,0x0a,0x7f] + +v_sat_pk_u8_i16 v127.h, 0xfe0b +// GFX12: v_sat_pk_u8_i16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xc4,0xfe,0x7f,0x0b,0xfe,0x00,0x00] v_sin_f16 v5, v1 // GFX12: v_sin_f16_e32 v5, v1 ; encoding: [0x01,0xc1,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s index 56b42f19db38a..26e7162206aed 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s @@ -2632,6 +2632,12 @@ v_sat_pk_u8_i16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_sat_pk_u8_i16 v127, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_sat_pk_u8_i16_dpp v127, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x05,0x30] +v_sat_pk_u8_i16 v5.h, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_sat_pk_u8_i16_dpp v5.h, v1 row_xmask:0 row_mask:0x1 
bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc4,0x0a,0x7f,0x01,0x60,0x09,0x13] + +v_sat_pk_u8_i16 v127.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_sat_pk_u8_i16_dpp v127.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7f,0xff,0x6f,0x05,0x30] + v_sin_f16 v5, v1 quad_perm:[3,2,1,0] // GFX12: v_sin_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s index 09f3069114d4a..a54ae771fab40 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s @@ -619,6 +619,12 @@ v_sat_pk_u8_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_sat_pk_u8_i16 v127, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_sat_pk_u8_i16_dpp v127, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00] +v_sat_pk_u8_i16 v5.h, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_sat_pk_u8_i16_dpp v5.h, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc4,0x0a,0x7f,0x01,0x77,0x39,0x05] + +v_sat_pk_u8_i16 v127.h, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_sat_pk_u8_i16_dpp v127.h, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc4,0xfe,0x7f,0xff,0x00,0x00,0x00] + v_sin_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s index 0ccad9c673079..01aa7a44bbc23 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s @@ -625,6 +625,24 @@ v_sat_pk_u8_i16_e32 v199, v5 dpp8:[7,6,5,4,3,2,1,0] v_sat_pk_u8_i16_e32 v199, v5 quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:30: error: invalid operand for instruction +v_sat_pk_u8_i16_e32 v199.h, v5 +// GFX12: :[[@LINE-1]]:21: error: invalid operand for 
instruction + +v_sat_pk_u8_i16_e32 v199.h, v5 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_sat_pk_u8_i16_e32 v199.h, v5 quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_sat_pk_u8_i16_e32 v199.l, v5 +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_sat_pk_u8_i16_e32 v199.l, v5 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_sat_pk_u8_i16_e32 v199.l, v5 quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + v_sin_f16_e32 v128, 0xfe0b // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s index f220ec2b7d1e5..4c983af094561 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s @@ -1771,6 +1771,15 @@ v_sat_pk_u8_i16 v199, v5 dpp8:[7,6,5,4,3,2,1,0] v_sat_pk_u8_i16 v199, v5 quad_perm:[3,2,1,0] // GFX12: v_sat_pk_u8_i16_e64_dpp v199, v5 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xc7,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x05,0x1b,0x00,0xff] +v_sat_pk_u8_i16 v199.h, v5 +// GFX12: v_sat_pk_u8_i16_e64 v199.h, v5 op_sel:[0,1] ; encoding: [0xc7,0x40,0xe2,0xd5,0x05,0x01,0x00,0x00] + +v_sat_pk_u8_i16 v199.h, v5 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_sat_pk_u8_i16_e64_dpp v199.h, v5 op_sel:[0,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xc7,0x40,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x05,0x77,0x39,0x05] + +v_sat_pk_u8_i16 v199.h, v5 quad_perm:[3,2,1,0] +// GFX12: v_sat_pk_u8_i16_e64_dpp v199.h, v5 op_sel:[0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xc7,0x40,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x05,0x1b,0x00,0xff] + v_sin_f16 v128, 0xfe0b // GFX12: v_sin_f16_e64 v128, 0xfe0b ; encoding: [0x80,0x00,0xe0,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] diff --git 
a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s index 015619d31504b..ea4a58d9d0f7e 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s @@ -3385,6 +3385,9 @@ v_sat_pk_u8_i16_e64 v5, src_scc v_sat_pk_u8_i16_e64 v255, 0xfe0b // GFX12: v_sat_pk_u8_i16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_sat_pk_u8_i16_e64 v255.h, 0xfe0b +// GFX12: v_sat_pk_u8_i16_e64 v255.h, 0xfe0b op_sel:[0,1] ; encoding: [0xff,0x40,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] + v_sin_f16_e64 v5, v1 // GFX12: v_sin_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s index 160bc3fc6afc7..a9b933e639abb 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s @@ -2548,6 +2548,9 @@ v_sat_pk_u8_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] +v_sat_pk_u8_i16_e64_dpp v255.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_sat_pk_u8_i16_e64_dpp v255.h, v255 op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x40,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] + v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX12: v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s index 
c9ea7cdf1512e..af335f2e0b586 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s @@ -730,6 +730,9 @@ v_sat_pk_u8_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64_dpp v255.h, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_sat_pk_u8_i16_e64_dpp v255.h, v255 op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x40,0xe2,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] + v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt index 61e529abf4455..f02f0206acd2f 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt @@ -3220,49 +3220,74 @@ # GFX11: v_rsq_f64_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x62,0xfc,0x7f,0x56,0x34,0x12,0xaf] 0x01,0xc5,0x0a,0x7e -# GFX11: v_sat_pk_u8_i16_e32 v5, v1 ; encoding: [0x01,0xc5,0x0a,0x7e] +# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, v1 ; encoding: [0x01,0xc5,0x0a,0x7e] +# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, v1 ; encoding: [0x01,0xc5,0x0a,0x7e] 0xff,0xc5,0x0a,0x7e -# GFX11: v_sat_pk_u8_i16_e32 v5, v255 ; encoding: [0xff,0xc5,0x0a,0x7e] +# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, v255 ; encoding: [0xff,0xc5,0x0a,0x7e] +# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, v255 ; encoding: [0xff,0xc5,0x0a,0x7e] 0x01,0xc4,0x0a,0x7e -# GFX11: v_sat_pk_u8_i16_e32 v5, s1 ; encoding: [0x01,0xc4,0x0a,0x7e] +# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, s1 ; encoding: [0x01,0xc4,0x0a,0x7e] +# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, s1 ; encoding: [0x01,0xc4,0x0a,0x7e] 
0x69,0xc4,0x0a,0x7e -# GFX11: v_sat_pk_u8_i16_e32 v5, s105 ; encoding: [0x69,0xc4,0x0a,0x7e] +# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, s105 ; encoding: [0x69,0xc4,0x0a,0x7e] +# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, s105 ; encoding: [0x69,0xc4,0x0a,0x7e] 0x6a,0xc4,0x0a,0x7e -# GFX11: v_sat_pk_u8_i16_e32 v5, vcc_lo ; encoding: [0x6a,0xc4,0x0a,0x7e] +# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xc4,0x0a,0x7e] +# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, vcc_lo ; encoding: [0x6a,0xc4,0x0a,0x7e] 0x6b,0xc4,0x0a,0x7e -# GFX11: v_sat_pk_u8_i16_e32 v5, vcc_hi ; encoding: [0x6b,0xc4,0x0a,0x7e] +# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xc4,0x0a,0x7e] +# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, vcc_hi ; encoding: [0x6b,0xc4,0x0a,0x7e] 0x7b,0xc4,0x0a,0x7e -# GFX11: v_sat_pk_u8_i16_e32 v5, ttmp15 ; encoding: [0x7b,0xc4,0x0a,0x7e] +# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xc4,0x0a,0x7e] +# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, ttmp15 ; encoding: [0x7b,0xc4,0x0a,0x7e] 0x7d,0xc4,0x0a,0x7e -# GFX11: v_sat_pk_u8_i16_e32 v5, m0 ; encoding: [0x7d,0xc4,0x0a,0x7e] +# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, m0 ; encoding: [0x7d,0xc4,0x0a,0x7e] +# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, m0 ; encoding: [0x7d,0xc4,0x0a,0x7e] 0x7e,0xc4,0x0a,0x7e -# GFX11: v_sat_pk_u8_i16_e32 v5, exec_lo ; encoding: [0x7e,0xc4,0x0a,0x7e] +# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, exec_lo ; encoding: [0x7e,0xc4,0x0a,0x7e] +# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, exec_lo ; encoding: [0x7e,0xc4,0x0a,0x7e] 0x7f,0xc4,0x0a,0x7e -# GFX11: v_sat_pk_u8_i16_e32 v5, exec_hi ; encoding: [0x7f,0xc4,0x0a,0x7e] +# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, exec_hi ; encoding: [0x7f,0xc4,0x0a,0x7e] +# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, exec_hi ; encoding: [0x7f,0xc4,0x0a,0x7e] 0x7c,0xc4,0x0a,0x7e -# GFX11: v_sat_pk_u8_i16_e32 v5, null ; encoding: [0x7c,0xc4,0x0a,0x7e] +# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, null ; encoding: [0x7c,0xc4,0x0a,0x7e] +# 
GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, null ; encoding: [0x7c,0xc4,0x0a,0x7e] 0xc1,0xc4,0x0a,0x7e -# GFX11: v_sat_pk_u8_i16_e32 v5, -1 ; encoding: [0xc1,0xc4,0x0a,0x7e] +# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, -1 ; encoding: [0xc1,0xc4,0x0a,0x7e] +# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, -1 ; encoding: [0xc1,0xc4,0x0a,0x7e] 0xf0,0xc4,0x0a,0x7e -# GFX11: v_sat_pk_u8_i16_e32 v5, 0.5 ; encoding: [0xf0,0xc4,0x0a,0x7e] +# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, 0.5 ; encoding: [0xf0,0xc4,0x0a,0x7e] +# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, 0.5 ; encoding: [0xf0,0xc4,0x0a,0x7e] 0xfd,0xc4,0x0a,0x7e -# GFX11: v_sat_pk_u8_i16_e32 v5, src_scc ; encoding: [0xfd,0xc4,0x0a,0x7e] +# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, src_scc ; encoding: [0xfd,0xc4,0x0a,0x7e] +# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, src_scc ; encoding: [0xfd,0xc4,0x0a,0x7e] 0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00 -# GFX11: v_sat_pk_u8_i16_e32 v127, 0xfe0b ; encoding: [0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_sat_pk_u8_i16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v127, 0xfe0b ; encoding: [0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +0xf0,0xc4,0xfe,0x7e +# GFX11-REAL16: v_sat_pk_u8_i16_e32 v127.l, 0.5 ; encoding: [0xf0,0xc4,0xfe,0x7e] +# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v127, 0.5 ; encoding: [0xf0,0xc4,0xfe,0x7e] + +0xfd,0xc4,0x0a,0x7f +# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.h, src_scc ; encoding: [0xfd,0xc4,0x0a,0x7f] + +0xff,0xc4,0xfe,0x7f,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_sat_pk_u8_i16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xc4,0xfe,0x7f,0x0b,0xfe,0x00,0x00] 0x01,0xc1,0x0a,0x7e # GFX11: v_sin_f16_e32 v5, v1 ; encoding: [0x01,0xc1,0x0a,0x7e] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt index 1075a3eecd540..a4491e02abf05 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt +++ 
b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt @@ -2525,46 +2525,72 @@ # GFX11: v_rsq_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x5c,0xfe,0x7f,0xff,0x6f,0x3d,0x30] 0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX11: v_sat_pk_u8_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX11: v_sat_pk_u8_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_half_mirror 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_sat_pk_u8_i16_dpp 
v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x0d,0x30 -# GFX11: v_sat_pk_u8_i16_dpp v127, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v127.l, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v127, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x0d,0x30] + +0xfa,0xc4,0xfe,0x7e,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v127.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v127, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0x01,0x5f,0x01,0x01] + +0xfa,0xc4,0x0a,0x7f,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.h, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc4,0x0a,0x7f,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_mul_i32_i24_e32 v128, s1, v176 ; encoding: [0x01,0x60,0x01,0x13] + +0xfa,0xc4,0xfe,0x7f,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v127.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7f,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_lshlrev_b32_e32 v6, v255, v183 ; encoding: [0xff,0x6f,0x0d,0x30] 0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff # GFX11: v_sin_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt index 051dd348e9a38..4e15731203168 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt @@ -470,10 +470,23 @@ # GFX11: 
v_rsq_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x5c,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX11: v_sat_pk_u8_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xea,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00 -# GFX11: v_sat_pk_u8_i16_dpp v127, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v127.l, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v127, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00] + +0xe9,0xc4,0xfe,0x7e,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v127.l, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0xfe,0x7e,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v127, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0xfe,0x7e,0x01,0x77,0x39,0x05] + +0xe9,0xc4,0x0a,0x7f,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.h, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7f,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_dot2acc_f32_f16 v156, v1, v187 ; encoding: [0x01,0x77,0x39,0x05] + +0xea,0xc4,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v127.h, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc4,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05 # GFX11: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt index 2666b758344c6..f97c678e6a90a 100644 --- 
a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt @@ -2687,46 +2687,64 @@ # GFX11: v_rsq_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xae,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01 -# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] 0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30 -# GFX11: v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v255.l, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; 
encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] + +0xff,0x40,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v255.h, v255 op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x40,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff # GFX11: v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt index c19947c4bd6ff..3cad28d888202 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt @@ -727,10 +727,16 @@ # GFX11: v_rsq_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xae,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00 -# GFX11: v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v255.l, 
v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] + +0xff,0x40,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v255.h, v255 op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x40,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX11: v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt index 3df206ccf522e..8b2bc97c5de1f 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt @@ -3204,49 +3204,68 @@ # GFX11: v_rsq_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xb1,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] 0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00 -# GFX11: v_sat_pk_u8_i16_e64 v5, v1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00] +# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, v1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, v1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00 -# GFX11: v_sat_pk_u8_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00] +# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, v255 ; encoding: [0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00] 
0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00 -# GFX11: v_sat_pk_u8_i16_e64 v5, s1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00] +# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, s1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00 -# GFX11: v_sat_pk_u8_i16_e64 v5, s105 ; encoding: [0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00] +# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, s105 ; encoding: [0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00 -# GFX11: v_sat_pk_u8_i16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00] +# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00 -# GFX11: v_sat_pk_u8_i16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00] +# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00 -# GFX11: v_sat_pk_u8_i16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00] +# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00 -# GFX11: v_sat_pk_u8_i16_e64 v5, m0 ; encoding: [0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00] +# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, m0 ; encoding: 
[0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00 -# GFX11: v_sat_pk_u8_i16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00] +# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00 -# GFX11: v_sat_pk_u8_i16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00 -# GFX11: v_sat_pk_u8_i16_e64 v5, null ; encoding: [0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00] +# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, null ; encoding: [0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, null ; encoding: [0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00 -# GFX11: v_sat_pk_u8_i16_e64 v5, -1 ; encoding: [0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00] +# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, -1 ; encoding: [0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00 -# GFX11: v_sat_pk_u8_i16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00] +# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, 0.5 ; encoding: [0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00 -# GFX11: v_sat_pk_u8_i16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00] +# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, src_scc ; encoding: [0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00] +# GFX11-FAKE16: 
v_sat_pk_u8_i16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00] 0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00 -# GFX11: v_sat_pk_u8_i16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_sat_pk_u8_i16_e64 v255.l, 0xfe0b ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] + +0xff,0x40,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_sat_pk_u8_i16_e64 v255.h, 0xfe0b op_sel:[0,1] ; encoding: [0xff,0x40,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00 # GFX11: v_sin_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt index a1291b2e34f34..aa60378da9ab0 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt @@ -2661,46 +2661,68 @@ # GFX12: v_rsq_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x5c,0xfe,0x7f,0xff,0x6f,0x3d,0x30] 0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX12: v_sat_pk_u8_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX12: v_sat_pk_u8_i16_dpp v5, v1 
quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX12: 
v_sat_pk_u8_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x0d,0x30 -# GFX12: v_sat_pk_u8_i16_dpp v127, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v127.l, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v127, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x0d,0x30] + +0xfa,0xc4,0x0a,0x7f,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.h, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc4,0x0a,0x7f,0x01,0x60,0x01,0x13] +# 
GFX12-FAKE16: v_mul_i32_i24_e32 v128, s1, v176 ; encoding: [0x01,0x60,0x01,0x13] + +0xfa,0xc4,0xfe,0x7f,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v127.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7f,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_lshlrev_b32_e32 v6, v255, v183 ; encoding: [0xff,0x6f,0x0d,0x30] 0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff # GFX12: v_sin_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt index 05008bfabc45a..99985e09d7432 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt @@ -483,10 +483,19 @@ # GFX12: v_rsq_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x5c,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX12: v_sat_pk_u8_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xea,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00 -# GFX12: v_sat_pk_u8_i16_dpp v127, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v127.l, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v127, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00] + +0xe9,0xc4,0x0a,0x7f,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.h, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7f,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_add_f64_e32 
v[156:157], v[1:2], v[187:188] ; encoding: [0x01,0x77,0x39,0x05] + +0xea,0xc4,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v127.h, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc4,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05 # GFX12: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt index bb9f607b6ece6..8ba4f58b787f5 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt @@ -3250,49 +3250,68 @@ # GFX12: v_rsq_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xb1,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] 0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00 -# GFX12: v_sat_pk_u8_i16_e64 v5, v1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00] +# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, v1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, v1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00 -# GFX12: v_sat_pk_u8_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00] +# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, v255 ; encoding: [0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00 -# GFX12: v_sat_pk_u8_i16_e64 v5, s1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00] +# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, s1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00 -# GFX12: v_sat_pk_u8_i16_e64 v5, s105 ; encoding: [0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00] +# 
GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, s105 ; encoding: [0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00 -# GFX12: v_sat_pk_u8_i16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00 -# GFX12: v_sat_pk_u8_i16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00 -# GFX12: v_sat_pk_u8_i16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00 -# GFX12: v_sat_pk_u8_i16_e64 v5, m0 ; encoding: [0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, m0 ; encoding: [0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00 -# GFX12: v_sat_pk_u8_i16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00 -# GFX12: v_sat_pk_u8_i16_e64 v5, exec_hi ; encoding: 
[0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00 -# GFX12: v_sat_pk_u8_i16_e64 v5, null ; encoding: [0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, null ; encoding: [0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, null ; encoding: [0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00 -# GFX12: v_sat_pk_u8_i16_e64 v5, -1 ; encoding: [0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, -1 ; encoding: [0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00 -# GFX12: v_sat_pk_u8_i16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00] +# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, 0.5 ; encoding: [0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00 -# GFX12: v_sat_pk_u8_i16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00] +# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, src_scc ; encoding: [0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00] 0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00 -# GFX12: v_sat_pk_u8_i16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_sat_pk_u8_i16_e64 v255.l, 0xfe0b ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v255, 0xfe0b ; encoding: 
[0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] + +0xff,0x40,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_sat_pk_u8_i16_e64 v255.h, 0xfe0b op_sel:[0,1] ; encoding: [0xff,0x40,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00 # GFX12: v_sin_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt index be9f069322da8..98da7c8c54508 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt @@ -2531,46 +2531,64 @@ # GFX12: v_rsq_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xae,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 
0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01 -# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] +# 
GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] 0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30 -# GFX12: v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v255.l, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] + +0xff,0x40,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v255.h, v255 op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x40,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff # GFX12: v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt index 87115b962a808..8213237ada1e2 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt @@ -691,10 +691,16 @@ # GFX12: v_rsq_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xae,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 
0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00 -# GFX12: v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v255.l, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] + +0xff,0x40,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v255.h, v255 op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x40,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX12: v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] From e5acb167b72a6d2a6e29bcd29d6be57e15224c24 Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Fri, 3 Jan 2025 11:43:45 -0500 Subject: [PATCH 368/567] [AMDGPU][True16][MC] true16 for v_trunc_f16 (#120693) Support true16 format for v_trunc_f16 in MC --- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 2 +- llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll | 41 +++++ llvm/test/MC/AMDGPU/gfx11_asm_vop1.s | 75 +++++---- 
llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s | 65 ++++---- llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s | 21 ++- llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s | 44 ++++- .../MC/AMDGPU/gfx11_asm_vop1_t16_promote.s | 155 ++++++++++++----- .../AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s | 65 ++++---- .../MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s | 25 ++- .../test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s | 69 ++++---- llvm/test/MC/AMDGPU/gfx12_asm_vop1.s | 72 ++++---- llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s | 62 +++---- llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s | 18 +- llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s | 43 +++++ .../MC/AMDGPU/gfx12_asm_vop1_t16_promote.s | 156 +++++++++++++----- .../test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s | 69 ++++---- .../AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s | 65 ++++---- .../MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s | 25 ++- .../Disassembler/AMDGPU/gfx11_dasm_vop1.txt | 63 +++++-- .../AMDGPU/gfx11_dasm_vop1_dpp16.txt | 54 ++++-- .../AMDGPU/gfx11_dasm_vop1_dpp8.txt | 17 +- .../gfx11_dasm_vop3_dpp16_from_vop1.txt | 54 ++++-- .../AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt | 24 ++- .../AMDGPU/gfx11_dasm_vop3_from_vop1.txt | 57 +++++-- .../AMDGPU/gfx12_dasm_vop1_dpp16.txt | 50 ++++-- .../AMDGPU/gfx12_dasm_vop1_dpp8.txt | 13 +- .../AMDGPU/gfx12_dasm_vop3_from_vop1.txt | 57 +++++-- .../gfx12_dasm_vop3_from_vop1_dpp16.txt | 54 ++++-- .../AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt | 24 ++- 29 files changed, 1074 insertions(+), 465 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index bbb456ab739ab..92ebd0e10c8fd 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -1042,7 +1042,7 @@ defm V_FLOOR_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05b, "v_floor_f1 defm V_FLOOR_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05b, "v_floor_f16">; defm V_CEIL_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05c, "v_ceil_f16">; defm V_CEIL_F16_fake16 : 
VOP1_Real_FULL_t16_gfx11_gfx12<0x05c, "v_ceil_f16">; -defm V_TRUNC_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05d, "v_trunc_f16">; +defm V_TRUNC_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05d, "v_trunc_f16">; defm V_RNDNE_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05e, "v_rndne_f16">; defm V_FRACT_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05f, "v_fract_f16">; defm V_SIN_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x060, "v_sin_f16">; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll index 47777e3853e89..0d58afd1812de 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll @@ -2,6 +2,7 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s declare half @llvm.trunc.f16(half %a) declare <2 x half> @llvm.trunc.v2f16(<2 x half> %a) @@ -62,6 +63,24 @@ define amdgpu_kernel void @trunc_f16( ; GFX11-NEXT: v_trunc_f16_e32 v0, v0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: trunc_f16: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s6, -1 +; GFX12-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-NEXT: s_mov_b32 s10, s6 +; GFX12-NEXT: s_mov_b32 s11, s7 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s2 +; GFX12-NEXT: s_mov_b32 s9, s3 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: 
buffer_load_u16 v0, off, s[8:11], null +; GFX12-NEXT: s_mov_b32 s5, s1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_trunc_f16_e32 v0, v0 +; GFX12-NEXT: buffer_store_b16 v0, off, s[4:7], null +; GFX12-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -147,6 +166,28 @@ define amdgpu_kernel void @trunc_v2f16( ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: trunc_v2f16: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s6, -1 +; GFX12-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-NEXT: s_mov_b32 s10, s6 +; GFX12-NEXT: s_mov_b32 s11, s7 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s2 +; GFX12-NEXT: s_mov_b32 s9, s3 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: buffer_load_b32 v0, off, s[8:11], null +; GFX12-NEXT: s_mov_b32 s5, s1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-NEXT: v_trunc_f16_e32 v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_trunc_f16_e32 v1, v1 +; GFX12-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null +; GFX12-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s index 4448720e6f79f..fe08042ae5c84 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s @@ -3533,50 +3533,65 @@ v_swaprel_b32 v5, v1 v_swaprel_b32 v255, v255 // GFX11: v_swaprel_b32 v255, v255 ; encoding: [0xff,0xd1,0xfe,0x7f] -v_trunc_f16 v5, v1 -// GFX11: v_trunc_f16_e32 v5, v1 ; encoding: [0x01,0xbb,0x0a,0x7e] +v_trunc_f16 v5.l, v1.l +// GFX11: v_trunc_f16_e32 v5.l, v1.l ; encoding: [0x01,0xbb,0x0a,0x7e] -v_trunc_f16 v5, v127 -// GFX11: v_trunc_f16_e32 v5, v127 ; encoding: [0x7f,0xbb,0x0a,0x7e] +v_trunc_f16 v5.l, v127.l +// GFX11: 
v_trunc_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xbb,0x0a,0x7e] -v_trunc_f16 v5, s1 -// GFX11: v_trunc_f16_e32 v5, s1 ; encoding: [0x01,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, s1 +// GFX11: v_trunc_f16_e32 v5.l, s1 ; encoding: [0x01,0xba,0x0a,0x7e] -v_trunc_f16 v5, s105 -// GFX11: v_trunc_f16_e32 v5, s105 ; encoding: [0x69,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, s105 +// GFX11: v_trunc_f16_e32 v5.l, s105 ; encoding: [0x69,0xba,0x0a,0x7e] -v_trunc_f16 v5, vcc_lo -// GFX11: v_trunc_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, vcc_lo +// GFX11: v_trunc_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xba,0x0a,0x7e] -v_trunc_f16 v5, vcc_hi -// GFX11: v_trunc_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, vcc_hi +// GFX11: v_trunc_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xba,0x0a,0x7e] -v_trunc_f16 v5, ttmp15 -// GFX11: v_trunc_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, ttmp15 +// GFX11: v_trunc_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xba,0x0a,0x7e] -v_trunc_f16 v5, m0 -// GFX11: v_trunc_f16_e32 v5, m0 ; encoding: [0x7d,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, m0 +// GFX11: v_trunc_f16_e32 v5.l, m0 ; encoding: [0x7d,0xba,0x0a,0x7e] -v_trunc_f16 v5, exec_lo -// GFX11: v_trunc_f16_e32 v5, exec_lo ; encoding: [0x7e,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, exec_lo +// GFX11: v_trunc_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xba,0x0a,0x7e] -v_trunc_f16 v5, exec_hi -// GFX11: v_trunc_f16_e32 v5, exec_hi ; encoding: [0x7f,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, exec_hi +// GFX11: v_trunc_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xba,0x0a,0x7e] -v_trunc_f16 v5, null -// GFX11: v_trunc_f16_e32 v5, null ; encoding: [0x7c,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, null +// GFX11: v_trunc_f16_e32 v5.l, null ; encoding: [0x7c,0xba,0x0a,0x7e] -v_trunc_f16 v5, -1 -// GFX11: v_trunc_f16_e32 v5, -1 ; encoding: [0xc1,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, -1 +// GFX11: v_trunc_f16_e32 v5.l, -1 ; encoding: [0xc1,0xba,0x0a,0x7e] -v_trunc_f16 v5, 0.5 -// GFX11: 
v_trunc_f16_e32 v5, 0.5 ; encoding: [0xf0,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, 0.5 +// GFX11: v_trunc_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xba,0x0a,0x7e] -v_trunc_f16 v5, src_scc -// GFX11: v_trunc_f16_e32 v5, src_scc ; encoding: [0xfd,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, src_scc +// GFX11: v_trunc_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xba,0x0a,0x7e] -v_trunc_f16 v127, 0xfe0b -// GFX11: v_trunc_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +v_trunc_f16 v127.l, 0xfe0b +// GFX11: v_trunc_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +v_trunc_f16 v5.l, v1.h +// GFX11: v_trunc_f16_e32 v5.l, v1.h ; encoding: [0x81,0xbb,0x0a,0x7e] + +v_trunc_f16 v5.l, v127.h +// GFX11: v_trunc_f16_e32 v5.l, v127.h ; encoding: [0xff,0xbb,0x0a,0x7e] + +v_trunc_f16 v127.l, 0.5 +// GFX11: v_trunc_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xba,0xfe,0x7e] + +v_trunc_f16 v5.h, src_scc +// GFX11: v_trunc_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xba,0x0a,0x7f] + +v_trunc_f16 v127.h, 0xfe0b +// GFX11: v_trunc_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xba,0xfe,0x7f,0x0b,0xfe,0x00,0x00] v_trunc_f32 v5, v1 // GFX11: v_trunc_f32_e32 v5, v1 ; encoding: [0x01,0x43,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s index da2a3615360a4..f5cf3fd390c7d 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s @@ -2741,47 +2741,56 @@ v_sqrt_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_sqrt_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_sqrt_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x66,0xfe,0x7f,0xff,0x6f,0x35,0x30] -v_trunc_f16 v5, v1 quad_perm:[3,2,1,0] -// GFX11: v_trunc_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff] +v_trunc_f16 v5.l, v1.l quad_perm:[3,2,1,0] +// 
GFX11: v_trunc_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff] -v_trunc_f16 v5, v1 quad_perm:[0,1,2,3] -// GFX11: v_trunc_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff] +v_trunc_f16 v5.l, v1.l quad_perm:[0,1,2,3] +// GFX11: v_trunc_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff] -v_trunc_f16 v5, v1 row_mirror -// GFX11: v_trunc_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff] +v_trunc_f16 v5.l, v1.l row_mirror +// GFX11: v_trunc_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff] -v_trunc_f16 v5, v1 row_half_mirror -// GFX11: v_trunc_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff] +v_trunc_f16 v5.l, v1.l row_half_mirror +// GFX11: v_trunc_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff] -v_trunc_f16 v5, v1 row_shl:1 -// GFX11: v_trunc_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff] +v_trunc_f16 v5.l, v1.l row_shl:1 +// GFX11: v_trunc_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff] -v_trunc_f16 v5, v1 row_shl:15 -// GFX11: v_trunc_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff] +v_trunc_f16 v5.l, v1.l row_shl:15 +// GFX11: v_trunc_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff] -v_trunc_f16 v5, v1 row_shr:1 -// GFX11: v_trunc_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff] +v_trunc_f16 v5.l, v1.l row_shr:1 +// GFX11: 
v_trunc_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff] -v_trunc_f16 v5, v1 row_shr:15 -// GFX11: v_trunc_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff] +v_trunc_f16 v5.l, v1.l row_shr:15 +// GFX11: v_trunc_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff] -v_trunc_f16 v5, v1 row_ror:1 -// GFX11: v_trunc_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff] +v_trunc_f16 v5.l, v1.l row_ror:1 +// GFX11: v_trunc_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff] -v_trunc_f16 v5, v1 row_ror:15 -// GFX11: v_trunc_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff] +v_trunc_f16 v5.l, v1.l row_ror:15 +// GFX11: v_trunc_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff] -v_trunc_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_trunc_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff] +v_trunc_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_trunc_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff] -v_trunc_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_trunc_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01] +v_trunc_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_trunc_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01] -v_trunc_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_trunc_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 
bound_ctrl:1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x09,0x13] +v_trunc_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX11: v_trunc_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x09,0x13] -v_trunc_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_trunc_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +v_trunc_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX11: v_trunc_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x35,0x30] + +v_trunc_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_trunc_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x5f,0x01,0x01] + +v_trunc_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_trunc_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xba,0x0a,0x7f,0x81,0x60,0x09,0x13] + +v_trunc_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_trunc_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7f,0xff,0x6f,0x35,0x30] v_trunc_f32 v5, v1 quad_perm:[3,2,1,0] // GFX11: v_trunc_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s index 34cb2d097b7a7..5a0ffd04bc5c1 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s @@ -644,14 +644,23 @@ v_sqrt_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_sqrt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_sqrt_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: 
[0xe9,0x66,0xfe,0x7f,0xff,0x00,0x00,0x00] -v_trunc_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_trunc_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_trunc_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_trunc_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_trunc_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_trunc_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_trunc_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_trunc_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_trunc_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_trunc_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00] +v_trunc_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX11: v_trunc_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +v_trunc_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_trunc_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0xfe,0x7e,0x7f,0x77,0x39,0x05] + +v_trunc_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_trunc_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xba,0x0a,0x7f,0x81,0x77,0x39,0x05] + +v_trunc_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_trunc_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xba,0xfe,0x7f,0xff,0x00,0x00,0x00] v_trunc_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_trunc_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x42,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s index 9c5693de3d8b1..92882cb89e201 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s @@ -1,4 +1,4 @@ -; NOTE: Assertions have 
been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize32 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=error: %s // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=error: %s @@ -812,6 +812,12 @@ v_swap_b16_e32 v128.l, v0.l v_trunc_f16_e32 v128, 0xfe0b // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_trunc_f16_e32 v128.h, 0xfe0b +// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_trunc_f16_e32 v128.l, 0xfe0b +// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction + v_trunc_f16_e32 v255, v1 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -821,6 +827,24 @@ v_trunc_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0] v_trunc_f16_e32 v255, v1 quad_perm:[3,2,1,0] // GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction +v_trunc_f16_e32 v255.h, v1.h +// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_trunc_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_trunc_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_trunc_f16_e32 v255.l, v1.l +// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_trunc_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_trunc_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction + v_trunc_f16_e32 v5, v199 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -829,3 +853,21 @@ v_trunc_f16_e32 v5, v199 
dpp8:[7,6,5,4,3,2,1,0] v_trunc_f16_e32 v5, v199 quad_perm:[3,2,1,0] // GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_trunc_f16_e32 v5.h, v199.h +// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_trunc_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_trunc_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_trunc_f16_e32 v5.l, v199.l +// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_trunc_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_trunc_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s index fa6ab407f87c7..d97c8ed844dbb 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefix=GFX11 --implicit-check-not=_e32 %s // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefix=GFX11 --implicit-check-not=_e32 %s @@ -1952,69 +1952,134 @@ v_sqrt_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_sqrt_f16 v5, v199 quad_perm:[3,2,1,0] // GFX11: v_sqrt_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] -v_trunc_f16 v128, 0xfe0b -// GFX11: v_trunc_f16_e64 v128, 0xfe0b ; encoding: 
[0x80,0x00,0xdd,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_trunc_f16 v128.h, 0xfe0b +// GFX11: v_trunc_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xdd,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_trunc_f16 v255, -1 -// GFX11: v_trunc_f16_e64 v255, -1 ; encoding: [0xff,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00] +v_trunc_f16 v128.l, 0xfe0b +// GFX11: v_trunc_f16_e64 v128.l, 0xfe0b ; encoding: [0x80,0x00,0xdd,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_trunc_f16 v255, 0.5 -// GFX11: v_trunc_f16_e64 v255, 0.5 ; encoding: [0xff,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x00] +v_trunc_f16 v255.h, -1 +// GFX11: v_trunc_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0xc1,0x00,0x00,0x00] -v_trunc_f16 v255, exec_hi -// GFX11: v_trunc_f16_e64 v255, exec_hi ; encoding: [0xff,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00] +v_trunc_f16 v255.h, 0.5 +// GFX11: v_trunc_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0xf0,0x00,0x00,0x00] -v_trunc_f16 v255, exec_lo -// GFX11: v_trunc_f16_e64 v255, exec_lo ; encoding: [0xff,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00] +v_trunc_f16 v255.h, exec_hi +// GFX11: v_trunc_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7f,0x00,0x00,0x00] -v_trunc_f16 v255, m0 -// GFX11: v_trunc_f16_e64 v255, m0 ; encoding: [0xff,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00] +v_trunc_f16 v255.h, exec_lo +// GFX11: v_trunc_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7e,0x00,0x00,0x00] -v_trunc_f16 v255, null -// GFX11: v_trunc_f16_e64 v255, null ; encoding: [0xff,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00] +v_trunc_f16 v255.h, m0 +// GFX11: v_trunc_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7d,0x00,0x00,0x00] -v_trunc_f16 v255, s1 -// GFX11: v_trunc_f16_e64 v255, s1 ; encoding: [0xff,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00] +v_trunc_f16 v255.h, null +// GFX11: v_trunc_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7c,0x00,0x00,0x00] -v_trunc_f16 v255, s105 
-// GFX11: v_trunc_f16_e64 v255, s105 ; encoding: [0xff,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00] +v_trunc_f16 v255.h, s1 +// GFX11: v_trunc_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x01,0x00,0x00,0x00] -v_trunc_f16 v255, src_scc -// GFX11: v_trunc_f16_e64 v255, src_scc ; encoding: [0xff,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x00] +v_trunc_f16 v255.h, s105 +// GFX11: v_trunc_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x69,0x00,0x00,0x00] -v_trunc_f16 v255, ttmp15 -// GFX11: v_trunc_f16_e64 v255, ttmp15 ; encoding: [0xff,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00] +v_trunc_f16 v255.h, src_scc +// GFX11: v_trunc_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0xfd,0x00,0x00,0x00] -v_trunc_f16 v255, v1 -// GFX11: v_trunc_f16_e64 v255, v1 ; encoding: [0xff,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00] +v_trunc_f16 v255.h, ttmp15 +// GFX11: v_trunc_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7b,0x00,0x00,0x00] -v_trunc_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_trunc_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_trunc_f16 v255.h, v1.h +// GFX11: v_trunc_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xdd,0xd5,0x01,0x01,0x00,0x00] -v_trunc_f16 v255, v1 quad_perm:[3,2,1,0] -// GFX11: v_trunc_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_trunc_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_trunc_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_trunc_f16 v255, v127 -// GFX11: v_trunc_f16_e64 v255, v127 ; encoding: [0xff,0x00,0xdd,0xd5,0x7f,0x01,0x00,0x00] +v_trunc_f16 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX11: v_trunc_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0xff,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_trunc_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_trunc_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] +v_trunc_f16 v255.h, v127.h +// GFX11: v_trunc_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xdd,0xd5,0x7f,0x01,0x00,0x00] -v_trunc_f16 v255, v127 quad_perm:[3,2,1,0] -// GFX11: v_trunc_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] +v_trunc_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_trunc_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] -v_trunc_f16 v255, vcc_hi -// GFX11: v_trunc_f16_e64 v255, vcc_hi ; encoding: [0xff,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00] +v_trunc_f16 v255.h, v127.h quad_perm:[3,2,1,0] +// GFX11: v_trunc_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] -v_trunc_f16 v255, vcc_lo -// GFX11: v_trunc_f16_e64 v255, vcc_lo ; encoding: [0xff,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00] +v_trunc_f16 v255.h, vcc_hi +// GFX11: v_trunc_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x6b,0x00,0x00,0x00] -v_trunc_f16 v5, v199 -// GFX11: v_trunc_f16_e64 v5, v199 ; encoding: [0x05,0x00,0xdd,0xd5,0xc7,0x01,0x00,0x00] +v_trunc_f16 v255.h, vcc_lo +// GFX11: v_trunc_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x6a,0x00,0x00,0x00] -v_trunc_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_trunc_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] +v_trunc_f16 v255.l, -1 +// GFX11: v_trunc_f16_e64 v255.l, -1 ; encoding: [0xff,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00] +v_trunc_f16 v255.l, 0.5 +// GFX11: 
v_trunc_f16_e64 v255.l, 0.5 ; encoding: [0xff,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x00] -v_trunc_f16 v5, v199 quad_perm:[3,2,1,0] -// GFX11: v_trunc_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] +v_trunc_f16 v255.l, exec_hi +// GFX11: v_trunc_f16_e64 v255.l, exec_hi ; encoding: [0xff,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00] + +v_trunc_f16 v255.l, exec_lo +// GFX11: v_trunc_f16_e64 v255.l, exec_lo ; encoding: [0xff,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00] + +v_trunc_f16 v255.l, m0 +// GFX11: v_trunc_f16_e64 v255.l, m0 ; encoding: [0xff,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00] + +v_trunc_f16 v255.l, null +// GFX11: v_trunc_f16_e64 v255.l, null ; encoding: [0xff,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00] + +v_trunc_f16 v255.l, s1 +// GFX11: v_trunc_f16_e64 v255.l, s1 ; encoding: [0xff,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00] + +v_trunc_f16 v255.l, s105 +// GFX11: v_trunc_f16_e64 v255.l, s105 ; encoding: [0xff,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00] + +v_trunc_f16 v255.l, src_scc +// GFX11: v_trunc_f16_e64 v255.l, src_scc ; encoding: [0xff,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x00] + +v_trunc_f16 v255.l, ttmp15 +// GFX11: v_trunc_f16_e64 v255.l, ttmp15 ; encoding: [0xff,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00] + +v_trunc_f16 v255.l, v1.l +// GFX11: v_trunc_f16_e64 v255.l, v1.l ; encoding: [0xff,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00] + +v_trunc_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_trunc_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_trunc_f16 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX11: v_trunc_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_trunc_f16 v255.l, v127.l +// GFX11: v_trunc_f16_e64 v255.l, v127.l ; encoding: [0xff,0x00,0xdd,0xd5,0x7f,0x01,0x00,0x00] + +v_trunc_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: 
v_trunc_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] + +v_trunc_f16 v255.l, v127.l quad_perm:[3,2,1,0] +// GFX11: v_trunc_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] + +v_trunc_f16 v255.l, vcc_hi +// GFX11: v_trunc_f16_e64 v255.l, vcc_hi ; encoding: [0xff,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00] + +v_trunc_f16 v255.l, vcc_lo +// GFX11: v_trunc_f16_e64 v255.l, vcc_lo ; encoding: [0xff,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00] + +v_trunc_f16 v5.h, v199.h +// GFX11: v_trunc_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdd,0xd5,0xc7,0x01,0x00,0x00] + +v_trunc_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_trunc_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_trunc_f16 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX11: v_trunc_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] + +v_trunc_f16 v5.l, v199.l +// GFX11: v_trunc_f16_e64 v5.l, v199.l ; encoding: [0x05,0x00,0xdd,0xd5,0xc7,0x01,0x00,0x00] + +v_trunc_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_trunc_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_trunc_f16 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX11: v_trunc_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s index 1bd1a5c5695bc..6176baf11c552 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s @@ -2866,47 +2866,56 @@ 
v_sqrt_f32_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xb3,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -v_trunc_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX11: v_trunc_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] +// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_trunc_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] -// GFX11: v_trunc_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] +// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] -v_trunc_f16_e64_dpp v5, v1 row_mirror -// GFX11: v_trunc_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l row_mirror +// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] -v_trunc_f16_e64_dpp v5, v1 row_half_mirror -// GFX11: v_trunc_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l row_half_mirror +// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 
-v_trunc_f16_e64_dpp v5, v1 row_shl:1 -// GFX11: v_trunc_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l row_shl:1 +// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] -v_trunc_f16_e64_dpp v5, v1 row_shl:15 -// GFX11: v_trunc_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l row_shl:15 +// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] -v_trunc_f16_e64_dpp v5, v1 row_shr:1 -// GFX11: v_trunc_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l row_shr:1 +// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] -v_trunc_f16_e64_dpp v5, v1 row_shr:15 -// GFX11: v_trunc_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l row_shr:15 +// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] -v_trunc_f16_e64_dpp v5, v1 row_ror:1 -// GFX11: v_trunc_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l row_ror:1 +// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] -v_trunc_f16_e64_dpp v5, v1 row_ror:15 -// GFX11: v_trunc_f16_e64_dpp v5, v1 
row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l row_ror:15 +// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] -v_trunc_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_trunc_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] -v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +v_trunc_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +v_trunc_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: 
[0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +v_trunc_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: [0x05,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +v_trunc_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: [0x05,0x08,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +v_trunc_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: [0xff,0xc1,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] v_trunc_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX11: v_trunc_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s index 65af1c1829902..f3c8c8a69fbe5 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s @@ -829,17 +829,26 @@ v_sqrt_f32_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xb3,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -v_trunc_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_trunc_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_trunc_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +v_trunc_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xdd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +v_trunc_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xdd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xdd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xdd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +v_trunc_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: [0x05,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +v_trunc_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: [0x05,0x08,0xdd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +v_trunc_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: [0xff,0xc1,0xdd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] v_trunc_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_trunc_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git 
a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s index 1108887c26ed4..9020017c86106 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s @@ -3454,50 +3454,59 @@ v_sqrt_f64_e64 v[5:6], -|src_scc| mul:4 v_sqrt_f64_e64 v[254:255], 0xaf123456 clamp div:2 // GFX11: v_sqrt_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xb4,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] -v_trunc_f16_e64 v5, v1 -// GFX11: v_trunc_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00] +v_trunc_f16_e64 v5.l, v1.l +// GFX11: v_trunc_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00] -v_trunc_f16_e64 v5, v255 -// GFX11: v_trunc_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00] +v_trunc_f16_e64 v5.l, v255.l +// GFX11: v_trunc_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00] -v_trunc_f16_e64 v5, s1 -// GFX11: v_trunc_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, s1 +// GFX11: v_trunc_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00] -v_trunc_f16_e64 v5, s105 -// GFX11: v_trunc_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, s105 +// GFX11: v_trunc_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00] -v_trunc_f16_e64 v5, vcc_lo -// GFX11: v_trunc_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, vcc_lo +// GFX11: v_trunc_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00] -v_trunc_f16_e64 v5, vcc_hi -// GFX11: v_trunc_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, vcc_hi +// GFX11: v_trunc_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00] -v_trunc_f16_e64 v5, ttmp15 -// GFX11: v_trunc_f16_e64 v5, ttmp15 ; encoding: 
[0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, ttmp15 +// GFX11: v_trunc_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00] -v_trunc_f16_e64 v5, m0 -// GFX11: v_trunc_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, m0 +// GFX11: v_trunc_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00] -v_trunc_f16_e64 v5, exec_lo -// GFX11: v_trunc_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, exec_lo +// GFX11: v_trunc_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00] -v_trunc_f16_e64 v5, exec_hi -// GFX11: v_trunc_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, exec_hi +// GFX11: v_trunc_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00] -v_trunc_f16_e64 v5, null -// GFX11: v_trunc_f16_e64 v5, null ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, null +// GFX11: v_trunc_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00] -v_trunc_f16_e64 v5, -1 -// GFX11: v_trunc_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, -1 +// GFX11: v_trunc_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00] -v_trunc_f16_e64 v5, 0.5 mul:2 -// GFX11: v_trunc_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08] +v_trunc_f16_e64 v5.l, 0.5 mul:2 +// GFX11: v_trunc_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08] -v_trunc_f16_e64 v5, src_scc mul:4 -// GFX11: v_trunc_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10] +v_trunc_f16_e64 v5.l, src_scc mul:4 +// GFX11: v_trunc_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10] -v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 -// GFX11: v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: 
[0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +v_trunc_f16_e64 v255.l, -|0xfe0b| clamp div:2 +// GFX11: v_trunc_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_trunc_f16_e64 v5.h, v1.h +// GFX11: [0x05,0x48,0xdd,0xd5,0x01,0x01,0x00,0x00] + +v_trunc_f16_e64 v5.l, v255.h +// GFX11: [0x05,0x08,0xdd,0xd5,0xff,0x01,0x00,0x00] + +v_trunc_f16_e64 v255.h, -|0xfe0b| clamp div:2 +// GFX11: [0xff,0xc1,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] v_trunc_f32_e64 v5, v1 // GFX11: v_trunc_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa1,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s index 086356fbca25a..b125821d1306e 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s @@ -3603,50 +3603,62 @@ v_swaprel_b32 v5, v1 v_swaprel_b32 v255, v255 // GFX12: v_swaprel_b32 v255, v255 ; encoding: [0xff,0xd1,0xfe,0x7f] -v_trunc_f16 v5, v1 -// GFX12: v_trunc_f16_e32 v5, v1 ; encoding: [0x01,0xbb,0x0a,0x7e] +v_trunc_f16 v5.l, v1.l +// GFX12: v_trunc_f16_e32 v5.l, v1.l ; encoding: [0x01,0xbb,0x0a,0x7e] -v_trunc_f16 v5, v127 -// GFX12: v_trunc_f16_e32 v5, v127 ; encoding: [0x7f,0xbb,0x0a,0x7e] +v_trunc_f16 v5.l, v127.l +// GFX12: v_trunc_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xbb,0x0a,0x7e] -v_trunc_f16 v5, s1 -// GFX12: v_trunc_f16_e32 v5, s1 ; encoding: [0x01,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, s1 +// GFX12: v_trunc_f16_e32 v5.l, s1 ; encoding: [0x01,0xba,0x0a,0x7e] -v_trunc_f16 v5, s105 -// GFX12: v_trunc_f16_e32 v5, s105 ; encoding: [0x69,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, s105 +// GFX12: v_trunc_f16_e32 v5.l, s105 ; encoding: [0x69,0xba,0x0a,0x7e] -v_trunc_f16 v5, vcc_lo -// GFX12: v_trunc_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, vcc_lo +// GFX12: v_trunc_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xba,0x0a,0x7e] -v_trunc_f16 v5, vcc_hi -// GFX12: v_trunc_f16_e32 v5, vcc_hi ; 
encoding: [0x6b,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, vcc_hi +// GFX12: v_trunc_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xba,0x0a,0x7e] -v_trunc_f16 v5, ttmp15 -// GFX12: v_trunc_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, ttmp15 +// GFX12: v_trunc_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xba,0x0a,0x7e] -v_trunc_f16 v5, m0 -// GFX12: v_trunc_f16_e32 v5, m0 ; encoding: [0x7d,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, m0 +// GFX12: v_trunc_f16_e32 v5.l, m0 ; encoding: [0x7d,0xba,0x0a,0x7e] -v_trunc_f16 v5, exec_lo -// GFX12: v_trunc_f16_e32 v5, exec_lo ; encoding: [0x7e,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, exec_lo +// GFX12: v_trunc_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xba,0x0a,0x7e] -v_trunc_f16 v5, exec_hi -// GFX12: v_trunc_f16_e32 v5, exec_hi ; encoding: [0x7f,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, exec_hi +// GFX12: v_trunc_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xba,0x0a,0x7e] -v_trunc_f16 v5, null -// GFX12: v_trunc_f16_e32 v5, null ; encoding: [0x7c,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, null +// GFX12: v_trunc_f16_e32 v5.l, null ; encoding: [0x7c,0xba,0x0a,0x7e] -v_trunc_f16 v5, -1 -// GFX12: v_trunc_f16_e32 v5, -1 ; encoding: [0xc1,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, -1 +// GFX12: v_trunc_f16_e32 v5.l, -1 ; encoding: [0xc1,0xba,0x0a,0x7e] -v_trunc_f16 v5, 0.5 -// GFX12: v_trunc_f16_e32 v5, 0.5 ; encoding: [0xf0,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, 0.5 +// GFX12: v_trunc_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xba,0x0a,0x7e] -v_trunc_f16 v5, src_scc -// GFX12: v_trunc_f16_e32 v5, src_scc ; encoding: [0xfd,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, src_scc +// GFX12: v_trunc_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xba,0x0a,0x7e] -v_trunc_f16 v127, 0xfe0b -// GFX12: v_trunc_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +v_trunc_f16 v127.l, 0xfe0b +// GFX12: v_trunc_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +v_trunc_f16 v5.l, v1.h +// GFX12: v_trunc_f16_e32 v5.l, v1.h ; encoding: 
[0x81,0xbb,0x0a,0x7e] + +v_trunc_f16 v5.l, v127.h +// GFX12: v_trunc_f16_e32 v5.l, v127.h ; encoding: [0xff,0xbb,0x0a,0x7e] + +v_trunc_f16 v5.h, src_scc +// GFX12: v_trunc_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xba,0x0a,0x7f] + +v_trunc_f16 v127.h, 0xfe0b +// GFX12: v_trunc_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xba,0xfe,0x7f,0x0b,0xfe,0x00,0x00] v_trunc_f32 v5, v1 // GFX12: v_trunc_f32_e32 v5, v1 ; encoding: [0x01,0x43,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s index 26e7162206aed..a625326c1dae4 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s @@ -2806,47 +2806,53 @@ v_sqrt_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_sqrt_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_sqrt_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x66,0xfe,0x7f,0xff,0x6f,0x35,0x30] -v_trunc_f16 v5, v1 quad_perm:[3,2,1,0] -// GFX12: v_trunc_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff] +v_trunc_f16 v5.l, v1.l quad_perm:[3,2,1,0] +// GFX12: v_trunc_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff] -v_trunc_f16 v5, v1 quad_perm:[0,1,2,3] -// GFX12: v_trunc_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff] +v_trunc_f16 v5.l, v1.l quad_perm:[0,1,2,3] +// GFX12: v_trunc_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff] -v_trunc_f16 v5, v1 row_mirror -// GFX12: v_trunc_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff] +v_trunc_f16 v5.l, v1.l row_mirror +// GFX12: v_trunc_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff] -v_trunc_f16 v5, v1 row_half_mirror -// GFX12: v_trunc_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff] +v_trunc_f16 v5.l, v1.l row_half_mirror +// GFX12: v_trunc_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff] -v_trunc_f16 v5, v1 row_shl:1 -// GFX12: v_trunc_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff] +v_trunc_f16 v5.l, v1.l row_shl:1 +// GFX12: v_trunc_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff] -v_trunc_f16 v5, v1 row_shl:15 -// GFX12: v_trunc_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff] +v_trunc_f16 v5.l, v1.l row_shl:15 +// GFX12: v_trunc_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff] -v_trunc_f16 v5, v1 row_shr:1 -// GFX12: v_trunc_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff] +v_trunc_f16 v5.l, v1.l row_shr:1 +// GFX12: v_trunc_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff] -v_trunc_f16 v5, v1 row_shr:15 -// GFX12: v_trunc_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff] +v_trunc_f16 v5.l, v1.l row_shr:15 +// GFX12: v_trunc_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff] -v_trunc_f16 v5, v1 row_ror:1 -// GFX12: v_trunc_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff] +v_trunc_f16 v5.l, v1.l row_ror:1 +// GFX12: v_trunc_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff] -v_trunc_f16 v5, 
v1 row_ror:15 -// GFX12: v_trunc_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff] +v_trunc_f16 v5.l, v1.l row_ror:15 +// GFX12: v_trunc_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff] -v_trunc_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_trunc_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff] +v_trunc_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_trunc_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff] -v_trunc_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_trunc_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01] +v_trunc_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_trunc_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01] -v_trunc_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_trunc_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x09,0x13] +v_trunc_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_trunc_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x09,0x13] -v_trunc_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_trunc_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +v_trunc_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_trunc_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x35,0x30] + +v_trunc_f16 v5.h, v1.h 
row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_trunc_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xba,0x0a,0x7f,0x81,0x60,0x09,0x13] + +v_trunc_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_trunc_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7f,0xff,0x6f,0x35,0x30] v_trunc_f32 v5, v1 quad_perm:[3,2,1,0] // GFX12: v_trunc_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s index a54ae771fab40..9281d6fb16ce8 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s @@ -661,14 +661,20 @@ v_sqrt_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_sqrt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_sqrt_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x66,0xfe,0x7f,0xff,0x00,0x00,0x00] -v_trunc_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_trunc_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_trunc_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_trunc_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_trunc_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_trunc_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_trunc_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_trunc_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_trunc_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_trunc_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00] +v_trunc_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_trunc_f16_dpp v127.l, v127.l 
dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +v_trunc_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_trunc_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xba,0x0a,0x7f,0x81,0x77,0x39,0x05] + +v_trunc_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_trunc_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xba,0xfe,0x7f,0xff,0x00,0x00,0x00] v_trunc_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_trunc_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x42,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s index 01aa7a44bbc23..33a5dded095c7 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s @@ -1,3 +1,4 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 ; NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12 --implicit-check-not=error %s @@ -703,6 +704,12 @@ v_swap_b16_e32 v128.l, v0.l v_trunc_f16_e32 v128, 0xfe0b // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_trunc_f16_e32 v128.h, 0xfe0b +// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_trunc_f16_e32 v128.l, 0xfe0b +// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction + v_trunc_f16_e32 v255, v1 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -712,6 +719,24 @@ v_trunc_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0] v_trunc_f16_e32 v255, v1 quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction +v_trunc_f16_e32 v255.h, v1.h +// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_trunc_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// 
GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_trunc_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_trunc_f16_e32 v255.l, v1.l +// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_trunc_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_trunc_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction + v_trunc_f16_e32 v5, v199 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -720,3 +745,21 @@ v_trunc_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_trunc_f16_e32 v5, v199 quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_trunc_f16_e32 v5.h, v199.h +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_trunc_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_trunc_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_trunc_f16_e32 v5.l, v199.l +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_trunc_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_trunc_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s index 4c983af094561..03519d43c49a9 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 // RUN: llvm-mc -triple=amdgcn 
-mcpu=gfx1200 -mattr=+real-true16 -show-encoding %s | FileCheck --check-prefix=GFX12 --implicit-check-not=_e32 %s v_ceil_f16 v128, 0xfe0b @@ -1912,68 +1912,134 @@ v_sqrt_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_sqrt_f16 v5, v199 quad_perm:[3,2,1,0] // GFX12: v_sqrt_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] -v_trunc_f16 v128, 0xfe0b -// GFX12: v_trunc_f16_e64 v128, 0xfe0b ; encoding: [0x80,0x00,0xdd,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_trunc_f16 v128.h, 0xfe0b +// GFX12: v_trunc_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xdd,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_trunc_f16 v255, -1 -// GFX12: v_trunc_f16_e64 v255, -1 ; encoding: [0xff,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00] +v_trunc_f16 v128.l, 0xfe0b +// GFX12: v_trunc_f16_e64 v128.l, 0xfe0b ; encoding: [0x80,0x00,0xdd,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_trunc_f16 v255, 0.5 -// GFX12: v_trunc_f16_e64 v255, 0.5 ; encoding: [0xff,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x00] +v_trunc_f16 v255.h, -1 +// GFX12: v_trunc_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0xc1,0x00,0x00,0x00] -v_trunc_f16 v255, exec_hi -// GFX12: v_trunc_f16_e64 v255, exec_hi ; encoding: [0xff,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00] +v_trunc_f16 v255.h, 0.5 +// GFX12: v_trunc_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0xf0,0x00,0x00,0x00] -v_trunc_f16 v255, exec_lo -// GFX12: v_trunc_f16_e64 v255, exec_lo ; encoding: [0xff,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00] +v_trunc_f16 v255.h, exec_hi +// GFX12: v_trunc_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7f,0x00,0x00,0x00] -v_trunc_f16 v255, m0 -// GFX12: v_trunc_f16_e64 v255, m0 ; encoding: [0xff,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00] +v_trunc_f16 v255.h, exec_lo +// GFX12: v_trunc_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7e,0x00,0x00,0x00] -v_trunc_f16 v255, null -// 
GFX12: v_trunc_f16_e64 v255, null ; encoding: [0xff,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00] +v_trunc_f16 v255.h, m0 +// GFX12: v_trunc_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7d,0x00,0x00,0x00] -v_trunc_f16 v255, s1 -// GFX12: v_trunc_f16_e64 v255, s1 ; encoding: [0xff,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00] +v_trunc_f16 v255.h, null +// GFX12: v_trunc_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7c,0x00,0x00,0x00] -v_trunc_f16 v255, s105 -// GFX12: v_trunc_f16_e64 v255, s105 ; encoding: [0xff,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00] +v_trunc_f16 v255.h, s1 +// GFX12: v_trunc_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x01,0x00,0x00,0x00] -v_trunc_f16 v255, src_scc -// GFX12: v_trunc_f16_e64 v255, src_scc ; encoding: [0xff,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x00] +v_trunc_f16 v255.h, s105 +// GFX12: v_trunc_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x69,0x00,0x00,0x00] -v_trunc_f16 v255, ttmp15 -// GFX12: v_trunc_f16_e64 v255, ttmp15 ; encoding: [0xff,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00] +v_trunc_f16 v255.h, src_scc +// GFX12: v_trunc_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0xfd,0x00,0x00,0x00] -v_trunc_f16 v255, v1 -// GFX12: v_trunc_f16_e64 v255, v1 ; encoding: [0xff,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00] +v_trunc_f16 v255.h, ttmp15 +// GFX12: v_trunc_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7b,0x00,0x00,0x00] -v_trunc_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_trunc_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_trunc_f16 v255.h, v1.h +// GFX12: v_trunc_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xdd,0xd5,0x01,0x01,0x00,0x00] -v_trunc_f16 v255, v1 quad_perm:[3,2,1,0] -// GFX12: v_trunc_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 
+v_trunc_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_trunc_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_trunc_f16 v255, v127 -// GFX12: v_trunc_f16_e64 v255, v127 ; encoding: [0xff,0x00,0xdd,0xd5,0x7f,0x01,0x00,0x00] +v_trunc_f16 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX12: v_trunc_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_trunc_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_trunc_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] +v_trunc_f16 v255.h, v127.h +// GFX12: v_trunc_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xdd,0xd5,0x7f,0x01,0x00,0x00] -v_trunc_f16 v255, v127 quad_perm:[3,2,1,0] -// GFX12: v_trunc_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] +v_trunc_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_trunc_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] -v_trunc_f16 v255, vcc_hi -// GFX12: v_trunc_f16_e64 v255, vcc_hi ; encoding: [0xff,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00] +v_trunc_f16 v255.h, v127.h quad_perm:[3,2,1,0] +// GFX12: v_trunc_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] -v_trunc_f16 v255, vcc_lo -// GFX12: v_trunc_f16_e64 v255, vcc_lo ; encoding: [0xff,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00] +v_trunc_f16 v255.h, vcc_hi +// GFX12: v_trunc_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x6b,0x00,0x00,0x00] -v_trunc_f16 v5, v199 -// GFX12: v_trunc_f16_e64 v5, v199 ; encoding: [0x05,0x00,0xdd,0xd5,0xc7,0x01,0x00,0x00] 
+v_trunc_f16 v255.h, vcc_lo +// GFX12: v_trunc_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x6a,0x00,0x00,0x00] -v_trunc_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_trunc_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] +v_trunc_f16 v255.l, -1 +// GFX12: v_trunc_f16_e64 v255.l, -1 ; encoding: [0xff,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00] -v_trunc_f16 v5, v199 quad_perm:[3,2,1,0] -// GFX12: v_trunc_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] +v_trunc_f16 v255.l, 0.5 +// GFX12: v_trunc_f16_e64 v255.l, 0.5 ; encoding: [0xff,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x00] + +v_trunc_f16 v255.l, exec_hi +// GFX12: v_trunc_f16_e64 v255.l, exec_hi ; encoding: [0xff,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00] + +v_trunc_f16 v255.l, exec_lo +// GFX12: v_trunc_f16_e64 v255.l, exec_lo ; encoding: [0xff,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00] + +v_trunc_f16 v255.l, m0 +// GFX12: v_trunc_f16_e64 v255.l, m0 ; encoding: [0xff,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00] + +v_trunc_f16 v255.l, null +// GFX12: v_trunc_f16_e64 v255.l, null ; encoding: [0xff,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00] + +v_trunc_f16 v255.l, s1 +// GFX12: v_trunc_f16_e64 v255.l, s1 ; encoding: [0xff,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00] + +v_trunc_f16 v255.l, s105 +// GFX12: v_trunc_f16_e64 v255.l, s105 ; encoding: [0xff,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00] + +v_trunc_f16 v255.l, src_scc +// GFX12: v_trunc_f16_e64 v255.l, src_scc ; encoding: [0xff,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x00] + +v_trunc_f16 v255.l, ttmp15 +// GFX12: v_trunc_f16_e64 v255.l, ttmp15 ; encoding: [0xff,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00] + +v_trunc_f16 v255.l, v1.l +// GFX12: v_trunc_f16_e64 v255.l, v1.l ; encoding: [0xff,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00] + +v_trunc_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_trunc_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0xff,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_trunc_f16 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX12: v_trunc_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_trunc_f16 v255.l, v127.l +// GFX12: v_trunc_f16_e64 v255.l, v127.l ; encoding: [0xff,0x00,0xdd,0xd5,0x7f,0x01,0x00,0x00] + +v_trunc_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_trunc_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] + +v_trunc_f16 v255.l, v127.l quad_perm:[3,2,1,0] +// GFX12: v_trunc_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] + +v_trunc_f16 v255.l, vcc_hi +// GFX12: v_trunc_f16_e64 v255.l, vcc_hi ; encoding: [0xff,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00] + +v_trunc_f16 v255.l, vcc_lo +// GFX12: v_trunc_f16_e64 v255.l, vcc_lo ; encoding: [0xff,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00] + +v_trunc_f16 v5.h, v199.h +// GFX12: v_trunc_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdd,0xd5,0xc7,0x01,0x00,0x00] + +v_trunc_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_trunc_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_trunc_f16 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX12: v_trunc_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] + +v_trunc_f16 v5.l, v199.l +// GFX12: v_trunc_f16_e64 v5.l, v199.l ; encoding: [0x05,0x00,0xdd,0xd5,0xc7,0x01,0x00,0x00] + +v_trunc_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_trunc_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_trunc_f16 v5.l, v199.l quad_perm:[3,2,1,0] +// 
GFX12: v_trunc_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s index ea4a58d9d0f7e..e2fe08ddc8b06 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s @@ -3604,50 +3604,59 @@ v_sqrt_f64_e64 v[5:6], -|src_scc| mul:4 v_sqrt_f64_e64 v[254:255], 0xaf123456 clamp div:2 // GFX12: v_sqrt_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xb4,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] -v_trunc_f16_e64 v5, v1 -// GFX12: v_trunc_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00] +v_trunc_f16_e64 v5.l, v1.l +// GFX12: v_trunc_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00] -v_trunc_f16_e64 v5, v255 -// GFX12: v_trunc_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00] +v_trunc_f16_e64 v5.l, v255.l +// GFX12: v_trunc_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00] -v_trunc_f16_e64 v5, s1 -// GFX12: v_trunc_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, s1 +// GFX12: v_trunc_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00] -v_trunc_f16_e64 v5, s105 -// GFX12: v_trunc_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, s105 +// GFX12: v_trunc_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00] -v_trunc_f16_e64 v5, vcc_lo -// GFX12: v_trunc_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, vcc_lo +// GFX12: v_trunc_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00] -v_trunc_f16_e64 v5, vcc_hi -// GFX12: v_trunc_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, vcc_hi +// 
GFX12: v_trunc_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00] -v_trunc_f16_e64 v5, ttmp15 -// GFX12: v_trunc_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, ttmp15 +// GFX12: v_trunc_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00] -v_trunc_f16_e64 v5, m0 -// GFX12: v_trunc_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, m0 +// GFX12: v_trunc_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00] -v_trunc_f16_e64 v5, exec_lo -// GFX12: v_trunc_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, exec_lo +// GFX12: v_trunc_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00] -v_trunc_f16_e64 v5, exec_hi -// GFX12: v_trunc_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, exec_hi +// GFX12: v_trunc_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00] -v_trunc_f16_e64 v5, null -// GFX12: v_trunc_f16_e64 v5, null ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, null +// GFX12: v_trunc_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00] -v_trunc_f16_e64 v5, -1 -// GFX12: v_trunc_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, -1 +// GFX12: v_trunc_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00] -v_trunc_f16_e64 v5, 0.5 mul:2 -// GFX12: v_trunc_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08] +v_trunc_f16_e64 v5.l, 0.5 mul:2 +// GFX12: v_trunc_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08] -v_trunc_f16_e64 v5, src_scc mul:4 -// GFX12: v_trunc_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10] +v_trunc_f16_e64 v5.l, src_scc mul:4 +// GFX12: v_trunc_f16_e64 v5.l, src_scc mul:4 ; 
encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10] -v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 -// GFX12: v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +v_trunc_f16_e64 v255.l, -|0xfe0b| clamp div:2 +// GFX12: v_trunc_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_trunc_f16_e64 v5.h, v1.h +// GFX12: v_trunc_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdd,0xd5,0x01,0x01,0x00,0x00] + +v_trunc_f16_e64 v5.l, v255.h +// GFX12: v_trunc_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xdd,0xd5,0xff,0x01,0x00,0x00] + +v_trunc_f16_e64 v255.h, -|0xfe0b| clamp div:2 +// GFX12: v_trunc_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] v_trunc_f32_e64 v5, v1 // GFX12: v_trunc_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa1,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s index a9b933e639abb..3fff2749e6e99 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s @@ -2719,47 +2719,56 @@ v_sqrt_f32_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xb3,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -v_trunc_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX12: v_trunc_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] +// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; 
encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_trunc_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] -// GFX12: v_trunc_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] +// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] -v_trunc_f16_e64_dpp v5, v1 row_mirror -// GFX12: v_trunc_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l row_mirror +// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] -v_trunc_f16_e64_dpp v5, v1 row_half_mirror -// GFX12: v_trunc_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l row_half_mirror +// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] -v_trunc_f16_e64_dpp v5, v1 row_shl:1 -// GFX12: v_trunc_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l row_shl:1 +// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] -v_trunc_f16_e64_dpp v5, v1 row_shl:15 -// GFX12: v_trunc_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l row_shl:15 +// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] -v_trunc_f16_e64_dpp v5, v1 row_shr:1 -// GFX12: v_trunc_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l row_shr:1 +// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] -v_trunc_f16_e64_dpp v5, v1 row_shr:15 -// GFX12: v_trunc_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l row_shr:15 +// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] -v_trunc_f16_e64_dpp v5, v1 row_ror:1 -// GFX12: v_trunc_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l row_ror:1 +// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] -v_trunc_f16_e64_dpp v5, v1 row_ror:15 -// GFX12: v_trunc_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l row_ror:15 +// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] -v_trunc_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_trunc_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] -v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +v_trunc_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +v_trunc_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +v_trunc_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_trunc_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +v_trunc_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: 
v_trunc_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x08,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +v_trunc_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_trunc_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc1,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] v_trunc_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX12: v_trunc_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s index af335f2e0b586..e4ae0ad655518 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s @@ -781,17 +781,26 @@ v_sqrt_f32_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xb3,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -v_trunc_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_trunc_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_trunc_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +v_trunc_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xdd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +v_trunc_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xdd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xdd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xdd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +v_trunc_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_trunc_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +v_trunc_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_trunc_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x08,0xdd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +v_trunc_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_trunc_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc1,0xdd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] v_trunc_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_trunc_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt index f02f0206acd2f..8cf2c2b4f2d1e 
100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt @@ -3545,49 +3545,82 @@ # GFX11: v_swaprel_b32 v255, v255 ; encoding: [0xff,0xd1,0xfe,0x7f] 0x01,0xbb,0x0a,0x7e -# GFX11: v_trunc_f16_e32 v5, v1 ; encoding: [0x01,0xbb,0x0a,0x7e] +# GFX11-REAL16: v_trunc_f16_e32 v5.l, v1.l ; encoding: [0x01,0xbb,0x0a,0x7e] +# GFX11-FAKE16: v_trunc_f16_e32 v5, v1 ; encoding: [0x01,0xbb,0x0a,0x7e] 0x7f,0xbb,0x0a,0x7e -# GFX11: v_trunc_f16_e32 v5, v127 ; encoding: [0x7f,0xbb,0x0a,0x7e] +# GFX11-REAL16: v_trunc_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xbb,0x0a,0x7e] +# GFX11-FAKE16: v_trunc_f16_e32 v5, v127 ; encoding: [0x7f,0xbb,0x0a,0x7e] 0x01,0xba,0x0a,0x7e -# GFX11: v_trunc_f16_e32 v5, s1 ; encoding: [0x01,0xba,0x0a,0x7e] +# GFX11-REAL16: v_trunc_f16_e32 v5.l, s1 ; encoding: [0x01,0xba,0x0a,0x7e] +# GFX11-FAKE16: v_trunc_f16_e32 v5, s1 ; encoding: [0x01,0xba,0x0a,0x7e] 0x69,0xba,0x0a,0x7e -# GFX11: v_trunc_f16_e32 v5, s105 ; encoding: [0x69,0xba,0x0a,0x7e] +# GFX11-REAL16: v_trunc_f16_e32 v5.l, s105 ; encoding: [0x69,0xba,0x0a,0x7e] +# GFX11-FAKE16: v_trunc_f16_e32 v5, s105 ; encoding: [0x69,0xba,0x0a,0x7e] 0x6a,0xba,0x0a,0x7e -# GFX11: v_trunc_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xba,0x0a,0x7e] +# GFX11-REAL16: v_trunc_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xba,0x0a,0x7e] +# GFX11-FAKE16: v_trunc_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xba,0x0a,0x7e] 0x6b,0xba,0x0a,0x7e -# GFX11: v_trunc_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xba,0x0a,0x7e] +# GFX11-REAL16: v_trunc_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xba,0x0a,0x7e] +# GFX11-FAKE16: v_trunc_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xba,0x0a,0x7e] 0x7b,0xba,0x0a,0x7e -# GFX11: v_trunc_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xba,0x0a,0x7e] +# GFX11-REAL16: v_trunc_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xba,0x0a,0x7e] +# GFX11-FAKE16: v_trunc_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xba,0x0a,0x7e] 0x7d,0xba,0x0a,0x7e -# GFX11: v_trunc_f16_e32 v5, m0 ; encoding: 
[0x7d,0xba,0x0a,0x7e] +# GFX11-REAL16: v_trunc_f16_e32 v5.l, m0 ; encoding: [0x7d,0xba,0x0a,0x7e] +# GFX11-FAKE16: v_trunc_f16_e32 v5, m0 ; encoding: [0x7d,0xba,0x0a,0x7e] 0x7e,0xba,0x0a,0x7e -# GFX11: v_trunc_f16_e32 v5, exec_lo ; encoding: [0x7e,0xba,0x0a,0x7e] +# GFX11-REAL16: v_trunc_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xba,0x0a,0x7e] +# GFX11-FAKE16: v_trunc_f16_e32 v5, exec_lo ; encoding: [0x7e,0xba,0x0a,0x7e] 0x7f,0xba,0x0a,0x7e -# GFX11: v_trunc_f16_e32 v5, exec_hi ; encoding: [0x7f,0xba,0x0a,0x7e] +# GFX11-REAL16: v_trunc_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xba,0x0a,0x7e] +# GFX11-FAKE16: v_trunc_f16_e32 v5, exec_hi ; encoding: [0x7f,0xba,0x0a,0x7e] 0x7c,0xba,0x0a,0x7e -# GFX11: v_trunc_f16_e32 v5, null ; encoding: [0x7c,0xba,0x0a,0x7e] +# GFX11-REAL16: v_trunc_f16_e32 v5.l, null ; encoding: [0x7c,0xba,0x0a,0x7e] +# GFX11-FAKE16: v_trunc_f16_e32 v5, null ; encoding: [0x7c,0xba,0x0a,0x7e] 0xc1,0xba,0x0a,0x7e -# GFX11: v_trunc_f16_e32 v5, -1 ; encoding: [0xc1,0xba,0x0a,0x7e] +# GFX11-REAL16: v_trunc_f16_e32 v5.l, -1 ; encoding: [0xc1,0xba,0x0a,0x7e] +# GFX11-FAKE16: v_trunc_f16_e32 v5, -1 ; encoding: [0xc1,0xba,0x0a,0x7e] 0xf0,0xba,0x0a,0x7e -# GFX11: v_trunc_f16_e32 v5, 0.5 ; encoding: [0xf0,0xba,0x0a,0x7e] +# GFX11-REAL16: v_trunc_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xba,0x0a,0x7e] +# GFX11-FAKE16: v_trunc_f16_e32 v5, 0.5 ; encoding: [0xf0,0xba,0x0a,0x7e] 0xfd,0xba,0x0a,0x7e -# GFX11: v_trunc_f16_e32 v5, src_scc ; encoding: [0xfd,0xba,0x0a,0x7e] +# GFX11-REAL16: v_trunc_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xba,0x0a,0x7e] +# GFX11-FAKE16: v_trunc_f16_e32 v5, src_scc ; encoding: [0xfd,0xba,0x0a,0x7e] 0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00 -# GFX11: v_trunc_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_trunc_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + 
+0x81,0xbb,0x0a,0x7e +# GFX11-REAL16: v_trunc_f16_e32 v5.l, v1.h ; encoding: [0x81,0xbb,0x0a,0x7e] +# GFX11-FAKE16: v_trunc_f16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0x81,0xbb,0x0a,0x7e] + +0xff,0xbb,0x0a,0x7e +# GFX11-REAL16: v_trunc_f16_e32 v5.l, v127.h ; encoding: [0xff,0xbb,0x0a,0x7e] +# GFX11-FAKE16: v_trunc_f16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0xff,0xbb,0x0a,0x7e] + +0xf0,0xba,0xfe,0x7e +# GFX11-REAL16: v_trunc_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xba,0xfe,0x7e] +# GFX11-FAKE16: v_trunc_f16_e32 v127, 0.5 ; encoding: [0xf0,0xba,0xfe,0x7e] + +0xfd,0xba,0x0a,0x7f +# GFX11-REAL16: v_trunc_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xba,0x0a,0x7f] + +0xff,0xba,0xfe,0x7f,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_trunc_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xba,0xfe,0x7f,0x0b,0xfe,0x00,0x00] 0x01,0x43,0x0a,0x7e # GFX11: v_trunc_f32_e32 v5, v1 ; encoding: [0x01,0x43,0x0a,0x7e] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt index a4491e02abf05..b9a499549d12c 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt @@ -2775,46 +2775,72 @@ # GFX11: v_sqrt_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x66,0xfe,0x7f,0xff,0x6f,0x3d,0x30] 0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX11: v_trunc_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX11: v_trunc_f16_dpp 
v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX11: v_trunc_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX11: v_trunc_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX11: v_trunc_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX11: v_trunc_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX11: v_trunc_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX11: v_trunc_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX11: v_trunc_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX11: v_trunc_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX11: v_trunc_f16_dpp v5, v1 row_share:0 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX11: v_trunc_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xba,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX11: v_trunc_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x3d,0x30 -# GFX11: v_trunc_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX11-REAL16: v_trunc_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX11-FAKE16: v_trunc_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] + +0xfa,0xba,0xfe,0x7e,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_trunc_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_trunc_f16_dpp v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; 
encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x5f,0x01,0x01] + +0xfa,0xba,0x0a,0x7f,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_trunc_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xba,0x0a,0x7f,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_mul_i32_i24_e32 v128, 1, v176 ; encoding: [0x81,0x60,0x01,0x13] + +0xfa,0xba,0xfe,0x7f,0xff,0x6f,0x3d,0x30 +# GFX11-REAL16: v_trunc_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7f,0xff,0x6f,0x3d,0x30] +# GFX11-FAKE16: v_lshlrev_b32_e32 v30, v255, v183 ; encoding: [0xff,0x6f,0x3d,0x30] 0xfa,0x42,0x0a,0x7e,0x01,0x1b,0x00,0xff # GFX11: v_trunc_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt index 4e15731203168..80c739a98f65f 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt @@ -522,10 +522,23 @@ # GFX11: v_sqrt_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x66,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX11: v_trunc_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xea,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00 -# GFX11: v_trunc_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_trunc_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0xea,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +0xe9,0xba,0xfe,0x7e,0x7f,0x77,0x39,0x05 +# GFX11-REAL16: v_trunc_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0xfe,0x7e,0x7f,0x77,0x39,0x05] +# GFX11-FAKE16: v_trunc_f16_dpp v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0xfe,0x7e,0x7f,0x77,0x39,0x05] + +0xe9,0xba,0x0a,0x7f,0x81,0x77,0x39,0x05 +# GFX11-REAL16: v_trunc_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7f,0x81,0x77,0x39,0x05] +# GFX11-FAKE16: v_dot2acc_f32_f16 v156, v129, v187 ; encoding: [0x81,0x77,0x39,0x05] + +0xea,0xba,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_trunc_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xba,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0x42,0x0a,0x7e,0x01,0x77,0x39,0x05 # GFX11: v_trunc_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x42,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt index f97c678e6a90a..fd84ed734fb31 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt @@ -2941,46 +2941,72 @@ # GFX11: v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xb3,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_trunc_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_trunc_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX11: v_trunc_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX11: v_trunc_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX11: v_trunc_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_trunc_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX11: v_trunc_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_trunc_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX11: v_trunc_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# 
GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_trunc_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX11: v_trunc_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX11: v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 -# GFX11: 
v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] 0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 -# GFX11: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] + +0x05,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_trunc_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +0x05,0x08,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] + +0xff,0xc1,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 +# 
GFX11-REAL16: v_trunc_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xa1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff # GFX11: v_trunc_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt index 3cad28d888202..0edbff63d60ed 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt @@ -803,16 +803,32 @@ # GFX11: v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xb3,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_trunc_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX11: v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: 
v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX11: v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] 0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX11: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +0x05,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_trunc_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +0x05,0x08,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +0xff,0xc1,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 +# 
GFX11-REAL16: v_trunc_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0x05,0x00,0xa1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX11: v_trunc_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt index 8b2bc97c5de1f..0406d78078305 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt @@ -3499,49 +3499,76 @@ # GFX11: v_sqrt_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xb4,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] 0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00 -# GFX11: v_trunc_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00] +# GFX11-REAL16: v_trunc_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00 -# GFX11: v_trunc_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00] +# GFX11-REAL16: v_trunc_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00 -# GFX11: v_trunc_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00] +# GFX11-REAL16: v_trunc_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00] 
0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00 -# GFX11: v_trunc_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00] +# GFX11-REAL16: v_trunc_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00 -# GFX11: v_trunc_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00] +# GFX11-REAL16: v_trunc_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00 -# GFX11: v_trunc_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00] +# GFX11-REAL16: v_trunc_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00 -# GFX11: v_trunc_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00] +# GFX11-REAL16: v_trunc_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00 -# GFX11: v_trunc_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00] +# GFX11-REAL16: v_trunc_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00 -# GFX11: v_trunc_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00] +# GFX11-REAL16: v_trunc_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00 
-# GFX11: v_trunc_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_trunc_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00 -# GFX11: v_trunc_f16_e64 v5, null ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00] +# GFX11-REAL16: v_trunc_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_e64 v5, null ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00 -# GFX11: v_trunc_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00] +# GFX11-REAL16: v_trunc_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08 -# GFX11: v_trunc_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08] +# GFX11-REAL16: v_trunc_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08] +# GFX11-FAKE16: v_trunc_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08] 0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10 -# GFX11: v_trunc_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10] +# GFX11-REAL16: v_trunc_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10] +# GFX11-FAKE16: v_trunc_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10] 0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 -# GFX11: v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_trunc_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_e64 v255, -|0xfe0b| clamp 
div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +0x05,0x48,0xdd,0xd5,0x01,0x01,0x00,0x00 +# GFX11-REAL16: v_trunc_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdd,0xd5,0x01,0x01,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x08,0xdd,0xd5,0xff,0x01,0x00,0x00 +# GFX11-REAL16: v_trunc_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xdd,0xd5,0xff,0x01,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00] + +0xff,0xc1,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_trunc_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] 0x05,0x00,0xa1,0xd5,0x01,0x01,0x00,0x00 # GFX11: v_trunc_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa1,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt index aa60378da9ab0..22ae18815a522 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt @@ -2915,46 +2915,68 @@ # GFX12: v_sqrt_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x66,0xfe,0x7f,0xff,0x6f,0x3d,0x30] 0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX12: v_trunc_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff] 
0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX12: v_trunc_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX12: v_trunc_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX12: v_trunc_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX12: v_trunc_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX12: v_trunc_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_shl:15 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX12: v_trunc_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX12: v_trunc_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX12: v_trunc_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX12: v_trunc_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX12: 
v_trunc_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX12: v_trunc_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xba,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX12: v_trunc_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x3d,0x30 -# GFX12: v_trunc_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-REAL16: v_trunc_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-FAKE16: v_trunc_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] + +0xfa,0xba,0x0a,0x7f,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_trunc_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xba,0x0a,0x7f,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_mul_i32_i24_e32 v128, 1, 
v176 ; encoding: [0x81,0x60,0x01,0x13] + +0xfa,0xba,0xfe,0x7f,0xff,0x6f,0x3d,0x30 +# GFX12-REAL16: v_trunc_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7f,0xff,0x6f,0x3d,0x30] +# GFX12-FAKE16: v_lshlrev_b32_e32 v30, v255, v183 ; encoding: [0xff,0x6f,0x3d,0x30] 0xfa,0x42,0x0a,0x7e,0x01,0x1b,0x00,0xff # GFX12: v_trunc_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt index 99985e09d7432..bfb84c6cdff39 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt @@ -531,10 +531,19 @@ # GFX12: v_sqrt_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x66,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX12: v_trunc_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xea,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00 -# GFX12: v_trunc_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_trunc_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_trunc_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +0xe9,0xba,0x0a,0x7f,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_trunc_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7f,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_add_f64_e32 v[156:157], v[129:130], v[187:188] ; encoding: 
[0x81,0x77,0x39,0x05] + +0xea,0xba,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_trunc_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xba,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0x42,0x0a,0x7e,0x01,0x77,0x39,0x05 # GFX12: v_trunc_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x42,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt index 8ba4f58b787f5..e27469230a15f 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt @@ -3545,49 +3545,76 @@ # GFX12: v_sqrt_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xb4,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] 0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00 -# GFX12: v_trunc_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00] +# GFX12-REAL16: v_trunc_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_trunc_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00 -# GFX12: v_trunc_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00] +# GFX12-REAL16: v_trunc_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_trunc_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00 -# GFX12: v_trunc_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00] +# GFX12-REAL16: v_trunc_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00] +# GFX12-FAKE16: v_trunc_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00 -# GFX12: v_trunc_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00] +# GFX12-REAL16: v_trunc_f16_e64 v5.l, s105 ; encoding: 
[0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00] +# GFX12-FAKE16: v_trunc_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00 -# GFX12: v_trunc_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-REAL16: v_trunc_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-FAKE16: v_trunc_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00 -# GFX12: v_trunc_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-REAL16: v_trunc_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-FAKE16: v_trunc_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00 -# GFX12: v_trunc_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-REAL16: v_trunc_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-FAKE16: v_trunc_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00 -# GFX12: v_trunc_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-REAL16: v_trunc_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-FAKE16: v_trunc_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00 -# GFX12: v_trunc_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-REAL16: v_trunc_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-FAKE16: v_trunc_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00 -# GFX12: v_trunc_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_trunc_f16_e64 v5.l, exec_hi ; encoding: 
[0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_trunc_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00 -# GFX12: v_trunc_f16_e64 v5, null ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-REAL16: v_trunc_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-FAKE16: v_trunc_f16_e64 v5, null ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00 -# GFX12: v_trunc_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-REAL16: v_trunc_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-FAKE16: v_trunc_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08 -# GFX12: v_trunc_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-REAL16: v_trunc_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-FAKE16: v_trunc_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08] 0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10 -# GFX12: v_trunc_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-REAL16: v_trunc_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-FAKE16: v_trunc_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10] 0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 -# GFX12: v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_trunc_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +0x05,0x48,0xdd,0xd5,0x01,0x01,0x00,0x00 +# GFX12-REAL16: 
v_trunc_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdd,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_trunc_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x08,0xdd,0xd5,0xff,0x01,0x00,0x00 +# GFX12-REAL16: v_trunc_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xdd,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_trunc_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00] + +0xff,0xc1,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_trunc_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] 0x05,0x00,0xa1,0xd5,0x01,0x01,0x00,0x00 # GFX12: v_trunc_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa1,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt index 98da7c8c54508..bc957576b19b6 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt @@ -2773,46 +2773,72 @@ # GFX12: v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xb3,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_trunc_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_trunc_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX12: v_trunc_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX12: v_trunc_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX12: v_trunc_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_trunc_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX12: v_trunc_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_trunc_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX12: v_trunc_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# 
GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_trunc_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX12: v_trunc_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX12: v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 -# GFX12: 
v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] 0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 -# GFX12: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] + +0x05,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_trunc_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +0x05,0x08,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] + +0xff,0xc1,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 +# 
GFX12-REAL16: v_trunc_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xa1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff # GFX12: v_trunc_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt index 8213237ada1e2..989824315b2d2 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt @@ -755,16 +755,32 @@ # GFX12: v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xb3,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_trunc_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX12: v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: 
v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX12: v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] 0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX12: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +0x05,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_trunc_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +0x05,0x08,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +0xff,0xc1,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 +# 
GFX12-REAL16: v_trunc_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0x05,0x00,0xa1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX12: v_trunc_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] From c50370c67afddf557ba30d58143b30ffb7203935 Mon Sep 17 00:00:00 2001 From: Han-Kuan Chen Date: Sat, 4 Jan 2025 00:44:57 +0800 Subject: [PATCH 369/567] [SLP] NFC. Use InstructionsState::valid if users just want to know whether VL has same opcode. (#120217) Add assert for InstructionsState::getOpcode. Use InstructionsState::getOpcode only when necessary. --- .../Transforms/Vectorize/SLPVectorizer.cpp | 129 +++++++++--------- 1 file changed, 67 insertions(+), 62 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index f52ddfda5e64c..c4582df89213d 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -816,27 +816,34 @@ class InstructionsState { Instruction *AltOp = nullptr; public: - Instruction *getMainOp() const { return MainOp; } + Instruction *getMainOp() const { + assert(valid() && "InstructionsState is invalid."); + return MainOp; + } - Instruction *getAltOp() const { return AltOp; } + Instruction *getAltOp() const { + assert(valid() && "InstructionsState is invalid."); + return AltOp; + } /// The main/alternate opcodes for the list of instructions. - unsigned getOpcode() const { - return MainOp ? MainOp->getOpcode() : 0; - } + unsigned getOpcode() const { return getMainOp()->getOpcode(); } - unsigned getAltOpcode() const { - return AltOp ? 
AltOp->getOpcode() : 0; - } + unsigned getAltOpcode() const { return getAltOp()->getOpcode(); } /// Some of the instructions in the list have alternate opcodes. - bool isAltShuffle() const { return AltOp != MainOp; } + bool isAltShuffle() const { return getMainOp() != getAltOp(); } bool isOpcodeOrAlt(Instruction *I) const { unsigned CheckedOpcode = I->getOpcode(); return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode; } + /// Checks if the current state is valid, i.e. has non-null MainOp + bool valid() const { return MainOp && AltOp; } + + explicit operator bool() const { return valid(); } + InstructionsState() = delete; InstructionsState(Instruction *MainOp, Instruction *AltOp) : MainOp(MainOp), AltOp(AltOp) {} @@ -869,8 +876,8 @@ static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, (!isa(BaseOp0) && !isa(Op0) && !isa(BaseOp1) && !isa(Op1)) || BaseOp0 == Op0 || BaseOp1 == Op1 || - getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() || - getSameOpcode({BaseOp1, Op1}, TLI).getOpcode(); + getSameOpcode({BaseOp0, Op0}, TLI) || + getSameOpcode({BaseOp1, Op1}, TLI); } /// \returns true if a compare instruction \p CI has similar "look" and @@ -1847,7 +1854,7 @@ class BoUpSLP { InstructionsState S = getSameOpcode(Ops, TLI); // Note: Only consider instructions with <= 2 operands to avoid // complexity explosion. - if (S.getOpcode() && + if (S && (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() || !S.isAltShuffle()) && all_of(Ops, [&S](Value *V) { @@ -2382,7 +2389,7 @@ class BoUpSLP { // Use Boyer-Moore majority voting for finding the majority opcode and // the number of times it occurs. if (auto *I = dyn_cast(OpData.V)) { - if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() || + if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) || I->getParent() != Parent) { if (NumOpsWithSameOpcodeParent == 0) { NumOpsWithSameOpcodeParent = 1; @@ -2501,8 +2508,7 @@ class BoUpSLP { // 2.1. 
If we have only 2 lanes, need to check that value in the // next lane does not build same opcode sequence. (Lns == 2 && - !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) - .getOpcode() && + !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) && isa(Data.V)))) || // 3. The operand in the current lane is loop invariant (can be // hoisted out) and another operand is also a loop invariant @@ -2511,7 +2517,7 @@ class BoUpSLP { // FIXME: need to teach the cost model about this case for better // estimation. (IsInvariant && !isa(Data.V) && - !getSameOpcode({Op, Data.V}, TLI).getOpcode() && + !getSameOpcode({Op, Data.V}, TLI) && L->isLoopInvariant(Data.V))) { FoundCandidate = true; Data.IsUsed = Data.V == Op; @@ -2541,7 +2547,7 @@ class BoUpSLP { return true; Value *OpILn = getValue(OpI, Ln); return (L && L->isLoopInvariant(OpILn)) || - (getSameOpcode({Op, OpILn}, TLI).getOpcode() && + (getSameOpcode({Op, OpILn}, TLI) && allSameBlock({Op, OpILn})); })) return true; @@ -2698,7 +2704,7 @@ class BoUpSLP { OperandData &AltOp = getData(OpIdx, Lane); InstructionsState OpS = getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI); - if (OpS.getOpcode() && OpS.isAltShuffle()) + if (OpS && OpS.isAltShuffle()) MainAltOps[OpIdx].push_back(AltOp.V); } } @@ -3400,6 +3406,7 @@ class BoUpSLP { } void setOperations(const InstructionsState &S) { + assert(S && "InstructionsState is invalid."); MainOp = S.getMainOp(); AltOp = S.getAltOp(); } @@ -3600,7 +3607,7 @@ class BoUpSLP { "Need to vectorize gather entry?"); // Gathered loads still gathered? Do not create entry, use the original one. 
if (GatheredLoadsEntriesFirst.has_value() && - EntryState == TreeEntry::NeedToGather && + EntryState == TreeEntry::NeedToGather && S && S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX && !UserTreeIdx.UserTE) return nullptr; @@ -3618,7 +3625,8 @@ class BoUpSLP { ReuseShuffleIndices.end()); if (ReorderIndices.empty()) { Last->Scalars.assign(VL.begin(), VL.end()); - Last->setOperations(S); + if (S) + Last->setOperations(S); } else { // Reorder scalars and build final mask. Last->Scalars.assign(VL.size(), nullptr); @@ -3629,7 +3637,8 @@ class BoUpSLP { return VL[Idx]; }); InstructionsState S = getSameOpcode(Last->Scalars, *TLI); - Last->setOperations(S); + if (S) + Last->setOperations(S); Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end()); } if (!Last->isGather()) { @@ -4774,8 +4783,7 @@ static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, (!GEP2 || isConstant(GEP2->getOperand(1)))) || !CompareOpcodes || (GEP1 && GEP2 && - getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI) - .getOpcode())); + getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI))); } /// Calculates minimal alignment as a common alignment. @@ -7500,7 +7508,7 @@ bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S, [&](ArrayRef Op) { if (allConstant(Op) || (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) && - getSameOpcode(Op, *TLI).getMainOp())) + getSameOpcode(Op, *TLI))) return false; DenseMap Uniques; for (Value *V : Op) { @@ -8071,15 +8079,14 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // Don't go into catchswitch blocks, which can happen with PHIs. // Such blocks can only have PHIs and the catchswitch. There is no // place to insert a shuffle if we need to, so just avoid that issue. 
- if (S.getMainOp() && - isa(S.getMainOp()->getParent()->getTerminator())) { + if (S && isa(S.getMainOp()->getParent()->getTerminator())) { LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n"); newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); return; } // Check if this is a duplicate of another entry. - if (S.getOpcode()) { + if (S) { if (TreeEntry *E = getTreeEntry(S.getMainOp())) { LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n"); @@ -8140,13 +8147,12 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // a load), in which case peek through to include it in the tree, without // ballooning over-budget. if (Depth >= RecursionMaxDepth && - !(S.getMainOp() && !S.isAltShuffle() && VL.size() >= 4 && + !(S && !S.isAltShuffle() && VL.size() >= 4 && (match(S.getMainOp(), m_Load(m_Value())) || all_of(VL, [&S](const Value *I) { return match(I, m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) && - cast(I)->getOpcode() == - S.getMainOp()->getOpcode(); + cast(I)->getOpcode() == S.getOpcode(); })))) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); if (TryToFindDuplicates(S)) @@ -8156,7 +8162,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, } // Don't handle scalable vectors - if (S.getOpcode() == Instruction::ExtractElement && + if (S && S.getOpcode() == Instruction::ExtractElement && isa( cast(S.getMainOp())->getVectorOperandType())) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n"); @@ -8180,7 +8186,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // vectorize. 
auto &&NotProfitableForVectorization = [&S, this, Depth](ArrayRef VL) { - if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2) + if (!S || !S.isAltShuffle() || VL.size() > 2) return false; if (VectorizableTree.size() < MinTreeSize) return false; @@ -8235,7 +8241,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, bool IsScatterVectorizeUserTE = UserTreeIdx.UserTE && UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize; - bool AreAllSameBlock = S.getOpcode() && allSameBlock(VL); + bool AreAllSameBlock = S && allSameBlock(VL); bool AreScatterAllGEPSameBlock = (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() && VL.size() > 2 && @@ -8252,8 +8258,9 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE, SortedIndices)); bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock; - if (!AreAllSameInsts || (!S.getOpcode() && allConstant(VL)) || isSplat(VL) || - (isa_and_present( + if (!AreAllSameInsts || (!S && allConstant(VL)) || isSplat(VL) || + (S && + isa( S.getMainOp()) && !all_of(VL, isVectorLikeInstWithConstOps)) || NotProfitableForVectorization(VL)) { @@ -8265,7 +8272,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, } // Don't vectorize ephemeral values. - if (S.getOpcode() && !EphValues.empty()) { + if (S && !EphValues.empty()) { for (Value *V : VL) { if (EphValues.count(V)) { LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V @@ -8324,7 +8331,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, Instruction *VL0 = S.getMainOp(); BB = VL0->getParent(); - if (S.getMainOp() && + if (S && (BB->isEHPad() || isa_and_nonnull(BB->getTerminator()) || !DT->isReachableFromEntry(BB))) { // Don't go into unreachable blocks. 
They may contain instructions with @@ -8378,8 +8385,8 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, } LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); - unsigned ShuffleOrOp = S.isAltShuffle() ? - (unsigned) Instruction::ShuffleVector : S.getOpcode(); + unsigned ShuffleOrOp = + S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode(); auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) { // Postpone PHI nodes creation SmallVector PHIOps; @@ -8388,7 +8395,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (Op.empty()) continue; InstructionsState S = getSameOpcode(Op, *TLI); - if (S.getOpcode() != Instruction::PHI || S.isAltShuffle()) + if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle()) buildTree_rec(Op, Depth + 1, {TE, I}); else PHIOps.push_back(I); @@ -9771,7 +9778,7 @@ void BoUpSLP::transformNodes() { if (IsSplat) continue; InstructionsState S = getSameOpcode(Slice, *TLI); - if (!S.getOpcode() || S.isAltShuffle() || !allSameBlock(Slice) || + if (!S || S.isAltShuffle() || !allSameBlock(Slice) || (S.getOpcode() == Instruction::Load && areKnownNonVectorizableLoads(Slice)) || (S.getOpcode() != Instruction::Load && !has_single_bit(VF))) @@ -11086,7 +11093,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, if (const TreeEntry *OpTE = getTreeEntry(V)) return getCastContextHint(*OpTE); InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI); - if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle()) + if (SrcState && SrcState.getOpcode() == Instruction::Load && + !SrcState.isAltShuffle()) return TTI::CastContextHint::GatherScatter; return TTI::CastContextHint::None; }; @@ -13265,7 +13273,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( Value *In1 = PHI1->getIncomingValue(I); if (isConstant(In) && isConstant(In1)) continue; - if (!getSameOpcode({In, In1}, *TLI).getOpcode()) + if (!getSameOpcode({In, In1}, *TLI)) return false; if 
(cast(In)->getParent() != cast(In1)->getParent()) @@ -13293,7 +13301,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( if (It != UsedValuesEntry.end()) UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second; return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE && - getSameOpcode({V, V1}, *TLI).getOpcode() && + getSameOpcode({V, V1}, *TLI) && cast(V)->getParent() == cast(V1)->getParent() && (!isa(V1) || AreCompatiblePHIs(V, V1)); @@ -14560,12 +14568,12 @@ BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E, ArrayRef VL = E->getOperand(NodeIdx); InstructionsState S = getSameOpcode(VL, *TLI); // Special processing for GEPs bundle, which may include non-gep values. - if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) { + if (!S && VL.front()->getType()->isPointerTy()) { const auto *It = find_if(VL, IsaPred); if (It != VL.end()) S = getSameOpcode(*It, *TLI); } - if (!S.getOpcode()) + if (!S) return nullptr; auto CheckSameVE = [&](const TreeEntry *VE) { return VE->isSame(VL) && @@ -18546,8 +18554,7 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(), ValOps.size()) || (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1)); - if ((!IsAllowedSize && S.getOpcode() && - S.getOpcode() != Instruction::Load && + if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load && (!S.getMainOp()->isSafeToRemove() || any_of(ValOps.getArrayRef(), [&](Value *V) { @@ -18557,8 +18564,8 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, return !Stores.contains(U); })); }))) || - (ValOps.size() > Chain.size() / 2 && !S.getOpcode())) { - Size = (!IsAllowedSize && S.getOpcode()) ? 1 : 2; + (ValOps.size() > Chain.size() / 2 && !S)) { + Size = (!IsAllowedSize && S) ? 
1 : 2; return false; } } @@ -18581,7 +18588,7 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, R.computeMinimumValueSizes(); Size = R.getCanonicalGraphSize(); - if (S.getOpcode() == Instruction::Load) + if (S && S.getOpcode() == Instruction::Load) Size = 2; // cut off masked gather small trees InstructionCost Cost = R.getTreeCost(); @@ -19082,7 +19089,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef VL, BoUpSLP &R, // Check that all of the parts are instructions of the same type, // we permit an alternate opcode via InstructionsState. InstructionsState S = getSameOpcode(VL, *TLI); - if (!S.getOpcode()) + if (!S) return false; Instruction *I0 = S.getMainOp(); @@ -19906,16 +19913,16 @@ class HorizontalReduction { // Also check if the instruction was folded to constant/other value. auto *Inst = dyn_cast(RdxVal); if ((Inst && isVectorLikeInstWithConstOps(Inst) && - (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) || - (S.getOpcode() && !Inst)) + (!S || !S.isOpcodeOrAlt(Inst))) || + (S && !Inst)) continue; Candidates.push_back(RdxVal); TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]); } bool ShuffledExtracts = false; // Try to handle shuffled extractelements. 
- if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() && - I + 1 < E) { + if (S && S.getOpcode() == Instruction::ExtractElement && + !S.isAltShuffle() && I + 1 < E) { SmallVector CommonCandidates(Candidates); for (Value *RV : ReducedVals[I + 1]) { Value *RdxVal = TrackedVals.at(RV); @@ -21310,7 +21317,7 @@ static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn(); } InstructionsState S = getSameOpcode({I1, I2}, TLI); - if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle())) + if (S && (IsCompatibility || !S.isAltShuffle())) continue; if (IsCompatibility) return false; @@ -21468,7 +21475,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { if (NodeI1 != NodeI2) return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn(); InstructionsState S = getSameOpcode({I1, I2}, *TLI); - if (S.getOpcode() && !S.isAltShuffle()) + if (S && !S.isAltShuffle()) continue; return I1->getOpcode() < I2->getOpcode(); } @@ -21531,8 +21538,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { return false; if (I1->getParent() != I2->getParent()) return false; - InstructionsState S = getSameOpcode({I1, I2}, *TLI); - if (S.getOpcode()) + if (getSameOpcode({I1, I2}, *TLI)) continue; return false; } @@ -21904,8 +21910,7 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { if (auto *I2 = dyn_cast(V2->getValueOperand())) { if (I1->getParent() != I2->getParent()) return false; - InstructionsState S = getSameOpcode({I1, I2}, *TLI); - return S.getOpcode() > 0; + return getSameOpcode({I1, I2}, *TLI).valid(); } if (isa(V1->getValueOperand()) && isa(V2->getValueOperand())) From c744ed53a84f90598751cdcda4c68900113587ab Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Fri, 3 Jan 2025 11:58:04 -0500 Subject: [PATCH 370/567] [AMDGPU][True16][MC] disable incorrect VOPC t16 instruction (#120271) The current VOPC t16 instructions are not implemented with the correct 
t16 pseudo. Thus the current t16/fake16 instructions are all in fake16 format. The plan is to remove the incorrect t16 instructions and refactor them. The first step is to remove them in this patch. The next step will be updating the t16/fake16 pseudo to the correct format and add back true16 instruction one by one in the upcoming patches. --- .../AMDGPU/AMDGPUInstructionSelector.cpp | 3 +- llvm/lib/Target/AMDGPU/SIInstructions.td | 4 +- llvm/lib/Target/AMDGPU/VOPCInstructions.td | 132 +++++------------- .../inst-select-amdgcn.fcmp.constants.w32.mir | 8 +- .../inst-select-amdgcn.fcmp.constants.w64.mir | 8 +- 5 files changed, 47 insertions(+), 108 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 3be865f03df1f..041b9b4d66f63 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1125,8 +1125,9 @@ static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size, unsigned FakeS16Opc, unsigned S32Opc, unsigned S64Opc) { if (Size == 16) + // FIXME-TRUE16 use TrueS16Opc when realtrue16 is supported for CMP code return ST.hasTrue16BitInsts() - ? ST.useRealTrue16Insts() ? TrueS16Opc : FakeS16Opc + ? ST.useRealTrue16Insts() ? 
FakeS16Opc : FakeS16Opc : S16Opc; if (Size == 32) return S32Opc; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 789ce8815cf80..e388efe73cddb 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2674,8 +2674,8 @@ let OtherPredicates = [NotHasTrue16BitInsts] in { } // end OtherPredicates = [NotHasTrue16BitInsts] let OtherPredicates = [HasTrue16BitInsts] in { - def : FPToI1Pat; - def : FPToI1Pat; + def : FPToI1Pat; + def : FPToI1Pat; } // end OtherPredicates = [HasTrue16BitInsts] def : FPToI1Pat; diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index 9bf043ea334fe..8589d598f5870 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -1130,20 +1130,20 @@ defm : ICMP_Pattern ; defm : ICMP_Pattern ; defm : ICMP_Pattern ; -let OtherPredicates = [HasTrue16BitInsts] in { -defm : ICMP_Pattern ; -defm : ICMP_Pattern ; -defm : ICMP_Pattern ; -defm : ICMP_Pattern ; -defm : ICMP_Pattern ; -defm : ICMP_Pattern ; -defm : ICMP_Pattern ; -defm : ICMP_Pattern ; -defm : ICMP_Pattern ; -defm : ICMP_Pattern ; -} // End OtherPredicates = [HasTrue16BitInsts] - -let OtherPredicates = [NotHasTrue16BitInsts] in { +let True16Predicate = UseFakeTrue16Insts in { +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +} // End True16Predicate = UseFakeTrue16Insts + +let True16Predicate = NotHasTrue16BitInsts in { defm : ICMP_Pattern ; defm : ICMP_Pattern ; defm : ICMP_Pattern ; @@ -1154,7 +1154,7 @@ defm : ICMP_Pattern ; defm : ICMP_Pattern ; defm : ICMP_Pattern ; defm : ICMP_Pattern ; -} // End OtherPredicates = [NotHasTrue16BitInsts] +} // End True16Predicate = NotHasTrue16BitInsts multiclass FCMP_Pattern { let 
WaveSizePredicate = isWave64 in @@ -1215,25 +1215,25 @@ defm : FCMP_Pattern ; defm : FCMP_Pattern ; defm : FCMP_Pattern ; -let OtherPredicates = [HasTrue16BitInsts] in { -defm : FCMP_Pattern ; -defm : FCMP_Pattern ; -defm : FCMP_Pattern ; -defm : FCMP_Pattern ; -defm : FCMP_Pattern ; -defm : FCMP_Pattern ; -defm : FCMP_Pattern ; -defm : FCMP_Pattern ; - -defm : FCMP_Pattern ; -defm : FCMP_Pattern ; -defm : FCMP_Pattern ; -defm : FCMP_Pattern ; -defm : FCMP_Pattern ; -defm : FCMP_Pattern ; -} // End OtherPredicates = [HasTrue16BitInsts] - -let OtherPredicates = [NotHasTrue16BitInsts] in { +let True16Predicate = UseFakeTrue16Insts in { +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; + +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +} // End True16Predicate = UseFakeTrue16Insts + +let True16Predicate = NotHasTrue16BitInsts in { defm : FCMP_Pattern ; defm : FCMP_Pattern ; defm : FCMP_Pattern ; @@ -1249,7 +1249,7 @@ defm : FCMP_Pattern ; defm : FCMP_Pattern ; defm : FCMP_Pattern ; defm : FCMP_Pattern ; -} // End OtherPredicates = [NotHasTrue16BitInsts] +} // End True16Predicate = NotHasTrue16BitInsts //===----------------------------------------------------------------------===// // DPP Encodings @@ -1707,23 +1707,6 @@ multiclass VOPCX_Real_t16_gfx11_gfx12 op, string asm_name, VOPCX_Real_t16, VOPCX_Real_t16; -defm V_CMP_F_F16_t16 : VOPC_Real_t16_gfx11<0x000, "v_cmp_f_f16">; -defm V_CMP_LT_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x001, "v_cmp_lt_f16">; -defm V_CMP_EQ_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x002, "v_cmp_eq_f16">; -defm V_CMP_LE_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x003, "v_cmp_le_f16">; -defm V_CMP_GT_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x004, "v_cmp_gt_f16">; -defm V_CMP_LG_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x005, "v_cmp_lg_f16">; -defm 
V_CMP_GE_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x006, "v_cmp_ge_f16">; -defm V_CMP_O_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x007, "v_cmp_o_f16">; -defm V_CMP_U_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x008, "v_cmp_u_f16">; -defm V_CMP_NGE_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x009, "v_cmp_nge_f16">; -defm V_CMP_NLG_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x00a, "v_cmp_nlg_f16">; -defm V_CMP_NGT_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x00b, "v_cmp_ngt_f16">; -defm V_CMP_NLE_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x00c, "v_cmp_nle_f16">; -defm V_CMP_NEQ_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x00d, "v_cmp_neq_f16">; -defm V_CMP_NLT_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x00e, "v_cmp_nlt_f16">; -defm V_CMP_T_F16_t16 : VOPC_Real_t16_gfx11<0x00f, "v_cmp_t_f16", "V_CMP_TRU_F16_t16", "v_cmp_tru_f16">; - defm V_CMP_F_F16_fake16 : VOPC_Real_t16_gfx11<0x000, "v_cmp_f_f16">; defm V_CMP_LT_F16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x001, "v_cmp_lt_f16">; defm V_CMP_EQ_F16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x002, "v_cmp_eq_f16">; @@ -1759,19 +1742,6 @@ defm V_CMP_NLT_F32 : VOPC_Real_gfx11_gfx12<0x01e>; defm V_CMP_T_F32 : VOPC_Real_with_name_gfx11<0x01f, "V_CMP_TRU_F32", "v_cmp_t_f32">; defm V_CMP_T_F64 : VOPC_Real_with_name_gfx11<0x02f, "V_CMP_TRU_F64", "v_cmp_t_f64">; -defm V_CMP_LT_I16_t16 : VOPC_Real_t16_gfx11_gfx12<0x031, "v_cmp_lt_i16">; -defm V_CMP_EQ_I16_t16 : VOPC_Real_t16_gfx11_gfx12<0x032, "v_cmp_eq_i16">; -defm V_CMP_LE_I16_t16 : VOPC_Real_t16_gfx11_gfx12<0x033, "v_cmp_le_i16">; -defm V_CMP_GT_I16_t16 : VOPC_Real_t16_gfx11_gfx12<0x034, "v_cmp_gt_i16">; -defm V_CMP_NE_I16_t16 : VOPC_Real_t16_gfx11_gfx12<0x035, "v_cmp_ne_i16">; -defm V_CMP_GE_I16_t16 : VOPC_Real_t16_gfx11_gfx12<0x036, "v_cmp_ge_i16">; -defm V_CMP_LT_U16_t16 : VOPC_Real_t16_gfx11_gfx12<0x039, "v_cmp_lt_u16">; -defm V_CMP_EQ_U16_t16 : VOPC_Real_t16_gfx11_gfx12<0x03a, "v_cmp_eq_u16">; -defm V_CMP_LE_U16_t16 : VOPC_Real_t16_gfx11_gfx12<0x03b, "v_cmp_le_u16">; -defm V_CMP_GT_U16_t16 : VOPC_Real_t16_gfx11_gfx12<0x03c, 
"v_cmp_gt_u16">; -defm V_CMP_NE_U16_t16 : VOPC_Real_t16_gfx11_gfx12<0x03d, "v_cmp_ne_u16">; -defm V_CMP_GE_U16_t16 : VOPC_Real_t16_gfx11_gfx12<0x03e, "v_cmp_ge_u16">; - defm V_CMP_LT_I16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x031, "v_cmp_lt_i16">; defm V_CMP_EQ_I16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x032, "v_cmp_eq_i16">; defm V_CMP_LE_I16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x033, "v_cmp_le_i16">; @@ -1819,28 +1789,10 @@ defm V_CMP_NE_U64 : VOPC_Real_gfx11_gfx12<0x05d>; defm V_CMP_GE_U64 : VOPC_Real_gfx11_gfx12<0x05e>; defm V_CMP_T_U64 : VOPC_Real_gfx11<0x05f>; -defm V_CMP_CLASS_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x07d, "v_cmp_class_f16">; defm V_CMP_CLASS_F16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x07d, "v_cmp_class_f16">; defm V_CMP_CLASS_F32 : VOPC_Real_gfx11_gfx12<0x07e>; defm V_CMP_CLASS_F64 : VOPC_Real_gfx11_gfx12<0x07f>; -defm V_CMPX_F_F16_t16 : VOPCX_Real_t16_gfx11<0x080, "v_cmpx_f_f16">; -defm V_CMPX_LT_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x081, "v_cmpx_lt_f16">; -defm V_CMPX_EQ_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x082, "v_cmpx_eq_f16">; -defm V_CMPX_LE_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x083, "v_cmpx_le_f16">; -defm V_CMPX_GT_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x084, "v_cmpx_gt_f16">; -defm V_CMPX_LG_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x085, "v_cmpx_lg_f16">; -defm V_CMPX_GE_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x086, "v_cmpx_ge_f16">; -defm V_CMPX_O_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x087, "v_cmpx_o_f16">; -defm V_CMPX_U_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x088, "v_cmpx_u_f16">; -defm V_CMPX_NGE_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x089, "v_cmpx_nge_f16">; -defm V_CMPX_NLG_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x08a, "v_cmpx_nlg_f16">; -defm V_CMPX_NGT_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x08b, "v_cmpx_ngt_f16">; -defm V_CMPX_NLE_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x08c, "v_cmpx_nle_f16">; -defm V_CMPX_NEQ_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x08d, "v_cmpx_neq_f16">; -defm V_CMPX_NLT_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x08e, 
"v_cmpx_nlt_f16">; -defm V_CMPX_T_F16_t16 : VOPCX_Real_with_name_gfx11<0x08f, "V_CMPX_TRU_F16_t16", "v_cmpx_t_f16", "v_cmpx_tru_f16">; - defm V_CMPX_F_F16_fake16 : VOPCX_Real_t16_gfx11<0x080, "v_cmpx_f_f16">; defm V_CMPX_LT_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x081, "v_cmpx_lt_f16">; defm V_CMPX_EQ_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x082, "v_cmpx_eq_f16">; @@ -1892,19 +1844,6 @@ defm V_CMPX_NEQ_F64 : VOPCX_Real_gfx11_gfx12<0x0ad>; defm V_CMPX_NLT_F64 : VOPCX_Real_gfx11_gfx12<0x0ae>; defm V_CMPX_T_F64 : VOPCX_Real_with_name_gfx11<0x0af, "V_CMPX_TRU_F64", "v_cmpx_t_f64">; -defm V_CMPX_LT_I16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0b1, "v_cmpx_lt_i16">; -defm V_CMPX_EQ_I16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0b2, "v_cmpx_eq_i16">; -defm V_CMPX_LE_I16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0b3, "v_cmpx_le_i16">; -defm V_CMPX_GT_I16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0b4, "v_cmpx_gt_i16">; -defm V_CMPX_NE_I16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0b5, "v_cmpx_ne_i16">; -defm V_CMPX_GE_I16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0b6, "v_cmpx_ge_i16">; -defm V_CMPX_LT_U16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0b9, "v_cmpx_lt_u16">; -defm V_CMPX_EQ_U16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0ba, "v_cmpx_eq_u16">; -defm V_CMPX_LE_U16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0bb, "v_cmpx_le_u16">; -defm V_CMPX_GT_U16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0bc, "v_cmpx_gt_u16">; -defm V_CMPX_NE_U16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0bd, "v_cmpx_ne_u16">; -defm V_CMPX_GE_U16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0be, "v_cmpx_ge_u16">; - defm V_CMPX_LT_I16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x0b1, "v_cmpx_lt_i16">; defm V_CMPX_EQ_I16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x0b2, "v_cmpx_eq_i16">; defm V_CMPX_LE_I16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x0b3, "v_cmpx_le_i16">; @@ -1951,7 +1890,6 @@ defm V_CMPX_GT_U64 : VOPCX_Real_gfx11_gfx12<0x0dc>; defm V_CMPX_NE_U64 : VOPCX_Real_gfx11_gfx12<0x0dd>; defm V_CMPX_GE_U64 : VOPCX_Real_gfx11_gfx12<0x0de>; defm V_CMPX_T_U64 : VOPCX_Real_gfx11<0x0df>; -defm 
V_CMPX_CLASS_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0fd, "v_cmpx_class_f16">; defm V_CMPX_CLASS_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x0fd, "v_cmpx_class_f16">; defm V_CMPX_CLASS_F32 : VOPCX_Real_gfx11_gfx12<0x0fe>; defm V_CMPX_CLASS_F64 : VOPCX_Real_gfx11_gfx12<0x0ff>; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir index 55015c6d13d8a..cdb67caea12cf 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir @@ -20,8 +20,8 @@ body: | ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]] ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]] - ; GFX11-TRUE16-NEXT: [[V_CMP_F_F16_t16_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec - ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_t16_e64_]] + ; GFX11-TRUE16-NEXT: [[V_CMP_F_F16_fake16_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F16_fake16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_fake16_e64_]] ; ; GFX11-FAKE16-LABEL: name: fcmp_false_f16 ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1 @@ -58,8 +58,8 @@ body: | ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]] ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]] - ; GFX11-TRUE16-NEXT: [[V_CMP_TRU_F16_t16_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit 
$mode, implicit $exec - ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_t16_e64_]] + ; GFX11-TRUE16-NEXT: [[V_CMP_TRU_F16_fake16_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F16_fake16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_fake16_e64_]] ; ; GFX11-FAKE16-LABEL: name: fcmp_true_f16 ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir index 4241f945a87d5..ed811d37c3d0f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir @@ -20,8 +20,8 @@ body: | ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]] ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]] - ; GFX11-TRUE16-NEXT: [[V_CMP_F_F16_t16_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec - ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_t16_e64_]] + ; GFX11-TRUE16-NEXT: [[V_CMP_F_F16_fake16_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F16_fake16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_fake16_e64_]] ; ; GFX11-FAKE16-LABEL: name: fcmp_false_f16 ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1 @@ -58,8 +58,8 @@ body: | ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]] ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY 
[[V_CVT_F16_F32_t16_e64_1]] - ; GFX11-TRUE16-NEXT: [[V_CMP_TRU_F16_t16_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec - ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_t16_e64_]] + ; GFX11-TRUE16-NEXT: [[V_CMP_TRU_F16_fake16_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F16_fake16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_fake16_e64_]] ; ; GFX11-FAKE16-LABEL: name: fcmp_true_f16 ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1 From fa56e8bb6451bdf24be6c2a8737dab5fe6a2039c Mon Sep 17 00:00:00 2001 From: agozillon Date: Fri, 3 Jan 2025 18:01:01 +0100 Subject: [PATCH 371/567] [OpenMP][MLIR] Fix threadprivate lowering when compiling for target when target operations are in use (#119310) Currently the compiler will ICE in programs like the following on the device lowering pass: ``` program main implicit none type i1_t integer :: val(1000) end type i1_t integer :: i type(i1_t), pointer :: newi1 type(i1_t), pointer :: tab=>null() integer, dimension(:), pointer :: tabval !$omp THREADPRIVATE(tab) allocate(newi1) tab=>newi1 tab%val(:)=1 tabval=>tab%val !$omp target teams distribute parallel do do i = 1, 1000 tabval(i) = i end do !$omp end target teams distribute parallel do end program main ``` This is due to the fact that THREADPRIVATE returns a result operation, and this operation can actually be used by other LLVM dialect (or other dialect) operations. However, we currently skip the lowering of threadprivate, so we effectively never generate and bind an LLVM-IR result to the threadprivate operation result. So when we later go on to lower dependent LLVM dialect operations, we are missing the required LLVM-IR result, try to access and use it and then ICE. 
The fix in this particular PR is to allow compilation of threadprivate for device as well as host, and simply treat the device compilation as a no-op, binding the LLVM-IR result of threadprivate with no alterations and binding it, which will allow the rest of the compilation to proceed, where we'll eventually discard the host segment in any case. The other possible solution to this I can think of, is doing something similar to Flang's passes that occur prior to CodeGen to the LLVM dialect, where they erase/no-op certain unrequired operations or transform them to lower level series of operations. And we would erase/no-op threadprivate on device as we'd never have these in target regions. The main issues I can see with this are that we currently do not specialise this stage based on wether we're compiling for device or host, so it's setting a precedent and adding another point of having to understand the separation between target and host compilation. I am also not sure we'd necessarily want to enforce this at a dialect level incase someone else wishes to add a different lowering flow or translation flow. Another possible issue is that a target operation we have/utilise would depend on the result of threadprivate, meaning we'd not be allowed to entirely erase/no-op it, I am not sure of any situations where this may be an issue currently though. 
--- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 38 +++++++++++++------ ...ptarget-threadprivate-device-lowering.mlir | 30 +++++++++++++++ .../fortran/target-with-threadprivate.f90 | 37 ++++++++++++++++++ 3 files changed, 94 insertions(+), 11 deletions(-) create mode 100644 mlir/test/Target/LLVMIR/omptarget-threadprivate-device-lowering.mlir create mode 100644 offload/test/offloading/fortran/target-with-threadprivate.f90 diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index ce129417fc5b2..87cb7f03fec6a 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -2588,6 +2588,7 @@ static LogicalResult convertOmpThreadprivate(Operation &opInst, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation) { llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); + llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); auto threadprivateOp = cast(opInst); if (failed(checkImplementationStatus(opInst))) @@ -2595,6 +2596,10 @@ convertOmpThreadprivate(Operation &opInst, llvm::IRBuilderBase &builder, Value symAddr = threadprivateOp.getSymAddr(); auto *symOp = symAddr.getDefiningOp(); + + if (auto asCast = dyn_cast(symOp)) + symOp = asCast.getOperand().getDefiningOp(); + if (!isa(symOp)) return opInst.emitError("Addressing symbol not found"); LLVM::AddressOfOp addressOfOp = dyn_cast(symOp); @@ -2602,17 +2607,20 @@ convertOmpThreadprivate(Operation &opInst, llvm::IRBuilderBase &builder, LLVM::GlobalOp global = addressOfOp.getGlobal(moduleTranslation.symbolTable()); llvm::GlobalValue *globalValue = moduleTranslation.lookupGlobal(global); - llvm::Type *type = globalValue->getValueType(); - llvm::TypeSize typeSize = - builder.GetInsertBlock()->getModule()->getDataLayout().getTypeStoreSize( - type); - llvm::ConstantInt *size = 
builder.getInt64(typeSize.getFixedValue()); - llvm::StringRef suffix = llvm::StringRef(".cache", 6); - std::string cacheName = (Twine(global.getSymName()).concat(suffix)).str(); - llvm::Value *callInst = - moduleTranslation.getOpenMPBuilder()->createCachedThreadPrivate( - ompLoc, globalValue, size, cacheName); - moduleTranslation.mapValue(opInst.getResult(0), callInst); + + if (!ompBuilder->Config.isTargetDevice()) { + llvm::Type *type = globalValue->getValueType(); + llvm::TypeSize typeSize = + builder.GetInsertBlock()->getModule()->getDataLayout().getTypeStoreSize( + type); + llvm::ConstantInt *size = builder.getInt64(typeSize.getFixedValue()); + llvm::Value *callInst = ompBuilder->createCachedThreadPrivate( + ompLoc, globalValue, size, global.getSymName() + ".cache"); + moduleTranslation.mapValue(opInst.getResult(0), callInst); + } else { + moduleTranslation.mapValue(opInst.getResult(0), globalValue); + } + return success(); } @@ -4212,6 +4220,14 @@ static bool isTargetDeviceOp(Operation *op) { if (op->getParentOfType()) return true; + // Certain operations return results, and whether utilised in host or + // target there is a chance an LLVM Dialect operation depends on it + // by taking it in as an operand, so we must always lower these in + // some manner or result in an ICE (whether they end up in a no-op + // or otherwise). 
+ if (mlir::isa(op)) + return true; + if (auto parentFn = op->getParentOfType()) if (auto declareTargetIface = llvm::dyn_cast( diff --git a/mlir/test/Target/LLVMIR/omptarget-threadprivate-device-lowering.mlir b/mlir/test/Target/LLVMIR/omptarget-threadprivate-device-lowering.mlir new file mode 100644 index 0000000000000..279ecb3f8e998 --- /dev/null +++ b/mlir/test/Target/LLVMIR/omptarget-threadprivate-device-lowering.mlir @@ -0,0 +1,30 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +// Not intended to be a functional example, the aim of this test is to verify +// omp.threadprivate does not crash on lowering during the OpenMP target device +// pass when used in conjunction with target code in the same module. + +module attributes {omp.is_target_device = true } { + llvm.func @func() attributes {omp.declare_target = #omp.declaretarget} { + %0 = llvm.mlir.addressof @_QFEpointer2 : !llvm.ptr + %1 = omp.threadprivate %0 : !llvm.ptr -> !llvm.ptr + %2 = omp.map.info var_ptr(%1 : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses(implicit, to) capture(ByRef) -> !llvm.ptr + omp.target map_entries(%2 -> %arg0 : !llvm.ptr) { + %3 = llvm.mlir.constant(1 : i32) : i32 + %4 = llvm.getelementptr %arg0[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> + llvm.store %3, %4 : i32, !llvm.ptr + omp.terminator + } + llvm.return + } + llvm.mlir.global internal @_QFEpointer2() {addr_space = 0 : i32} : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> { + %0 = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> + llvm.return %0 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> + } +} + +// CHECK: define weak_odr protected void @{{.*}}(ptr %{{.*}}, ptr %[[ARG1:.*]]) { +// CHECK: %[[ALLOCA:.*]] = alloca ptr, align 8 +// CHECK: store ptr %[[ARG1]], ptr %[[ALLOCA]], align 8 +// CHECK: 
%[[LOAD_ALLOCA:.*]] = load ptr, ptr %[[ALLOCA]], align 8 +// CHECK: store i32 1, ptr %[[LOAD_ALLOCA]], align 4 diff --git a/offload/test/offloading/fortran/target-with-threadprivate.f90 b/offload/test/offloading/fortran/target-with-threadprivate.f90 new file mode 100644 index 0000000000000..10c7cecf08412 --- /dev/null +++ b/offload/test/offloading/fortran/target-with-threadprivate.f90 @@ -0,0 +1,37 @@ +! Basic offloading test that makes sure we can use the predominantly host +! pragma threadprivate in the same program as target code +! REQUIRES: flang, amdgpu + +! RUN: %libomptarget-compile-fortran-run-and-check-generic +program main + implicit none + + type dtype + integer :: val(10) + end type dtype + + integer :: i + type(dtype), pointer :: pointer1 + type(dtype), pointer :: pointer2=>null() + integer, dimension(:), pointer :: data_pointer + +!$omp threadprivate(pointer2) + +nullify(pointer1) +allocate(pointer1) + +pointer2=>pointer1 +pointer2%val(:)=1 +data_pointer=>pointer2%val + +!$omp target + do i = 1, 10 + data_pointer(i) = i + end do +!$omp end target + +print *, data_pointer + +end program main + +! CHECK: 1 2 3 4 5 6 7 8 9 10 From 1cade8699719c934a8debb7bef9fdc3ff11e9602 Mon Sep 17 00:00:00 2001 From: Ivan Butygin Date: Fri, 3 Jan 2025 18:02:59 +0100 Subject: [PATCH 372/567] [mlir][arith] Fold `(a * b) / b -> a` (#121534) If overflow flags allow it. 
Alive2 check: https://alive2.llvm.org/ce/z/5XWjWE --- mlir/lib/Dialect/Arith/IR/ArithOps.cpp | 24 +++++++++ mlir/test/Dialect/Arith/canonicalize.mlir | 64 +++++++++++++++++++++++ 2 files changed, 88 insertions(+) diff --git a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp index d8b314a3fa43c..e016a6e16e59f 100644 --- a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp +++ b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp @@ -580,11 +580,31 @@ void arith::MulUIExtendedOp::getCanonicalizationPatterns( // DivUIOp //===----------------------------------------------------------------------===// +/// Fold `(a * b) / b -> a` +static Value foldDivMul(Value lhs, Value rhs, + arith::IntegerOverflowFlags ovfFlags) { + auto mul = lhs.getDefiningOp(); + if (!mul || !bitEnumContainsAll(mul.getOverflowFlags(), ovfFlags)) + return {}; + + if (mul.getLhs() == rhs) + return mul.getRhs(); + + if (mul.getRhs() == rhs) + return mul.getLhs(); + + return {}; +} + OpFoldResult arith::DivUIOp::fold(FoldAdaptor adaptor) { // divui (x, 1) -> x. if (matchPattern(adaptor.getRhs(), m_One())) return getLhs(); + // (a * b) / b -> a + if (Value val = foldDivMul(getLhs(), getRhs(), IntegerOverflowFlags::nuw)) + return val; + // Don't fold if it would require a division by zero. bool div0 = false; auto result = constFoldBinaryOp(adaptor.getOperands(), @@ -621,6 +641,10 @@ OpFoldResult arith::DivSIOp::fold(FoldAdaptor adaptor) { if (matchPattern(adaptor.getRhs(), m_One())) return getLhs(); + // (a * b) / b -> a + if (Value val = foldDivMul(getLhs(), getRhs(), IntegerOverflowFlags::nsw)) + return val; + // Don't fold if it would overflow or if it requires a division by zero. 
bool overflowOrDiv0 = false; auto result = constFoldBinaryOp( diff --git a/mlir/test/Dialect/Arith/canonicalize.mlir b/mlir/test/Dialect/Arith/canonicalize.mlir index 6a186a0c6ceca..522711b08f289 100644 --- a/mlir/test/Dialect/Arith/canonicalize.mlir +++ b/mlir/test/Dialect/Arith/canonicalize.mlir @@ -2060,6 +2060,70 @@ func.func @test_divf1(%arg0 : f32, %arg1 : f32) -> (f32) { // ----- +func.func @fold_divui_of_muli_0(%arg0 : index, %arg1 : index) -> index { + %0 = arith.muli %arg0, %arg1 overflow : index + %1 = arith.divui %0, %arg0 : index + return %1 : index +} +// CHECK-LABEL: func @fold_divui_of_muli_0( +// CHECK-SAME: %[[ARG0:.+]]: index, +// CHECK-SAME: %[[ARG1:.+]]: index) +// CHECK: return %[[ARG1]] + +func.func @fold_divui_of_muli_1(%arg0 : index, %arg1 : index) -> index { + %0 = arith.muli %arg0, %arg1 overflow : index + %1 = arith.divui %0, %arg1 : index + return %1 : index +} +// CHECK-LABEL: func @fold_divui_of_muli_1( +// CHECK-SAME: %[[ARG0:.+]]: index, +// CHECK-SAME: %[[ARG1:.+]]: index) +// CHECK: return %[[ARG0]] + +func.func @fold_divsi_of_muli_0(%arg0 : index, %arg1 : index) -> index { + %0 = arith.muli %arg0, %arg1 overflow : index + %1 = arith.divsi %0, %arg0 : index + return %1 : index +} +// CHECK-LABEL: func @fold_divsi_of_muli_0( +// CHECK-SAME: %[[ARG0:.+]]: index, +// CHECK-SAME: %[[ARG1:.+]]: index) +// CHECK: return %[[ARG1]] + +func.func @fold_divsi_of_muli_1(%arg0 : index, %arg1 : index) -> index { + %0 = arith.muli %arg0, %arg1 overflow : index + %1 = arith.divsi %0, %arg1 : index + return %1 : index +} +// CHECK-LABEL: func @fold_divsi_of_muli_1( +// CHECK-SAME: %[[ARG0:.+]]: index, +// CHECK-SAME: %[[ARG1:.+]]: index) +// CHECK: return %[[ARG0]] + +// Do not fold divui(mul(a, v), v) -> a with nuw attribute. 
+func.func @no_fold_divui_of_muli(%arg0 : index, %arg1 : index) -> index { + %0 = arith.muli %arg0, %arg1 : index + %1 = arith.divui %0, %arg0 : index + return %1 : index +} +// CHECK-LABEL: func @no_fold_divui_of_muli +// CHECK: %[[T0:.+]] = arith.muli +// CHECK: %[[T1:.+]] = arith.divui %[[T0]], +// CHECK: return %[[T1]] + +// Do not fold divsi(mul(a, v), v) -> a with nuw attribute. +func.func @no_fold_divsi_of_muli(%arg0 : index, %arg1 : index) -> index { + %0 = arith.muli %arg0, %arg1 : index + %1 = arith.divsi %0, %arg0 : index + return %1 : index +} +// CHECK-LABEL: func @no_fold_divsi_of_muli +// CHECK: %[[T0:.+]] = arith.muli +// CHECK: %[[T1:.+]] = arith.divsi %[[T0]], +// CHECK: return %[[T1]] + +// ----- + // CHECK-LABEL: @test_cmpf( func.func @test_cmpf(%arg0 : f32) -> (i1, i1, i1, i1) { // CHECK-DAG: %[[T:.*]] = arith.constant true From 4dfea22e771a0944b3b313f2790a616fa79257e1 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 3 Jan 2025 09:19:32 -0800 Subject: [PATCH 373/567] [ExpandMemCmp][AArch64][PowerPC][RISCV][X86] Use llvm.ucmp instead of (sub (zext (icmp ugt)), (zext (icmp ult))). (#121530) AArch64 and PowerPC look like a improvements. RISC-V is neutral. X86 trades a dependency breaking xor before a seta for a movsx after a sbbb. Depending on how the result is used, this movsx might go away. 
--- llvm/lib/CodeGen/ExpandMemCmp.cpp | 14 ++------ .../AArch64/machine-licm-hoist-load.ll | 3 +- llvm/test/CodeGen/AArch64/memcmp.ll | 15 +++----- llvm/test/CodeGen/PowerPC/memcmp.ll | 18 +++++----- llvm/test/CodeGen/PowerPC/memcmpIR.ll | 16 +++------ llvm/test/CodeGen/RISCV/memcmp-optsize.ll | 36 +++++++++---------- llvm/test/CodeGen/RISCV/memcmp.ll | 36 +++++++++---------- .../CodeGen/X86/memcmp-more-load-pairs-x32.ll | 10 +++--- .../CodeGen/X86/memcmp-more-load-pairs.ll | 24 ++++++------- llvm/test/CodeGen/X86/memcmp-optsize-x32.ll | 10 +++--- llvm/test/CodeGen/X86/memcmp-optsize.ll | 24 ++++++------- llvm/test/CodeGen/X86/memcmp-pgso-x32.ll | 10 +++--- llvm/test/CodeGen/X86/memcmp-pgso.ll | 24 ++++++------- llvm/test/CodeGen/X86/memcmp-x32.ll | 10 +++--- llvm/test/CodeGen/X86/memcmp.ll | 24 ++++++------- .../Transforms/ExpandMemCmp/AArch64/memcmp.ll | 30 +++------------- .../Transforms/ExpandMemCmp/X86/memcmp-x32.ll | 6 +--- .../Transforms/ExpandMemCmp/X86/memcmp.ll | 12 ++----- 18 files changed, 133 insertions(+), 189 deletions(-) diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/CodeGen/ExpandMemCmp.cpp index f8ca7e370f6ef..6dc3e04ac802c 100644 --- a/llvm/lib/CodeGen/ExpandMemCmp.cpp +++ b/llvm/lib/CodeGen/ExpandMemCmp.cpp @@ -696,17 +696,9 @@ Value *MemCmpExpansion::getMemCmpOneBlock() { } } - // The result of memcmp is negative, zero, or positive, so produce that by - // subtracting 2 extended compare bits: sub (ugt, ult). - // If a target prefers to use selects to get -1/0/1, they should be able - // to transform this later. The inverse transform (going from selects to math) - // may not be possible in the DAG because the selects got converted into - // branches before we got there. 
- Value *CmpUGT = Builder.CreateICmpUGT(Loads.Lhs, Loads.Rhs); - Value *CmpULT = Builder.CreateICmpULT(Loads.Lhs, Loads.Rhs); - Value *ZextUGT = Builder.CreateZExt(CmpUGT, Builder.getInt32Ty()); - Value *ZextULT = Builder.CreateZExt(CmpULT, Builder.getInt32Ty()); - return Builder.CreateSub(ZextUGT, ZextULT); + // The result of memcmp is negative, zero, or positive. + return Builder.CreateIntrinsic(Builder.getInt32Ty(), Intrinsic::ucmp, + {Loads.Lhs, Loads.Rhs}); } // This function expands the memcmp call into an inline expansion and returns diff --git a/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll b/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll index 17f8263560430..a32c53a5a5747 100644 --- a/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll +++ b/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll @@ -313,9 +313,8 @@ define void @one_dimensional_with_store(ptr %a, ptr %b, ptr %c, i32 %N) { ; CHECK-NEXT: rev w9, w9 ; CHECK-NEXT: cmp w9, w10 ; CHECK-NEXT: cset w9, hi -; CHECK-NEXT: cset w10, lo +; CHECK-NEXT: csinv w9, w9, wzr, hs ; CHECK-NEXT: subs x8, x8, #1 -; CHECK-NEXT: sub w9, w9, w10 ; CHECK-NEXT: strb w9, [x2], #1 ; CHECK-NEXT: b.ne .LBB4_1 ; CHECK-NEXT: // %bb.2: // %for.exit diff --git a/llvm/test/CodeGen/AArch64/memcmp.ll b/llvm/test/CodeGen/AArch64/memcmp.ll index 4da7c8c95a4e4..4f58fd74d7d50 100644 --- a/llvm/test/CodeGen/AArch64/memcmp.ll +++ b/llvm/test/CodeGen/AArch64/memcmp.ll @@ -162,8 +162,7 @@ define i32 @length3(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev w9, w9 ; CHECK-NEXT: cmp w8, w9 ; CHECK-NEXT: cset w8, hi -; CHECK-NEXT: cset w9, lo -; CHECK-NEXT: sub w0, w8, w9 +; CHECK-NEXT: csinv w0, w8, wzr, hs ; CHECK-NEXT: ret %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind ret i32 %m @@ -194,8 +193,7 @@ define i32 @length4(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev w9, w9 ; CHECK-NEXT: cmp w8, w9 ; CHECK-NEXT: cset w8, hi -; CHECK-NEXT: cset w9, lo -; CHECK-NEXT: sub w0, w8, w9 +; CHECK-NEXT: csinv w0, w8, wzr, hs ; 
CHECK-NEXT: ret %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind ret i32 %m @@ -286,8 +284,7 @@ define i32 @length5(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: cset w8, hi -; CHECK-NEXT: cset w9, lo -; CHECK-NEXT: sub w0, w8, w9 +; CHECK-NEXT: csinv w0, w8, wzr, hs ; CHECK-NEXT: ret %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind ret i32 %m @@ -341,8 +338,7 @@ define i32 @length6(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: cset w8, hi -; CHECK-NEXT: cset w9, lo -; CHECK-NEXT: sub w0, w8, w9 +; CHECK-NEXT: csinv w0, w8, wzr, hs ; CHECK-NEXT: ret %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 6) nounwind ret i32 %m @@ -450,8 +446,7 @@ define i32 @length8(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: cset w8, hi -; CHECK-NEXT: cset w9, lo -; CHECK-NEXT: sub w0, w8, w9 +; CHECK-NEXT: csinv w0, w8, wzr, hs ; CHECK-NEXT: ret %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind ret i32 %m diff --git a/llvm/test/CodeGen/PowerPC/memcmp.ll b/llvm/test/CodeGen/PowerPC/memcmp.ll index 0634534b9c9df..39f9269997315 100644 --- a/llvm/test/CodeGen/PowerPC/memcmp.ll +++ b/llvm/test/CodeGen/PowerPC/memcmp.ll @@ -6,13 +6,12 @@ define signext i32 @memcmp8(ptr nocapture readonly %buffer1, ptr nocapture reado ; CHECK: # %bb.0: ; CHECK-NEXT: ldbrx 3, 0, 3 ; CHECK-NEXT: ldbrx 4, 0, 4 -; CHECK-NEXT: subc 5, 4, 3 -; CHECK-NEXT: subfe 5, 4, 4 -; CHECK-NEXT: subc 4, 3, 4 -; CHECK-NEXT: subfe 3, 3, 3 -; CHECK-NEXT: neg 5, 5 +; CHECK-NEXT: cmpld 3, 4 +; CHECK-NEXT: subc 3, 4, 3 +; CHECK-NEXT: subfe 3, 4, 4 +; CHECK-NEXT: li 4, -1 ; CHECK-NEXT: neg 3, 3 -; CHECK-NEXT: sub 3, 5, 3 +; CHECK-NEXT: isellt 3, 4, 3 ; CHECK-NEXT: extsw 3, 3 ; CHECK-NEXT: blr %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 8) @@ -24,12 +23,11 @@ define signext i32 @memcmp4(ptr nocapture readonly %buffer1, ptr nocapture reado ; CHECK: # 
%bb.0: ; CHECK-NEXT: lwbrx 3, 0, 3 ; CHECK-NEXT: lwbrx 4, 0, 4 +; CHECK-NEXT: cmplw 3, 4 ; CHECK-NEXT: sub 5, 4, 3 -; CHECK-NEXT: sub 3, 3, 4 +; CHECK-NEXT: li 3, -1 ; CHECK-NEXT: rldicl 5, 5, 1, 63 -; CHECK-NEXT: rldicl 3, 3, 1, 63 -; CHECK-NEXT: sub 3, 5, 3 -; CHECK-NEXT: extsw 3, 3 +; CHECK-NEXT: isellt 3, 3, 5 ; CHECK-NEXT: blr %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 4) ret i32 %call diff --git a/llvm/test/CodeGen/PowerPC/memcmpIR.ll b/llvm/test/CodeGen/PowerPC/memcmpIR.ll index 0a8bec7dc0e3f..b57d2b5116b77 100644 --- a/llvm/test/CodeGen/PowerPC/memcmpIR.ll +++ b/llvm/test/CodeGen/PowerPC/memcmpIR.ll @@ -59,22 +59,14 @@ define signext i32 @test2(ptr nocapture readonly %buffer1, ptr nocapture readonl ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i32, ptr ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD1]]) ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD2]]) - ; CHECK-NEXT: [[CMP1:%[0-9]+]] = icmp ugt i32 [[BSWAP1]], [[BSWAP2]] - ; CHECK-NEXT: [[CMP2:%[0-9]+]] = icmp ult i32 [[BSWAP1]], [[BSWAP2]] - ; CHECK-NEXT: [[Z1:%[0-9]+]] = zext i1 [[CMP1]] to i32 - ; CHECK-NEXT: [[Z2:%[0-9]+]] = zext i1 [[CMP2]] to i32 - ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i32 [[Z1]], [[Z2]] - ; CHECK-NEXT: ret i32 [[SUB]] + ; CHECK-NEXT: [[UCMP:%[0-9]+]] = call i32 @llvm.ucmp.i32.i32(i32 [[BSWAP1]], i32 [[BSWAP2]]) + ; CHECK-NEXT: ret i32 [[UCMP]] ; CHECK-BE-LABEL: @test2( ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i32, ptr ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i32, ptr - ; CHECK-BE-NEXT: [[CMP1:%[0-9]+]] = icmp ugt i32 [[LOAD1]], [[LOAD2]] - ; CHECK-BE-NEXT: [[CMP2:%[0-9]+]] = icmp ult i32 [[LOAD1]], [[LOAD2]] - ; CHECK-BE-NEXT: [[Z1:%[0-9]+]] = zext i1 [[CMP1]] to i32 - ; CHECK-BE-NEXT: [[Z2:%[0-9]+]] = zext i1 [[CMP2]] to i32 - ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i32 [[Z1]], [[Z2]] - ; CHECK-BE-NEXT: ret i32 [[SUB]] + ; CHECK-BE-NEXT: [[UCMP:%[0-9]+]] = call i32 @llvm.ucmp.i32.i32(i32 [[LOAD1]], i32 [[LOAD2]]) + ; 
CHECK-BE-NEXT: ret i32 [[UCMP]] entry: %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 4) diff --git a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll index b9a27b9d0c9e7..829fdd5592683 100644 --- a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll +++ b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll @@ -2648,9 +2648,9 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a1, 0(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_4: @@ -2661,9 +2661,9 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a1, a1, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_4: @@ -2672,9 +2672,9 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a1, 0(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a2, a0, a1 
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_4: @@ -2685,9 +2685,9 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a1, a1, 32 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_4: @@ -3462,9 +3462,9 @@ define i32 @memcmp_size_8(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a1, 0(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_8: @@ -3495,9 +3495,9 @@ define i32 @memcmp_size_8(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a1, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret ; ; 
CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_8: diff --git a/llvm/test/CodeGen/RISCV/memcmp.ll b/llvm/test/CodeGen/RISCV/memcmp.ll index 629a9298ee469..bc6cf0a858be8 100644 --- a/llvm/test/CodeGen/RISCV/memcmp.ll +++ b/llvm/test/CodeGen/RISCV/memcmp.ll @@ -3344,9 +3344,9 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a1, 0(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_4: @@ -3357,9 +3357,9 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a1, a1, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_4: @@ -3368,9 +3368,9 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a1, 0(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sub a0, a0, a2 ; 
CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_4: @@ -3381,9 +3381,9 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a1, a1, 32 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_4: @@ -4158,9 +4158,9 @@ define i32 @memcmp_size_8(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a1, 0(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_8: @@ -4191,9 +4191,9 @@ define i32 @memcmp_size_8(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a1, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_8: diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll 
b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll index ee5fd78c64379..62935f7e372b3 100644 --- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll +++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll @@ -193,13 +193,13 @@ define i32 @length4(ptr %X, ptr %Y) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl (%ecx), %ecx -; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%eax), %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: bswapl %eax +; X86-NEXT: cmpl %eax, %ecx ; X86-NEXT: seta %al -; X86-NEXT: sbbl $0, %eax +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movsbl %al, %eax ; X86-NEXT: retl %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind ret i32 %m diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll index a46f9ed3d3798..9bbd335a903be 100644 --- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll +++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll @@ -179,14 +179,14 @@ define i1 @length3_eq(ptr %X, ptr %Y) nounwind { define i32 @length4(ptr %X, ptr %Y) nounwind { ; X64-LABEL: length4: ; X64: # %bb.0: -; X64-NEXT: movl (%rdi), %ecx -; X64-NEXT: movl (%rsi), %edx +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax ; X64-NEXT: bswapl %ecx -; X64-NEXT: bswapl %edx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: cmpl %ecx, %eax ; X64-NEXT: seta %al -; X64-NEXT: sbbl $0, %eax +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind ret i32 %m @@ -391,14 +391,14 @@ define i1 @length7_lt(ptr %X, ptr %Y) nounwind { define i32 @length8(ptr %X, ptr %Y) nounwind { ; X64-LABEL: length8: ; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: bswapq 
%rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: seta %al -; X64-NEXT: sbbl $0, %eax +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind ret i32 %m diff --git a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll index 4a9643c0f4fc8..3a16ab656b11f 100644 --- a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll +++ b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll @@ -122,13 +122,13 @@ define i32 @length4(ptr %X, ptr %Y) nounwind optsize { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl (%ecx), %ecx -; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%eax), %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: bswapl %eax +; X86-NEXT: cmpl %eax, %ecx ; X86-NEXT: seta %al -; X86-NEXT: sbbl $0, %eax +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movsbl %al, %eax ; X86-NEXT: retl %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind ret i32 %m diff --git a/llvm/test/CodeGen/X86/memcmp-optsize.ll b/llvm/test/CodeGen/X86/memcmp-optsize.ll index 4e27301436c34..0f817b2c727c3 100644 --- a/llvm/test/CodeGen/X86/memcmp-optsize.ll +++ b/llvm/test/CodeGen/X86/memcmp-optsize.ll @@ -107,14 +107,14 @@ define i1 @length3_eq(ptr %X, ptr %Y) nounwind optsize { define i32 @length4(ptr %X, ptr %Y) nounwind optsize { ; X64-LABEL: length4: ; X64: # %bb.0: -; X64-NEXT: movl (%rdi), %ecx -; X64-NEXT: movl (%rsi), %edx +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax ; X64-NEXT: bswapl %ecx -; X64-NEXT: bswapl %edx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: cmpl %ecx, %eax ; X64-NEXT: seta %al -; X64-NEXT: sbbl $0, %eax +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax ; X64-NEXT: retq %m = tail call 
i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind ret i32 %m @@ -186,14 +186,14 @@ define i1 @length5_eq(ptr %X, ptr %Y) nounwind optsize { define i32 @length8(ptr %X, ptr %Y) nounwind optsize { ; X64-LABEL: length8: ; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: seta %al -; X64-NEXT: sbbl $0, %eax +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind ret i32 %m diff --git a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll index bdb50f5b60c49..35fd373536bd3 100644 --- a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll +++ b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll @@ -122,13 +122,13 @@ define i32 @length4(ptr %X, ptr %Y) nounwind !prof !14 { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl (%ecx), %ecx -; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%eax), %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: bswapl %eax +; X86-NEXT: cmpl %eax, %ecx ; X86-NEXT: seta %al -; X86-NEXT: sbbl $0, %eax +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movsbl %al, %eax ; X86-NEXT: retl %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind ret i32 %m diff --git a/llvm/test/CodeGen/X86/memcmp-pgso.ll b/llvm/test/CodeGen/X86/memcmp-pgso.ll index 9347e54220220..f638852923187 100644 --- a/llvm/test/CodeGen/X86/memcmp-pgso.ll +++ b/llvm/test/CodeGen/X86/memcmp-pgso.ll @@ -107,14 +107,14 @@ define i1 @length3_eq(ptr %X, ptr %Y) nounwind !prof !14 { define i32 @length4(ptr %X, ptr %Y) nounwind !prof !14 { ; X64-LABEL: length4: ; X64: # %bb.0: -; X64-NEXT: movl (%rdi), %ecx -; X64-NEXT: movl (%rsi), %edx +; 
X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax ; X64-NEXT: bswapl %ecx -; X64-NEXT: bswapl %edx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: cmpl %ecx, %eax ; X64-NEXT: seta %al -; X64-NEXT: sbbl $0, %eax +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind ret i32 %m @@ -186,14 +186,14 @@ define i1 @length5_eq(ptr %X, ptr %Y) nounwind !prof !14 { define i32 @length8(ptr %X, ptr %Y) nounwind !prof !14 { ; X64-LABEL: length8: ; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: seta %al -; X64-NEXT: sbbl $0, %eax +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind ret i32 %m diff --git a/llvm/test/CodeGen/X86/memcmp-x32.ll b/llvm/test/CodeGen/X86/memcmp-x32.ll index ad9f2a30d75bb..4a3f5a608e585 100644 --- a/llvm/test/CodeGen/X86/memcmp-x32.ll +++ b/llvm/test/CodeGen/X86/memcmp-x32.ll @@ -221,13 +221,13 @@ define i32 @length4(ptr %X, ptr %Y) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl (%ecx), %ecx -; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%eax), %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: bswapl %eax +; X86-NEXT: cmpl %eax, %ecx ; X86-NEXT: seta %al -; X86-NEXT: sbbl $0, %eax +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movsbl %al, %eax ; X86-NEXT: retl %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind ret i32 %m diff --git a/llvm/test/CodeGen/X86/memcmp.ll b/llvm/test/CodeGen/X86/memcmp.ll index 8fe1a581cd9c2..014db33160606 100644 --- 
a/llvm/test/CodeGen/X86/memcmp.ll +++ b/llvm/test/CodeGen/X86/memcmp.ll @@ -205,14 +205,14 @@ define i1 @length3_eq(ptr %X, ptr %Y) nounwind { define i32 @length4(ptr %X, ptr %Y) nounwind { ; X64-LABEL: length4: ; X64: # %bb.0: -; X64-NEXT: movl (%rdi), %ecx -; X64-NEXT: movl (%rsi), %edx +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax ; X64-NEXT: bswapl %ecx -; X64-NEXT: bswapl %edx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: cmpl %ecx, %eax ; X64-NEXT: seta %al -; X64-NEXT: sbbl $0, %eax +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind ret i32 %m @@ -417,14 +417,14 @@ define i1 @length7_eq(ptr %X, ptr %Y) nounwind { define i32 @length8(ptr %X, ptr %Y) nounwind { ; X64-LABEL: length8: ; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: seta %al -; X64-NEXT: sbbl $0, %eax +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind ret i32 %m diff --git a/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll b/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll index 92439691e1873..179b5b0a3dbf5 100644 --- a/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll +++ b/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll @@ -45,11 +45,7 @@ define i32 @cmp3(ptr nocapture readonly %x, ptr nocapture readonly %y) { ; CHECK-NEXT: [[TMP4:%.*]] = zext i24 [[TMP2]] to i32 ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]]) -; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp ult i32 
[[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = zext i1 [[TMP7]] to i32 -; CHECK-NEXT: [[TMP10:%.*]] = zext i1 [[TMP8]] to i32 -; CHECK-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.ucmp.i32.i32(i32 [[TMP5]], i32 [[TMP6]]) ; CHECK-NEXT: ret i32 [[TMP11]] ; %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 3) @@ -63,11 +59,7 @@ define i32 @cmp4(ptr nocapture readonly %x, ptr nocapture readonly %y) { ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[Y]], align 1 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]]) ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) -; CHECK-NEXT: [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = zext i1 [[TMP5]] to i32 -; CHECK-NEXT: [[TMP8:%.*]] = zext i1 [[TMP6]] to i32 -; CHECK-NEXT: [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.ucmp.i32.i32(i32 [[TMP3]], i32 [[TMP4]]) ; CHECK-NEXT: ret i32 [[TMP9]] ; %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 4) @@ -83,11 +75,7 @@ define i32 @cmp5(ptr nocapture readonly %x, ptr nocapture readonly %y) { ; CHECK-NEXT: [[TMP4:%.*]] = zext i40 [[TMP2]] to i64 ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP4]]) -; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i64 [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp ult i64 [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = zext i1 [[TMP7]] to i32 -; CHECK-NEXT: [[TMP10:%.*]] = zext i1 [[TMP8]] to i32 -; CHECK-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.ucmp.i32.i64(i64 [[TMP5]], i64 [[TMP6]]) ; CHECK-NEXT: ret i32 [[TMP11]] ; %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 5) @@ -103,11 +91,7 @@ define i32 @cmp6(ptr nocapture readonly %x, ptr nocapture readonly %y) { ; CHECK-NEXT: [[TMP4:%.*]] = zext i48 [[TMP2]] to 
i64 ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP4]]) -; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i64 [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp ult i64 [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = zext i1 [[TMP7]] to i32 -; CHECK-NEXT: [[TMP10:%.*]] = zext i1 [[TMP8]] to i32 -; CHECK-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.ucmp.i32.i64(i64 [[TMP5]], i64 [[TMP6]]) ; CHECK-NEXT: ret i32 [[TMP11]] ; %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 6) @@ -155,11 +139,7 @@ define i32 @cmp8(ptr nocapture readonly %x, ptr nocapture readonly %y) { ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[Y]], align 1 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]]) ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) -; CHECK-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = zext i1 [[TMP5]] to i32 -; CHECK-NEXT: [[TMP8:%.*]] = zext i1 [[TMP6]] to i32 -; CHECK-NEXT: [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.ucmp.i32.i64(i64 [[TMP3]], i64 [[TMP4]]) ; CHECK-NEXT: ret i32 [[TMP9]] ; %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 8) diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll index d71ae8be19b66..0507ec9de542e 100644 --- a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll +++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll @@ -71,11 +71,7 @@ define i32 @cmp4(ptr nocapture readonly %x, ptr nocapture readonly %y) { ; X32-NEXT: [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1 ; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) ; X32-NEXT: [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]]) -; X32-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP5]], [[TMP6]] -; X32-NEXT: 
[[TMP8:%.*]] = icmp ult i32 [[TMP5]], [[TMP6]] -; X32-NEXT: [[TMP9:%.*]] = zext i1 [[TMP7]] to i32 -; X32-NEXT: [[TMP10:%.*]] = zext i1 [[TMP8]] to i32 -; X32-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]] +; X32-NEXT: [[TMP11:%.*]] = call i32 @llvm.ucmp.i32.i32(i32 [[TMP5]], i32 [[TMP6]]) ; X32-NEXT: ret i32 [[TMP11]] ; %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 4) diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll index f686e29975564..86dc3e5245f24 100644 --- a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll +++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll @@ -73,11 +73,7 @@ define i32 @cmp4(ptr nocapture readonly %x, ptr nocapture readonly %y) { ; X64-NEXT: [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1 ; X64-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) ; X64-NEXT: [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]]) -; X64-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP5]], [[TMP6]] -; X64-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP5]], [[TMP6]] -; X64-NEXT: [[TMP9:%.*]] = zext i1 [[TMP7]] to i32 -; X64-NEXT: [[TMP10:%.*]] = zext i1 [[TMP8]] to i32 -; X64-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]] +; X64-NEXT: [[TMP11:%.*]] = call i32 @llvm.ucmp.i32.i32(i32 [[TMP5]], i32 [[TMP6]]) ; X64-NEXT: ret i32 [[TMP11]] ; %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 4) @@ -189,11 +185,7 @@ define i32 @cmp8(ptr nocapture readonly %x, ptr nocapture readonly %y) { ; X64-NEXT: [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1 ; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) ; X64-NEXT: [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP4]]) -; X64-NEXT: [[TMP7:%.*]] = icmp ugt i64 [[TMP5]], [[TMP6]] -; X64-NEXT: [[TMP8:%.*]] = icmp ult i64 [[TMP5]], [[TMP6]] -; X64-NEXT: [[TMP9:%.*]] = zext i1 [[TMP7]] to i32 -; X64-NEXT: [[TMP10:%.*]] = zext i1 [[TMP8]] to i32 -; X64-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]] +; X64-NEXT: [[TMP11:%.*]] = call i32 
@llvm.ucmp.i32.i64(i64 [[TMP5]], i64 [[TMP6]]) ; X64-NEXT: ret i32 [[TMP11]] ; %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 8) From 715dcb2310a4378fdf324cd3d3b47d6f160842aa Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 3 Jan 2025 09:19:54 -0800 Subject: [PATCH 374/567] [ExpandMemCmp] Use m_SpecificInt to simplify code. NFC (#121532) --- llvm/lib/CodeGen/ExpandMemCmp.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/CodeGen/ExpandMemCmp.cpp index 6dc3e04ac802c..cc75a01c6477a 100644 --- a/llvm/lib/CodeGen/ExpandMemCmp.cpp +++ b/llvm/lib/CodeGen/ExpandMemCmp.cpp @@ -669,15 +669,15 @@ Value *MemCmpExpansion::getMemCmpOneBlock() { if (CI->hasOneUser()) { auto *UI = cast(*CI->user_begin()); CmpPredicate Pred = ICmpInst::Predicate::BAD_ICMP_PREDICATE; - uint64_t Shift; bool NeedsZExt = false; // This is a special case because instead of checking if the result is less // than zero: // bool result = memcmp(a, b, NBYTES) < 0; // Compiler is clever enough to generate the following code: // bool result = memcmp(a, b, NBYTES) >> 31; - if (match(UI, m_LShr(m_Value(), m_ConstantInt(Shift))) && - Shift == (CI->getType()->getIntegerBitWidth() - 1)) { + if (match(UI, + m_LShr(m_Value(), + m_SpecificInt(CI->getType()->getIntegerBitWidth() - 1)))) { Pred = ICmpInst::ICMP_SLT; NeedsZExt = true; } else { From c19f0f005a1ccf21bd2f0656f90455a55413a32f Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 3 Jan 2025 09:20:30 -0800 Subject: [PATCH 375/567] [PatternMatch] Make m_SpecificMask pass expected mask by Value. NFC (#121527) Unlike m_Mask, we don't need to modify a variable owned by the caller so we should pass the ArrayRef by value or const reference. 
--- llvm/include/llvm/IR/PatternMatch.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h index cc0e8d598ff1e..b37f967191aaa 100644 --- a/llvm/include/llvm/IR/PatternMatch.h +++ b/llvm/include/llvm/IR/PatternMatch.h @@ -1844,9 +1844,9 @@ struct m_ZeroMask { }; struct m_SpecificMask { - ArrayRef &MaskRef; - m_SpecificMask(ArrayRef &MaskRef) : MaskRef(MaskRef) {} - bool match(ArrayRef Mask) { return MaskRef == Mask; } + ArrayRef Val; + m_SpecificMask(ArrayRef Val) : Val(Val) {} + bool match(ArrayRef Mask) { return Val == Mask; } }; struct m_SplatOrPoisonMask { From e6f76378c20bebec85f66c1574bb6bb928a79025 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 3 Jan 2025 09:25:08 -0800 Subject: [PATCH 376/567] EntryExitInstrumenter: skip available_externally linkage gnu::always_inline functions, which lower to available_externally, may not have definitions external to the module. -finstrument-function family options instrumentating the function (which takes the function address) may lead to a linker error if the function is not optimized out, e.g. ``` // -std=c++17 or above with libstdc++ #include std::string str; int main() {} ``` Simplified reproduce: ``` template struct A { [[gnu::always_inline]] T bar(T a) { return a * 2; } }; extern template class A; int main(int argc, char **argv) { return A().bar(argc); } ``` GCC's -finstrument-function instrumentation skips such functions (https://gcc.gnu.org/PR78333). Let's skip such functions (available_externally) as well. 
Fix #50742 Pull Request: https://github.com/llvm/llvm-project/pull/121452 --- llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp | 6 ++++++ llvm/test/Transforms/EntryExitInstrumenter/mcount.ll | 7 +++++++ 2 files changed, 13 insertions(+) diff --git a/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp b/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp index 5b33edd51cffa..d47f1b4253b54 100644 --- a/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp +++ b/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp @@ -103,6 +103,12 @@ static bool runOnFunction(Function &F, bool PostInlining) { if (F.hasFnAttribute(Attribute::Naked)) return false; + // available_externally functions may not have definitions external to the + // module (e.g. gnu::always_inline). Instrumenting them might lead to linker + // errors if they are optimized out. Skip them like GCC. + if (F.hasAvailableExternallyLinkage()) + return false; + StringRef EntryAttr = PostInlining ? "instrument-function-entry-inlined" : "instrument-function-entry"; diff --git a/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll b/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll index bd5f4c2b51a89..56ccfb9ed2e7e 100644 --- a/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll +++ b/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll @@ -129,6 +129,13 @@ define void @naked() naked { ret void } +define available_externally void @always_inline() { +; CHECK-LABEL: define available_externally void @always_inline() { +; CHECK-NEXT: ret void +; + ret void +} + ; The attributes are "consumed" when the instrumentation is inserted. ; CHECK: attributes ; CHECK-NOT: instrument-function From ee9be864bcc5e3cc89f5f23485db2285ad7119f7 Mon Sep 17 00:00:00 2001 From: Mingming Liu Date: Fri, 3 Jan 2025 09:38:04 -0800 Subject: [PATCH 377/567] [NFC] Fix a typo (#121545) `InputSectionBase::relsOrRelas` make at most one array-ref non-empty. 
One-off counter (as debugging log) shows the number of empty member containers is 2 or 3 in a real build. Fix the typo. --- lld/ELF/InputSection.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lld/ELF/InputSection.h b/lld/ELF/InputSection.h index 60988dfacbd74..98e7d5d4ff0cd 100644 --- a/lld/ELF/InputSection.h +++ b/lld/ELF/InputSection.h @@ -33,7 +33,7 @@ class SyntheticSection; template class ObjFile; class OutputSection; -// Returned by InputSectionBase::relsOrRelas. At most one member is empty. +// Returned by InputSectionBase::relsOrRelas. At least two members are empty. template struct RelsOrRelas { Relocs rels; Relocs relas; From 82fdd103f9484ce85ec64e3d013cfd8000e22fea Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 3 Jan 2025 16:55:27 +0000 Subject: [PATCH 378/567] [X86] Add test coverage for #107423 --- llvm/test/CodeGen/X86/pr107423.ll | 74 +++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 llvm/test/CodeGen/X86/pr107423.ll diff --git a/llvm/test/CodeGen/X86/pr107423.ll b/llvm/test/CodeGen/X86/pr107423.ll new file mode 100644 index 0000000000000..d5119d45f97c0 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr107423.ll @@ -0,0 +1,74 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=sandybridge | FileCheck %s + +define void @PR107423(<64 x i8> %arg, ptr %p0) { +; CHECK-LABEL: PR107423: +; CHECK: # %bb.0: +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 +; CHECK-NEXT: vpsllw $8, %xmm2, %xmm2 +; CHECK-NEXT: vpsllw $8, %xmm1, %xmm3 +; CHECK-NEXT: vpaddb %xmm2, %xmm3, %xmm3 +; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm2 +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm4 +; CHECK-NEXT: vpaddb %xmm1, %xmm4, %xmm1 +; CHECK-NEXT: vpaddb %xmm4, %xmm0, %xmm4 +; CHECK-NEXT: vpsllw $8, %xmm4, %xmm4 +; CHECK-NEXT: vpaddb %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vpsllw $8, %xmm1, %xmm1 +; CHECK-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: 
vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vpor %xmm3, %xmm2, %xmm2 +; CHECK-NEXT: vpsllw $8, %xmm0, %xmm0 +; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovdqu %xmm0, 16(%rdi) +; CHECK-NEXT: vmovdqu %xmm2, (%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %i3 = bitcast <64 x i8> %arg to <32 x i16> + %i4 = shufflevector <32 x i16> %i3, <32 x i16> poison, <8 x i32> + %i5 = shl <8 x i16> %i4, + %i6 = bitcast <8 x i16> %i5 to <16 x i8> + %i7 = shufflevector <64 x i8> %arg, <64 x i8> poison, <64 x i32> + %i8 = shufflevector <64 x i8> %arg, <64 x i8> poison, <64 x i32> + %i9 = shufflevector <64 x i8> %i7, <64 x i8> %i8, <64 x i32> + %i10 = shufflevector <16 x i8> %i6, <16 x i8> poison, <64 x i32> + %i11 = shufflevector <64 x i8> %i10, <64 x i8> %i9, <64 x i32> + %i12 = bitcast <64 x i8> %i11 to <32 x i16> + %i13 = shl <32 x i16> %i12, + %i14 = bitcast <32 x i16> %i13 to <64 x i8> + %i15 = shufflevector <64 x i8> %i14, <64 x i8> poison, <16 x i32> + %i16 = shufflevector <64 x i8> %i11, <64 x i8> poison, <64 x i32> + %i17 = shufflevector <16 x i8> %i6, <16 x i8> poison, <64 x i32> + %i18 = shufflevector <64 x i8> %i16, <64 x i8> %i17, <64 x i32> + %i19 = shufflevector <16 x i8> %i15, <16 x i8> poison, <64 x i32> + %i20 = shufflevector <64 x i8> %i19, <64 x i8> %i18, <64 x i32> + %i21 = add <64 x i8> %i20, %i11 + %i22 = bitcast <64 x i8> %i21 to <32 x i16> + %i23 = shl <32 x i16> %i22, + %i24 = bitcast <32 x i16> %i23 to <64 x i8> + %i25 = shufflevector <64 x i8> %i24, <64 x i8> poison, <16 x i32> + %i26 = bitcast <32 x i16> %i23 to <64 x i8> + %i28 = shufflevector <64 x i8> %i26, <64 x i8> poison, <16 x i32> + %i32 = shufflevector <64 x i8> %i21, <64 x i8> poison, <64 x i32> + %i33 = shufflevector <16 x i8> %i25, <16 x i8> poison, <64 x i32> + %i34 = shufflevector <64 x i8> %i32, <64 x i8> %i33, <64 x i32> + %i35 = shufflevector <16 x i8> %i28, <16 x i8> poison, <64 x i32> + %i36 = shufflevector <64 x i8> %i35, <64 x i8> %i34, <64 x i32> + %i37 = add <64 x 
i8> %i36, %i21 + %i38 = bitcast <64 x i8> %i37 to <32 x i16> + %i39 = shufflevector <32 x i16> %i38, <32 x i16> poison, <8 x i32> + %i40 = shl <8 x i16> %i39, + %i41 = bitcast <8 x i16> %i40 to <16 x i8> + %i42 = shufflevector <16 x i8> %i41, <16 x i8> poison, <64 x i32> + %i43 = shufflevector <64 x i8> %i42, <64 x i8> %i37, <64 x i32> + %i44 = bitcast <64 x i8> %i43 to <32 x i16> + %i45 = shufflevector <32 x i16> %i44, <32 x i16> poison, <8 x i32> + %i46 = shl <8 x i16> %i45, + %i47 = bitcast <8 x i16> %i46 to <16 x i8> + %i48 = shufflevector <16 x i8> %i47, <16 x i8> poison, <64 x i32> + %i49 = shufflevector <64 x i8> %i43, <64 x i8> %i48, <32 x i32> + %i50 = shufflevector <64 x i8> %i37, <64 x i8> poison, <32 x i32> + %i51 = add <32 x i8> %i49, %i50 + store <32 x i8> %i51, ptr %p0, align 1 + ret void +} From 13cf5c9c227a502f86f8c0e3c7d5fe147bc91b8b Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 3 Jan 2025 09:48:38 -0800 Subject: [PATCH 379/567] [RISCV] Re-generate memcmp test checks missed in #121530. NFC A patch landed to these tests while #121530 was in review and I forgot to rebase. 
--- llvm/test/CodeGen/RISCV/memcmp-optsize.ll | 48 +++++++++++------------ llvm/test/CodeGen/RISCV/memcmp.ll | 48 +++++++++++------------ 2 files changed, 48 insertions(+), 48 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll index 829fdd5592683..f9086ba9d6354 100644 --- a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll +++ b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll @@ -2459,9 +2459,9 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV32-ZBB-NEXT: or a1, a1, a3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_3: @@ -2478,9 +2478,9 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a1, a1, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_3: @@ -2493,9 +2493,9 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: pack a1, a3, a1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a0, a1 -; 
CHECK-UNALIGNED-RV32-ZBKB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_3: @@ -2512,9 +2512,9 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a1, a1, 32 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_3: @@ -2845,9 +2845,9 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a1, a1, a3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_5: @@ -2878,9 +2878,9 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a1, a3, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a1, 
a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_5: @@ -3044,9 +3044,9 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a1, a1, a3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_6: @@ -3083,9 +3083,9 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a1, a3, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_6: diff --git a/llvm/test/CodeGen/RISCV/memcmp.ll b/llvm/test/CodeGen/RISCV/memcmp.ll index bc6cf0a858be8..831e21af43807 100644 --- a/llvm/test/CodeGen/RISCV/memcmp.ll +++ b/llvm/test/CodeGen/RISCV/memcmp.ll @@ -3155,9 +3155,9 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBB-NEXT: or a1, a1, a3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a2, a0, a1 +; 
CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_3: @@ -3174,9 +3174,9 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a1, a1, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_3: @@ -3189,9 +3189,9 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: pack a1, a3, a1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_3: @@ -3208,9 +3208,9 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a1, a1, 32 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret ; ; 
CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_3: @@ -3541,9 +3541,9 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a1, a1, a3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_5: @@ -3574,9 +3574,9 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a1, a3, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_5: @@ -3740,9 +3740,9 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a1, a1, a3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_6: @@ -3779,9 +3779,9 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a1, a3, a1 ; 
CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_6: From bad0f98bda1ca0b8a106b14b9cce98bf1dbc15cc Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 2 Jan 2025 17:47:31 -0800 Subject: [PATCH 380/567] [ExpandMemCmp][AArch][RISCV][X86] Pre-commit tests for recognizing canonical form of (icmp sle/sge X, 0). NFC Pre-commit for #121540. --- llvm/test/CodeGen/AArch64/memcmp.ll | 378 ++++++------ llvm/test/CodeGen/RISCV/memcmp.ll | 910 +++++++++++++++++++++++----- llvm/test/CodeGen/X86/memcmp.ll | 126 ++-- 3 files changed, 1047 insertions(+), 367 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/memcmp.ll b/llvm/test/CodeGen/AArch64/memcmp.ll index 4f58fd74d7d50..864f38468842a 100644 --- a/llvm/test/CodeGen/AArch64/memcmp.ll +++ b/llvm/test/CodeGen/AArch64/memcmp.ll @@ -257,6 +257,42 @@ define i1 @length4_gt(ptr %X, ptr %Y) nounwind { ret i1 %c } +define i1 @length4_le(ptr %X, ptr %Y) nounwind { +; CHECK-LABEL: length4_le: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: ldr w9, [x1] +; CHECK-NEXT: rev w8, w8 +; CHECK-NEXT: rev w9, w9 +; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: cset w8, hi +; CHECK-NEXT: csinv w8, w8, wzr, hs +; CHECK-NEXT: cmp w8, #1 +; CHECK-NEXT: cset w0, lt +; CHECK-NEXT: ret + %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind + %c = icmp slt i32 %m, 1 + ret i1 %c +} + +define i1 @length4_ge(ptr %X, ptr %Y) nounwind { +; CHECK-LABEL: length4_ge: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: ldr w9, [x1] +; CHECK-NEXT: rev w8, w8 +; CHECK-NEXT: rev w9, w9 +; CHECK-NEXT: cmp w8, w9 +; 
CHECK-NEXT: cset w8, hi +; CHECK-NEXT: csinv w8, w8, wzr, hs +; CHECK-NEXT: mvn w8, w8 +; CHECK-NEXT: lsr w0, w8, #31 +; CHECK-NEXT: ret + %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind + %c = icmp sgt i32 %m, -1 + ret i1 %c +} + define i1 @length4_eq_const(ptr %X) nounwind { ; CHECK-LABEL: length4_eq_const: ; CHECK: // %bb.0: @@ -371,18 +407,18 @@ define i32 @length7(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev w8, w8 ; CHECK-NEXT: rev w9, w9 ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: b.ne .LBB24_3 +; CHECK-NEXT: b.ne .LBB26_3 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldur w8, [x0, #3] ; CHECK-NEXT: ldur w9, [x1, #3] ; CHECK-NEXT: rev w8, w8 ; CHECK-NEXT: rev w9, w9 ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: b.ne .LBB24_3 +; CHECK-NEXT: b.ne .LBB26_3 ; CHECK-NEXT: // %bb.2: ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB24_3: // %res_block +; CHECK-NEXT: .LBB26_3: // %res_block ; CHECK-NEXT: cmp w8, w9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w0, w8, hs @@ -399,18 +435,18 @@ define i1 @length7_lt(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev w8, w8 ; CHECK-NEXT: rev w9, w9 ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: b.ne .LBB25_3 +; CHECK-NEXT: b.ne .LBB27_3 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldur w8, [x0, #3] ; CHECK-NEXT: ldur w9, [x1, #3] ; CHECK-NEXT: rev w8, w8 ; CHECK-NEXT: rev w9, w9 ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: b.ne .LBB25_3 +; CHECK-NEXT: b.ne .LBB27_3 ; CHECK-NEXT: // %bb.2: ; CHECK-NEXT: lsr w0, wzr, #31 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB25_3: // %res_block +; CHECK-NEXT: .LBB27_3: // %res_block ; CHECK-NEXT: cmp w8, w9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w8, w8, hs @@ -489,13 +525,13 @@ define i32 @length9(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB30_2 +; CHECK-NEXT: b.ne .LBB32_2 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldrb w8, [x0, #8] ; CHECK-NEXT: ldrb w9, [x1, 
#8] ; CHECK-NEXT: sub w0, w8, w9 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB30_2: // %res_block +; CHECK-NEXT: .LBB32_2: // %res_block ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w0, w8, hs ; CHECK-NEXT: ret @@ -527,7 +563,7 @@ define i32 @length10(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB32_3 +; CHECK-NEXT: b.ne .LBB34_3 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldrh w8, [x0, #8] ; CHECK-NEXT: ldrh w9, [x1, #8] @@ -536,11 +572,11 @@ define i32 @length10(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: lsr w8, w8, #16 ; CHECK-NEXT: lsr w9, w9, #16 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB32_3 +; CHECK-NEXT: b.ne .LBB34_3 ; CHECK-NEXT: // %bb.2: ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB32_3: // %res_block +; CHECK-NEXT: .LBB34_3: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w0, w8, hs @@ -573,18 +609,18 @@ define i32 @length11(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB34_3 +; CHECK-NEXT: b.ne .LBB36_3 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldur x8, [x0, #3] ; CHECK-NEXT: ldur x9, [x1, #3] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB34_3 +; CHECK-NEXT: b.ne .LBB36_3 ; CHECK-NEXT: // %bb.2: ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB34_3: // %res_block +; CHECK-NEXT: .LBB36_3: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w0, w8, hs @@ -633,18 +669,18 @@ define i32 @length12(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB37_3 +; CHECK-NEXT: b.ne .LBB39_3 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr w8, [x0, #8] ; CHECK-NEXT: ldr w9, [x1, #8] ; CHECK-NEXT: rev w8, w8 ; CHECK-NEXT: rev w9, 
w9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB37_3 +; CHECK-NEXT: b.ne .LBB39_3 ; CHECK-NEXT: // %bb.2: ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB37_3: // %res_block +; CHECK-NEXT: .LBB39_3: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w0, w8, hs @@ -693,18 +729,18 @@ define i32 @length15(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB40_3 +; CHECK-NEXT: b.ne .LBB42_3 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldur x8, [x0, #7] ; CHECK-NEXT: ldur x9, [x1, #7] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB40_3 +; CHECK-NEXT: b.ne .LBB42_3 ; CHECK-NEXT: // %bb.2: ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB40_3: // %res_block +; CHECK-NEXT: .LBB42_3: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w0, w8, hs @@ -721,18 +757,18 @@ define i1 @length15_lt(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB41_3 +; CHECK-NEXT: b.ne .LBB43_3 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldur x8, [x0, #7] ; CHECK-NEXT: ldur x9, [x1, #7] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB41_3 +; CHECK-NEXT: b.ne .LBB43_3 ; CHECK-NEXT: // %bb.2: ; CHECK-NEXT: lsr w0, wzr, #31 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB41_3: // %res_block +; CHECK-NEXT: .LBB43_3: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w8, w8, hs @@ -753,7 +789,7 @@ define i32 @length15_const(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: movk x8, #12594, lsl #48 ; CHECK-NEXT: cmp x9, x8 -; CHECK-NEXT: b.ne .LBB42_3 +; CHECK-NEXT: b.ne .LBB44_3 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: mov x8, #13365 // =0x3435 ; 
CHECK-NEXT: ldur x9, [x0, #7] @@ -762,11 +798,11 @@ define i32 @length15_const(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: movk x8, #14393, lsl #48 ; CHECK-NEXT: cmp x9, x8 -; CHECK-NEXT: b.ne .LBB42_3 +; CHECK-NEXT: b.ne .LBB44_3 ; CHECK-NEXT: // %bb.2: ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB42_3: // %res_block +; CHECK-NEXT: .LBB44_3: // %res_block ; CHECK-NEXT: cmp x9, x8 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w0, w8, hs @@ -801,7 +837,7 @@ define i1 @length15_gt_const(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: movk x8, #12594, lsl #48 ; CHECK-NEXT: cmp x9, x8 -; CHECK-NEXT: b.ne .LBB44_3 +; CHECK-NEXT: b.ne .LBB46_3 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: mov x8, #13365 // =0x3435 ; CHECK-NEXT: ldur x9, [x0, #7] @@ -810,15 +846,15 @@ define i1 @length15_gt_const(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: movk x8, #14393, lsl #48 ; CHECK-NEXT: cmp x9, x8 -; CHECK-NEXT: b.ne .LBB44_3 +; CHECK-NEXT: b.ne .LBB46_3 ; CHECK-NEXT: // %bb.2: ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: b .LBB44_4 -; CHECK-NEXT: .LBB44_3: // %res_block +; CHECK-NEXT: b .LBB46_4 +; CHECK-NEXT: .LBB46_3: // %res_block ; CHECK-NEXT: cmp x9, x8 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w8, w8, hs -; CHECK-NEXT: .LBB44_4: // %endblock +; CHECK-NEXT: .LBB46_4: // %endblock ; CHECK-NEXT: cmp w8, #0 ; CHECK-NEXT: cset w0, gt ; CHECK-NEXT: ret @@ -836,18 +872,18 @@ define i32 @length16(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB45_3 +; CHECK-NEXT: b.ne .LBB47_3 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB45_3 +; CHECK-NEXT: b.ne .LBB47_3 ; CHECK-NEXT: // %bb.2: ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB45_3: 
// %res_block +; CHECK-NEXT: .LBB47_3: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w0, w8, hs @@ -878,18 +914,18 @@ define i1 @length16_lt(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB47_3 +; CHECK-NEXT: b.ne .LBB49_3 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB47_3 +; CHECK-NEXT: b.ne .LBB49_3 ; CHECK-NEXT: // %bb.2: ; CHECK-NEXT: lsr w0, wzr, #31 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB47_3: // %res_block +; CHECK-NEXT: .LBB49_3: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w8, w8, hs @@ -908,22 +944,22 @@ define i1 @length16_gt(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB48_3 +; CHECK-NEXT: b.ne .LBB50_3 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB48_3 +; CHECK-NEXT: b.ne .LBB50_3 ; CHECK-NEXT: // %bb.2: ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: b .LBB48_4 -; CHECK-NEXT: .LBB48_3: // %res_block +; CHECK-NEXT: b .LBB50_4 +; CHECK-NEXT: .LBB50_3: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w8, w8, hs -; CHECK-NEXT: .LBB48_4: // %endblock +; CHECK-NEXT: .LBB50_4: // %endblock ; CHECK-NEXT: cmp w8, #0 ; CHECK-NEXT: cset w0, gt ; CHECK-NEXT: ret @@ -962,25 +998,25 @@ define i32 @length24(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB50_4 +; CHECK-NEXT: b.ne .LBB52_4 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; 
CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB50_4 +; CHECK-NEXT: b.ne .LBB52_4 ; CHECK-NEXT: // %bb.2: // %loadbb2 ; CHECK-NEXT: ldr x8, [x0, #16] ; CHECK-NEXT: ldr x9, [x1, #16] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB50_4 +; CHECK-NEXT: b.ne .LBB52_4 ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB50_4: // %res_block +; CHECK-NEXT: .LBB52_4: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w0, w8, hs @@ -1014,25 +1050,25 @@ define i1 @length24_lt(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB52_4 +; CHECK-NEXT: b.ne .LBB54_4 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB52_4 +; CHECK-NEXT: b.ne .LBB54_4 ; CHECK-NEXT: // %bb.2: // %loadbb2 ; CHECK-NEXT: ldr x8, [x0, #16] ; CHECK-NEXT: ldr x9, [x1, #16] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB52_4 +; CHECK-NEXT: b.ne .LBB54_4 ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: lsr w0, wzr, #31 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB52_4: // %res_block +; CHECK-NEXT: .LBB54_4: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w8, w8, hs @@ -1051,29 +1087,29 @@ define i1 @length24_gt(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB53_4 +; CHECK-NEXT: b.ne .LBB55_4 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB53_4 +; CHECK-NEXT: b.ne .LBB55_4 ; CHECK-NEXT: // %bb.2: // %loadbb2 ; 
CHECK-NEXT: ldr x8, [x0, #16] ; CHECK-NEXT: ldr x9, [x1, #16] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB53_4 +; CHECK-NEXT: b.ne .LBB55_4 ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: b .LBB53_5 -; CHECK-NEXT: .LBB53_4: // %res_block +; CHECK-NEXT: b .LBB55_5 +; CHECK-NEXT: .LBB55_4: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w8, w8, hs -; CHECK-NEXT: .LBB53_5: // %endblock +; CHECK-NEXT: .LBB55_5: // %endblock ; CHECK-NEXT: cmp w8, #0 ; CHECK-NEXT: cset w0, gt ; CHECK-NEXT: ret @@ -1117,32 +1153,32 @@ define i32 @length31(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB55_5 +; CHECK-NEXT: b.ne .LBB57_5 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB55_5 +; CHECK-NEXT: b.ne .LBB57_5 ; CHECK-NEXT: // %bb.2: // %loadbb2 ; CHECK-NEXT: ldr x8, [x0, #16] ; CHECK-NEXT: ldr x9, [x1, #16] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB55_5 +; CHECK-NEXT: b.ne .LBB57_5 ; CHECK-NEXT: // %bb.3: // %loadbb3 ; CHECK-NEXT: ldur x8, [x0, #23] ; CHECK-NEXT: ldur x9, [x1, #23] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB55_5 +; CHECK-NEXT: b.ne .LBB57_5 ; CHECK-NEXT: // %bb.4: ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB55_5: // %res_block +; CHECK-NEXT: .LBB57_5: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w0, w8, hs @@ -1179,32 +1215,32 @@ define i1 @length31_lt(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB57_5 +; CHECK-NEXT: b.ne .LBB59_5 ; CHECK-NEXT: // %bb.1: // 
%loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB57_5 +; CHECK-NEXT: b.ne .LBB59_5 ; CHECK-NEXT: // %bb.2: // %loadbb2 ; CHECK-NEXT: ldr x8, [x0, #16] ; CHECK-NEXT: ldr x9, [x1, #16] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB57_5 +; CHECK-NEXT: b.ne .LBB59_5 ; CHECK-NEXT: // %bb.3: // %loadbb3 ; CHECK-NEXT: ldur x8, [x0, #23] ; CHECK-NEXT: ldur x9, [x1, #23] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB57_5 +; CHECK-NEXT: b.ne .LBB59_5 ; CHECK-NEXT: // %bb.4: ; CHECK-NEXT: lsr w0, wzr, #31 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB57_5: // %res_block +; CHECK-NEXT: .LBB59_5: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w8, w8, hs @@ -1223,36 +1259,36 @@ define i1 @length31_gt(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB58_5 +; CHECK-NEXT: b.ne .LBB60_5 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB58_5 +; CHECK-NEXT: b.ne .LBB60_5 ; CHECK-NEXT: // %bb.2: // %loadbb2 ; CHECK-NEXT: ldr x8, [x0, #16] ; CHECK-NEXT: ldr x9, [x1, #16] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB58_5 +; CHECK-NEXT: b.ne .LBB60_5 ; CHECK-NEXT: // %bb.3: // %loadbb3 ; CHECK-NEXT: ldur x8, [x0, #23] ; CHECK-NEXT: ldur x9, [x1, #23] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB58_5 +; CHECK-NEXT: b.ne .LBB60_5 ; CHECK-NEXT: // %bb.4: ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: b .LBB58_6 -; CHECK-NEXT: .LBB58_5: // %res_block +; CHECK-NEXT: b .LBB60_6 +; CHECK-NEXT: .LBB60_5: // 
%res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w8, w8, hs -; CHECK-NEXT: .LBB58_6: // %endblock +; CHECK-NEXT: .LBB60_6: // %endblock ; CHECK-NEXT: cmp w8, #0 ; CHECK-NEXT: cset w0, gt ; CHECK-NEXT: ret @@ -1322,32 +1358,32 @@ define i32 @length32(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB61_5 +; CHECK-NEXT: b.ne .LBB63_5 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB61_5 +; CHECK-NEXT: b.ne .LBB63_5 ; CHECK-NEXT: // %bb.2: // %loadbb2 ; CHECK-NEXT: ldr x8, [x0, #16] ; CHECK-NEXT: ldr x9, [x1, #16] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB61_5 +; CHECK-NEXT: b.ne .LBB63_5 ; CHECK-NEXT: // %bb.3: // %loadbb3 ; CHECK-NEXT: ldr x8, [x0, #24] ; CHECK-NEXT: ldr x9, [x1, #24] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB61_5 +; CHECK-NEXT: b.ne .LBB63_5 ; CHECK-NEXT: // %bb.4: ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB61_5: // %res_block +; CHECK-NEXT: .LBB63_5: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w0, w8, hs @@ -1383,32 +1419,32 @@ define i1 @length32_lt(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB63_5 +; CHECK-NEXT: b.ne .LBB65_5 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB63_5 +; CHECK-NEXT: b.ne .LBB65_5 ; CHECK-NEXT: // %bb.2: // %loadbb2 ; CHECK-NEXT: ldr x8, [x0, #16] ; CHECK-NEXT: ldr x9, [x1, #16] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: 
cmp x8, x9 -; CHECK-NEXT: b.ne .LBB63_5 +; CHECK-NEXT: b.ne .LBB65_5 ; CHECK-NEXT: // %bb.3: // %loadbb3 ; CHECK-NEXT: ldr x8, [x0, #24] ; CHECK-NEXT: ldr x9, [x1, #24] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB63_5 +; CHECK-NEXT: b.ne .LBB65_5 ; CHECK-NEXT: // %bb.4: ; CHECK-NEXT: lsr w0, wzr, #31 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB63_5: // %res_block +; CHECK-NEXT: .LBB65_5: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w8, w8, hs @@ -1427,36 +1463,36 @@ define i1 @length32_gt(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB64_5 +; CHECK-NEXT: b.ne .LBB66_5 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB64_5 +; CHECK-NEXT: b.ne .LBB66_5 ; CHECK-NEXT: // %bb.2: // %loadbb2 ; CHECK-NEXT: ldr x8, [x0, #16] ; CHECK-NEXT: ldr x9, [x1, #16] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB64_5 +; CHECK-NEXT: b.ne .LBB66_5 ; CHECK-NEXT: // %bb.3: // %loadbb3 ; CHECK-NEXT: ldr x8, [x0, #24] ; CHECK-NEXT: ldr x9, [x1, #24] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB64_5 +; CHECK-NEXT: b.ne .LBB66_5 ; CHECK-NEXT: // %bb.4: ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: b .LBB64_6 -; CHECK-NEXT: .LBB64_5: // %res_block +; CHECK-NEXT: b .LBB66_6 +; CHECK-NEXT: .LBB66_5: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w8, w8, hs -; CHECK-NEXT: .LBB64_6: // %endblock +; CHECK-NEXT: .LBB66_6: // %endblock ; CHECK-NEXT: cmp w8, #0 ; CHECK-NEXT: cset w0, gt ; CHECK-NEXT: ret @@ -1523,46 +1559,46 @@ define i32 @length48(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; 
CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB67_7 +; CHECK-NEXT: b.ne .LBB69_7 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB67_7 +; CHECK-NEXT: b.ne .LBB69_7 ; CHECK-NEXT: // %bb.2: // %loadbb2 ; CHECK-NEXT: ldr x8, [x0, #16] ; CHECK-NEXT: ldr x9, [x1, #16] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB67_7 +; CHECK-NEXT: b.ne .LBB69_7 ; CHECK-NEXT: // %bb.3: // %loadbb3 ; CHECK-NEXT: ldr x8, [x0, #24] ; CHECK-NEXT: ldr x9, [x1, #24] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB67_7 +; CHECK-NEXT: b.ne .LBB69_7 ; CHECK-NEXT: // %bb.4: // %loadbb4 ; CHECK-NEXT: ldr x8, [x0, #32] ; CHECK-NEXT: ldr x9, [x1, #32] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB67_7 +; CHECK-NEXT: b.ne .LBB69_7 ; CHECK-NEXT: // %bb.5: // %loadbb5 ; CHECK-NEXT: ldr x8, [x0, #40] ; CHECK-NEXT: ldr x9, [x1, #40] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB67_7 +; CHECK-NEXT: b.ne .LBB69_7 ; CHECK-NEXT: // %bb.6: ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB67_7: // %res_block +; CHECK-NEXT: .LBB69_7: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w0, w8, hs @@ -1601,46 +1637,46 @@ define i1 @length48_lt(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB69_7 +; CHECK-NEXT: b.ne .LBB71_7 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB69_7 +; CHECK-NEXT: b.ne .LBB71_7 ; CHECK-NEXT: // %bb.2: // %loadbb2 ; CHECK-NEXT: ldr x8, [x0, #16] ; 
CHECK-NEXT: ldr x9, [x1, #16] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB69_7 +; CHECK-NEXT: b.ne .LBB71_7 ; CHECK-NEXT: // %bb.3: // %loadbb3 ; CHECK-NEXT: ldr x8, [x0, #24] ; CHECK-NEXT: ldr x9, [x1, #24] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB69_7 +; CHECK-NEXT: b.ne .LBB71_7 ; CHECK-NEXT: // %bb.4: // %loadbb4 ; CHECK-NEXT: ldr x8, [x0, #32] ; CHECK-NEXT: ldr x9, [x1, #32] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB69_7 +; CHECK-NEXT: b.ne .LBB71_7 ; CHECK-NEXT: // %bb.5: // %loadbb5 ; CHECK-NEXT: ldr x8, [x0, #40] ; CHECK-NEXT: ldr x9, [x1, #40] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB69_7 +; CHECK-NEXT: b.ne .LBB71_7 ; CHECK-NEXT: // %bb.6: ; CHECK-NEXT: lsr w0, wzr, #31 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB69_7: // %res_block +; CHECK-NEXT: .LBB71_7: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w8, w8, hs @@ -1659,50 +1695,50 @@ define i1 @length48_gt(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB70_7 +; CHECK-NEXT: b.ne .LBB72_7 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB70_7 +; CHECK-NEXT: b.ne .LBB72_7 ; CHECK-NEXT: // %bb.2: // %loadbb2 ; CHECK-NEXT: ldr x8, [x0, #16] ; CHECK-NEXT: ldr x9, [x1, #16] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB70_7 +; CHECK-NEXT: b.ne .LBB72_7 ; CHECK-NEXT: // %bb.3: // %loadbb3 ; CHECK-NEXT: ldr x8, [x0, #24] ; CHECK-NEXT: ldr x9, [x1, #24] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB70_7 +; CHECK-NEXT: 
b.ne .LBB72_7 ; CHECK-NEXT: // %bb.4: // %loadbb4 ; CHECK-NEXT: ldr x8, [x0, #32] ; CHECK-NEXT: ldr x9, [x1, #32] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB70_7 +; CHECK-NEXT: b.ne .LBB72_7 ; CHECK-NEXT: // %bb.5: // %loadbb5 ; CHECK-NEXT: ldr x8, [x0, #40] ; CHECK-NEXT: ldr x9, [x1, #40] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB70_7 +; CHECK-NEXT: b.ne .LBB72_7 ; CHECK-NEXT: // %bb.6: ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: b .LBB70_8 -; CHECK-NEXT: .LBB70_7: // %res_block +; CHECK-NEXT: b .LBB72_8 +; CHECK-NEXT: .LBB72_7: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w8, w8, hs -; CHECK-NEXT: .LBB70_8: // %endblock +; CHECK-NEXT: .LBB72_8: // %endblock ; CHECK-NEXT: cmp w8, #0 ; CHECK-NEXT: cset w0, gt ; CHECK-NEXT: ret @@ -1780,60 +1816,60 @@ define i32 @length63(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB73_9 +; CHECK-NEXT: b.ne .LBB75_9 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB73_9 +; CHECK-NEXT: b.ne .LBB75_9 ; CHECK-NEXT: // %bb.2: // %loadbb2 ; CHECK-NEXT: ldr x8, [x0, #16] ; CHECK-NEXT: ldr x9, [x1, #16] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB73_9 +; CHECK-NEXT: b.ne .LBB75_9 ; CHECK-NEXT: // %bb.3: // %loadbb3 ; CHECK-NEXT: ldr x8, [x0, #24] ; CHECK-NEXT: ldr x9, [x1, #24] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB73_9 +; CHECK-NEXT: b.ne .LBB75_9 ; CHECK-NEXT: // %bb.4: // %loadbb4 ; CHECK-NEXT: ldr x8, [x0, #32] ; CHECK-NEXT: ldr x9, [x1, #32] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne 
.LBB73_9 +; CHECK-NEXT: b.ne .LBB75_9 ; CHECK-NEXT: // %bb.5: // %loadbb5 ; CHECK-NEXT: ldr x8, [x0, #40] ; CHECK-NEXT: ldr x9, [x1, #40] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB73_9 +; CHECK-NEXT: b.ne .LBB75_9 ; CHECK-NEXT: // %bb.6: // %loadbb6 ; CHECK-NEXT: ldr x8, [x0, #48] ; CHECK-NEXT: ldr x9, [x1, #48] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB73_9 +; CHECK-NEXT: b.ne .LBB75_9 ; CHECK-NEXT: // %bb.7: // %loadbb7 ; CHECK-NEXT: ldur x8, [x0, #55] ; CHECK-NEXT: ldur x9, [x1, #55] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB73_9 +; CHECK-NEXT: b.ne .LBB75_9 ; CHECK-NEXT: // %bb.8: ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB73_9: // %res_block +; CHECK-NEXT: .LBB75_9: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w0, w8, hs @@ -1878,60 +1914,60 @@ define i1 @length63_lt(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB75_9 +; CHECK-NEXT: b.ne .LBB77_9 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB75_9 +; CHECK-NEXT: b.ne .LBB77_9 ; CHECK-NEXT: // %bb.2: // %loadbb2 ; CHECK-NEXT: ldr x8, [x0, #16] ; CHECK-NEXT: ldr x9, [x1, #16] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB75_9 +; CHECK-NEXT: b.ne .LBB77_9 ; CHECK-NEXT: // %bb.3: // %loadbb3 ; CHECK-NEXT: ldr x8, [x0, #24] ; CHECK-NEXT: ldr x9, [x1, #24] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB75_9 +; CHECK-NEXT: b.ne .LBB77_9 ; CHECK-NEXT: // %bb.4: // %loadbb4 ; CHECK-NEXT: ldr x8, [x0, #32] ; CHECK-NEXT: ldr x9, [x1, #32] ; CHECK-NEXT: 
rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB75_9 +; CHECK-NEXT: b.ne .LBB77_9 ; CHECK-NEXT: // %bb.5: // %loadbb5 ; CHECK-NEXT: ldr x8, [x0, #40] ; CHECK-NEXT: ldr x9, [x1, #40] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB75_9 +; CHECK-NEXT: b.ne .LBB77_9 ; CHECK-NEXT: // %bb.6: // %loadbb6 ; CHECK-NEXT: ldr x8, [x0, #48] ; CHECK-NEXT: ldr x9, [x1, #48] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB75_9 +; CHECK-NEXT: b.ne .LBB77_9 ; CHECK-NEXT: // %bb.7: // %loadbb7 ; CHECK-NEXT: ldur x8, [x0, #55] ; CHECK-NEXT: ldur x9, [x1, #55] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB75_9 +; CHECK-NEXT: b.ne .LBB77_9 ; CHECK-NEXT: // %bb.8: ; CHECK-NEXT: lsr w0, wzr, #31 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB75_9: // %res_block +; CHECK-NEXT: .LBB77_9: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w8, w8, hs @@ -1950,64 +1986,64 @@ define i1 @length63_gt(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB76_9 +; CHECK-NEXT: b.ne .LBB78_9 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB76_9 +; CHECK-NEXT: b.ne .LBB78_9 ; CHECK-NEXT: // %bb.2: // %loadbb2 ; CHECK-NEXT: ldr x8, [x0, #16] ; CHECK-NEXT: ldr x9, [x1, #16] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB76_9 +; CHECK-NEXT: b.ne .LBB78_9 ; CHECK-NEXT: // %bb.3: // %loadbb3 ; CHECK-NEXT: ldr x8, [x0, #24] ; CHECK-NEXT: ldr x9, [x1, #24] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB76_9 +; CHECK-NEXT: b.ne .LBB78_9 ; CHECK-NEXT: // %bb.4: // 
%loadbb4 ; CHECK-NEXT: ldr x8, [x0, #32] ; CHECK-NEXT: ldr x9, [x1, #32] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB76_9 +; CHECK-NEXT: b.ne .LBB78_9 ; CHECK-NEXT: // %bb.5: // %loadbb5 ; CHECK-NEXT: ldr x8, [x0, #40] ; CHECK-NEXT: ldr x9, [x1, #40] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB76_9 +; CHECK-NEXT: b.ne .LBB78_9 ; CHECK-NEXT: // %bb.6: // %loadbb6 ; CHECK-NEXT: ldr x8, [x0, #48] ; CHECK-NEXT: ldr x9, [x1, #48] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB76_9 +; CHECK-NEXT: b.ne .LBB78_9 ; CHECK-NEXT: // %bb.7: // %loadbb7 ; CHECK-NEXT: ldur x8, [x0, #55] ; CHECK-NEXT: ldur x9, [x1, #55] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB76_9 +; CHECK-NEXT: b.ne .LBB78_9 ; CHECK-NEXT: // %bb.8: ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: b .LBB76_10 -; CHECK-NEXT: .LBB76_9: // %res_block +; CHECK-NEXT: b .LBB78_10 +; CHECK-NEXT: .LBB78_9: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w8, w8, hs -; CHECK-NEXT: .LBB76_10: // %endblock +; CHECK-NEXT: .LBB78_10: // %endblock ; CHECK-NEXT: cmp w8, #0 ; CHECK-NEXT: cset w0, gt ; CHECK-NEXT: ret @@ -2071,60 +2107,60 @@ define i32 @length64(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB78_9 +; CHECK-NEXT: b.ne .LBB80_9 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB78_9 +; CHECK-NEXT: b.ne .LBB80_9 ; CHECK-NEXT: // %bb.2: // %loadbb2 ; CHECK-NEXT: ldr x8, [x0, #16] ; CHECK-NEXT: ldr x9, [x1, #16] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB78_9 +; CHECK-NEXT: b.ne 
.LBB80_9 ; CHECK-NEXT: // %bb.3: // %loadbb3 ; CHECK-NEXT: ldr x8, [x0, #24] ; CHECK-NEXT: ldr x9, [x1, #24] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB78_9 +; CHECK-NEXT: b.ne .LBB80_9 ; CHECK-NEXT: // %bb.4: // %loadbb4 ; CHECK-NEXT: ldr x8, [x0, #32] ; CHECK-NEXT: ldr x9, [x1, #32] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB78_9 +; CHECK-NEXT: b.ne .LBB80_9 ; CHECK-NEXT: // %bb.5: // %loadbb5 ; CHECK-NEXT: ldr x8, [x0, #40] ; CHECK-NEXT: ldr x9, [x1, #40] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB78_9 +; CHECK-NEXT: b.ne .LBB80_9 ; CHECK-NEXT: // %bb.6: // %loadbb6 ; CHECK-NEXT: ldr x8, [x0, #48] ; CHECK-NEXT: ldr x9, [x1, #48] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB78_9 +; CHECK-NEXT: b.ne .LBB80_9 ; CHECK-NEXT: // %bb.7: // %loadbb7 ; CHECK-NEXT: ldr x8, [x0, #56] ; CHECK-NEXT: ldr x9, [x1, #56] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB78_9 +; CHECK-NEXT: b.ne .LBB80_9 ; CHECK-NEXT: // %bb.8: ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB78_9: // %res_block +; CHECK-NEXT: .LBB80_9: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w0, w8, hs @@ -2167,60 +2203,60 @@ define i1 @length64_lt(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB80_9 +; CHECK-NEXT: b.ne .LBB82_9 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB80_9 +; CHECK-NEXT: b.ne .LBB82_9 ; CHECK-NEXT: // %bb.2: // %loadbb2 ; CHECK-NEXT: ldr x8, [x0, #16] ; CHECK-NEXT: ldr x9, [x1, #16] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev 
x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB80_9 +; CHECK-NEXT: b.ne .LBB82_9 ; CHECK-NEXT: // %bb.3: // %loadbb3 ; CHECK-NEXT: ldr x8, [x0, #24] ; CHECK-NEXT: ldr x9, [x1, #24] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB80_9 +; CHECK-NEXT: b.ne .LBB82_9 ; CHECK-NEXT: // %bb.4: // %loadbb4 ; CHECK-NEXT: ldr x8, [x0, #32] ; CHECK-NEXT: ldr x9, [x1, #32] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB80_9 +; CHECK-NEXT: b.ne .LBB82_9 ; CHECK-NEXT: // %bb.5: // %loadbb5 ; CHECK-NEXT: ldr x8, [x0, #40] ; CHECK-NEXT: ldr x9, [x1, #40] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB80_9 +; CHECK-NEXT: b.ne .LBB82_9 ; CHECK-NEXT: // %bb.6: // %loadbb6 ; CHECK-NEXT: ldr x8, [x0, #48] ; CHECK-NEXT: ldr x9, [x1, #48] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB80_9 +; CHECK-NEXT: b.ne .LBB82_9 ; CHECK-NEXT: // %bb.7: // %loadbb7 ; CHECK-NEXT: ldr x8, [x0, #56] ; CHECK-NEXT: ldr x9, [x1, #56] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB80_9 +; CHECK-NEXT: b.ne .LBB82_9 ; CHECK-NEXT: // %bb.8: ; CHECK-NEXT: lsr w0, wzr, #31 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB80_9: // %res_block +; CHECK-NEXT: .LBB82_9: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w8, w8, hs @@ -2239,64 +2275,64 @@ define i1 @length64_gt(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB81_9 +; CHECK-NEXT: b.ne .LBB83_9 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB81_9 +; CHECK-NEXT: b.ne .LBB83_9 ; CHECK-NEXT: // %bb.2: // %loadbb2 ; CHECK-NEXT: ldr x8, 
[x0, #16] ; CHECK-NEXT: ldr x9, [x1, #16] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB81_9 +; CHECK-NEXT: b.ne .LBB83_9 ; CHECK-NEXT: // %bb.3: // %loadbb3 ; CHECK-NEXT: ldr x8, [x0, #24] ; CHECK-NEXT: ldr x9, [x1, #24] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB81_9 +; CHECK-NEXT: b.ne .LBB83_9 ; CHECK-NEXT: // %bb.4: // %loadbb4 ; CHECK-NEXT: ldr x8, [x0, #32] ; CHECK-NEXT: ldr x9, [x1, #32] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB81_9 +; CHECK-NEXT: b.ne .LBB83_9 ; CHECK-NEXT: // %bb.5: // %loadbb5 ; CHECK-NEXT: ldr x8, [x0, #40] ; CHECK-NEXT: ldr x9, [x1, #40] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB81_9 +; CHECK-NEXT: b.ne .LBB83_9 ; CHECK-NEXT: // %bb.6: // %loadbb6 ; CHECK-NEXT: ldr x8, [x0, #48] ; CHECK-NEXT: ldr x9, [x1, #48] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB81_9 +; CHECK-NEXT: b.ne .LBB83_9 ; CHECK-NEXT: // %bb.7: // %loadbb7 ; CHECK-NEXT: ldr x8, [x0, #56] ; CHECK-NEXT: ldr x9, [x1, #56] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB81_9 +; CHECK-NEXT: b.ne .LBB83_9 ; CHECK-NEXT: // %bb.8: ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: b .LBB81_10 -; CHECK-NEXT: .LBB81_9: // %res_block +; CHECK-NEXT: b .LBB83_10 +; CHECK-NEXT: .LBB83_9: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w8, w8, hs -; CHECK-NEXT: .LBB81_10: // %endblock +; CHECK-NEXT: .LBB83_10: // %endblock ; CHECK-NEXT: cmp w8, #0 ; CHECK-NEXT: cset w0, gt ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/memcmp.ll b/llvm/test/CodeGen/RISCV/memcmp.ll index 831e21af43807..5adda28acb427 100644 --- a/llvm/test/CodeGen/RISCV/memcmp.ll +++ b/llvm/test/CodeGen/RISCV/memcmp.ll @@ -2710,6 +2710,216 @@ 
entry: ret i1 %ret } +define i1 @bcmp_le_zero(ptr %s1, ptr %s2) nounwind { +; CHECK-ALIGNED-RV32-LABEL: bcmp_le_zero: +; CHECK-ALIGNED-RV32: # %bb.0: # %entry +; CHECK-ALIGNED-RV32-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-ALIGNED-RV32-NEXT: li a2, 4 +; CHECK-ALIGNED-RV32-NEXT: call bcmp +; CHECK-ALIGNED-RV32-NEXT: slti a0, a0, 1 +; CHECK-ALIGNED-RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; CHECK-ALIGNED-RV32-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV32-NEXT: ret +; +; CHECK-ALIGNED-RV64-LABEL: bcmp_le_zero: +; CHECK-ALIGNED-RV64: # %bb.0: # %entry +; CHECK-ALIGNED-RV64-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-ALIGNED-RV64-NEXT: li a2, 4 +; CHECK-ALIGNED-RV64-NEXT: call bcmp +; CHECK-ALIGNED-RV64-NEXT: slti a0, a0, 1 +; CHECK-ALIGNED-RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; CHECK-ALIGNED-RV64-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV64-NEXT: ret +; +; CHECK-ALIGNED-RV32-ZBB-LABEL: bcmp_le_zero: +; CHECK-ALIGNED-RV32-ZBB: # %bb.0: # %entry +; CHECK-ALIGNED-RV32-ZBB-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV32-ZBB-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-ALIGNED-RV32-ZBB-NEXT: li a2, 4 +; CHECK-ALIGNED-RV32-ZBB-NEXT: call bcmp +; CHECK-ALIGNED-RV32-ZBB-NEXT: slti a0, a0, 1 +; CHECK-ALIGNED-RV32-ZBB-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; CHECK-ALIGNED-RV32-ZBB-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV32-ZBB-NEXT: ret +; +; CHECK-ALIGNED-RV64-ZBB-LABEL: bcmp_le_zero: +; CHECK-ALIGNED-RV64-ZBB: # %bb.0: # %entry +; CHECK-ALIGNED-RV64-ZBB-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV64-ZBB-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-ALIGNED-RV64-ZBB-NEXT: li a2, 4 +; CHECK-ALIGNED-RV64-ZBB-NEXT: call bcmp +; CHECK-ALIGNED-RV64-ZBB-NEXT: slti a0, a0, 1 +; CHECK-ALIGNED-RV64-ZBB-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; CHECK-ALIGNED-RV64-ZBB-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV64-ZBB-NEXT: ret +; +; 
CHECK-ALIGNED-RV32-ZBKB-LABEL: bcmp_le_zero: +; CHECK-ALIGNED-RV32-ZBKB: # %bb.0: # %entry +; CHECK-ALIGNED-RV32-ZBKB-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-ALIGNED-RV32-ZBKB-NEXT: li a2, 4 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: call bcmp +; CHECK-ALIGNED-RV32-ZBKB-NEXT: slti a0, a0, 1 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; CHECK-ALIGNED-RV32-ZBKB-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: ret +; +; CHECK-ALIGNED-RV64-ZBKB-LABEL: bcmp_le_zero: +; CHECK-ALIGNED-RV64-ZBKB: # %bb.0: # %entry +; CHECK-ALIGNED-RV64-ZBKB-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-ALIGNED-RV64-ZBKB-NEXT: li a2, 4 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: call bcmp +; CHECK-ALIGNED-RV64-ZBKB-NEXT: slti a0, a0, 1 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; CHECK-ALIGNED-RV64-ZBKB-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: ret +; +; CHECK-ALIGNED-RV32-V-LABEL: bcmp_le_zero: +; CHECK-ALIGNED-RV32-V: # %bb.0: # %entry +; CHECK-ALIGNED-RV32-V-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV32-V-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-ALIGNED-RV32-V-NEXT: li a2, 4 +; CHECK-ALIGNED-RV32-V-NEXT: call bcmp +; CHECK-ALIGNED-RV32-V-NEXT: slti a0, a0, 1 +; CHECK-ALIGNED-RV32-V-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; CHECK-ALIGNED-RV32-V-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV32-V-NEXT: ret +; +; CHECK-ALIGNED-RV64-V-LABEL: bcmp_le_zero: +; CHECK-ALIGNED-RV64-V: # %bb.0: # %entry +; CHECK-ALIGNED-RV64-V-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV64-V-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-ALIGNED-RV64-V-NEXT: li a2, 4 +; CHECK-ALIGNED-RV64-V-NEXT: call bcmp +; CHECK-ALIGNED-RV64-V-NEXT: slti a0, a0, 1 +; CHECK-ALIGNED-RV64-V-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; CHECK-ALIGNED-RV64-V-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV64-V-NEXT: ret +; +; CHECK-UNALIGNED-LABEL: bcmp_le_zero: +; 
CHECK-UNALIGNED: # %bb.0: # %entry +; CHECK-UNALIGNED-NEXT: lw a0, 0(a0) +; CHECK-UNALIGNED-NEXT: lw a1, 0(a1) +; CHECK-UNALIGNED-NEXT: xor a0, a0, a1 +; CHECK-UNALIGNED-NEXT: snez a0, a0 +; CHECK-UNALIGNED-NEXT: slti a0, a0, 1 +; CHECK-UNALIGNED-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iXLen 4) + %ret = icmp slt i32 %bcmp, 1 + ret i1 %ret +} + +define i1 @bcmp_ge_zero(ptr %s1, ptr %s2) nounwind { +; CHECK-ALIGNED-RV32-LABEL: bcmp_ge_zero: +; CHECK-ALIGNED-RV32: # %bb.0: # %entry +; CHECK-ALIGNED-RV32-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-ALIGNED-RV32-NEXT: li a2, 4 +; CHECK-ALIGNED-RV32-NEXT: call bcmp +; CHECK-ALIGNED-RV32-NEXT: slti a0, a0, 0 +; CHECK-ALIGNED-RV32-NEXT: xori a0, a0, 1 +; CHECK-ALIGNED-RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; CHECK-ALIGNED-RV32-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV32-NEXT: ret +; +; CHECK-ALIGNED-RV64-LABEL: bcmp_ge_zero: +; CHECK-ALIGNED-RV64: # %bb.0: # %entry +; CHECK-ALIGNED-RV64-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-ALIGNED-RV64-NEXT: li a2, 4 +; CHECK-ALIGNED-RV64-NEXT: call bcmp +; CHECK-ALIGNED-RV64-NEXT: slti a0, a0, 0 +; CHECK-ALIGNED-RV64-NEXT: xori a0, a0, 1 +; CHECK-ALIGNED-RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; CHECK-ALIGNED-RV64-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV64-NEXT: ret +; +; CHECK-ALIGNED-RV32-ZBB-LABEL: bcmp_ge_zero: +; CHECK-ALIGNED-RV32-ZBB: # %bb.0: # %entry +; CHECK-ALIGNED-RV32-ZBB-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV32-ZBB-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-ALIGNED-RV32-ZBB-NEXT: li a2, 4 +; CHECK-ALIGNED-RV32-ZBB-NEXT: call bcmp +; CHECK-ALIGNED-RV32-ZBB-NEXT: slti a0, a0, 0 +; CHECK-ALIGNED-RV32-ZBB-NEXT: xori a0, a0, 1 +; CHECK-ALIGNED-RV32-ZBB-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; CHECK-ALIGNED-RV32-ZBB-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV32-ZBB-NEXT: ret +; +; CHECK-ALIGNED-RV64-ZBB-LABEL: 
bcmp_ge_zero: +; CHECK-ALIGNED-RV64-ZBB: # %bb.0: # %entry +; CHECK-ALIGNED-RV64-ZBB-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV64-ZBB-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-ALIGNED-RV64-ZBB-NEXT: li a2, 4 +; CHECK-ALIGNED-RV64-ZBB-NEXT: call bcmp +; CHECK-ALIGNED-RV64-ZBB-NEXT: slti a0, a0, 0 +; CHECK-ALIGNED-RV64-ZBB-NEXT: xori a0, a0, 1 +; CHECK-ALIGNED-RV64-ZBB-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; CHECK-ALIGNED-RV64-ZBB-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV64-ZBB-NEXT: ret +; +; CHECK-ALIGNED-RV32-ZBKB-LABEL: bcmp_ge_zero: +; CHECK-ALIGNED-RV32-ZBKB: # %bb.0: # %entry +; CHECK-ALIGNED-RV32-ZBKB-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-ALIGNED-RV32-ZBKB-NEXT: li a2, 4 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: call bcmp +; CHECK-ALIGNED-RV32-ZBKB-NEXT: slti a0, a0, 0 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: xori a0, a0, 1 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; CHECK-ALIGNED-RV32-ZBKB-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: ret +; +; CHECK-ALIGNED-RV64-ZBKB-LABEL: bcmp_ge_zero: +; CHECK-ALIGNED-RV64-ZBKB: # %bb.0: # %entry +; CHECK-ALIGNED-RV64-ZBKB-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-ALIGNED-RV64-ZBKB-NEXT: li a2, 4 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: call bcmp +; CHECK-ALIGNED-RV64-ZBKB-NEXT: slti a0, a0, 0 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: xori a0, a0, 1 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; CHECK-ALIGNED-RV64-ZBKB-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: ret +; +; CHECK-ALIGNED-RV32-V-LABEL: bcmp_ge_zero: +; CHECK-ALIGNED-RV32-V: # %bb.0: # %entry +; CHECK-ALIGNED-RV32-V-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV32-V-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-ALIGNED-RV32-V-NEXT: li a2, 4 +; CHECK-ALIGNED-RV32-V-NEXT: call bcmp +; CHECK-ALIGNED-RV32-V-NEXT: slti a0, a0, 0 +; CHECK-ALIGNED-RV32-V-NEXT: xori a0, a0, 1 +; 
CHECK-ALIGNED-RV32-V-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; CHECK-ALIGNED-RV32-V-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV32-V-NEXT: ret +; +; CHECK-ALIGNED-RV64-V-LABEL: bcmp_ge_zero: +; CHECK-ALIGNED-RV64-V: # %bb.0: # %entry +; CHECK-ALIGNED-RV64-V-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV64-V-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-ALIGNED-RV64-V-NEXT: li a2, 4 +; CHECK-ALIGNED-RV64-V-NEXT: call bcmp +; CHECK-ALIGNED-RV64-V-NEXT: slti a0, a0, 0 +; CHECK-ALIGNED-RV64-V-NEXT: xori a0, a0, 1 +; CHECK-ALIGNED-RV64-V-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; CHECK-ALIGNED-RV64-V-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV64-V-NEXT: ret +; +; CHECK-UNALIGNED-LABEL: bcmp_ge_zero: +; CHECK-UNALIGNED: # %bb.0: # %entry +; CHECK-UNALIGNED-NEXT: li a0, 1 +; CHECK-UNALIGNED-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iXLen 4) + %ret = icmp sgt i32 %bcmp, -1 + ret i1 %ret +} + define i32 @memcmp_size_0(ptr %s1, ptr %s2) nounwind { ; CHECK-LABEL: memcmp_size_0: ; CHECK: # %bb.0: # %entry @@ -3517,13 +3727,13 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB26_2 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB28_2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lbu a0, 4(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lbu a1, 4(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: sub a0, a0, a1 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV32-ZBB-NEXT: .LBB26_2: # %res_block +; CHECK-UNALIGNED-RV32-ZBB-NEXT: .LBB28_2: # %res_block ; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ori a0, a0, 1 @@ -3552,13 +3762,13 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 0(a1) ; 
CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB26_2 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB28_2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lbu a0, 4(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lbu a1, 4(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sub a0, a0, a1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: .LBB26_2: # %res_block +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: .LBB28_2: # %res_block ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ori a0, a0, 1 @@ -3710,7 +3920,7 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB27_3 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB29_3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lh a0, 4(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lh a1, 4(a1) @@ -3718,11 +3928,11 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a1 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: srli a2, a2, 16 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: srli a3, a3, 16 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB27_3 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB29_3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.2: ; CHECK-UNALIGNED-RV32-ZBB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV32-ZBB-NEXT: .LBB27_3: # %res_block +; CHECK-UNALIGNED-RV32-ZBB-NEXT: .LBB29_3: # %res_block ; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ori a0, a0, 1 @@ -3751,7 +3961,7 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 0(a1) ; 
CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB27_3 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB29_3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lh a0, 4(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lh a1, 4(a1) @@ -3759,11 +3969,11 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: srli a2, a2, 16 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: srli a3, a3, 16 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB27_3 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB29_3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.2: ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: .LBB27_3: # %res_block +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: .LBB29_3: # %res_block ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ori a0, a0, 1 @@ -3915,17 +4125,17 @@ define i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB28_3 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB30_3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a0, 3(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a1, 3(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB28_3 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB30_3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.2: ; CHECK-UNALIGNED-RV32-ZBB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV32-ZBB-NEXT: .LBB28_3: # %res_block +; CHECK-UNALIGNED-RV32-ZBB-NEXT: .LBB30_3: # %res_block ; 
CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ori a0, a0, 1 @@ -3939,7 +4149,7 @@ define i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a3, a3, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB28_3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB30_3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a0, 3(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a1, 3(a1) @@ -3947,11 +4157,11 @@ define i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a3, a3, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB28_3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB30_3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.2: ; CHECK-UNALIGNED-RV64-ZBB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB28_3: # %res_block +; CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB30_3: # %res_block ; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ori a0, a0, 1 @@ -3963,17 +4173,17 @@ define i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB28_3 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB30_3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a0, 3(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a1, 3(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB28_3 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, 
.LBB30_3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.2: ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: .LBB28_3: # %res_block +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: .LBB30_3: # %res_block ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ori a0, a0, 1 @@ -3987,7 +4197,7 @@ define i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a3, a3, 32 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB28_3 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB30_3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a0, 3(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a1, 3(a1) @@ -3995,11 +4205,11 @@ define i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a3, a3, 32 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB28_3 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB30_3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.2: ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB28_3: # %res_block +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB30_3: # %res_block ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ori a0, a0, 1 @@ -4136,17 +4346,17 @@ define i32 @memcmp_size_8(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB29_3 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB31_3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.1: # %loadbb1 ; 
CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a0, 4(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a1, 4(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB29_3 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB31_3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.2: ; CHECK-UNALIGNED-RV32-ZBB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV32-ZBB-NEXT: .LBB29_3: # %res_block +; CHECK-UNALIGNED-RV32-ZBB-NEXT: .LBB31_3: # %res_block ; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ori a0, a0, 1 @@ -4169,17 +4379,17 @@ define i32 @memcmp_size_8(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB29_3 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB31_3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a0, 4(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a1, 4(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB29_3 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB31_3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.2: ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: .LBB29_3: # %res_block +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: .LBB31_3: # %res_block ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ori a0, a0, 1 @@ -4327,29 +4537,29 @@ define i32 @memcmp_size_15(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, 
.LBB30_5 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB32_5 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a2, 4(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 4(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB30_5 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB32_5 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.2: # %loadbb2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a2, 8(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 8(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB30_5 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB32_5 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.3: # %loadbb3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a0, 11(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a1, 11(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB30_5 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB32_5 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.4: ; CHECK-UNALIGNED-RV32-ZBB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV32-ZBB-NEXT: .LBB30_5: # %res_block +; CHECK-UNALIGNED-RV32-ZBB-NEXT: .LBB32_5: # %res_block ; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ori a0, a0, 1 @@ -4361,17 +4571,17 @@ define i32 @memcmp_size_15(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB30_3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB32_3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a0, 7(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a1, 7(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a0 ; 
CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB30_3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB32_3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.2: ; CHECK-UNALIGNED-RV64-ZBB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB30_3: # %res_block +; CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB32_3: # %res_block ; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ori a0, a0, 1 @@ -4383,29 +4593,29 @@ define i32 @memcmp_size_15(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB30_5 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB32_5 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a2, 4(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 4(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB30_5 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB32_5 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.2: # %loadbb2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a2, 8(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 8(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB30_5 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB32_5 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.3: # %loadbb3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a0, 11(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a1, 11(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB30_5 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB32_5 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.4: ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: li 
a0, 0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: .LBB30_5: # %res_block +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: .LBB32_5: # %res_block ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ori a0, a0, 1 @@ -4417,17 +4627,17 @@ define i32 @memcmp_size_15(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB30_3 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB32_3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a0, 7(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a1, 7(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB30_3 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB32_3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.2: ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB30_3: # %res_block +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB32_3: # %res_block ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ori a0, a0, 1 @@ -4564,29 +4774,29 @@ define i32 @memcmp_size_16(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB31_5 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB33_5 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a2, 4(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 4(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB31_5 +; 
CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB33_5 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.2: # %loadbb2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a2, 8(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 8(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB31_5 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB33_5 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.3: # %loadbb3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a0, 12(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a1, 12(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB31_5 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB33_5 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.4: ; CHECK-UNALIGNED-RV32-ZBB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV32-ZBB-NEXT: .LBB31_5: # %res_block +; CHECK-UNALIGNED-RV32-ZBB-NEXT: .LBB33_5: # %res_block ; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ori a0, a0, 1 @@ -4598,17 +4808,17 @@ define i32 @memcmp_size_16(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB31_3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB33_3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a0, 8(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a1, 8(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB31_3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB33_3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.2: ; CHECK-UNALIGNED-RV64-ZBB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB31_3: # %res_block +; 
CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB33_3: # %res_block ; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ori a0, a0, 1 @@ -4620,29 +4830,29 @@ define i32 @memcmp_size_16(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB31_5 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB33_5 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a2, 4(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 4(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB31_5 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB33_5 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.2: # %loadbb2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a2, 8(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 8(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB31_5 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB33_5 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.3: # %loadbb3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a0, 12(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a1, 12(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB31_5 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB33_5 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.4: ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: .LBB31_5: # %res_block +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: .LBB33_5: # %res_block ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ori a0, a0, 1 @@ -4654,17 +4864,17 @@ define 
i32 @memcmp_size_16(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB31_3 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB33_3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a0, 8(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a1, 8(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB31_3 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB33_3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.2: ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB31_3: # %res_block +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB33_3: # %res_block ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ori a0, a0, 1 @@ -4801,53 +5011,53 @@ define i32 @memcmp_size_31(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB32_9 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB34_9 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a2, 4(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 4(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB32_9 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB34_9 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.2: # %loadbb2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a2, 8(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 8(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB32_9 +; 
CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB34_9 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.3: # %loadbb3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a2, 12(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 12(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB32_9 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB34_9 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.4: # %loadbb4 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a2, 16(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 16(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB32_9 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB34_9 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.5: # %loadbb5 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a2, 20(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 20(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB32_9 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB34_9 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.6: # %loadbb6 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a2, 24(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 24(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB32_9 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB34_9 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.7: # %loadbb7 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a0, 27(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a1, 27(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB32_9 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB34_9 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.8: ; CHECK-UNALIGNED-RV32-ZBB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV32-ZBB-NEXT: .LBB32_9: # %res_block +; CHECK-UNALIGNED-RV32-ZBB-NEXT: 
.LBB34_9: # %res_block ; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ori a0, a0, 1 @@ -4859,29 +5069,29 @@ define i32 @memcmp_size_31(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB32_5 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB34_5 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a2, 8(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 8(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB32_5 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB34_5 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.2: # %loadbb2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a2, 16(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 16(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB32_5 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB34_5 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.3: # %loadbb3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a0, 23(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a1, 23(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB32_5 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB34_5 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.4: ; CHECK-UNALIGNED-RV64-ZBB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB32_5: # %res_block +; CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB34_5: # %res_block ; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ori a0, a0, 1 @@ -4893,53 +5103,53 @@ define i32 @memcmp_size_31(ptr %s1, ptr %s2) nounwind { ; 
CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB32_9 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB34_9 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a2, 4(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 4(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB32_9 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB34_9 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.2: # %loadbb2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a2, 8(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 8(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB32_9 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB34_9 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.3: # %loadbb3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a2, 12(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 12(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB32_9 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB34_9 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.4: # %loadbb4 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a2, 16(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 16(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB32_9 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB34_9 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.5: # %loadbb5 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a2, 20(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 20(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB32_9 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne 
a2, a3, .LBB34_9 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.6: # %loadbb6 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a2, 24(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 24(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB32_9 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB34_9 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.7: # %loadbb7 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a0, 27(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a1, 27(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB32_9 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB34_9 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.8: ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: .LBB32_9: # %res_block +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: .LBB34_9: # %res_block ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ori a0, a0, 1 @@ -4951,29 +5161,29 @@ define i32 @memcmp_size_31(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB32_5 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB34_5 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a2, 8(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 8(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB32_5 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB34_5 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.2: # %loadbb2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a2, 16(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 16(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; 
CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB32_5 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB34_5 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.3: # %loadbb3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a0, 23(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a1, 23(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB32_5 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB34_5 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.4: ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB32_5: # %res_block +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB34_5: # %res_block ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ori a0, a0, 1 @@ -5110,53 +5320,53 @@ define i32 @memcmp_size_32(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB33_9 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB35_9 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a2, 4(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 4(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB33_9 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB35_9 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.2: # %loadbb2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a2, 8(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 8(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB33_9 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB35_9 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.3: # %loadbb3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw 
a2, 12(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 12(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB33_9 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB35_9 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.4: # %loadbb4 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a2, 16(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 16(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB33_9 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB35_9 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.5: # %loadbb5 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a2, 20(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 20(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB33_9 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB35_9 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.6: # %loadbb6 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a2, 24(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 24(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB33_9 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB35_9 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.7: # %loadbb7 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a0, 28(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a1, 28(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB33_9 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB35_9 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.8: ; CHECK-UNALIGNED-RV32-ZBB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV32-ZBB-NEXT: .LBB33_9: # %res_block +; CHECK-UNALIGNED-RV32-ZBB-NEXT: .LBB35_9: # %res_block ; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: neg a0, a0 ; 
CHECK-UNALIGNED-RV32-ZBB-NEXT: ori a0, a0, 1 @@ -5168,29 +5378,29 @@ define i32 @memcmp_size_32(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB33_5 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB35_5 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a2, 8(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 8(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB33_5 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB35_5 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.2: # %loadbb2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a2, 16(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 16(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB33_5 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB35_5 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.3: # %loadbb3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a0, 24(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a1, 24(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB33_5 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB35_5 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.4: ; CHECK-UNALIGNED-RV64-ZBB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB33_5: # %res_block +; CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB35_5: # %res_block ; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ori a0, a0, 1 @@ -5202,53 +5412,53 @@ define i32 @memcmp_size_32(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 
a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB33_9 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB35_9 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a2, 4(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 4(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB33_9 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB35_9 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.2: # %loadbb2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a2, 8(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 8(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB33_9 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB35_9 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.3: # %loadbb3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a2, 12(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 12(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB33_9 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB35_9 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.4: # %loadbb4 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a2, 16(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 16(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB33_9 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB35_9 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.5: # %loadbb5 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a2, 20(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 20(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB33_9 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB35_9 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.6: # %loadbb6 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a2, 24(a0) ; 
CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 24(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB33_9 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB35_9 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.7: # %loadbb7 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a0, 28(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a1, 28(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB33_9 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB35_9 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.8: ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: .LBB33_9: # %res_block +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: .LBB35_9: # %res_block ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ori a0, a0, 1 @@ -5260,29 +5470,29 @@ define i32 @memcmp_size_32(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB33_5 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB35_5 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a2, 8(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 8(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB33_5 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB35_5 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.2: # %loadbb2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a2, 16(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 16(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB33_5 +; 
CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB35_5 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.3: # %loadbb3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a0, 24(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a1, 24(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB33_5 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB35_5 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.4: ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB33_5: # %res_block +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB35_5: # %res_block ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ori a0, a0, 1 @@ -5379,53 +5589,53 @@ define i32 @memcmp_size_63(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB34_9 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB36_9 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a2, 8(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 8(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB34_9 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB36_9 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.2: # %loadbb2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a2, 16(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 16(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB34_9 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB36_9 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.3: # %loadbb3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a2, 24(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 24(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 
; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB34_9 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB36_9 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.4: # %loadbb4 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a2, 32(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 32(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB34_9 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB36_9 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.5: # %loadbb5 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a2, 40(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 40(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB34_9 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB36_9 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.6: # %loadbb6 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a2, 48(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 48(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB34_9 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB36_9 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.7: # %loadbb7 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a0, 55(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a1, 55(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB34_9 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB36_9 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.8: ; CHECK-UNALIGNED-RV64-ZBB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB34_9: # %res_block +; CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB36_9: # %res_block ; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ori a0, a0, 1 @@ -5437,53 +5647,53 @@ define i32 @memcmp_size_63(ptr %s1, ptr %s2) nounwind { ; 
CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB34_9 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB36_9 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a2, 8(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 8(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB34_9 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB36_9 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.2: # %loadbb2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a2, 16(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 16(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB34_9 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB36_9 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.3: # %loadbb3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a2, 24(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 24(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB34_9 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB36_9 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.4: # %loadbb4 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a2, 32(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 32(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB34_9 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB36_9 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.5: # %loadbb5 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a2, 40(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 40(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB34_9 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne 
a2, a3, .LBB36_9 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.6: # %loadbb6 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a2, 48(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 48(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB34_9 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB36_9 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.7: # %loadbb7 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a0, 55(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a1, 55(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB34_9 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB36_9 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.8: ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB34_9: # %res_block +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB36_9: # %res_block ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ori a0, a0, 1 @@ -5570,53 +5780,53 @@ define i32 @memcmp_size_64(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB35_9 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB37_9 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a2, 8(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 8(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB35_9 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB37_9 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.2: # %loadbb2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a2, 16(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 16(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; 
CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB35_9 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB37_9 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.3: # %loadbb3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a2, 24(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 24(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB35_9 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB37_9 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.4: # %loadbb4 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a2, 32(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 32(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB35_9 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB37_9 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.5: # %loadbb5 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a2, 40(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 40(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB35_9 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB37_9 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.6: # %loadbb6 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a2, 48(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 48(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB35_9 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB37_9 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.7: # %loadbb7 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a0, 56(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a1, 56(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB35_9 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB37_9 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.8: ; CHECK-UNALIGNED-RV64-ZBB-NEXT: li a0, 0 ; 
CHECK-UNALIGNED-RV64-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB35_9: # %res_block +; CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB37_9: # %res_block ; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ori a0, a0, 1 @@ -5628,53 +5838,53 @@ define i32 @memcmp_size_64(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB35_9 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB37_9 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a2, 8(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 8(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB35_9 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB37_9 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.2: # %loadbb2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a2, 16(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 16(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB35_9 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB37_9 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.3: # %loadbb3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a2, 24(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 24(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB35_9 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB37_9 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.4: # %loadbb4 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a2, 32(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 32(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB35_9 +; 
CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB37_9 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.5: # %loadbb5 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a2, 40(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 40(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB35_9 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB37_9 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.6: # %loadbb6 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a2, 48(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 48(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB35_9 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB37_9 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.7: # %loadbb7 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a0, 56(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a1, 56(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB35_9 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB37_9 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.8: ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB35_9: # %res_block +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB37_9: # %res_block ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ori a0, a0, 1 @@ -6336,5 +6546,401 @@ entry: %ret = icmp sgt i32 %memcmp, 0 ret i1 %ret } + +define i1 @memcmp_le_zero(ptr %s1, ptr %s2) nounwind { +; CHECK-ALIGNED-RV32-LABEL: memcmp_le_zero: +; CHECK-ALIGNED-RV32: # %bb.0: # %entry +; CHECK-ALIGNED-RV32-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-ALIGNED-RV32-NEXT: li a2, 4 +; CHECK-ALIGNED-RV32-NEXT: call memcmp +; CHECK-ALIGNED-RV32-NEXT: slti a0, a0, 1 +; CHECK-ALIGNED-RV32-NEXT: lw ra, 12(sp) # 4-byte 
Folded Reload +; CHECK-ALIGNED-RV32-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV32-NEXT: ret +; +; CHECK-ALIGNED-RV64-LABEL: memcmp_le_zero: +; CHECK-ALIGNED-RV64: # %bb.0: # %entry +; CHECK-ALIGNED-RV64-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-ALIGNED-RV64-NEXT: li a2, 4 +; CHECK-ALIGNED-RV64-NEXT: call memcmp +; CHECK-ALIGNED-RV64-NEXT: slti a0, a0, 1 +; CHECK-ALIGNED-RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; CHECK-ALIGNED-RV64-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV64-NEXT: ret +; +; CHECK-ALIGNED-RV32-ZBB-LABEL: memcmp_le_zero: +; CHECK-ALIGNED-RV32-ZBB: # %bb.0: # %entry +; CHECK-ALIGNED-RV32-ZBB-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV32-ZBB-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-ALIGNED-RV32-ZBB-NEXT: li a2, 4 +; CHECK-ALIGNED-RV32-ZBB-NEXT: call memcmp +; CHECK-ALIGNED-RV32-ZBB-NEXT: slti a0, a0, 1 +; CHECK-ALIGNED-RV32-ZBB-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; CHECK-ALIGNED-RV32-ZBB-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV32-ZBB-NEXT: ret +; +; CHECK-ALIGNED-RV64-ZBB-LABEL: memcmp_le_zero: +; CHECK-ALIGNED-RV64-ZBB: # %bb.0: # %entry +; CHECK-ALIGNED-RV64-ZBB-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV64-ZBB-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-ALIGNED-RV64-ZBB-NEXT: li a2, 4 +; CHECK-ALIGNED-RV64-ZBB-NEXT: call memcmp +; CHECK-ALIGNED-RV64-ZBB-NEXT: slti a0, a0, 1 +; CHECK-ALIGNED-RV64-ZBB-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; CHECK-ALIGNED-RV64-ZBB-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV64-ZBB-NEXT: ret +; +; CHECK-ALIGNED-RV32-ZBKB-LABEL: memcmp_le_zero: +; CHECK-ALIGNED-RV32-ZBKB: # %bb.0: # %entry +; CHECK-ALIGNED-RV32-ZBKB-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-ALIGNED-RV32-ZBKB-NEXT: li a2, 4 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: call memcmp +; CHECK-ALIGNED-RV32-ZBKB-NEXT: slti a0, a0, 1 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; CHECK-ALIGNED-RV32-ZBKB-NEXT: 
addi sp, sp, 16 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: ret +; +; CHECK-ALIGNED-RV64-ZBKB-LABEL: memcmp_le_zero: +; CHECK-ALIGNED-RV64-ZBKB: # %bb.0: # %entry +; CHECK-ALIGNED-RV64-ZBKB-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-ALIGNED-RV64-ZBKB-NEXT: li a2, 4 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: call memcmp +; CHECK-ALIGNED-RV64-ZBKB-NEXT: slti a0, a0, 1 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; CHECK-ALIGNED-RV64-ZBKB-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: ret +; +; CHECK-ALIGNED-RV32-V-LABEL: memcmp_le_zero: +; CHECK-ALIGNED-RV32-V: # %bb.0: # %entry +; CHECK-ALIGNED-RV32-V-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV32-V-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-ALIGNED-RV32-V-NEXT: li a2, 4 +; CHECK-ALIGNED-RV32-V-NEXT: call memcmp +; CHECK-ALIGNED-RV32-V-NEXT: slti a0, a0, 1 +; CHECK-ALIGNED-RV32-V-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; CHECK-ALIGNED-RV32-V-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV32-V-NEXT: ret +; +; CHECK-ALIGNED-RV64-V-LABEL: memcmp_le_zero: +; CHECK-ALIGNED-RV64-V: # %bb.0: # %entry +; CHECK-ALIGNED-RV64-V-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV64-V-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-ALIGNED-RV64-V-NEXT: li a2, 4 +; CHECK-ALIGNED-RV64-V-NEXT: call memcmp +; CHECK-ALIGNED-RV64-V-NEXT: slti a0, a0, 1 +; CHECK-ALIGNED-RV64-V-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; CHECK-ALIGNED-RV64-V-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV64-V-NEXT: ret +; +; CHECK-UNALIGNED-RV32-LABEL: memcmp_le_zero: +; CHECK-UNALIGNED-RV32: # %bb.0: # %entry +; CHECK-UNALIGNED-RV32-NEXT: addi sp, sp, -16 +; CHECK-UNALIGNED-RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-UNALIGNED-RV32-NEXT: li a2, 4 +; CHECK-UNALIGNED-RV32-NEXT: call memcmp +; CHECK-UNALIGNED-RV32-NEXT: slti a0, a0, 1 +; CHECK-UNALIGNED-RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; CHECK-UNALIGNED-RV32-NEXT: addi sp, sp, 16 +; CHECK-UNALIGNED-RV32-NEXT: ret +; 
+; CHECK-UNALIGNED-RV64-LABEL: memcmp_le_zero: +; CHECK-UNALIGNED-RV64: # %bb.0: # %entry +; CHECK-UNALIGNED-RV64-NEXT: addi sp, sp, -16 +; CHECK-UNALIGNED-RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-UNALIGNED-RV64-NEXT: li a2, 4 +; CHECK-UNALIGNED-RV64-NEXT: call memcmp +; CHECK-UNALIGNED-RV64-NEXT: slti a0, a0, 1 +; CHECK-UNALIGNED-RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; CHECK-UNALIGNED-RV64-NEXT: addi sp, sp, 16 +; CHECK-UNALIGNED-RV64-NEXT: ret +; +; CHECK-UNALIGNED-RV32-ZBB-LABEL: memcmp_le_zero: +; CHECK-UNALIGNED-RV32-ZBB: # %bb.0: # %entry +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a0, 0(a0) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a1, 0(a1) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sub a0, a0, a2 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: slti a0, a0, 1 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret +; +; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_le_zero: +; CHECK-UNALIGNED-RV64-ZBB: # %bb.0: # %entry +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a0, 0(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a1, 0(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a0, a0, 32 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a1, a1, 32 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a0, a2 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: slti a0, a0, 1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret +; +; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_le_zero: +; CHECK-UNALIGNED-RV32-ZBKB: # %bb.0: # %entry +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a0, 0(a0) +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a1, 0(a1) +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: 
sltu a0, a1, a0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sub a0, a0, a2 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: slti a0, a0, 1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret +; +; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_le_zero: +; CHECK-UNALIGNED-RV64-ZBKB: # %bb.0: # %entry +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a0, 0(a0) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a1, 0(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a0, a0, 32 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a1, a1, 32 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a0, a2 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: slti a0, a0, 1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret +; +; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_le_zero: +; CHECK-UNALIGNED-RV32-V: # %bb.0: # %entry +; CHECK-UNALIGNED-RV32-V-NEXT: addi sp, sp, -16 +; CHECK-UNALIGNED-RV32-V-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-UNALIGNED-RV32-V-NEXT: li a2, 4 +; CHECK-UNALIGNED-RV32-V-NEXT: call memcmp +; CHECK-UNALIGNED-RV32-V-NEXT: slti a0, a0, 1 +; CHECK-UNALIGNED-RV32-V-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; CHECK-UNALIGNED-RV32-V-NEXT: addi sp, sp, 16 +; CHECK-UNALIGNED-RV32-V-NEXT: ret +; +; CHECK-UNALIGNED-RV64-V-LABEL: memcmp_le_zero: +; CHECK-UNALIGNED-RV64-V: # %bb.0: # %entry +; CHECK-UNALIGNED-RV64-V-NEXT: addi sp, sp, -16 +; CHECK-UNALIGNED-RV64-V-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-UNALIGNED-RV64-V-NEXT: li a2, 4 +; CHECK-UNALIGNED-RV64-V-NEXT: call memcmp +; CHECK-UNALIGNED-RV64-V-NEXT: slti a0, a0, 1 +; CHECK-UNALIGNED-RV64-V-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; CHECK-UNALIGNED-RV64-V-NEXT: addi sp, sp, 16 +; CHECK-UNALIGNED-RV64-V-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iXLen 4) + %ret = icmp slt i32 %memcmp, 1 + ret i1 %ret +} + +define i1 @memcmp_ge_zero(ptr %s1, ptr %s2) nounwind { +; CHECK-ALIGNED-RV32-LABEL: 
memcmp_ge_zero: +; CHECK-ALIGNED-RV32: # %bb.0: # %entry +; CHECK-ALIGNED-RV32-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-ALIGNED-RV32-NEXT: li a2, 4 +; CHECK-ALIGNED-RV32-NEXT: call memcmp +; CHECK-ALIGNED-RV32-NEXT: slti a0, a0, 0 +; CHECK-ALIGNED-RV32-NEXT: xori a0, a0, 1 +; CHECK-ALIGNED-RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; CHECK-ALIGNED-RV32-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV32-NEXT: ret +; +; CHECK-ALIGNED-RV64-LABEL: memcmp_ge_zero: +; CHECK-ALIGNED-RV64: # %bb.0: # %entry +; CHECK-ALIGNED-RV64-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-ALIGNED-RV64-NEXT: li a2, 4 +; CHECK-ALIGNED-RV64-NEXT: call memcmp +; CHECK-ALIGNED-RV64-NEXT: slti a0, a0, 0 +; CHECK-ALIGNED-RV64-NEXT: xori a0, a0, 1 +; CHECK-ALIGNED-RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; CHECK-ALIGNED-RV64-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV64-NEXT: ret +; +; CHECK-ALIGNED-RV32-ZBB-LABEL: memcmp_ge_zero: +; CHECK-ALIGNED-RV32-ZBB: # %bb.0: # %entry +; CHECK-ALIGNED-RV32-ZBB-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV32-ZBB-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-ALIGNED-RV32-ZBB-NEXT: li a2, 4 +; CHECK-ALIGNED-RV32-ZBB-NEXT: call memcmp +; CHECK-ALIGNED-RV32-ZBB-NEXT: slti a0, a0, 0 +; CHECK-ALIGNED-RV32-ZBB-NEXT: xori a0, a0, 1 +; CHECK-ALIGNED-RV32-ZBB-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; CHECK-ALIGNED-RV32-ZBB-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV32-ZBB-NEXT: ret +; +; CHECK-ALIGNED-RV64-ZBB-LABEL: memcmp_ge_zero: +; CHECK-ALIGNED-RV64-ZBB: # %bb.0: # %entry +; CHECK-ALIGNED-RV64-ZBB-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV64-ZBB-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-ALIGNED-RV64-ZBB-NEXT: li a2, 4 +; CHECK-ALIGNED-RV64-ZBB-NEXT: call memcmp +; CHECK-ALIGNED-RV64-ZBB-NEXT: slti a0, a0, 0 +; CHECK-ALIGNED-RV64-ZBB-NEXT: xori a0, a0, 1 +; CHECK-ALIGNED-RV64-ZBB-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; 
CHECK-ALIGNED-RV64-ZBB-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV64-ZBB-NEXT: ret +; +; CHECK-ALIGNED-RV32-ZBKB-LABEL: memcmp_ge_zero: +; CHECK-ALIGNED-RV32-ZBKB: # %bb.0: # %entry +; CHECK-ALIGNED-RV32-ZBKB-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-ALIGNED-RV32-ZBKB-NEXT: li a2, 4 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: call memcmp +; CHECK-ALIGNED-RV32-ZBKB-NEXT: slti a0, a0, 0 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: xori a0, a0, 1 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; CHECK-ALIGNED-RV32-ZBKB-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: ret +; +; CHECK-ALIGNED-RV64-ZBKB-LABEL: memcmp_ge_zero: +; CHECK-ALIGNED-RV64-ZBKB: # %bb.0: # %entry +; CHECK-ALIGNED-RV64-ZBKB-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-ALIGNED-RV64-ZBKB-NEXT: li a2, 4 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: call memcmp +; CHECK-ALIGNED-RV64-ZBKB-NEXT: slti a0, a0, 0 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: xori a0, a0, 1 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; CHECK-ALIGNED-RV64-ZBKB-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: ret +; +; CHECK-ALIGNED-RV32-V-LABEL: memcmp_ge_zero: +; CHECK-ALIGNED-RV32-V: # %bb.0: # %entry +; CHECK-ALIGNED-RV32-V-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV32-V-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-ALIGNED-RV32-V-NEXT: li a2, 4 +; CHECK-ALIGNED-RV32-V-NEXT: call memcmp +; CHECK-ALIGNED-RV32-V-NEXT: slti a0, a0, 0 +; CHECK-ALIGNED-RV32-V-NEXT: xori a0, a0, 1 +; CHECK-ALIGNED-RV32-V-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; CHECK-ALIGNED-RV32-V-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV32-V-NEXT: ret +; +; CHECK-ALIGNED-RV64-V-LABEL: memcmp_ge_zero: +; CHECK-ALIGNED-RV64-V: # %bb.0: # %entry +; CHECK-ALIGNED-RV64-V-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV64-V-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-ALIGNED-RV64-V-NEXT: li a2, 4 +; CHECK-ALIGNED-RV64-V-NEXT: call 
memcmp +; CHECK-ALIGNED-RV64-V-NEXT: slti a0, a0, 0 +; CHECK-ALIGNED-RV64-V-NEXT: xori a0, a0, 1 +; CHECK-ALIGNED-RV64-V-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; CHECK-ALIGNED-RV64-V-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV64-V-NEXT: ret +; +; CHECK-UNALIGNED-RV32-LABEL: memcmp_ge_zero: +; CHECK-UNALIGNED-RV32: # %bb.0: # %entry +; CHECK-UNALIGNED-RV32-NEXT: addi sp, sp, -16 +; CHECK-UNALIGNED-RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-UNALIGNED-RV32-NEXT: li a2, 4 +; CHECK-UNALIGNED-RV32-NEXT: call memcmp +; CHECK-UNALIGNED-RV32-NEXT: slti a0, a0, 0 +; CHECK-UNALIGNED-RV32-NEXT: xori a0, a0, 1 +; CHECK-UNALIGNED-RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; CHECK-UNALIGNED-RV32-NEXT: addi sp, sp, 16 +; CHECK-UNALIGNED-RV32-NEXT: ret +; +; CHECK-UNALIGNED-RV64-LABEL: memcmp_ge_zero: +; CHECK-UNALIGNED-RV64: # %bb.0: # %entry +; CHECK-UNALIGNED-RV64-NEXT: addi sp, sp, -16 +; CHECK-UNALIGNED-RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-UNALIGNED-RV64-NEXT: li a2, 4 +; CHECK-UNALIGNED-RV64-NEXT: call memcmp +; CHECK-UNALIGNED-RV64-NEXT: slti a0, a0, 0 +; CHECK-UNALIGNED-RV64-NEXT: xori a0, a0, 1 +; CHECK-UNALIGNED-RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; CHECK-UNALIGNED-RV64-NEXT: addi sp, sp, 16 +; CHECK-UNALIGNED-RV64-NEXT: ret +; +; CHECK-UNALIGNED-RV32-ZBB-LABEL: memcmp_ge_zero: +; CHECK-UNALIGNED-RV32-ZBB: # %bb.0: # %entry +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a0, 0(a0) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a1, 0(a1) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sub a0, a0, a2 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: slti a0, a0, 0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: xori a0, a0, 1 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret +; +; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_ge_zero: +; CHECK-UNALIGNED-RV64-ZBB: # %bb.0: # %entry +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw 
a0, 0(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a1, 0(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a0, a0, 32 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a1, a1, 32 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a0, a2 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: slti a0, a0, 0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: xori a0, a0, 1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret +; +; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_ge_zero: +; CHECK-UNALIGNED-RV32-ZBKB: # %bb.0: # %entry +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a0, 0(a0) +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a1, 0(a1) +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sub a0, a0, a2 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: slti a0, a0, 0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: xori a0, a0, 1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret +; +; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_ge_zero: +; CHECK-UNALIGNED-RV64-ZBKB: # %bb.0: # %entry +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a0, 0(a0) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a1, 0(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a0, a0, 32 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a1, a1, 32 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a0, a2 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: slti a0, a0, 0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: xori a0, a0, 1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret +; +; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_ge_zero: +; CHECK-UNALIGNED-RV32-V: # %bb.0: # %entry +; CHECK-UNALIGNED-RV32-V-NEXT: addi sp, sp, -16 +; CHECK-UNALIGNED-RV32-V-NEXT: sw ra, 12(sp) # 
4-byte Folded Spill +; CHECK-UNALIGNED-RV32-V-NEXT: li a2, 4 +; CHECK-UNALIGNED-RV32-V-NEXT: call memcmp +; CHECK-UNALIGNED-RV32-V-NEXT: slti a0, a0, 0 +; CHECK-UNALIGNED-RV32-V-NEXT: xori a0, a0, 1 +; CHECK-UNALIGNED-RV32-V-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; CHECK-UNALIGNED-RV32-V-NEXT: addi sp, sp, 16 +; CHECK-UNALIGNED-RV32-V-NEXT: ret +; +; CHECK-UNALIGNED-RV64-V-LABEL: memcmp_ge_zero: +; CHECK-UNALIGNED-RV64-V: # %bb.0: # %entry +; CHECK-UNALIGNED-RV64-V-NEXT: addi sp, sp, -16 +; CHECK-UNALIGNED-RV64-V-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-UNALIGNED-RV64-V-NEXT: li a2, 4 +; CHECK-UNALIGNED-RV64-V-NEXT: call memcmp +; CHECK-UNALIGNED-RV64-V-NEXT: slti a0, a0, 0 +; CHECK-UNALIGNED-RV64-V-NEXT: xori a0, a0, 1 +; CHECK-UNALIGNED-RV64-V-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; CHECK-UNALIGNED-RV64-V-NEXT: addi sp, sp, 16 +; CHECK-UNALIGNED-RV64-V-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iXLen 4) + %ret = icmp sgt i32 %memcmp, -1 + ret i1 %ret +} ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; CHECK-ALIGNED: {{.*}} diff --git a/llvm/test/CodeGen/X86/memcmp.ll b/llvm/test/CodeGen/X86/memcmp.ll index 014db33160606..e744d2a06e55f 100644 --- a/llvm/test/CodeGen/X86/memcmp.ll +++ b/llvm/test/CodeGen/X86/memcmp.ll @@ -260,6 +260,44 @@ define i1 @length4_gt(ptr %X, ptr %Y) nounwind { ret i1 %c } +define i1 @length4_le(ptr %X, ptr %Y) nounwind { +; X64-LABEL: length4_le: +; X64: # %bb.0: +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax +; X64-NEXT: bswapl %ecx +; X64-NEXT: cmpl %ecx, %eax +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setle %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind + %c = icmp slt i32 %m, 1 + ret i1 %c +} + +define i1 @length4_ge(ptr %X, ptr %Y) nounwind { +; X64-LABEL: length4_ge: +; X64: # %bb.0: +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax +; X64-NEXT: bswapl %ecx +; X64-NEXT: cmpl %ecx, %eax +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setns %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind + %c = icmp sgt i32 %m, -1 + ret i1 %c +} + define i1 @length4_eq_const(ptr %X) nounwind { ; X64-LABEL: length4_eq_const: ; X64: # %bb.0: @@ -279,13 +317,13 @@ define i32 @length5(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapl %ecx ; X64-NEXT: bswapl %edx ; X64-NEXT: cmpl %edx, %ecx -; X64-NEXT: jne .LBB18_3 +; X64-NEXT: jne .LBB20_3 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movzbl 4(%rdi), %eax ; X64-NEXT: movzbl 4(%rsi), %ecx ; X64-NEXT: subl %ecx, %eax ; X64-NEXT: retq -; X64-NEXT: .LBB18_3: # %res_block +; X64-NEXT: .LBB20_3: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl %edx, %ecx ; X64-NEXT: sbbl %eax, %eax @@ -319,7 +357,7 @@ define i1 @length5_lt(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapl %ecx ; 
X64-NEXT: bswapl %edx ; X64-NEXT: cmpl %edx, %ecx -; X64-NEXT: jne .LBB20_3 +; X64-NEXT: jne .LBB22_3 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movzbl 4(%rdi), %eax ; X64-NEXT: movzbl 4(%rsi), %ecx @@ -327,7 +365,7 @@ define i1 @length5_lt(ptr %X, ptr %Y) nounwind { ; X64-NEXT: shrl $31, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq -; X64-NEXT: .LBB20_3: # %res_block +; X64-NEXT: .LBB22_3: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl %edx, %ecx ; X64-NEXT: sbbl %eax, %eax @@ -348,7 +386,7 @@ define i32 @length7(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapl %ecx ; X64-NEXT: bswapl %edx ; X64-NEXT: cmpl %edx, %ecx -; X64-NEXT: jne .LBB21_2 +; X64-NEXT: jne .LBB23_2 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movl 3(%rdi), %ecx ; X64-NEXT: movl 3(%rsi), %edx @@ -356,13 +394,13 @@ define i32 @length7(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapl %edx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl %edx, %ecx -; X64-NEXT: je .LBB21_3 -; X64-NEXT: .LBB21_2: # %res_block +; X64-NEXT: je .LBB23_3 +; X64-NEXT: .LBB23_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl %edx, %ecx ; X64-NEXT: sbbl %eax, %eax ; X64-NEXT: orl $1, %eax -; X64-NEXT: .LBB21_3: # %endblock +; X64-NEXT: .LBB23_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind ret i32 %m @@ -376,7 +414,7 @@ define i1 @length7_lt(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapl %ecx ; X64-NEXT: bswapl %edx ; X64-NEXT: cmpl %edx, %ecx -; X64-NEXT: jne .LBB22_2 +; X64-NEXT: jne .LBB24_2 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movl 3(%rdi), %ecx ; X64-NEXT: movl 3(%rsi), %edx @@ -384,13 +422,13 @@ define i1 @length7_lt(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapl %edx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl %edx, %ecx -; X64-NEXT: je .LBB22_3 -; X64-NEXT: .LBB22_2: # %res_block +; X64-NEXT: je .LBB24_3 +; X64-NEXT: .LBB24_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl %edx, %ecx ; X64-NEXT: sbbl %eax, %eax ; 
X64-NEXT: orl $1, %eax -; X64-NEXT: .LBB22_3: # %endblock +; X64-NEXT: .LBB24_3: # %endblock ; X64-NEXT: shrl $31, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq @@ -524,7 +562,7 @@ define i32 @length12(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB31_2 +; X64-NEXT: jne .LBB33_2 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movl 8(%rdi), %ecx ; X64-NEXT: movl 8(%rsi), %edx @@ -532,13 +570,13 @@ define i32 @length12(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapl %edx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je .LBB31_3 -; X64-NEXT: .LBB31_2: # %res_block +; X64-NEXT: je .LBB33_3 +; X64-NEXT: .LBB33_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: sbbl %eax, %eax ; X64-NEXT: orl $1, %eax -; X64-NEXT: .LBB31_3: # %endblock +; X64-NEXT: .LBB33_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind ret i32 %m @@ -582,7 +620,7 @@ define i32 @length15(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB34_2 +; X64-NEXT: jne .LBB36_2 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movq 7(%rdi), %rcx ; X64-NEXT: movq 7(%rsi), %rdx @@ -590,13 +628,13 @@ define i32 @length15(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapq %rdx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je .LBB34_3 -; X64-NEXT: .LBB34_2: # %res_block +; X64-NEXT: je .LBB36_3 +; X64-NEXT: .LBB36_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: sbbl %eax, %eax ; X64-NEXT: orl $1, %eax -; X64-NEXT: .LBB34_3: # %endblock +; X64-NEXT: .LBB36_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 15) nounwind ret i32 %m @@ -610,7 +648,7 @@ define i1 @length15_lt(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; 
X64-NEXT: jne .LBB35_2 +; X64-NEXT: jne .LBB37_2 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movq 7(%rdi), %rcx ; X64-NEXT: movq 7(%rsi), %rdx @@ -618,13 +656,13 @@ define i1 @length15_lt(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapq %rdx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je .LBB35_3 -; X64-NEXT: .LBB35_2: # %res_block +; X64-NEXT: je .LBB37_3 +; X64-NEXT: .LBB37_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: sbbl %eax, %eax ; X64-NEXT: orl $1, %eax -; X64-NEXT: .LBB35_3: # %endblock +; X64-NEXT: .LBB37_3: # %endblock ; X64-NEXT: shrl $31, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq @@ -640,20 +678,20 @@ define i32 @length15_const(ptr %X, ptr %Y) nounwind { ; X64-NEXT: movq (%rdi), %rdx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rcx, %rdx -; X64-NEXT: jne .LBB36_2 +; X64-NEXT: jne .LBB38_2 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movabsq $4051322327650219061, %rcx # imm = 0x3839303132333435 ; X64-NEXT: movq 7(%rdi), %rdx ; X64-NEXT: bswapq %rdx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rcx, %rdx -; X64-NEXT: je .LBB36_3 -; X64-NEXT: .LBB36_2: # %res_block +; X64-NEXT: je .LBB38_3 +; X64-NEXT: .LBB38_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rcx, %rdx ; X64-NEXT: sbbl %eax, %eax ; X64-NEXT: orl $1, %eax -; X64-NEXT: .LBB36_3: # %endblock +; X64-NEXT: .LBB38_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 15) nounwind ret i32 %m @@ -681,20 +719,20 @@ define i1 @length15_gt_const(ptr %X, ptr %Y) nounwind { ; X64-NEXT: movq (%rdi), %rcx ; X64-NEXT: bswapq %rcx ; X64-NEXT: cmpq %rax, %rcx -; X64-NEXT: jne .LBB38_2 +; X64-NEXT: jne .LBB40_2 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movabsq $4051322327650219061, %rax # imm = 0x3839303132333435 ; X64-NEXT: movq 7(%rdi), %rcx ; X64-NEXT: bswapq %rcx ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpq %rax, %rcx -; 
X64-NEXT: je .LBB38_3 -; X64-NEXT: .LBB38_2: # %res_block +; X64-NEXT: je .LBB40_3 +; X64-NEXT: .LBB40_2: # %res_block ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpq %rax, %rcx ; X64-NEXT: sbbl %edx, %edx ; X64-NEXT: orl $1, %edx -; X64-NEXT: .LBB38_3: # %endblock +; X64-NEXT: .LBB40_3: # %endblock ; X64-NEXT: testl %edx, %edx ; X64-NEXT: setg %al ; X64-NEXT: retq @@ -713,7 +751,7 @@ define i32 @length16(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB39_2 +; X64-NEXT: jne .LBB41_2 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rcx ; X64-NEXT: movq 8(%rsi), %rdx @@ -721,13 +759,13 @@ define i32 @length16(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapq %rdx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je .LBB39_3 -; X64-NEXT: .LBB39_2: # %res_block +; X64-NEXT: je .LBB41_3 +; X64-NEXT: .LBB41_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: sbbl %eax, %eax ; X64-NEXT: orl $1, %eax -; X64-NEXT: .LBB39_3: # %endblock +; X64-NEXT: .LBB41_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 16) nounwind ret i32 %m @@ -783,7 +821,7 @@ define i1 @length16_lt(ptr %x, ptr %y) nounwind { ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB41_2 +; X64-NEXT: jne .LBB43_2 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rcx ; X64-NEXT: movq 8(%rsi), %rdx @@ -791,13 +829,13 @@ define i1 @length16_lt(ptr %x, ptr %y) nounwind { ; X64-NEXT: bswapq %rdx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je .LBB41_3 -; X64-NEXT: .LBB41_2: # %res_block +; X64-NEXT: je .LBB43_3 +; X64-NEXT: .LBB43_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: sbbl %eax, %eax ; X64-NEXT: orl $1, %eax -; X64-NEXT: .LBB41_3: # %endblock +; X64-NEXT: .LBB43_3: # %endblock ; X64-NEXT: shrl $31, %eax ; X64-NEXT: # kill: def $al 
killed $al killed $eax ; X64-NEXT: retq @@ -814,7 +852,7 @@ define i1 @length16_gt(ptr %x, ptr %y) nounwind { ; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx ; X64-NEXT: cmpq %rcx, %rax -; X64-NEXT: jne .LBB42_2 +; X64-NEXT: jne .LBB44_2 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rax ; X64-NEXT: movq 8(%rsi), %rcx @@ -822,13 +860,13 @@ define i1 @length16_gt(ptr %x, ptr %y) nounwind { ; X64-NEXT: bswapq %rcx ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpq %rcx, %rax -; X64-NEXT: je .LBB42_3 -; X64-NEXT: .LBB42_2: # %res_block +; X64-NEXT: je .LBB44_3 +; X64-NEXT: .LBB44_2: # %res_block ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: sbbl %edx, %edx ; X64-NEXT: orl $1, %edx -; X64-NEXT: .LBB42_3: # %endblock +; X64-NEXT: .LBB44_3: # %endblock ; X64-NEXT: testl %edx, %edx ; X64-NEXT: setg %al ; X64-NEXT: retq From 6d321530af6e83e51c2ed08463593af07ead9448 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 3 Jan 2025 09:15:27 -0800 Subject: [PATCH 381/567] [CG][RISCV]Add more RVV tests with exact vlen and linear/quadratic number of shuffles --- .../rvv/fixed-vectors-shuffle-exact-vlen.ll | 88 +++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll index f0ee780137300..bb05eb5368ae9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll @@ -312,3 +312,91 @@ define <4 x double> @shuffles_add(<4 x double> %0, <4 x double> %1) vscale_range ret <4 x double> %5 } +define <16 x i32> @m4_square_num_of_shuffles_in_chunks(<16 x i32> %0) vscale_range(2,2) { +; CHECK-LABEL: m4_square_num_of_shuffles_in_chunks: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a0, %hi(.LCPI17_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI17_0) +; CHECK-NEXT: vl1r.v v12, (a0) +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: 
vsext.vf2 v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vrgatherei16.vv v12, v8, v16 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <16 x i32> %0, <16 x i32> poison, <16 x i32> + ret <16 x i32> %1 +} + +define <16 x i32> @m4_linear_num_of_shuffles_in_chunks(<16 x i32> %0) vscale_range(2,2) { +; CHECK-LABEL: m4_linear_num_of_shuffles_in_chunks: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a0, %hi(.LCPI18_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI18_0) +; CHECK-NEXT: vl2re16.v v16, (a0) +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vrgatherei16.vv v12, v8, v16 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <16 x i32> %0, <16 x i32> poison, <16 x i32> + ret <16 x i32> %1 +} + +define i64 @multi_chunks_shuffle(<32 x i32> %0) vscale_range(8,8) { +; RV32-LABEL: multi_chunks_shuffle: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetivli zero, 16, e32, m1, ta, ma +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: li a0, 32 +; RV32-NEXT: li a1, 63 +; RV32-NEXT: vwsubu.vx v12, v10, a0 +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32-NEXT: vmv.v.x v10, a0 +; RV32-NEXT: lui a0, 61681 +; RV32-NEXT: addi a0, a0, -241 +; RV32-NEXT: vand.vx v12, v12, a1 +; RV32-NEXT: vand.vx v10, v10, a1 +; RV32-NEXT: vsrl.vv v12, v8, v12 +; RV32-NEXT: vsll.vv v8, v8, v10 +; RV32-NEXT: vmv.s.x v0, a0 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vmerge.vvm v8, v10, v8, v0 +; RV32-NEXT: vrgather.vi v10, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV32-NEXT: vslidedown.vi v8, v8, 1 +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: srai a1, a0, 31 +; RV32-NEXT: ret +; +; RV64-LABEL: multi_chunks_shuffle: +; RV64: # %bb.0: # %entry +; RV64-NEXT: li a0, 32 +; RV64-NEXT: vsetivli zero, 16, e64, m2, ta, ma +; RV64-NEXT: vsrl.vx v10, v8, a0 +; RV64-NEXT: vsll.vx v8, v8, a0 +; 
RV64-NEXT: lui a0, 61681 +; RV64-NEXT: addi a0, a0, -241 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vmv.s.x v0, a0 +; RV64-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vmerge.vvm v8, v10, v8, v0 +; RV64-NEXT: vrgather.vi v10, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64-NEXT: vslidedown.vi v8, v8, 1 +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: ret +entry: + %1 = shufflevector <32 x i32> %0, <32 x i32> zeroinitializer, <32 x i32> + %2 = shufflevector <32 x i32> zeroinitializer, <32 x i32> %1, <32 x i32> + %3 = or <32 x i32> %1, %2 + %4 = extractelement <32 x i32> %3, i64 1 + %conv199 = sext i32 %4 to i64 + ret i64 %conv199 +} From d37aa5135c732b37ae3daab9d9bdcc4c45f7a17d Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Fri, 3 Jan 2025 13:09:23 -0500 Subject: [PATCH 382/567] [AMDGPU][True16][MC] true16 for v_not_b16 (#120659) Support true16 format for v_not_b16 in MC --- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 2 +- llvm/test/MC/AMDGPU/gfx11_asm_vop1.s | 75 +++++---- llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s | 65 ++++---- llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s | 21 ++- llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s | 42 +++++ .../MC/AMDGPU/gfx11_asm_vop1_t16_promote.s | 154 +++++++++++++----- .../AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s | 65 ++++---- .../MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s | 21 ++- .../test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s | 69 ++++---- llvm/test/MC/AMDGPU/gfx12_asm_vop1.s | 74 +++++---- llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s | 62 +++---- llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s | 18 +- llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s | 42 +++++ .../MC/AMDGPU/gfx12_asm_vop1_t16_promote.s | 154 +++++++++++++----- .../test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s | 69 ++++---- .../AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s | 65 ++++---- .../MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s | 21 ++- .../Disassembler/AMDGPU/gfx11_dasm_vop1.txt | 63 +++++-- 
.../AMDGPU/gfx11_dasm_vop1_dpp16.txt | 54 ++++-- .../AMDGPU/gfx11_dasm_vop1_dpp8.txt | 17 +- .../gfx11_dasm_vop3_dpp16_from_vop1.txt | 54 ++++-- .../AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt | 18 +- .../AMDGPU/gfx11_dasm_vop3_from_vop1.txt | 57 +++++-- .../AMDGPU/gfx12_dasm_vop1_dpp16.txt | 50 ++++-- .../AMDGPU/gfx12_dasm_vop1_dpp8.txt | 14 +- .../AMDGPU/gfx12_dasm_vop3_from_vop1.txt | 57 +++++-- .../gfx12_dasm_vop3_from_vop1_dpp16.txt | 54 ++++-- .../AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt | 18 +- 28 files changed, 1020 insertions(+), 455 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 92ebd0e10c8fd..30911d45c9e97 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -1018,7 +1018,7 @@ defm V_CLS_I32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x03b, defm V_SWAP_B16 : VOP1Only_Real_gfx11_gfx12<0x066>; defm V_PERMLANE64_B32 : VOP1Only_Real_gfx11_gfx12<0x067>; defm V_MOV_B16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x01c, "v_mov_b16">; -defm V_NOT_B16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x069, "v_not_b16">; +defm V_NOT_B16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x069, "v_not_b16">; defm V_CVT_I32_I16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x06a, "v_cvt_i32_i16">; defm V_CVT_U32_U16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x06b, "v_cvt_u32_u16">; diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s index fe08042ae5c84..5ceb8ed0065d3 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s @@ -2684,50 +2684,65 @@ v_movrelsd_b32 v255, v255 v_nop // GFX11: v_nop ; encoding: [0x00,0x00,0x00,0x7e] -v_not_b16 v5, v1 -// GFX11: v_not_b16_e32 v5, v1 ; encoding: [0x01,0xd3,0x0a,0x7e] +v_not_b16 v5.l, v1.l +// GFX11: v_not_b16_e32 v5.l, v1.l ; encoding: [0x01,0xd3,0x0a,0x7e] -v_not_b16 v5, v127 -// GFX11: v_not_b16_e32 v5, v127 ; encoding: [0x7f,0xd3,0x0a,0x7e] +v_not_b16 v5.l, 
v127.l +// GFX11: v_not_b16_e32 v5.l, v127.l ; encoding: [0x7f,0xd3,0x0a,0x7e] -v_not_b16 v5, s1 -// GFX11: v_not_b16_e32 v5, s1 ; encoding: [0x01,0xd2,0x0a,0x7e] +v_not_b16 v5.l, s1 +// GFX11: v_not_b16_e32 v5.l, s1 ; encoding: [0x01,0xd2,0x0a,0x7e] -v_not_b16 v5, s105 -// GFX11: v_not_b16_e32 v5, s105 ; encoding: [0x69,0xd2,0x0a,0x7e] +v_not_b16 v5.l, s105 +// GFX11: v_not_b16_e32 v5.l, s105 ; encoding: [0x69,0xd2,0x0a,0x7e] -v_not_b16 v5, vcc_lo -// GFX11: v_not_b16_e32 v5, vcc_lo ; encoding: [0x6a,0xd2,0x0a,0x7e] +v_not_b16 v5.l, vcc_lo +// GFX11: v_not_b16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xd2,0x0a,0x7e] -v_not_b16 v5, vcc_hi -// GFX11: v_not_b16_e32 v5, vcc_hi ; encoding: [0x6b,0xd2,0x0a,0x7e] +v_not_b16 v5.l, vcc_hi +// GFX11: v_not_b16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xd2,0x0a,0x7e] -v_not_b16 v5, ttmp15 -// GFX11: v_not_b16_e32 v5, ttmp15 ; encoding: [0x7b,0xd2,0x0a,0x7e] +v_not_b16 v5.l, ttmp15 +// GFX11: v_not_b16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xd2,0x0a,0x7e] -v_not_b16 v5, m0 -// GFX11: v_not_b16_e32 v5, m0 ; encoding: [0x7d,0xd2,0x0a,0x7e] +v_not_b16 v5.l, m0 +// GFX11: v_not_b16_e32 v5.l, m0 ; encoding: [0x7d,0xd2,0x0a,0x7e] -v_not_b16 v5, exec_lo -// GFX11: v_not_b16_e32 v5, exec_lo ; encoding: [0x7e,0xd2,0x0a,0x7e] +v_not_b16 v5.l, exec_lo +// GFX11: v_not_b16_e32 v5.l, exec_lo ; encoding: [0x7e,0xd2,0x0a,0x7e] -v_not_b16 v5, exec_hi -// GFX11: v_not_b16_e32 v5, exec_hi ; encoding: [0x7f,0xd2,0x0a,0x7e] +v_not_b16 v5.l, exec_hi +// GFX11: v_not_b16_e32 v5.l, exec_hi ; encoding: [0x7f,0xd2,0x0a,0x7e] -v_not_b16 v5, null -// GFX11: v_not_b16_e32 v5, null ; encoding: [0x7c,0xd2,0x0a,0x7e] +v_not_b16 v5.l, null +// GFX11: v_not_b16_e32 v5.l, null ; encoding: [0x7c,0xd2,0x0a,0x7e] -v_not_b16 v5, -1 -// GFX11: v_not_b16_e32 v5, -1 ; encoding: [0xc1,0xd2,0x0a,0x7e] +v_not_b16 v5.l, -1 +// GFX11: v_not_b16_e32 v5.l, -1 ; encoding: [0xc1,0xd2,0x0a,0x7e] -v_not_b16 v5, 0.5 -// GFX11: v_not_b16_e32 v5, 0.5 ; encoding: [0xf0,0xd2,0x0a,0x7e] +v_not_b16 
v5.l, 0.5 +// GFX11: v_not_b16_e32 v5.l, 0.5 ; encoding: [0xf0,0xd2,0x0a,0x7e] -v_not_b16 v5, src_scc -// GFX11: v_not_b16_e32 v5, src_scc ; encoding: [0xfd,0xd2,0x0a,0x7e] +v_not_b16 v5.l, src_scc +// GFX11: v_not_b16_e32 v5.l, src_scc ; encoding: [0xfd,0xd2,0x0a,0x7e] -v_not_b16 v127, 0xfe0b -// GFX11: v_not_b16_e32 v127, 0xfe0b ; encoding: [0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +v_not_b16 v127.l, 0xfe0b +// GFX11: v_not_b16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +v_not_b16 v5.l, v1.h +// GFX11: v_not_b16_e32 v5.l, v1.h ; encoding: [0x81,0xd3,0x0a,0x7e] + +v_not_b16 v5.l, v127.h +// GFX11: v_not_b16_e32 v5.l, v127.h ; encoding: [0xff,0xd3,0x0a,0x7e] + +v_not_b16 v127.l, 0.5 +// GFX11: v_not_b16_e32 v127.l, 0.5 ; encoding: [0xf0,0xd2,0xfe,0x7e] + +v_not_b16 v5.h, src_scc +// GFX11: v_not_b16_e32 v5.h, src_scc ; encoding: [0xfd,0xd2,0x0a,0x7f] + +v_not_b16 v127.h, 0xfe0b +// GFX11: v_not_b16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xd2,0xfe,0x7f,0x0b,0xfe,0x00,0x00] v_not_b32 v5, v1 // GFX11: v_not_b32_e32 v5, v1 ; encoding: [0x01,0x6f,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s index f5cf3fd390c7d..4d1bd99b90252 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s @@ -2144,47 +2144,56 @@ v_movrelsd_b32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_movrelsd_b32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_movrelsd_b32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x88,0xfe,0x7f,0xff,0x6f,0x05,0x30] -v_not_b16 v5, v1 quad_perm:[3,2,1,0] -// GFX11: v_not_b16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff] +v_not_b16 v5.l, v1.l quad_perm:[3,2,1,0] +// GFX11: v_not_b16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff] -v_not_b16 v5, v1 quad_perm:[0,1,2,3] -// GFX11: v_not_b16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff] +v_not_b16 v5.l, v1.l quad_perm:[0,1,2,3] +// GFX11: v_not_b16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff] -v_not_b16 v5, v1 row_mirror -// GFX11: v_not_b16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff] +v_not_b16 v5.l, v1.l row_mirror +// GFX11: v_not_b16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff] -v_not_b16 v5, v1 row_half_mirror -// GFX11: v_not_b16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff] +v_not_b16 v5.l, v1.l row_half_mirror +// GFX11: v_not_b16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff] -v_not_b16 v5, v1 row_shl:1 -// GFX11: v_not_b16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff] +v_not_b16 v5.l, v1.l row_shl:1 +// GFX11: v_not_b16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff] -v_not_b16 v5, v1 row_shl:15 -// GFX11: v_not_b16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff] +v_not_b16 v5.l, v1.l row_shl:15 +// GFX11: v_not_b16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff] -v_not_b16 v5, v1 row_shr:1 -// GFX11: v_not_b16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff] +v_not_b16 v5.l, v1.l row_shr:1 +// GFX11: v_not_b16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff] -v_not_b16 v5, v1 
row_shr:15 -// GFX11: v_not_b16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff] +v_not_b16 v5.l, v1.l row_shr:15 +// GFX11: v_not_b16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff] -v_not_b16 v5, v1 row_ror:1 -// GFX11: v_not_b16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff] +v_not_b16 v5.l, v1.l row_ror:1 +// GFX11: v_not_b16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff] -v_not_b16 v5, v1 row_ror:15 -// GFX11: v_not_b16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff] +v_not_b16 v5.l, v1.l row_ror:15 +// GFX11: v_not_b16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff] -v_not_b16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_not_b16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff] +v_not_b16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_not_b16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff] -v_not_b16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_not_b16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01] +v_not_b16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_not_b16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01] -v_not_b16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_not_b16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x09,0x13] +v_not_b16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX11: v_not_b16_dpp v5.l, v1.l 
row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x09,0x13] -v_not_b16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_not_b16_dpp v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x05,0x30] +v_not_b16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX11: v_not_b16_dpp v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x05,0x30] + +v_not_b16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_not_b16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x5f,0x01,0x01] + +v_not_b16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_not_b16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd2,0x0a,0x7f,0x81,0x60,0x09,0x13] + +v_not_b16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_not_b16_dpp v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7f,0xff,0x6f,0x05,0x30] v_not_b32 v5, v1 quad_perm:[3,2,1,0] // GFX11: v_not_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s index 5a0ffd04bc5c1..2799ea7b8ef8b 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s @@ -500,14 +500,23 @@ v_movrelsd_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_movrelsd_b32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_movrelsd_b32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x88,0xfe,0x7f,0xff,0x00,0x00,0x00] -v_not_b16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_not_b16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_not_b16 v5.l, 
v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_not_b16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_not_b16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_not_b16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_not_b16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_not_b16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_not_b16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_not_b16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +v_not_b16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX11: v_not_b16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +v_not_b16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_not_b16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0xfe,0x7e,0x7f,0x77,0x39,0x05] + +v_not_b16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_not_b16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd2,0x0a,0x7f,0x81,0x77,0x39,0x05] + +v_not_b16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_not_b16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd2,0xfe,0x7f,0xff,0x00,0x00,0x00] v_not_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_not_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6e,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s index 92882cb89e201..caa73b7b9f047 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s @@ -584,6 +584,12 @@ v_log_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0] v_not_b16_e32 v128, 0xfe0b // GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction +v_not_b16_e32 v128.h, 0xfe0b +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_not_b16_e32 v128.l, 0xfe0b +// 
GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + v_not_b16_e32 v255, v1 // GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction @@ -593,6 +599,24 @@ v_not_b16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0] v_not_b16_e32 v255, v1 quad_perm:[3,2,1,0] // GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction +v_not_b16_e32 v255.h, v1.h +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_not_b16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_not_b16_e32 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_not_b16_e32 v255.l, v1.l +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_not_b16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_not_b16_e32 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + v_not_b16_e32 v5, v199 // GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction @@ -602,6 +626,24 @@ v_not_b16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_not_b16_e32 v5, v199 quad_perm:[3,2,1,0] // GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction +v_not_b16_e32 v5.h, v199.h +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_not_b16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_not_b16_e32 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_not_b16_e32 v5.l, v199.l +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_not_b16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_not_b16_e32 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + v_rcp_f16_e32 v128.h, 0xfe0b // GFX11: :[[@LINE-1]]:15: error: 
invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s index d97c8ed844dbb..0dd1bf6142189 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s @@ -1538,71 +1538,137 @@ v_log_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_log_f16 v5, v199 quad_perm:[3,2,1,0] // GFX11: v_log_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] -v_not_b16 v128, 0xfe0b -// GFX11: v_not_b16_e64 v128, 0xfe0b ; encoding: [0x80,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_not_b16 v128.h, 0xfe0b +// GFX11: v_not_b16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_not_b16 v255, -1 -// GFX11: v_not_b16_e64 v255, -1 ; encoding: [0xff,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00] +v_not_b16 v128.l, 0xfe0b +// GFX11: v_not_b16_e64 v128.l, 0xfe0b ; encoding: [0x80,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_not_b16 v255, 0.5 -// GFX11: v_not_b16_e64 v255, 0.5 ; encoding: [0xff,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00] +v_not_b16 v255.h, -1 +// GFX11: v_not_b16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0xc1,0x00,0x00,0x00] -v_not_b16 v255, exec_hi -// GFX11: v_not_b16_e64 v255, exec_hi ; encoding: [0xff,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00] +v_not_b16 v255.h, 0.5 +// GFX11: v_not_b16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0xf0,0x00,0x00,0x00] -v_not_b16 v255, exec_lo -// GFX11: v_not_b16_e64 v255, exec_lo ; encoding: [0xff,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00] +v_not_b16 v255.h, exec_hi +// GFX11: v_not_b16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x7f,0x00,0x00,0x00] -v_not_b16 v255, m0 -// GFX11: v_not_b16_e64 v255, m0 ; encoding: [0xff,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00] +v_not_b16 v255.h, exec_lo +// GFX11: v_not_b16_e64 v255.h, 
exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x7e,0x00,0x00,0x00] -v_not_b16 v255, null -// GFX11: v_not_b16_e64 v255, null ; encoding: [0xff,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00] +v_not_b16 v255.h, m0 +// GFX11: v_not_b16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x7d,0x00,0x00,0x00] -v_not_b16 v255, s1 -// GFX11: v_not_b16_e64 v255, s1 ; encoding: [0xff,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00] +v_not_b16 v255.h, null +// GFX11: v_not_b16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x7c,0x00,0x00,0x00] -v_not_b16 v255, s105 -// GFX11: v_not_b16_e64 v255, s105 ; encoding: [0xff,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00] +v_not_b16 v255.h, s1 +// GFX11: v_not_b16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x01,0x00,0x00,0x00] -v_not_b16 v255, src_scc -// GFX11: v_not_b16_e64 v255, src_scc ; encoding: [0xff,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00] +v_not_b16 v255.h, s105 +// GFX11: v_not_b16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x69,0x00,0x00,0x00] -v_not_b16 v255, ttmp15 -// GFX11: v_not_b16_e64 v255, ttmp15 ; encoding: [0xff,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00] +v_not_b16 v255.h, src_scc +// GFX11: v_not_b16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0xfd,0x00,0x00,0x00] -v_not_b16 v255, v1 -// GFX11: v_not_b16_e64 v255, v1 ; encoding: [0xff,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00] +v_not_b16 v255.h, ttmp15 +// GFX11: v_not_b16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x7b,0x00,0x00,0x00] -v_not_b16 v255, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_not_b16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_not_b16 v255.h, v1.h +// GFX11: v_not_b16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe9,0xd5,0x01,0x01,0x00,0x00] -v_not_b16 v255, v1 quad_perm:[3,2,1,0] -// GFX11: v_not_b16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_not_b16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_not_b16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_not_b16 v255, v127 -// GFX11: v_not_b16_e64 v255, v127 ; encoding: [0xff,0x00,0xe9,0xd5,0x7f,0x01,0x00,0x00] +v_not_b16 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX11: v_not_b16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_not_b16 v255, v127 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_not_b16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] +v_not_b16 v255.h, v127.h +// GFX11: v_not_b16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe9,0xd5,0x7f,0x01,0x00,0x00] -v_not_b16 v255, v127 quad_perm:[3,2,1,0] -// GFX11: v_not_b16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] +v_not_b16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_not_b16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] -v_not_b16 v255, vcc_hi -// GFX11: v_not_b16_e64 v255, vcc_hi ; encoding: [0xff,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00] +v_not_b16 v255.h, v127.h quad_perm:[3,2,1,0] +// GFX11: v_not_b16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] -v_not_b16 v255, vcc_lo -// GFX11: v_not_b16_e64 v255, vcc_lo ; encoding: [0xff,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00] +v_not_b16 v255.h, vcc_hi +// GFX11: v_not_b16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x6b,0x00,0x00,0x00] -v_not_b16 v5, v199 -// GFX11: v_not_b16_e64 v5, v199 ; encoding: 
[0x05,0x00,0xe9,0xd5,0xc7,0x01,0x00,0x00] +v_not_b16 v255.h, vcc_lo +// GFX11: v_not_b16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x6a,0x00,0x00,0x00] -v_not_b16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_not_b16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] +v_not_b16 v255.l, -1 +// GFX11: v_not_b16_e64 v255.l, -1 ; encoding: [0xff,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00] -v_not_b16 v5, v199 quad_perm:[3,2,1,0] -// GFX11: v_not_b16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] +v_not_b16 v255.l, 0.5 +// GFX11: v_not_b16_e64 v255.l, 0.5 ; encoding: [0xff,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00] + +v_not_b16 v255.l, exec_hi +// GFX11: v_not_b16_e64 v255.l, exec_hi ; encoding: [0xff,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00] + +v_not_b16 v255.l, exec_lo +// GFX11: v_not_b16_e64 v255.l, exec_lo ; encoding: [0xff,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00] + +v_not_b16 v255.l, m0 +// GFX11: v_not_b16_e64 v255.l, m0 ; encoding: [0xff,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00] + +v_not_b16 v255.l, null +// GFX11: v_not_b16_e64 v255.l, null ; encoding: [0xff,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00] + +v_not_b16 v255.l, s1 +// GFX11: v_not_b16_e64 v255.l, s1 ; encoding: [0xff,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00] + +v_not_b16 v255.l, s105 +// GFX11: v_not_b16_e64 v255.l, s105 ; encoding: [0xff,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00] + +v_not_b16 v255.l, src_scc +// GFX11: v_not_b16_e64 v255.l, src_scc ; encoding: [0xff,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00] + +v_not_b16 v255.l, ttmp15 +// GFX11: v_not_b16_e64 v255.l, ttmp15 ; encoding: [0xff,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00] + +v_not_b16 v255.l, v1.l +// GFX11: v_not_b16_e64 v255.l, v1.l ; encoding: [0xff,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00] + +v_not_b16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_not_b16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_not_b16 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX11: v_not_b16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_not_b16 v255.l, v127.l +// GFX11: v_not_b16_e64 v255.l, v127.l ; encoding: [0xff,0x00,0xe9,0xd5,0x7f,0x01,0x00,0x00] + +v_not_b16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_not_b16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] + +v_not_b16 v255.l, v127.l quad_perm:[3,2,1,0] +// GFX11: v_not_b16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] + +v_not_b16 v255.l, vcc_hi +// GFX11: v_not_b16_e64 v255.l, vcc_hi ; encoding: [0xff,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00] + +v_not_b16 v255.l, vcc_lo +// GFX11: v_not_b16_e64 v255.l, vcc_lo ; encoding: [0xff,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00] + +v_not_b16 v5.h, v199.h +// GFX11: v_not_b16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe9,0xd5,0xc7,0x01,0x00,0x00] + +v_not_b16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_not_b16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_not_b16 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX11: v_not_b16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] + +v_not_b16 v5.l, v199.l +// GFX11: v_not_b16_e64 v5.l, v199.l ; encoding: [0x05,0x00,0xe9,0xd5,0xc7,0x01,0x00,0x00] + +v_not_b16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_not_b16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_not_b16 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX11: v_not_b16_e64_dpp v5.l, v199.l 
quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] v_rcp_f16 v128, 0xfe0b // GFX11: v_rcp_f16_e64 v128, 0xfe0b ; encoding: [0x80,0x00,0xd4,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s index 6176baf11c552..8de72e74c2856 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s @@ -2248,47 +2248,56 @@ v_movrelsd_b32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl: v_movrelsd_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_movrelsd_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xc4,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] -v_not_b16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX11: v_not_b16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_not_b16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] +// GFX11: v_not_b16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_not_b16_e64_dpp v5, v1 quad_perm:[0,1,2,3] -// GFX11: v_not_b16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_not_b16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] +// GFX11: v_not_b16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] -v_not_b16_e64_dpp v5, v1 row_mirror -// GFX11: v_not_b16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_not_b16_e64_dpp v5.l, v1.l row_mirror +// GFX11: v_not_b16_e64_dpp v5.l, v1.l 
row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] -v_not_b16_e64_dpp v5, v1 row_half_mirror -// GFX11: v_not_b16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_not_b16_e64_dpp v5.l, v1.l row_half_mirror +// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] -v_not_b16_e64_dpp v5, v1 row_shl:1 -// GFX11: v_not_b16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_not_b16_e64_dpp v5.l, v1.l row_shl:1 +// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] -v_not_b16_e64_dpp v5, v1 row_shl:15 -// GFX11: v_not_b16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_not_b16_e64_dpp v5.l, v1.l row_shl:15 +// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] -v_not_b16_e64_dpp v5, v1 row_shr:1 -// GFX11: v_not_b16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_not_b16_e64_dpp v5.l, v1.l row_shr:1 +// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] -v_not_b16_e64_dpp v5, v1 row_shr:15 -// GFX11: v_not_b16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_not_b16_e64_dpp v5.l, v1.l row_shr:15 +// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] -v_not_b16_e64_dpp v5, v1 row_ror:1 -// GFX11: v_not_b16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_not_b16_e64_dpp v5.l, v1.l row_ror:1 +// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] -v_not_b16_e64_dpp v5, v1 row_ror:15 -// GFX11: v_not_b16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_not_b16_e64_dpp v5.l, v1.l row_ror:15 +// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] -v_not_b16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_not_b16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_not_b16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] -v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] +v_not_b16_e64_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] -v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13] +v_not_b16_e64_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 
bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13] -v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] +v_not_b16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_not_b16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] + +v_not_b16_e64_dpp v5.h, v1.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: [0x05,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] + +v_not_b16_e64_dpp v5.l, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: [0x05,0x08,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13] + +v_not_b16_e64_dpp v255.h, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: [0xff,0x40,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] v_not_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX11: v_not_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s index f3c8c8a69fbe5..182a13831ec6d 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s @@ -640,14 +640,23 @@ v_movrelsd_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_movrelsd_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_movrelsd_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xc4,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] -v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: 
v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +v_not_b16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_not_b16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] + +v_not_b16_e64_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: [0x05,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_not_b16_e64_dpp v5.l, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: [0x05,0x08,0xe9,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_not_b16_e64_dpp v255.h, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: [0xff,0x40,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] v_not_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_not_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb7,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s index 9020017c86106..17678e3bd9f08 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s @@ -2674,50 +2674,59 @@ v_movrelsd_b32_e64 v255, v255 
v_nop_e64 // GFX11: v_nop ; encoding: [0x00,0x00,0x80,0xd5,0x00,0x00,0x00,0x00] -v_not_b16_e64 v5, v1 -// GFX11: v_not_b16_e64 v5, v1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00] +v_not_b16_e64 v5.l, v1.l +// GFX11: v_not_b16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00] -v_not_b16_e64 v5, v255 -// GFX11: v_not_b16_e64 v5, v255 ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00] +v_not_b16_e64 v5.l, v255.l +// GFX11: v_not_b16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00] -v_not_b16_e64 v5, s1 -// GFX11: v_not_b16_e64 v5, s1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00] +v_not_b16_e64 v5.l, s1 +// GFX11: v_not_b16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00] -v_not_b16_e64 v5, s105 -// GFX11: v_not_b16_e64 v5, s105 ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00] +v_not_b16_e64 v5.l, s105 +// GFX11: v_not_b16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00] -v_not_b16_e64 v5, vcc_lo -// GFX11: v_not_b16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00] +v_not_b16_e64 v5.l, vcc_lo +// GFX11: v_not_b16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00] -v_not_b16_e64 v5, vcc_hi -// GFX11: v_not_b16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00] +v_not_b16_e64 v5.l, vcc_hi +// GFX11: v_not_b16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00] -v_not_b16_e64 v5, ttmp15 -// GFX11: v_not_b16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00] +v_not_b16_e64 v5.l, ttmp15 +// GFX11: v_not_b16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00] -v_not_b16_e64 v5, m0 -// GFX11: v_not_b16_e64 v5, m0 ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00] +v_not_b16_e64 v5.l, m0 +// GFX11: v_not_b16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00] -v_not_b16_e64 v5, exec_lo -// GFX11: v_not_b16_e64 v5, exec_lo ; encoding: 
[0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00] +v_not_b16_e64 v5.l, exec_lo +// GFX11: v_not_b16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00] -v_not_b16_e64 v5, exec_hi -// GFX11: v_not_b16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00] +v_not_b16_e64 v5.l, exec_hi +// GFX11: v_not_b16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00] -v_not_b16_e64 v5, null -// GFX11: v_not_b16_e64 v5, null ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00] +v_not_b16_e64 v5.l, null +// GFX11: v_not_b16_e64 v5.l, null ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00] -v_not_b16_e64 v5, -1 -// GFX11: v_not_b16_e64 v5, -1 ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00] +v_not_b16_e64 v5.l, -1 +// GFX11: v_not_b16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00] -v_not_b16_e64 v5, 0.5 -// GFX11: v_not_b16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00] +v_not_b16_e64 v5.l, 0.5 +// GFX11: v_not_b16_e64 v5.l, 0.5 ; encoding: [0x05,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00] -v_not_b16_e64 v5, src_scc -// GFX11: v_not_b16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00] +v_not_b16_e64 v5.l, src_scc +// GFX11: v_not_b16_e64 v5.l, src_scc ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00] -v_not_b16_e64 v255, 0xfe0b -// GFX11: v_not_b16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_not_b16_e64 v255.l, 0xfe0b +// GFX11: v_not_b16_e64 v255.l, 0xfe0b ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] + +v_not_b16_e64 v5.h, v1.h +// GFX11: [0x05,0x48,0xe9,0xd5,0x01,0x01,0x00,0x00] + +v_not_b16_e64 v5.l, v255.h +// GFX11: [0x05,0x08,0xe9,0xd5,0xff,0x01,0x00,0x00] + +v_not_b16_e64 v255.h, 0xfe0b +// GFX11: [0xff,0x40,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] v_not_b32_e64 v5, v1 // GFX11: v_not_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xb7,0xd5,0x01,0x01,0x00,0x00] diff --git 
a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s index b125821d1306e..4f82643fd4886 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s @@ -2759,51 +2759,63 @@ v_movrelsd_b32 v255, v255 v_nop // GFX12: v_nop ; encoding: [0x00,0x00,0x00,0x7e] -v_not_b16 v5, v1 -// GFX12: v_not_b16_e32 v5, v1 ; encoding: [0x01,0xd3,0x0a,0x7e] +v_not_b16 v5.l, v1.l +// GFX12: v_not_b16_e32 v5.l, v1.l ; encoding: [0x01,0xd3,0x0a,0x7e] -v_not_b16 v5, v127 -// GFX12: v_not_b16_e32 v5, v127 ; encoding: [0x7f,0xd3,0x0a,0x7e] +v_not_b16 v5.l, v127.l +// GFX12: v_not_b16_e32 v5.l, v127.l ; encoding: [0x7f,0xd3,0x0a,0x7e] -v_not_b16 v5, s1 -// GFX12: v_not_b16_e32 v5, s1 ; encoding: [0x01,0xd2,0x0a,0x7e] +v_not_b16 v5.l, s1 +// GFX12: v_not_b16_e32 v5.l, s1 ; encoding: [0x01,0xd2,0x0a,0x7e] -v_not_b16 v5, s105 -// GFX12: v_not_b16_e32 v5, s105 ; encoding: [0x69,0xd2,0x0a,0x7e] +v_not_b16 v5.l, s105 +// GFX12: v_not_b16_e32 v5.l, s105 ; encoding: [0x69,0xd2,0x0a,0x7e] -v_not_b16 v5, vcc_lo -// GFX12: v_not_b16_e32 v5, vcc_lo ; encoding: [0x6a,0xd2,0x0a,0x7e] +v_not_b16 v5.l, vcc_lo +// GFX12: v_not_b16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xd2,0x0a,0x7e] -v_not_b16 v5, vcc_hi -// GFX12: v_not_b16_e32 v5, vcc_hi ; encoding: [0x6b,0xd2,0x0a,0x7e] +v_not_b16 v5.l, vcc_hi +// GFX12: v_not_b16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xd2,0x0a,0x7e] -v_not_b16 v5, ttmp15 -// GFX12: v_not_b16_e32 v5, ttmp15 ; encoding: [0x7b,0xd2,0x0a,0x7e] +v_not_b16 v5.l, ttmp15 +// GFX12: v_not_b16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xd2,0x0a,0x7e] -v_not_b16 v5, m0 -// GFX12: v_not_b16_e32 v5, m0 ; encoding: [0x7d,0xd2,0x0a,0x7e] +v_not_b16 v5.l, m0 +// GFX12: v_not_b16_e32 v5.l, m0 ; encoding: [0x7d,0xd2,0x0a,0x7e] -v_not_b16 v5, exec_lo -// GFX12: v_not_b16_e32 v5, exec_lo ; encoding: [0x7e,0xd2,0x0a,0x7e] +v_not_b16 v5.l, exec_lo +// GFX12: v_not_b16_e32 v5.l, exec_lo ; encoding: [0x7e,0xd2,0x0a,0x7e] -v_not_b16 v5, exec_hi -// GFX12: v_not_b16_e32 
v5, exec_hi ; encoding: [0x7f,0xd2,0x0a,0x7e] +v_not_b16 v5.l, exec_hi +// GFX12: v_not_b16_e32 v5.l, exec_hi ; encoding: [0x7f,0xd2,0x0a,0x7e] -v_not_b16 v5, null -// GFX12: v_not_b16_e32 v5, null ; encoding: [0x7c,0xd2,0x0a,0x7e] +v_not_b16 v5.l, null +// GFX12: v_not_b16_e32 v5.l, null ; encoding: [0x7c,0xd2,0x0a,0x7e] -v_not_b16 v5, -1 -// GFX12: v_not_b16_e32 v5, -1 ; encoding: [0xc1,0xd2,0x0a,0x7e] +v_not_b16 v5.l, -1 +// GFX12: v_not_b16_e32 v5.l, -1 ; encoding: [0xc1,0xd2,0x0a,0x7e] -v_not_b16 v5, 0.5 -// GFX12-ASM: v_not_b16_e32 v5, 0.5 ; encoding: [0xf0,0xd2,0x0a,0x7e] -// GFX12-DIS: v_not_b16_e32 v5, 0x3800 ; encoding: [0xff,0xd2,0x0a,0x7e,0x00,0x38,0x00,0x00] +v_not_b16 v5.l, 0.5 +// GFX12-ASM: v_not_b16_e32 v5.l, 0.5 ; encoding: [0xf0,0xd2,0x0a,0x7e] +// GFX12-DIS: v_not_b16_e32 v5.l, 0x3800 ; encoding: [0xff,0xd2,0x0a,0x7e,0x00,0x38,0x00,0x00] -v_not_b16 v5, src_scc -// GFX12: v_not_b16_e32 v5, src_scc ; encoding: [0xfd,0xd2,0x0a,0x7e] +v_not_b16 v5.l, src_scc +// GFX12: v_not_b16_e32 v5.l, src_scc ; encoding: [0xfd,0xd2,0x0a,0x7e] -v_not_b16 v127, 0xfe0b -// GFX12: v_not_b16_e32 v127, 0xfe0b ; encoding: [0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +v_not_b16 v127.l, 0xfe0b +// GFX12: v_not_b16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +v_not_b16 v5.l, v1.h +// GFX12: v_not_b16_e32 v5.l, v1.h ; encoding: [0x81,0xd3,0x0a,0x7e] + +v_not_b16 v5.l, v127.h +// GFX12: v_not_b16_e32 v5.l, v127.h ; encoding: [0xff,0xd3,0x0a,0x7e] + +v_not_b16 v5.h, src_scc +// GFX12: v_not_b16_e32 v5.h, src_scc ; encoding: [0xfd,0xd2,0x0a,0x7f] + +v_not_b16 v127.h, 0xfe0b +// GFX12: v_not_b16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xd2,0xfe,0x7f,0x0b,0xfe,0x00,0x00] v_not_b32 v5, v1 // GFX12: v_not_b32_e32 v5, v1 ; encoding: [0x01,0x6f,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s index a625326c1dae4..2b3a52cf4e804 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s +++ 
b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s @@ -2212,47 +2212,53 @@ v_movrelsd_b32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_movrelsd_b32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_movrelsd_b32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x88,0xfe,0x7f,0xff,0x6f,0x05,0x30] -v_not_b16 v5, v1 quad_perm:[3,2,1,0] -// GFX12: v_not_b16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff] +v_not_b16 v5.l, v1.l quad_perm:[3,2,1,0] +// GFX12: v_not_b16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff] -v_not_b16 v5, v1 quad_perm:[0,1,2,3] -// GFX12: v_not_b16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff] +v_not_b16 v5.l, v1.l quad_perm:[0,1,2,3] +// GFX12: v_not_b16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff] -v_not_b16 v5, v1 row_mirror -// GFX12: v_not_b16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff] +v_not_b16 v5.l, v1.l row_mirror +// GFX12: v_not_b16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff] -v_not_b16 v5, v1 row_half_mirror -// GFX12: v_not_b16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff] +v_not_b16 v5.l, v1.l row_half_mirror +// GFX12: v_not_b16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff] -v_not_b16 v5, v1 row_shl:1 -// GFX12: v_not_b16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff] +v_not_b16 v5.l, v1.l row_shl:1 +// GFX12: v_not_b16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff] -v_not_b16 v5, v1 row_shl:15 -// GFX12: v_not_b16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff] +v_not_b16 v5.l, v1.l row_shl:15 +// GFX12: v_not_b16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff] -v_not_b16 v5, v1 row_shr:1 -// GFX12: v_not_b16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff] +v_not_b16 v5.l, v1.l row_shr:1 +// GFX12: v_not_b16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff] -v_not_b16 v5, v1 row_shr:15 -// GFX12: v_not_b16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff] +v_not_b16 v5.l, v1.l row_shr:15 +// GFX12: v_not_b16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff] -v_not_b16 v5, v1 row_ror:1 -// GFX12: v_not_b16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff] +v_not_b16 v5.l, v1.l row_ror:1 +// GFX12: v_not_b16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff] -v_not_b16 v5, v1 row_ror:15 -// GFX12: v_not_b16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff] +v_not_b16 v5.l, v1.l row_ror:15 +// GFX12: v_not_b16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff] -v_not_b16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_not_b16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff] +v_not_b16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_not_b16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff] -v_not_b16 v5, v1 
row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_not_b16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01] +v_not_b16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_not_b16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01] -v_not_b16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_not_b16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x09,0x13] +v_not_b16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_not_b16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x09,0x13] -v_not_b16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_not_b16_dpp v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x05,0x30] +v_not_b16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_not_b16_dpp v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x05,0x30] + +v_not_b16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_not_b16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd2,0x0a,0x7f,0x81,0x60,0x09,0x13] + +v_not_b16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_not_b16_dpp v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7f,0xff,0x6f,0x05,0x30] v_not_b32 v5, v1 quad_perm:[3,2,1,0] // GFX12: v_not_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s index 9281d6fb16ce8..977d5b08b80ee 100644 --- 
a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s @@ -529,14 +529,20 @@ v_movrelsd_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_movrelsd_b32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_movrelsd_b32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x88,0xfe,0x7f,0xff,0x00,0x00,0x00] -v_not_b16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_not_b16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_not_b16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_not_b16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_not_b16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_not_b16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_not_b16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_not_b16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_not_b16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_not_b16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +v_not_b16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_not_b16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +v_not_b16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_not_b16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd2,0x0a,0x7f,0x81,0x77,0x39,0x05] + +v_not_b16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_not_b16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd2,0xfe,0x7f,0xff,0x00,0x00,0x00] v_not_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_not_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6e,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s index 33a5dded095c7..1b6734a6a652b 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s +++ 
b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s @@ -536,6 +536,12 @@ v_log_f16_e32 v5, v199 quad_perm:[3,2,1,0] v_not_b16_e32 v128, 0xfe0b // GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction +v_not_b16_e32 v128.h, 0xfe0b +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_not_b16_e32 v128.l, 0xfe0b +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + v_not_b16_e32 v255, v1 // GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction @@ -545,6 +551,24 @@ v_not_b16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0] v_not_b16_e32 v255, v1 quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction +v_not_b16_e32 v255.h, v1.h +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_not_b16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_not_b16_e32 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_not_b16_e32 v255.l, v1.l +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_not_b16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_not_b16_e32 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + v_not_b16_e32 v5, v199 // GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction @@ -554,6 +578,24 @@ v_not_b16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_not_b16_e32 v5, v199 quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction +v_not_b16_e32 v5.h, v199.h +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_not_b16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_not_b16_e32 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_not_b16_e32 v5.l, v199.l +// GFX12: :[[@LINE-1]]:21: error: 
invalid operand for instruction + +v_not_b16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_not_b16_e32 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + v_rcp_f16_e32 v128, 0xfe0b // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s index 03519d43c49a9..9d36ea0b9f479 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s @@ -1498,71 +1498,137 @@ v_log_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_log_f16 v5, v199 quad_perm:[3,2,1,0] // GFX12: v_log_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] -v_not_b16 v128, 0xfe0b -// GFX12: v_not_b16_e64 v128, 0xfe0b ; encoding: [0x80,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_not_b16 v128.h, 0xfe0b +// GFX12: v_not_b16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_not_b16 v255, -1 -// GFX12: v_not_b16_e64 v255, -1 ; encoding: [0xff,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00] +v_not_b16 v128.l, 0xfe0b +// GFX12: v_not_b16_e64 v128.l, 0xfe0b ; encoding: [0x80,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_not_b16 v255, 0.5 -// GFX12: v_not_b16_e64 v255, 0.5 ; encoding: [0xff,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00] +v_not_b16 v255.h, -1 +// GFX12: v_not_b16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0xc1,0x00,0x00,0x00] -v_not_b16 v255, exec_hi -// GFX12: v_not_b16_e64 v255, exec_hi ; encoding: [0xff,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00] +v_not_b16 v255.h, 0.5 +// GFX12: v_not_b16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0xf0,0x00,0x00,0x00] -v_not_b16 v255, exec_lo -// GFX12: v_not_b16_e64 v255, exec_lo ; encoding: 
[0xff,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00] +v_not_b16 v255.h, exec_hi +// GFX12: v_not_b16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x7f,0x00,0x00,0x00] -v_not_b16 v255, m0 -// GFX12: v_not_b16_e64 v255, m0 ; encoding: [0xff,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00] +v_not_b16 v255.h, exec_lo +// GFX12: v_not_b16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x7e,0x00,0x00,0x00] -v_not_b16 v255, null -// GFX12: v_not_b16_e64 v255, null ; encoding: [0xff,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00] +v_not_b16 v255.h, m0 +// GFX12: v_not_b16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x7d,0x00,0x00,0x00] -v_not_b16 v255, s1 -// GFX12: v_not_b16_e64 v255, s1 ; encoding: [0xff,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00] +v_not_b16 v255.h, null +// GFX12: v_not_b16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x7c,0x00,0x00,0x00] -v_not_b16 v255, s105 -// GFX12: v_not_b16_e64 v255, s105 ; encoding: [0xff,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00] +v_not_b16 v255.h, s1 +// GFX12: v_not_b16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x01,0x00,0x00,0x00] -v_not_b16 v255, src_scc -// GFX12: v_not_b16_e64 v255, src_scc ; encoding: [0xff,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00] +v_not_b16 v255.h, s105 +// GFX12: v_not_b16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x69,0x00,0x00,0x00] -v_not_b16 v255, ttmp15 -// GFX12: v_not_b16_e64 v255, ttmp15 ; encoding: [0xff,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00] +v_not_b16 v255.h, src_scc +// GFX12: v_not_b16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0xfd,0x00,0x00,0x00] -v_not_b16 v255, v1 -// GFX12: v_not_b16_e64 v255, v1 ; encoding: [0xff,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00] +v_not_b16 v255.h, ttmp15 +// GFX12: v_not_b16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x7b,0x00,0x00,0x00] -v_not_b16 v255, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_not_b16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_not_b16 v255.h, v1.h +// GFX12: v_not_b16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe9,0xd5,0x01,0x01,0x00,0x00] -v_not_b16 v255, v1 quad_perm:[3,2,1,0] -// GFX12: v_not_b16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_not_b16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_not_b16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_not_b16 v255, v127 -// GFX12: v_not_b16_e64 v255, v127 ; encoding: [0xff,0x00,0xe9,0xd5,0x7f,0x01,0x00,0x00] +v_not_b16 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX12: v_not_b16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_not_b16 v255, v127 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_not_b16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] +v_not_b16 v255.h, v127.h +// GFX12: v_not_b16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe9,0xd5,0x7f,0x01,0x00,0x00] -v_not_b16 v255, v127 quad_perm:[3,2,1,0] -// GFX12: v_not_b16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] +v_not_b16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_not_b16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] -v_not_b16 v255, vcc_hi -// GFX12: v_not_b16_e64 v255, vcc_hi ; encoding: [0xff,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00] +v_not_b16 v255.h, v127.h quad_perm:[3,2,1,0] +// GFX12: v_not_b16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] -v_not_b16 v255, 
vcc_lo -// GFX12: v_not_b16_e64 v255, vcc_lo ; encoding: [0xff,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00] +v_not_b16 v255.h, vcc_hi +// GFX12: v_not_b16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x6b,0x00,0x00,0x00] -v_not_b16 v5, v199 -// GFX12: v_not_b16_e64 v5, v199 ; encoding: [0x05,0x00,0xe9,0xd5,0xc7,0x01,0x00,0x00] +v_not_b16 v255.h, vcc_lo +// GFX12: v_not_b16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x6a,0x00,0x00,0x00] -v_not_b16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_not_b16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] +v_not_b16 v255.l, -1 +// GFX12: v_not_b16_e64 v255.l, -1 ; encoding: [0xff,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00] -v_not_b16 v5, v199 quad_perm:[3,2,1,0] -// GFX12: v_not_b16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] +v_not_b16 v255.l, 0.5 +// GFX12: v_not_b16_e64 v255.l, 0.5 ; encoding: [0xff,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00] + +v_not_b16 v255.l, exec_hi +// GFX12: v_not_b16_e64 v255.l, exec_hi ; encoding: [0xff,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00] + +v_not_b16 v255.l, exec_lo +// GFX12: v_not_b16_e64 v255.l, exec_lo ; encoding: [0xff,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00] + +v_not_b16 v255.l, m0 +// GFX12: v_not_b16_e64 v255.l, m0 ; encoding: [0xff,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00] + +v_not_b16 v255.l, null +// GFX12: v_not_b16_e64 v255.l, null ; encoding: [0xff,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00] + +v_not_b16 v255.l, s1 +// GFX12: v_not_b16_e64 v255.l, s1 ; encoding: [0xff,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00] + +v_not_b16 v255.l, s105 +// GFX12: v_not_b16_e64 v255.l, s105 ; encoding: [0xff,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00] + +v_not_b16 v255.l, src_scc +// GFX12: v_not_b16_e64 v255.l, src_scc ; encoding: [0xff,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00] + +v_not_b16 v255.l, ttmp15 +// GFX12: v_not_b16_e64 v255.l, ttmp15 ; encoding: 
[0xff,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00] + +v_not_b16 v255.l, v1.l +// GFX12: v_not_b16_e64 v255.l, v1.l ; encoding: [0xff,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00] + +v_not_b16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_not_b16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_not_b16 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX12: v_not_b16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_not_b16 v255.l, v127.l +// GFX12: v_not_b16_e64 v255.l, v127.l ; encoding: [0xff,0x00,0xe9,0xd5,0x7f,0x01,0x00,0x00] + +v_not_b16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_not_b16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] + +v_not_b16 v255.l, v127.l quad_perm:[3,2,1,0] +// GFX12: v_not_b16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] + +v_not_b16 v255.l, vcc_hi +// GFX12: v_not_b16_e64 v255.l, vcc_hi ; encoding: [0xff,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00] + +v_not_b16 v255.l, vcc_lo +// GFX12: v_not_b16_e64 v255.l, vcc_lo ; encoding: [0xff,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00] + +v_not_b16 v5.h, v199.h +// GFX12: v_not_b16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe9,0xd5,0xc7,0x01,0x00,0x00] + +v_not_b16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_not_b16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_not_b16 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX12: v_not_b16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] + +v_not_b16 v5.l, v199.l +// GFX12: v_not_b16_e64 v5.l, v199.l ; encoding: [0x05,0x00,0xe9,0xd5,0xc7,0x01,0x00,0x00] 
+ +v_not_b16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_not_b16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_not_b16 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX12: v_not_b16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] v_rcp_f16 v128, 0xfe0b // GFX12: v_rcp_f16_e64 v128, 0xfe0b ; encoding: [0x80,0x00,0xd4,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s index e2fe08ddc8b06..71c12a1333ebc 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s @@ -2824,50 +2824,59 @@ v_movrelsd_b32_e64 v255, v255 v_nop_e64 // GFX12: v_nop ; encoding: [0x00,0x00,0x80,0xd5,0x00,0x00,0x00,0x00] -v_not_b16_e64 v5, v1 -// GFX12: v_not_b16_e64 v5, v1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00] +v_not_b16_e64 v5.l, v1.l +// GFX12: v_not_b16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00] -v_not_b16_e64 v5, v255 -// GFX12: v_not_b16_e64 v5, v255 ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00] +v_not_b16_e64 v5.l, v255.l +// GFX12: v_not_b16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00] -v_not_b16_e64 v5, s1 -// GFX12: v_not_b16_e64 v5, s1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00] +v_not_b16_e64 v5.l, s1 +// GFX12: v_not_b16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00] -v_not_b16_e64 v5, s105 -// GFX12: v_not_b16_e64 v5, s105 ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00] +v_not_b16_e64 v5.l, s105 +// GFX12: v_not_b16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00] -v_not_b16_e64 v5, vcc_lo -// GFX12: v_not_b16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00] +v_not_b16_e64 v5.l, vcc_lo +// GFX12: v_not_b16_e64 
v5.l, vcc_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00] -v_not_b16_e64 v5, vcc_hi -// GFX12: v_not_b16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00] +v_not_b16_e64 v5.l, vcc_hi +// GFX12: v_not_b16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00] -v_not_b16_e64 v5, ttmp15 -// GFX12: v_not_b16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00] +v_not_b16_e64 v5.l, ttmp15 +// GFX12: v_not_b16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00] -v_not_b16_e64 v5, m0 -// GFX12: v_not_b16_e64 v5, m0 ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00] +v_not_b16_e64 v5.l, m0 +// GFX12: v_not_b16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00] -v_not_b16_e64 v5, exec_lo -// GFX12: v_not_b16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00] +v_not_b16_e64 v5.l, exec_lo +// GFX12: v_not_b16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00] -v_not_b16_e64 v5, exec_hi -// GFX12: v_not_b16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00] +v_not_b16_e64 v5.l, exec_hi +// GFX12: v_not_b16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00] -v_not_b16_e64 v5, null -// GFX12: v_not_b16_e64 v5, null ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00] +v_not_b16_e64 v5.l, null +// GFX12: v_not_b16_e64 v5.l, null ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00] -v_not_b16_e64 v5, -1 -// GFX12: v_not_b16_e64 v5, -1 ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00] +v_not_b16_e64 v5.l, -1 +// GFX12: v_not_b16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00] -v_not_b16_e64 v5, 0.5 -// GFX12: v_not_b16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00] +v_not_b16_e64 v5.l, 0.5 +// GFX12: v_not_b16_e64 v5.l, 0.5 ; encoding: [0x05,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00] -v_not_b16_e64 v5, src_scc -// GFX12: v_not_b16_e64 v5, src_scc ; encoding: 
[0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00] +v_not_b16_e64 v5.l, src_scc +// GFX12: v_not_b16_e64 v5.l, src_scc ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00] -v_not_b16_e64 v255, 0xfe0b -// GFX12: v_not_b16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_not_b16_e64 v255.l, 0xfe0b +// GFX12: v_not_b16_e64 v255.l, 0xfe0b ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] + +v_not_b16_e64 v5.h, v1.h +// GFX12: v_not_b16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe9,0xd5,0x01,0x01,0x00,0x00] + +v_not_b16_e64 v5.l, v255.h +// GFX12: v_not_b16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xe9,0xd5,0xff,0x01,0x00,0x00] + +v_not_b16_e64 v255.h, 0xfe0b +// GFX12: v_not_b16_e64 v255.h, 0xfe0b op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] v_not_b32_e64 v5, v1 // GFX12: v_not_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xb7,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s index 3fff2749e6e99..42166032124a3 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s @@ -2128,47 +2128,56 @@ v_movrels_b32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_movrels_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_movrels_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xc3,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] -v_not_b16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX12: v_not_b16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_not_b16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] +// GFX12: v_not_b16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_not_b16_e64_dpp v5, v1 quad_perm:[0,1,2,3] -// GFX12: v_not_b16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_not_b16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] +// GFX12: v_not_b16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] -v_not_b16_e64_dpp v5, v1 row_mirror -// GFX12: v_not_b16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_not_b16_e64_dpp v5.l, v1.l row_mirror +// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] -v_not_b16_e64_dpp v5, v1 row_half_mirror -// GFX12: v_not_b16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_not_b16_e64_dpp v5.l, v1.l row_half_mirror +// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] -v_not_b16_e64_dpp v5, v1 row_shl:1 -// GFX12: v_not_b16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_not_b16_e64_dpp v5.l, v1.l row_shl:1 +// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] -v_not_b16_e64_dpp v5, v1 row_shl:15 -// GFX12: v_not_b16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_not_b16_e64_dpp v5.l, v1.l row_shl:15 +// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] -v_not_b16_e64_dpp v5, v1 row_shr:1 -// GFX12: v_not_b16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_not_b16_e64_dpp v5.l, v1.l row_shr:1 +// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] -v_not_b16_e64_dpp v5, v1 row_shr:15 -// GFX12: v_not_b16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_not_b16_e64_dpp v5.l, v1.l row_shr:15 +// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] -v_not_b16_e64_dpp v5, v1 row_ror:1 -// GFX12: v_not_b16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_not_b16_e64_dpp v5.l, v1.l row_ror:1 +// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] -v_not_b16_e64_dpp v5, v1 row_ror:15 -// GFX12: v_not_b16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_not_b16_e64_dpp v5.l, v1.l row_ror:15 +// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] -v_not_b16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_not_b16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_not_b16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] -v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] +v_not_b16_e64_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] -v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13] +v_not_b16_e64_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13] -v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] +v_not_b16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_not_b16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] + +v_not_b16_e64_dpp v5.h, v1.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_not_b16_e64_dpp v5.h, v1.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] + +v_not_b16_e64_dpp v5.l, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_not_b16_e64_dpp v5.l, v1.h op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: 
[0x05,0x08,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13] + +v_not_b16_e64_dpp v255.h, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_not_b16_e64_dpp v255.h, v255.l op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x40,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] v_not_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX12: v_not_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s index e4ae0ad655518..d65d2004fc1e7 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s @@ -619,14 +619,23 @@ v_movrels_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_movrels_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_movrels_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xc3,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] -v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: 
[0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +v_not_b16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_not_b16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] + +v_not_b16_e64_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_not_b16_e64_dpp v5.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_not_b16_e64_dpp v5.l, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_not_b16_e64_dpp v5.l, v1.h op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x08,0xe9,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_not_b16_e64_dpp v255.h, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_not_b16_e64_dpp v255.h, v255.l op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x40,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] v_not_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_not_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb7,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt index 8cf2c2b4f2d1e..38c573a19ba00 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt @@ -2638,49 +2638,82 @@ # GFX11: v_nop ; encoding: [0x00,0x00,0x00,0x7e] 0x01,0xd3,0x0a,0x7e -# GFX11: v_not_b16_e32 v5, v1 ; encoding: [0x01,0xd3,0x0a,0x7e] +# GFX11-REAL16: v_not_b16_e32 v5.l, v1.l ; encoding: [0x01,0xd3,0x0a,0x7e] +# GFX11-FAKE16: v_not_b16_e32 v5, v1 ; encoding: [0x01,0xd3,0x0a,0x7e] 0x7f,0xd3,0x0a,0x7e -# GFX11: v_not_b16_e32 v5, v127 ; encoding: [0x7f,0xd3,0x0a,0x7e] +# GFX11-REAL16: v_not_b16_e32 v5.l, v127.l ; encoding: [0x7f,0xd3,0x0a,0x7e] +# GFX11-FAKE16: v_not_b16_e32 v5, v127 ; encoding: [0x7f,0xd3,0x0a,0x7e] 0x01,0xd2,0x0a,0x7e -# GFX11: v_not_b16_e32 v5, s1 ; encoding: [0x01,0xd2,0x0a,0x7e] +# 
GFX11-REAL16: v_not_b16_e32 v5.l, s1 ; encoding: [0x01,0xd2,0x0a,0x7e] +# GFX11-FAKE16: v_not_b16_e32 v5, s1 ; encoding: [0x01,0xd2,0x0a,0x7e] 0x69,0xd2,0x0a,0x7e -# GFX11: v_not_b16_e32 v5, s105 ; encoding: [0x69,0xd2,0x0a,0x7e] +# GFX11-REAL16: v_not_b16_e32 v5.l, s105 ; encoding: [0x69,0xd2,0x0a,0x7e] +# GFX11-FAKE16: v_not_b16_e32 v5, s105 ; encoding: [0x69,0xd2,0x0a,0x7e] 0x6a,0xd2,0x0a,0x7e -# GFX11: v_not_b16_e32 v5, vcc_lo ; encoding: [0x6a,0xd2,0x0a,0x7e] +# GFX11-REAL16: v_not_b16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xd2,0x0a,0x7e] +# GFX11-FAKE16: v_not_b16_e32 v5, vcc_lo ; encoding: [0x6a,0xd2,0x0a,0x7e] 0x6b,0xd2,0x0a,0x7e -# GFX11: v_not_b16_e32 v5, vcc_hi ; encoding: [0x6b,0xd2,0x0a,0x7e] +# GFX11-REAL16: v_not_b16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xd2,0x0a,0x7e] +# GFX11-FAKE16: v_not_b16_e32 v5, vcc_hi ; encoding: [0x6b,0xd2,0x0a,0x7e] 0x7b,0xd2,0x0a,0x7e -# GFX11: v_not_b16_e32 v5, ttmp15 ; encoding: [0x7b,0xd2,0x0a,0x7e] +# GFX11-REAL16: v_not_b16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xd2,0x0a,0x7e] +# GFX11-FAKE16: v_not_b16_e32 v5, ttmp15 ; encoding: [0x7b,0xd2,0x0a,0x7e] 0x7d,0xd2,0x0a,0x7e -# GFX11: v_not_b16_e32 v5, m0 ; encoding: [0x7d,0xd2,0x0a,0x7e] +# GFX11-REAL16: v_not_b16_e32 v5.l, m0 ; encoding: [0x7d,0xd2,0x0a,0x7e] +# GFX11-FAKE16: v_not_b16_e32 v5, m0 ; encoding: [0x7d,0xd2,0x0a,0x7e] 0x7e,0xd2,0x0a,0x7e -# GFX11: v_not_b16_e32 v5, exec_lo ; encoding: [0x7e,0xd2,0x0a,0x7e] +# GFX11-REAL16: v_not_b16_e32 v5.l, exec_lo ; encoding: [0x7e,0xd2,0x0a,0x7e] +# GFX11-FAKE16: v_not_b16_e32 v5, exec_lo ; encoding: [0x7e,0xd2,0x0a,0x7e] 0x7f,0xd2,0x0a,0x7e -# GFX11: v_not_b16_e32 v5, exec_hi ; encoding: [0x7f,0xd2,0x0a,0x7e] +# GFX11-REAL16: v_not_b16_e32 v5.l, exec_hi ; encoding: [0x7f,0xd2,0x0a,0x7e] +# GFX11-FAKE16: v_not_b16_e32 v5, exec_hi ; encoding: [0x7f,0xd2,0x0a,0x7e] 0x7c,0xd2,0x0a,0x7e -# GFX11: v_not_b16_e32 v5, null ; encoding: [0x7c,0xd2,0x0a,0x7e] +# GFX11-REAL16: v_not_b16_e32 v5.l, null ; encoding: [0x7c,0xd2,0x0a,0x7e] +# 
GFX11-FAKE16: v_not_b16_e32 v5, null ; encoding: [0x7c,0xd2,0x0a,0x7e] 0xc1,0xd2,0x0a,0x7e -# GFX11: v_not_b16_e32 v5, -1 ; encoding: [0xc1,0xd2,0x0a,0x7e] +# GFX11-REAL16: v_not_b16_e32 v5.l, -1 ; encoding: [0xc1,0xd2,0x0a,0x7e] +# GFX11-FAKE16: v_not_b16_e32 v5, -1 ; encoding: [0xc1,0xd2,0x0a,0x7e] 0xf0,0xd2,0x0a,0x7e -# GFX11: v_not_b16_e32 v5, 0x3800 ; encoding: [0xff,0xd2,0x0a,0x7e,0x00,0x38,0x00,0x00] +# GFX11-REAL16: v_not_b16_e32 v5.l, 0x3800 ; encoding: [0xff,0xd2,0x0a,0x7e,0x00,0x38,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e32 v5, 0x3800 ; encoding: [0xff,0xd2,0x0a,0x7e,0x00,0x38,0x00,0x00] 0xfd,0xd2,0x0a,0x7e -# GFX11: v_not_b16_e32 v5, src_scc ; encoding: [0xfd,0xd2,0x0a,0x7e] +# GFX11-REAL16: v_not_b16_e32 v5.l, src_scc ; encoding: [0xfd,0xd2,0x0a,0x7e] +# GFX11-FAKE16: v_not_b16_e32 v5, src_scc ; encoding: [0xfd,0xd2,0x0a,0x7e] 0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00 -# GFX11: v_not_b16_e32 v127, 0xfe0b ; encoding: [0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_not_b16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e32 v127, 0xfe0b ; encoding: [0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +0x81,0xd3,0x0a,0x7e +# GFX11-REAL16: v_not_b16_e32 v5.l, v1.h ; encoding: [0x81,0xd3,0x0a,0x7e] +# GFX11-FAKE16: v_not_b16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0x81,0xd3,0x0a,0x7e] + +0xff,0xd3,0x0a,0x7e +# GFX11-REAL16: v_not_b16_e32 v5.l, v127.h ; encoding: [0xff,0xd3,0x0a,0x7e] +# GFX11-FAKE16: v_not_b16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0xff,0xd3,0x0a,0x7e] + +0xf0,0xd2,0xfe,0x7e +# GFX11-REAL16: v_not_b16_e32 v127.l, 0x3800 ; encoding: [0xff,0xd2,0xfe,0x7e,0x00,0x38,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e32 v127, 0x3800 ; encoding: [0xff,0xd2,0xfe,0x7e,0x00,0x38,0x00,0x00] + +0xfd,0xd2,0x0a,0x7f +# GFX11-REAL16: v_not_b16_e32 v5.h, src_scc ; encoding: [0xfd,0xd2,0x0a,0x7f] + 
+0xff,0xd2,0xfe,0x7f,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_not_b16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xd2,0xfe,0x7f,0x0b,0xfe,0x00,0x00] 0x01,0x6f,0x0a,0x7e # GFX11: v_not_b32_e32 v5, v1 ; encoding: [0x01,0x6f,0x0a,0x7e] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt index b9a499549d12c..b801e393c635d 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt @@ -2119,46 +2119,72 @@ # GFX11: v_movrelsd_b32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x88,0xfe,0x7f,0xff,0x6f,0x0d,0x30] 0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX11: v_not_b16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_not_b16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX11: v_not_b16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_not_b16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX11: v_not_b16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX11: v_not_b16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX11: v_not_b16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX11: v_not_b16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX11: v_not_b16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX11: v_not_b16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_shr:15 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX11: v_not_b16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX11: v_not_b16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX11: v_not_b16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX11: v_not_b16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX11: v_not_b16_dpp v5, v1 row_xmask:0 
row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x0d,0x30 -# GFX11: v_not_b16_dpp v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x0d,0x30] +# GFX11-REAL16: v_not_b16_dpp v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_not_b16_dpp v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x0d,0x30] + +0xfa,0xd2,0xfe,0x7e,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_not_b16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_not_b16_dpp v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x5f,0x01,0x01] + +0xfa,0xd2,0x0a,0x7f,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_not_b16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd2,0x0a,0x7f,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_mul_i32_i24_e32 v128, 1, v176 ; encoding: [0x81,0x60,0x01,0x13] + +0xfa,0xd2,0xfe,0x7f,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_not_b16_dpp v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7f,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_lshlrev_b32_e32 v6, v255, v183 ; encoding: [0xff,0x6f,0x0d,0x30] 0xfa,0x6e,0x0a,0x7e,0x01,0x1b,0x00,0xff # GFX11: v_not_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt 
b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt index 80c739a98f65f..faf3c6f628b95 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt @@ -398,10 +398,23 @@ # GFX11: v_movrelsd_b32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x88,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX11: v_not_b16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_not_b16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xea,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00 -# GFX11: v_not_b16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_not_b16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_not_b16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +0xe9,0xd2,0xfe,0x7e,0x7f,0x77,0x39,0x05 +# GFX11-REAL16: v_not_b16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0xfe,0x7e,0x7f,0x77,0x39,0x05] +# GFX11-FAKE16: v_not_b16_dpp v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0xfe,0x7e,0x7f,0x77,0x39,0x05] + +0xe9,0xd2,0x0a,0x7f,0x81,0x77,0x39,0x05 +# GFX11-REAL16: v_not_b16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7f,0x81,0x77,0x39,0x05] +# GFX11-FAKE16: v_dot2acc_f32_f16 v156, v129, v187 ; encoding: [0x81,0x77,0x39,0x05] + +0xea,0xd2,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_not_b16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd2,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0x6e,0x0a,0x7e,0x01,0x77,0x39,0x05 # GFX11: v_not_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0xe9,0x6e,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt index fd84ed734fb31..f689c43b75365 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt @@ -2257,46 +2257,72 @@ # GFX11: v_movrelsd_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xc4,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_not_b16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_not_b16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX11: v_not_b16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX11: v_not_b16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX11: v_not_b16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_not_b16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX11: v_not_b16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# 
GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_not_b16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX11: v_not_b16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_not_b16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX11: v_not_b16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; 
encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01 -# GFX11: v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] 0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30 -# GFX11: v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_not_b16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] + +0x05,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_not_b16_e64_dpp v5.h, v1.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] + +0x05,0x08,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.h op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] + +0xff,0x40,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_not_b16_e64_dpp v255.h, v255.l op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x40,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xb7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff # GFX11: v_not_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt index 0edbff63d60ed..48824399a0887 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt @@ -599,10 +599,24 @@ # GFX11: v_movrelsd_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0xff,0x00,0xc4,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00 -# GFX11: v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_not_b16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] + +0x05,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_not_b16_e64_dpp v5.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +0x05,0x08,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.h op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +0xff,0x40,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_not_b16_e64_dpp v255.h, v255.l op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x40,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: 
v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] 0x05,0x00,0xb7,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX11: v_not_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb7,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt index 0406d78078305..04c9094465b3b 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt @@ -2658,49 +2658,76 @@ # GFX11: v_nop ; encoding: [0x00,0x00,0x80,0xd5,0x00,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00 -# GFX11: v_not_b16_e64 v5, v1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00] +# GFX11-REAL16: v_not_b16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e64 v5, v1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00 -# GFX11: v_not_b16_e64 v5, v255 ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00] +# GFX11-REAL16: v_not_b16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e64 v5, v255 ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00 -# GFX11: v_not_b16_e64 v5, s1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00] +# GFX11-REAL16: v_not_b16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e64 v5, s1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00 -# GFX11: v_not_b16_e64 v5, s105 ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00] +# GFX11-REAL16: v_not_b16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e64 v5, s105 ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00] 
0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00 -# GFX11: v_not_b16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00] +# GFX11-REAL16: v_not_b16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00 -# GFX11: v_not_b16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00] +# GFX11-REAL16: v_not_b16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00 -# GFX11: v_not_b16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00] +# GFX11-REAL16: v_not_b16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00 -# GFX11: v_not_b16_e64 v5, m0 ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00] +# GFX11-REAL16: v_not_b16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e64 v5, m0 ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00 -# GFX11: v_not_b16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00] +# GFX11-REAL16: v_not_b16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00 -# GFX11: v_not_b16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_not_b16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00 -# GFX11: v_not_b16_e64 v5, 
null ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00] +# GFX11-REAL16: v_not_b16_e64 v5.l, null ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e64 v5, null ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00 -# GFX11: v_not_b16_e64 v5, -1 ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00] +# GFX11-REAL16: v_not_b16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e64 v5, -1 ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00 -# GFX11: v_not_b16_e64 v5, 0x3800 ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00] +# GFX11-REAL16: v_not_b16_e64 v5.l, 0x3800 ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e64 v5, 0x3800 ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00] 0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00 -# GFX11: v_not_b16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00] +# GFX11-REAL16: v_not_b16_e64 v5.l, src_scc ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00] 0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00 -# GFX11: v_not_b16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_not_b16_e64 v255.l, 0xfe0b ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] + +0x05,0x48,0xe9,0xd5,0x01,0x01,0x00,0x00 +# GFX11-REAL16: v_not_b16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe9,0xd5,0x01,0x01,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e64 v5, v1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x08,0xe9,0xd5,0xff,0x01,0x00,0x00 +# GFX11-REAL16: v_not_b16_e64 
v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xe9,0xd5,0xff,0x01,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e64 v5, v255 ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00] + +0xff,0x40,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_not_b16_e64 v255.h, 0xfe0b op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x05,0x00,0xb7,0xd5,0x01,0x01,0x00,0x00 # GFX11: v_not_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xb7,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt index 22ae18815a522..b93a6252beaeb 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt @@ -2239,46 +2239,68 @@ # GFX12: v_movrelsd_b32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x88,0xfe,0x7f,0xff,0x6f,0x0d,0x30] 0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX12: v_not_b16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_not_b16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX12: v_not_b16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_not_b16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff] 
0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX12: v_not_b16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX12: v_not_b16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX12: v_not_b16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX12: v_not_b16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX12: v_not_b16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX12: v_not_b16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX12: v_not_b16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX12: v_not_b16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX12: v_not_b16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX12: v_not_b16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; 
encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX12: v_not_b16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x0d,0x30 -# GFX12: v_not_b16_dpp v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x0d,0x30] +# GFX12-REAL16: v_not_b16_dpp v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_not_b16_dpp v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x0d,0x30] + +0xfa,0xd2,0x0a,0x7f,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_not_b16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd2,0x0a,0x7f,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_mul_i32_i24_e32 v128, 1, v176 ; encoding: [0x81,0x60,0x01,0x13] + +0xfa,0xd2,0xfe,0x7f,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_not_b16_dpp v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7f,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_lshlrev_b32_e32 v6, v255, v183 ; encoding: [0xff,0x6f,0x0d,0x30] 0xfa,0x6e,0x0a,0x7e,0x01,0x1b,0x00,0xff # GFX12: v_not_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git 
a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt index bfb84c6cdff39..092ba9b88f951 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt @@ -411,10 +411,20 @@ # GFX12: v_movrelsd_b32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x88,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX12: v_not_b16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_not_b16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xea,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00 -# GFX12: v_not_b16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_not_b16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_not_b16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +0xe9,0xd2,0x0a,0x7f,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_not_b16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7f,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_add_f64_e32 v[156:157], v[129:130], v[187:188] ; encoding: [0x81,0x77,0x39,0x05] + +0xea,0xd2,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_not_b16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd2,0xfe,0x7f,0xff,0x00,0x00,0x00] + 0xe9,0x6e,0x0a,0x7e,0x01,0x77,0x39,0x05 # GFX12: v_not_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6e,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt index e27469230a15f..7fdb9e0ac6977 100644 --- 
a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt @@ -2707,49 +2707,76 @@ # GFX12: v_movrelsd_b32_e64 v255, v255 ; encoding: [0xff,0x00,0xc4,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00 -# GFX12: v_not_b16_e64 v5, v1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00] +# GFX12-REAL16: v_not_b16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_not_b16_e64 v5, v1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00 -# GFX12: v_not_b16_e64 v5, v255 ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00] +# GFX12-REAL16: v_not_b16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_not_b16_e64 v5, v255 ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00 -# GFX12: v_not_b16_e64 v5, s1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00] +# GFX12-REAL16: v_not_b16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00] +# GFX12-FAKE16: v_not_b16_e64 v5, s1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00 -# GFX12: v_not_b16_e64 v5, s105 ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00] +# GFX12-REAL16: v_not_b16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00] +# GFX12-FAKE16: v_not_b16_e64 v5, s105 ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00 -# GFX12: v_not_b16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-REAL16: v_not_b16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-FAKE16: v_not_b16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00 -# GFX12: v_not_b16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-REAL16: v_not_b16_e64 v5.l, 
vcc_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-FAKE16: v_not_b16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00 -# GFX12: v_not_b16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-REAL16: v_not_b16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-FAKE16: v_not_b16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00 -# GFX12: v_not_b16_e64 v5, m0 ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-REAL16: v_not_b16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-FAKE16: v_not_b16_e64 v5, m0 ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00 -# GFX12: v_not_b16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-REAL16: v_not_b16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-FAKE16: v_not_b16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00 -# GFX12: v_not_b16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_not_b16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_not_b16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00 -# GFX12: v_not_b16_e64 v5, null ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-REAL16: v_not_b16_e64 v5.l, null ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-FAKE16: v_not_b16_e64 v5, null ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00 -# GFX12: v_not_b16_e64 v5, -1 ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-REAL16: v_not_b16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-FAKE16: 
v_not_b16_e64 v5, -1 ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00 -# GFX12: v_not_b16_e64 v5, 0x3800 ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00] +# GFX12-REAL16: v_not_b16_e64 v5.l, 0x3800 ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00] +# GFX12-FAKE16: v_not_b16_e64 v5, 0x3800 ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00] 0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00 -# GFX12: v_not_b16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00] +# GFX12-REAL16: v_not_b16_e64 v5.l, src_scc ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00] +# GFX12-FAKE16: v_not_b16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00] 0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00 -# GFX12: v_not_b16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_not_b16_e64 v255.l, 0xfe0b ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_not_b16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] + +0x05,0x48,0xe9,0xd5,0x01,0x01,0x00,0x00 +# GFX12-REAL16: v_not_b16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe9,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_not_b16_e64 v5, v1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x08,0xe9,0xd5,0xff,0x01,0x00,0x00 +# GFX12-REAL16: v_not_b16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xe9,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_not_b16_e64 v5, v255 ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00] + +0xff,0x40,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_not_b16_e64 v255.h, 0xfe0b op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_not_b16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] 
0x05,0x00,0xb7,0xd5,0x01,0x01,0x00,0x00 # GFX12: v_not_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xb7,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt index bc957576b19b6..ad491dc02d384 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt @@ -2125,46 +2125,72 @@ # GFX12: v_movrels_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xc3,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_not_b16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_not_b16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX12: v_not_b16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: 
v_not_b16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX12: v_not_b16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX12: v_not_b16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_not_b16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX12: v_not_b16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; 
encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_not_b16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX12: v_not_b16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_not_b16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# 
GFX12: v_not_b16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01 -# GFX12: v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] 0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30 -# GFX12: v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_not_b16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 
bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] + +0x05,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_not_b16_e64_dpp v5.h, v1.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] + +0x05,0x08,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.h op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] + +0xff,0x40,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_not_b16_e64_dpp v255.h, v255.l op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x40,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xb7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff # GFX12: v_not_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt index 989824315b2d2..21b4d0572bf37 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt @@ -587,10 +587,24 @@ # GFX12: v_movrels_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0xff,0x00,0xc3,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00 -# GFX12: v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_not_b16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] + +0x05,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_not_b16_e64_dpp v5.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +0x05,0x08,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.h op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +0xff,0x40,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_not_b16_e64_dpp v255.h, v255.l op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x40,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: 
v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] 0x05,0x00,0xb7,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX12: v_not_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb7,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] From a4e47586b9c0566761b7fb704011da6ded823398 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 3 Jan 2025 10:23:13 -0800 Subject: [PATCH 383/567] [ExpandMemCmp] Recognize canonical form of (icmp sle/sge X, 0) in getMemCmpOneBlock. (#121540) This code recognizes special cases where the result of memcmp is compared with 0. If the compare is sle/sge, then InstCombine canonicalizes to (icmp slt X, 1) or (icmp sgt X, -1). We should recognize those patterns too. --- llvm/lib/CodeGen/ExpandMemCmp.cpp | 8 +++++++ llvm/test/CodeGen/AArch64/memcmp.ll | 10 ++------ llvm/test/CodeGen/RISCV/memcmp.ll | 36 +++++++---------------------- llvm/test/CodeGen/X86/memcmp.ll | 12 ++-------- 4 files changed, 20 insertions(+), 46 deletions(-) diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/CodeGen/ExpandMemCmp.cpp index cc75a01c6477a..74f93e1979532 100644 --- a/llvm/lib/CodeGen/ExpandMemCmp.cpp +++ b/llvm/lib/CodeGen/ExpandMemCmp.cpp @@ -680,6 +680,14 @@ Value *MemCmpExpansion::getMemCmpOneBlock() { m_SpecificInt(CI->getType()->getIntegerBitWidth() - 1)))) { Pred = ICmpInst::ICMP_SLT; NeedsZExt = true; + } else if (match(UI, m_SpecificICmp(ICmpInst::ICMP_SGT, m_Specific(CI), + m_AllOnes()))) { + // Adjust predicate as if it compared with 0. + Pred = ICmpInst::ICMP_SGE; + } else if (match(UI, m_SpecificICmp(ICmpInst::ICMP_SLT, m_Specific(CI), + m_One()))) { + // Adjust predicate as if it compared with 0. 
+ Pred = ICmpInst::ICMP_SLE; } else { // In case of a successful match this call will set `Pred` variable match(UI, m_ICmp(Pred, m_Specific(CI), m_Zero())); diff --git a/llvm/test/CodeGen/AArch64/memcmp.ll b/llvm/test/CodeGen/AArch64/memcmp.ll index 864f38468842a..98ea86b06d6c5 100644 --- a/llvm/test/CodeGen/AArch64/memcmp.ll +++ b/llvm/test/CodeGen/AArch64/memcmp.ll @@ -265,10 +265,7 @@ define i1 @length4_le(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev w8, w8 ; CHECK-NEXT: rev w9, w9 ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w8, hi -; CHECK-NEXT: csinv w8, w8, wzr, hs -; CHECK-NEXT: cmp w8, #1 -; CHECK-NEXT: cset w0, lt +; CHECK-NEXT: cset w0, ls ; CHECK-NEXT: ret %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind %c = icmp slt i32 %m, 1 @@ -283,10 +280,7 @@ define i1 @length4_ge(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev w8, w8 ; CHECK-NEXT: rev w9, w9 ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w8, hi -; CHECK-NEXT: csinv w8, w8, wzr, hs -; CHECK-NEXT: mvn w8, w8 -; CHECK-NEXT: lsr w0, w8, #31 +; CHECK-NEXT: cset w0, hs ; CHECK-NEXT: ret %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind %c = icmp sgt i32 %m, -1 diff --git a/llvm/test/CodeGen/RISCV/memcmp.ll b/llvm/test/CodeGen/RISCV/memcmp.ll index 5adda28acb427..f0290298e362a 100644 --- a/llvm/test/CodeGen/RISCV/memcmp.ll +++ b/llvm/test/CodeGen/RISCV/memcmp.ll @@ -6664,10 +6664,8 @@ define i1 @memcmp_le_zero(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a1, 0(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a2, a0, a1 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a1, a0 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sub a0, a0, a2 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: slti a0, a0, 1 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: xori a0, a0, 1 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_le_zero: @@ -6678,10 +6676,8 @@ define i1 @memcmp_le_zero(ptr %s1, ptr %s2) nounwind { ; 
CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a1, a1, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a0, a1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a1, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a0, a2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: slti a0, a0, 1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: xori a0, a0, 1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_le_zero: @@ -6690,10 +6686,8 @@ define i1 @memcmp_le_zero(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a1, 0(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a2, a0, a1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a1, a0 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sub a0, a0, a2 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: slti a0, a0, 1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: xori a0, a0, 1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_le_zero: @@ -6704,10 +6698,8 @@ define i1 @memcmp_le_zero(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a1, a1, 32 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a0, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a1, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a0, a2 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: slti a0, a0, 1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: xori a0, a0, 1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_le_zero: @@ -6864,10 +6856,7 @@ define i1 @memcmp_ge_zero(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a1, 0(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a2, a0, a1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a1, a0 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sub a0, a0, a2 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: 
slti a0, a0, 0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a0, a1 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: xori a0, a0, 1 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret ; @@ -6879,10 +6868,7 @@ define i1 @memcmp_ge_zero(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a1, a1, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a0, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a1, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a0, a2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: slti a0, a0, 0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a0, a1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: xori a0, a0, 1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret ; @@ -6892,10 +6878,7 @@ define i1 @memcmp_ge_zero(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a1, 0(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a2, a0, a1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a1, a0 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sub a0, a0, a2 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: slti a0, a0, 0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a0, a1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: xori a0, a0, 1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret ; @@ -6907,10 +6890,7 @@ define i1 @memcmp_ge_zero(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a1, a1, 32 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a0, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a1, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a0, a2 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: slti a0, a0, 0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a0, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: xori a0, a0, 1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret ; diff --git a/llvm/test/CodeGen/X86/memcmp.ll b/llvm/test/CodeGen/X86/memcmp.ll index e744d2a06e55f..bb089e5ddda87 100644 --- a/llvm/test/CodeGen/X86/memcmp.ll +++ 
b/llvm/test/CodeGen/X86/memcmp.ll @@ -268,11 +268,7 @@ define i1 @length4_le(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapl %eax ; X64-NEXT: bswapl %ecx ; X64-NEXT: cmpl %ecx, %eax -; X64-NEXT: seta %al -; X64-NEXT: sbbb $0, %al -; X64-NEXT: movsbl %al, %eax -; X64-NEXT: testl %eax, %eax -; X64-NEXT: setle %al +; X64-NEXT: setbe %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind %c = icmp slt i32 %m, 1 @@ -287,11 +283,7 @@ define i1 @length4_ge(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapl %eax ; X64-NEXT: bswapl %ecx ; X64-NEXT: cmpl %ecx, %eax -; X64-NEXT: seta %al -; X64-NEXT: sbbb $0, %al -; X64-NEXT: movsbl %al, %eax -; X64-NEXT: testl %eax, %eax -; X64-NEXT: setns %al +; X64-NEXT: setae %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind %c = icmp sgt i32 %m, -1 From 39a9073f9eb71ac610cbafe7eed05ca668871b5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Fri, 3 Jan 2025 10:35:11 -0800 Subject: [PATCH 384/567] [flang][cuda] Downgrade allocate pinned error to a warning (#121589) To be in accordance with the reference compiler. 
--- flang/lib/Semantics/check-allocate.cpp | 6 ++++-- flang/test/Semantics/cuf07.cuf | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/flang/lib/Semantics/check-allocate.cpp b/flang/lib/Semantics/check-allocate.cpp index 1e5412324916d..223bee6eb6f11 100644 --- a/flang/lib/Semantics/check-allocate.cpp +++ b/flang/lib/Semantics/check-allocate.cpp @@ -616,9 +616,11 @@ bool AllocationCheckerHelper::RunChecks(SemanticsContext &context) { } if (allocateInfo_.gotPinned) { std::optional cudaAttr{GetCUDADataAttr(ultimate_)}; - if (!cudaAttr || *cudaAttr != common::CUDADataAttr::Pinned) { + if ((!cudaAttr || *cudaAttr != common::CUDADataAttr::Pinned) && + context.languageFeatures().ShouldWarn( + common::UsageWarning::CUDAUsage)) { context.Say(name_.source, - "Object in ALLOCATE must have PINNED attribute when PINNED option is specified"_err_en_US); + "Object in ALLOCATE should have PINNED attribute when PINNED option is specified"_warn_en_US); } } if (allocateInfo_.gotStream) { diff --git a/flang/test/Semantics/cuf07.cuf b/flang/test/Semantics/cuf07.cuf index c48abb5adf0d4..56b2164532ae2 100644 --- a/flang/test/Semantics/cuf07.cuf +++ b/flang/test/Semantics/cuf07.cuf @@ -28,7 +28,7 @@ module m integer, allocatable, device :: ia(:) logical :: plog - !ERROR: Object in ALLOCATE must have PINNED attribute when PINNED option is specified + !WARNING: Object in ALLOCATE should have PINNED attribute when PINNED option is specified allocate(ia(100), pinned = plog) end subroutine From 0844f83fea66332943deed7cdf97b686b2c7c37b Mon Sep 17 00:00:00 2001 From: Arseniy Zaostrovnykh Date: Fri, 3 Jan 2025 19:36:24 +0100 Subject: [PATCH 385/567] [clang][analyzer] Stable order for SymbolRef-keyed containers (#121551) Generalize the `SymbolID`s used for `SymbolData` to all `SymExpr`s and use these IDs for comparison `SymbolRef` keys in various containers, such as `ConstraintMap`. 
These IDs are superior to raw pointer values because they are more controllable and are not randomized across executions (unlike [pointers](https://en.wikipedia.org/wiki/Address_space_layout_randomization)). These IDs order is stable across runs because SymExprs are allocated in the same order. Stability of the constraint order is important for the stability of the analyzer results. I evaluated this change on a set of 200+ open-source C and C++ projects with the total number of ~78 000 symbolic-execution issues passing Z3 refutation. This patch reduced the run-to-run churn (flakiness) in SE issues from 80-90 to 30-40 (out of 78K) in our CSA deployment (in our setting flaky issues are mostly due to Z3 refutation instability). Note, most of the issue churn (flakiness) is caused by the mentioned Z3 refutation. With Z3 refutation disabled, issue churn goes down to ~10 issues out of 83K and this patch has no effect on appearing/disappearing issues between runs. It however, seems to reduce the volatility of the execution flow: before we had 40-80 issues with changed execution flow, after - 10-30. Importantly, this change is necessary for the next step in stabilizing analysis results by caching Z3 query outcomes between analysis runs (work in progress). Across our admittedly noisy CI runs, I detected no significant effect on memory footprint or analysis time. 
CPP-5919 --- .../Core/PathSensitive/SymExpr.h | 31 ++++-- .../Core/PathSensitive/SymbolManager.h | 100 ++++++++++++++---- .../lib/StaticAnalyzer/Core/SymbolManager.cpp | 25 ++--- clang/test/Analysis/dump_egraph.cpp | 2 +- .../expr-inspection-printState-diseq-info.c | 12 +-- .../expr-inspection-printState-eq-classes.c | 4 +- clang/test/Analysis/ptr-arith.cpp | 4 +- ...symbol-simplification-disequality-info.cpp | 20 ++-- ...-simplification-fixpoint-one-iteration.cpp | 12 +-- ...simplification-fixpoint-two-iterations.cpp | 18 ++-- clang/test/Analysis/unary-sym-expr.c | 6 +- 11 files changed, 149 insertions(+), 85 deletions(-) diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymExpr.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymExpr.h index 862a30c0e7363..aca14cf813c4b 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymExpr.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymExpr.h @@ -25,6 +25,8 @@ namespace ento { class MemRegion; +using SymbolID = unsigned; + /// Symbolic value. These values used to capture symbolic execution of /// the program. class SymExpr : public llvm::FoldingSetNode { @@ -39,9 +41,19 @@ class SymExpr : public llvm::FoldingSetNode { private: Kind K; + /// A unique identifier for this symbol. + /// + /// It is useful for SymbolData to easily differentiate multiple symbols, but + /// also for "ephemeral" symbols, such as binary operations, because this id + /// can be used for arranging constraints or equivalence classes instead of + /// unstable pointer values. + /// + /// Note, however, that it can't be used in Profile because SymbolManager + /// needs to compute Profile before allocating SymExpr. 
+ const SymbolID Sym; protected: - SymExpr(Kind k) : K(k) {} + SymExpr(Kind k, SymbolID Sym) : K(k), Sym(Sym) {} static bool isValidTypeForSymbol(QualType T) { // FIXME: Depending on whether we choose to deprecate structural symbols, @@ -56,6 +68,14 @@ class SymExpr : public llvm::FoldingSetNode { Kind getKind() const { return K; } + /// Get a unique identifier for this symbol. + /// The ID is unique across all SymExprs in a SymbolManager. + /// They reflect the allocation order of these SymExprs, + /// and are likely stable across runs. + /// Used as a key in SymbolRef containers and as part of identity + /// for SymbolData, e.g. SymbolConjured with ID = 7 is "conj_$7". + SymbolID getSymbolID() const { return Sym; } + virtual void dump() const; virtual void dumpToStream(raw_ostream &os) const {} @@ -112,19 +132,14 @@ inline raw_ostream &operator<<(raw_ostream &os, using SymbolRef = const SymExpr *; using SymbolRefSmallVectorTy = SmallVector; -using SymbolID = unsigned; /// A symbol representing data which can be stored in a memory location /// (region). class SymbolData : public SymExpr { - const SymbolID Sym; - void anchor() override; protected: - SymbolData(Kind k, SymbolID sym) : SymExpr(k), Sym(sym) { - assert(classof(this)); - } + SymbolData(Kind k, SymbolID sym) : SymExpr(k, sym) { assert(classof(this)); } public: ~SymbolData() override = default; @@ -132,8 +147,6 @@ class SymbolData : public SymExpr { /// Get a string representation of the kind of the region. 
virtual StringRef getKindStr() const = 0; - SymbolID getSymbolID() const { return Sym; } - unsigned computeComplexity() const override { return 1; }; diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h index 73732d532f630..b57f415ec139f 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h @@ -25,6 +25,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/FoldingSet.h" +#include "llvm/ADT/ImmutableSet.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Support/Allocator.h" #include @@ -43,15 +44,16 @@ class StoreManager; class SymbolRegionValue : public SymbolData { const TypedValueRegion *R; -public: + friend class SymExprAllocator; SymbolRegionValue(SymbolID sym, const TypedValueRegion *r) : SymbolData(SymbolRegionValueKind, sym), R(r) { assert(r); assert(isValidTypeForSymbol(r->getValueType())); } +public: LLVM_ATTRIBUTE_RETURNS_NONNULL - const TypedValueRegion* getRegion() const { return R; } + const TypedValueRegion *getRegion() const { return R; } static void Profile(llvm::FoldingSetNodeID& profile, const TypedValueRegion* R) { profile.AddInteger((unsigned) SymbolRegionValueKind); @@ -84,7 +86,7 @@ class SymbolConjured : public SymbolData { const LocationContext *LCtx; const void *SymbolTag; -public: + friend class SymExprAllocator; SymbolConjured(SymbolID sym, const Stmt *s, const LocationContext *lctx, QualType t, unsigned count, const void *symbolTag) : SymbolData(SymbolConjuredKind, sym), S(s), T(t), Count(count), @@ -98,6 +100,7 @@ class SymbolConjured : public SymbolData { assert(isValidTypeForSymbol(t)); } +public: /// It might return null. 
const Stmt *getStmt() const { return S; } unsigned getCount() const { return Count; } @@ -137,7 +140,7 @@ class SymbolDerived : public SymbolData { SymbolRef parentSymbol; const TypedValueRegion *R; -public: + friend class SymExprAllocator; SymbolDerived(SymbolID sym, SymbolRef parent, const TypedValueRegion *r) : SymbolData(SymbolDerivedKind, sym), parentSymbol(parent), R(r) { assert(parent); @@ -145,6 +148,7 @@ class SymbolDerived : public SymbolData { assert(isValidTypeForSymbol(r->getValueType())); } +public: LLVM_ATTRIBUTE_RETURNS_NONNULL SymbolRef getParentSymbol() const { return parentSymbol; } LLVM_ATTRIBUTE_RETURNS_NONNULL @@ -180,12 +184,13 @@ class SymbolDerived : public SymbolData { class SymbolExtent : public SymbolData { const SubRegion *R; -public: + friend class SymExprAllocator; SymbolExtent(SymbolID sym, const SubRegion *r) : SymbolData(SymbolExtentKind, sym), R(r) { assert(r); } +public: LLVM_ATTRIBUTE_RETURNS_NONNULL const SubRegion *getRegion() const { return R; } @@ -222,7 +227,7 @@ class SymbolMetadata : public SymbolData { unsigned Count; const void *Tag; -public: + friend class SymExprAllocator; SymbolMetadata(SymbolID sym, const MemRegion* r, const Stmt *s, QualType t, const LocationContext *LCtx, unsigned count, const void *tag) : SymbolData(SymbolMetadataKind, sym), R(r), S(s), T(t), LCtx(LCtx), @@ -234,6 +239,7 @@ class SymbolMetadata : public SymbolData { assert(tag); } + public: LLVM_ATTRIBUTE_RETURNS_NONNULL const MemRegion *getRegion() const { return R; } @@ -286,15 +292,16 @@ class SymbolCast : public SymExpr { /// The type of the result. 
QualType ToTy; -public: - SymbolCast(const SymExpr *In, QualType From, QualType To) - : SymExpr(SymbolCastKind), Operand(In), FromTy(From), ToTy(To) { + friend class SymExprAllocator; + SymbolCast(SymbolID Sym, const SymExpr *In, QualType From, QualType To) + : SymExpr(SymbolCastKind, Sym), Operand(In), FromTy(From), ToTy(To) { assert(In); assert(isValidTypeForSymbol(From)); // FIXME: GenericTaintChecker creates symbols of void type. // Otherwise, 'To' should also be a valid type. } +public: unsigned computeComplexity() const override { if (Complexity == 0) Complexity = 1 + Operand->computeComplexity(); @@ -332,9 +339,10 @@ class UnarySymExpr : public SymExpr { UnaryOperator::Opcode Op; QualType T; -public: - UnarySymExpr(const SymExpr *In, UnaryOperator::Opcode Op, QualType T) - : SymExpr(UnarySymExprKind), Operand(In), Op(Op), T(T) { + friend class SymExprAllocator; + UnarySymExpr(SymbolID Sym, const SymExpr *In, UnaryOperator::Opcode Op, + QualType T) + : SymExpr(UnarySymExprKind, Sym), Operand(In), Op(Op), T(T) { // Note, some unary operators are modeled as a binary operator. E.g. ++x is // modeled as x + 1. assert((Op == UO_Minus || Op == UO_Not) && "non-supported unary expression"); @@ -345,6 +353,7 @@ class UnarySymExpr : public SymExpr { assert(!Loc::isLocType(T) && "unary symbol should be nonloc"); } +public: unsigned computeComplexity() const override { if (Complexity == 0) Complexity = 1 + Operand->computeComplexity(); @@ -381,8 +390,8 @@ class BinarySymExpr : public SymExpr { QualType T; protected: - BinarySymExpr(Kind k, BinaryOperator::Opcode op, QualType t) - : SymExpr(k), Op(op), T(t) { + BinarySymExpr(SymbolID Sym, Kind k, BinaryOperator::Opcode op, QualType t) + : SymExpr(k, Sym), Op(op), T(t) { assert(classof(this)); // Binary expressions are results of arithmetic. 
Pointer arithmetic is not // handled by binary expressions, but it is instead handled by applying @@ -425,14 +434,15 @@ class BinarySymExprImpl : public BinarySymExpr { LHSTYPE LHS; RHSTYPE RHS; -public: - BinarySymExprImpl(LHSTYPE lhs, BinaryOperator::Opcode op, RHSTYPE rhs, - QualType t) - : BinarySymExpr(ClassKind, op, t), LHS(lhs), RHS(rhs) { + friend class SymExprAllocator; + BinarySymExprImpl(SymbolID Sym, LHSTYPE lhs, BinaryOperator::Opcode op, + RHSTYPE rhs, QualType t) + : BinarySymExpr(Sym, ClassKind, op, t), LHS(lhs), RHS(rhs) { assert(getPointer(lhs)); assert(getPointer(rhs)); } +public: void dumpToStream(raw_ostream &os) const override { dumpToStreamImpl(os, LHS); dumpToStreamImpl(os, getOpcode()); @@ -478,6 +488,21 @@ using IntSymExpr = BinarySymExprImpl; +class SymExprAllocator { + SymbolID NextSymbolID = 0; + llvm::BumpPtrAllocator &Alloc; + +public: + explicit SymExprAllocator(llvm::BumpPtrAllocator &Alloc) : Alloc(Alloc) {} + + template SymT *make(ArgsT &&...Args) { + return new (Alloc) SymT(nextID(), std::forward(Args)...); + } + +private: + SymbolID nextID() { return NextSymbolID++; } +}; + class SymbolManager { using DataSetTy = llvm::FoldingSet; using SymbolDependTy = @@ -489,15 +514,14 @@ class SymbolManager { /// alive as long as the key is live. SymbolDependTy SymbolDependencies; - unsigned SymbolCounter = 0; - llvm::BumpPtrAllocator& BPAlloc; + SymExprAllocator Alloc; BasicValueFactory &BV; ASTContext &Ctx; public: SymbolManager(ASTContext &ctx, BasicValueFactory &bv, - llvm::BumpPtrAllocator& bpalloc) - : SymbolDependencies(16), BPAlloc(bpalloc), BV(bv), Ctx(ctx) {} + llvm::BumpPtrAllocator &bpalloc) + : SymbolDependencies(16), Alloc(bpalloc), BV(bv), Ctx(ctx) {} static bool canSymbolicate(QualType T); @@ -687,4 +711,36 @@ class SymbolVisitor { } // namespace clang +// Override the default definition that would use pointer values of SymbolRefs +// to order them, which is unstable due to ASLR. 
+// Use the SymbolID instead which reflect the order in which the symbols were +// allocated. This is usually stable across runs leading to the stability of +// ConstraintMap and other containers using SymbolRef as keys. +template <> +struct ::llvm::ImutContainerInfo + : public ImutProfileInfo { + using value_type = clang::ento::SymbolRef; + using value_type_ref = clang::ento::SymbolRef; + using key_type = value_type; + using key_type_ref = value_type_ref; + using data_type = bool; + using data_type_ref = bool; + + static key_type_ref KeyOfValue(value_type_ref D) { return D; } + static data_type_ref DataOfValue(value_type_ref) { return true; } + + static bool isEqual(clang::ento::SymbolRef LHS, clang::ento::SymbolRef RHS) { + return LHS->getSymbolID() == RHS->getSymbolID(); + } + + static bool isLess(clang::ento::SymbolRef LHS, clang::ento::SymbolRef RHS) { + return LHS->getSymbolID() < RHS->getSymbolID(); + } + + // This might seem redundant, but it is required because of the way + // ImmutableSet is implemented through AVLTree: + // same as ImmutableMap, but with a non-informative "data". 
+ static bool isDataEqual(data_type_ref, data_type_ref) { return true; } +}; + #endif // LLVM_CLANG_STATICANALYZER_CORE_PATHSENSITIVE_SYMBOLMANAGER_H diff --git a/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp b/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp index f21e5c3ad7bd7..738b6a175ce6d 100644 --- a/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp +++ b/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp @@ -170,9 +170,8 @@ SymbolManager::getRegionValueSymbol(const TypedValueRegion* R) { void *InsertPos; SymExpr *SD = DataSet.FindNodeOrInsertPos(profile, InsertPos); if (!SD) { - SD = new (BPAlloc) SymbolRegionValue(SymbolCounter, R); + SD = Alloc.make(R); DataSet.InsertNode(SD, InsertPos); - ++SymbolCounter; } return cast(SD); @@ -188,9 +187,8 @@ const SymbolConjured* SymbolManager::conjureSymbol(const Stmt *E, void *InsertPos; SymExpr *SD = DataSet.FindNodeOrInsertPos(profile, InsertPos); if (!SD) { - SD = new (BPAlloc) SymbolConjured(SymbolCounter, E, LCtx, T, Count, SymbolTag); + SD = Alloc.make(E, LCtx, T, Count, SymbolTag); DataSet.InsertNode(SD, InsertPos); - ++SymbolCounter; } return cast(SD); @@ -204,9 +202,8 @@ SymbolManager::getDerivedSymbol(SymbolRef parentSymbol, void *InsertPos; SymExpr *SD = DataSet.FindNodeOrInsertPos(profile, InsertPos); if (!SD) { - SD = new (BPAlloc) SymbolDerived(SymbolCounter, parentSymbol, R); + SD = Alloc.make(parentSymbol, R); DataSet.InsertNode(SD, InsertPos); - ++SymbolCounter; } return cast(SD); @@ -219,9 +216,8 @@ SymbolManager::getExtentSymbol(const SubRegion *R) { void *InsertPos; SymExpr *SD = DataSet.FindNodeOrInsertPos(profile, InsertPos); if (!SD) { - SD = new (BPAlloc) SymbolExtent(SymbolCounter, R); + SD = Alloc.make(R); DataSet.InsertNode(SD, InsertPos); - ++SymbolCounter; } return cast(SD); @@ -236,9 +232,8 @@ SymbolManager::getMetadataSymbol(const MemRegion* R, const Stmt *S, QualType T, void *InsertPos; SymExpr *SD = DataSet.FindNodeOrInsertPos(profile, InsertPos); if (!SD) { - SD = new (BPAlloc) 
SymbolMetadata(SymbolCounter, R, S, T, LCtx, Count, SymbolTag); + SD = Alloc.make(R, S, T, LCtx, Count, SymbolTag); DataSet.InsertNode(SD, InsertPos); - ++SymbolCounter; } return cast(SD); @@ -252,7 +247,7 @@ SymbolManager::getCastSymbol(const SymExpr *Op, void *InsertPos; SymExpr *data = DataSet.FindNodeOrInsertPos(ID, InsertPos); if (!data) { - data = new (BPAlloc) SymbolCast(Op, From, To); + data = Alloc.make(Op, From, To); DataSet.InsertNode(data, InsertPos); } @@ -268,7 +263,7 @@ const SymIntExpr *SymbolManager::getSymIntExpr(const SymExpr *lhs, SymExpr *data = DataSet.FindNodeOrInsertPos(ID, InsertPos); if (!data) { - data = new (BPAlloc) SymIntExpr(lhs, op, v, t); + data = Alloc.make(lhs, op, v, t); DataSet.InsertNode(data, InsertPos); } @@ -284,7 +279,7 @@ const IntSymExpr *SymbolManager::getIntSymExpr(APSIntPtr lhs, SymExpr *data = DataSet.FindNodeOrInsertPos(ID, InsertPos); if (!data) { - data = new (BPAlloc) IntSymExpr(lhs, op, rhs, t); + data = Alloc.make(lhs, op, rhs, t); DataSet.InsertNode(data, InsertPos); } @@ -301,7 +296,7 @@ const SymSymExpr *SymbolManager::getSymSymExpr(const SymExpr *lhs, SymExpr *data = DataSet.FindNodeOrInsertPos(ID, InsertPos); if (!data) { - data = new (BPAlloc) SymSymExpr(lhs, op, rhs, t); + data = Alloc.make(lhs, op, rhs, t); DataSet.InsertNode(data, InsertPos); } @@ -316,7 +311,7 @@ const UnarySymExpr *SymbolManager::getUnarySymExpr(const SymExpr *Operand, void *InsertPos; SymExpr *data = DataSet.FindNodeOrInsertPos(ID, InsertPos); if (!data) { - data = new (BPAlloc) UnarySymExpr(Operand, Opc, T); + data = Alloc.make(Operand, Opc, T); DataSet.InsertNode(data, InsertPos); } diff --git a/clang/test/Analysis/dump_egraph.cpp b/clang/test/Analysis/dump_egraph.cpp index d1229b2634674..13459699a06f6 100644 --- a/clang/test/Analysis/dump_egraph.cpp +++ b/clang/test/Analysis/dump_egraph.cpp @@ -21,7 +21,7 @@ void foo() { // CHECK: \"location_context\": \"#0 Call\", \"calling\": \"T::T\", \"location\": \{ \"line\": 15, \"column\": 
5, \"file\": \"{{.*}}dump_egraph.cpp\" \}, \"items\": [\l        \{ \"init_id\": {{[0-9]+}}, \"kind\": \"construct into member variable\", \"argument_index\": null, \"pretty\": \"s\", \"value\": \"&t.s\" -// CHECK: \"cluster\": \"t\", \"pointer\": \"{{0x[0-9a-f]+}}\", \"items\": [\l        \{ \"kind\": \"Default\", \"offset\": 0, \"value\": \"conj_$2\{int, LC5, no stmt, #1\}\" +// CHECK: \"cluster\": \"t\", \"pointer\": \"{{0x[0-9a-f]+}}\", \"items\": [\l        \{ \"kind\": \"Default\", \"offset\": 0, \"value\": \"conj_$3\{int, LC5, no stmt, #1\}\" // CHECK: \"dynamic_types\": [\l      \{ \"region\": \"HeapSymRegion\{conj_$1\{S *, LC1, S{{[0-9]+}}, #1\}\}\", \"dyn_type\": \"S\", \"sub_classable\": false \}\l diff --git a/clang/test/Analysis/expr-inspection-printState-diseq-info.c b/clang/test/Analysis/expr-inspection-printState-diseq-info.c index c5c31785a600e..515fcbbd43079 100644 --- a/clang/test/Analysis/expr-inspection-printState-diseq-info.c +++ b/clang/test/Analysis/expr-inspection-printState-diseq-info.c @@ -18,17 +18,17 @@ void test_disequality_info(int e0, int b0, int b1, int c0) { // CHECK-NEXT: { // CHECK-NEXT: "class": [ "(reg_$0) - 2" ], // CHECK-NEXT: "disequal_to": [ - // CHECK-NEXT: [ "reg_$2" ]] + // CHECK-NEXT: [ "reg_$7" ]] // CHECK-NEXT: }, // CHECK-NEXT: { - // CHECK-NEXT: "class": [ "reg_$2" ], + // CHECK-NEXT: "class": [ "reg_$15" ], // CHECK-NEXT: "disequal_to": [ - // CHECK-NEXT: [ "(reg_$0) - 2" ], - // CHECK-NEXT: [ "reg_$3" ]] + // CHECK-NEXT: [ "reg_$7" ]] // CHECK-NEXT: }, // CHECK-NEXT: { - // CHECK-NEXT: "class": [ "reg_$3" ], + // CHECK-NEXT: "class": [ "reg_$7" ], // CHECK-NEXT: "disequal_to": [ - // CHECK-NEXT: [ "reg_$2" ]] + // CHECK-NEXT: [ "(reg_$0) - 2" ], + // CHECK-NEXT: [ "reg_$15" ]] // CHECK-NEXT: } // CHECK-NEXT: ], diff --git a/clang/test/Analysis/expr-inspection-printState-eq-classes.c b/clang/test/Analysis/expr-inspection-printState-eq-classes.c index 38e23d6e83826..19cc13735ab5a 100644 --- 
a/clang/test/Analysis/expr-inspection-printState-eq-classes.c +++ b/clang/test/Analysis/expr-inspection-printState-eq-classes.c @@ -16,6 +16,6 @@ void test_equivalence_classes(int a, int b, int c, int d) { } // CHECK: "equivalence_classes": [ -// CHECK-NEXT: [ "(reg_$0) != (reg_$2)" ], -// CHECK-NEXT: [ "reg_$0", "reg_$2", "reg_$3" ] +// CHECK-NEXT: [ "(reg_$0) != (reg_$5)" ], +// CHECK-NEXT: [ "reg_$0", "reg_$20", "reg_$5" ] // CHECK-NEXT: ], diff --git a/clang/test/Analysis/ptr-arith.cpp b/clang/test/Analysis/ptr-arith.cpp index a1264a1f04839..ec1c75c0c4063 100644 --- a/clang/test/Analysis/ptr-arith.cpp +++ b/clang/test/Analysis/ptr-arith.cpp @@ -139,10 +139,10 @@ struct parse_t { int parse(parse_t *p) { unsigned copy = p->bits2; clang_analyzer_dump(copy); - // expected-warning@-1 {{reg_$1},0 S64b,struct Bug_55934::parse_t}.bits2>}} + // expected-warning@-1 {{reg_$2},0 S64b,struct Bug_55934::parse_t}.bits2>}} header *bits = (header *)© clang_analyzer_dump(bits->b); - // expected-warning@-1 {{derived_$2{reg_$1},0 S64b,struct Bug_55934::parse_t}.bits2>,Element{copy,0 S64b,struct Bug_55934::header}.b}}} + // expected-warning@-1 {{derived_$4{reg_$2},0 S64b,struct Bug_55934::parse_t}.bits2>,Element{copy,0 S64b,struct Bug_55934::header}.b}}} return bits->b; // no-warning } } // namespace Bug_55934 diff --git a/clang/test/Analysis/symbol-simplification-disequality-info.cpp b/clang/test/Analysis/symbol-simplification-disequality-info.cpp index 69238b583eb84..33b8f150f5d02 100644 --- a/clang/test/Analysis/symbol-simplification-disequality-info.cpp +++ b/clang/test/Analysis/symbol-simplification-disequality-info.cpp @@ -14,14 +14,14 @@ void test(int a, int b, int c, int d) { clang_analyzer_printState(); // CHECK: "disequality_info": [ // CHECK-NEXT: { - // CHECK-NEXT: "class": [ "((reg_$0) + (reg_$1)) + (reg_$2)" ], + // CHECK-NEXT: "class": [ "((reg_$0) + (reg_$2)) + (reg_$5)" ], // CHECK-NEXT: "disequal_to": [ - // CHECK-NEXT: [ "reg_$3" ]] + // CHECK-NEXT: [ "reg_$8" ]] 
// CHECK-NEXT: }, // CHECK-NEXT: { - // CHECK-NEXT: "class": [ "reg_$3" ], + // CHECK-NEXT: "class": [ "reg_$8" ], // CHECK-NEXT: "disequal_to": [ - // CHECK-NEXT: [ "((reg_$0) + (reg_$1)) + (reg_$2)" ]] + // CHECK-NEXT: [ "((reg_$0) + (reg_$2)) + (reg_$5)" ]] // CHECK-NEXT: } // CHECK-NEXT: ], @@ -32,14 +32,14 @@ void test(int a, int b, int c, int d) { clang_analyzer_printState(); // CHECK: "disequality_info": [ // CHECK-NEXT: { - // CHECK-NEXT: "class": [ "(reg_$0) + (reg_$2)" ], + // CHECK-NEXT: "class": [ "(reg_$0) + (reg_$5)" ], // CHECK-NEXT: "disequal_to": [ - // CHECK-NEXT: [ "reg_$3" ]] + // CHECK-NEXT: [ "reg_$8" ]] // CHECK-NEXT: }, // CHECK-NEXT: { - // CHECK-NEXT: "class": [ "reg_$3" ], + // CHECK-NEXT: "class": [ "reg_$8" ], // CHECK-NEXT: "disequal_to": [ - // CHECK-NEXT: [ "(reg_$0) + (reg_$2)" ]] + // CHECK-NEXT: [ "(reg_$0) + (reg_$5)" ]] // CHECK-NEXT: } // CHECK-NEXT: ], @@ -50,10 +50,10 @@ void test(int a, int b, int c, int d) { // CHECK-NEXT: { // CHECK-NEXT: "class": [ "reg_$0" ], // CHECK-NEXT: "disequal_to": [ - // CHECK-NEXT: [ "reg_$3" ]] + // CHECK-NEXT: [ "reg_$8" ]] // CHECK-NEXT: }, // CHECK-NEXT: { - // CHECK-NEXT: "class": [ "reg_$3" ], + // CHECK-NEXT: "class": [ "reg_$8" ], // CHECK-NEXT: "disequal_to": [ // CHECK-NEXT: [ "reg_$0" ]] // CHECK-NEXT: } diff --git a/clang/test/Analysis/symbol-simplification-fixpoint-one-iteration.cpp b/clang/test/Analysis/symbol-simplification-fixpoint-one-iteration.cpp index 73922d420a8c3..42e984762538e 100644 --- a/clang/test/Analysis/symbol-simplification-fixpoint-one-iteration.cpp +++ b/clang/test/Analysis/symbol-simplification-fixpoint-one-iteration.cpp @@ -13,10 +13,10 @@ void test(int a, int b, int c) { return; clang_analyzer_printState(); // CHECK: "constraints": [ - // CHECK-NEXT: { "symbol": "((reg_$0) + (reg_$1)) != (reg_$2)", "range": "{ [0, 0] }" } + // CHECK-NEXT: { "symbol": "((reg_$0) + (reg_$2)) != (reg_$5)", "range": "{ [0, 0] }" } // CHECK-NEXT: ], // CHECK-NEXT: 
"equivalence_classes": [ - // CHECK-NEXT: [ "(reg_$0) + (reg_$1)", "reg_$2" ] + // CHECK-NEXT: [ "(reg_$0) + (reg_$2)", "reg_$5" ] // CHECK-NEXT: ], // CHECK-NEXT: "disequality_info": null, @@ -25,12 +25,12 @@ void test(int a, int b, int c) { return; clang_analyzer_printState(); // CHECK: "constraints": [ - // CHECK-NEXT: { "symbol": "(reg_$0) != (reg_$2)", "range": "{ [0, 0] }" }, - // CHECK-NEXT: { "symbol": "reg_$1", "range": "{ [0, 0] }" } + // CHECK-NEXT: { "symbol": "(reg_$0) != (reg_$5)", "range": "{ [0, 0] }" }, + // CHECK-NEXT: { "symbol": "reg_$2", "range": "{ [0, 0] }" } // CHECK-NEXT: ], // CHECK-NEXT: "equivalence_classes": [ - // CHECK-NEXT: [ "(reg_$0) != (reg_$2)" ], - // CHECK-NEXT: [ "reg_$0", "reg_$2" ] + // CHECK-NEXT: [ "(reg_$0) != (reg_$5)" ], + // CHECK-NEXT: [ "reg_$0", "reg_$5" ] // CHECK-NEXT: ], // CHECK-NEXT: "disequality_info": null, diff --git a/clang/test/Analysis/symbol-simplification-fixpoint-two-iterations.cpp b/clang/test/Analysis/symbol-simplification-fixpoint-two-iterations.cpp index 679ed3fda7a7a..cffb5a70869eb 100644 --- a/clang/test/Analysis/symbol-simplification-fixpoint-two-iterations.cpp +++ b/clang/test/Analysis/symbol-simplification-fixpoint-two-iterations.cpp @@ -15,11 +15,11 @@ void test(int a, int b, int c, int d) { return; clang_analyzer_printState(); // CHECK: "constraints": [ - // CHECK-NEXT: { "symbol": "(((reg_$0) + (reg_$1)) + (reg_$2)) != (reg_$3)", "range": "{ [0, 0] }" }, - // CHECK-NEXT: { "symbol": "(reg_$2) + (reg_$1)", "range": "{ [0, 0] }" } + // CHECK-NEXT: { "symbol": "(((reg_$0) + (reg_$2)) + (reg_$5)) != (reg_$8)", "range": "{ [0, 0] }" }, + // CHECK-NEXT: { "symbol": "(reg_$5) + (reg_$2)", "range": "{ [0, 0] }" } // CHECK-NEXT: ], // CHECK-NEXT: "equivalence_classes": [ - // CHECK-NEXT: [ "((reg_$0) + (reg_$1)) + (reg_$2)", "reg_$3" ] + // CHECK-NEXT: [ "((reg_$0) + (reg_$2)) + (reg_$5)", "reg_$8" ] // CHECK-NEXT: ], // CHECK-NEXT: "disequality_info": null, @@ -28,14 +28,14 @@ void test(int a, int 
b, int c, int d) { return; clang_analyzer_printState(); // CHECK: "constraints": [ - // CHECK-NEXT: { "symbol": "(reg_$0) != (reg_$3)", "range": "{ [0, 0] }" }, - // CHECK-NEXT: { "symbol": "reg_$1", "range": "{ [0, 0] }" }, - // CHECK-NEXT: { "symbol": "reg_$2", "range": "{ [0, 0] }" } + // CHECK-NEXT: { "symbol": "(reg_$0) != (reg_$8)", "range": "{ [0, 0] }" }, + // CHECK-NEXT: { "symbol": "reg_$2", "range": "{ [0, 0] }" }, + // CHECK-NEXT: { "symbol": "reg_$5", "range": "{ [0, 0] }" } // CHECK-NEXT: ], // CHECK-NEXT: "equivalence_classes": [ - // CHECK-NEXT: [ "(reg_$0) != (reg_$3)" ], - // CHECK-NEXT: [ "reg_$0", "reg_$3" ], - // CHECK-NEXT: [ "reg_$2" ] + // CHECK-NEXT: [ "(reg_$0) != (reg_$8)" ], + // CHECK-NEXT: [ "reg_$0", "reg_$8" ], + // CHECK-NEXT: [ "reg_$5" ] // CHECK-NEXT: ], // CHECK-NEXT: "disequality_info": null, diff --git a/clang/test/Analysis/unary-sym-expr.c b/clang/test/Analysis/unary-sym-expr.c index 92e11b295bee7..64a01a956c442 100644 --- a/clang/test/Analysis/unary-sym-expr.c +++ b/clang/test/Analysis/unary-sym-expr.c @@ -11,9 +11,9 @@ int test(int x, int y) { clang_analyzer_dump(-x); // expected-warning{{-reg_$0}} clang_analyzer_dump(~x); // expected-warning{{~reg_$0}} int z = x + y; - clang_analyzer_dump(-z); // expected-warning{{-((reg_$0) + (reg_$1))}} - clang_analyzer_dump(-(x + y)); // expected-warning{{-((reg_$0) + (reg_$1))}} - clang_analyzer_dump(-x + y); // expected-warning{{(-reg_$0) + (reg_$1)}} + clang_analyzer_dump(-z); // expected-warning{{-((reg_$0) + (reg_$3))}} + clang_analyzer_dump(-(x + y)); // expected-warning{{-((reg_$0) + (reg_$3))}} + clang_analyzer_dump(-x + y); // expected-warning{{(-reg_$0) + (reg_$3)}} if (-x == 0) { clang_analyzer_eval(-x == 0); // expected-warning{{TRUE}} From a106ad0f1d0f74fde3591149c63f3e94ec780fef Mon Sep 17 00:00:00 2001 From: Balazs Benics Date: Fri, 3 Jan 2025 19:43:24 +0100 Subject: [PATCH 386/567] Revert "[clang][analyzer] Stable order for SymbolRef-keyed containers" (#121592) Reverts 
llvm/llvm-project#121551 We had a bunch of build errors caused by this PR. https://lab.llvm.org/buildbot/#/builders/144/builds/14875 --- .../Core/PathSensitive/SymExpr.h | 31 ++---- .../Core/PathSensitive/SymbolManager.h | 100 ++++-------------- .../lib/StaticAnalyzer/Core/SymbolManager.cpp | 25 +++-- clang/test/Analysis/dump_egraph.cpp | 2 +- .../expr-inspection-printState-diseq-info.c | 12 +-- .../expr-inspection-printState-eq-classes.c | 4 +- clang/test/Analysis/ptr-arith.cpp | 4 +- ...symbol-simplification-disequality-info.cpp | 20 ++-- ...-simplification-fixpoint-one-iteration.cpp | 12 +-- ...simplification-fixpoint-two-iterations.cpp | 18 ++-- clang/test/Analysis/unary-sym-expr.c | 6 +- 11 files changed, 85 insertions(+), 149 deletions(-) diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymExpr.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymExpr.h index aca14cf813c4b..862a30c0e7363 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymExpr.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymExpr.h @@ -25,8 +25,6 @@ namespace ento { class MemRegion; -using SymbolID = unsigned; - /// Symbolic value. These values used to capture symbolic execution of /// the program. class SymExpr : public llvm::FoldingSetNode { @@ -41,19 +39,9 @@ class SymExpr : public llvm::FoldingSetNode { private: Kind K; - /// A unique identifier for this symbol. - /// - /// It is useful for SymbolData to easily differentiate multiple symbols, but - /// also for "ephemeral" symbols, such as binary operations, because this id - /// can be used for arranging constraints or equivalence classes instead of - /// unstable pointer values. - /// - /// Note, however, that it can't be used in Profile because SymbolManager - /// needs to compute Profile before allocating SymExpr. 
- const SymbolID Sym; protected: - SymExpr(Kind k, SymbolID Sym) : K(k), Sym(Sym) {} + SymExpr(Kind k) : K(k) {} static bool isValidTypeForSymbol(QualType T) { // FIXME: Depending on whether we choose to deprecate structural symbols, @@ -68,14 +56,6 @@ class SymExpr : public llvm::FoldingSetNode { Kind getKind() const { return K; } - /// Get a unique identifier for this symbol. - /// The ID is unique across all SymExprs in a SymbolManager. - /// They reflect the allocation order of these SymExprs, - /// and are likely stable across runs. - /// Used as a key in SymbolRef containers and as part of identity - /// for SymbolData, e.g. SymbolConjured with ID = 7 is "conj_$7". - SymbolID getSymbolID() const { return Sym; } - virtual void dump() const; virtual void dumpToStream(raw_ostream &os) const {} @@ -132,14 +112,19 @@ inline raw_ostream &operator<<(raw_ostream &os, using SymbolRef = const SymExpr *; using SymbolRefSmallVectorTy = SmallVector; +using SymbolID = unsigned; /// A symbol representing data which can be stored in a memory location /// (region). class SymbolData : public SymExpr { + const SymbolID Sym; + void anchor() override; protected: - SymbolData(Kind k, SymbolID sym) : SymExpr(k, sym) { assert(classof(this)); } + SymbolData(Kind k, SymbolID sym) : SymExpr(k), Sym(sym) { + assert(classof(this)); + } public: ~SymbolData() override = default; @@ -147,6 +132,8 @@ class SymbolData : public SymExpr { /// Get a string representation of the kind of the region. 
virtual StringRef getKindStr() const = 0; + SymbolID getSymbolID() const { return Sym; } + unsigned computeComplexity() const override { return 1; }; diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h index b57f415ec139f..73732d532f630 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h @@ -25,7 +25,6 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/FoldingSet.h" -#include "llvm/ADT/ImmutableSet.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Support/Allocator.h" #include @@ -44,16 +43,15 @@ class StoreManager; class SymbolRegionValue : public SymbolData { const TypedValueRegion *R; - friend class SymExprAllocator; +public: SymbolRegionValue(SymbolID sym, const TypedValueRegion *r) : SymbolData(SymbolRegionValueKind, sym), R(r) { assert(r); assert(isValidTypeForSymbol(r->getValueType())); } -public: LLVM_ATTRIBUTE_RETURNS_NONNULL - const TypedValueRegion *getRegion() const { return R; } + const TypedValueRegion* getRegion() const { return R; } static void Profile(llvm::FoldingSetNodeID& profile, const TypedValueRegion* R) { profile.AddInteger((unsigned) SymbolRegionValueKind); @@ -86,7 +84,7 @@ class SymbolConjured : public SymbolData { const LocationContext *LCtx; const void *SymbolTag; - friend class SymExprAllocator; +public: SymbolConjured(SymbolID sym, const Stmt *s, const LocationContext *lctx, QualType t, unsigned count, const void *symbolTag) : SymbolData(SymbolConjuredKind, sym), S(s), T(t), Count(count), @@ -100,7 +98,6 @@ class SymbolConjured : public SymbolData { assert(isValidTypeForSymbol(t)); } -public: /// It might return null. 
const Stmt *getStmt() const { return S; } unsigned getCount() const { return Count; } @@ -140,7 +137,7 @@ class SymbolDerived : public SymbolData { SymbolRef parentSymbol; const TypedValueRegion *R; - friend class SymExprAllocator; +public: SymbolDerived(SymbolID sym, SymbolRef parent, const TypedValueRegion *r) : SymbolData(SymbolDerivedKind, sym), parentSymbol(parent), R(r) { assert(parent); @@ -148,7 +145,6 @@ class SymbolDerived : public SymbolData { assert(isValidTypeForSymbol(r->getValueType())); } -public: LLVM_ATTRIBUTE_RETURNS_NONNULL SymbolRef getParentSymbol() const { return parentSymbol; } LLVM_ATTRIBUTE_RETURNS_NONNULL @@ -184,13 +180,12 @@ class SymbolDerived : public SymbolData { class SymbolExtent : public SymbolData { const SubRegion *R; - friend class SymExprAllocator; +public: SymbolExtent(SymbolID sym, const SubRegion *r) : SymbolData(SymbolExtentKind, sym), R(r) { assert(r); } -public: LLVM_ATTRIBUTE_RETURNS_NONNULL const SubRegion *getRegion() const { return R; } @@ -227,7 +222,7 @@ class SymbolMetadata : public SymbolData { unsigned Count; const void *Tag; - friend class SymExprAllocator; +public: SymbolMetadata(SymbolID sym, const MemRegion* r, const Stmt *s, QualType t, const LocationContext *LCtx, unsigned count, const void *tag) : SymbolData(SymbolMetadataKind, sym), R(r), S(s), T(t), LCtx(LCtx), @@ -239,7 +234,6 @@ class SymbolMetadata : public SymbolData { assert(tag); } - public: LLVM_ATTRIBUTE_RETURNS_NONNULL const MemRegion *getRegion() const { return R; } @@ -292,16 +286,15 @@ class SymbolCast : public SymExpr { /// The type of the result. 
QualType ToTy; - friend class SymExprAllocator; - SymbolCast(SymbolID Sym, const SymExpr *In, QualType From, QualType To) - : SymExpr(SymbolCastKind, Sym), Operand(In), FromTy(From), ToTy(To) { +public: + SymbolCast(const SymExpr *In, QualType From, QualType To) + : SymExpr(SymbolCastKind), Operand(In), FromTy(From), ToTy(To) { assert(In); assert(isValidTypeForSymbol(From)); // FIXME: GenericTaintChecker creates symbols of void type. // Otherwise, 'To' should also be a valid type. } -public: unsigned computeComplexity() const override { if (Complexity == 0) Complexity = 1 + Operand->computeComplexity(); @@ -339,10 +332,9 @@ class UnarySymExpr : public SymExpr { UnaryOperator::Opcode Op; QualType T; - friend class SymExprAllocator; - UnarySymExpr(SymbolID Sym, const SymExpr *In, UnaryOperator::Opcode Op, - QualType T) - : SymExpr(UnarySymExprKind, Sym), Operand(In), Op(Op), T(T) { +public: + UnarySymExpr(const SymExpr *In, UnaryOperator::Opcode Op, QualType T) + : SymExpr(UnarySymExprKind), Operand(In), Op(Op), T(T) { // Note, some unary operators are modeled as a binary operator. E.g. ++x is // modeled as x + 1. assert((Op == UO_Minus || Op == UO_Not) && "non-supported unary expression"); @@ -353,7 +345,6 @@ class UnarySymExpr : public SymExpr { assert(!Loc::isLocType(T) && "unary symbol should be nonloc"); } -public: unsigned computeComplexity() const override { if (Complexity == 0) Complexity = 1 + Operand->computeComplexity(); @@ -390,8 +381,8 @@ class BinarySymExpr : public SymExpr { QualType T; protected: - BinarySymExpr(SymbolID Sym, Kind k, BinaryOperator::Opcode op, QualType t) - : SymExpr(k, Sym), Op(op), T(t) { + BinarySymExpr(Kind k, BinaryOperator::Opcode op, QualType t) + : SymExpr(k), Op(op), T(t) { assert(classof(this)); // Binary expressions are results of arithmetic. 
Pointer arithmetic is not // handled by binary expressions, but it is instead handled by applying @@ -434,15 +425,14 @@ class BinarySymExprImpl : public BinarySymExpr { LHSTYPE LHS; RHSTYPE RHS; - friend class SymExprAllocator; - BinarySymExprImpl(SymbolID Sym, LHSTYPE lhs, BinaryOperator::Opcode op, - RHSTYPE rhs, QualType t) - : BinarySymExpr(Sym, ClassKind, op, t), LHS(lhs), RHS(rhs) { +public: + BinarySymExprImpl(LHSTYPE lhs, BinaryOperator::Opcode op, RHSTYPE rhs, + QualType t) + : BinarySymExpr(ClassKind, op, t), LHS(lhs), RHS(rhs) { assert(getPointer(lhs)); assert(getPointer(rhs)); } -public: void dumpToStream(raw_ostream &os) const override { dumpToStreamImpl(os, LHS); dumpToStreamImpl(os, getOpcode()); @@ -488,21 +478,6 @@ using IntSymExpr = BinarySymExprImpl; -class SymExprAllocator { - SymbolID NextSymbolID = 0; - llvm::BumpPtrAllocator &Alloc; - -public: - explicit SymExprAllocator(llvm::BumpPtrAllocator &Alloc) : Alloc(Alloc) {} - - template SymT *make(ArgsT &&...Args) { - return new (Alloc) SymT(nextID(), std::forward(Args)...); - } - -private: - SymbolID nextID() { return NextSymbolID++; } -}; - class SymbolManager { using DataSetTy = llvm::FoldingSet; using SymbolDependTy = @@ -514,14 +489,15 @@ class SymbolManager { /// alive as long as the key is live. SymbolDependTy SymbolDependencies; - SymExprAllocator Alloc; + unsigned SymbolCounter = 0; + llvm::BumpPtrAllocator& BPAlloc; BasicValueFactory &BV; ASTContext &Ctx; public: SymbolManager(ASTContext &ctx, BasicValueFactory &bv, - llvm::BumpPtrAllocator &bpalloc) - : SymbolDependencies(16), Alloc(bpalloc), BV(bv), Ctx(ctx) {} + llvm::BumpPtrAllocator& bpalloc) + : SymbolDependencies(16), BPAlloc(bpalloc), BV(bv), Ctx(ctx) {} static bool canSymbolicate(QualType T); @@ -711,36 +687,4 @@ class SymbolVisitor { } // namespace clang -// Override the default definition that would use pointer values of SymbolRefs -// to order them, which is unstable due to ASLR. 
-// Use the SymbolID instead which reflect the order in which the symbols were -// allocated. This is usually stable across runs leading to the stability of -// ConstraintMap and other containers using SymbolRef as keys. -template <> -struct ::llvm::ImutContainerInfo - : public ImutProfileInfo { - using value_type = clang::ento::SymbolRef; - using value_type_ref = clang::ento::SymbolRef; - using key_type = value_type; - using key_type_ref = value_type_ref; - using data_type = bool; - using data_type_ref = bool; - - static key_type_ref KeyOfValue(value_type_ref D) { return D; } - static data_type_ref DataOfValue(value_type_ref) { return true; } - - static bool isEqual(clang::ento::SymbolRef LHS, clang::ento::SymbolRef RHS) { - return LHS->getSymbolID() == RHS->getSymbolID(); - } - - static bool isLess(clang::ento::SymbolRef LHS, clang::ento::SymbolRef RHS) { - return LHS->getSymbolID() < RHS->getSymbolID(); - } - - // This might seem redundant, but it is required because of the way - // ImmutableSet is implemented through AVLTree: - // same as ImmutableMap, but with a non-informative "data". 
- static bool isDataEqual(data_type_ref, data_type_ref) { return true; } -}; - #endif // LLVM_CLANG_STATICANALYZER_CORE_PATHSENSITIVE_SYMBOLMANAGER_H diff --git a/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp b/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp index 738b6a175ce6d..f21e5c3ad7bd7 100644 --- a/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp +++ b/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp @@ -170,8 +170,9 @@ SymbolManager::getRegionValueSymbol(const TypedValueRegion* R) { void *InsertPos; SymExpr *SD = DataSet.FindNodeOrInsertPos(profile, InsertPos); if (!SD) { - SD = Alloc.make(R); + SD = new (BPAlloc) SymbolRegionValue(SymbolCounter, R); DataSet.InsertNode(SD, InsertPos); + ++SymbolCounter; } return cast(SD); @@ -187,8 +188,9 @@ const SymbolConjured* SymbolManager::conjureSymbol(const Stmt *E, void *InsertPos; SymExpr *SD = DataSet.FindNodeOrInsertPos(profile, InsertPos); if (!SD) { - SD = Alloc.make(E, LCtx, T, Count, SymbolTag); + SD = new (BPAlloc) SymbolConjured(SymbolCounter, E, LCtx, T, Count, SymbolTag); DataSet.InsertNode(SD, InsertPos); + ++SymbolCounter; } return cast(SD); @@ -202,8 +204,9 @@ SymbolManager::getDerivedSymbol(SymbolRef parentSymbol, void *InsertPos; SymExpr *SD = DataSet.FindNodeOrInsertPos(profile, InsertPos); if (!SD) { - SD = Alloc.make(parentSymbol, R); + SD = new (BPAlloc) SymbolDerived(SymbolCounter, parentSymbol, R); DataSet.InsertNode(SD, InsertPos); + ++SymbolCounter; } return cast(SD); @@ -216,8 +219,9 @@ SymbolManager::getExtentSymbol(const SubRegion *R) { void *InsertPos; SymExpr *SD = DataSet.FindNodeOrInsertPos(profile, InsertPos); if (!SD) { - SD = Alloc.make(R); + SD = new (BPAlloc) SymbolExtent(SymbolCounter, R); DataSet.InsertNode(SD, InsertPos); + ++SymbolCounter; } return cast(SD); @@ -232,8 +236,9 @@ SymbolManager::getMetadataSymbol(const MemRegion* R, const Stmt *S, QualType T, void *InsertPos; SymExpr *SD = DataSet.FindNodeOrInsertPos(profile, InsertPos); if (!SD) { - SD = Alloc.make(R, S, T, 
LCtx, Count, SymbolTag); + SD = new (BPAlloc) SymbolMetadata(SymbolCounter, R, S, T, LCtx, Count, SymbolTag); DataSet.InsertNode(SD, InsertPos); + ++SymbolCounter; } return cast(SD); @@ -247,7 +252,7 @@ SymbolManager::getCastSymbol(const SymExpr *Op, void *InsertPos; SymExpr *data = DataSet.FindNodeOrInsertPos(ID, InsertPos); if (!data) { - data = Alloc.make(Op, From, To); + data = new (BPAlloc) SymbolCast(Op, From, To); DataSet.InsertNode(data, InsertPos); } @@ -263,7 +268,7 @@ const SymIntExpr *SymbolManager::getSymIntExpr(const SymExpr *lhs, SymExpr *data = DataSet.FindNodeOrInsertPos(ID, InsertPos); if (!data) { - data = Alloc.make(lhs, op, v, t); + data = new (BPAlloc) SymIntExpr(lhs, op, v, t); DataSet.InsertNode(data, InsertPos); } @@ -279,7 +284,7 @@ const IntSymExpr *SymbolManager::getIntSymExpr(APSIntPtr lhs, SymExpr *data = DataSet.FindNodeOrInsertPos(ID, InsertPos); if (!data) { - data = Alloc.make(lhs, op, rhs, t); + data = new (BPAlloc) IntSymExpr(lhs, op, rhs, t); DataSet.InsertNode(data, InsertPos); } @@ -296,7 +301,7 @@ const SymSymExpr *SymbolManager::getSymSymExpr(const SymExpr *lhs, SymExpr *data = DataSet.FindNodeOrInsertPos(ID, InsertPos); if (!data) { - data = Alloc.make(lhs, op, rhs, t); + data = new (BPAlloc) SymSymExpr(lhs, op, rhs, t); DataSet.InsertNode(data, InsertPos); } @@ -311,7 +316,7 @@ const UnarySymExpr *SymbolManager::getUnarySymExpr(const SymExpr *Operand, void *InsertPos; SymExpr *data = DataSet.FindNodeOrInsertPos(ID, InsertPos); if (!data) { - data = Alloc.make(Operand, Opc, T); + data = new (BPAlloc) UnarySymExpr(Operand, Opc, T); DataSet.InsertNode(data, InsertPos); } diff --git a/clang/test/Analysis/dump_egraph.cpp b/clang/test/Analysis/dump_egraph.cpp index 13459699a06f6..d1229b2634674 100644 --- a/clang/test/Analysis/dump_egraph.cpp +++ b/clang/test/Analysis/dump_egraph.cpp @@ -21,7 +21,7 @@ void foo() { // CHECK: \"location_context\": \"#0 Call\", \"calling\": \"T::T\", \"location\": \{ \"line\": 15, \"column\": 5, 
\"file\": \"{{.*}}dump_egraph.cpp\" \}, \"items\": [\l        \{ \"init_id\": {{[0-9]+}}, \"kind\": \"construct into member variable\", \"argument_index\": null, \"pretty\": \"s\", \"value\": \"&t.s\" -// CHECK: \"cluster\": \"t\", \"pointer\": \"{{0x[0-9a-f]+}}\", \"items\": [\l        \{ \"kind\": \"Default\", \"offset\": 0, \"value\": \"conj_$3\{int, LC5, no stmt, #1\}\" +// CHECK: \"cluster\": \"t\", \"pointer\": \"{{0x[0-9a-f]+}}\", \"items\": [\l        \{ \"kind\": \"Default\", \"offset\": 0, \"value\": \"conj_$2\{int, LC5, no stmt, #1\}\" // CHECK: \"dynamic_types\": [\l      \{ \"region\": \"HeapSymRegion\{conj_$1\{S *, LC1, S{{[0-9]+}}, #1\}\}\", \"dyn_type\": \"S\", \"sub_classable\": false \}\l diff --git a/clang/test/Analysis/expr-inspection-printState-diseq-info.c b/clang/test/Analysis/expr-inspection-printState-diseq-info.c index 515fcbbd43079..c5c31785a600e 100644 --- a/clang/test/Analysis/expr-inspection-printState-diseq-info.c +++ b/clang/test/Analysis/expr-inspection-printState-diseq-info.c @@ -18,17 +18,17 @@ void test_disequality_info(int e0, int b0, int b1, int c0) { // CHECK-NEXT: { // CHECK-NEXT: "class": [ "(reg_$0) - 2" ], // CHECK-NEXT: "disequal_to": [ - // CHECK-NEXT: [ "reg_$7" ]] + // CHECK-NEXT: [ "reg_$2" ]] // CHECK-NEXT: }, // CHECK-NEXT: { - // CHECK-NEXT: "class": [ "reg_$15" ], + // CHECK-NEXT: "class": [ "reg_$2" ], // CHECK-NEXT: "disequal_to": [ - // CHECK-NEXT: [ "reg_$7" ]] + // CHECK-NEXT: [ "(reg_$0) - 2" ], + // CHECK-NEXT: [ "reg_$3" ]] // CHECK-NEXT: }, // CHECK-NEXT: { - // CHECK-NEXT: "class": [ "reg_$7" ], + // CHECK-NEXT: "class": [ "reg_$3" ], // CHECK-NEXT: "disequal_to": [ - // CHECK-NEXT: [ "(reg_$0) - 2" ], - // CHECK-NEXT: [ "reg_$15" ]] + // CHECK-NEXT: [ "reg_$2" ]] // CHECK-NEXT: } // CHECK-NEXT: ], diff --git a/clang/test/Analysis/expr-inspection-printState-eq-classes.c b/clang/test/Analysis/expr-inspection-printState-eq-classes.c index 19cc13735ab5a..38e23d6e83826 100644 --- 
a/clang/test/Analysis/expr-inspection-printState-eq-classes.c +++ b/clang/test/Analysis/expr-inspection-printState-eq-classes.c @@ -16,6 +16,6 @@ void test_equivalence_classes(int a, int b, int c, int d) { } // CHECK: "equivalence_classes": [ -// CHECK-NEXT: [ "(reg_$0) != (reg_$5)" ], -// CHECK-NEXT: [ "reg_$0", "reg_$20", "reg_$5" ] +// CHECK-NEXT: [ "(reg_$0) != (reg_$2)" ], +// CHECK-NEXT: [ "reg_$0", "reg_$2", "reg_$3" ] // CHECK-NEXT: ], diff --git a/clang/test/Analysis/ptr-arith.cpp b/clang/test/Analysis/ptr-arith.cpp index ec1c75c0c4063..a1264a1f04839 100644 --- a/clang/test/Analysis/ptr-arith.cpp +++ b/clang/test/Analysis/ptr-arith.cpp @@ -139,10 +139,10 @@ struct parse_t { int parse(parse_t *p) { unsigned copy = p->bits2; clang_analyzer_dump(copy); - // expected-warning@-1 {{reg_$2},0 S64b,struct Bug_55934::parse_t}.bits2>}} + // expected-warning@-1 {{reg_$1},0 S64b,struct Bug_55934::parse_t}.bits2>}} header *bits = (header *)© clang_analyzer_dump(bits->b); - // expected-warning@-1 {{derived_$4{reg_$2},0 S64b,struct Bug_55934::parse_t}.bits2>,Element{copy,0 S64b,struct Bug_55934::header}.b}}} + // expected-warning@-1 {{derived_$2{reg_$1},0 S64b,struct Bug_55934::parse_t}.bits2>,Element{copy,0 S64b,struct Bug_55934::header}.b}}} return bits->b; // no-warning } } // namespace Bug_55934 diff --git a/clang/test/Analysis/symbol-simplification-disequality-info.cpp b/clang/test/Analysis/symbol-simplification-disequality-info.cpp index 33b8f150f5d02..69238b583eb84 100644 --- a/clang/test/Analysis/symbol-simplification-disequality-info.cpp +++ b/clang/test/Analysis/symbol-simplification-disequality-info.cpp @@ -14,14 +14,14 @@ void test(int a, int b, int c, int d) { clang_analyzer_printState(); // CHECK: "disequality_info": [ // CHECK-NEXT: { - // CHECK-NEXT: "class": [ "((reg_$0) + (reg_$2)) + (reg_$5)" ], + // CHECK-NEXT: "class": [ "((reg_$0) + (reg_$1)) + (reg_$2)" ], // CHECK-NEXT: "disequal_to": [ - // CHECK-NEXT: [ "reg_$8" ]] + // CHECK-NEXT: [ "reg_$3" ]] 
// CHECK-NEXT: }, // CHECK-NEXT: { - // CHECK-NEXT: "class": [ "reg_$8" ], + // CHECK-NEXT: "class": [ "reg_$3" ], // CHECK-NEXT: "disequal_to": [ - // CHECK-NEXT: [ "((reg_$0) + (reg_$2)) + (reg_$5)" ]] + // CHECK-NEXT: [ "((reg_$0) + (reg_$1)) + (reg_$2)" ]] // CHECK-NEXT: } // CHECK-NEXT: ], @@ -32,14 +32,14 @@ void test(int a, int b, int c, int d) { clang_analyzer_printState(); // CHECK: "disequality_info": [ // CHECK-NEXT: { - // CHECK-NEXT: "class": [ "(reg_$0) + (reg_$5)" ], + // CHECK-NEXT: "class": [ "(reg_$0) + (reg_$2)" ], // CHECK-NEXT: "disequal_to": [ - // CHECK-NEXT: [ "reg_$8" ]] + // CHECK-NEXT: [ "reg_$3" ]] // CHECK-NEXT: }, // CHECK-NEXT: { - // CHECK-NEXT: "class": [ "reg_$8" ], + // CHECK-NEXT: "class": [ "reg_$3" ], // CHECK-NEXT: "disequal_to": [ - // CHECK-NEXT: [ "(reg_$0) + (reg_$5)" ]] + // CHECK-NEXT: [ "(reg_$0) + (reg_$2)" ]] // CHECK-NEXT: } // CHECK-NEXT: ], @@ -50,10 +50,10 @@ void test(int a, int b, int c, int d) { // CHECK-NEXT: { // CHECK-NEXT: "class": [ "reg_$0" ], // CHECK-NEXT: "disequal_to": [ - // CHECK-NEXT: [ "reg_$8" ]] + // CHECK-NEXT: [ "reg_$3" ]] // CHECK-NEXT: }, // CHECK-NEXT: { - // CHECK-NEXT: "class": [ "reg_$8" ], + // CHECK-NEXT: "class": [ "reg_$3" ], // CHECK-NEXT: "disequal_to": [ // CHECK-NEXT: [ "reg_$0" ]] // CHECK-NEXT: } diff --git a/clang/test/Analysis/symbol-simplification-fixpoint-one-iteration.cpp b/clang/test/Analysis/symbol-simplification-fixpoint-one-iteration.cpp index 42e984762538e..73922d420a8c3 100644 --- a/clang/test/Analysis/symbol-simplification-fixpoint-one-iteration.cpp +++ b/clang/test/Analysis/symbol-simplification-fixpoint-one-iteration.cpp @@ -13,10 +13,10 @@ void test(int a, int b, int c) { return; clang_analyzer_printState(); // CHECK: "constraints": [ - // CHECK-NEXT: { "symbol": "((reg_$0) + (reg_$2)) != (reg_$5)", "range": "{ [0, 0] }" } + // CHECK-NEXT: { "symbol": "((reg_$0) + (reg_$1)) != (reg_$2)", "range": "{ [0, 0] }" } // CHECK-NEXT: ], // CHECK-NEXT: 
"equivalence_classes": [ - // CHECK-NEXT: [ "(reg_$0) + (reg_$2)", "reg_$5" ] + // CHECK-NEXT: [ "(reg_$0) + (reg_$1)", "reg_$2" ] // CHECK-NEXT: ], // CHECK-NEXT: "disequality_info": null, @@ -25,12 +25,12 @@ void test(int a, int b, int c) { return; clang_analyzer_printState(); // CHECK: "constraints": [ - // CHECK-NEXT: { "symbol": "(reg_$0) != (reg_$5)", "range": "{ [0, 0] }" }, - // CHECK-NEXT: { "symbol": "reg_$2", "range": "{ [0, 0] }" } + // CHECK-NEXT: { "symbol": "(reg_$0) != (reg_$2)", "range": "{ [0, 0] }" }, + // CHECK-NEXT: { "symbol": "reg_$1", "range": "{ [0, 0] }" } // CHECK-NEXT: ], // CHECK-NEXT: "equivalence_classes": [ - // CHECK-NEXT: [ "(reg_$0) != (reg_$5)" ], - // CHECK-NEXT: [ "reg_$0", "reg_$5" ] + // CHECK-NEXT: [ "(reg_$0) != (reg_$2)" ], + // CHECK-NEXT: [ "reg_$0", "reg_$2" ] // CHECK-NEXT: ], // CHECK-NEXT: "disequality_info": null, diff --git a/clang/test/Analysis/symbol-simplification-fixpoint-two-iterations.cpp b/clang/test/Analysis/symbol-simplification-fixpoint-two-iterations.cpp index cffb5a70869eb..679ed3fda7a7a 100644 --- a/clang/test/Analysis/symbol-simplification-fixpoint-two-iterations.cpp +++ b/clang/test/Analysis/symbol-simplification-fixpoint-two-iterations.cpp @@ -15,11 +15,11 @@ void test(int a, int b, int c, int d) { return; clang_analyzer_printState(); // CHECK: "constraints": [ - // CHECK-NEXT: { "symbol": "(((reg_$0) + (reg_$2)) + (reg_$5)) != (reg_$8)", "range": "{ [0, 0] }" }, - // CHECK-NEXT: { "symbol": "(reg_$5) + (reg_$2)", "range": "{ [0, 0] }" } + // CHECK-NEXT: { "symbol": "(((reg_$0) + (reg_$1)) + (reg_$2)) != (reg_$3)", "range": "{ [0, 0] }" }, + // CHECK-NEXT: { "symbol": "(reg_$2) + (reg_$1)", "range": "{ [0, 0] }" } // CHECK-NEXT: ], // CHECK-NEXT: "equivalence_classes": [ - // CHECK-NEXT: [ "((reg_$0) + (reg_$2)) + (reg_$5)", "reg_$8" ] + // CHECK-NEXT: [ "((reg_$0) + (reg_$1)) + (reg_$2)", "reg_$3" ] // CHECK-NEXT: ], // CHECK-NEXT: "disequality_info": null, @@ -28,14 +28,14 @@ void test(int a, int 
b, int c, int d) { return; clang_analyzer_printState(); // CHECK: "constraints": [ - // CHECK-NEXT: { "symbol": "(reg_$0) != (reg_$8)", "range": "{ [0, 0] }" }, - // CHECK-NEXT: { "symbol": "reg_$2", "range": "{ [0, 0] }" }, - // CHECK-NEXT: { "symbol": "reg_$5", "range": "{ [0, 0] }" } + // CHECK-NEXT: { "symbol": "(reg_$0) != (reg_$3)", "range": "{ [0, 0] }" }, + // CHECK-NEXT: { "symbol": "reg_$1", "range": "{ [0, 0] }" }, + // CHECK-NEXT: { "symbol": "reg_$2", "range": "{ [0, 0] }" } // CHECK-NEXT: ], // CHECK-NEXT: "equivalence_classes": [ - // CHECK-NEXT: [ "(reg_$0) != (reg_$8)" ], - // CHECK-NEXT: [ "reg_$0", "reg_$8" ], - // CHECK-NEXT: [ "reg_$5" ] + // CHECK-NEXT: [ "(reg_$0) != (reg_$3)" ], + // CHECK-NEXT: [ "reg_$0", "reg_$3" ], + // CHECK-NEXT: [ "reg_$2" ] // CHECK-NEXT: ], // CHECK-NEXT: "disequality_info": null, diff --git a/clang/test/Analysis/unary-sym-expr.c b/clang/test/Analysis/unary-sym-expr.c index 64a01a956c442..92e11b295bee7 100644 --- a/clang/test/Analysis/unary-sym-expr.c +++ b/clang/test/Analysis/unary-sym-expr.c @@ -11,9 +11,9 @@ int test(int x, int y) { clang_analyzer_dump(-x); // expected-warning{{-reg_$0}} clang_analyzer_dump(~x); // expected-warning{{~reg_$0}} int z = x + y; - clang_analyzer_dump(-z); // expected-warning{{-((reg_$0) + (reg_$3))}} - clang_analyzer_dump(-(x + y)); // expected-warning{{-((reg_$0) + (reg_$3))}} - clang_analyzer_dump(-x + y); // expected-warning{{(-reg_$0) + (reg_$3)}} + clang_analyzer_dump(-z); // expected-warning{{-((reg_$0) + (reg_$1))}} + clang_analyzer_dump(-(x + y)); // expected-warning{{-((reg_$0) + (reg_$1))}} + clang_analyzer_dump(-x + y); // expected-warning{{(-reg_$0) + (reg_$1)}} if (-x == 0) { clang_analyzer_eval(-x == 0); // expected-warning{{TRUE}} From e32afded9227635108fad003e5c6d3bd88e5e3c1 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 3 Jan 2025 10:46:37 -0800 Subject: [PATCH 387/567] [LegalizeVectorOps] Use getBoolConstant instead of getAllOnesConstant in 
VectorLegalizer::UnrollVSETCC. (#121526) This code should follow the target preference for boolean contents of a vector type. We shouldn't assume that true is negative one. --- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index db21e70897064..39903bde25a62 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -2246,11 +2246,13 @@ SDValue VectorLegalizer::UnrollVSETCC(SDNode *Node) { DAG.getVectorIdxConstant(i, dl)); SDValue RHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, RHS, DAG.getVectorIdxConstant(i, dl)); + // FIXME: We should use i1 setcc + boolext here, but it causes regressions. Ops[i] = DAG.getNode(ISD::SETCC, dl, TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), TmpEltVT), LHSElem, RHSElem, CC); - Ops[i] = DAG.getSelect(dl, EltVT, Ops[i], DAG.getAllOnesConstant(dl, EltVT), + Ops[i] = DAG.getSelect(dl, EltVT, Ops[i], + DAG.getBoolConstant(true, dl, EltVT, VT), DAG.getConstant(0, dl, EltVT)); } return DAG.getBuildVector(VT, dl, Ops); From 34d2c3b9349b151bd69defa4880ecf56fb017287 Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Fri, 3 Jan 2025 14:11:25 -0500 Subject: [PATCH 388/567] [AMDGPU][True16][MC] true16 for v_sin_f16 (#120692) Support true16 format for v_sin_f16 in MC --- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 2 +- llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll | 32 ++++ llvm/test/MC/AMDGPU/gfx11_asm_vop1.s | 75 +++++---- llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s | 65 ++++---- llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s | 21 ++- llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s | 42 +++++ .../MC/AMDGPU/gfx11_asm_vop1_t16_promote.s | 154 +++++++++++++----- .../AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s | 65 ++++---- .../MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s | 25 ++- 
.../test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s | 69 ++++---- llvm/test/MC/AMDGPU/gfx12_asm_vop1.s | 72 ++++---- llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s | 62 +++---- llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s | 18 +- llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s | 42 +++++ .../MC/AMDGPU/gfx12_asm_vop1_t16_promote.s | 154 +++++++++++++----- .../test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s | 69 ++++---- .../AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s | 65 ++++---- .../MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s | 25 ++- .../Disassembler/AMDGPU/gfx11_dasm_vop1.txt | 63 +++++-- .../AMDGPU/gfx11_dasm_vop1_dpp16.txt | 54 ++++-- .../AMDGPU/gfx11_dasm_vop1_dpp8.txt | 17 +- .../gfx11_dasm_vop3_dpp16_from_vop1.txt | 54 ++++-- .../AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt | 24 ++- .../AMDGPU/gfx11_dasm_vop3_from_vop1.txt | 57 +++++-- .../AMDGPU/gfx12_dasm_vop1_dpp16.txt | 50 ++++-- .../AMDGPU/gfx12_dasm_vop1_dpp8.txt | 13 +- .../AMDGPU/gfx12_dasm_vop3_from_vop1.txt | 57 +++++-- .../gfx12_dasm_vop3_from_vop1_dpp16.txt | 54 ++++-- .../AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt | 24 ++- 29 files changed, 1062 insertions(+), 462 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 30911d45c9e97..badca264e8f92 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -1045,7 +1045,7 @@ defm V_CEIL_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05c, "v_ceil_f16 defm V_TRUNC_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05d, "v_trunc_f16">; defm V_RNDNE_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05e, "v_rndne_f16">; defm V_FRACT_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05f, "v_fract_f16">; -defm V_SIN_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x060, "v_sin_f16">; +defm V_SIN_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x060, "v_sin_f16">; defm V_COS_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x061, "v_cos_f16">; defm V_SAT_PK_U8_I16 : 
VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x062, "v_sat_pk_u8_i16">; defm V_CVT_NORM_I16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x063, "v_cvt_norm_i16_f16">; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll index 2bb89fdabda7e..6927636ad04aa 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll @@ -4,6 +4,7 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX12 %s define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX6-LABEL: sin_f16: @@ -80,6 +81,19 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX11-NEXT: v_sin_f16_e32 v1, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: sin_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_sin_f16_e32 v1, v1 +; GFX12-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm %a.val = load half, ptr addrspace(1) %a %r.val = call half @llvm.sin.f16(half %a.val) store half %r.val, ptr addrspace(1) %r @@ -188,6 +202,24 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) 
%r, ptr addrspace(1) %a) { ; GFX11-NEXT: v_pack_b32_f16 v1, v1, v2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: sin_v2f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX12-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_mul_f16_e32 v2, 0.15915494, v2 +; GFX12-NEXT: v_sin_f16_e32 v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX12-NEXT: v_sin_f16_e32 v2, v2 +; GFX12-NEXT: v_pack_b32_f16 v1, v1, v2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm %a.val = load <2 x half>, ptr addrspace(1) %a %r.val = call <2 x half> @llvm.sin.v2f16(<2 x half> %a.val) store <2 x half> %r.val, ptr addrspace(1) %r diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s index 5ceb8ed0065d3..9b9837b46b26d 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s @@ -3305,50 +3305,65 @@ v_sat_pk_u8_i16 v5.h, src_scc v_sat_pk_u8_i16 v127.h, 0xfe0b // GFX11: v_sat_pk_u8_i16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xc4,0xfe,0x7f,0x0b,0xfe,0x00,0x00] -v_sin_f16 v5, v1 -// GFX11: v_sin_f16_e32 v5, v1 ; encoding: [0x01,0xc1,0x0a,0x7e] +v_sin_f16 v5.l, v1.l +// GFX11: v_sin_f16_e32 v5.l, v1.l ; encoding: [0x01,0xc1,0x0a,0x7e] -v_sin_f16 v5, v127 -// GFX11: v_sin_f16_e32 v5, v127 ; encoding: [0x7f,0xc1,0x0a,0x7e] +v_sin_f16 v5.l, v127.l +// GFX11: v_sin_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xc1,0x0a,0x7e] -v_sin_f16 v5, s1 -// GFX11: v_sin_f16_e32 v5, s1 ; encoding: [0x01,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, s1 +// GFX11: v_sin_f16_e32 v5.l, s1 ; encoding: [0x01,0xc0,0x0a,0x7e] -v_sin_f16 v5, s105 -// GFX11: 
v_sin_f16_e32 v5, s105 ; encoding: [0x69,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, s105 +// GFX11: v_sin_f16_e32 v5.l, s105 ; encoding: [0x69,0xc0,0x0a,0x7e] -v_sin_f16 v5, vcc_lo -// GFX11: v_sin_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, vcc_lo +// GFX11: v_sin_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xc0,0x0a,0x7e] -v_sin_f16 v5, vcc_hi -// GFX11: v_sin_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, vcc_hi +// GFX11: v_sin_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xc0,0x0a,0x7e] -v_sin_f16 v5, ttmp15 -// GFX11: v_sin_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, ttmp15 +// GFX11: v_sin_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xc0,0x0a,0x7e] -v_sin_f16 v5, m0 -// GFX11: v_sin_f16_e32 v5, m0 ; encoding: [0x7d,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, m0 +// GFX11: v_sin_f16_e32 v5.l, m0 ; encoding: [0x7d,0xc0,0x0a,0x7e] -v_sin_f16 v5, exec_lo -// GFX11: v_sin_f16_e32 v5, exec_lo ; encoding: [0x7e,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, exec_lo +// GFX11: v_sin_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xc0,0x0a,0x7e] -v_sin_f16 v5, exec_hi -// GFX11: v_sin_f16_e32 v5, exec_hi ; encoding: [0x7f,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, exec_hi +// GFX11: v_sin_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xc0,0x0a,0x7e] -v_sin_f16 v5, null -// GFX11: v_sin_f16_e32 v5, null ; encoding: [0x7c,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, null +// GFX11: v_sin_f16_e32 v5.l, null ; encoding: [0x7c,0xc0,0x0a,0x7e] -v_sin_f16 v5, -1 -// GFX11: v_sin_f16_e32 v5, -1 ; encoding: [0xc1,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, -1 +// GFX11: v_sin_f16_e32 v5.l, -1 ; encoding: [0xc1,0xc0,0x0a,0x7e] -v_sin_f16 v5, 0.5 -// GFX11: v_sin_f16_e32 v5, 0.5 ; encoding: [0xf0,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, 0.5 +// GFX11: v_sin_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xc0,0x0a,0x7e] -v_sin_f16 v5, src_scc -// GFX11: v_sin_f16_e32 v5, src_scc ; encoding: [0xfd,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, src_scc +// GFX11: v_sin_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xc0,0x0a,0x7e] -v_sin_f16 
v127, 0xfe0b -// GFX11: v_sin_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +v_sin_f16 v127.l, 0xfe0b +// GFX11: v_sin_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +v_sin_f16 v5.l, v1.h +// GFX11: v_sin_f16_e32 v5.l, v1.h ; encoding: [0x81,0xc1,0x0a,0x7e] + +v_sin_f16 v5.l, v127.h +// GFX11: v_sin_f16_e32 v5.l, v127.h ; encoding: [0xff,0xc1,0x0a,0x7e] + +v_sin_f16 v127.l, 0.5 +// GFX11: v_sin_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xc0,0xfe,0x7e] + +v_sin_f16 v5.h, src_scc +// GFX11: v_sin_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xc0,0x0a,0x7f] + +v_sin_f16 v127.h, 0xfe0b +// GFX11: v_sin_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xc0,0xfe,0x7f,0x0b,0xfe,0x00,0x00] v_sin_f32 v5, v1 // GFX11: v_sin_f32_e32 v5, v1 ; encoding: [0x01,0x6b,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s index 4d1bd99b90252..b080bd9fca461 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s @@ -2582,47 +2582,56 @@ v_sat_pk_u8_i16 v5.h, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi: v_sat_pk_u8_i16 v127.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_sat_pk_u8_i16_dpp v127.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7f,0xff,0x6f,0x05,0x30] -v_sin_f16 v5, v1 quad_perm:[3,2,1,0] -// GFX11: v_sin_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff] +v_sin_f16 v5.l, v1.l quad_perm:[3,2,1,0] +// GFX11: v_sin_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff] -v_sin_f16 v5, v1 quad_perm:[0,1,2,3] -// GFX11: v_sin_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff] +v_sin_f16 v5.l, v1.l quad_perm:[0,1,2,3] +// GFX11: v_sin_f16_dpp v5.l, v1.l 
quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff] -v_sin_f16 v5, v1 row_mirror -// GFX11: v_sin_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff] +v_sin_f16 v5.l, v1.l row_mirror +// GFX11: v_sin_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff] -v_sin_f16 v5, v1 row_half_mirror -// GFX11: v_sin_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff] +v_sin_f16 v5.l, v1.l row_half_mirror +// GFX11: v_sin_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff] -v_sin_f16 v5, v1 row_shl:1 -// GFX11: v_sin_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff] +v_sin_f16 v5.l, v1.l row_shl:1 +// GFX11: v_sin_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff] -v_sin_f16 v5, v1 row_shl:15 -// GFX11: v_sin_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff] +v_sin_f16 v5.l, v1.l row_shl:15 +// GFX11: v_sin_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff] -v_sin_f16 v5, v1 row_shr:1 -// GFX11: v_sin_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff] +v_sin_f16 v5.l, v1.l row_shr:1 +// GFX11: v_sin_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff] -v_sin_f16 v5, v1 row_shr:15 -// GFX11: v_sin_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff] +v_sin_f16 v5.l, v1.l row_shr:15 +// GFX11: v_sin_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff] 
-v_sin_f16 v5, v1 row_ror:1 -// GFX11: v_sin_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff] +v_sin_f16 v5.l, v1.l row_ror:1 +// GFX11: v_sin_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff] -v_sin_f16 v5, v1 row_ror:15 -// GFX11: v_sin_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff] +v_sin_f16 v5.l, v1.l row_ror:15 +// GFX11: v_sin_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff] -v_sin_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_sin_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff] +v_sin_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_sin_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff] -v_sin_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_sin_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01] +v_sin_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_sin_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01] -v_sin_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_sin_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x09,0x13] +v_sin_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX11: v_sin_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x09,0x13] -v_sin_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_sin_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: 
[0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +v_sin_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX11: v_sin_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x35,0x30] + +v_sin_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_sin_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x5f,0x01,0x01] + +v_sin_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_sin_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc0,0x0a,0x7f,0x81,0x60,0x09,0x13] + +v_sin_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_sin_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7f,0xff,0x6f,0x35,0x30] v_sin_f32 v5, v1 quad_perm:[3,2,1,0] // GFX11: v_sin_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s index 2799ea7b8ef8b..6a47dce49ed2a 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s @@ -614,14 +614,23 @@ v_sat_pk_u8_i16 v5.h, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_sat_pk_u8_i16 v127.h, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_sat_pk_u8_i16_dpp v127.h, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc4,0xfe,0x7f,0xff,0x00,0x00,0x00] -v_sin_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_sin_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_sin_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_sin_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: 
[0xea,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_sin_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_sin_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_sin_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_sin_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00] +v_sin_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX11: v_sin_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +v_sin_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_sin_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0xfe,0x7e,0x7f,0x77,0x39,0x05] + +v_sin_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_sin_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc0,0x0a,0x7f,0x81,0x77,0x39,0x05] + +v_sin_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_sin_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc0,0xfe,0x7f,0xff,0x00,0x00,0x00] v_sin_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_sin_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6a,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s index caa73b7b9f047..34f10c98e1468 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s @@ -779,6 +779,12 @@ v_sat_pk_u8_i16_e32 v199.l, v5.l quad_perm:[3,2,1,0] v_sin_f16_e32 v128, 0xfe0b // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_sin_f16_e32 v128.h, 0xfe0b +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_sin_f16_e32 v128.l, 0xfe0b +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + v_sin_f16_e32 v255, v1 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -788,6 +794,24 @@ v_sin_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0] 
v_sin_f16_e32 v255, v1 quad_perm:[3,2,1,0] // GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction +v_sin_f16_e32 v255.h, v1.h +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_sin_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_sin_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_sin_f16_e32 v255.l, v1.l +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_sin_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_sin_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + v_sin_f16_e32 v5, v199 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -797,6 +821,24 @@ v_sin_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_sin_f16_e32 v5, v199 quad_perm:[3,2,1,0] // GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction +v_sin_f16_e32 v5.h, v199.h +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_sin_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_sin_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_sin_f16_e32 v5.l, v199.l +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_sin_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_sin_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + v_sqrt_f16_e32 v128.h, 0xfe0b // GFX11: :[[@LINE-1]]:16: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s index 0dd1bf6142189..9e424fbd004e4 100644 --- 
a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s @@ -1886,71 +1886,137 @@ v_sat_pk_u8_i16 v199.l, v5 dpp8:[7,6,5,4,3,2,1,0] v_sat_pk_u8_i16 v199.l, v5 quad_perm:[3,2,1,0] // GFX11: v_sat_pk_u8_i16_e64_dpp v199.l, v5 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xc7,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x05,0x1b,0x00,0xff] -v_sin_f16 v128, 0xfe0b -// GFX11: v_sin_f16_e64 v128, 0xfe0b ; encoding: [0x80,0x00,0xe0,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_sin_f16 v128.h, 0xfe0b +// GFX11: v_sin_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xe0,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_sin_f16 v255, -1 -// GFX11: v_sin_f16_e64 v255, -1 ; encoding: [0xff,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00] +v_sin_f16 v128.l, 0xfe0b +// GFX11: v_sin_f16_e64 v128.l, 0xfe0b ; encoding: [0x80,0x00,0xe0,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_sin_f16 v255, 0.5 -// GFX11: v_sin_f16_e64 v255, 0.5 ; encoding: [0xff,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x00] +v_sin_f16 v255.h, -1 +// GFX11: v_sin_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0xc1,0x00,0x00,0x00] -v_sin_f16 v255, exec_hi -// GFX11: v_sin_f16_e64 v255, exec_hi ; encoding: [0xff,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00] +v_sin_f16 v255.h, 0.5 +// GFX11: v_sin_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0xf0,0x00,0x00,0x00] -v_sin_f16 v255, exec_lo -// GFX11: v_sin_f16_e64 v255, exec_lo ; encoding: [0xff,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00] +v_sin_f16 v255.h, exec_hi +// GFX11: v_sin_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x7f,0x00,0x00,0x00] -v_sin_f16 v255, m0 -// GFX11: v_sin_f16_e64 v255, m0 ; encoding: [0xff,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00] +v_sin_f16 v255.h, exec_lo +// GFX11: v_sin_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x7e,0x00,0x00,0x00] -v_sin_f16 v255, null -// GFX11: v_sin_f16_e64 v255, null ; encoding: 
[0xff,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00] +v_sin_f16 v255.h, m0 +// GFX11: v_sin_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x7d,0x00,0x00,0x00] -v_sin_f16 v255, s1 -// GFX11: v_sin_f16_e64 v255, s1 ; encoding: [0xff,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00] +v_sin_f16 v255.h, null +// GFX11: v_sin_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x7c,0x00,0x00,0x00] -v_sin_f16 v255, s105 -// GFX11: v_sin_f16_e64 v255, s105 ; encoding: [0xff,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00] +v_sin_f16 v255.h, s1 +// GFX11: v_sin_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x01,0x00,0x00,0x00] -v_sin_f16 v255, src_scc -// GFX11: v_sin_f16_e64 v255, src_scc ; encoding: [0xff,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x00] +v_sin_f16 v255.h, s105 +// GFX11: v_sin_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x69,0x00,0x00,0x00] -v_sin_f16 v255, ttmp15 -// GFX11: v_sin_f16_e64 v255, ttmp15 ; encoding: [0xff,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00] +v_sin_f16 v255.h, src_scc +// GFX11: v_sin_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0xfd,0x00,0x00,0x00] -v_sin_f16 v255, v1 -// GFX11: v_sin_f16_e64 v255, v1 ; encoding: [0xff,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] +v_sin_f16 v255.h, ttmp15 +// GFX11: v_sin_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x7b,0x00,0x00,0x00] -v_sin_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_sin_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_sin_f16 v255.h, v1.h +// GFX11: v_sin_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe0,0xd5,0x01,0x01,0x00,0x00] -v_sin_f16 v255, v1 quad_perm:[3,2,1,0] -// GFX11: v_sin_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_sin_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_sin_f16_e64_dpp v255.h, v1.h op_sel:[1,1] 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_sin_f16 v255, v127 -// GFX11: v_sin_f16_e64 v255, v127 ; encoding: [0xff,0x00,0xe0,0xd5,0x7f,0x01,0x00,0x00] +v_sin_f16 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX11: v_sin_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_sin_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_sin_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] +v_sin_f16 v255.h, v127.h +// GFX11: v_sin_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe0,0xd5,0x7f,0x01,0x00,0x00] -v_sin_f16 v255, v127 quad_perm:[3,2,1,0] -// GFX11: v_sin_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] +v_sin_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_sin_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] -v_sin_f16 v255, vcc_hi -// GFX11: v_sin_f16_e64 v255, vcc_hi ; encoding: [0xff,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00] +v_sin_f16 v255.h, v127.h quad_perm:[3,2,1,0] +// GFX11: v_sin_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] -v_sin_f16 v255, vcc_lo -// GFX11: v_sin_f16_e64 v255, vcc_lo ; encoding: [0xff,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00] +v_sin_f16 v255.h, vcc_hi +// GFX11: v_sin_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x6b,0x00,0x00,0x00] -v_sin_f16 v5, v199 -// GFX11: v_sin_f16_e64 v5, v199 ; encoding: [0x05,0x00,0xe0,0xd5,0xc7,0x01,0x00,0x00] +v_sin_f16 v255.h, vcc_lo +// GFX11: v_sin_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x6a,0x00,0x00,0x00] -v_sin_f16 v5, v199 
dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_sin_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] +v_sin_f16 v255.l, -1 +// GFX11: v_sin_f16_e64 v255.l, -1 ; encoding: [0xff,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00] -v_sin_f16 v5, v199 quad_perm:[3,2,1,0] -// GFX11: v_sin_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] +v_sin_f16 v255.l, 0.5 +// GFX11: v_sin_f16_e64 v255.l, 0.5 ; encoding: [0xff,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x00] + +v_sin_f16 v255.l, exec_hi +// GFX11: v_sin_f16_e64 v255.l, exec_hi ; encoding: [0xff,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00] + +v_sin_f16 v255.l, exec_lo +// GFX11: v_sin_f16_e64 v255.l, exec_lo ; encoding: [0xff,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00] + +v_sin_f16 v255.l, m0 +// GFX11: v_sin_f16_e64 v255.l, m0 ; encoding: [0xff,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00] + +v_sin_f16 v255.l, null +// GFX11: v_sin_f16_e64 v255.l, null ; encoding: [0xff,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00] + +v_sin_f16 v255.l, s1 +// GFX11: v_sin_f16_e64 v255.l, s1 ; encoding: [0xff,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00] + +v_sin_f16 v255.l, s105 +// GFX11: v_sin_f16_e64 v255.l, s105 ; encoding: [0xff,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00] + +v_sin_f16 v255.l, src_scc +// GFX11: v_sin_f16_e64 v255.l, src_scc ; encoding: [0xff,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x00] + +v_sin_f16 v255.l, ttmp15 +// GFX11: v_sin_f16_e64 v255.l, ttmp15 ; encoding: [0xff,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00] + +v_sin_f16 v255.l, v1.l +// GFX11: v_sin_f16_e64 v255.l, v1.l ; encoding: [0xff,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] + +v_sin_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_sin_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_sin_f16 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX11: v_sin_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0xff,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_sin_f16 v255.l, v127.l +// GFX11: v_sin_f16_e64 v255.l, v127.l ; encoding: [0xff,0x00,0xe0,0xd5,0x7f,0x01,0x00,0x00] + +v_sin_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_sin_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] + +v_sin_f16 v255.l, v127.l quad_perm:[3,2,1,0] +// GFX11: v_sin_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] + +v_sin_f16 v255.l, vcc_hi +// GFX11: v_sin_f16_e64 v255.l, vcc_hi ; encoding: [0xff,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00] + +v_sin_f16 v255.l, vcc_lo +// GFX11: v_sin_f16_e64 v255.l, vcc_lo ; encoding: [0xff,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00] + +v_sin_f16 v5.h, v199.h +// GFX11: v_sin_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe0,0xd5,0xc7,0x01,0x00,0x00] + +v_sin_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_sin_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_sin_f16 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX11: v_sin_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] + +v_sin_f16 v5.l, v199.l +// GFX11: v_sin_f16_e64 v5.l, v199.l ; encoding: [0x05,0x00,0xe0,0xd5,0xc7,0x01,0x00,0x00] + +v_sin_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_sin_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_sin_f16 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX11: v_sin_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] v_sqrt_f16 v128, 0xfe0b // GFX11: v_sqrt_f16_e64 v128, 0xfe0b ; encoding: 
[0x80,0x00,0xd5,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s index 8de72e74c2856..3992b869c46d5 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s @@ -2698,47 +2698,56 @@ v_sat_pk_u8_i16_e64_dpp v255.l, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bou v_sat_pk_u8_i16_e64_dpp v255.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: [0xff,0x40,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] -v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX11: v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] +// GFX11: v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_sin_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] -// GFX11: v_sin_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] +// GFX11: v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] -v_sin_f16_e64_dpp v5, v1 row_mirror -// GFX11: v_sin_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_sin_f16_e64_dpp v5.l, v1.l row_mirror +// GFX11: v_sin_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] -v_sin_f16_e64_dpp v5, v1 row_half_mirror -// GFX11: v_sin_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_sin_f16_e64_dpp v5.l, v1.l row_half_mirror +// GFX11: v_sin_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] -v_sin_f16_e64_dpp v5, v1 row_shl:1 -// GFX11: v_sin_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_sin_f16_e64_dpp v5.l, v1.l row_shl:1 +// GFX11: v_sin_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] -v_sin_f16_e64_dpp v5, v1 row_shl:15 -// GFX11: v_sin_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_sin_f16_e64_dpp v5.l, v1.l row_shl:15 +// GFX11: v_sin_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] -v_sin_f16_e64_dpp v5, v1 row_shr:1 -// GFX11: v_sin_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_sin_f16_e64_dpp v5.l, v1.l row_shr:1 +// GFX11: v_sin_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] -v_sin_f16_e64_dpp v5, v1 row_shr:15 -// GFX11: v_sin_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_sin_f16_e64_dpp v5.l, v1.l row_shr:15 +// GFX11: v_sin_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] -v_sin_f16_e64_dpp v5, v1 row_ror:1 -// GFX11: v_sin_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_sin_f16_e64_dpp v5.l, v1.l row_ror:1 +// 
GFX11: v_sin_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] -v_sin_f16_e64_dpp v5, v1 row_ror:15 -// GFX11: v_sin_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_sin_f16_e64_dpp v5.l, v1.l row_ror:15 +// GFX11: v_sin_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] -v_sin_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_sin_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_sin_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_sin_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] -v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +v_sin_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_sin_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +v_sin_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_sin_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -v_sin_f16_e64_dpp v255, -|v255| clamp div:2 
row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +v_sin_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: [0x05,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +v_sin_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: [0x05,0x08,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +v_sin_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: [0xff,0xc1,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] v_sin_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX11: v_sin_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s index 182a13831ec6d..a123c73c73bcb 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s @@ -781,17 +781,26 @@ v_sat_pk_u8_i16_e64_dpp v255.l, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 v_sat_pk_u8_i16_e64_dpp v255.h, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: [0xff,0x40,0xe2,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] -v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_sin_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_sin_f16_e64_dpp v5.l, v1.l 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +v_sin_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_sin_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe0,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +v_sin_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_sin_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe0,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xe0,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xe0,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +v_sin_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: [0x05,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +v_sin_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: [0x05,0x08,0xe0,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +v_sin_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: [0xff,0xc1,0xe0,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] v_sin_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_sin_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb5,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git 
a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s index 17678e3bd9f08..4b055165871cf 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s @@ -3247,50 +3247,59 @@ v_sat_pk_u8_i16_e64 v255.l, 0xfe0b v_sat_pk_u8_i16_e64 v255.h, 0xfe0b // GFX11: [0xff,0x40,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_sin_f16_e64 v5, v1 -// GFX11: v_sin_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] +v_sin_f16_e64 v5.l, v1.l +// GFX11: v_sin_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] -v_sin_f16_e64 v5, v255 -// GFX11: v_sin_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00] +v_sin_f16_e64 v5.l, v255.l +// GFX11: v_sin_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00] -v_sin_f16_e64 v5, s1 -// GFX11: v_sin_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, s1 +// GFX11: v_sin_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00] -v_sin_f16_e64 v5, s105 -// GFX11: v_sin_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, s105 +// GFX11: v_sin_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00] -v_sin_f16_e64 v5, vcc_lo -// GFX11: v_sin_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, vcc_lo +// GFX11: v_sin_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00] -v_sin_f16_e64 v5, vcc_hi -// GFX11: v_sin_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, vcc_hi +// GFX11: v_sin_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00] -v_sin_f16_e64 v5, ttmp15 -// GFX11: v_sin_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, ttmp15 +// GFX11: v_sin_f16_e64 v5.l, ttmp15 ; encoding: 
[0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00] -v_sin_f16_e64 v5, m0 -// GFX11: v_sin_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, m0 +// GFX11: v_sin_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00] -v_sin_f16_e64 v5, exec_lo -// GFX11: v_sin_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, exec_lo +// GFX11: v_sin_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00] -v_sin_f16_e64 v5, exec_hi -// GFX11: v_sin_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, exec_hi +// GFX11: v_sin_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00] -v_sin_f16_e64 v5, null -// GFX11: v_sin_f16_e64 v5, null ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, null +// GFX11: v_sin_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00] -v_sin_f16_e64 v5, -1 -// GFX11: v_sin_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, -1 +// GFX11: v_sin_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00] -v_sin_f16_e64 v5, 0.5 mul:2 -// GFX11: v_sin_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08] +v_sin_f16_e64 v5.l, 0.5 mul:2 +// GFX11: v_sin_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08] -v_sin_f16_e64 v5, src_scc mul:4 -// GFX11: v_sin_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10] +v_sin_f16_e64 v5.l, src_scc mul:4 +// GFX11: v_sin_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10] -v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 -// GFX11: v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +v_sin_f16_e64 v255.l, -|0xfe0b| clamp div:2 +// GFX11: v_sin_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: 
[0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_sin_f16_e64 v5.h, v1.h +// GFX11: [0x05,0x48,0xe0,0xd5,0x01,0x01,0x00,0x00] + +v_sin_f16_e64 v5.l, v255.h +// GFX11: [0x05,0x08,0xe0,0xd5,0xff,0x01,0x00,0x00] + +v_sin_f16_e64 v255.h, -|0xfe0b| clamp div:2 +// GFX11: [0xff,0xc1,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] v_sin_f32_e64 v5, v1 // GFX11: v_sin_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xb5,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s index 4f82643fd4886..ed90e480012c0 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s @@ -3378,50 +3378,62 @@ v_sat_pk_u8_i16 v5.h, src_scc v_sat_pk_u8_i16 v127.h, 0xfe0b // GFX12: v_sat_pk_u8_i16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xc4,0xfe,0x7f,0x0b,0xfe,0x00,0x00] -v_sin_f16 v5, v1 -// GFX12: v_sin_f16_e32 v5, v1 ; encoding: [0x01,0xc1,0x0a,0x7e] +v_sin_f16 v5.l, v1.l +// GFX12: v_sin_f16_e32 v5.l, v1.l ; encoding: [0x01,0xc1,0x0a,0x7e] -v_sin_f16 v5, v127 -// GFX12: v_sin_f16_e32 v5, v127 ; encoding: [0x7f,0xc1,0x0a,0x7e] +v_sin_f16 v5.l, v127.l +// GFX12: v_sin_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xc1,0x0a,0x7e] -v_sin_f16 v5, s1 -// GFX12: v_sin_f16_e32 v5, s1 ; encoding: [0x01,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, s1 +// GFX12: v_sin_f16_e32 v5.l, s1 ; encoding: [0x01,0xc0,0x0a,0x7e] -v_sin_f16 v5, s105 -// GFX12: v_sin_f16_e32 v5, s105 ; encoding: [0x69,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, s105 +// GFX12: v_sin_f16_e32 v5.l, s105 ; encoding: [0x69,0xc0,0x0a,0x7e] -v_sin_f16 v5, vcc_lo -// GFX12: v_sin_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, vcc_lo +// GFX12: v_sin_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xc0,0x0a,0x7e] -v_sin_f16 v5, vcc_hi -// GFX12: v_sin_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, vcc_hi +// GFX12: v_sin_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xc0,0x0a,0x7e] -v_sin_f16 v5, ttmp15 -// GFX12: v_sin_f16_e32 v5, ttmp15 
; encoding: [0x7b,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, ttmp15 +// GFX12: v_sin_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xc0,0x0a,0x7e] -v_sin_f16 v5, m0 -// GFX12: v_sin_f16_e32 v5, m0 ; encoding: [0x7d,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, m0 +// GFX12: v_sin_f16_e32 v5.l, m0 ; encoding: [0x7d,0xc0,0x0a,0x7e] -v_sin_f16 v5, exec_lo -// GFX12: v_sin_f16_e32 v5, exec_lo ; encoding: [0x7e,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, exec_lo +// GFX12: v_sin_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xc0,0x0a,0x7e] -v_sin_f16 v5, exec_hi -// GFX12: v_sin_f16_e32 v5, exec_hi ; encoding: [0x7f,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, exec_hi +// GFX12: v_sin_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xc0,0x0a,0x7e] -v_sin_f16 v5, null -// GFX12: v_sin_f16_e32 v5, null ; encoding: [0x7c,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, null +// GFX12: v_sin_f16_e32 v5.l, null ; encoding: [0x7c,0xc0,0x0a,0x7e] -v_sin_f16 v5, -1 -// GFX12: v_sin_f16_e32 v5, -1 ; encoding: [0xc1,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, -1 +// GFX12: v_sin_f16_e32 v5.l, -1 ; encoding: [0xc1,0xc0,0x0a,0x7e] -v_sin_f16 v5, 0.5 -// GFX12: v_sin_f16_e32 v5, 0.5 ; encoding: [0xf0,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, 0.5 +// GFX12: v_sin_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xc0,0x0a,0x7e] -v_sin_f16 v5, src_scc -// GFX12: v_sin_f16_e32 v5, src_scc ; encoding: [0xfd,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, src_scc +// GFX12: v_sin_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xc0,0x0a,0x7e] -v_sin_f16 v127, 0xfe0b -// GFX12: v_sin_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +v_sin_f16 v127.l, 0xfe0b +// GFX12: v_sin_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +v_sin_f16 v5.l, v1.h +// GFX12: v_sin_f16_e32 v5.l, v1.h ; encoding: [0x81,0xc1,0x0a,0x7e] + +v_sin_f16 v5.l, v127.h +// GFX12: v_sin_f16_e32 v5.l, v127.h ; encoding: [0xff,0xc1,0x0a,0x7e] + +v_sin_f16 v5.h, src_scc +// GFX12: v_sin_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xc0,0x0a,0x7f] + +v_sin_f16 v127.h, 0xfe0b +// GFX12: v_sin_f16_e32 v127.h, 
0xfe0b ; encoding: [0xff,0xc0,0xfe,0x7f,0x0b,0xfe,0x00,0x00] v_sin_f32 v5, v1 // GFX12: v_sin_f32_e32 v5, v1 ; encoding: [0x01,0x6b,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s index 2b3a52cf4e804..90968055e2a82 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s @@ -2644,47 +2644,53 @@ v_sat_pk_u8_i16 v5.h, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi: v_sat_pk_u8_i16 v127.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_sat_pk_u8_i16_dpp v127.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7f,0xff,0x6f,0x05,0x30] -v_sin_f16 v5, v1 quad_perm:[3,2,1,0] -// GFX12: v_sin_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff] +v_sin_f16 v5.l, v1.l quad_perm:[3,2,1,0] +// GFX12: v_sin_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff] -v_sin_f16 v5, v1 quad_perm:[0,1,2,3] -// GFX12: v_sin_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff] +v_sin_f16 v5.l, v1.l quad_perm:[0,1,2,3] +// GFX12: v_sin_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff] -v_sin_f16 v5, v1 row_mirror -// GFX12: v_sin_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff] +v_sin_f16 v5.l, v1.l row_mirror +// GFX12: v_sin_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff] -v_sin_f16 v5, v1 row_half_mirror -// GFX12: v_sin_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff] +v_sin_f16 v5.l, v1.l row_half_mirror +// GFX12: v_sin_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff] -v_sin_f16 v5, v1 row_shl:1 -// GFX12: v_sin_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff] +v_sin_f16 v5.l, v1.l row_shl:1 +// GFX12: v_sin_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff] -v_sin_f16 v5, v1 row_shl:15 -// GFX12: v_sin_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff] +v_sin_f16 v5.l, v1.l row_shl:15 +// GFX12: v_sin_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff] -v_sin_f16 v5, v1 row_shr:1 -// GFX12: v_sin_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff] +v_sin_f16 v5.l, v1.l row_shr:1 +// GFX12: v_sin_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff] -v_sin_f16 v5, v1 row_shr:15 -// GFX12: v_sin_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff] +v_sin_f16 v5.l, v1.l row_shr:15 +// GFX12: v_sin_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff] -v_sin_f16 v5, v1 row_ror:1 -// GFX12: v_sin_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff] +v_sin_f16 v5.l, v1.l row_ror:1 +// GFX12: v_sin_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff] -v_sin_f16 v5, v1 row_ror:15 -// GFX12: v_sin_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff] +v_sin_f16 v5.l, v1.l row_ror:15 +// GFX12: v_sin_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff] -v_sin_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf 
-// GFX12: v_sin_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff] +v_sin_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_sin_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff] -v_sin_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_sin_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01] +v_sin_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_sin_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01] -v_sin_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_sin_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x09,0x13] +v_sin_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_sin_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x09,0x13] -v_sin_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_sin_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +v_sin_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_sin_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x35,0x30] + +v_sin_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_sin_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc0,0x0a,0x7f,0x81,0x60,0x09,0x13] + +v_sin_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_sin_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: 
[0xfa,0xc0,0xfe,0x7f,0xff,0x6f,0x35,0x30] v_sin_f32 v5, v1 quad_perm:[3,2,1,0] // GFX12: v_sin_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s index 977d5b08b80ee..0ce0087918f56 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s @@ -631,14 +631,20 @@ v_sat_pk_u8_i16 v5.h, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_sat_pk_u8_i16 v127.h, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_sat_pk_u8_i16_dpp v127.h, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc4,0xfe,0x7f,0xff,0x00,0x00,0x00] -v_sin_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_sin_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_sin_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_sin_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_sin_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_sin_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_sin_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_sin_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00] +v_sin_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_sin_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +v_sin_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_sin_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc0,0x0a,0x7f,0x81,0x77,0x39,0x05] + +v_sin_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_sin_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: 
[0xe9,0xc0,0xfe,0x7f,0xff,0x00,0x00,0x00] v_sin_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_sin_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6a,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s index 1b6734a6a652b..92a0d15bbc6f0 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s @@ -689,6 +689,12 @@ v_sat_pk_u8_i16_e32 v199.l, v5 quad_perm:[3,2,1,0] v_sin_f16_e32 v128, 0xfe0b // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_sin_f16_e32 v128.h, 0xfe0b +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_sin_f16_e32 v128.l, 0xfe0b +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + v_sin_f16_e32 v255, v1 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -698,6 +704,24 @@ v_sin_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0] v_sin_f16_e32 v255, v1 quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction +v_sin_f16_e32 v255.h, v1.h +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_sin_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_sin_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_sin_f16_e32 v255.l, v1.l +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_sin_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_sin_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + v_sin_f16_e32 v5, v199 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -707,6 +731,24 @@ v_sin_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_sin_f16_e32 v5, v199 quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:24: 
error: invalid operand for instruction +v_sin_f16_e32 v5.h, v199.h +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_sin_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_sin_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_sin_f16_e32 v5.l, v199.l +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_sin_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_sin_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + v_sqrt_f16_e32 v128, 0xfe0b // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s index 9d36ea0b9f479..bbe7b65d03281 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s @@ -1846,71 +1846,137 @@ v_sat_pk_u8_i16 v199.h, v5 dpp8:[7,6,5,4,3,2,1,0] v_sat_pk_u8_i16 v199.h, v5 quad_perm:[3,2,1,0] // GFX12: v_sat_pk_u8_i16_e64_dpp v199.h, v5 op_sel:[0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xc7,0x40,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x05,0x1b,0x00,0xff] -v_sin_f16 v128, 0xfe0b -// GFX12: v_sin_f16_e64 v128, 0xfe0b ; encoding: [0x80,0x00,0xe0,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_sin_f16 v128.h, 0xfe0b +// GFX12: v_sin_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xe0,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_sin_f16 v255, -1 -// GFX12: v_sin_f16_e64 v255, -1 ; encoding: [0xff,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00] +v_sin_f16 v128.l, 0xfe0b +// GFX12: v_sin_f16_e64 v128.l, 0xfe0b ; encoding: [0x80,0x00,0xe0,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_sin_f16 v255, 0.5 -// GFX12: v_sin_f16_e64 v255, 0.5 ; encoding: 
[0xff,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x00] +v_sin_f16 v255.h, -1 +// GFX12: v_sin_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0xc1,0x00,0x00,0x00] -v_sin_f16 v255, exec_hi -// GFX12: v_sin_f16_e64 v255, exec_hi ; encoding: [0xff,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00] +v_sin_f16 v255.h, 0.5 +// GFX12: v_sin_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0xf0,0x00,0x00,0x00] -v_sin_f16 v255, exec_lo -// GFX12: v_sin_f16_e64 v255, exec_lo ; encoding: [0xff,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00] +v_sin_f16 v255.h, exec_hi +// GFX12: v_sin_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x7f,0x00,0x00,0x00] -v_sin_f16 v255, m0 -// GFX12: v_sin_f16_e64 v255, m0 ; encoding: [0xff,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00] +v_sin_f16 v255.h, exec_lo +// GFX12: v_sin_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x7e,0x00,0x00,0x00] -v_sin_f16 v255, null -// GFX12: v_sin_f16_e64 v255, null ; encoding: [0xff,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00] +v_sin_f16 v255.h, m0 +// GFX12: v_sin_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x7d,0x00,0x00,0x00] -v_sin_f16 v255, s1 -// GFX12: v_sin_f16_e64 v255, s1 ; encoding: [0xff,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00] +v_sin_f16 v255.h, null +// GFX12: v_sin_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x7c,0x00,0x00,0x00] -v_sin_f16 v255, s105 -// GFX12: v_sin_f16_e64 v255, s105 ; encoding: [0xff,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00] +v_sin_f16 v255.h, s1 +// GFX12: v_sin_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x01,0x00,0x00,0x00] -v_sin_f16 v255, src_scc -// GFX12: v_sin_f16_e64 v255, src_scc ; encoding: [0xff,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x00] +v_sin_f16 v255.h, s105 +// GFX12: v_sin_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x69,0x00,0x00,0x00] -v_sin_f16 v255, ttmp15 -// GFX12: v_sin_f16_e64 v255, ttmp15 ; encoding: [0xff,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00] +v_sin_f16 
v255.h, src_scc +// GFX12: v_sin_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0xfd,0x00,0x00,0x00] -v_sin_f16 v255, v1 -// GFX12: v_sin_f16_e64 v255, v1 ; encoding: [0xff,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] +v_sin_f16 v255.h, ttmp15 +// GFX12: v_sin_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x7b,0x00,0x00,0x00] -v_sin_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_sin_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_sin_f16 v255.h, v1.h +// GFX12: v_sin_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe0,0xd5,0x01,0x01,0x00,0x00] -v_sin_f16 v255, v1 quad_perm:[3,2,1,0] -// GFX12: v_sin_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_sin_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_sin_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_sin_f16 v255, v127 -// GFX12: v_sin_f16_e64 v255, v127 ; encoding: [0xff,0x00,0xe0,0xd5,0x7f,0x01,0x00,0x00] +v_sin_f16 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX12: v_sin_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_sin_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_sin_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] +v_sin_f16 v255.h, v127.h +// GFX12: v_sin_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe0,0xd5,0x7f,0x01,0x00,0x00] -v_sin_f16 v255, v127 quad_perm:[3,2,1,0] -// GFX12: v_sin_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] +v_sin_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_sin_f16_e64_dpp 
v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] -v_sin_f16 v255, vcc_hi -// GFX12: v_sin_f16_e64 v255, vcc_hi ; encoding: [0xff,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00] +v_sin_f16 v255.h, v127.h quad_perm:[3,2,1,0] +// GFX12: v_sin_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] -v_sin_f16 v255, vcc_lo -// GFX12: v_sin_f16_e64 v255, vcc_lo ; encoding: [0xff,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00] +v_sin_f16 v255.h, vcc_hi +// GFX12: v_sin_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x6b,0x00,0x00,0x00] -v_sin_f16 v5, v199 -// GFX12: v_sin_f16_e64 v5, v199 ; encoding: [0x05,0x00,0xe0,0xd5,0xc7,0x01,0x00,0x00] +v_sin_f16 v255.h, vcc_lo +// GFX12: v_sin_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x6a,0x00,0x00,0x00] -v_sin_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_sin_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] +v_sin_f16 v255.l, -1 +// GFX12: v_sin_f16_e64 v255.l, -1 ; encoding: [0xff,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00] -v_sin_f16 v5, v199 quad_perm:[3,2,1,0] -// GFX12: v_sin_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] +v_sin_f16 v255.l, 0.5 +// GFX12: v_sin_f16_e64 v255.l, 0.5 ; encoding: [0xff,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x00] + +v_sin_f16 v255.l, exec_hi +// GFX12: v_sin_f16_e64 v255.l, exec_hi ; encoding: [0xff,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00] + +v_sin_f16 v255.l, exec_lo +// GFX12: v_sin_f16_e64 v255.l, exec_lo ; encoding: [0xff,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00] + +v_sin_f16 v255.l, m0 +// GFX12: v_sin_f16_e64 v255.l, m0 ; encoding: [0xff,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00] + +v_sin_f16 v255.l, null +// GFX12: v_sin_f16_e64 v255.l, null ; encoding: 
[0xff,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00] + +v_sin_f16 v255.l, s1 +// GFX12: v_sin_f16_e64 v255.l, s1 ; encoding: [0xff,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00] + +v_sin_f16 v255.l, s105 +// GFX12: v_sin_f16_e64 v255.l, s105 ; encoding: [0xff,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00] + +v_sin_f16 v255.l, src_scc +// GFX12: v_sin_f16_e64 v255.l, src_scc ; encoding: [0xff,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x00] + +v_sin_f16 v255.l, ttmp15 +// GFX12: v_sin_f16_e64 v255.l, ttmp15 ; encoding: [0xff,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00] + +v_sin_f16 v255.l, v1.l +// GFX12: v_sin_f16_e64 v255.l, v1.l ; encoding: [0xff,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] + +v_sin_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_sin_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_sin_f16 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX12: v_sin_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_sin_f16 v255.l, v127.l +// GFX12: v_sin_f16_e64 v255.l, v127.l ; encoding: [0xff,0x00,0xe0,0xd5,0x7f,0x01,0x00,0x00] + +v_sin_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_sin_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] + +v_sin_f16 v255.l, v127.l quad_perm:[3,2,1,0] +// GFX12: v_sin_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] + +v_sin_f16 v255.l, vcc_hi +// GFX12: v_sin_f16_e64 v255.l, vcc_hi ; encoding: [0xff,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00] + +v_sin_f16 v255.l, vcc_lo +// GFX12: v_sin_f16_e64 v255.l, vcc_lo ; encoding: [0xff,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00] + +v_sin_f16 v5.h, v199.h +// GFX12: v_sin_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe0,0xd5,0xc7,0x01,0x00,0x00] + +v_sin_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: 
v_sin_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_sin_f16 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX12: v_sin_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] + +v_sin_f16 v5.l, v199.l +// GFX12: v_sin_f16_e64 v5.l, v199.l ; encoding: [0x05,0x00,0xe0,0xd5,0xc7,0x01,0x00,0x00] + +v_sin_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_sin_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_sin_f16 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX12: v_sin_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] v_sqrt_f16 v128, 0xfe0b // GFX12: v_sqrt_f16_e64 v128, 0xfe0b ; encoding: [0x80,0x00,0xd5,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s index 71c12a1333ebc..5af15f2eb971f 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s @@ -3397,50 +3397,59 @@ v_sat_pk_u8_i16_e64 v255, 0xfe0b v_sat_pk_u8_i16_e64 v255.h, 0xfe0b // GFX12: v_sat_pk_u8_i16_e64 v255.h, 0xfe0b op_sel:[0,1] ; encoding: [0xff,0x40,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_sin_f16_e64 v5, v1 -// GFX12: v_sin_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] +v_sin_f16_e64 v5.l, v1.l +// GFX12: v_sin_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] -v_sin_f16_e64 v5, v255 -// GFX12: v_sin_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00] +v_sin_f16_e64 v5.l, v255.l +// GFX12: v_sin_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00] -v_sin_f16_e64 v5, s1 -// GFX12: v_sin_f16_e64 
v5, s1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, s1 +// GFX12: v_sin_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00] -v_sin_f16_e64 v5, s105 -// GFX12: v_sin_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, s105 +// GFX12: v_sin_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00] -v_sin_f16_e64 v5, vcc_lo -// GFX12: v_sin_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, vcc_lo +// GFX12: v_sin_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00] -v_sin_f16_e64 v5, vcc_hi -// GFX12: v_sin_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, vcc_hi +// GFX12: v_sin_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00] -v_sin_f16_e64 v5, ttmp15 -// GFX12: v_sin_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, ttmp15 +// GFX12: v_sin_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00] -v_sin_f16_e64 v5, m0 -// GFX12: v_sin_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, m0 +// GFX12: v_sin_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00] -v_sin_f16_e64 v5, exec_lo -// GFX12: v_sin_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, exec_lo +// GFX12: v_sin_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00] -v_sin_f16_e64 v5, exec_hi -// GFX12: v_sin_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, exec_hi +// GFX12: v_sin_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00] -v_sin_f16_e64 v5, null -// GFX12: v_sin_f16_e64 v5, null ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, null +// GFX12: v_sin_f16_e64 v5.l, null ; encoding: 
[0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00] -v_sin_f16_e64 v5, -1 -// GFX12: v_sin_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, -1 +// GFX12: v_sin_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00] -v_sin_f16_e64 v5, 0.5 mul:2 -// GFX12: v_sin_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08] +v_sin_f16_e64 v5.l, 0.5 mul:2 +// GFX12: v_sin_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08] -v_sin_f16_e64 v5, src_scc mul:4 -// GFX12: v_sin_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10] +v_sin_f16_e64 v5.l, src_scc mul:4 +// GFX12: v_sin_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10] -v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 -// GFX12: v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +v_sin_f16_e64 v255.l, -|0xfe0b| clamp div:2 +// GFX12: v_sin_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_sin_f16_e64 v5.h, v1.h +// GFX12: v_sin_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe0,0xd5,0x01,0x01,0x00,0x00] + +v_sin_f16_e64 v5.l, v255.h +// GFX12: v_sin_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xe0,0xd5,0xff,0x01,0x00,0x00] + +v_sin_f16_e64 v255.h, -|0xfe0b| clamp div:2 +// GFX12: v_sin_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] v_sin_f32_e64 v5, v1 // GFX12: v_sin_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xb5,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s index 42166032124a3..39638cefd44ad 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s @@ -2560,47 +2560,56 @@ 
v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound v_sat_pk_u8_i16_e64_dpp v255.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_sat_pk_u8_i16_e64_dpp v255.h, v255 op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x40,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] -v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX12: v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] +// GFX12: v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_sin_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] -// GFX12: v_sin_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] +// GFX12: v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] -v_sin_f16_e64_dpp v5, v1 row_mirror -// GFX12: v_sin_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_sin_f16_e64_dpp v5.l, v1.l row_mirror +// GFX12: v_sin_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] -v_sin_f16_e64_dpp v5, v1 row_half_mirror -// GFX12: v_sin_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_sin_f16_e64_dpp v5.l, v1.l row_half_mirror +// GFX12: v_sin_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] -v_sin_f16_e64_dpp v5, v1 row_shl:1 -// 
GFX12: v_sin_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_sin_f16_e64_dpp v5.l, v1.l row_shl:1 +// GFX12: v_sin_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] -v_sin_f16_e64_dpp v5, v1 row_shl:15 -// GFX12: v_sin_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_sin_f16_e64_dpp v5.l, v1.l row_shl:15 +// GFX12: v_sin_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] -v_sin_f16_e64_dpp v5, v1 row_shr:1 -// GFX12: v_sin_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_sin_f16_e64_dpp v5.l, v1.l row_shr:1 +// GFX12: v_sin_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] -v_sin_f16_e64_dpp v5, v1 row_shr:15 -// GFX12: v_sin_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_sin_f16_e64_dpp v5.l, v1.l row_shr:15 +// GFX12: v_sin_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] -v_sin_f16_e64_dpp v5, v1 row_ror:1 -// GFX12: v_sin_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_sin_f16_e64_dpp v5.l, v1.l row_ror:1 +// GFX12: v_sin_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] -v_sin_f16_e64_dpp v5, v1 row_ror:15 -// GFX12: v_sin_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_sin_f16_e64_dpp v5.l, v1.l row_ror:15 +// GFX12: v_sin_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] -v_sin_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_sin_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_sin_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_sin_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] -v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +v_sin_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_sin_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +v_sin_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_sin_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -v_sin_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +v_sin_f16_e64_dpp v255.l, -|v255.l| clamp 
div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +v_sin_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_sin_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +v_sin_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_sin_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x08,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +v_sin_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_sin_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc1,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] v_sin_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX12: v_sin_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s index d65d2004fc1e7..a6cef6f134b0a 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s @@ -742,17 +742,26 @@ v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 v_sat_pk_u8_i16_e64_dpp v255.h, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_sat_pk_u8_i16_e64_dpp v255.h, v255 op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x40,0xe2,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] -v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_sin_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_sin_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +v_sin_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_sin_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe0,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +v_sin_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_sin_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe0,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xe0,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xe0,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +v_sin_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_sin_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +v_sin_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_sin_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x08,0xe0,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +v_sin_f16_e64_dpp v255.h, 
-|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_sin_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc1,0xe0,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] v_sin_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_sin_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb5,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt index 38c573a19ba00..0abced9f2f77b 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt @@ -3323,49 +3323,82 @@ # GFX11-REAL16: v_sat_pk_u8_i16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xc4,0xfe,0x7f,0x0b,0xfe,0x00,0x00] 0x01,0xc1,0x0a,0x7e -# GFX11: v_sin_f16_e32 v5, v1 ; encoding: [0x01,0xc1,0x0a,0x7e] +# GFX11-REAL16: v_sin_f16_e32 v5.l, v1.l ; encoding: [0x01,0xc1,0x0a,0x7e] +# GFX11-FAKE16: v_sin_f16_e32 v5, v1 ; encoding: [0x01,0xc1,0x0a,0x7e] 0x7f,0xc1,0x0a,0x7e -# GFX11: v_sin_f16_e32 v5, v127 ; encoding: [0x7f,0xc1,0x0a,0x7e] +# GFX11-REAL16: v_sin_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xc1,0x0a,0x7e] +# GFX11-FAKE16: v_sin_f16_e32 v5, v127 ; encoding: [0x7f,0xc1,0x0a,0x7e] 0x01,0xc0,0x0a,0x7e -# GFX11: v_sin_f16_e32 v5, s1 ; encoding: [0x01,0xc0,0x0a,0x7e] +# GFX11-REAL16: v_sin_f16_e32 v5.l, s1 ; encoding: [0x01,0xc0,0x0a,0x7e] +# GFX11-FAKE16: v_sin_f16_e32 v5, s1 ; encoding: [0x01,0xc0,0x0a,0x7e] 0x69,0xc0,0x0a,0x7e -# GFX11: v_sin_f16_e32 v5, s105 ; encoding: [0x69,0xc0,0x0a,0x7e] +# GFX11-REAL16: v_sin_f16_e32 v5.l, s105 ; encoding: [0x69,0xc0,0x0a,0x7e] +# GFX11-FAKE16: v_sin_f16_e32 v5, s105 ; encoding: [0x69,0xc0,0x0a,0x7e] 0x6a,0xc0,0x0a,0x7e -# GFX11: v_sin_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xc0,0x0a,0x7e] +# GFX11-REAL16: v_sin_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xc0,0x0a,0x7e] +# GFX11-FAKE16: v_sin_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xc0,0x0a,0x7e] 
0x6b,0xc0,0x0a,0x7e -# GFX11: v_sin_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xc0,0x0a,0x7e] +# GFX11-REAL16: v_sin_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xc0,0x0a,0x7e] +# GFX11-FAKE16: v_sin_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xc0,0x0a,0x7e] 0x7b,0xc0,0x0a,0x7e -# GFX11: v_sin_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xc0,0x0a,0x7e] +# GFX11-REAL16: v_sin_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xc0,0x0a,0x7e] +# GFX11-FAKE16: v_sin_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xc0,0x0a,0x7e] 0x7d,0xc0,0x0a,0x7e -# GFX11: v_sin_f16_e32 v5, m0 ; encoding: [0x7d,0xc0,0x0a,0x7e] +# GFX11-REAL16: v_sin_f16_e32 v5.l, m0 ; encoding: [0x7d,0xc0,0x0a,0x7e] +# GFX11-FAKE16: v_sin_f16_e32 v5, m0 ; encoding: [0x7d,0xc0,0x0a,0x7e] 0x7e,0xc0,0x0a,0x7e -# GFX11: v_sin_f16_e32 v5, exec_lo ; encoding: [0x7e,0xc0,0x0a,0x7e] +# GFX11-REAL16: v_sin_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xc0,0x0a,0x7e] +# GFX11-FAKE16: v_sin_f16_e32 v5, exec_lo ; encoding: [0x7e,0xc0,0x0a,0x7e] 0x7f,0xc0,0x0a,0x7e -# GFX11: v_sin_f16_e32 v5, exec_hi ; encoding: [0x7f,0xc0,0x0a,0x7e] +# GFX11-REAL16: v_sin_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xc0,0x0a,0x7e] +# GFX11-FAKE16: v_sin_f16_e32 v5, exec_hi ; encoding: [0x7f,0xc0,0x0a,0x7e] 0x7c,0xc0,0x0a,0x7e -# GFX11: v_sin_f16_e32 v5, null ; encoding: [0x7c,0xc0,0x0a,0x7e] +# GFX11-REAL16: v_sin_f16_e32 v5.l, null ; encoding: [0x7c,0xc0,0x0a,0x7e] +# GFX11-FAKE16: v_sin_f16_e32 v5, null ; encoding: [0x7c,0xc0,0x0a,0x7e] 0xc1,0xc0,0x0a,0x7e -# GFX11: v_sin_f16_e32 v5, -1 ; encoding: [0xc1,0xc0,0x0a,0x7e] +# GFX11-REAL16: v_sin_f16_e32 v5.l, -1 ; encoding: [0xc1,0xc0,0x0a,0x7e] +# GFX11-FAKE16: v_sin_f16_e32 v5, -1 ; encoding: [0xc1,0xc0,0x0a,0x7e] 0xf0,0xc0,0x0a,0x7e -# GFX11: v_sin_f16_e32 v5, 0.5 ; encoding: [0xf0,0xc0,0x0a,0x7e] +# GFX11-REAL16: v_sin_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xc0,0x0a,0x7e] +# GFX11-FAKE16: v_sin_f16_e32 v5, 0.5 ; encoding: [0xf0,0xc0,0x0a,0x7e] 0xfd,0xc0,0x0a,0x7e -# GFX11: v_sin_f16_e32 v5, src_scc ; encoding: 
[0xfd,0xc0,0x0a,0x7e] +# GFX11-REAL16: v_sin_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xc0,0x0a,0x7e] +# GFX11-FAKE16: v_sin_f16_e32 v5, src_scc ; encoding: [0xfd,0xc0,0x0a,0x7e] 0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00 -# GFX11: v_sin_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_sin_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_sin_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +0x81,0xc1,0x0a,0x7e +# GFX11-REAL16: v_sin_f16_e32 v5.l, v1.h ; encoding: [0x81,0xc1,0x0a,0x7e] +# GFX11-FAKE16: v_sin_f16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0x81,0xc1,0x0a,0x7e] + +0xff,0xc1,0x0a,0x7e +# GFX11-REAL16: v_sin_f16_e32 v5.l, v127.h ; encoding: [0xff,0xc1,0x0a,0x7e] +# GFX11-FAKE16: v_sin_f16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0xff,0xc1,0x0a,0x7e] + +0xf0,0xc0,0xfe,0x7e +# GFX11-REAL16: v_sin_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xc0,0xfe,0x7e] +# GFX11-FAKE16: v_sin_f16_e32 v127, 0.5 ; encoding: [0xf0,0xc0,0xfe,0x7e] + +0xfd,0xc0,0x0a,0x7f +# GFX11-REAL16: v_sin_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xc0,0x0a,0x7f] + +0xff,0xc0,0xfe,0x7f,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_sin_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xc0,0xfe,0x7f,0x0b,0xfe,0x00,0x00] 0x01,0x6b,0x0a,0x7e # GFX11: v_sin_f32_e32 v5, v1 ; encoding: [0x01,0x6b,0x0a,0x7e] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt index b801e393c635d..7043f3b2b9f29 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt @@ -2619,46 +2619,72 @@ # GFX11-FAKE16: v_lshlrev_b32_e32 v6, v255, v183 ; encoding: [0xff,0x6f,0x0d,0x30] 0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX11: v_sin_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_sin_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX11: v_sin_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_sin_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX11: v_sin_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX11: v_sin_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX11: v_sin_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_sin_f16_dpp 
v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX11: v_sin_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX11: v_sin_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX11: v_sin_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX11: v_sin_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX11: v_sin_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_sin_f16_dpp v5.l, 
v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX11: v_sin_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX11: v_sin_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX11: v_sin_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30 -# GFX11: v_sin_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX11-REAL16: v_sin_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX11-FAKE16: v_sin_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] + +0xfa,0xc0,0xfe,0x7e,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_sin_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_sin_f16_dpp v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x5f,0x01,0x01] + +0xfa,0xc0,0x0a,0x7f,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_sin_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc0,0x0a,0x7f,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_mul_i32_i24_e32 v128, 1, v176 ; encoding: [0x81,0x60,0x01,0x13] + +0xfa,0xc0,0xfe,0x7f,0xff,0x6f,0x3d,0x30 +# GFX11-REAL16: v_sin_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7f,0xff,0x6f,0x3d,0x30] +# GFX11-FAKE16: v_lshlrev_b32_e32 v30, v255, v183 ; encoding: [0xff,0x6f,0x3d,0x30] 0xfa,0x6a,0x0a,0x7e,0x01,0x1b,0x00,0xff # GFX11: v_sin_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt index faf3c6f628b95..d2eb919849fd3 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt @@ -502,10 +502,23 @@ # GFX11-REAL16: v_sat_pk_u8_i16_dpp v127.h, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc4,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX11: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xea,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00 -# GFX11: v_sin_f16_dpp v127, v127 
dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_sin_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_sin_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +0xe9,0xc0,0xfe,0x7e,0x7f,0x77,0x39,0x05 +# GFX11-REAL16: v_sin_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0xfe,0x7e,0x7f,0x77,0x39,0x05] +# GFX11-FAKE16: v_sin_f16_dpp v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0xfe,0x7e,0x7f,0x77,0x39,0x05] + +0xe9,0xc0,0x0a,0x7f,0x81,0x77,0x39,0x05 +# GFX11-REAL16: v_sin_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7f,0x81,0x77,0x39,0x05] +# GFX11-FAKE16: v_dot2acc_f32_f16 v156, v129, v187 ; encoding: [0x81,0x77,0x39,0x05] + +0xea,0xc0,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_sin_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc0,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0x6a,0x0a,0x7e,0x01,0x77,0x39,0x05 # GFX11: v_sin_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6a,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt index f689c43b75365..5c3fde7b80556 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt @@ -2773,46 +2773,72 @@ # GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l 
quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_sin_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX11: v_sin_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX11: v_sin_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX11: v_sin_f16_e64_dpp v5, v1 row_shl:1 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_sin_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX11: v_sin_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_sin_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 
0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX11: v_sin_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_sin_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX11: v_sin_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX11: v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 
row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 -# GFX11: v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] 0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 -# GFX11: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] + +0x05,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_sin_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +0x05,0x08,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 
bank_mask:0x3 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] + +0xff,0xc1,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_sin_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xb5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff # GFX11: v_sin_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt index 48824399a0887..28b39f4b0344a 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt @@ -753,16 +753,32 @@ # GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX11: v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, 
v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX11: v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] 0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX11: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +0x05,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_sin_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +0x05,0x08,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +0xff,0xc1,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_sin_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0x05,0x00,0xb5,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX11: v_sin_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb5,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt index 04c9094465b3b..d078bc2b8cb04 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt @@ -3295,49 +3295,76 @@ # GFX11-FAKE16: v_sat_pk_u8_i16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00 -# GFX11: v_sin_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] +# GFX11-REAL16: v_sin_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] +# GFX11-FAKE16: v_sin_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00 -# GFX11: v_sin_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00] +# GFX11-REAL16: v_sin_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00] +# GFX11-FAKE16: v_sin_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00 -# GFX11: v_sin_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00] +# GFX11-REAL16: v_sin_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00] +# GFX11-FAKE16: 
v_sin_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00 -# GFX11: v_sin_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00] +# GFX11-REAL16: v_sin_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00] +# GFX11-FAKE16: v_sin_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00 -# GFX11: v_sin_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00] +# GFX11-REAL16: v_sin_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00] +# GFX11-FAKE16: v_sin_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00 -# GFX11: v_sin_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00] +# GFX11-REAL16: v_sin_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00] +# GFX11-FAKE16: v_sin_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00 -# GFX11: v_sin_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00] +# GFX11-REAL16: v_sin_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00] +# GFX11-FAKE16: v_sin_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00 -# GFX11: v_sin_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00] +# GFX11-REAL16: v_sin_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00] +# GFX11-FAKE16: v_sin_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00 -# GFX11: v_sin_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00] +# GFX11-REAL16: v_sin_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00] +# GFX11-FAKE16: v_sin_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00] 
0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00 -# GFX11: v_sin_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_sin_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_sin_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00 -# GFX11: v_sin_f16_e64 v5, null ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00] +# GFX11-REAL16: v_sin_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00] +# GFX11-FAKE16: v_sin_f16_e64 v5, null ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00 -# GFX11: v_sin_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00] +# GFX11-REAL16: v_sin_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00] +# GFX11-FAKE16: v_sin_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08 -# GFX11: v_sin_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08] +# GFX11-REAL16: v_sin_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08] +# GFX11-FAKE16: v_sin_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08] 0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10 -# GFX11: v_sin_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10] +# GFX11-REAL16: v_sin_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10] +# GFX11-FAKE16: v_sin_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10] 0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 -# GFX11: v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_sin_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_sin_f16_e64 v255, -|0xfe0b| 
clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +0x05,0x48,0xe0,0xd5,0x01,0x01,0x00,0x00 +# GFX11-REAL16: v_sin_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe0,0xd5,0x01,0x01,0x00,0x00] +# GFX11-FAKE16: v_sin_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x08,0xe0,0xd5,0xff,0x01,0x00,0x00 +# GFX11-REAL16: v_sin_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xe0,0xd5,0xff,0x01,0x00,0x00] +# GFX11-FAKE16: v_sin_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00] + +0xff,0xc1,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_sin_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] 0x05,0x00,0xb5,0xd5,0x01,0x01,0x00,0x00 # GFX11: v_sin_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xb5,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt index b93a6252beaeb..46dedd970a320 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt @@ -2747,46 +2747,68 @@ # GFX12-FAKE16: v_lshlrev_b32_e32 v6, v255, v183 ; encoding: [0xff,0x6f,0x0d,0x30] 0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX12: v_sin_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_sin_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX12: v_sin_f16_dpp v5, v1 
quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_sin_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX12: v_sin_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX12: v_sin_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX12: v_sin_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX12: v_sin_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: 
v_sin_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX12: v_sin_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX12: v_sin_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX12: v_sin_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX12: v_sin_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX12: v_sin_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-REAL16: 
v_sin_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX12: v_sin_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX12: v_sin_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30 -# GFX12: v_sin_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-REAL16: v_sin_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-FAKE16: v_sin_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] + +0xfa,0xc0,0x0a,0x7f,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_sin_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc0,0x0a,0x7f,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_mul_i32_i24_e32 v128, 1, v176 ; encoding: [0x81,0x60,0x01,0x13] + +0xfa,0xc0,0xfe,0x7f,0xff,0x6f,0x3d,0x30 +# GFX12-REAL16: v_sin_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 
bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7f,0xff,0x6f,0x3d,0x30] +# GFX12-FAKE16: v_lshlrev_b32_e32 v30, v255, v183 ; encoding: [0xff,0x6f,0x3d,0x30] 0xfa,0x6a,0x0a,0x7e,0x01,0x1b,0x00,0xff # GFX12: v_sin_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt index 092ba9b88f951..551dab7ec3e7c 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt @@ -508,10 +508,19 @@ # GFX12-REAL16: v_sat_pk_u8_i16_dpp v127.h, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc4,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX12: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xea,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00 -# GFX12: v_sin_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_sin_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_sin_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +0xe9,0xc0,0x0a,0x7f,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_sin_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7f,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_add_f64_e32 v[156:157], v[129:130], v[187:188] ; encoding: [0x81,0x77,0x39,0x05] + +0xea,0xc0,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_sin_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0xea,0xc0,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0x6a,0x0a,0x7e,0x01,0x77,0x39,0x05 # GFX12: v_sin_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6a,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt index 7fdb9e0ac6977..0d01be721e60d 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt @@ -3341,49 +3341,76 @@ # GFX12-FAKE16: v_sat_pk_u8_i16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00 -# GFX12: v_sin_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] +# GFX12-REAL16: v_sin_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_sin_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00 -# GFX12: v_sin_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00] +# GFX12-REAL16: v_sin_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_sin_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00 -# GFX12: v_sin_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00] +# GFX12-REAL16: v_sin_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00] +# GFX12-FAKE16: v_sin_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00 -# GFX12: v_sin_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00] +# GFX12-REAL16: v_sin_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00] +# GFX12-FAKE16: v_sin_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00 -# GFX12: v_sin_f16_e64 v5, vcc_lo ; 
encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-REAL16: v_sin_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-FAKE16: v_sin_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00 -# GFX12: v_sin_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-REAL16: v_sin_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-FAKE16: v_sin_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00 -# GFX12: v_sin_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-REAL16: v_sin_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-FAKE16: v_sin_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00 -# GFX12: v_sin_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-REAL16: v_sin_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-FAKE16: v_sin_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00 -# GFX12: v_sin_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-REAL16: v_sin_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-FAKE16: v_sin_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00 -# GFX12: v_sin_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_sin_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_sin_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00 -# GFX12: v_sin_f16_e64 v5, null ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-REAL16: 
v_sin_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-FAKE16: v_sin_f16_e64 v5, null ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00 -# GFX12: v_sin_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-REAL16: v_sin_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-FAKE16: v_sin_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08 -# GFX12: v_sin_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-REAL16: v_sin_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-FAKE16: v_sin_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08] 0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10 -# GFX12: v_sin_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-REAL16: v_sin_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-FAKE16: v_sin_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10] 0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 -# GFX12: v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_sin_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +0x05,0x48,0xe0,0xd5,0x01,0x01,0x00,0x00 +# GFX12-REAL16: v_sin_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe0,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_sin_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x08,0xe0,0xd5,0xff,0x01,0x00,0x00 +# GFX12-REAL16: v_sin_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: 
[0x05,0x08,0xe0,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_sin_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00] + +0xff,0xc1,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_sin_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] 0x05,0x00,0xb5,0xd5,0x01,0x01,0x00,0x00 # GFX12: v_sin_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xb5,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt index ad491dc02d384..d501d62c006eb 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt @@ -2617,46 +2617,72 @@ # GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_sin_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX12: v_sin_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX12: v_sin_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX12: v_sin_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_sin_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX12: v_sin_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_sin_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX12: v_sin_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: 
v_sin_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX12: v_sin_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX12: v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 -# GFX12: v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] 0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 -# GFX12: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] + +0x05,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_sin_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +0x05,0x08,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] + +0xff,0xc1,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_sin_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 
0x05,0x00,0xb5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff # GFX12: v_sin_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt index 21b4d0572bf37..aba7d3ff43d8b 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt @@ -717,16 +717,32 @@ # GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX12: v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX12: v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] 0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX12: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +0x05,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_sin_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +0x05,0x08,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +0xff,0xc1,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_sin_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0x05,0x00,0xb5,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX12: v_sin_f32_e64_dpp v5, v1 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb5,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] From 5ee8418057646f4640cd1bb60e73f9e5129ea12e Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 3 Jan 2025 11:19:22 -0800 Subject: [PATCH 389/567] [Docs][TableGen] Remove ReturnRange from the SearchIndex documentation. NFC SearchIndex doesn't support ReturnRange. It is only supported for the primary key. --- llvm/docs/TableGen/BackEnds.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/llvm/docs/TableGen/BackEnds.rst b/llvm/docs/TableGen/BackEnds.rst index f73269e717184..94af2e4ab8f5c 100644 --- a/llvm/docs/TableGen/BackEnds.rst +++ b/llvm/docs/TableGen/BackEnds.rst @@ -1071,8 +1071,6 @@ function. This class provides three fields. * ``bit EarlyOut``. See the third example in `Generic Tables`_. -* ``bit ReturnRange``. See the second example in `Generic Tables`_. - Here is an example of a secondary key added to the ``CTable`` above. The generated function looks up entries based on the ``Name`` and ``Kind`` fields. 
From 432a871ba8f6a62272a7ef1162305328b0de7802 Mon Sep 17 00:00:00 2001 From: Ellis Hoag Date: Fri, 3 Jan 2025 11:23:35 -0800 Subject: [PATCH 390/567] Deprecate order file instrumentation (#121514) --- clang/include/clang/Driver/Options.td | 2 +- clang/lib/Driver/ToolChains/Clang.cpp | 22 +++++++++++++--------- clang/test/Driver/clang_f_opts.c | 2 ++ 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 523761f5e0d80..12edfbb171d34 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1890,7 +1890,7 @@ defm pseudo_probe_for_profiling : BoolFOption<"pseudo-probe-for-profiling", " pseudo probes for sample profiling">>; def forder_file_instrumentation : Flag<["-"], "forder-file-instrumentation">, Group, Visibility<[ClangOption, CC1Option, CLOption]>, - HelpText<"Generate instrumented code to collect order file into default.profraw file (overridden by '=' form of option or LLVM_PROFILE_FILE env var)">; + HelpText<"Generate instrumented code to collect order file into default.profraw file (overridden by '=' form of option or LLVM_PROFILE_FILE env var). Deprecated, please use temporal profiling.">; def fprofile_list_EQ : Joined<["-"], "fprofile-list=">, Group, Visibility<[ClangOption, CC1Option, CLOption]>, HelpText<"Filename defining the list of functions/files to instrument. " diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index a020e00cd1739..daf863c78d303 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -8010,15 +8010,19 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, } } - if (Args.hasArg(options::OPT_forder_file_instrumentation)) { - CmdArgs.push_back("-forder-file-instrumentation"); - // Enable order file instrumentation when ThinLTO is not on. 
When ThinLTO is - // on, we need to pass these flags as linker flags and that will be handled - // outside of the compiler. - if (!IsUsingLTO) { - CmdArgs.push_back("-mllvm"); - CmdArgs.push_back("-enable-order-file-instrumentation"); - } + if (const Arg *A = + Args.getLastArg(options::OPT_forder_file_instrumentation)) { + D.Diag(diag::warn_drv_deprecated_arg) + << A->getAsString(Args) << /*hasReplacement=*/true + << "-mllvm -pgo-temporal-instrumentation"; + CmdArgs.push_back("-forder-file-instrumentation"); + // Enable order file instrumentation when ThinLTO is not on. When ThinLTO is + // on, we need to pass these flags as linker flags and that will be handled + // outside of the compiler. + if (!IsUsingLTO) { + CmdArgs.push_back("-mllvm"); + CmdArgs.push_back("-enable-order-file-instrumentation"); + } } if (Arg *A = Args.getLastArg(options::OPT_fforce_enable_int128, diff --git a/clang/test/Driver/clang_f_opts.c b/clang/test/Driver/clang_f_opts.c index ddbf1fd951c84..2b72068eae1ee 100644 --- a/clang/test/Driver/clang_f_opts.c +++ b/clang/test/Driver/clang_f_opts.c @@ -364,6 +364,7 @@ // RUN: -fno-devirtualize-speculatively \ // RUN: -fslp-vectorize-aggressive \ // RUN: -fno-slp-vectorize-aggressive \ +// RUN: -forder-file-instrumentation \ // RUN: %s 2>&1 | FileCheck --check-prefix=CHECK-WARNING %s // CHECK-WARNING-DAG: optimization flag '-finline-limit=1000' is not supported // CHECK-WARNING-DAG: optimization flag '-finline-limit' is not supported @@ -423,6 +424,7 @@ // CHECK-WARNING-DAG: optimization flag '-fno-devirtualize-speculatively' is not supported // CHECK-WARNING-DAG: the flag '-fslp-vectorize-aggressive' has been deprecated and will be ignored // CHECK-WARNING-DAG: the flag '-fno-slp-vectorize-aggressive' has been deprecated and will be ignored +// CHECK-WARNING-DAG: argument '-forder-file-instrumentation' is deprecated, use '-mllvm -pgo-temporal-instrumentation' instead // Test that we mute the warning on these // RUN: %clang -### -finline-limit=1000 
-Wno-invalid-command-line-argument \ From 11c6af666b75d03ac67dfdf9ba190587b7efbcd8 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 3 Jan 2025 19:28:02 +0000 Subject: [PATCH 391/567] [VPlan] Fix name ExitVPBB -> MiddleVPBB (NFC). ExitVPBB actually refers to the middle block, clarify name. --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index f2f8a85b7cc23..7ef5295bb1276 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7785,16 +7785,16 @@ DenseMap LoopVectorizationPlanner::executePlan( BestVPlan.execute(&State); - auto *ExitVPBB = BestVPlan.getMiddleBlock(); + auto *MiddleVPBB = BestVPlan.getMiddleBlock(); // 2.5 When vectorizing the epilogue, fix reduction and induction resume // values from the additional bypass block. if (VectorizingEpilogue) { assert(!ILV.Legal->hasUncountableEarlyExit() && "Epilogue vectorisation not yet supported with early exits"); BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock(); - for (VPRecipeBase &R : *ExitVPBB) { + for (VPRecipeBase &R : *MiddleVPBB) { fixReductionScalarResumeWhenVectorizingEpilog( - &R, State, State.CFG.VPBB2IRBB[ExitVPBB], BypassBlock); + &R, State, State.CFG.VPBB2IRBB[MiddleVPBB], BypassBlock); } BasicBlock *PH = OrigLoop->getLoopPreheader(); for (const auto &[IVPhi, _] : Legal->getInductionVars()) { @@ -7840,7 +7840,7 @@ DenseMap LoopVectorizationPlanner::executePlan( // 4. Adjust branch weight of the branch in the middle block. auto *MiddleTerm = - cast(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator()); + cast(State.CFG.VPBB2IRBB[MiddleVPBB]->getTerminator()); if (MiddleTerm->isConditional() && hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { // Assume that `Count % VectorTripCount` is equally distributed. 
From cb2eafe6ac72064529da5219434e351851a2b68f Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 3 Jan 2025 19:37:58 +0000 Subject: [PATCH 392/567] [TableGen] Use SmallVectors for preprocessor include stack. NFC. (#121571) This is just a minor cleanup and a small step in the direction of using LLVM containers in preference to STL containers in lib/TableGen. --- llvm/lib/TableGen/TGLexer.cpp | 41 +++++++++++++---------------------- llvm/lib/TableGen/TGLexer.h | 5 ++--- 2 files changed, 17 insertions(+), 29 deletions(-) diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp index eee42511804f5..e23aec6efba59 100644 --- a/llvm/lib/TableGen/TGLexer.cpp +++ b/llvm/lib/TableGen/TGLexer.cpp @@ -81,8 +81,7 @@ TGLexer::TGLexer(SourceMgr &SM, ArrayRef Macros) : SrcMgr(SM) { TokStart = nullptr; // Pretend that we enter the "top-level" include file. - PrepIncludeStack.push_back( - std::make_unique>()); + PrepIncludeStack.emplace_back(); // Add all macros defined on the command line to the DefinedMacros set. // Check invalid macro names and print fatal error if we find one. @@ -453,8 +452,7 @@ bool TGLexer::LexInclude() { CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer(); CurPtr = CurBuf.begin(); - PrepIncludeStack.push_back( - std::make_unique>()); + PrepIncludeStack.emplace_back(); return false; } @@ -656,17 +654,13 @@ tgtok::TokKind TGLexer::LexExclaim() { bool TGLexer::prepExitInclude(bool IncludeStackMustBeEmpty) { // Report an error, if preprocessor control stack for the current // file is not empty. - if (!PrepIncludeStack.back()->empty()) { + if (!PrepIncludeStack.back().empty()) { prepReportPreprocessorStackError(); return false; } // Pop the preprocessing controls from the include stack. 
- if (PrepIncludeStack.empty()) { - PrintFatalError("preprocessor include stack is empty"); - } - PrepIncludeStack.pop_back(); if (IncludeStackMustBeEmpty) { @@ -761,7 +755,7 @@ tgtok::TokKind TGLexer::lexPreprocessor(tgtok::TokKind Kind, // Regardless of whether we are processing tokens or not, // we put the #ifdef control on stack. // Note that MacroIsDefined has been canonicalized against ifdef. - PrepIncludeStack.back()->push_back( + PrepIncludeStack.back().push_back( {tgtok::Ifdef, MacroIsDefined, SMLoc::getFromPointer(TokStart)}); if (!prepSkipDirectiveEnd()) @@ -789,10 +783,10 @@ tgtok::TokKind TGLexer::lexPreprocessor(tgtok::TokKind Kind, } else if (Kind == tgtok::Else) { // Check if this #else is correct before calling prepSkipDirectiveEnd(), // which will move CurPtr away from the beginning of #else. - if (PrepIncludeStack.back()->empty()) + if (PrepIncludeStack.back().empty()) return ReturnError(TokStart, "#else without #ifdef or #ifndef"); - PreprocessorControlDesc IfdefEntry = PrepIncludeStack.back()->back(); + PreprocessorControlDesc IfdefEntry = PrepIncludeStack.back().back(); if (IfdefEntry.Kind != tgtok::Ifdef) { PrintError(TokStart, "double #else"); @@ -801,9 +795,8 @@ tgtok::TokKind TGLexer::lexPreprocessor(tgtok::TokKind Kind, // Replace the corresponding #ifdef's control with its negation // on the control stack. - PrepIncludeStack.back()->pop_back(); - PrepIncludeStack.back()->push_back( - {Kind, !IfdefEntry.IsDefined, SMLoc::getFromPointer(TokStart)}); + PrepIncludeStack.back().back() = {Kind, !IfdefEntry.IsDefined, + SMLoc::getFromPointer(TokStart)}; if (!prepSkipDirectiveEnd()) return ReturnError(CurPtr, "only comments are supported after #else"); @@ -822,10 +815,10 @@ tgtok::TokKind TGLexer::lexPreprocessor(tgtok::TokKind Kind, } else if (Kind == tgtok::Endif) { // Check if this #endif is correct before calling prepSkipDirectiveEnd(), // which will move CurPtr away from the beginning of #endif. 
- if (PrepIncludeStack.back()->empty()) + if (PrepIncludeStack.back().empty()) return ReturnError(TokStart, "#endif without #ifdef"); - auto &IfdefOrElseEntry = PrepIncludeStack.back()->back(); + auto &IfdefOrElseEntry = PrepIncludeStack.back().back(); if (IfdefOrElseEntry.Kind != tgtok::Ifdef && IfdefOrElseEntry.Kind != tgtok::Else) { @@ -836,7 +829,7 @@ tgtok::TokKind TGLexer::lexPreprocessor(tgtok::TokKind Kind, if (!prepSkipDirectiveEnd()) return ReturnError(CurPtr, "only comments are supported after #endif"); - PrepIncludeStack.back()->pop_back(); + PrepIncludeStack.back().pop_back(); // If we were processing tokens before this #endif, then // we should continue it. @@ -1055,20 +1048,16 @@ bool TGLexer::prepSkipDirectiveEnd() { } bool TGLexer::prepIsProcessingEnabled() { - for (const PreprocessorControlDesc &I : - llvm::reverse(*PrepIncludeStack.back())) - if (!I.IsDefined) - return false; - - return true; + return all_of(PrepIncludeStack.back(), + [](const PreprocessorControlDesc &I) { return I.IsDefined; }); } void TGLexer::prepReportPreprocessorStackError() { - if (PrepIncludeStack.back()->empty()) + if (PrepIncludeStack.back().empty()) PrintFatalError("prepReportPreprocessorStackError() called with " "empty control stack"); - auto &PrepControl = PrepIncludeStack.back()->back(); + auto &PrepControl = PrepIncludeStack.back().back(); PrintError(CurBuf.end(), "reached EOF without matching #endif"); PrintError(PrepControl.SrcPos, "the latest preprocessor control is here"); diff --git a/llvm/lib/TableGen/TGLexer.h b/llvm/lib/TableGen/TGLexer.h index 963d75e52cc8f..f8b32dc5377f5 100644 --- a/llvm/lib/TableGen/TGLexer.h +++ b/llvm/lib/TableGen/TGLexer.h @@ -13,6 +13,7 @@ #ifndef LLVM_LIB_TABLEGEN_TGLEXER_H #define LLVM_LIB_TABLEGEN_TGLEXER_H +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSet.h" #include "llvm/Support/DataTypes.h" @@ -21,7 +22,6 @@ #include #include #include -#include namespace llvm { template class 
ArrayRef; @@ -323,8 +323,7 @@ class TGLexer { // preprocessing control stacks for the current file and all its // parent files. The back() element is the preprocessing control // stack for the current file. - std::vector>> - PrepIncludeStack; + SmallVector> PrepIncludeStack; // Validate that the current preprocessing control stack is empty, // since we are about to exit a file, and pop the include stack. From f7420a9dff6d09715042b60c9e26a40a1b2a3147 Mon Sep 17 00:00:00 2001 From: Abid Qadeer Date: Fri, 3 Jan 2025 19:41:48 +0000 Subject: [PATCH 393/567] [flang][debug] Fix issue with argument numbering. (#120726) Currently fir::isDummyArgument is being used to check if a DeclareOp represents a dummy argument. The argument passed to the function is declOp.getMemref(). This bypasses the code in isDummyArgument that checks for dummy_scope because the `Value` returned by getMemref() may not have DeclareOp as its defining op. This bypassing means that sometimes a variable will be marked as an argument when it should not be. This happened in a case where the same arg was being used for 2 different result variables through the use of `entry` in the function. The solution is to check directly if the declOp has a dummy_scope. If yes, we know this is a dummy argument. We can now check if the memref points to the BlockArgument and use its number. This will still miss arguments where the memref does not directly point to a BlockArgument, but those are missed currently too. Note that we can still evaluate those variables in the debugger. It is just that they are not marked as arguments.

Fixes #116525.
--- flang/lib/Optimizer/Transforms/AddDebugInfo.cpp | 6 +++--- flang/test/Integration/debug-116525.f90 | 12 ++++++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) create mode 100644 flang/test/Integration/debug-116525.f90 diff --git a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp index 3a437c7a0f013..a8e9d198ccb97 100644 --- a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp +++ b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp @@ -121,9 +121,9 @@ void AddDebugInfoPass::handleDeclareOp(fir::cg::XDeclareOp declOp, // constant attribute of [hl]fir.declare/fircg.ext_declare operation that has // a dummy_scope operand). unsigned argNo = 0; - if (fir::isDummyArgument(declOp.getMemref())) { - auto arg = llvm::cast(declOp.getMemref()); - argNo = arg.getArgNumber() + 1; + if (declOp.getDummyScope()) { + if (auto arg = llvm::dyn_cast(declOp.getMemref())) + argNo = arg.getArgNumber() + 1; } auto tyAttr = typeGen.convertType(fir::unwrapRefType(declOp.getType()), diff --git a/flang/test/Integration/debug-116525.f90 b/flang/test/Integration/debug-116525.f90 new file mode 100644 index 0000000000000..1916a34df4c12 --- /dev/null +++ b/flang/test/Integration/debug-116525.f90 @@ -0,0 +1,12 @@ +! RUN: %flang_fc1 -fopenmp -emit-llvm -debug-info-kind=standalone %s -o - + +! Test that this does not cause build failure. 
+function s(x) + character(len=2) :: x, s, ss + + s = x + + entry ss() + +end function s + From 3b72c62e7faa918d9a7e7439a4aa86d00921b0b8 Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Fri, 3 Jan 2025 14:42:39 -0500 Subject: [PATCH 394/567] [AMDGPU][True16][MC] true16 for v_frexp_mant_f16 (#120653) Support true16 format for v_frexp_mant_f16 in MC --- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 2 +- llvm/test/CodeGen/AMDGPU/llvm.frexp.ll | 262 ++++++++++++++++++ llvm/test/MC/AMDGPU/gfx11_asm_vop1.s | 75 +++-- llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s | 65 +++-- llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s | 21 +- llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s | 42 +++ .../MC/AMDGPU/gfx11_asm_vop1_t16_promote.s | 154 +++++++--- .../AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s | 65 +++-- .../MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s | 25 +- .../test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s | 69 +++-- llvm/test/MC/AMDGPU/gfx12_asm_vop1.s | 72 +++-- llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s | 62 +++-- llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s | 18 +- llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s | 42 +++ .../MC/AMDGPU/gfx12_asm_vop1_t16_promote.s | 154 +++++++--- .../test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s | 69 +++-- .../AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s | 65 +++-- .../MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s | 25 +- .../Disassembler/AMDGPU/gfx11_dasm_vop1.txt | 63 ++++- .../AMDGPU/gfx11_dasm_vop1_dpp16.txt | 54 +++- .../AMDGPU/gfx11_dasm_vop1_dpp8.txt | 23 +- .../gfx11_dasm_vop3_dpp16_from_vop1.txt | 54 +++- .../AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt | 24 +- .../AMDGPU/gfx11_dasm_vop3_from_vop1.txt | 57 +++- .../AMDGPU/gfx12_dasm_vop1_dpp16.txt | 50 +++- .../AMDGPU/gfx12_dasm_vop1_dpp8.txt | 13 +- .../AMDGPU/gfx12_dasm_vop3_from_vop1.txt | 58 ++-- .../gfx12_dasm_vop3_from_vop1_dpp16.txt | 55 +++- .../AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt | 25 +- 29 files changed, 1298 insertions(+), 465 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td 
b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index badca264e8f92..79f0caec418ba 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -1036,7 +1036,7 @@ defm V_LOG_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x057, "v_log_f16" defm V_LOG_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x057, "v_log_f16">; defm V_EXP_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x058, "v_exp_f16">; defm V_EXP_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x058, "v_exp_f16">; -defm V_FREXP_MANT_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x059, "v_frexp_mant_f16">; +defm V_FREXP_MANT_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x059, "v_frexp_mant_f16">; defm V_FREXP_EXP_I16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05a, "v_frexp_exp_i16_f16">; defm V_FLOOR_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05b, "v_floor_f16">; defm V_FLOOR_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05b, "v_floor_f16">; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll index b9fef0834cb24..88ef7a9363930 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll @@ -3,11 +3,13 @@ ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8-SDAG %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-SDAG %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-SDAG %s ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6-GISEL %s ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8-GISEL %s ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-GISEL %s ; RUN: llc -global-isel=1 
-mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-GISEL %s define { half, i32 } @test_frexp_f16_i32(half %a) { ; GFX6-SDAG-LABEL: test_frexp_f16_i32: @@ -50,6 +52,19 @@ define { half, i32 } @test_frexp_f16_i32(half %a) { ; GFX11-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_f16_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_exp_i16_f16_e32 v1, v0 +; GFX12-NEXT: v_frexp_mant_f16_e32 v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_f16_i32: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -96,6 +111,16 @@ define half @test_frexp_f16_i32_only_use_fract(half %a) { ; GFX11-NEXT: v_frexp_mant_f16_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_f16_i32_only_use_fract: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_mant_f16_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_f16_i32_only_use_fract: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -145,6 +170,18 @@ define i32 @test_frexp_f16_i32_only_use_exp(half %a) { ; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_f16_i32_only_use_exp: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; 
GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_exp_i16_f16_e32 v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_f16_i32_only_use_exp: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -221,6 +258,25 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) { ; GFX11-NEXT: v_bfe_i32 v2, v4, 0, 16 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_v2f16_v2i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-NEXT: v_frexp_mant_f16_e32 v2, v0 +; GFX12-NEXT: v_frexp_exp_i16_f16_e32 v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_frexp_mant_f16_e32 v3, v1 +; GFX12-NEXT: v_frexp_exp_i16_f16_e32 v4, v1 +; GFX12-NEXT: v_bfe_i32 v1, v0, 0, 16 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_pack_b32_f16 v0, v2, v3 +; GFX12-NEXT: v_bfe_i32 v2, v4, 0, 16 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_v2f16_v2i32: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -311,6 +367,20 @@ define <2 x half> @test_frexp_v2f16_v2i32_only_use_fract(<2 x half> %a) { ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_v2f16_v2i32_only_use_fract: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-NEXT: v_frexp_mant_f16_e32 v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX12-NEXT: v_frexp_mant_f16_e32 v1, v1 +; GFX12-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_v2f16_v2i32_only_use_fract: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -386,6 +456,22 @@ define <2 x i32> @test_frexp_v2f16_v2i32_only_use_exp(<2 x half> %a) { ; GFX11-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_v2f16_v2i32_only_use_exp: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-NEXT: v_frexp_exp_i16_f16_e32 v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_frexp_exp_i16_f16_e32 v1, v1 +; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_v2f16_v2i32_only_use_exp: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -463,6 +549,19 @@ define { half, i16 } @test_frexp_f16_i16(half %a) { ; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_f16_i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_mant_f16_e32 v2, v0 +; GFX12-NEXT: v_frexp_exp_i16_f16_e32 v1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_f16_i16: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -509,6 +608,16 @@ define half 
@test_frexp_f16_i16_only_use_fract(half %a) { ; GFX11-NEXT: v_frexp_mant_f16_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_f16_i16_only_use_fract: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_mant_f16_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_f16_i16_only_use_fract: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -554,6 +663,16 @@ define i16 @test_frexp_f16_i16_only_use_exp(half %a) { ; GFX11-NEXT: v_frexp_exp_i16_f16_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_f16_i16_only_use_exp: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_exp_i16_f16_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_f16_i16_only_use_exp: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -623,6 +742,19 @@ define { float, i32 } @test_frexp_f32_i32(float %a) { ; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_f32_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_mant_f32_e32 v2, v0 +; GFX12-NEXT: v_frexp_exp_i32_f32_e32 v1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_f32_i32: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -665,6 +797,16 @@ define float @test_frexp_f32_i32_only_use_fract(float %a) { ; 
GFX11-NEXT: v_frexp_mant_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_f32_i32_only_use_fract: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_f32_i32_only_use_fract: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -706,6 +848,16 @@ define i32 @test_frexp_f32_i32_only_use_exp(float %a) { ; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_f32_i32_only_use_exp: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_f32_i32_only_use_exp: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -771,6 +923,21 @@ define { <2 x float>, <2 x i32> } @test_frexp_v2f32_v2i32(<2 x float> %a) { ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_v2f32_v2i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_mant_f32_e32 v4, v0 +; GFX12-NEXT: v_frexp_mant_f32_e32 v5, v1 +; GFX12-NEXT: v_frexp_exp_i32_f32_e32 v2, v0 +; GFX12-NEXT: v_frexp_exp_i32_f32_e32 v3, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_v2f32_v2i32: ; GFX6-GISEL: ; %bb.0: ; 
GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -846,6 +1013,17 @@ define <2 x float> @test_frexp_v2f32_v2i32_only_use_fract(<2 x float> %a) { ; GFX11-NEXT: v_frexp_mant_f32_e32 v1, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_v2f32_v2i32_only_use_fract: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX12-NEXT: v_frexp_mant_f32_e32 v1, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_v2f32_v2i32_only_use_fract: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -896,6 +1074,17 @@ define <2 x i32> @test_frexp_v2f32_v2i32_only_use_exp(<2 x float> %a) { ; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_v2f32_v2i32_only_use_exp: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX12-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_v2f32_v2i32_only_use_exp: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -954,6 +1143,19 @@ define { double, i32 } @test_frexp_f64_i32(double %a) { ; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_f64_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_mant_f64_e32 v[3:4], v[0:1] +; GFX12-NEXT: v_frexp_exp_i32_f64_e32 v2, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; 
GFX12-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_f64_i32: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1000,6 +1202,16 @@ define double @test_frexp_f64_i32_only_use_fract(double %a) { ; GFX11-NEXT: v_frexp_mant_f64_e32 v[0:1], v[0:1] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_f64_i32_only_use_fract: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_mant_f64_e32 v[0:1], v[0:1] +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_f64_i32_only_use_fract: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1044,6 +1256,16 @@ define i32 @test_frexp_f64_i32_only_use_exp(double %a) { ; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v0, v[0:1] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_f64_i32_only_use_exp: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_exp_i32_f64_e32 v0, v[0:1] +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_f64_i32_only_use_exp: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1116,6 +1338,22 @@ define { <2 x double>, <2 x i32> } @test_frexp_v2f64_v2i32(<2 x double> %a) { ; GFX11-NEXT: v_dual_mov_b32 v2, v6 :: v_dual_mov_b32 v3, v7 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_v2f64_v2i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_mant_f64_e32 v[8:9], v[0:1] +; GFX12-NEXT: 
v_frexp_mant_f64_e32 v[6:7], v[2:3] +; GFX12-NEXT: v_frexp_exp_i32_f64_e32 v4, v[0:1] +; GFX12-NEXT: v_frexp_exp_i32_f64_e32 v5, v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9 +; GFX12-NEXT: v_dual_mov_b32 v2, v6 :: v_dual_mov_b32 v3, v7 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_v2f64_v2i32: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1174,6 +1412,17 @@ define <2 x double> @test_frexp_v2f64_v2i32_only_use_fract(<2 x double> %a) { ; GFX11-NEXT: v_frexp_mant_f64_e32 v[0:1], v[0:1] ; GFX11-NEXT: v_frexp_mant_f64_e32 v[2:3], v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_frexp_v2f64_v2i32_only_use_fract: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_mant_f64_e32 v[0:1], v[0:1] +; GFX12-NEXT: v_frexp_mant_f64_e32 v[2:3], v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] %result = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> %a) %result.0 = extractvalue { <2 x double>, <2 x i32> } %result, 0 ret <2 x double> %result.0 @@ -1213,6 +1462,17 @@ define <2 x i32> @test_frexp_v2f64_v2i32_only_use_exp(<2 x double> %a) { ; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v0, v[0:1] ; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v1, v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_frexp_v2f64_v2i32_only_use_exp: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_exp_i32_f64_e32 v0, v[0:1] +; GFX12-NEXT: v_frexp_exp_i32_f64_e32 v1, v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] %result = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x 
double> %a) %result.1 = extractvalue { <2 x double>, <2 x i32> } %result, 1 ret <2 x i32> %result.1 @@ -1235,3 +1495,5 @@ attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memo ; GCN: {{.*}} ; GFX11-GISEL: {{.*}} ; GFX11-SDAG: {{.*}} +; GFX12-GISEL: {{.*}} +; GFX12-SDAG: {{.*}} diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s index 9b9837b46b26d..b98955d268a72 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s @@ -2351,50 +2351,65 @@ v_frexp_exp_i32_f64 v5, src_scc v_frexp_exp_i32_f64 v255, 0xaf123456 // GFX11: v_frexp_exp_i32_f64_e32 v255, 0xaf123456 ; encoding: [0xff,0x78,0xfe,0x7f,0x56,0x34,0x12,0xaf] -v_frexp_mant_f16 v5, v1 -// GFX11: v_frexp_mant_f16_e32 v5, v1 ; encoding: [0x01,0xb3,0x0a,0x7e] +v_frexp_mant_f16 v5.l, v1.l +// GFX11: v_frexp_mant_f16_e32 v5.l, v1.l ; encoding: [0x01,0xb3,0x0a,0x7e] -v_frexp_mant_f16 v5, v127 -// GFX11: v_frexp_mant_f16_e32 v5, v127 ; encoding: [0x7f,0xb3,0x0a,0x7e] +v_frexp_mant_f16 v5.l, v127.l +// GFX11: v_frexp_mant_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xb3,0x0a,0x7e] -v_frexp_mant_f16 v5, s1 -// GFX11: v_frexp_mant_f16_e32 v5, s1 ; encoding: [0x01,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, s1 +// GFX11: v_frexp_mant_f16_e32 v5.l, s1 ; encoding: [0x01,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, s105 -// GFX11: v_frexp_mant_f16_e32 v5, s105 ; encoding: [0x69,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, s105 +// GFX11: v_frexp_mant_f16_e32 v5.l, s105 ; encoding: [0x69,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, vcc_lo -// GFX11: v_frexp_mant_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, vcc_lo +// GFX11: v_frexp_mant_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, vcc_hi -// GFX11: v_frexp_mant_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, vcc_hi +// GFX11: v_frexp_mant_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, 
ttmp15 -// GFX11: v_frexp_mant_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, ttmp15 +// GFX11: v_frexp_mant_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, m0 -// GFX11: v_frexp_mant_f16_e32 v5, m0 ; encoding: [0x7d,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, m0 +// GFX11: v_frexp_mant_f16_e32 v5.l, m0 ; encoding: [0x7d,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, exec_lo -// GFX11: v_frexp_mant_f16_e32 v5, exec_lo ; encoding: [0x7e,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, exec_lo +// GFX11: v_frexp_mant_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, exec_hi -// GFX11: v_frexp_mant_f16_e32 v5, exec_hi ; encoding: [0x7f,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, exec_hi +// GFX11: v_frexp_mant_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, null -// GFX11: v_frexp_mant_f16_e32 v5, null ; encoding: [0x7c,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, null +// GFX11: v_frexp_mant_f16_e32 v5.l, null ; encoding: [0x7c,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, -1 -// GFX11: v_frexp_mant_f16_e32 v5, -1 ; encoding: [0xc1,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, -1 +// GFX11: v_frexp_mant_f16_e32 v5.l, -1 ; encoding: [0xc1,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, 0.5 -// GFX11: v_frexp_mant_f16_e32 v5, 0.5 ; encoding: [0xf0,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, 0.5 +// GFX11: v_frexp_mant_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, src_scc -// GFX11: v_frexp_mant_f16_e32 v5, src_scc ; encoding: [0xfd,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, src_scc +// GFX11: v_frexp_mant_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v127, 0xfe0b -// GFX11: v_frexp_mant_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +v_frexp_mant_f16 v127.l, 0xfe0b +// GFX11: v_frexp_mant_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +v_frexp_mant_f16 v5.l, v1.h +// GFX11: v_frexp_mant_f16_e32 v5.l, 
v1.h ; encoding: [0x81,0xb3,0x0a,0x7e] + +v_frexp_mant_f16 v5.l, v127.h +// GFX11: v_frexp_mant_f16_e32 v5.l, v127.h ; encoding: [0xff,0xb3,0x0a,0x7e] + +v_frexp_mant_f16 v127.l, 0.5 +// GFX11: v_frexp_mant_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xb2,0xfe,0x7e] + +v_frexp_mant_f16 v5.h, src_scc +// GFX11: v_frexp_mant_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xb2,0x0a,0x7f] + +v_frexp_mant_f16 v127.h, 0xfe0b +// GFX11: v_frexp_mant_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xb2,0xfe,0x7f,0x0b,0xfe,0x00,0x00] v_frexp_mant_f32 v5, v1 // GFX11: v_frexp_mant_f32_e32 v5, v1 ; encoding: [0x01,0x81,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s index b080bd9fca461..f46abd344d607 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s @@ -1766,47 +1766,56 @@ v_frexp_exp_i32_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 f v_frexp_exp_i32_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_frexp_exp_i32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x7e,0xfe,0x7f,0xff,0x6f,0x35,0x30] -v_frexp_mant_f16 v5, v1 quad_perm:[3,2,1,0] -// GFX11: v_frexp_mant_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff] +v_frexp_mant_f16 v5.l, v1.l quad_perm:[3,2,1,0] +// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff] -v_frexp_mant_f16 v5, v1 quad_perm:[0,1,2,3] -// GFX11: v_frexp_mant_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff] +v_frexp_mant_f16 v5.l, v1.l quad_perm:[0,1,2,3] +// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff] -v_frexp_mant_f16 v5, v1 row_mirror -// GFX11: 
v_frexp_mant_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff] +v_frexp_mant_f16 v5.l, v1.l row_mirror +// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff] -v_frexp_mant_f16 v5, v1 row_half_mirror -// GFX11: v_frexp_mant_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff] +v_frexp_mant_f16 v5.l, v1.l row_half_mirror +// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff] -v_frexp_mant_f16 v5, v1 row_shl:1 -// GFX11: v_frexp_mant_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff] +v_frexp_mant_f16 v5.l, v1.l row_shl:1 +// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff] -v_frexp_mant_f16 v5, v1 row_shl:15 -// GFX11: v_frexp_mant_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff] +v_frexp_mant_f16 v5.l, v1.l row_shl:15 +// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff] -v_frexp_mant_f16 v5, v1 row_shr:1 -// GFX11: v_frexp_mant_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff] +v_frexp_mant_f16 v5.l, v1.l row_shr:1 +// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff] -v_frexp_mant_f16 v5, v1 row_shr:15 -// GFX11: v_frexp_mant_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff] +v_frexp_mant_f16 v5.l, v1.l row_shr:15 +// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff] -v_frexp_mant_f16 v5, v1 row_ror:1 -// GFX11: v_frexp_mant_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff] +v_frexp_mant_f16 v5.l, v1.l row_ror:1 +// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff] -v_frexp_mant_f16 v5, v1 row_ror:15 -// GFX11: v_frexp_mant_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff] +v_frexp_mant_f16 v5.l, v1.l row_ror:15 +// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff] -v_frexp_mant_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_frexp_mant_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff] +v_frexp_mant_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff] -v_frexp_mant_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_frexp_mant_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01] +v_frexp_mant_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01] -v_frexp_mant_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_frexp_mant_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x09,0x13] +v_frexp_mant_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x09,0x13] 
-v_frexp_mant_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_frexp_mant_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +v_frexp_mant_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX11: v_frexp_mant_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x35,0x30] + +v_frexp_mant_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_frexp_mant_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x5f,0x01,0x01] + +v_frexp_mant_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_frexp_mant_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xb2,0x0a,0x7f,0x81,0x60,0x09,0x13] + +v_frexp_mant_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_frexp_mant_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7f,0xff,0x6f,0x35,0x30] v_frexp_mant_f32 v5, v1 quad_perm:[3,2,1,0] // GFX11: v_frexp_mant_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s index 6a47dce49ed2a..c5df74758d71e 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s @@ -416,14 +416,23 @@ v_frexp_exp_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_frexp_exp_i32_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_frexp_exp_i32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x7e,0xfe,0x7f,0xff,0x00,0x00,0x00] -v_frexp_mant_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_frexp_mant_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_frexp_mant_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_frexp_mant_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_frexp_mant_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_frexp_mant_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_frexp_mant_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_frexp_mant_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +v_frexp_mant_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX11: v_frexp_mant_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +v_frexp_mant_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_frexp_mant_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0xfe,0x7e,0x7f,0x77,0x39,0x05] + +v_frexp_mant_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_frexp_mant_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xb2,0x0a,0x7f,0x81,0x77,0x39,0x05] + +v_frexp_mant_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_frexp_mant_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xb2,0xfe,0x7f,0xff,0x00,0x00,0x00] v_frexp_mant_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_frexp_mant_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x80,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s index 34f10c98e1468..ca181f1e59db5 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s @@ -521,6 +521,12 @@ v_frexp_exp_i16_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0] v_frexp_mant_f16_e32 v128, 
0xfe0b // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_frexp_mant_f16_e32 v128.h, 0xfe0b +// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v128.l, 0xfe0b +// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction + v_frexp_mant_f16_e32 v255, v1 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -530,6 +536,24 @@ v_frexp_mant_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0] v_frexp_mant_f16_e32 v255, v1 quad_perm:[3,2,1,0] // GFX11: :[[@LINE-1]]:31: error: invalid operand for instruction +v_frexp_mant_f16_e32 v255.h, v1.h +// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v255.l, v1.l +// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction + v_frexp_mant_f16_e32 v5, v199 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -539,6 +563,24 @@ v_frexp_mant_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_frexp_mant_f16_e32 v5, v199 quad_perm:[3,2,1,0] // GFX11: :[[@LINE-1]]:31: error: invalid operand for instruction +v_frexp_mant_f16_e32 v5.h, v199.h +// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v5.l, v199.l +// 
GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction + v_log_f16_e32 v128.h, 0xfe0b // GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s index 9e424fbd004e4..a0a07a03e14c3 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s @@ -1406,71 +1406,137 @@ v_frexp_exp_i16_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] v_frexp_exp_i16_f16 v5.l, v199.l quad_perm:[3,2,1,0] // GFX11: v_frexp_exp_i16_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xda,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] -v_frexp_mant_f16 v128, 0xfe0b -// GFX11: v_frexp_mant_f16_e64 v128, 0xfe0b ; encoding: [0x80,0x00,0xd9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_frexp_mant_f16 v128.h, 0xfe0b +// GFX11: v_frexp_mant_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xd9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_frexp_mant_f16 v255, -1 -// GFX11: v_frexp_mant_f16_e64 v255, -1 ; encoding: [0xff,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00] +v_frexp_mant_f16 v128.l, 0xfe0b +// GFX11: v_frexp_mant_f16_e64 v128.l, 0xfe0b ; encoding: [0x80,0x00,0xd9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_frexp_mant_f16 v255, 0.5 -// GFX11: v_frexp_mant_f16_e64 v255, 0.5 ; encoding: [0xff,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, -1 +// GFX11: v_frexp_mant_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0xc1,0x00,0x00,0x00] -v_frexp_mant_f16 v255, exec_hi -// GFX11: v_frexp_mant_f16_e64 v255, exec_hi ; encoding: [0xff,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, 0.5 
+// GFX11: v_frexp_mant_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0xf0,0x00,0x00,0x00] -v_frexp_mant_f16 v255, exec_lo -// GFX11: v_frexp_mant_f16_e64 v255, exec_lo ; encoding: [0xff,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, exec_hi +// GFX11: v_frexp_mant_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7f,0x00,0x00,0x00] -v_frexp_mant_f16 v255, m0 -// GFX11: v_frexp_mant_f16_e64 v255, m0 ; encoding: [0xff,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, exec_lo +// GFX11: v_frexp_mant_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7e,0x00,0x00,0x00] -v_frexp_mant_f16 v255, null -// GFX11: v_frexp_mant_f16_e64 v255, null ; encoding: [0xff,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, m0 +// GFX11: v_frexp_mant_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7d,0x00,0x00,0x00] -v_frexp_mant_f16 v255, s1 -// GFX11: v_frexp_mant_f16_e64 v255, s1 ; encoding: [0xff,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, null +// GFX11: v_frexp_mant_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7c,0x00,0x00,0x00] -v_frexp_mant_f16 v255, s105 -// GFX11: v_frexp_mant_f16_e64 v255, s105 ; encoding: [0xff,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, s1 +// GFX11: v_frexp_mant_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x01,0x00,0x00,0x00] -v_frexp_mant_f16 v255, src_scc -// GFX11: v_frexp_mant_f16_e64 v255, src_scc ; encoding: [0xff,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, s105 +// GFX11: v_frexp_mant_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x69,0x00,0x00,0x00] -v_frexp_mant_f16 v255, ttmp15 -// GFX11: v_frexp_mant_f16_e64 v255, ttmp15 ; encoding: [0xff,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, src_scc +// GFX11: v_frexp_mant_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: 
[0xff,0x40,0xd9,0xd5,0xfd,0x00,0x00,0x00] -v_frexp_mant_f16 v255, v1 -// GFX11: v_frexp_mant_f16_e64 v255, v1 ; encoding: [0xff,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00] +v_frexp_mant_f16 v255.h, ttmp15 +// GFX11: v_frexp_mant_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7b,0x00,0x00,0x00] -v_frexp_mant_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_frexp_mant_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_frexp_mant_f16 v255.h, v1.h +// GFX11: v_frexp_mant_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xd9,0xd5,0x01,0x01,0x00,0x00] -v_frexp_mant_f16 v255, v1 quad_perm:[3,2,1,0] -// GFX11: v_frexp_mant_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_frexp_mant_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_frexp_mant_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_frexp_mant_f16 v255, v127 -// GFX11: v_frexp_mant_f16_e64 v255, v127 ; encoding: [0xff,0x00,0xd9,0xd5,0x7f,0x01,0x00,0x00] +v_frexp_mant_f16 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX11: v_frexp_mant_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_frexp_mant_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_frexp_mant_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] +v_frexp_mant_f16 v255.h, v127.h +// GFX11: v_frexp_mant_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xd9,0xd5,0x7f,0x01,0x00,0x00] -v_frexp_mant_f16 v255, v127 quad_perm:[3,2,1,0] -// GFX11: v_frexp_mant_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] 
+v_frexp_mant_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_frexp_mant_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] -v_frexp_mant_f16 v255, vcc_hi -// GFX11: v_frexp_mant_f16_e64 v255, vcc_hi ; encoding: [0xff,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, v127.h quad_perm:[3,2,1,0] +// GFX11: v_frexp_mant_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] -v_frexp_mant_f16 v255, vcc_lo -// GFX11: v_frexp_mant_f16_e64 v255, vcc_lo ; encoding: [0xff,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, vcc_hi +// GFX11: v_frexp_mant_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x6b,0x00,0x00,0x00] -v_frexp_mant_f16 v5, v199 -// GFX11: v_frexp_mant_f16_e64 v5, v199 ; encoding: [0x05,0x00,0xd9,0xd5,0xc7,0x01,0x00,0x00] +v_frexp_mant_f16 v255.h, vcc_lo +// GFX11: v_frexp_mant_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x6a,0x00,0x00,0x00] -v_frexp_mant_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_frexp_mant_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] +v_frexp_mant_f16 v255.l, -1 +// GFX11: v_frexp_mant_f16_e64 v255.l, -1 ; encoding: [0xff,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00] -v_frexp_mant_f16 v5, v199 quad_perm:[3,2,1,0] -// GFX11: v_frexp_mant_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] +v_frexp_mant_f16 v255.l, 0.5 +// GFX11: v_frexp_mant_f16_e64 v255.l, 0.5 ; encoding: [0xff,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, exec_hi +// GFX11: v_frexp_mant_f16_e64 v255.l, exec_hi ; encoding: [0xff,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, exec_lo +// GFX11: v_frexp_mant_f16_e64 
v255.l, exec_lo ; encoding: [0xff,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, m0 +// GFX11: v_frexp_mant_f16_e64 v255.l, m0 ; encoding: [0xff,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, null +// GFX11: v_frexp_mant_f16_e64 v255.l, null ; encoding: [0xff,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, s1 +// GFX11: v_frexp_mant_f16_e64 v255.l, s1 ; encoding: [0xff,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, s105 +// GFX11: v_frexp_mant_f16_e64 v255.l, s105 ; encoding: [0xff,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, src_scc +// GFX11: v_frexp_mant_f16_e64 v255.l, src_scc ; encoding: [0xff,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, ttmp15 +// GFX11: v_frexp_mant_f16_e64 v255.l, ttmp15 ; encoding: [0xff,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, v1.l +// GFX11: v_frexp_mant_f16_e64 v255.l, v1.l ; encoding: [0xff,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00] + +v_frexp_mant_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_frexp_mant_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_frexp_mant_f16 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX11: v_frexp_mant_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_frexp_mant_f16 v255.l, v127.l +// GFX11: v_frexp_mant_f16_e64 v255.l, v127.l ; encoding: [0xff,0x00,0xd9,0xd5,0x7f,0x01,0x00,0x00] + +v_frexp_mant_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_frexp_mant_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] + +v_frexp_mant_f16 v255.l, v127.l quad_perm:[3,2,1,0] +// GFX11: v_frexp_mant_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] + 
+v_frexp_mant_f16 v255.l, vcc_hi +// GFX11: v_frexp_mant_f16_e64 v255.l, vcc_hi ; encoding: [0xff,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, vcc_lo +// GFX11: v_frexp_mant_f16_e64 v255.l, vcc_lo ; encoding: [0xff,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00] + +v_frexp_mant_f16 v5.h, v199.h +// GFX11: v_frexp_mant_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xd9,0xd5,0xc7,0x01,0x00,0x00] + +v_frexp_mant_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_frexp_mant_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_frexp_mant_f16 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX11: v_frexp_mant_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] + +v_frexp_mant_f16 v5.l, v199.l +// GFX11: v_frexp_mant_f16_e64 v5.l, v199.l ; encoding: [0x05,0x00,0xd9,0xd5,0xc7,0x01,0x00,0x00] + +v_frexp_mant_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_frexp_mant_f16 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] v_log_f16 v128, 0xfe0b // GFX11: v_log_f16_e64 v128, 0xfe0b ; encoding: [0x80,0x00,0xd7,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s index 3992b869c46d5..1a7eb2c23a7d2 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s @@ -1861,47 +1861,56 @@ v_frexp_exp_i32_f32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ v_frexp_exp_i32_f32_e64_dpp v255, -|v255| row_xmask:15 
row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_frexp_exp_i32_f32_e64_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x01,0xbf,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x05,0x30] -v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] +// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] -// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] +// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 row_mirror -// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l row_mirror +// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 row_half_mirror -// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l row_half_mirror +// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 row_shl:1 -// GFX11: 
v_frexp_mant_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:1 +// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 row_shl:15 -// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:15 +// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 row_shr:1 -// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:1 +// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 row_shr:15 -// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:15 +// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 row_ror:1 -// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:1 +// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 
row_ror:15 -// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:15 +// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: 
v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +v_frexp_mant_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: [0x05,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +v_frexp_mant_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: [0x05,0x08,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: [0xff,0xc1,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] v_frexp_mant_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX11: v_frexp_mant_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s index a123c73c73bcb..73c21ce24d15c 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s @@ -538,17 +538,26 @@ v_frexp_exp_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_frexp_exp_i32_f32_e64_dpp v255, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_frexp_exp_i32_f32_e64_dpp v255, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x01,0xbf,0xd5,0xe9,0x00,0x00,0x20,0xff,0x00,0x00,0x00] -v_frexp_mant_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 
+v_frexp_mant_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xd9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xd9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xd9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xd9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +v_frexp_mant_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: [0x05,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +v_frexp_mant_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: [0x05,0x08,0xd9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: 
[0xff,0xc1,0xd9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] v_frexp_mant_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_frexp_mant_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xc0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s index 4b055165871cf..860c0f4eca7b3 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s @@ -2347,50 +2347,59 @@ v_frexp_exp_i32_f64_e64 v5, -|src_scc| v_frexp_exp_i32_f64_e64 v255, 0xaf123456 // GFX11: v_frexp_exp_i32_f64_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xbc,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] -v_frexp_mant_f16_e64 v5, v1 -// GFX11: v_frexp_mant_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, v1.l +// GFX11: v_frexp_mant_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00] -v_frexp_mant_f16_e64 v5, v255 -// GFX11: v_frexp_mant_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, v255.l +// GFX11: v_frexp_mant_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00] -v_frexp_mant_f16_e64 v5, s1 -// GFX11: v_frexp_mant_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, s1 +// GFX11: v_frexp_mant_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, s105 -// GFX11: v_frexp_mant_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, s105 +// GFX11: v_frexp_mant_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, vcc_lo -// GFX11: v_frexp_mant_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, vcc_lo +// GFX11: v_frexp_mant_f16_e64 v5.l, vcc_lo ; encoding: 
[0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, vcc_hi -// GFX11: v_frexp_mant_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, vcc_hi +// GFX11: v_frexp_mant_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, ttmp15 -// GFX11: v_frexp_mant_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, ttmp15 +// GFX11: v_frexp_mant_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, m0 -// GFX11: v_frexp_mant_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, m0 +// GFX11: v_frexp_mant_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, exec_lo -// GFX11: v_frexp_mant_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, exec_lo +// GFX11: v_frexp_mant_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, exec_hi -// GFX11: v_frexp_mant_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, exec_hi +// GFX11: v_frexp_mant_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, null -// GFX11: v_frexp_mant_f16_e64 v5, null ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, null +// GFX11: v_frexp_mant_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, -1 -// GFX11: v_frexp_mant_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, -1 +// GFX11: v_frexp_mant_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, 0.5 mul:2 -// GFX11: v_frexp_mant_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08] 
+v_frexp_mant_f16_e64 v5.l, 0.5 mul:2 +// GFX11: v_frexp_mant_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08] -v_frexp_mant_f16_e64 v5, src_scc mul:4 -// GFX11: v_frexp_mant_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10] +v_frexp_mant_f16_e64 v5.l, src_scc mul:4 +// GFX11: v_frexp_mant_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10] -v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 -// GFX11: v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +v_frexp_mant_f16_e64 v255.l, -|0xfe0b| clamp div:2 +// GFX11: v_frexp_mant_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_frexp_mant_f16_e64 v5.h, v1.h +// GFX11: [0x05,0x48,0xd9,0xd5,0x01,0x01,0x00,0x00] + +v_frexp_mant_f16_e64 v5.l, v255.h +// GFX11: [0x05,0x08,0xd9,0xd5,0xff,0x01,0x00,0x00] + +v_frexp_mant_f16_e64 v255.h, -|0xfe0b| clamp div:2 +// GFX11: [0xff,0xc1,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] v_frexp_mant_f32_e64 v5, v1 // GFX11: v_frexp_mant_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xc0,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s index ed90e480012c0..0195c34a552e3 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s @@ -2432,50 +2432,62 @@ v_frexp_exp_i32_f64 v5, src_scc v_frexp_exp_i32_f64 v255, 0xaf123456 // GFX12: v_frexp_exp_i32_f64_e32 v255, 0xaf123456 ; encoding: [0xff,0x78,0xfe,0x7f,0x56,0x34,0x12,0xaf] -v_frexp_mant_f16 v5, v1 -// GFX12: v_frexp_mant_f16_e32 v5, v1 ; encoding: [0x01,0xb3,0x0a,0x7e] +v_frexp_mant_f16 v5.l, v1.l +// GFX12: v_frexp_mant_f16_e32 v5.l, v1.l ; encoding: [0x01,0xb3,0x0a,0x7e] -v_frexp_mant_f16 v5, v127 -// GFX12: v_frexp_mant_f16_e32 v5, v127 ; encoding: [0x7f,0xb3,0x0a,0x7e] +v_frexp_mant_f16 v5.l, v127.l +// GFX12: 
v_frexp_mant_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xb3,0x0a,0x7e] -v_frexp_mant_f16 v5, s1 -// GFX12: v_frexp_mant_f16_e32 v5, s1 ; encoding: [0x01,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, s1 +// GFX12: v_frexp_mant_f16_e32 v5.l, s1 ; encoding: [0x01,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, s105 -// GFX12: v_frexp_mant_f16_e32 v5, s105 ; encoding: [0x69,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, s105 +// GFX12: v_frexp_mant_f16_e32 v5.l, s105 ; encoding: [0x69,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, vcc_lo -// GFX12: v_frexp_mant_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, vcc_lo +// GFX12: v_frexp_mant_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, vcc_hi -// GFX12: v_frexp_mant_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, vcc_hi +// GFX12: v_frexp_mant_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, ttmp15 -// GFX12: v_frexp_mant_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, ttmp15 +// GFX12: v_frexp_mant_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, m0 -// GFX12: v_frexp_mant_f16_e32 v5, m0 ; encoding: [0x7d,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, m0 +// GFX12: v_frexp_mant_f16_e32 v5.l, m0 ; encoding: [0x7d,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, exec_lo -// GFX12: v_frexp_mant_f16_e32 v5, exec_lo ; encoding: [0x7e,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, exec_lo +// GFX12: v_frexp_mant_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, exec_hi -// GFX12: v_frexp_mant_f16_e32 v5, exec_hi ; encoding: [0x7f,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, exec_hi +// GFX12: v_frexp_mant_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, null -// GFX12: v_frexp_mant_f16_e32 v5, null ; encoding: [0x7c,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, null +// GFX12: v_frexp_mant_f16_e32 v5.l, null ; encoding: [0x7c,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, -1 
-// GFX12: v_frexp_mant_f16_e32 v5, -1 ; encoding: [0xc1,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, -1 +// GFX12: v_frexp_mant_f16_e32 v5.l, -1 ; encoding: [0xc1,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, 0.5 -// GFX12: v_frexp_mant_f16_e32 v5, 0.5 ; encoding: [0xf0,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, 0.5 +// GFX12: v_frexp_mant_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, src_scc -// GFX12: v_frexp_mant_f16_e32 v5, src_scc ; encoding: [0xfd,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, src_scc +// GFX12: v_frexp_mant_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v127, 0xfe0b -// GFX12: v_frexp_mant_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +v_frexp_mant_f16 v127.l, 0xfe0b +// GFX12: v_frexp_mant_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +v_frexp_mant_f16 v5.l, v1.h +// GFX12: v_frexp_mant_f16_e32 v5.l, v1.h ; encoding: [0x81,0xb3,0x0a,0x7e] + +v_frexp_mant_f16 v5.l, v127.h +// GFX12: v_frexp_mant_f16_e32 v5.l, v127.h ; encoding: [0xff,0xb3,0x0a,0x7e] + +v_frexp_mant_f16 v5.h, src_scc +// GFX12: v_frexp_mant_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xb2,0x0a,0x7f] + +v_frexp_mant_f16 v127.h, 0xfe0b +// GFX12: v_frexp_mant_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xb2,0xfe,0x7f,0x0b,0xfe,0x00,0x00] v_frexp_mant_f32 v5, v1 // GFX12: v_frexp_mant_f32_e32 v5, v1 ; encoding: [0x01,0x81,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s index 90968055e2a82..072544e66e4a5 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s @@ -1834,47 +1834,53 @@ v_frexp_exp_i32_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 f v_frexp_exp_i32_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_frexp_exp_i32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: 
[0xfa,0x7e,0xfe,0x7f,0xff,0x6f,0x35,0x30] -v_frexp_mant_f16 v5, v1 quad_perm:[3,2,1,0] -// GFX12: v_frexp_mant_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff] +v_frexp_mant_f16 v5.l, v1.l quad_perm:[3,2,1,0] +// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff] -v_frexp_mant_f16 v5, v1 quad_perm:[0,1,2,3] -// GFX12: v_frexp_mant_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff] +v_frexp_mant_f16 v5.l, v1.l quad_perm:[0,1,2,3] +// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff] -v_frexp_mant_f16 v5, v1 row_mirror -// GFX12: v_frexp_mant_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff] +v_frexp_mant_f16 v5.l, v1.l row_mirror +// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff] -v_frexp_mant_f16 v5, v1 row_half_mirror -// GFX12: v_frexp_mant_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff] +v_frexp_mant_f16 v5.l, v1.l row_half_mirror +// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff] -v_frexp_mant_f16 v5, v1 row_shl:1 -// GFX12: v_frexp_mant_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff] +v_frexp_mant_f16 v5.l, v1.l row_shl:1 +// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff] -v_frexp_mant_f16 v5, v1 row_shl:15 -// GFX12: v_frexp_mant_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff] +v_frexp_mant_f16 v5.l, v1.l row_shl:15 +// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff] -v_frexp_mant_f16 v5, v1 row_shr:1 -// GFX12: v_frexp_mant_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff] +v_frexp_mant_f16 v5.l, v1.l row_shr:1 +// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff] -v_frexp_mant_f16 v5, v1 row_shr:15 -// GFX12: v_frexp_mant_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff] +v_frexp_mant_f16 v5.l, v1.l row_shr:15 +// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff] -v_frexp_mant_f16 v5, v1 row_ror:1 -// GFX12: v_frexp_mant_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff] +v_frexp_mant_f16 v5.l, v1.l row_ror:1 +// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff] -v_frexp_mant_f16 v5, v1 row_ror:15 -// GFX12: v_frexp_mant_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff] +v_frexp_mant_f16 v5.l, v1.l row_ror:15 +// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff] -v_frexp_mant_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_frexp_mant_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff] +v_frexp_mant_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff] -v_frexp_mant_f16 v5, 
v1 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_frexp_mant_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01] +v_frexp_mant_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01] -v_frexp_mant_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_frexp_mant_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x09,0x13] +v_frexp_mant_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x09,0x13] -v_frexp_mant_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_frexp_mant_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +v_frexp_mant_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_frexp_mant_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x35,0x30] + +v_frexp_mant_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_frexp_mant_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xb2,0x0a,0x7f,0x81,0x60,0x09,0x13] + +v_frexp_mant_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_frexp_mant_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7f,0xff,0x6f,0x35,0x30] v_frexp_mant_f32 v5, v1 quad_perm:[3,2,1,0] // GFX12: v_frexp_mant_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git 
a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s index 0ce0087918f56..bc3559e3c65ed 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s @@ -448,14 +448,20 @@ v_frexp_exp_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_frexp_exp_i32_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_frexp_exp_i32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x7e,0xfe,0x7f,0xff,0x00,0x00,0x00] -v_frexp_mant_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_frexp_mant_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_frexp_mant_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_frexp_mant_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_frexp_mant_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_frexp_mant_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_frexp_mant_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_frexp_mant_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +v_frexp_mant_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_frexp_mant_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +v_frexp_mant_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_frexp_mant_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xb2,0x0a,0x7f,0x81,0x77,0x39,0x05] + +v_frexp_mant_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_frexp_mant_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xb2,0xfe,0x7f,0xff,0x00,0x00,0x00] v_frexp_mant_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_frexp_mant_f32_dpp v5, v1 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x80,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s index 92a0d15bbc6f0..0d759baf0af0d 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s @@ -494,6 +494,12 @@ v_frexp_exp_i16_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0] v_frexp_mant_f16_e32 v128, 0xfe0b // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_frexp_mant_f16_e32 v128.h, 0xfe0b +// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v128.l, 0xfe0b +// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction + v_frexp_mant_f16_e32 v255, v1 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -503,6 +509,24 @@ v_frexp_mant_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0] v_frexp_mant_f16_e32 v255, v1 quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:31: error: invalid operand for instruction +v_frexp_mant_f16_e32 v255.h, v1.h +// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v255.l, v1.l +// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction + v_frexp_mant_f16_e32 v5, v199 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -512,6 +536,24 @@ v_frexp_mant_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_frexp_mant_f16_e32 v5, v199 quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:31: error: 
invalid operand for instruction +v_frexp_mant_f16_e32 v5.h, v199.h +// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v5.l, v199.l +// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction + v_log_f16_e32 v128, 0xfe0b // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s index bbe7b65d03281..976b6bb69c33e 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s @@ -1366,71 +1366,137 @@ v_frexp_exp_i16_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] v_frexp_exp_i16_f16 v5.l, v199.l quad_perm:[3,2,1,0] // GFX12: v_frexp_exp_i16_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xda,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] -v_frexp_mant_f16 v128, 0xfe0b -// GFX12: v_frexp_mant_f16_e64 v128, 0xfe0b ; encoding: [0x80,0x00,0xd9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_frexp_mant_f16 v128.h, 0xfe0b +// GFX12: v_frexp_mant_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xd9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_frexp_mant_f16 v255, -1 -// GFX12: v_frexp_mant_f16_e64 v255, -1 ; encoding: [0xff,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00] +v_frexp_mant_f16 v128.l, 0xfe0b +// GFX12: v_frexp_mant_f16_e64 v128.l, 0xfe0b ; encoding: 
[0x80,0x00,0xd9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_frexp_mant_f16 v255, 0.5 -// GFX12: v_frexp_mant_f16_e64 v255, 0.5 ; encoding: [0xff,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, -1 +// GFX12: v_frexp_mant_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0xc1,0x00,0x00,0x00] -v_frexp_mant_f16 v255, exec_hi -// GFX12: v_frexp_mant_f16_e64 v255, exec_hi ; encoding: [0xff,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, 0.5 +// GFX12: v_frexp_mant_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0xf0,0x00,0x00,0x00] -v_frexp_mant_f16 v255, exec_lo -// GFX12: v_frexp_mant_f16_e64 v255, exec_lo ; encoding: [0xff,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, exec_hi +// GFX12: v_frexp_mant_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7f,0x00,0x00,0x00] -v_frexp_mant_f16 v255, m0 -// GFX12: v_frexp_mant_f16_e64 v255, m0 ; encoding: [0xff,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, exec_lo +// GFX12: v_frexp_mant_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7e,0x00,0x00,0x00] -v_frexp_mant_f16 v255, null -// GFX12: v_frexp_mant_f16_e64 v255, null ; encoding: [0xff,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, m0 +// GFX12: v_frexp_mant_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7d,0x00,0x00,0x00] -v_frexp_mant_f16 v255, s1 -// GFX12: v_frexp_mant_f16_e64 v255, s1 ; encoding: [0xff,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, null +// GFX12: v_frexp_mant_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7c,0x00,0x00,0x00] -v_frexp_mant_f16 v255, s105 -// GFX12: v_frexp_mant_f16_e64 v255, s105 ; encoding: [0xff,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, s1 +// GFX12: v_frexp_mant_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x01,0x00,0x00,0x00] -v_frexp_mant_f16 v255, src_scc -// GFX12: 
v_frexp_mant_f16_e64 v255, src_scc ; encoding: [0xff,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, s105 +// GFX12: v_frexp_mant_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x69,0x00,0x00,0x00] -v_frexp_mant_f16 v255, ttmp15 -// GFX12: v_frexp_mant_f16_e64 v255, ttmp15 ; encoding: [0xff,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, src_scc +// GFX12: v_frexp_mant_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0xfd,0x00,0x00,0x00] -v_frexp_mant_f16 v255, v1 -// GFX12: v_frexp_mant_f16_e64 v255, v1 ; encoding: [0xff,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00] +v_frexp_mant_f16 v255.h, ttmp15 +// GFX12: v_frexp_mant_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7b,0x00,0x00,0x00] -v_frexp_mant_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_frexp_mant_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_frexp_mant_f16 v255.h, v1.h +// GFX12: v_frexp_mant_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xd9,0xd5,0x01,0x01,0x00,0x00] -v_frexp_mant_f16 v255, v1 quad_perm:[3,2,1,0] -// GFX12: v_frexp_mant_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_frexp_mant_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_frexp_mant_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_frexp_mant_f16 v255, v127 -// GFX12: v_frexp_mant_f16_e64 v255, v127 ; encoding: [0xff,0x00,0xd9,0xd5,0x7f,0x01,0x00,0x00] +v_frexp_mant_f16 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX12: v_frexp_mant_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_frexp_mant_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_frexp_mant_f16_e64_dpp v255, v127 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] +v_frexp_mant_f16 v255.h, v127.h +// GFX12: v_frexp_mant_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xd9,0xd5,0x7f,0x01,0x00,0x00] -v_frexp_mant_f16 v255, v127 quad_perm:[3,2,1,0] -// GFX12: v_frexp_mant_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] +v_frexp_mant_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_frexp_mant_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] -v_frexp_mant_f16 v255, vcc_hi -// GFX12: v_frexp_mant_f16_e64 v255, vcc_hi ; encoding: [0xff,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, v127.h quad_perm:[3,2,1,0] +// GFX12: v_frexp_mant_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] -v_frexp_mant_f16 v255, vcc_lo -// GFX12: v_frexp_mant_f16_e64 v255, vcc_lo ; encoding: [0xff,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, vcc_hi +// GFX12: v_frexp_mant_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x6b,0x00,0x00,0x00] -v_frexp_mant_f16 v5, v199 -// GFX12: v_frexp_mant_f16_e64 v5, v199 ; encoding: [0x05,0x00,0xd9,0xd5,0xc7,0x01,0x00,0x00] +v_frexp_mant_f16 v255.h, vcc_lo +// GFX12: v_frexp_mant_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x6a,0x00,0x00,0x00] -v_frexp_mant_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_frexp_mant_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] +v_frexp_mant_f16 v255.l, -1 +// GFX12: v_frexp_mant_f16_e64 v255.l, -1 ; encoding: [0xff,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00] -v_frexp_mant_f16 v5, v199 quad_perm:[3,2,1,0] -// GFX12: v_frexp_mant_f16_e64_dpp v5, 
v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] +v_frexp_mant_f16 v255.l, 0.5 +// GFX12: v_frexp_mant_f16_e64 v255.l, 0.5 ; encoding: [0xff,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, exec_hi +// GFX12: v_frexp_mant_f16_e64 v255.l, exec_hi ; encoding: [0xff,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, exec_lo +// GFX12: v_frexp_mant_f16_e64 v255.l, exec_lo ; encoding: [0xff,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, m0 +// GFX12: v_frexp_mant_f16_e64 v255.l, m0 ; encoding: [0xff,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, null +// GFX12: v_frexp_mant_f16_e64 v255.l, null ; encoding: [0xff,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, s1 +// GFX12: v_frexp_mant_f16_e64 v255.l, s1 ; encoding: [0xff,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, s105 +// GFX12: v_frexp_mant_f16_e64 v255.l, s105 ; encoding: [0xff,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, src_scc +// GFX12: v_frexp_mant_f16_e64 v255.l, src_scc ; encoding: [0xff,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, ttmp15 +// GFX12: v_frexp_mant_f16_e64 v255.l, ttmp15 ; encoding: [0xff,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, v1.l +// GFX12: v_frexp_mant_f16_e64 v255.l, v1.l ; encoding: [0xff,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00] + +v_frexp_mant_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_frexp_mant_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_frexp_mant_f16 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX12: v_frexp_mant_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_frexp_mant_f16 v255.l, v127.l +// GFX12: v_frexp_mant_f16_e64 v255.l, v127.l ; encoding: 
[0xff,0x00,0xd9,0xd5,0x7f,0x01,0x00,0x00] + +v_frexp_mant_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_frexp_mant_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] + +v_frexp_mant_f16 v255.l, v127.l quad_perm:[3,2,1,0] +// GFX12: v_frexp_mant_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] + +v_frexp_mant_f16 v255.l, vcc_hi +// GFX12: v_frexp_mant_f16_e64 v255.l, vcc_hi ; encoding: [0xff,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, vcc_lo +// GFX12: v_frexp_mant_f16_e64 v255.l, vcc_lo ; encoding: [0xff,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00] + +v_frexp_mant_f16 v5.h, v199.h +// GFX12: v_frexp_mant_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xd9,0xd5,0xc7,0x01,0x00,0x00] + +v_frexp_mant_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_frexp_mant_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_frexp_mant_f16 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX12: v_frexp_mant_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] + +v_frexp_mant_f16 v5.l, v199.l +// GFX12: v_frexp_mant_f16_e64 v5.l, v199.l ; encoding: [0x05,0x00,0xd9,0xd5,0xc7,0x01,0x00,0x00] + +v_frexp_mant_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_frexp_mant_f16 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] v_log_f16 v128, 0xfe0b // GFX12: v_log_f16_e64 v128, 0xfe0b ; encoding: 
[0x80,0x00,0xd7,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s index 5af15f2eb971f..e4f62eadc0e49 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s @@ -2497,50 +2497,59 @@ v_frexp_exp_i32_f64_e64 v5, -|src_scc| v_frexp_exp_i32_f64_e64 v255, 0xaf123456 // GFX12: v_frexp_exp_i32_f64_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xbc,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] -v_frexp_mant_f16_e64 v5, v1 -// GFX12: v_frexp_mant_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, v1.l +// GFX12: v_frexp_mant_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00] -v_frexp_mant_f16_e64 v5, v255 -// GFX12: v_frexp_mant_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, v255.l +// GFX12: v_frexp_mant_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00] -v_frexp_mant_f16_e64 v5, s1 -// GFX12: v_frexp_mant_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, s1 +// GFX12: v_frexp_mant_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, s105 -// GFX12: v_frexp_mant_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, s105 +// GFX12: v_frexp_mant_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, vcc_lo -// GFX12: v_frexp_mant_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, vcc_lo +// GFX12: v_frexp_mant_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, vcc_hi -// GFX12: v_frexp_mant_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, vcc_hi +// 
GFX12: v_frexp_mant_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, ttmp15 -// GFX12: v_frexp_mant_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, ttmp15 +// GFX12: v_frexp_mant_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, m0 -// GFX12: v_frexp_mant_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, m0 +// GFX12: v_frexp_mant_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, exec_lo -// GFX12: v_frexp_mant_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, exec_lo +// GFX12: v_frexp_mant_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, exec_hi -// GFX12: v_frexp_mant_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, exec_hi +// GFX12: v_frexp_mant_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, null -// GFX12: v_frexp_mant_f16_e64 v5, null ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, null +// GFX12: v_frexp_mant_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, -1 -// GFX12: v_frexp_mant_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, -1 +// GFX12: v_frexp_mant_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, 0.5 mul:2 -// GFX12: v_frexp_mant_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08] +v_frexp_mant_f16_e64 v5.l, 0.5 mul:2 +// GFX12: v_frexp_mant_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08] -v_frexp_mant_f16_e64 v5, src_scc mul:4 -// GFX12: v_frexp_mant_f16_e64 v5, 
src_scc mul:4 ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10] +v_frexp_mant_f16_e64 v5.l, src_scc mul:4 +// GFX12: v_frexp_mant_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10] -v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 -// GFX12: v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +v_frexp_mant_f16_e64 v255.l, -|0xfe0b| clamp div:2 +// GFX12: v_frexp_mant_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_frexp_mant_f16_e64 v5.h, v1.h +// GFX12: v_frexp_mant_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xd9,0xd5,0x01,0x01,0x00,0x00] + +v_frexp_mant_f16_e64 v5.l, v255.h +// GFX12: v_frexp_mant_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xd9,0xd5,0xff,0x01,0x00,0x00] + +v_frexp_mant_f16_e64 v255.h, -|0xfe0b| clamp div:2 +// GFX12: v_frexp_mant_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] v_frexp_mant_f32_e64 v5, v1 // GFX12: v_frexp_mant_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xc0,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s index 39638cefd44ad..fb57e5cd54ab8 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s @@ -1876,47 +1876,56 @@ v_frexp_exp_i32_f32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ v_frexp_exp_i32_f32_e64_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_frexp_exp_i32_f32_e64_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x01,0xbf,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x05,0x30] -v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] -// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 row_mirror -// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l row_mirror +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 row_half_mirror -// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l row_half_mirror +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 row_shl:1 -// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:1 +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 
-v_frexp_mant_f16_e64_dpp v5, v1 row_shl:15 -// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:15 +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 row_shr:1 -// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:1 +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 row_shr:15 -// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:15 +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 row_ror:1 -// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:1 +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 row_ror:15 -// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:15 +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: 
v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +v_frexp_mant_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_frexp_mant_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +v_frexp_mant_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x08,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc1,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] v_frexp_mant_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX12: v_frexp_mant_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s index a6cef6f134b0a..acb73d8dbaf73 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s @@ -553,17 +553,26 @@ v_frexp_exp_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_frexp_exp_i32_f32_e64_dpp v255, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_frexp_exp_i32_f32_e64_dpp v255, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x01,0xbf,0xd5,0xe9,0x00,0x00,0x20,0xff,0x00,0x00,0x00] -v_frexp_mant_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_frexp_mant_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xd9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xd9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xd9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xd9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +v_frexp_mant_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_frexp_mant_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +v_frexp_mant_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 
dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x08,0xd9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc1,0xd9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] v_frexp_mant_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_frexp_mant_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xc0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt index 0abced9f2f77b..55b2081c04917 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt @@ -2290,49 +2290,82 @@ # GFX11: v_frexp_exp_i32_f64_e32 v255, 0xaf123456 ; encoding: [0xff,0x78,0xfe,0x7f,0x56,0x34,0x12,0xaf] 0x01,0xb3,0x0a,0x7e -# GFX11: v_frexp_mant_f16_e32 v5, v1 ; encoding: [0x01,0xb3,0x0a,0x7e] +# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, v1.l ; encoding: [0x01,0xb3,0x0a,0x7e] +# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, v1 ; encoding: [0x01,0xb3,0x0a,0x7e] 0x7f,0xb3,0x0a,0x7e -# GFX11: v_frexp_mant_f16_e32 v5, v127 ; encoding: [0x7f,0xb3,0x0a,0x7e] +# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xb3,0x0a,0x7e] +# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, v127 ; encoding: [0x7f,0xb3,0x0a,0x7e] 0x01,0xb2,0x0a,0x7e -# GFX11: v_frexp_mant_f16_e32 v5, s1 ; encoding: [0x01,0xb2,0x0a,0x7e] +# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, s1 ; encoding: [0x01,0xb2,0x0a,0x7e] +# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, s1 ; encoding: [0x01,0xb2,0x0a,0x7e] 0x69,0xb2,0x0a,0x7e -# GFX11: v_frexp_mant_f16_e32 v5, s105 ; encoding: [0x69,0xb2,0x0a,0x7e] +# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, s105 ; encoding: [0x69,0xb2,0x0a,0x7e] +# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, s105 ; encoding: [0x69,0xb2,0x0a,0x7e] 
0x6a,0xb2,0x0a,0x7e -# GFX11: v_frexp_mant_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xb2,0x0a,0x7e] +# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xb2,0x0a,0x7e] +# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xb2,0x0a,0x7e] 0x6b,0xb2,0x0a,0x7e -# GFX11: v_frexp_mant_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xb2,0x0a,0x7e] +# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xb2,0x0a,0x7e] +# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xb2,0x0a,0x7e] 0x7b,0xb2,0x0a,0x7e -# GFX11: v_frexp_mant_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xb2,0x0a,0x7e] +# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xb2,0x0a,0x7e] +# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xb2,0x0a,0x7e] 0x7d,0xb2,0x0a,0x7e -# GFX11: v_frexp_mant_f16_e32 v5, m0 ; encoding: [0x7d,0xb2,0x0a,0x7e] +# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, m0 ; encoding: [0x7d,0xb2,0x0a,0x7e] +# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, m0 ; encoding: [0x7d,0xb2,0x0a,0x7e] 0x7e,0xb2,0x0a,0x7e -# GFX11: v_frexp_mant_f16_e32 v5, exec_lo ; encoding: [0x7e,0xb2,0x0a,0x7e] +# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xb2,0x0a,0x7e] +# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, exec_lo ; encoding: [0x7e,0xb2,0x0a,0x7e] 0x7f,0xb2,0x0a,0x7e -# GFX11: v_frexp_mant_f16_e32 v5, exec_hi ; encoding: [0x7f,0xb2,0x0a,0x7e] +# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xb2,0x0a,0x7e] +# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, exec_hi ; encoding: [0x7f,0xb2,0x0a,0x7e] 0x7c,0xb2,0x0a,0x7e -# GFX11: v_frexp_mant_f16_e32 v5, null ; encoding: [0x7c,0xb2,0x0a,0x7e] +# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, null ; encoding: [0x7c,0xb2,0x0a,0x7e] +# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, null ; encoding: [0x7c,0xb2,0x0a,0x7e] 0xc1,0xb2,0x0a,0x7e -# GFX11: v_frexp_mant_f16_e32 v5, -1 ; encoding: [0xc1,0xb2,0x0a,0x7e] +# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, -1 ; encoding: 
[0xc1,0xb2,0x0a,0x7e] +# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, -1 ; encoding: [0xc1,0xb2,0x0a,0x7e] 0xf0,0xb2,0x0a,0x7e -# GFX11: v_frexp_mant_f16_e32 v5, 0.5 ; encoding: [0xf0,0xb2,0x0a,0x7e] +# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xb2,0x0a,0x7e] +# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, 0.5 ; encoding: [0xf0,0xb2,0x0a,0x7e] 0xfd,0xb2,0x0a,0x7e -# GFX11: v_frexp_mant_f16_e32 v5, src_scc ; encoding: [0xfd,0xb2,0x0a,0x7e] +# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xb2,0x0a,0x7e] +# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, src_scc ; encoding: [0xfd,0xb2,0x0a,0x7e] 0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00 -# GFX11: v_frexp_mant_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_frexp_mant_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_frexp_mant_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +0x81,0xb3,0x0a,0x7e +# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, v1.h ; encoding: [0x81,0xb3,0x0a,0x7e] +# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0x81,0xb3,0x0a,0x7e] + +0xff,0xb3,0x0a,0x7e +# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, v127.h ; encoding: [0xff,0xb3,0x0a,0x7e] +# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0xff,0xb3,0x0a,0x7e] + +0xf0,0xb2,0xfe,0x7e +# GFX11-REAL16: v_frexp_mant_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xb2,0xfe,0x7e] +# GFX11-FAKE16: v_frexp_mant_f16_e32 v127, 0.5 ; encoding: [0xf0,0xb2,0xfe,0x7e] + +0xfd,0xb2,0x0a,0x7f +# GFX11-REAL16: v_frexp_mant_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xb2,0x0a,0x7f] + +0xff,0xb2,0xfe,0x7f,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_frexp_mant_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xb2,0xfe,0x7f,0x0b,0xfe,0x00,0x00] 0x01,0x81,0x0a,0x7e # GFX11: v_frexp_mant_f32_e32 v5, v1 ; encoding: 
[0x01,0x81,0x0a,0x7e] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt index 7043f3b2b9f29..d2e1e926cc19e 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt @@ -1727,46 +1727,72 @@ # GFX11: v_frexp_exp_i32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x7e,0xfe,0x7f,0xff,0x6f,0x3d,0x30] 0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX11: v_frexp_mant_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX11: v_frexp_mant_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX11: v_frexp_mant_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX11: v_frexp_mant_f16_dpp v5, v1 
row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX11: v_frexp_mant_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX11: v_frexp_mant_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX11: v_frexp_mant_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX11: v_frexp_mant_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX11: v_frexp_mant_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX11: v_frexp_mant_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX11: v_frexp_mant_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX11: v_frexp_mant_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01] 
0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX11: v_frexp_mant_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30 -# GFX11: v_frexp_mant_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX11-REAL16: v_frexp_mant_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX11-FAKE16: v_frexp_mant_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] + +0xfa,0xb2,0xfe,0x7e,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_frexp_mant_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_frexp_mant_f16_dpp v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x5f,0x01,0x01] + +0xfa,0xb2,0x0a,0x7f,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_frexp_mant_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb2,0x0a,0x7f,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_mul_i32_i24_e32 v128, 1, v176 ; encoding: [0x81,0x60,0x01,0x13] + +0xfa,0xb2,0xfe,0x7f,0xff,0x6f,0x3d,0x30 +# GFX11-REAL16: v_frexp_mant_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7f,0xff,0x6f,0x3d,0x30] +# GFX11-FAKE16: v_lshlrev_b32_e32 v30, v255, v183 ; encoding: [0xff,0x6f,0x3d,0x30] 0xfa,0x80,0x0a,0x7e,0x01,0x1b,0x00,0xff # GFX11: v_frexp_mant_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt index d2eb919849fd3..93fb5e2b4c01a 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt @@ -335,14 +335,33 @@ # GFX11: v_frexp_exp_i32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x7e,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX11: v_frexp_mant_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xea,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00 -# GFX11: v_frexp_mant_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_frexp_mant_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_frexp_mant_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16-REAL16: v_frexp_mant_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX11-REAL16-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0x80,0x0a,0x7e,0x01,0x77,0x39,0x05 # GFX11: v_frexp_mant_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x80,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX11-FAKE16-REAL16: v_frexp_mant_f16_dpp v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0xfe,0x7e,0x7f,0x77,0x39,0x05] +# GFX11-REAL16-REAL16: v_frexp_mant_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0xe9,0xb2,0xfe,0x7e,0x7f,0x77,0x39,0x05] +0xe9,0xb2,0xfe,0x7e,0x7f,0x77,0x39,0x05 +# GFX11-REAL16: v_frexp_mant_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0xfe,0x7e,0x7f,0x77,0x39,0x05] +# GFX11-FAKE16: v_frexp_mant_f16_dpp v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0xfe,0x7e,0x7f,0x77,0x39,0x05] + +# GFX11-REAL16-REAL16: v_frexp_mant_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7f,0x81,0x77,0x39,0x05] +0xe9,0xb2,0x0a,0x7f,0x81,0x77,0x39,0x05 +# GFX11-REAL16: v_frexp_mant_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7f,0x81,0x77,0x39,0x05] +# GFX11-FAKE16: v_dot2acc_f32_f16 v156, v129, v187 ; encoding: [0x81,0x77,0x39,0x05] + +# GFX11-REAL16-REAL16: v_frexp_mant_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb2,0xfe,0x7f,0xff,0x00,0x00,0x00] +0xea,0xb2,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_frexp_mant_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb2,0xfe,0x7f,0xff,0x00,0x00,0x00] + 0xea,0x80,0xfe,0x7f,0xff,0x00,0x00,0x00 # GFX11: v_frexp_mant_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x80,0xfe,0x7f,0xff,0x00,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt index 5c3fde7b80556..74d875081d113 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt @@ -1853,46 +1853,72 @@ # GFX11: v_frexp_exp_i32_f32_e64_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x01,0xbf,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# 
GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; 
encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 -# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] 0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 -# GFX11: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] + +0x05,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +0x05,0x08,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] + +0xff,0xc1,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xc0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff # GFX11: v_frexp_mant_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt index 28b39f4b0344a..a4bdfe9f4a975 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt @@ -505,16 +505,32 @@ # GFX11: v_frexp_exp_i32_f32_e64_dpp v255, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x01,0xbf,0xd5,0xea,0x00,0x00,0x20,0xff,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] 0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX11: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +0x05,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +0x05,0x08,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +0xff,0xc1,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0x05,0x00,0xc0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX11: v_frexp_mant_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xc0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt index d078bc2b8cb04..7c4f1634026fd 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt @@ -2316,49 +2316,76 @@ # GFX11: v_frexp_exp_i32_f64_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xbc,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] 0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00 -# GFX11: v_frexp_mant_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00] +# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00] +# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, v1 ; encoding: 
[0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00 -# GFX11: v_frexp_mant_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00] +# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00] +# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00 -# GFX11: v_frexp_mant_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00] +# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00] +# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00 -# GFX11: v_frexp_mant_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00] +# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00] +# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00 -# GFX11: v_frexp_mant_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00] +# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00] +# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00 -# GFX11: v_frexp_mant_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00] +# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00] +# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00 -# GFX11: v_frexp_mant_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00] +# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00] +# GFX11-FAKE16: 
v_frexp_mant_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00 -# GFX11: v_frexp_mant_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00] +# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00] +# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00 -# GFX11: v_frexp_mant_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00] +# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00] +# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00 -# GFX11: v_frexp_mant_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00 -# GFX11: v_frexp_mant_f16_e64 v5, null ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00] +# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00] +# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, null ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00 -# GFX11: v_frexp_mant_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00] +# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00] +# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08 -# GFX11: v_frexp_mant_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08] +# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, 0.5 mul:2 ; encoding: 
[0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08] +# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08] 0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10 -# GFX11: v_frexp_mant_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10] +# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10] +# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10] 0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 -# GFX11: v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_frexp_mant_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +0x05,0x48,0xd9,0xd5,0x01,0x01,0x00,0x00 +# GFX11-REAL16: v_frexp_mant_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xd9,0xd5,0x01,0x01,0x00,0x00] +# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x08,0xd9,0xd5,0xff,0x01,0x00,0x00 +# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xd9,0xd5,0xff,0x01,0x00,0x00] +# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00] + +0xff,0xc1,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_frexp_mant_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] 0x05,0x00,0xc0,0xd5,0x01,0x01,0x00,0x00 # GFX11: v_frexp_mant_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xc0,0xd5,0x01,0x01,0x00,0x00] 
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt index 46dedd970a320..24dc882e8beb0 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt @@ -1839,46 +1839,68 @@ # GFX12: v_frexp_exp_i32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x7e,0xfe,0x7f,0xff,0x6f,0x3d,0x30] 0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX12: v_frexp_mant_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX12: v_frexp_mant_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX12: v_frexp_mant_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX12: v_frexp_mant_f16_dpp v5, v1 row_half_mirror row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX12: v_frexp_mant_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX12: v_frexp_mant_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX12: v_frexp_mant_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX12: v_frexp_mant_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX12: v_frexp_mant_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX12: v_frexp_mant_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX12: v_frexp_mant_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX12: v_frexp_mant_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01] 
0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX12: v_frexp_mant_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30 -# GFX12: v_frexp_mant_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-REAL16: v_frexp_mant_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-FAKE16: v_frexp_mant_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] + +0xfa,0xb2,0x0a,0x7f,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_frexp_mant_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb2,0x0a,0x7f,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_mul_i32_i24_e32 v128, 1, v176 ; encoding: [0x81,0x60,0x01,0x13] + +0xfa,0xb2,0xfe,0x7f,0xff,0x6f,0x3d,0x30 +# GFX12-REAL16: v_frexp_mant_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7f,0xff,0x6f,0x3d,0x30] +# GFX12-FAKE16: v_lshlrev_b32_e32 v30, v255, v183 ; encoding: [0xff,0x6f,0x3d,0x30] 0xfa,0x80,0x0a,0x7e,0x01,0x1b,0x00,0xff # GFX12: v_frexp_mant_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt index 551dab7ec3e7c..2eeb220b913fd 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt +++ 
b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt @@ -348,10 +348,19 @@ # GFX12: v_frexp_exp_i32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x7e,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX12: v_frexp_mant_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xea,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00 -# GFX12: v_frexp_mant_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_frexp_mant_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_frexp_mant_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +0xe9,0xb2,0x0a,0x7f,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_frexp_mant_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7f,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_add_f64_e32 v[156:157], v[129:130], v[187:188] ; encoding: [0x81,0x77,0x39,0x05] + +0xea,0xb2,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_frexp_mant_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb2,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0x80,0x0a,0x7e,0x01,0x77,0x39,0x05 # GFX12: v_frexp_mant_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x80,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt index 0d01be721e60d..661d072f46c1a 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt @@ -2368,50 +2368,76 @@ # GFX12: 
v_frexp_exp_i32_f64_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xbc,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] 0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00 -# GFX12: v_frexp_mant_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00] +# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00 -# GFX12: v_frexp_mant_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00] +# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00 -# GFX12: v_frexp_mant_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00] +# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00] +# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00 -# GFX12: v_frexp_mant_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00] +# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00] +# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00 -# GFX12: v_frexp_mant_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00 -# GFX12: v_frexp_mant_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, vcc_hi ; 
encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00 -# GFX12: v_frexp_mant_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00 -# GFX12: v_frexp_mant_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00 -# GFX12: v_frexp_mant_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00 -# GFX12: v_frexp_mant_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00 -# GFX12: v_frexp_mant_f16_e64 v5, null ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, null ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00 -# GFX12: v_frexp_mant_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00] +# 
GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08 -# GFX12: v_frexp_mant_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08] 0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10 -# GFX12: v_frexp_mant_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10] 0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 -# GFX12: v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] - +# GFX12-REAL16: v_frexp_mant_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +0x05,0x48,0xd9,0xd5,0x01,0x01,0x00,0x00 +# GFX12-REAL16: v_frexp_mant_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xd9,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00] +0x05,0x08,0xd9,0xd5,0xff,0x01,0x00,0x00 +# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xd9,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00] +0xff,0xc1,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: 
v_frexp_mant_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +# GFX11: v_frexp_mant_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xd9,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xc0,0xd5,0x01,0x01,0x00,0x00 # GFX12: v_frexp_mant_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xc0,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt index d501d62c006eb..a1e431bc49d34 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt @@ -1859,47 +1859,72 @@ # GFX12: v_frexp_exp_i32_f32_e64_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x01,0xbf,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 -# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 
row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] 0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 -# GFX12: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] - +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] + +0x05,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +0x05,0x08,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +0xff,0xc1,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0xff,0xc1,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] + +# GFX11: v_frexp_mant_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xc0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff # GFX12: v_frexp_mant_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt index aba7d3ff43d8b..405b716c110e1 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt @@ -523,17 +523,32 @@ # GFX12: v_frexp_exp_i32_f32_e64_dpp v255, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x01,0xbf,0xd5,0xea,0x00,0x00,0x20,0xff,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] 0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX12: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] - +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +0x05,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +0x05,0x08,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +0xff,0xc1,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +# GFX11: v_frexp_mant_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xc0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX12: v_frexp_mant_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xc0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] From 9d8e634e85ca46fbec07733d3e69d34c0d7814ac Mon Sep 17 00:00:00 2001 From: Jeff Niu Date: Fri, 3 Jan 2025 11:44:46 -0800 Subject: [PATCH 395/567] [mlir][scf] Always remove for iter args that are loop invariant (#121555) This alters the condition in ForOpIterArgsFolder to always remove iter args when their initial value equals the yielded value, not just when the arg has no use. --- mlir/lib/Dialect/SCF/IR/SCF.cpp | 31 +++++++++++-------------- mlir/test/Dialect/SCF/canonicalize.mlir | 22 ++++++++++++++---- 2 files changed, 32 insertions(+), 21 deletions(-) diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp index eded1c394f126..83ae79ce48266 100644 --- a/mlir/lib/Dialect/SCF/IR/SCF.cpp +++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp @@ -839,8 +839,7 @@ mlir::scf::replaceAndCastForOpIterArg(RewriterBase &rewriter, scf::ForOp forOp, namespace { // Fold away ForOp iter arguments when: // 1) The op yields the iter arguments. -// 2) The iter arguments have no use and the corresponding outer region -// iterators (inputs) are yielded. +// 2) The argument's corresponding outer region iterators (inputs) are yielded. 
// 3) The iter arguments have no use and the corresponding (operation) results // have no use. // @@ -872,30 +871,28 @@ struct ForOpIterArgsFolder : public OpRewritePattern { newIterArgs.reserve(forOp.getInitArgs().size()); newYieldValues.reserve(numResults); newResultValues.reserve(numResults); - for (auto it : llvm::zip(forOp.getInitArgs(), // iter from outside - forOp.getRegionIterArgs(), // iter inside region - forOp.getResults(), // op results - forOp.getYieldedValues() // iter yield - )) { + for (auto [init, arg, result, yielded] : + llvm::zip(forOp.getInitArgs(), // iter from outside + forOp.getRegionIterArgs(), // iter inside region + forOp.getResults(), // op results + forOp.getYieldedValues() // iter yield + )) { // Forwarded is `true` when: // 1) The region `iter` argument is yielded. - // 2) The region `iter` argument has no use, and the corresponding iter - // operand (input) is yielded. + // 2) The region `iter` argument the corresponding input is yielded. // 3) The region `iter` argument has no use, and the corresponding op // result has no use. 
- bool forwarded = ((std::get<1>(it) == std::get<3>(it)) || - (std::get<1>(it).use_empty() && - (std::get<0>(it) == std::get<3>(it) || - std::get<2>(it).use_empty()))); + bool forwarded = (arg == yielded) || (init == yielded) || + (arg.use_empty() && result.use_empty()); keepMask.push_back(!forwarded); canonicalize |= forwarded; if (forwarded) { - newBlockTransferArgs.push_back(std::get<0>(it)); - newResultValues.push_back(std::get<0>(it)); + newBlockTransferArgs.push_back(init); + newResultValues.push_back(init); continue; } - newIterArgs.push_back(std::get<0>(it)); - newYieldValues.push_back(std::get<3>(it)); + newIterArgs.push_back(init); + newYieldValues.push_back(yielded); newBlockTransferArgs.push_back(Value()); // placeholder with null value newResultValues.push_back(Value()); // placeholder with null value } diff --git a/mlir/test/Dialect/SCF/canonicalize.mlir b/mlir/test/Dialect/SCF/canonicalize.mlir index 8c4e7a41ee6bc..828758df6d31c 100644 --- a/mlir/test/Dialect/SCF/canonicalize.mlir +++ b/mlir/test/Dialect/SCF/canonicalize.mlir @@ -408,6 +408,20 @@ func.func @for_yields_4() -> i32 { // ----- +// CHECK-LABEL: @constant_iter_arg +func.func @constant_iter_arg(%arg0: index, %arg1: index, %arg2: index) { + %c0_i32 = arith.constant 0 : i32 + // CHECK: scf.for %arg3 = %arg0 to %arg1 step %arg2 { + %0 = scf.for %i = %arg0 to %arg1 step %arg2 iter_args(%arg3 = %c0_i32) -> i32 { + // CHECK-NEXT: "test.use"(%c0_i32) + "test.use"(%arg3) : (i32) -> () + scf.yield %c0_i32 : i32 + } + return +} + +// ----- + // CHECK-LABEL: @replace_true_if func.func @replace_true_if() { %true = arith.constant true @@ -1789,7 +1803,7 @@ module { } // CHECK-LABEL: @fold_iter_args_not_being_modified_within_scfforall // CHECK-SAME: (%{{.*}}: index, %[[ARG1:.*]]: tensor, %[[ARG2:.*]]: tensor) -> (tensor, tensor) { -// CHECK: %[[RESULT:.*]] = scf.forall +// CHECK: %[[RESULT:.*]] = scf.forall // CHECK-SAME: shared_outs(%[[ITER_ARG_5:.*]] = %[[ARG2]]) -> (tensor) { // CHECK: 
%[[OPERAND0:.*]] = tensor.extract_slice %[[ARG1]] // CHECK: %[[OPERAND1:.*]] = tensor.extract_slice %[[ITER_ARG_5]] @@ -1832,7 +1846,7 @@ module { } // CHECK-LABEL: @fold_iter_args_with_no_use_of_result_scfforall // CHECK-SAME: (%{{.*}}: index, %[[ARG1:.*]]: tensor, %[[ARG2:.*]]: tensor, %[[ARG3:.*]]: tensor) -> tensor { -// CHECK: %[[RESULT:.*]] = scf.forall +// CHECK: %[[RESULT:.*]] = scf.forall // CHECK-SAME: shared_outs(%[[ITER_ARG_6:.*]] = %[[ARG2]]) -> (tensor) { // CHECK: %[[OPERAND0:.*]] = tensor.extract_slice %[[ARG1]] // CHECK: %[[OPERAND1:.*]] = tensor.extract_slice %[[ARG3]] @@ -1856,7 +1870,7 @@ func.func @index_switch_fold() -> (f32, f32) { %y = arith.constant 42.0 : f32 scf.yield %y : f32 } - + %switch_cst_2 = arith.constant 2: index %1 = scf.index_switch %switch_cst_2 -> f32 case 0 { @@ -1867,7 +1881,7 @@ func.func @index_switch_fold() -> (f32, f32) { %y = arith.constant 42.0 : f32 scf.yield %y : f32 } - + return %0, %1 : f32, f32 } From b2adeae8650fb720873ad7fa39153beaa8194afc Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Fri, 3 Jan 2025 11:49:51 -0800 Subject: [PATCH 396/567] [AMDGPU][MC] Allow null where 128b or larger dst reg is expected (#115200) For GFX10+, currently null cannot be used as dst reg in instructions that expect the dst reg to be 128b or larger (e.g., s_load_dwordx4). This patch fixes this problem while ensuring null cannot be used as S#, T#, or V#. 
--- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 8 +- llvm/lib/Target/AMDGPU/BUFInstructions.td | 6 +- .../Disassembler/AMDGPUDisassembler.cpp | 25 ++ .../AMDGPU/Disassembler/AMDGPUDisassembler.h | 1 + llvm/lib/Target/AMDGPU/MIMGInstructions.td | 56 ++--- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 19 +- llvm/lib/Target/AMDGPU/SMInstructions.td | 72 +++--- llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll | 8 +- llvm/test/MC/AMDGPU/gfx10_asm_mimg_err.s | 127 ++++++++++ llvm/test/MC/AMDGPU/gfx10_asm_mtbuf_err.s | 49 ++++ llvm/test/MC/AMDGPU/gfx10_asm_mubuf_err.s | 160 ++++++++++++ llvm/test/MC/AMDGPU/gfx10_asm_smem.s | 16 ++ llvm/test/MC/AMDGPU/gfx10_asm_smem_err.s | 86 +++++++ llvm/test/MC/AMDGPU/gfx11_asm_mimg_err.s | 117 +++++++++ llvm/test/MC/AMDGPU/gfx11_asm_mtbuf_err.s | 49 ++++ llvm/test/MC/AMDGPU/gfx11_asm_mubuf_err.s | 229 ++++++++++++++++++ llvm/test/MC/AMDGPU/gfx11_asm_smem.s | 16 ++ llvm/test/MC/AMDGPU/gfx11_asm_smem_err.s | 31 +++ llvm/test/MC/AMDGPU/gfx12_asm_mimg_err.s | 119 +++++++++ llvm/test/MC/AMDGPU/gfx12_asm_smem.s | 19 ++ llvm/test/MC/AMDGPU/gfx12_asm_smem_err.s | 31 +++ .../MC/AMDGPU/gfx12_asm_vbuffer_mtbuf_err.s | 49 ++++ .../MC/AMDGPU/gfx12_asm_vbuffer_mubuf_err.s | 220 +++++++++++++++++ .../MC/Disassembler/AMDGPU/gfx10_smem.txt | 15 ++ .../Disassembler/AMDGPU/gfx11_dasm_smem.txt | 15 ++ .../Disassembler/AMDGPU/gfx12_dasm_smem.txt | 18 ++ 26 files changed, 1485 insertions(+), 76 deletions(-) create mode 100644 llvm/test/MC/AMDGPU/gfx10_asm_mtbuf_err.s create mode 100644 llvm/test/MC/AMDGPU/gfx10_asm_mubuf_err.s create mode 100644 llvm/test/MC/AMDGPU/gfx10_asm_smem_err.s create mode 100644 llvm/test/MC/AMDGPU/gfx11_asm_mtbuf_err.s create mode 100644 llvm/test/MC/AMDGPU/gfx11_asm_mubuf_err.s create mode 100644 llvm/test/MC/AMDGPU/gfx11_asm_smem_err.s create mode 100644 llvm/test/MC/AMDGPU/gfx12_asm_smem_err.s create mode 100644 llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mtbuf_err.s create mode 100644 llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf_err.s diff 
--git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index ed956a1f755c0..d8f441d1ccfe4 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -9760,10 +9760,14 @@ unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op, case MCK_SReg_64: case MCK_SReg_64_XEXEC: // Null is defined as a 32-bit register but - // it should also be enabled with 64-bit operands. - // The following code enables it for SReg_64 operands + // it should also be enabled with 64-bit operands or larger. + // The following code enables it for SReg_64 and larger operands // used as source and destination. Remaining source // operands are handled in isInlinableImm. + case MCK_SReg_96: + case MCK_SReg_128: + case MCK_SReg_256: + case MCK_SReg_512: return Operand.isNull() ? Match_Success : Match_InvalidOperand; default: return Match_InvalidOperand; diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index a351f451584f9..88205ea361c55 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -168,7 +168,7 @@ class getMTBUFInsDA vdataList, dag SOffset = !if(hasRestrictedSOffset, (ins SReg_32:$soffset), (ins SCSrc_b32:$soffset)); - dag NonVaddrInputs = !con((ins SReg_128:$srsrc), SOffset, + dag NonVaddrInputs = !con((ins SReg_128_XNULL:$srsrc), SOffset, (ins Offset:$offset, FORMAT:$format, CPol_0:$cpol, i1imm_0:$swz)); dag Inputs = !if(!empty(vaddrList), @@ -418,7 +418,7 @@ class getMUBUFInsDA vdataList, RegisterOperand vdata_op = getLdStVDataRegisterOperand.ret; dag SOffset = !if(hasRestrictedSOffset, (ins SReg_32:$soffset), (ins SCSrc_b32:$soffset)); - dag NonVaddrInputs = !con((ins SReg_128:$srsrc), SOffset, (ins Offset:$offset, CPol_0:$cpol, i1imm_0:$swz)); + dag NonVaddrInputs = !con((ins SReg_128_XNULL:$srsrc), SOffset, (ins Offset:$offset, 
CPol_0:$cpol, i1imm_0:$swz)); dag Inputs = !if(!empty(vaddrList), NonVaddrInputs, !con((ins vaddrClass:$vaddr), NonVaddrInputs)); dag ret = !if(!empty(vdataList), Inputs, !con((ins vdata_op:$vdata), Inputs)); @@ -703,7 +703,7 @@ class getMUBUFAtomicInsDA : MIMG_gfx6789 { - let InOperandList = !con((ins addr_rc:$vaddr, SReg_256:$srsrc, + let InOperandList = !con((ins addr_rc:$vaddr, SReg_256_XNULL:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -435,7 +435,7 @@ class MIMG_NoSampler_Helper_gfx90a : MIMG_gfx90a .ret:$vdata), dns> { - let InOperandList = !con((ins addr_rc:$vaddr, SReg_256:$srsrc, + let InOperandList = !con((ins addr_rc:$vaddr, SReg_256_XNULL:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, LWE:$lwe, DA:$da), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -447,7 +447,7 @@ class MIMG_NoSampler_gfx10 : MIMG_gfx10 { - let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, DMask:$dmask, + let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -460,7 +460,7 @@ class MIMG_NoSampler_nsa_gfx10 : MIMG_nsa_gfx10 { let InOperandList = !con(AddrIns, - (ins SReg_256:$srsrc, DMask:$dmask, + (ins SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -472,7 +472,7 @@ class MIMG_NoSampler_gfx11 : MIMG_gfx11 { - let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, DMask:$dmask, + let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -485,7 +485,7 @@ class MIMG_NoSampler_nsa_gfx11 : MIMG_nsa_gfx11 { let 
InOperandList = !con(AddrIns, - (ins SReg_256:$srsrc, DMask:$dmask, + (ins SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -498,7 +498,7 @@ class VIMAGE_NoSampler_gfx12 : VIMAGE_gfx12 { let InOperandList = !con(AddrIns, - (ins SReg_256:$rsrc, DMask:$dmask, Dim:$dim, + (ins SReg_256_XNULL:$rsrc, DMask:$dmask, Dim:$dim, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); let AsmString = opcode#" $vdata, "#AddrAsm#", $rsrc$dmask$dim$cpol$r128$a16$tfe" @@ -510,8 +510,8 @@ class VSAMPLE_Sampler_gfx12 : VSAMPLE_gfx12 { let InOperandList = !con(AddrIns, - (ins SReg_256:$rsrc), - !if(BaseOpcode.Sampler, (ins SReg_128:$samp), (ins)), + (ins SReg_256_XNULL:$rsrc), + !if(BaseOpcode.Sampler, (ins SReg_128_XNULL:$samp), (ins)), (ins DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe), @@ -527,8 +527,8 @@ class VSAMPLE_Sampler_nortn_gfx12 : VSAMPLE_gfx12 { let InOperandList = !con(AddrIns, - (ins SReg_256:$rsrc), - !if(BaseOpcode.Sampler, (ins SReg_128:$samp), (ins)), + (ins SReg_256_XNULL:$rsrc), + !if(BaseOpcode.Sampler, (ins SReg_128_XNULL:$samp), (ins)), (ins DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe), @@ -679,7 +679,7 @@ class MIMG_Store_Helper : MIMG_gfx6789 { - let InOperandList = !con((ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc, + let InOperandList = !con((ins data_rc:$vdata, addr_rc:$vaddr, SReg_256_XNULL:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -693,7 +693,7 @@ class MIMG_Store_Helper_gfx90a : MIMG_gfx90a { let InOperandList = !con((ins getLdStRegisterOperand.ret:$vdata, - addr_rc:$vaddr, SReg_256:$srsrc, + addr_rc:$vaddr, SReg_256_XNULL:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, 
LWE:$lwe, DA:$da), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -705,7 +705,7 @@ class MIMG_Store_gfx10 : MIMG_gfx10 { - let InOperandList = !con((ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc, + let InOperandList = !con((ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -719,7 +719,7 @@ class MIMG_Store_nsa_gfx10 { let InOperandList = !con((ins DataRC:$vdata), AddrIns, - (ins SReg_256:$srsrc, DMask:$dmask, + (ins SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -731,7 +731,7 @@ class MIMG_Store_gfx11 : MIMG_gfx11 { - let InOperandList = !con((ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc, + let InOperandList = !con((ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -745,7 +745,7 @@ class MIMG_Store_nsa_gfx11 { let InOperandList = !con((ins DataRC:$vdata), AddrIns, - (ins SReg_256:$srsrc, DMask:$dmask, + (ins SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -759,7 +759,7 @@ class VIMAGE_Store_gfx12 { let InOperandList = !con((ins DataRC:$vdata), AddrIns, - (ins SReg_256:$rsrc, DMask:$dmask, Dim:$dim, + (ins SReg_256_XNULL:$rsrc, DMask:$dmask, Dim:$dim, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); let AsmString = opcode#" $vdata, "#AddrAsm#", $rsrc$dmask$dim$cpol$r128$a16$tfe" @@ -875,7 +875,7 @@ class MIMG_Atomic_gfx6789_base op, string asm, RegisterClass data_rc, : MIMG_gfx6789 { let Constraints = "$vdst = $vdata"; - let InOperandList = (ins data_rc:$vdata, 
addr_rc:$vaddr, SReg_256:$srsrc, + let InOperandList = (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256_XNULL:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da); let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$tfe$lwe$da"; @@ -887,7 +887,7 @@ class MIMG_Atomic_gfx90a_base op, string asm, RegisterClass data_rc, let Constraints = "$vdst = $vdata"; let InOperandList = (ins getLdStRegisterOperand.ret:$vdata, - addr_rc:$vaddr, SReg_256:$srsrc, + addr_rc:$vaddr, SReg_256_XNULL:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, LWE:$lwe, DA:$da); let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$lwe$da"; @@ -921,7 +921,7 @@ class MIMG_Atomic_gfx10 { let Constraints = "$vdst = $vdata"; - let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc, + let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe); let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; @@ -936,7 +936,7 @@ class MIMG_Atomic_nsa_gfx10 { let Constraints = "$vdst = $vdata"; - let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc, + let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe); let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; @@ -964,7 +964,7 @@ class MIMG_Atomic_nsa_gfx11 : MIMG_gfx6789 { - let InOperandList = !con((ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp, + let InOperandList = !con((ins src_rc:$vaddr, SReg_256_XNULL:$srsrc, SReg_128_XNULL:$ssamp, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -1139,7 +1139,7 @@ class MIMG_Sampler_Helper : MIMG_gfx90a.ret:$vdata), dns> { - let 
InOperandList = !con((ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp, + let InOperandList = !con((ins src_rc:$vaddr, SReg_256_XNULL:$srsrc, SReg_128_XNULL:$ssamp, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, LWE:$lwe, DA:$da), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -1149,7 +1149,7 @@ class MIMG_Sampler_gfx90a { dag ret = !con(OpPrefix, - (ins SReg_256:$srsrc, SReg_128:$ssamp, + (ins SReg_256_XNULL:$srsrc, SReg_128_XNULL:$ssamp, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe), !if(HasD16, (ins D16:$d16), (ins))); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 16a7a9cfbc49a..f3a962eea7539 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -809,6 +809,9 @@ def SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, let BaseClassOrder = 32; } +def SGPR_NULL128 : SIReg<"null">; +def SGPR_NULL256 : SIReg<"null">; + let GeneratePressureSet = 0 in { def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32, (add SReg_32, LDS_DIRECT_CLASS)> { @@ -885,6 +888,7 @@ multiclass SRegClass regTypes, SIRegisterTuples regList, SIRegisterTuples ttmpList = regList, + bit hasNull = 0, int copyCost = !sra(!add(numRegs, 1), 1)> { defvar hasTTMP = !ne(regList, ttmpList); defvar suffix = !cast(!mul(numRegs, 32)); @@ -901,7 +905,7 @@ multiclass SRegClass(sgprName)), !if(hasTTMP, @@ -910,15 +914,24 @@ multiclass SRegClass("SReg_" # suffix # "_XNULL"), !cast("SGPR_NULL" # suffix))> { + let isAllocatable = 0; + let BaseClassOrder = !mul(numRegs, 32); + } + } } } defm "" : SRegClass<3, Reg96Types.types, SGPR_96Regs, TTMP_96Regs>; -defm "" : SRegClass<4, Reg128Types.types, SGPR_128Regs, TTMP_128Regs>; +defm "" : SRegClass<4, Reg128Types.types, SGPR_128Regs, TTMP_128Regs, /*hasNull*/ true>; defm "" : SRegClass<5, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>; 
defm "" : SRegClass<6, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>; defm "" : SRegClass<7, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>; -defm "" : SRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16, v16bf16], SGPR_256Regs, TTMP_256Regs>; +defm "" : SRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16, v16bf16], SGPR_256Regs, TTMP_256Regs, /*hasNull*/ true>; defm "" : SRegClass<9, [v9i32, v9f32], SGPR_288Regs, TTMP_288Regs>; defm "" : SRegClass<10, [v10i32, v10f32], SGPR_320Regs, TTMP_320Regs>; defm "" : SRegClass<11, [v11i32, v11f32], SGPR_352Regs, TTMP_352Regs>; diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td index 1aeb4e8b20e8f..60e4ce92ac25d 100644 --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -332,15 +332,15 @@ defm S_LOAD_I16 : SM_Pseudo_Loads ; defm S_LOAD_U16 : SM_Pseudo_Loads ; let is_buffer = 1 in { -defm S_BUFFER_LOAD_DWORD : SM_Pseudo_Loads ; +defm S_BUFFER_LOAD_DWORD : SM_Pseudo_Loads ; // FIXME: exec_lo/exec_hi appear to be allowed for SMRD loads on // SI/CI, bit disallowed for SMEM on VI. 
-defm S_BUFFER_LOAD_DWORDX2 : SM_Pseudo_Loads ; +defm S_BUFFER_LOAD_DWORDX2 : SM_Pseudo_Loads ; let SubtargetPredicate = HasScalarDwordx3Loads in - defm S_BUFFER_LOAD_DWORDX3 : SM_Pseudo_Loads ; -defm S_BUFFER_LOAD_DWORDX4 : SM_Pseudo_Loads ; -defm S_BUFFER_LOAD_DWORDX8 : SM_Pseudo_Loads ; -defm S_BUFFER_LOAD_DWORDX16 : SM_Pseudo_Loads ; + defm S_BUFFER_LOAD_DWORDX3 : SM_Pseudo_Loads ; +defm S_BUFFER_LOAD_DWORDX4 : SM_Pseudo_Loads ; +defm S_BUFFER_LOAD_DWORDX8 : SM_Pseudo_Loads ; +defm S_BUFFER_LOAD_DWORDX16 : SM_Pseudo_Loads ; defm S_BUFFER_LOAD_I8 : SM_Pseudo_Loads ; defm S_BUFFER_LOAD_U8 : SM_Pseudo_Loads ; defm S_BUFFER_LOAD_I16 : SM_Pseudo_Loads ; @@ -353,9 +353,9 @@ defm S_STORE_DWORDX2 : SM_Pseudo_Stores ; defm S_STORE_DWORDX4 : SM_Pseudo_Stores ; let is_buffer = 1 in { -defm S_BUFFER_STORE_DWORD : SM_Pseudo_Stores ; -defm S_BUFFER_STORE_DWORDX2 : SM_Pseudo_Stores ; -defm S_BUFFER_STORE_DWORDX4 : SM_Pseudo_Stores ; +defm S_BUFFER_STORE_DWORD : SM_Pseudo_Stores ; +defm S_BUFFER_STORE_DWORDX2 : SM_Pseudo_Stores ; +defm S_BUFFER_STORE_DWORDX4 : SM_Pseudo_Stores ; } } // End SubtargetPredicate = HasScalarStores @@ -401,33 +401,33 @@ defm S_SCRATCH_STORE_DWORDX4 : SM_Pseudo_Stores ; let SubtargetPredicate = HasScalarAtomics in { let is_buffer = 1 in { -defm S_BUFFER_ATOMIC_SWAP : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_CMPSWAP : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_ADD : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_SUB : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_SMIN : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_UMIN : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_SMAX : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_UMAX : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_AND : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_OR : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_XOR : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_INC : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_DEC : SM_Pseudo_Atomics ; - -defm S_BUFFER_ATOMIC_SWAP_X2 : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_CMPSWAP_X2 : 
SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_ADD_X2 : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_SUB_X2 : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_SMIN_X2 : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_UMIN_X2 : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_SMAX_X2 : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_UMAX_X2 : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_AND_X2 : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_OR_X2 : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_XOR_X2 : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_INC_X2 : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_DEC_X2 : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_SWAP : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_CMPSWAP : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_ADD : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_SUB : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_SMIN : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_UMIN : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_SMAX : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_UMAX : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_AND : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_OR : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_XOR : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_INC : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_DEC : SM_Pseudo_Atomics ; + +defm S_BUFFER_ATOMIC_SWAP_X2 : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_CMPSWAP_X2 : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_ADD_X2 : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_SUB_X2 : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_SMIN_X2 : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_UMIN_X2 : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_SMAX_X2 : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_UMAX_X2 : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_AND_X2 : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_OR_X2 : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_XOR_X2 : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_INC_X2 : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_DEC_X2 : SM_Pseudo_Atomics ; } defm S_ATOMIC_SWAP : SM_Pseudo_Atomics ; diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll 
b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll index cf9fdbdc34391..2ceaca3497ece 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll @@ -8,16 +8,16 @@ define amdgpu_kernel void @s_input_output_i128() { ; GFX908-LABEL: name: s_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7405578 /* regdef:SGPR_128 */, def %12 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7471114 /* regdef:SGPR_128 */, def %12 ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %12 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7405577 /* reguse:SGPR_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7471113 /* reguse:SGPR_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: s_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7405578 /* regdef:SGPR_128 */, def %10 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7471114 /* regdef:SGPR_128 */, def %10 ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %10 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7405577 /* reguse:SGPR_128 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7471113 /* reguse:SGPR_128 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=s"() call void asm sideeffect "; use $0", "s"(i128 %val) diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_mimg_err.s b/llvm/test/MC/AMDGPU/gfx10_asm_mimg_err.s index bd61ad3908d21..f6ea86ed7fe93 100644 --- a/llvm/test/MC/AMDGPU/gfx10_asm_mimg_err.s +++ b/llvm/test/MC/AMDGPU/gfx10_asm_mimg_err.s @@ -359,3 +359,130 @@ image_sample_c_d_cl_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_ image_load v[0:1], v0, s[0:7] dmask:0x9 dim:1 D // NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid dim value + 
+// null is not allowed as SRSRC or SSAMP +image_atomic_add v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_and v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_cmpswap v[0:1], v[10:11], null dmask:0x3 dim:SQ_RSRC_IMG_2D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_dec v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_fcmpswap v[1:2], v[2:3], null dmask:0x3 dim:SQ_RSRC_IMG_2D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_fmax v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_fmin v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_inc v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_or v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_smax v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_smin v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_sub v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_swap v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_umax v1, v[10:11], null dmask:0x1 
dim:SQ_RSRC_IMG_2D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_umin v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_xor v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4 v[64:67], v32, null, s[4:11], dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4 v[64:67], v32, s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_b v[64:67], v[32:33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_b v[64:67], v[32:33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_c v[64:67], v[32:33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_c v[64:67], v[32:33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4h v[64:67], v32, null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4h v[64:67], v32, s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_l v[64:67], v[32:33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_l v[64:67], v[32:33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_o v[64:67], 
v[32:33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_o v[64:67], v[32:33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_load v[4:7], v0, null dmask:0xf dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_store v[0:3], v[254:255], null dmask:0xf dim:SQ_RSRC_IMG_2D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample v[5:6], v1, null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample v[5:6], v1, s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_b v[5:6], v[1:2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_b v[5:6], v[1:2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_c v[5:6], v[1:2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_c v[5:6], v[1:2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_d v[5:6], v[1:3], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_d v[5:6], v[1:3], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_l v[5:6], v[1:2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_l v[5:6], v[1:2], s[8:15], 
null dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_o v[5:6], v[1:2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_o v[5:6], v[1:2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_mtbuf_err.s b/llvm/test/MC/AMDGPU/gfx10_asm_mtbuf_err.s new file mode 100644 index 0000000000000..5eb2e9c579a7d --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx10_asm_mtbuf_err.s @@ -0,0 +1,49 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=NOGFX10 --implicit-check-not=error: %s + +tbuffer_load_format_d16_x v3, v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_d16_xy v[3:4], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_d16_xyz v[3:5], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_d16_xyzw v[3:6], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_x v3, v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_xy v[3:4], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_xyz v[3:5], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_xyzw v[3:6], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_d16_x v3, v0, null, s1 offen offset:4095 +// 
NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_d16_xy v[3:4], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_d16_xyz v[3:5], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_d16_xyzw v[3:6], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_x v3, v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_xy v[3:4], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_xyz v[3:5], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_xyzw v[3:6], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_mubuf_err.s b/llvm/test/MC/AMDGPU/gfx10_asm_mubuf_err.s new file mode 100644 index 0000000000000..bd7acfeb4b033 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx10_asm_mubuf_err.s @@ -0,0 +1,160 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=NOGFX10 --implicit-check-not=error: %s + +buffer_atomic_add v5, v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_add_x2 v[5:6], v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_and v5, v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_and_x2 v[5:6], v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_cmpswap v[5:6], v0, null, s3 idxen 
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_cmpswap_x2 v[5:8], v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_dec v5, v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_dec_x2 v[5:6], v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_inc v5, v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_inc_x2 v[5:6], v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_or v5, v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_or_x2 v[5:6], v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_smax v5, v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_smax_x2 v[5:6], v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_smin v5, v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_smin_x2 v[5:6], v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_sub v5, v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_sub_x2 v[5:6], v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_swap v5, v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_swap_x2 v[5:6], v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_umax v5, v0, null, s3 
idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_umax_x2 v[5:6], v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_umin v5, v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_umin_x2 v[5:6], v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_xor v5, v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_xor_x2 v[5:6], v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_format_d16_x v3, v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_format_x v3, v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_format_xy v[3:4], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_format_xyz v[3:5], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_format_xyzw v[3:6], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_dword v5, v0, null, s3 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_dwordx2 v[5:6], v0, null, s3 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_dwordx3 v[5:7], v0, null, s3 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_dwordx4 v[5:8], v0, null, s3 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_sbyte v5, v0, null, 
s3 idxen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_sshort v5, v0, null, s3 idxen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_ubyte v5, v0, null, s3 idxen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_ushort v5, v0, null, s3 idxen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_byte v1, v0, null, s4 idxen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_dword v1, v0, null, s4 idxen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_dwordx2 v[1:2], v0, null, s4 idxen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_dwordx3 v[1:3], v0, null, s4 idxen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_dwordx4 v[1:4], v0, null, s4 idxen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_format_d16_hi_x v1, v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_format_d16_x v1, v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_format_d16_xy v1, v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_format_d16_xyz v[1:2], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_format_d16_xyzw v[1:3], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_format_x v1, v0, null, s1 offen offset:4095 +// NOGFX10: 
:[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_format_xy v[1:2], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_format_xyz v[1:3], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_format_xyzw v[1:4], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_smem.s b/llvm/test/MC/AMDGPU/gfx10_asm_smem.s index b582de83a2f29..683a0195037cf 100644 --- a/llvm/test/MC/AMDGPU/gfx10_asm_smem.s +++ b/llvm/test/MC/AMDGPU/gfx10_asm_smem.s @@ -281,6 +281,22 @@ s_load_dwordx16 s[20:35], s[2:3], 0x1234 glc dlc s_load_dwordx16 s[20:35], s[2:3], s0 offset:0x12345 glc dlc // GFX10: encoding: [0x01,0x45,0x11,0xf4,0x45,0x23,0x01,0x00] +// null as dst +s_load_dword null, s[2:3], s0 +// GFX10: encoding: [0x41,0x1f,0x00,0xf4,0x00,0x00,0x00,0x00] + +s_load_dwordx2 null, s[2:3], s0 +// GFX10: encoding: [0x41,0x1f,0x04,0xf4,0x00,0x00,0x00,0x00] + +s_load_dwordx4 null, s[2:3], s0 +// GFX10: encoding: [0x41,0x1f,0x08,0xf4,0x00,0x00,0x00,0x00] + +s_load_dwordx8 null, s[2:3], s0 +// GFX10: encoding: [0x41,0x1f,0x0c,0xf4,0x00,0x00,0x00,0x00] + +s_load_dwordx16 null, s[2:3], s0 +// GFX10: encoding: [0x41,0x1f,0x10,0xf4,0x00,0x00,0x00,0x00] + s_buffer_load_dword s5, s[4:7], s0 // GFX10: encoding: [0x42,0x01,0x20,0xf4,0x00,0x00,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_smem_err.s b/llvm/test/MC/AMDGPU/gfx10_asm_smem_err.s new file mode 100644 index 0000000000000..670e97325355b --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx10_asm_smem_err.s @@ -0,0 +1,86 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=NOGFX10 --implicit-check-not=error: %s + +s_buffer_atomic_add s4, null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + 
+s_buffer_atomic_add_x2 s[4:5], null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_atomic_and s4, null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_atomic_cmpswap s[4:5], null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_atomic_cmpswap_x2 s[4:7], null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_atomic_dec s4, null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_atomic_dec_x2 s[4:5], null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_atomic_inc s4, null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_atomic_inc_x2 s[4:5], null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_atomic_or s4, null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_atomic_or_x2 s[4:5], null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_atomic_smax s4, null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_atomic_smax_x2 s[4:5], null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_atomic_smin s4, null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_atomic_smin_x2 s[4:5], null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_atomic_sub s4, null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_atomic_sub_x2 s[4:5], null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_atomic_swap s4, null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid 
operand for instruction + +s_buffer_atomic_umax s4, null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_atomic_umax_x2 s[4:5], null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_atomic_umin s4, null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_atomic_umin_x2 s[4:5], null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_dword s4, null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_dwordx2 s[4:5], null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_dwordx4 s[4:7], null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_dwordx8 s[4:11], null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_dwordx16 s[4:19], null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_store_dword s4, null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_mimg_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_mimg_err.s index 9bf72a11e5eed..9c614453c1ebd 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_mimg_err.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_mimg_err.s @@ -400,3 +400,120 @@ image_store_pck v1, v[2:3], s[12:19] dmask:0x1 unorm image_store_mip_pck v1, v[2:3], s[12:19] dmask:0x0 unorm // NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand +// null is not allowed as SRSRC or SSAMP +image_atomic_add v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_and v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + 
+image_atomic_cmpswap v[0:1], v[10:11], null dmask:0x3 dim:SQ_RSRC_IMG_2D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_dec v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_inc v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_or v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_smax v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_smin v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_sub v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_swap v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_umax v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_umin v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_xor v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4 v[64:67], v32, null, s[4:11], dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4 v[64:67], v32, s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_b v[64:67], v[32:33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D +// 
NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_b v[64:67], v[32:33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_c v[64:67], v[32:33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_c v[64:67], v[32:33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4h v[64:67], v32, null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4h v[64:67], v32, s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_l v[64:67], v[32:33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_l v[64:67], v[32:33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_o v[64:67], v[32:33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_o v[64:67], v[32:33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_load v[4:7], v0, null dmask:0xf dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_store v[0:3], v[254:255], null dmask:0xf dim:SQ_RSRC_IMG_2D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample v[5:6], v1, null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample v[5:6], v1, s[8:15], null dmask:0x3 
dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_b v[5:6], v[1:2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_b v[5:6], v[1:2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_c v[5:6], v[1:2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_c v[5:6], v[1:2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_d v[5:6], v[1:3], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_d v[5:6], v[1:3], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_l v[5:6], v[1:2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_l v[5:6], v[1:2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_o v[5:6], v[1:2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_o v[5:6], v[1:2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_mtbuf_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_mtbuf_err.s new file mode 100644 index 0000000000000..3b69835c8eb51 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx11_asm_mtbuf_err.s @@ -0,0 +1,49 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 %s 2>&1 | FileCheck --check-prefixes=NOGFX11 --implicit-check-not=error: 
%s + +tbuffer_load_format_d16_x v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_d16_xy v[3:4], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_d16_xyz v[3:5], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_d16_xyzw v[3:6], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_x v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_xy v[3:4], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_xyz v[3:5], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_xyzw v[3:6], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_d16_x v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_d16_xy v[3:4], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_d16_xyz v[3:5], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_d16_xyzw v[3:6], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_x v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_xy v[3:4], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid 
operand for instruction + +tbuffer_store_format_xyz v[3:5], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_xyzw v[3:6], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_mubuf_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_mubuf_err.s new file mode 100644 index 0000000000000..d3d74467d8099 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx11_asm_mubuf_err.s @@ -0,0 +1,229 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 %s 2>&1 | FileCheck --check-prefixes=NOGFX11 --implicit-check-not=error: %s + +buffer_atomic_add_f32 v5, v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_add_u32 v5, v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_add_u64 v[5:6], v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_and_b32 v5, v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_and_b64 v[5:6], v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_cmpswap_b32 v[5:6], v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_cmpswap_b64 v[5:8], v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_cmpswap_f32 v[5:6], v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_csub_u32 v5, v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_dec_u32 v5, v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_dec_u64 v[5:6], v0, null, 
s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_inc_u32 v5, v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_inc_u64 v[5:6], v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_max_f32 v5, v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_max_i32 v5, v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_max_i64 v[5:6], v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_max_u32 v5, v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_max_u64 v[5:6], v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_min_f32 v5, v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_min_i32 v5, v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_min_i64 v[5:6], v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_min_u32 v5, v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_min_u64 v[5:6], v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_or_b32 v5, v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_or_b64 v[5:6], v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_sub_u32 v5, v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + 
+buffer_atomic_sub_u64 v[5:6], v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_swap_b32 v5, v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_swap_b64 v[5:6], v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_xor_b32 v5, v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_xor_b64 v[5:6], v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_b128 v[5:8], v0, null, s3 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_b32 v5, v0, null, s3 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_b64 v[1:2], v0, null, s4 idxen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_b96 v[1:3], v0, null, s4 idxen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_b16 v5, v0, null, s3 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_format_x v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_format_xy v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_format_xyz v[3:4], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_format_xyzw v[3:4], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_hi_b16 v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid 
operand for instruction + +buffer_load_d16_hi_format_x v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_hi_i8 v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_hi_u8 v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_i8 v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_u8 v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_format_x v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_format_xy v[3:4], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_format_xyz v[3:5], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_format_xyzw v[3:6], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_i16 v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_i8 v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_lds_b32 v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_lds_format_x v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_lds_i16 v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_lds_i8 v3, v0, null, s1 offen offset:4095 
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_lds_u16 v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_lds_u8 v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_u16 v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_u8 v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_b16 v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_b32 v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_b64 v[3:4], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_b8 v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_b96 v[3:5], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_d16_format_x v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_d16_format_xy v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_d16_format_xyz v[3:4], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_d16_format_xyzw v[3:4], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_d16_hi_b16 v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + 
+buffer_store_d16_hi_b8 v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_d16_hi_format_x v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_format_x v1, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_format_xy v[1:2], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_format_xyz v[1:3], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_format_xyzw v[1:4], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_smem.s b/llvm/test/MC/AMDGPU/gfx11_asm_smem.s index 1d6b947609075..e071c67f85891 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_smem.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_smem.s @@ -239,6 +239,22 @@ s_load_b512 s[20:35], s[2:3], s0 glc dlc s_load_b512 s[20:35], s[2:3], 0x1234 glc dlc // GFX11: encoding: [0x01,0x65,0x10,0xf4,0x34,0x12,0x00,0xf8] +// null as dst +s_load_b32 null, s[2:3], s0 +// GFX11: encoding: [0x01,0x1f,0x00,0xf4,0x00,0x00,0x00,0x00] + +s_load_b64 null, s[2:3], s0 +// GFX11: encoding: [0x01,0x1f,0x04,0xf4,0x00,0x00,0x00,0x00] + +s_load_b128 null, s[2:3], s0 +// GFX11: encoding: [0x01,0x1f,0x08,0xf4,0x00,0x00,0x00,0x00] + +s_load_b256 null, s[2:3], s0 +// GFX11: encoding: [0x01,0x1f,0x0c,0xf4,0x00,0x00,0x00,0x00] + +s_load_b512 null, s[2:3], s0 +// GFX11: encoding: [0x01,0x1f,0x10,0xf4,0x00,0x00,0x00,0x00] + s_buffer_load_b32 s5, s[4:7], s0 // GFX11: encoding: [0x42,0x01,0x20,0xf4,0x00,0x00,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_smem_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_smem_err.s new file mode 100644 index 0000000000000..da195b4a41182 --- /dev/null +++ 
b/llvm/test/MC/AMDGPU/gfx11_asm_smem_err.s @@ -0,0 +1,31 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 %s 2>&1 | FileCheck --check-prefixes=NOGFX11 --implicit-check-not=error: %s + +s_buffer_load_b32 s4, null, s101 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_b64 s4, null, s101 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_b128 s4, null, s101 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_b256 s4, null, s101 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_b512 s4, null, s101 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_dword s4, null, s101 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_dwordx2 s[4:5], null, s101 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_dwordx4 s[4:7], null, s101 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_dwordx8 s[4:11], null, s101 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_dwordx16 s[4:19], null, s101 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_mimg_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_mimg_err.s index a0d11c985c6b7..0f2cfc39e2ec8 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_mimg_err.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_mimg_err.s @@ -255,3 +255,122 @@ image_store_pck v5, v1, s[8:15] dmask:0x1 th:TH_STORE_NT image_store_mip_pck v5, [v0, v1], s[8:15] dmask:0x1 th:TH_STORE_NT // NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand + +// null is not allowed as SRSRC or SSAMP +image_atomic_add v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + 
+image_atomic_and v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_cmpswap v[0:1], v[10:11], null dmask:0x3 dim:SQ_RSRC_IMG_2D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_dec v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_inc v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_or v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_smax v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_smin v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_sub v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_swap v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_umax v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_umin v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_xor v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4 v[64:67], v32, null, s[4:11], dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4 v[64:67], v32, s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX12: 
:[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_b v[64:67], [v32, v33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_b v[64:67], [v32, v33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_c v[64:67], [v32, v33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_c v[64:67], [v32, v33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4h v[64:67], v32, null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4h v[64:67], v32, s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_l v[64:67], v[32:33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_l v[64:67], v[32:33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_o v[64:67], [v32, v33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_o v[64:67], [v32, v33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_load v[4:7], v0, null dmask:0xf dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_store v[0:3], v[254:255], null dmask:0xf dim:SQ_RSRC_IMG_2D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample v[5:6], v1, null, s[12:15] 
dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample v[5:6], v1, s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_b v[5:6], [v1, v2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_b v[5:6], [v1, v2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_c v[5:6], [v1, v2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_c v[5:6], [v1, v2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_d v[5:6], [v1, v2, v3], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_d v[5:6], [v1, v2, v3], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_l v[5:6], v[1:2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_l v[5:6], v[1:2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_o v[5:6], [v1, v2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_o v[5:6], [v1, v2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_smem.s b/llvm/test/MC/AMDGPU/gfx12_asm_smem.s index 668f767661f68..2ef027459fa6a 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_smem.s 
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_smem.s @@ -541,6 +541,25 @@ s_load_b512 s[20:35], s[2:3], m0 s_load_b512 s[20:35], s[2:3], 0x0 // GFX12: s_load_b512 s[20:35], s[2:3], 0x0 ; encoding: [0x01,0x85,0x00,0xf4,0x00,0x00,0x00,0xf8] +// null as dst +s_load_b32 null, s[2:3], s0 offset:0x0 +// GFX12: encoding: [0x01,0x1f,0x00,0xf4,0x00,0x00,0x00,0x00] + +s_load_b64 null, s[2:3], s0 offset:0x0 +// GFX12: encoding: [0x01,0x3f,0x00,0xf4,0x00,0x00,0x00,0x00] + +s_load_b96 null, s[2:3], s0 offset:0x0 +// GFX12: encoding: [0x01,0xbf,0x00,0xf4,0x00,0x00,0x00,0x00] + +s_load_b128 null, s[2:3], s0 offset:0x0 +// GFX12: encoding: [0x01,0x5f,0x00,0xf4,0x00,0x00,0x00,0x00] + +s_load_b256 null, s[2:3], s0 offset:0x0 +// GFX12: encoding: [0x01,0x7f,0x00,0xf4,0x00,0x00,0x00,0x00] + +s_load_b512 null, s[2:3], s0 offset:0x0 +// GFX12: encoding: [0x01,0x9f,0x00,0xf4,0x00,0x00,0x00,0x00] + s_buffer_load_b32 s5, s[4:7], s0 // GFX12: s_buffer_load_b32 s5, s[4:7], s0 offset:0x0 ; encoding: [0x42,0x01,0x02,0xf4,0x00,0x00,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_smem_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_smem_err.s new file mode 100644 index 0000000000000..0f62c8b939991 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx12_asm_smem_err.s @@ -0,0 +1,31 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 %s 2>&1 | FileCheck --check-prefixes=NOGFX12 --implicit-check-not=error: %s + +s_buffer_load_b32 s4, null, s101 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_b64 s4, null, s101 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_b128 s4, null, s101 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_b256 s4, null, s101 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_b512 s4, null, s101 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_dword s4, null, s101 +// NOGFX12: 
:[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_dwordx2 s[4:5], null, s101 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_dwordx4 s[4:7], null, s101 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_dwordx8 s[4:11], null, s101 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_dwordx16 s[4:19], null, s101 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mtbuf_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mtbuf_err.s new file mode 100644 index 0000000000000..040119ce892e6 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mtbuf_err.s @@ -0,0 +1,49 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 %s 2>&1 | FileCheck --check-prefixes=NOGFX12 --implicit-check-not=error: %s + +tbuffer_load_format_d16_x v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_d16_xy v[3:4], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_d16_xyz v[3:5], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_d16_xyzw v[3:6], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_x v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_xy v[3:4], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_xyz v[3:5], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_xyzw v[3:6], v0, null, s1 offen 
offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_d16_x v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_d16_xy v[3:4], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_d16_xyz v[3:5], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_d16_xyzw v[3:6], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_x v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_xy v[3:4], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_xyz v[3:5], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_xyzw v[3:6], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf_err.s new file mode 100644 index 0000000000000..2c9ce7a7efe21 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf_err.s @@ -0,0 +1,220 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 %s 2>&1 | FileCheck --check-prefixes=NOGFX12 --implicit-check-not=error: %s + +buffer_atomic_add_f32 v5, v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_add_u32 v5, v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_add_u64 v[5:6], v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for 
instruction + +buffer_atomic_and_b32 v5, v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_and_b64 v[5:6], v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_cmpswap_b32 v[5:6], v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_cmpswap_b64 v[5:8], v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_cond_sub_u32 v5, v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_dec_u32 v5, v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_dec_u64 v[5:6], v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_inc_u32 v5, v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_inc_u64 v[5:6], v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_max_i32 v5, v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_max_i64 v[5:6], v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_max_num_f32 v5, v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_max_u32 v5, v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_max_u64 v[5:6], v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_min_i32 v5, v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_min_i64 v[5:6], v0, null, s3 idxen +// NOGFX12: 
:[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_min_u32 v5, v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_min_u64 v[5:6], v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_min_num_f32 v5, v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_or_b32 v5, v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_or_b64 v[5:6], v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_pk_add_bf16 v5, v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_pk_add_f16 v5, v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_sub_clamp_u32 v5, v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_sub_u32 v5, v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_sub_u64 v[5:6], v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_swap_b32 v5, v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_swap_b64 v[5:6], v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_xor_b32 v5, v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_xor_b64 v[5:6], v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_b128 v[5:8], v0, null, s3 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + 
+buffer_load_b32 v5, v0, null, s3 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_b64 v[1:2], v0, null, s4 idxen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_b96 v[1:3], v0, null, s4 idxen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_b16 v5, v0, null, s3 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_format_x v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_format_xy v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_format_xyz v[3:4], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_format_xyzw v[3:4], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_hi_b16 v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_hi_format_x v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_hi_i8 v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_hi_u8 v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_i8 v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_u8 v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_format_x v3, v0, null, s1 offen offset:4095 +// 
NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_format_xy v[3:4], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_format_xyz v[3:5], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_format_xyzw v[3:6], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_i16 v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_i8 v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_u16 v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_u8 v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_b128 v[3:6], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_b16 v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_b32 v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_b64 v[3:4], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_b8 v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_b96 v[3:5], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_d16_format_x v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_d16_format_xy v3, 
v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_d16_format_xyz v[3:4], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_d16_format_xyzw v[3:4], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_d16_hi_b16 v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_d16_hi_b8 v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_d16_hi_format_x v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_format_x v1, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_format_xy v[1:2], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_format_xyz v[1:3], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_format_xyzw v[1:4], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_smem.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_smem.txt index 890a64b22f399..95301677272c2 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_smem.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_smem.txt @@ -1261,3 +1261,18 @@ # GFX10: s_store_dwordx4 s[96:99], s[4:5], s0 ; encoding: [0x02,0x18,0x48,0xf4,0x00,0x00,0x00,0x00] 0x02,0x18,0x48,0xf4,0x00,0x00,0x00,0x00 + +# GFX10: s_load_dword null, s[2:3], s0 ; encoding: [0x41,0x1f,0x00,0xf4,0x00,0x00,0x00,0x00] +0x41,0x1f,0x00,0xf4,0x00,0x00,0x00,0x00 + +# GFX10: s_load_dwordx2 null, s[2:3], s0 ; 
encoding: [0x41,0x1f,0x04,0xf4,0x00,0x00,0x00,0x00] +0x41,0x1f,0x04,0xf4,0x00,0x00,0x00,0x00 + +# GFX10: s_load_dwordx4 null, s[2:3], s0 ; encoding: [0x41,0x1f,0x08,0xf4,0x00,0x00,0x00,0x00] +0x41,0x1f,0x08,0xf4,0x00,0x00,0x00,0x00 + +# GFX10: s_load_dwordx8 null, s[2:3], s0 ; encoding: [0x41,0x1f,0x0c,0xf4,0x00,0x00,0x00,0x00] +0x41,0x1f,0x0c,0xf4,0x00,0x00,0x00,0x00 + +# GFX10: s_load_dwordx16 null, s[2:3], s0 ; encoding: [0x41,0x1f,0x10,0xf4,0x00,0x00,0x00,0x00] +0x41,0x1f,0x10,0xf4,0x00,0x00,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_smem.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_smem.txt index 8b49de5d89909..8396132a5b29c 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_smem.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_smem.txt @@ -471,3 +471,18 @@ # GFX11: s_gl1_inv ; encoding: [0x00,0x00,0x80,0xf4,0x00,0x00,0x00,0x00] 0x00,0x00,0x80,0xf4,0x00,0x00,0x00,0x00 + +# GFX11: s_load_b32 null, s[2:3], s0 ; encoding: [0x01,0x1f,0x00,0xf4,0x00,0x00,0x00,0x00] +0x01,0x1f,0x00,0xf4,0x00,0x00,0x00,0x00 + +# GFX11: s_load_b64 null, s[2:3], s0 ; encoding: [0x01,0x1f,0x04,0xf4,0x00,0x00,0x00,0x00] +0x01,0x1f,0x04,0xf4,0x00,0x00,0x00,0x00 + +# GFX11: s_load_b128 null, s[2:3], s0 ; encoding: [0x01,0x1f,0x08,0xf4,0x00,0x00,0x00,0x00] +0x01,0x1f,0x08,0xf4,0x00,0x00,0x00,0x00 + +# GFX11: s_load_b256 null, s[2:3], s0 ; encoding: [0x01,0x1f,0x0c,0xf4,0x00,0x00,0x00,0x00] +0x01,0x1f,0x0c,0xf4,0x00,0x00,0x00,0x00 + +# GFX11: s_load_b512 null, s[2:3], s0 ; encoding: [0x01,0x1f,0x10,0xf4,0x00,0x00,0x00,0x00] +0x01,0x1f,0x10,0xf4,0x00,0x00,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_smem.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_smem.txt index 28decdd4c5b1e..02641e6eb97f0 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_smem.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_smem.txt @@ -1277,3 +1277,21 @@ # GFX12: s_buffer_load_u16 s5, s[96:99], s0 offset:0x0 th:TH_LOAD_HT scope:SCOPE_SYS ; 
encoding: [0x70,0x61,0x63,0xf5,0x00,0x00,0x00,0x00] 0x70,0x61,0x63,0xf5,0x00,0x00,0x00,0x00 + +# GFX12: s_load_b32 null, s[2:3], s0 offset:0x0 ; encoding: [0x01,0x1f,0x00,0xf4,0x00,0x00,0x00,0x00] +0x01,0x1f,0x00,0xf4,0x00,0x00,0x00,0x00 + +# GFX12: s_load_b64 null, s[2:3], s0 offset:0x0 ; encoding: [0x01,0x3f,0x00,0xf4,0x00,0x00,0x00,0x00] +0x01,0x3f,0x00,0xf4,0x00,0x00,0x00,0x00 + +# GFX12: s_load_b96 null, s[2:3], s0 offset:0x0 ; encoding: [0x01,0xbf,0x00,0xf4,0x00,0x00,0x00,0x00] +0x01,0xbf,0x00,0xf4,0x00,0x00,0x00,0x00 + +# GFX12: s_load_b128 null, s[2:3], s0 offset:0x0 ; encoding: [0x01,0x5f,0x00,0xf4,0x00,0x00,0x00,0x00] +0x01,0x5f,0x00,0xf4,0x00,0x00,0x00,0x00 + +# GFX12: s_load_b256 null, s[2:3], s0 offset:0x0 ; encoding: [0x01,0x7f,0x00,0xf4,0x00,0x00,0x00,0x00] +0x01,0x7f,0x00,0xf4,0x00,0x00,0x00,0x00 + +# GFX12: s_load_b512 null, s[2:3], s0 offset:0x0 ; encoding: [0x01,0x9f,0x00,0xf4,0x00,0x00,0x00,0x00] +0x01,0x9f,0x00,0xf4,0x00,0x00,0x00,0x00 From 20d491bb993218eae6a13e4901da943ebd804f7a Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 3 Jan 2025 19:56:44 +0000 Subject: [PATCH 397/567] [VPlan] Remove re-using vector PH in VPBasicBlock::execute (NFC). Remove logic to re-use the previous basic block for the vector pre header from VPBasicBlock::execute. The preheader is now modeled as VPIRBasicBlock, so the code is no longer needed. Split off from https://github.com/llvm/llvm-project/pull/108378. --- llvm/lib/Transforms/Vectorize/VPlan.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 6d02efc05614a..06c36396a17f3 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -487,11 +487,9 @@ void VPBasicBlock::execute(VPTransformState *State) { }; // 1. Create an IR basic block. 
- if (this == getPlan()->getVectorPreheader() || - (Replica && this == getParent()->getEntry()) || + if ((Replica && this == getParent()->getEntry()) || IsReplicateRegion(getSingleHierarchicalPredecessor())) { // Reuse the previous basic block if the current VPBB is either - // * the vector preheader, // * the entry to a replicate region, or // * the exit of a replicate region. State->CFG.VPBB2IRBB[this] = NewBB; From dc307be1b573c1bd6c2f8a3af9edd3455508dc7c Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Fri, 3 Jan 2025 15:45:33 -0500 Subject: [PATCH 398/567] [AMDGPU][True16][MC] true16 for v_fract_f16 (#120647) Support true16 format for v_fract_f16 in MC --- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 2 +- llvm/test/CodeGen/AMDGPU/fract-match.ll | 428 ++++++++++++++++++ llvm/test/MC/AMDGPU/gfx11_asm_vop1.s | 75 +-- llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s | 65 +-- llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s | 21 +- llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s | 42 ++ .../MC/AMDGPU/gfx11_asm_vop1_t16_promote.s | 154 +++++-- .../AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s | 65 +-- .../MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s | 25 +- .../test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s | 69 +-- llvm/test/MC/AMDGPU/gfx12_asm_vop1.s | 72 +-- llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s | 62 +-- llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s | 18 +- llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s | 42 ++ .../MC/AMDGPU/gfx12_asm_vop1_t16_promote.s | 154 +++++-- .../test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s | 69 +-- .../AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s | 65 +-- .../MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s | 25 +- .../Disassembler/AMDGPU/gfx11_dasm_vop1.txt | 63 ++- .../AMDGPU/gfx11_dasm_vop1_dpp16.txt | 54 ++- .../AMDGPU/gfx11_dasm_vop1_dpp8.txt | 17 +- .../gfx11_dasm_vop3_dpp16_from_vop1.txt | 54 ++- .../AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt | 24 +- .../AMDGPU/gfx11_dasm_vop3_from_vop1.txt | 57 ++- .../AMDGPU/gfx12_dasm_vop1_dpp16.txt | 50 +- .../AMDGPU/gfx12_dasm_vop1_dpp8.txt 
| 13 +- .../AMDGPU/gfx12_dasm_vop3_from_vop1.txt | 58 ++- .../gfx12_dasm_vop3_from_vop1_dpp16.txt | 55 ++- .../AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt | 25 +- 29 files changed, 1461 insertions(+), 462 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 79f0caec418ba..b58b7a5fcdcd0 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -1044,7 +1044,7 @@ defm V_CEIL_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05c, "v_ceil_f16 defm V_CEIL_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05c, "v_ceil_f16">; defm V_TRUNC_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05d, "v_trunc_f16">; defm V_RNDNE_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05e, "v_rndne_f16">; -defm V_FRACT_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05f, "v_fract_f16">; +defm V_FRACT_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05f, "v_fract_f16">; defm V_SIN_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x060, "v_sin_f16">; defm V_COS_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x061, "v_cos_f16">; defm V_SAT_PK_U8_I16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x062, "v_sat_pk_u8_i16">; diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll index f6ee007facd7f..80b4d64b1236f 100644 --- a/llvm/test/CodeGen/AMDGPU/fract-match.ll +++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll @@ -14,6 +14,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s ; Test patterns to match v_fract_* instructions. 
@@ -103,6 +104,21 @@ define float @safe_math_fract_f32(float %x, ptr addrspace(1) nocapture writeonly ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc_lo ; GFX11-NEXT: global_store_b32 v[1:2], v4, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: safe_math_fract_f32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f32_e32 v3, v0 +; GFX12-NEXT: v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v0| +; GFX12-NEXT: v_floor_f32_e32 v4, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc_lo +; GFX12-NEXT: global_store_b32 v[1:2], v4, off +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -181,6 +197,18 @@ define float @safe_math_fract_f32_noinf_check(float %x, ptr addrspace(1) nocaptu ; GFX11-NEXT: v_fract_f32_e32 v0, v0 ; GFX11-NEXT: global_store_b32 v[1:2], v3, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: safe_math_fract_f32_noinf_check: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_floor_f32_e32 v3, v0 +; GFX12-NEXT: v_fract_f32_e32 v0, v0 +; GFX12-NEXT: global_store_b32 v[1:2], v3, off +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -263,6 +291,22 @@ define float @no_nan_check_math_fract_f32(float %x, ptr addrspace(1) nocapture w ; GFX11-NEXT: v_min_f32_e32 v4, 0x3f7fffff, v4 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: no_nan_check_math_fract_f32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: 
s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_floor_f32_e32 v3, v0 +; GFX12-NEXT: v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v0| +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_sub_f32_e32 v4, v0, v3 +; GFX12-NEXT: global_store_b32 v[1:2], v3, off +; GFX12-NEXT: v_min_num_f32_e32 v4, 0x3f7fffff, v4 +; GFX12-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -314,6 +358,16 @@ define float @basic_fract_f32_nonans(float nofpclass(nan) %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fract_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: basic_fract_f32_nonans: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f32_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -362,6 +416,19 @@ define float @basic_fract_f32_flags_minnum(float %x) { ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_min_f32_e32 v0, 0x3f7fffff, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: basic_fract_f32_flags_minnum: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_floor_f32_e32 v1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX12-NEXT: v_min_num_f32_e32 v0, 0x3f7fffff, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -409,6 
+476,16 @@ define float @basic_fract_f32_flags_fsub(float nofpclass(nan) %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fract_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: basic_fract_f32_flags_fsub: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f32_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub nsz float %x, %floor @@ -467,6 +544,17 @@ define <2 x float> @basic_fract_v2f32_nonans(<2 x float> nofpclass(nan) %x) { ; GFX11-NEXT: v_fract_f32_e32 v0, v0 ; GFX11-NEXT: v_fract_f32_e32 v1, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: basic_fract_v2f32_nonans: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f32_e32 v0, v0 +; GFX12-NEXT: v_fract_f32_e32 v1, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call <2 x float> @llvm.floor.v2f32(<2 x float> %x) %sub = fsub <2 x float> %x, %floor @@ -540,6 +628,20 @@ define float @basic_fract_f32_multi_use_fsub_nonans(float nofpclass(nan) %x, ptr ; GFX11-NEXT: v_fract_f32_e32 v0, v0 ; GFX11-NEXT: global_store_b32 v[1:2], v3, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: basic_fract_f32_multi_use_fsub_nonans: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_floor_f32_e32 v3, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_sub_f32_e32 v3, v0, v3 +; GFX12-NEXT: v_fract_f32_e32 v0, v0 +; GFX12-NEXT: global_store_b32 v[1:2], v3, off +; GFX12-NEXT: 
s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -588,6 +690,16 @@ define float @nnan_minnum_fract_f32(float %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fract_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: nnan_minnum_fract_f32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f32_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -638,6 +750,19 @@ define float @nnan_fsub_fract_f32(float %x) { ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_min_f32_e32 v0, 0x3f7fffff, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: nnan_fsub_fract_f32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_floor_f32_e32 v1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX12-NEXT: v_min_num_f32_e32 v0, 0x3f7fffff, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub nnan float %x, %floor @@ -686,6 +811,19 @@ define float @nnan_floor_fract_f32(float %x) { ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_min_f32_e32 v0, 0x3f7fffff, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: nnan_floor_fract_f32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_floor_f32_e32 v1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 
| instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX12-NEXT: v_min_num_f32_e32 v0, 0x3f7fffff, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call nnan float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -733,6 +871,16 @@ define float @nnan_src_fract_f32(float nofpclass(nan) %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fract_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: nnan_src_fract_f32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f32_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -782,6 +930,19 @@ define float @not_fract_f32_wrong_const(float nofpclass(nan) %x) { ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_min_f32_e32 v0, 0x3f7ffffe, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: not_fract_f32_wrong_const: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_floor_f32_e32 v1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX12-NEXT: v_min_num_f32_e32 v0, 0x3f7ffffe, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -831,6 +992,19 @@ define float @not_fract_f32_swapped_fsub(float nofpclass(nan) %x) { ; GFX11-NEXT: v_sub_f32_e32 v0, v1, v0 ; GFX11-NEXT: v_min_f32_e32 v0, 0x3f7fffff, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: not_fract_f32_swapped_fsub: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_floor_f32_e32 v1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_sub_f32_e32 v0, v1, v0 +; GFX12-NEXT: v_min_num_f32_e32 v0, 0x3f7fffff, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %floor, %x @@ -880,6 +1054,19 @@ define float @not_fract_f32_not_floor(float nofpclass(nan) %x) { ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_min_f32_e32 v0, 0x3f7fffff, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: not_fract_f32_not_floor: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_trunc_f32_e32 v1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX12-NEXT: v_min_num_f32_e32 v0, 0x3f7fffff, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.trunc.f32(float %x) %sub = fsub float %x, %floor @@ -929,6 +1116,19 @@ define float @not_fract_f32_different_floor(float %x, float %y) { ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_min_f32_e32 v0, 0x3f7fffff, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: not_fract_f32_different_floor: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_floor_f32_e32 v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX12-NEXT: v_min_num_f32_e32 v0, 0x3f7fffff, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail 
call float @llvm.floor.f32(float %y) %sub = fsub float %x, %floor @@ -978,6 +1178,19 @@ define float @not_fract_f32_maxnum(float nofpclass(nan) %x) { ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_max_f32_e32 v0, 0x3f7fffff, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: not_fract_f32_maxnum: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_floor_f32_e32 v1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX12-NEXT: v_max_num_f32_e32 v0, 0x3f7fffff, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -1000,6 +1213,15 @@ define float @fcmp_uno_check_is_nan_f32(float %x) { ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: fcmp_uno_check_is_nan_f32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -1054,6 +1276,16 @@ define float @select_nan_fract_f32(float %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fract_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: select_nan_fract_f32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f32_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float 
%x, %floor @@ -1107,6 +1339,16 @@ define float @commuted_select_nan_fract_f32(float %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fract_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: commuted_select_nan_fract_f32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f32_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -1168,6 +1410,22 @@ define float @wrong_commuted_nan_select_f32(float %x) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: wrong_commuted_nan_select_f32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_floor_f32_e32 v1, v0 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_sub_f32_e32 v1, v0, v1 +; GFX12-NEXT: v_min_num_f32_e32 v1, 0x3f7fffff, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -1231,6 +1489,16 @@ define half @basic_fract_f16_nonan(half nofpclass(nan) %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fract_f16_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: basic_fract_f16_nonan: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 
+; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f16_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call half @llvm.floor.f16(half %x) %sub = fsub half %x, %floor @@ -1313,6 +1581,20 @@ define <2 x half> @basic_fract_v2f16_nonan(<2 x half> nofpclass(nan) %x) { ; GFX11-NEXT: v_fract_f16_e32 v1, v1 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: basic_fract_v2f16_nonan: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-NEXT: v_fract_f16_e32 v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_fract_f16_e32 v1, v1 +; GFX12-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call <2 x half> @llvm.floor.v2f16(<2 x half> %x) %sub = fsub <2 x half> %x, %floor @@ -1369,6 +1651,16 @@ define double @basic_fract_f64_nanans(double nofpclass(nan) %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fract_f64_e32 v[0:1], v[0:1] ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: basic_fract_f64_nanans: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f64_e32 v[0:1], v[0:1] +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call double @llvm.floor.f64(double %x) %sub = fsub double %x, %floor @@ -1461,6 +1753,18 @@ define half @safe_math_fract_f16_noinf_check(half %x, ptr addrspace(1) nocapture ; GFX11-NEXT: v_fract_f16_e32 v0, v0 ; GFX11-NEXT: global_store_b16 v[1:2], v3, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: safe_math_fract_f16_noinf_check: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_floor_f16_e32 v3, v0 +; GFX12-NEXT: v_fract_f16_e32 v0, v0 +; GFX12-NEXT: global_store_b16 v[1:2], v3, off +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call half @llvm.floor.f16(half %x) %sub = fsub half %x, %floor @@ -1546,6 +1850,18 @@ define double @safe_math_fract_f64_noinf_check(double %x, ptr addrspace(1) nocap ; GFX11-NEXT: v_fract_f64_e32 v[0:1], v[0:1] ; GFX11-NEXT: global_store_b64 v[2:3], v[4:5], off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: safe_math_fract_f64_noinf_check: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_floor_f64_e32 v[4:5], v[0:1] +; GFX12-NEXT: v_fract_f64_e32 v[0:1], v[0:1] +; GFX12-NEXT: global_store_b64 v[2:3], v[4:5], off +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call double @llvm.floor.f64(double %x) %sub = fsub double %x, %floor @@ -1600,6 +1916,16 @@ define float @select_nan_fract_f32_flags_select(float %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fract_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: select_nan_fract_f32_flags_select: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f32_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -1653,6 +1979,16 @@ define float @select_nan_fract_f32_flags_minnum(float %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fract_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX12-LABEL: select_nan_fract_f32_flags_minnum: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f32_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -1769,6 +2105,25 @@ define <2 x float> @safe_math_fract_v2f32(<2 x float> %x, ptr addrspace(1) nocap ; GFX11-NEXT: global_store_b64 v[2:3], v[4:5], off ; GFX11-NEXT: v_cndmask_b32_e64 v1, v7, 0, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: safe_math_fract_v2f32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f32_e32 v6, v0 +; GFX12-NEXT: v_cmp_class_f32_e64 s0, v0, 0x204 +; GFX12-NEXT: v_fract_f32_e32 v7, v1 +; GFX12-NEXT: v_floor_f32_e32 v4, v0 +; GFX12-NEXT: v_floor_f32_e32 v5, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e64 v0, v6, 0, s0 +; GFX12-NEXT: v_cmp_class_f32_e64 s0, v1, 0x204 +; GFX12-NEXT: global_store_b64 v[2:3], v[4:5], off +; GFX12-NEXT: v_cndmask_b32_e64 v1, v7, 0, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call <2 x float> @llvm.floor.v2f32(<2 x float> %x) %sub = fsub <2 x float> %x, %floor @@ -1881,6 +2236,21 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) nocapture writeon ; GFX11-NEXT: v_dual_cndmask_b32 v0, 0, v4 :: v_dual_cndmask_b32 v1, 0, v5 ; GFX11-NEXT: global_store_b64 v[2:3], v[6:7], off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: safe_math_fract_f64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; 
GFX12-NEXT: v_fract_f64_e32 v[4:5], v[0:1] +; GFX12-NEXT: v_cmp_neq_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]| +; GFX12-NEXT: v_floor_f64_e32 v[6:7], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_dual_cndmask_b32 v0, 0, v4 :: v_dual_cndmask_b32 v1, 0, v5 +; GFX12-NEXT: global_store_b64 v[2:3], v[6:7], off +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call double @llvm.floor.f64(double %x) %sub = fsub double %x, %floor @@ -2002,6 +2372,21 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) nocapture writeonly % ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc_lo ; GFX11-NEXT: global_store_b16 v[1:2], v4, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: safe_math_fract_f16: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f16_e32 v3, v0 +; GFX12-NEXT: v_cmp_neq_f16_e64 vcc_lo, 0x7c00, |v0| +; GFX12-NEXT: v_floor_f16_e32 v4, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc_lo +; GFX12-NEXT: global_store_b16 v[1:2], v4, off +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call half @llvm.floor.f16(half %x) %sub = fsub half %x, %floor @@ -2168,6 +2553,29 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu ; GFX11-NEXT: global_store_b32 v[1:2], v4, off ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: safe_math_fract_v2f16: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX12-NEXT: v_fract_f16_e32 v6, v0 +; GFX12-NEXT: v_floor_f16_e32 v5, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | 
instid1(VALU_DEP_2) +; GFX12-NEXT: v_fract_f16_e32 v4, v3 +; GFX12-NEXT: v_cmp_class_f16_e64 s0, v3, 0x204 +; GFX12-NEXT: v_floor_f16_e32 v7, v3 +; GFX12-NEXT: v_cndmask_b32_e64 v3, v4, 0, s0 +; GFX12-NEXT: v_cmp_class_f16_e64 s0, v0, 0x204 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_pack_b32_f16 v4, v5, v7 +; GFX12-NEXT: v_cndmask_b32_e64 v0, v6, 0, s0 +; GFX12-NEXT: global_store_b32 v[1:2], v4, off +; GFX12-NEXT: v_pack_b32_f16 v0, v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call <2 x half> @llvm.floor.v2f16(<2 x half> %x) %sub = fsub <2 x half> %x, %floor @@ -2311,6 +2719,26 @@ define <2 x double> @safe_math_fract_v2f64(<2 x double> %x, ptr addrspace(1) noc ; GFX11-NEXT: v_cndmask_b32_e64 v3, v13, 0, s1 ; GFX11-NEXT: global_store_b128 v[4:5], v[6:9], off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: safe_math_fract_v2f64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f64_e32 v[10:11], v[0:1] +; GFX12-NEXT: v_cmp_class_f64_e64 s0, v[0:1], 0x204 +; GFX12-NEXT: v_fract_f64_e32 v[12:13], v[2:3] +; GFX12-NEXT: v_cmp_class_f64_e64 s1, v[2:3], 0x204 +; GFX12-NEXT: v_floor_f64_e32 v[8:9], v[2:3] +; GFX12-NEXT: v_floor_f64_e32 v[6:7], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e64 v0, v10, 0, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v1, v11, 0, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v2, v12, 0, s1 +; GFX12-NEXT: v_cndmask_b32_e64 v3, v13, 0, s1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call <2 x double> @llvm.floor.v2f64(<2 x double> %x) %sub = fsub <2 x double> %x, %floor diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s index b98955d268a72..0d29ed985269a 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s +++ 
b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s @@ -2093,50 +2093,65 @@ v_floor_f64 v[5:6], src_scc v_floor_f64 v[254:255], 0xaf123456 // GFX11: v_floor_f64_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x34,0xfc,0x7f,0x56,0x34,0x12,0xaf] -v_fract_f16 v5, v1 -// GFX11: v_fract_f16_e32 v5, v1 ; encoding: [0x01,0xbf,0x0a,0x7e] +v_fract_f16 v5.l, v1.l +// GFX11: v_fract_f16_e32 v5.l, v1.l ; encoding: [0x01,0xbf,0x0a,0x7e] -v_fract_f16 v5, v127 -// GFX11: v_fract_f16_e32 v5, v127 ; encoding: [0x7f,0xbf,0x0a,0x7e] +v_fract_f16 v5.l, v127.l +// GFX11: v_fract_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xbf,0x0a,0x7e] -v_fract_f16 v5, s1 -// GFX11: v_fract_f16_e32 v5, s1 ; encoding: [0x01,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, s1 +// GFX11: v_fract_f16_e32 v5.l, s1 ; encoding: [0x01,0xbe,0x0a,0x7e] -v_fract_f16 v5, s105 -// GFX11: v_fract_f16_e32 v5, s105 ; encoding: [0x69,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, s105 +// GFX11: v_fract_f16_e32 v5.l, s105 ; encoding: [0x69,0xbe,0x0a,0x7e] -v_fract_f16 v5, vcc_lo -// GFX11: v_fract_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, vcc_lo +// GFX11: v_fract_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xbe,0x0a,0x7e] -v_fract_f16 v5, vcc_hi -// GFX11: v_fract_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, vcc_hi +// GFX11: v_fract_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xbe,0x0a,0x7e] -v_fract_f16 v5, ttmp15 -// GFX11: v_fract_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, ttmp15 +// GFX11: v_fract_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xbe,0x0a,0x7e] -v_fract_f16 v5, m0 -// GFX11: v_fract_f16_e32 v5, m0 ; encoding: [0x7d,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, m0 +// GFX11: v_fract_f16_e32 v5.l, m0 ; encoding: [0x7d,0xbe,0x0a,0x7e] -v_fract_f16 v5, exec_lo -// GFX11: v_fract_f16_e32 v5, exec_lo ; encoding: [0x7e,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, exec_lo +// GFX11: v_fract_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xbe,0x0a,0x7e] -v_fract_f16 v5, exec_hi -// GFX11: v_fract_f16_e32 v5, 
exec_hi ; encoding: [0x7f,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, exec_hi +// GFX11: v_fract_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xbe,0x0a,0x7e] -v_fract_f16 v5, null -// GFX11: v_fract_f16_e32 v5, null ; encoding: [0x7c,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, null +// GFX11: v_fract_f16_e32 v5.l, null ; encoding: [0x7c,0xbe,0x0a,0x7e] -v_fract_f16 v5, -1 -// GFX11: v_fract_f16_e32 v5, -1 ; encoding: [0xc1,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, -1 +// GFX11: v_fract_f16_e32 v5.l, -1 ; encoding: [0xc1,0xbe,0x0a,0x7e] -v_fract_f16 v5, 0.5 -// GFX11: v_fract_f16_e32 v5, 0.5 ; encoding: [0xf0,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, 0.5 +// GFX11: v_fract_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xbe,0x0a,0x7e] -v_fract_f16 v5, src_scc -// GFX11: v_fract_f16_e32 v5, src_scc ; encoding: [0xfd,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, src_scc +// GFX11: v_fract_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xbe,0x0a,0x7e] -v_fract_f16 v127, 0xfe0b -// GFX11: v_fract_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +v_fract_f16 v127.l, 0xfe0b +// GFX11: v_fract_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +v_fract_f16 v5.l, v1.h +// GFX11: v_fract_f16_e32 v5.l, v1.h ; encoding: [0x81,0xbf,0x0a,0x7e] + +v_fract_f16 v5.l, v127.h +// GFX11: v_fract_f16_e32 v5.l, v127.h ; encoding: [0xff,0xbf,0x0a,0x7e] + +v_fract_f16 v127.l, 0.5 +// GFX11: v_fract_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xbe,0xfe,0x7e] + +v_fract_f16 v5.h, src_scc +// GFX11: v_fract_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xbe,0x0a,0x7f] + +v_fract_f16 v127.h, 0xfe0b +// GFX11: v_fract_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xbe,0xfe,0x7f,0x0b,0xfe,0x00,0x00] v_fract_f32 v5, v1 // GFX11: v_fract_f32_e32 v5, v1 ; encoding: [0x01,0x41,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s index f46abd344d607..d4fb880f25b55 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s @@ 
-1598,47 +1598,56 @@ v_floor_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_floor_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_floor_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x48,0xfe,0x7f,0xff,0x6f,0x35,0x30] -v_fract_f16 v5, v1 quad_perm:[3,2,1,0] -// GFX11: v_fract_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff] +v_fract_f16 v5.l, v1.l quad_perm:[3,2,1,0] +// GFX11: v_fract_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff] -v_fract_f16 v5, v1 quad_perm:[0,1,2,3] -// GFX11: v_fract_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff] +v_fract_f16 v5.l, v1.l quad_perm:[0,1,2,3] +// GFX11: v_fract_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff] -v_fract_f16 v5, v1 row_mirror -// GFX11: v_fract_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff] +v_fract_f16 v5.l, v1.l row_mirror +// GFX11: v_fract_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff] -v_fract_f16 v5, v1 row_half_mirror -// GFX11: v_fract_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff] +v_fract_f16 v5.l, v1.l row_half_mirror +// GFX11: v_fract_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff] -v_fract_f16 v5, v1 row_shl:1 -// GFX11: v_fract_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff] +v_fract_f16 v5.l, v1.l row_shl:1 +// GFX11: v_fract_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff] -v_fract_f16 v5, v1 row_shl:15 -// GFX11: v_fract_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff] +v_fract_f16 v5.l, v1.l row_shl:15 +// GFX11: v_fract_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff] -v_fract_f16 v5, v1 row_shr:1 -// GFX11: v_fract_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff] +v_fract_f16 v5.l, v1.l row_shr:1 +// GFX11: v_fract_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff] -v_fract_f16 v5, v1 row_shr:15 -// GFX11: v_fract_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff] +v_fract_f16 v5.l, v1.l row_shr:15 +// GFX11: v_fract_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff] -v_fract_f16 v5, v1 row_ror:1 -// GFX11: v_fract_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff] +v_fract_f16 v5.l, v1.l row_ror:1 +// GFX11: v_fract_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff] -v_fract_f16 v5, v1 row_ror:15 -// GFX11: v_fract_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff] +v_fract_f16 v5.l, v1.l row_ror:15 +// GFX11: v_fract_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff] -v_fract_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_fract_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff] +v_fract_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_fract_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff] -v_fract_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_fract_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01] +v_fract_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_fract_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01] -v_fract_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_fract_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x09,0x13] +v_fract_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX11: v_fract_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x09,0x13] -v_fract_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_fract_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +v_fract_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX11: v_fract_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x35,0x30] + +v_fract_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_fract_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x5f,0x01,0x01] + +v_fract_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_fract_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbe,0x0a,0x7f,0x81,0x60,0x09,0x13] + +v_fract_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_fract_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7f,0xff,0x6f,0x35,0x30] v_fract_f32 v5, v1 
quad_perm:[3,2,1,0] // GFX11: v_fract_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s index c5df74758d71e..b6094c5ea3bd6 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s @@ -377,14 +377,23 @@ v_floor_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_floor_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_floor_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x48,0xfe,0x7f,0xff,0x00,0x00,0x00] -v_fract_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_fract_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_fract_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_fract_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_fract_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_fract_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_fract_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_fract_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_fract_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_fract_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00] +v_fract_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX11: v_fract_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +v_fract_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_fract_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0xfe,0x7e,0x7f,0x77,0x39,0x05] + +v_fract_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_fract_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbe,0x0a,0x7f,0x81,0x77,0x39,0x05] + +v_fract_f16 v127.h, v127.h 
dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_fract_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbe,0xfe,0x7f,0xff,0x00,0x00,0x00] v_fract_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_fract_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x40,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s index ca181f1e59db5..98db7cc8bbc40 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s @@ -458,6 +458,12 @@ v_floor_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0] v_fract_f16_e32 v128, 0xfe0b // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_fract_f16_e32 v128.h, 0xfe0b +// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_fract_f16_e32 v128.l, 0xfe0b +// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction + v_fract_f16_e32 v255, v1 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -467,6 +473,24 @@ v_fract_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0] v_fract_f16_e32 v255, v1 quad_perm:[3,2,1,0] // GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction +v_fract_f16_e32 v255.h, v1.h +// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_fract_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_fract_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_fract_f16_e32 v255.l, v1.l +// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_fract_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_fract_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction + v_fract_f16_e32 v5, v199 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -476,6 
+500,24 @@ v_fract_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_fract_f16_e32 v5, v199 quad_perm:[3,2,1,0] // GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction +v_fract_f16_e32 v5.h, v199.h +// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_fract_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_fract_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_fract_f16_e32 v5.l, v199.l +// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_fract_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_fract_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction + v_frexp_exp_i16_f16_e32 v128.h, 0xfe0b // GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s index a0a07a03e14c3..9de05d4a82465 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s @@ -1208,71 +1208,137 @@ v_floor_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_floor_f16 v5, v199 quad_perm:[3,2,1,0] // GFX11: v_floor_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] -v_fract_f16 v128, 0xfe0b -// GFX11: v_fract_f16_e64 v128, 0xfe0b ; encoding: [0x80,0x00,0xdf,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_fract_f16 v128.h, 0xfe0b +// GFX11: v_fract_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xdf,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_fract_f16 v255, -1 -// GFX11: v_fract_f16_e64 v255, -1 ; encoding: [0xff,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00] +v_fract_f16 v128.l, 0xfe0b +// GFX11: v_fract_f16_e64 v128.l, 0xfe0b ; encoding: 
[0x80,0x00,0xdf,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_fract_f16 v255, 0.5 -// GFX11: v_fract_f16_e64 v255, 0.5 ; encoding: [0xff,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x00] +v_fract_f16 v255.h, -1 +// GFX11: v_fract_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0xc1,0x00,0x00,0x00] -v_fract_f16 v255, exec_hi -// GFX11: v_fract_f16_e64 v255, exec_hi ; encoding: [0xff,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00] +v_fract_f16 v255.h, 0.5 +// GFX11: v_fract_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0xf0,0x00,0x00,0x00] -v_fract_f16 v255, exec_lo -// GFX11: v_fract_f16_e64 v255, exec_lo ; encoding: [0xff,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00] +v_fract_f16 v255.h, exec_hi +// GFX11: v_fract_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7f,0x00,0x00,0x00] -v_fract_f16 v255, m0 -// GFX11: v_fract_f16_e64 v255, m0 ; encoding: [0xff,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00] +v_fract_f16 v255.h, exec_lo +// GFX11: v_fract_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7e,0x00,0x00,0x00] -v_fract_f16 v255, null -// GFX11: v_fract_f16_e64 v255, null ; encoding: [0xff,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00] +v_fract_f16 v255.h, m0 +// GFX11: v_fract_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7d,0x00,0x00,0x00] -v_fract_f16 v255, s1 -// GFX11: v_fract_f16_e64 v255, s1 ; encoding: [0xff,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00] +v_fract_f16 v255.h, null +// GFX11: v_fract_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7c,0x00,0x00,0x00] -v_fract_f16 v255, s105 -// GFX11: v_fract_f16_e64 v255, s105 ; encoding: [0xff,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00] +v_fract_f16 v255.h, s1 +// GFX11: v_fract_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x01,0x00,0x00,0x00] -v_fract_f16 v255, src_scc -// GFX11: v_fract_f16_e64 v255, src_scc ; encoding: [0xff,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x00] +v_fract_f16 v255.h, s105 +// GFX11: v_fract_f16_e64 v255.h, s105 
op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x69,0x00,0x00,0x00] -v_fract_f16 v255, ttmp15 -// GFX11: v_fract_f16_e64 v255, ttmp15 ; encoding: [0xff,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00] +v_fract_f16 v255.h, src_scc +// GFX11: v_fract_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0xfd,0x00,0x00,0x00] -v_fract_f16 v255, v1 -// GFX11: v_fract_f16_e64 v255, v1 ; encoding: [0xff,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00] +v_fract_f16 v255.h, ttmp15 +// GFX11: v_fract_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7b,0x00,0x00,0x00] -v_fract_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_fract_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_fract_f16 v255.h, v1.h +// GFX11: v_fract_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xdf,0xd5,0x01,0x01,0x00,0x00] -v_fract_f16 v255, v1 quad_perm:[3,2,1,0] -// GFX11: v_fract_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_fract_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_fract_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_fract_f16 v255, v127 -// GFX11: v_fract_f16_e64 v255, v127 ; encoding: [0xff,0x00,0xdf,0xd5,0x7f,0x01,0x00,0x00] +v_fract_f16 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX11: v_fract_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_fract_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_fract_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] +v_fract_f16 v255.h, v127.h +// GFX11: v_fract_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xdf,0xd5,0x7f,0x01,0x00,0x00] -v_fract_f16 v255, v127 
quad_perm:[3,2,1,0] -// GFX11: v_fract_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] +v_fract_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_fract_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] -v_fract_f16 v255, vcc_hi -// GFX11: v_fract_f16_e64 v255, vcc_hi ; encoding: [0xff,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00] +v_fract_f16 v255.h, v127.h quad_perm:[3,2,1,0] +// GFX11: v_fract_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] -v_fract_f16 v255, vcc_lo -// GFX11: v_fract_f16_e64 v255, vcc_lo ; encoding: [0xff,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00] +v_fract_f16 v255.h, vcc_hi +// GFX11: v_fract_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x6b,0x00,0x00,0x00] -v_fract_f16 v5, v199 -// GFX11: v_fract_f16_e64 v5, v199 ; encoding: [0x05,0x00,0xdf,0xd5,0xc7,0x01,0x00,0x00] +v_fract_f16 v255.h, vcc_lo +// GFX11: v_fract_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x6a,0x00,0x00,0x00] -v_fract_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_fract_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] +v_fract_f16 v255.l, -1 +// GFX11: v_fract_f16_e64 v255.l, -1 ; encoding: [0xff,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00] -v_fract_f16 v5, v199 quad_perm:[3,2,1,0] -// GFX11: v_fract_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] +v_fract_f16 v255.l, 0.5 +// GFX11: v_fract_f16_e64 v255.l, 0.5 ; encoding: [0xff,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x00] + +v_fract_f16 v255.l, exec_hi +// GFX11: v_fract_f16_e64 v255.l, exec_hi ; encoding: [0xff,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00] + 
+v_fract_f16 v255.l, exec_lo +// GFX11: v_fract_f16_e64 v255.l, exec_lo ; encoding: [0xff,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00] + +v_fract_f16 v255.l, m0 +// GFX11: v_fract_f16_e64 v255.l, m0 ; encoding: [0xff,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00] + +v_fract_f16 v255.l, null +// GFX11: v_fract_f16_e64 v255.l, null ; encoding: [0xff,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00] + +v_fract_f16 v255.l, s1 +// GFX11: v_fract_f16_e64 v255.l, s1 ; encoding: [0xff,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00] + +v_fract_f16 v255.l, s105 +// GFX11: v_fract_f16_e64 v255.l, s105 ; encoding: [0xff,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00] + +v_fract_f16 v255.l, src_scc +// GFX11: v_fract_f16_e64 v255.l, src_scc ; encoding: [0xff,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x00] + +v_fract_f16 v255.l, ttmp15 +// GFX11: v_fract_f16_e64 v255.l, ttmp15 ; encoding: [0xff,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00] + +v_fract_f16 v255.l, v1.l +// GFX11: v_fract_f16_e64 v255.l, v1.l ; encoding: [0xff,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00] + +v_fract_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_fract_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_fract_f16 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX11: v_fract_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_fract_f16 v255.l, v127.l +// GFX11: v_fract_f16_e64 v255.l, v127.l ; encoding: [0xff,0x00,0xdf,0xd5,0x7f,0x01,0x00,0x00] + +v_fract_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_fract_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] + +v_fract_f16 v255.l, v127.l quad_perm:[3,2,1,0] +// GFX11: v_fract_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] + +v_fract_f16 v255.l, vcc_hi +// GFX11: v_fract_f16_e64 v255.l, vcc_hi ; 
encoding: [0xff,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00] + +v_fract_f16 v255.l, vcc_lo +// GFX11: v_fract_f16_e64 v255.l, vcc_lo ; encoding: [0xff,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00] + +v_fract_f16 v5.h, v199.h +// GFX11: v_fract_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdf,0xd5,0xc7,0x01,0x00,0x00] + +v_fract_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_fract_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_fract_f16 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX11: v_fract_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] + +v_fract_f16 v5.l, v199.l +// GFX11: v_fract_f16_e64 v5.l, v199.l ; encoding: [0x05,0x00,0xdf,0xd5,0xc7,0x01,0x00,0x00] + +v_fract_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_fract_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_fract_f16 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX11: v_fract_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] v_frexp_exp_i16_f16 v128.h, 0xfe0b // GFX11: v_frexp_exp_i16_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xda,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s index 1a7eb2c23a7d2..b674395fddf63 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s @@ -1684,47 +1684,56 @@ v_floor_f32_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ct v_floor_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_floor_f32_e64_dpp v255, -|v255| clamp div:2 
row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xa4,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -v_fract_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX11: v_fract_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] +// GFX11: v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_fract_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] -// GFX11: v_fract_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] +// GFX11: v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] -v_fract_f16_e64_dpp v5, v1 row_mirror -// GFX11: v_fract_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_fract_f16_e64_dpp v5.l, v1.l row_mirror +// GFX11: v_fract_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] -v_fract_f16_e64_dpp v5, v1 row_half_mirror -// GFX11: v_fract_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_fract_f16_e64_dpp v5.l, v1.l row_half_mirror +// GFX11: v_fract_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] -v_fract_f16_e64_dpp v5, v1 row_shl:1 -// GFX11: v_fract_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_fract_f16_e64_dpp v5.l, v1.l row_shl:1 +// GFX11: 
v_fract_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] -v_fract_f16_e64_dpp v5, v1 row_shl:15 -// GFX11: v_fract_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_fract_f16_e64_dpp v5.l, v1.l row_shl:15 +// GFX11: v_fract_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] -v_fract_f16_e64_dpp v5, v1 row_shr:1 -// GFX11: v_fract_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_fract_f16_e64_dpp v5.l, v1.l row_shr:1 +// GFX11: v_fract_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] -v_fract_f16_e64_dpp v5, v1 row_shr:15 -// GFX11: v_fract_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_fract_f16_e64_dpp v5.l, v1.l row_shr:15 +// GFX11: v_fract_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] -v_fract_f16_e64_dpp v5, v1 row_ror:1 -// GFX11: v_fract_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_fract_f16_e64_dpp v5.l, v1.l row_ror:1 +// GFX11: v_fract_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] -v_fract_f16_e64_dpp v5, v1 row_ror:15 -// GFX11: v_fract_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_fract_f16_e64_dpp v5.l, v1.l row_ror:15 +// GFX11: v_fract_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; 
encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] -v_fract_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_fract_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_fract_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_fract_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] -v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +v_fract_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_fract_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +v_fract_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_fract_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 
bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +v_fract_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: [0x05,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +v_fract_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: [0x05,0x08,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +v_fract_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: [0xff,0xc1,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] v_fract_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX11: v_fract_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s index 73c21ce24d15c..a07db726574e5 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s @@ -487,17 +487,26 @@ v_floor_f32_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_floor_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_floor_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xa4,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -v_fract_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_fract_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_fract_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_fract_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +v_fract_f16_e64_dpp v5.l, 
v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_fract_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -v_fract_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_fract_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xdf,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +v_fract_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_fract_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xdf,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -v_fract_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xdf,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xdf,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +v_fract_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: [0x05,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +v_fract_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: [0x05,0x08,0xdf,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +v_fract_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: [0xff,0xc1,0xdf,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] v_fract_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_fract_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s index 860c0f4eca7b3..964a19205df5c 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s @@ -2086,50 +2086,59 @@ v_floor_f64_e64 v[5:6], -|src_scc| 
mul:4 v_floor_f64_e64 v[254:255], 0xaf123456 clamp div:2 // GFX11: v_floor_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x9a,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] -v_fract_f16_e64 v5, v1 -// GFX11: v_fract_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00] +v_fract_f16_e64 v5.l, v1.l +// GFX11: v_fract_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00] -v_fract_f16_e64 v5, v255 -// GFX11: v_fract_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00] +v_fract_f16_e64 v5.l, v255.l +// GFX11: v_fract_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00] -v_fract_f16_e64 v5, s1 -// GFX11: v_fract_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, s1 +// GFX11: v_fract_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00] -v_fract_f16_e64 v5, s105 -// GFX11: v_fract_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, s105 +// GFX11: v_fract_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00] -v_fract_f16_e64 v5, vcc_lo -// GFX11: v_fract_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, vcc_lo +// GFX11: v_fract_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00] -v_fract_f16_e64 v5, vcc_hi -// GFX11: v_fract_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, vcc_hi +// GFX11: v_fract_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00] -v_fract_f16_e64 v5, ttmp15 -// GFX11: v_fract_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, ttmp15 +// GFX11: v_fract_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00] -v_fract_f16_e64 v5, m0 -// GFX11: v_fract_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, m0 
+// GFX11: v_fract_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00] -v_fract_f16_e64 v5, exec_lo -// GFX11: v_fract_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, exec_lo +// GFX11: v_fract_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00] -v_fract_f16_e64 v5, exec_hi -// GFX11: v_fract_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, exec_hi +// GFX11: v_fract_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00] -v_fract_f16_e64 v5, null -// GFX11: v_fract_f16_e64 v5, null ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, null +// GFX11: v_fract_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00] -v_fract_f16_e64 v5, -1 -// GFX11: v_fract_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, -1 +// GFX11: v_fract_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00] -v_fract_f16_e64 v5, 0.5 mul:2 -// GFX11: v_fract_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08] +v_fract_f16_e64 v5.l, 0.5 mul:2 +// GFX11: v_fract_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08] -v_fract_f16_e64 v5, src_scc mul:4 -// GFX11: v_fract_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10] +v_fract_f16_e64 v5.l, src_scc mul:4 +// GFX11: v_fract_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10] -v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 -// GFX11: v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +v_fract_f16_e64 v255.l, -|0xfe0b| clamp div:2 +// GFX11: v_fract_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_fract_f16_e64 v5.h, v1.h +// GFX11: 
[0x05,0x48,0xdf,0xd5,0x01,0x01,0x00,0x00] + +v_fract_f16_e64 v5.l, v255.h +// GFX11: [0x05,0x08,0xdf,0xd5,0xff,0x01,0x00,0x00] + +v_fract_f16_e64 v255.h, -|0xfe0b| clamp div:2 +// GFX11: [0xff,0xc1,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] v_fract_f32_e64 v5, v1 // GFX11: v_fract_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa0,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s index 0195c34a552e3..42b9dc464dd90 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s @@ -2168,50 +2168,62 @@ v_floor_f64 v[5:6], src_scc v_floor_f64 v[254:255], 0xaf123456 // GFX12: v_floor_f64_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x34,0xfc,0x7f,0x56,0x34,0x12,0xaf] -v_fract_f16 v5, v1 -// GFX12: v_fract_f16_e32 v5, v1 ; encoding: [0x01,0xbf,0x0a,0x7e] +v_fract_f16 v5.l, v1.l +// GFX12: v_fract_f16_e32 v5.l, v1.l ; encoding: [0x01,0xbf,0x0a,0x7e] -v_fract_f16 v5, v127 -// GFX12: v_fract_f16_e32 v5, v127 ; encoding: [0x7f,0xbf,0x0a,0x7e] +v_fract_f16 v5.l, v127.l +// GFX12: v_fract_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xbf,0x0a,0x7e] -v_fract_f16 v5, s1 -// GFX12: v_fract_f16_e32 v5, s1 ; encoding: [0x01,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, s1 +// GFX12: v_fract_f16_e32 v5.l, s1 ; encoding: [0x01,0xbe,0x0a,0x7e] -v_fract_f16 v5, s105 -// GFX12: v_fract_f16_e32 v5, s105 ; encoding: [0x69,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, s105 +// GFX12: v_fract_f16_e32 v5.l, s105 ; encoding: [0x69,0xbe,0x0a,0x7e] -v_fract_f16 v5, vcc_lo -// GFX12: v_fract_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, vcc_lo +// GFX12: v_fract_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xbe,0x0a,0x7e] -v_fract_f16 v5, vcc_hi -// GFX12: v_fract_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, vcc_hi +// GFX12: v_fract_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xbe,0x0a,0x7e] -v_fract_f16 v5, ttmp15 -// GFX12: v_fract_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xbe,0x0a,0x7e] 
+v_fract_f16 v5.l, ttmp15 +// GFX12: v_fract_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xbe,0x0a,0x7e] -v_fract_f16 v5, m0 -// GFX12: v_fract_f16_e32 v5, m0 ; encoding: [0x7d,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, m0 +// GFX12: v_fract_f16_e32 v5.l, m0 ; encoding: [0x7d,0xbe,0x0a,0x7e] -v_fract_f16 v5, exec_lo -// GFX12: v_fract_f16_e32 v5, exec_lo ; encoding: [0x7e,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, exec_lo +// GFX12: v_fract_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xbe,0x0a,0x7e] -v_fract_f16 v5, exec_hi -// GFX12: v_fract_f16_e32 v5, exec_hi ; encoding: [0x7f,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, exec_hi +// GFX12: v_fract_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xbe,0x0a,0x7e] -v_fract_f16 v5, null -// GFX12: v_fract_f16_e32 v5, null ; encoding: [0x7c,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, null +// GFX12: v_fract_f16_e32 v5.l, null ; encoding: [0x7c,0xbe,0x0a,0x7e] -v_fract_f16 v5, -1 -// GFX12: v_fract_f16_e32 v5, -1 ; encoding: [0xc1,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, -1 +// GFX12: v_fract_f16_e32 v5.l, -1 ; encoding: [0xc1,0xbe,0x0a,0x7e] -v_fract_f16 v5, 0.5 -// GFX12: v_fract_f16_e32 v5, 0.5 ; encoding: [0xf0,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, 0.5 +// GFX12: v_fract_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xbe,0x0a,0x7e] -v_fract_f16 v5, src_scc -// GFX12: v_fract_f16_e32 v5, src_scc ; encoding: [0xfd,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, src_scc +// GFX12: v_fract_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xbe,0x0a,0x7e] -v_fract_f16 v127, 0xfe0b -// GFX12: v_fract_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +v_fract_f16 v127.l, 0xfe0b +// GFX12: v_fract_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +v_fract_f16 v5.l, v1.h +// GFX12: v_fract_f16_e32 v5.l, v1.h ; encoding: [0x81,0xbf,0x0a,0x7e] + +v_fract_f16 v5.l, v127.h +// GFX12: v_fract_f16_e32 v5.l, v127.h ; encoding: [0xff,0xbf,0x0a,0x7e] + +v_fract_f16 v5.h, src_scc +// GFX12: v_fract_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xbe,0x0a,0x7f] + +v_fract_f16 
v127.h, 0xfe0b +// GFX12: v_fract_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xbe,0xfe,0x7f,0x0b,0xfe,0x00,0x00] v_fract_f32 v5, v1 // GFX12: v_fract_f32_e32 v5, v1 ; encoding: [0x01,0x41,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s index 072544e66e4a5..7cfc2c1d45285 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s @@ -1660,47 +1660,53 @@ v_floor_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_floor_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_floor_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x48,0xfe,0x7f,0xff,0x6f,0x35,0x30] -v_fract_f16 v5, v1 quad_perm:[3,2,1,0] -// GFX12: v_fract_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff] +v_fract_f16 v5.l, v1.l quad_perm:[3,2,1,0] +// GFX12: v_fract_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff] -v_fract_f16 v5, v1 quad_perm:[0,1,2,3] -// GFX12: v_fract_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff] +v_fract_f16 v5.l, v1.l quad_perm:[0,1,2,3] +// GFX12: v_fract_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff] -v_fract_f16 v5, v1 row_mirror -// GFX12: v_fract_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff] +v_fract_f16 v5.l, v1.l row_mirror +// GFX12: v_fract_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff] -v_fract_f16 v5, v1 row_half_mirror -// GFX12: v_fract_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff] +v_fract_f16 v5.l, v1.l 
row_half_mirror +// GFX12: v_fract_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff] -v_fract_f16 v5, v1 row_shl:1 -// GFX12: v_fract_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff] +v_fract_f16 v5.l, v1.l row_shl:1 +// GFX12: v_fract_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff] -v_fract_f16 v5, v1 row_shl:15 -// GFX12: v_fract_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff] +v_fract_f16 v5.l, v1.l row_shl:15 +// GFX12: v_fract_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff] -v_fract_f16 v5, v1 row_shr:1 -// GFX12: v_fract_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff] +v_fract_f16 v5.l, v1.l row_shr:1 +// GFX12: v_fract_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff] -v_fract_f16 v5, v1 row_shr:15 -// GFX12: v_fract_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff] +v_fract_f16 v5.l, v1.l row_shr:15 +// GFX12: v_fract_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff] -v_fract_f16 v5, v1 row_ror:1 -// GFX12: v_fract_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff] +v_fract_f16 v5.l, v1.l row_ror:1 +// GFX12: v_fract_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff] -v_fract_f16 v5, v1 row_ror:15 -// GFX12: v_fract_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff] +v_fract_f16 v5.l, v1.l row_ror:15 +// GFX12: v_fract_f16_dpp v5.l, v1.l row_ror:15 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff] -v_fract_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_fract_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff] +v_fract_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_fract_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff] -v_fract_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_fract_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01] +v_fract_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_fract_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01] -v_fract_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_fract_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x09,0x13] +v_fract_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_fract_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x09,0x13] -v_fract_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_fract_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +v_fract_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_fract_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x35,0x30] + +v_fract_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_fract_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbe,0x0a,0x7f,0x81,0x60,0x09,0x13] + +v_fract_f16 v127.h, 
-|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_fract_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7f,0xff,0x6f,0x35,0x30] v_fract_f32 v5, v1 quad_perm:[3,2,1,0] // GFX12: v_fract_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s index bc3559e3c65ed..ddb4029f10208 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s @@ -406,14 +406,20 @@ v_floor_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_floor_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_floor_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x48,0xfe,0x7f,0xff,0x00,0x00,0x00] -v_fract_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_fract_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_fract_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_fract_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_fract_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_fract_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_fract_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_fract_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_fract_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_fract_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00] +v_fract_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_fract_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +v_fract_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_fract_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: 
[0xea,0xbe,0x0a,0x7f,0x81,0x77,0x39,0x05] + +v_fract_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_fract_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbe,0xfe,0x7f,0xff,0x00,0x00,0x00] v_fract_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_fract_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x40,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s index 0d759baf0af0d..05a990eed89c8 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s @@ -431,6 +431,12 @@ v_floor_f16_e32 v5, v199 quad_perm:[3,2,1,0] v_fract_f16_e32 v128, 0xfe0b // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_fract_f16_e32 v128.h, 0xfe0b +// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_fract_f16_e32 v128.l, 0xfe0b +// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction + v_fract_f16_e32 v255, v1 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -440,6 +446,24 @@ v_fract_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0] v_fract_f16_e32 v255, v1 quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction +v_fract_f16_e32 v255.h, v1.h +// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_fract_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_fract_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_fract_f16_e32 v255.l, v1.l +// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_fract_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_fract_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction + v_fract_f16_e32 v5, v199 // GFX12: 
:[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -449,6 +473,24 @@ v_fract_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_fract_f16_e32 v5, v199 quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction +v_fract_f16_e32 v5.h, v199.h +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_fract_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_fract_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_fract_f16_e32 v5.l, v199.l +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_fract_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_fract_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction + v_frexp_exp_i16_f16_e32 v128.h, 0xfe0b // GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s index 976b6bb69c33e..96de27842c072 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s @@ -1168,71 +1168,137 @@ v_floor_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_floor_f16 v5, v199 quad_perm:[3,2,1,0] // GFX12: v_floor_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] -v_fract_f16 v128, 0xfe0b -// GFX12: v_fract_f16_e64 v128, 0xfe0b ; encoding: [0x80,0x00,0xdf,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_fract_f16 v128.h, 0xfe0b +// GFX12: v_fract_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xdf,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_fract_f16 v255, -1 -// GFX12: v_fract_f16_e64 v255, -1 ; encoding: [0xff,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00] +v_fract_f16 v128.l, 
0xfe0b +// GFX12: v_fract_f16_e64 v128.l, 0xfe0b ; encoding: [0x80,0x00,0xdf,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_fract_f16 v255, 0.5 -// GFX12: v_fract_f16_e64 v255, 0.5 ; encoding: [0xff,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x00] +v_fract_f16 v255.h, -1 +// GFX12: v_fract_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0xc1,0x00,0x00,0x00] -v_fract_f16 v255, exec_hi -// GFX12: v_fract_f16_e64 v255, exec_hi ; encoding: [0xff,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00] +v_fract_f16 v255.h, 0.5 +// GFX12: v_fract_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0xf0,0x00,0x00,0x00] -v_fract_f16 v255, exec_lo -// GFX12: v_fract_f16_e64 v255, exec_lo ; encoding: [0xff,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00] +v_fract_f16 v255.h, exec_hi +// GFX12: v_fract_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7f,0x00,0x00,0x00] -v_fract_f16 v255, m0 -// GFX12: v_fract_f16_e64 v255, m0 ; encoding: [0xff,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00] +v_fract_f16 v255.h, exec_lo +// GFX12: v_fract_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7e,0x00,0x00,0x00] -v_fract_f16 v255, null -// GFX12: v_fract_f16_e64 v255, null ; encoding: [0xff,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00] +v_fract_f16 v255.h, m0 +// GFX12: v_fract_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7d,0x00,0x00,0x00] -v_fract_f16 v255, s1 -// GFX12: v_fract_f16_e64 v255, s1 ; encoding: [0xff,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00] +v_fract_f16 v255.h, null +// GFX12: v_fract_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7c,0x00,0x00,0x00] -v_fract_f16 v255, s105 -// GFX12: v_fract_f16_e64 v255, s105 ; encoding: [0xff,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00] +v_fract_f16 v255.h, s1 +// GFX12: v_fract_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x01,0x00,0x00,0x00] -v_fract_f16 v255, src_scc -// GFX12: v_fract_f16_e64 v255, src_scc ; encoding: [0xff,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x00] 
+v_fract_f16 v255.h, s105 +// GFX12: v_fract_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x69,0x00,0x00,0x00] -v_fract_f16 v255, ttmp15 -// GFX12: v_fract_f16_e64 v255, ttmp15 ; encoding: [0xff,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00] +v_fract_f16 v255.h, src_scc +// GFX12: v_fract_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0xfd,0x00,0x00,0x00] -v_fract_f16 v255, v1 -// GFX12: v_fract_f16_e64 v255, v1 ; encoding: [0xff,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00] +v_fract_f16 v255.h, ttmp15 +// GFX12: v_fract_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7b,0x00,0x00,0x00] -v_fract_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_fract_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_fract_f16 v255.h, v1.h +// GFX12: v_fract_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xdf,0xd5,0x01,0x01,0x00,0x00] -v_fract_f16 v255, v1 quad_perm:[3,2,1,0] -// GFX12: v_fract_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_fract_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_fract_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_fract_f16 v255, v127 -// GFX12: v_fract_f16_e64 v255, v127 ; encoding: [0xff,0x00,0xdf,0xd5,0x7f,0x01,0x00,0x00] +v_fract_f16 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX12: v_fract_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_fract_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_fract_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] +v_fract_f16 v255.h, v127.h +// GFX12: v_fract_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: 
[0xff,0x48,0xdf,0xd5,0x7f,0x01,0x00,0x00] -v_fract_f16 v255, v127 quad_perm:[3,2,1,0] -// GFX12: v_fract_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] +v_fract_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_fract_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] -v_fract_f16 v255, vcc_hi -// GFX12: v_fract_f16_e64 v255, vcc_hi ; encoding: [0xff,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00] +v_fract_f16 v255.h, v127.h quad_perm:[3,2,1,0] +// GFX12: v_fract_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] -v_fract_f16 v255, vcc_lo -// GFX12: v_fract_f16_e64 v255, vcc_lo ; encoding: [0xff,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00] +v_fract_f16 v255.h, vcc_hi +// GFX12: v_fract_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x6b,0x00,0x00,0x00] -v_fract_f16 v5, v199 -// GFX12: v_fract_f16_e64 v5, v199 ; encoding: [0x05,0x00,0xdf,0xd5,0xc7,0x01,0x00,0x00] +v_fract_f16 v255.h, vcc_lo +// GFX12: v_fract_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x6a,0x00,0x00,0x00] -v_fract_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_fract_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] +v_fract_f16 v255.l, -1 +// GFX12: v_fract_f16_e64 v255.l, -1 ; encoding: [0xff,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00] -v_fract_f16 v5, v199 quad_perm:[3,2,1,0] -// GFX12: v_fract_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] +v_fract_f16 v255.l, 0.5 +// GFX12: v_fract_f16_e64 v255.l, 0.5 ; encoding: [0xff,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x00] + +v_fract_f16 v255.l, exec_hi +// GFX12: v_fract_f16_e64 v255.l, 
exec_hi ; encoding: [0xff,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00] + +v_fract_f16 v255.l, exec_lo +// GFX12: v_fract_f16_e64 v255.l, exec_lo ; encoding: [0xff,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00] + +v_fract_f16 v255.l, m0 +// GFX12: v_fract_f16_e64 v255.l, m0 ; encoding: [0xff,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00] + +v_fract_f16 v255.l, null +// GFX12: v_fract_f16_e64 v255.l, null ; encoding: [0xff,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00] + +v_fract_f16 v255.l, s1 +// GFX12: v_fract_f16_e64 v255.l, s1 ; encoding: [0xff,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00] + +v_fract_f16 v255.l, s105 +// GFX12: v_fract_f16_e64 v255.l, s105 ; encoding: [0xff,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00] + +v_fract_f16 v255.l, src_scc +// GFX12: v_fract_f16_e64 v255.l, src_scc ; encoding: [0xff,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x00] + +v_fract_f16 v255.l, ttmp15 +// GFX12: v_fract_f16_e64 v255.l, ttmp15 ; encoding: [0xff,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00] + +v_fract_f16 v255.l, v1.l +// GFX12: v_fract_f16_e64 v255.l, v1.l ; encoding: [0xff,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00] + +v_fract_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_fract_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_fract_f16 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX12: v_fract_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_fract_f16 v255.l, v127.l +// GFX12: v_fract_f16_e64 v255.l, v127.l ; encoding: [0xff,0x00,0xdf,0xd5,0x7f,0x01,0x00,0x00] + +v_fract_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_fract_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] + +v_fract_f16 v255.l, v127.l quad_perm:[3,2,1,0] +// GFX12: v_fract_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] + 
+v_fract_f16 v255.l, vcc_hi +// GFX12: v_fract_f16_e64 v255.l, vcc_hi ; encoding: [0xff,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00] + +v_fract_f16 v255.l, vcc_lo +// GFX12: v_fract_f16_e64 v255.l, vcc_lo ; encoding: [0xff,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00] + +v_fract_f16 v5.h, v199.h +// GFX12: v_fract_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdf,0xd5,0xc7,0x01,0x00,0x00] + +v_fract_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_fract_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_fract_f16 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX12: v_fract_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] + +v_fract_f16 v5.l, v199.l +// GFX12: v_fract_f16_e64 v5.l, v199.l ; encoding: [0x05,0x00,0xdf,0xd5,0xc7,0x01,0x00,0x00] + +v_fract_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_fract_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_fract_f16 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX12: v_fract_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] v_frexp_exp_i16_f16 v128.h, 0xfe0b // GFX12: v_frexp_exp_i16_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xda,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s index e4f62eadc0e49..613a70f46800e 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s @@ -2236,50 +2236,59 @@ v_floor_f64_e64 v[5:6], -|src_scc| mul:4 v_floor_f64_e64 v[254:255], 0xaf123456 clamp div:2 // GFX12: v_floor_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: 
[0xfe,0x80,0x9a,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] -v_fract_f16_e64 v5, v1 -// GFX12: v_fract_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00] +v_fract_f16_e64 v5.l, v1.l +// GFX12: v_fract_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00] -v_fract_f16_e64 v5, v255 -// GFX12: v_fract_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00] +v_fract_f16_e64 v5.l, v255.l +// GFX12: v_fract_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00] -v_fract_f16_e64 v5, s1 -// GFX12: v_fract_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, s1 +// GFX12: v_fract_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00] -v_fract_f16_e64 v5, s105 -// GFX12: v_fract_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, s105 +// GFX12: v_fract_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00] -v_fract_f16_e64 v5, vcc_lo -// GFX12: v_fract_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, vcc_lo +// GFX12: v_fract_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00] -v_fract_f16_e64 v5, vcc_hi -// GFX12: v_fract_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, vcc_hi +// GFX12: v_fract_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00] -v_fract_f16_e64 v5, ttmp15 -// GFX12: v_fract_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, ttmp15 +// GFX12: v_fract_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00] -v_fract_f16_e64 v5, m0 -// GFX12: v_fract_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, m0 +// GFX12: v_fract_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00] -v_fract_f16_e64 v5, exec_lo -// GFX12: 
v_fract_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, exec_lo +// GFX12: v_fract_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00] -v_fract_f16_e64 v5, exec_hi -// GFX12: v_fract_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, exec_hi +// GFX12: v_fract_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00] -v_fract_f16_e64 v5, null -// GFX12: v_fract_f16_e64 v5, null ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, null +// GFX12: v_fract_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00] -v_fract_f16_e64 v5, -1 -// GFX12: v_fract_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, -1 +// GFX12: v_fract_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00] -v_fract_f16_e64 v5, 0.5 mul:2 -// GFX12: v_fract_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08] +v_fract_f16_e64 v5.l, 0.5 mul:2 +// GFX12: v_fract_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08] -v_fract_f16_e64 v5, src_scc mul:4 -// GFX12: v_fract_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10] +v_fract_f16_e64 v5.l, src_scc mul:4 +// GFX12: v_fract_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10] -v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 -// GFX12: v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +v_fract_f16_e64 v255.l, -|0xfe0b| clamp div:2 +// GFX12: v_fract_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_fract_f16_e64 v5.h, v1.h +// GFX12: v_fract_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdf,0xd5,0x01,0x01,0x00,0x00] + +v_fract_f16_e64 v5.l, v255.h +// GFX12: v_fract_f16_e64 v5.l, 
v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xdf,0xd5,0xff,0x01,0x00,0x00] + +v_fract_f16_e64 v255.h, -|0xfe0b| clamp div:2 +// GFX12: v_fract_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] v_fract_f32_e64 v5, v1 // GFX12: v_fract_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa0,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s index fb57e5cd54ab8..2044058566052 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s @@ -1699,47 +1699,56 @@ v_floor_f32_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ct v_floor_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_floor_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xa4,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -v_fract_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX12: v_fract_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] +// GFX12: v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_fract_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] -// GFX12: v_fract_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] +// GFX12: v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] -v_fract_f16_e64_dpp v5, v1 row_mirror -// GFX12: v_fract_f16_e64_dpp v5, v1 row_mirror row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_fract_f16_e64_dpp v5.l, v1.l row_mirror +// GFX12: v_fract_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] -v_fract_f16_e64_dpp v5, v1 row_half_mirror -// GFX12: v_fract_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_fract_f16_e64_dpp v5.l, v1.l row_half_mirror +// GFX12: v_fract_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] -v_fract_f16_e64_dpp v5, v1 row_shl:1 -// GFX12: v_fract_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_fract_f16_e64_dpp v5.l, v1.l row_shl:1 +// GFX12: v_fract_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] -v_fract_f16_e64_dpp v5, v1 row_shl:15 -// GFX12: v_fract_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_fract_f16_e64_dpp v5.l, v1.l row_shl:15 +// GFX12: v_fract_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] -v_fract_f16_e64_dpp v5, v1 row_shr:1 -// GFX12: v_fract_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_fract_f16_e64_dpp v5.l, v1.l row_shr:1 +// GFX12: v_fract_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] -v_fract_f16_e64_dpp v5, v1 row_shr:15 -// GFX12: v_fract_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_fract_f16_e64_dpp v5.l, v1.l row_shr:15 +// GFX12: v_fract_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] -v_fract_f16_e64_dpp v5, v1 row_ror:1 -// GFX12: v_fract_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_fract_f16_e64_dpp v5.l, v1.l row_ror:1 +// GFX12: v_fract_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] -v_fract_f16_e64_dpp v5, v1 row_ror:15 -// GFX12: v_fract_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_fract_f16_e64_dpp v5.l, v1.l row_ror:15 +// GFX12: v_fract_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] -v_fract_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_fract_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_fract_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_fract_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] -v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +v_fract_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_fract_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 
row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +v_fract_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_fract_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +v_fract_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_fract_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +v_fract_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_fract_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x08,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +v_fract_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_fract_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc1,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] v_fract_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX12: v_fract_f32_e64_dpp v5, v1 
quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s index acb73d8dbaf73..b0283c2df7169 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s @@ -502,17 +502,26 @@ v_floor_f32_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_floor_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_floor_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xa4,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -v_fract_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_fract_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_fract_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_fract_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +v_fract_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_fract_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -v_fract_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_fract_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xdf,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +v_fract_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_fract_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xdf,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -v_fract_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// 
GFX12: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xdf,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xdf,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +v_fract_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_fract_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +v_fract_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_fract_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x08,0xdf,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +v_fract_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_fract_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc1,0xdf,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] v_fract_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_fract_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt index 55b2081c04917..67b39ee2fdea0 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt @@ -2017,49 +2017,82 @@ # GFX11: v_floor_f64_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x34,0xfc,0x7f,0x56,0x34,0x12,0xaf] 0x01,0xbf,0x0a,0x7e -# GFX11: v_fract_f16_e32 v5, v1 ; encoding: [0x01,0xbf,0x0a,0x7e] +# GFX11-REAL16: v_fract_f16_e32 v5.l, v1.l ; encoding: [0x01,0xbf,0x0a,0x7e] +# GFX11-FAKE16: v_fract_f16_e32 v5, v1 ; encoding: [0x01,0xbf,0x0a,0x7e] 0x7f,0xbf,0x0a,0x7e -# GFX11: v_fract_f16_e32 v5, v127 ; encoding: 
[0x7f,0xbf,0x0a,0x7e] +# GFX11-REAL16: v_fract_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xbf,0x0a,0x7e] +# GFX11-FAKE16: v_fract_f16_e32 v5, v127 ; encoding: [0x7f,0xbf,0x0a,0x7e] 0x01,0xbe,0x0a,0x7e -# GFX11: v_fract_f16_e32 v5, s1 ; encoding: [0x01,0xbe,0x0a,0x7e] +# GFX11-REAL16: v_fract_f16_e32 v5.l, s1 ; encoding: [0x01,0xbe,0x0a,0x7e] +# GFX11-FAKE16: v_fract_f16_e32 v5, s1 ; encoding: [0x01,0xbe,0x0a,0x7e] 0x69,0xbe,0x0a,0x7e -# GFX11: v_fract_f16_e32 v5, s105 ; encoding: [0x69,0xbe,0x0a,0x7e] +# GFX11-REAL16: v_fract_f16_e32 v5.l, s105 ; encoding: [0x69,0xbe,0x0a,0x7e] +# GFX11-FAKE16: v_fract_f16_e32 v5, s105 ; encoding: [0x69,0xbe,0x0a,0x7e] 0x6a,0xbe,0x0a,0x7e -# GFX11: v_fract_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xbe,0x0a,0x7e] +# GFX11-REAL16: v_fract_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xbe,0x0a,0x7e] +# GFX11-FAKE16: v_fract_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xbe,0x0a,0x7e] 0x6b,0xbe,0x0a,0x7e -# GFX11: v_fract_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xbe,0x0a,0x7e] +# GFX11-REAL16: v_fract_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xbe,0x0a,0x7e] +# GFX11-FAKE16: v_fract_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xbe,0x0a,0x7e] 0x7b,0xbe,0x0a,0x7e -# GFX11: v_fract_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xbe,0x0a,0x7e] +# GFX11-REAL16: v_fract_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xbe,0x0a,0x7e] +# GFX11-FAKE16: v_fract_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xbe,0x0a,0x7e] 0x7d,0xbe,0x0a,0x7e -# GFX11: v_fract_f16_e32 v5, m0 ; encoding: [0x7d,0xbe,0x0a,0x7e] +# GFX11-REAL16: v_fract_f16_e32 v5.l, m0 ; encoding: [0x7d,0xbe,0x0a,0x7e] +# GFX11-FAKE16: v_fract_f16_e32 v5, m0 ; encoding: [0x7d,0xbe,0x0a,0x7e] 0x7e,0xbe,0x0a,0x7e -# GFX11: v_fract_f16_e32 v5, exec_lo ; encoding: [0x7e,0xbe,0x0a,0x7e] +# GFX11-REAL16: v_fract_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xbe,0x0a,0x7e] +# GFX11-FAKE16: v_fract_f16_e32 v5, exec_lo ; encoding: [0x7e,0xbe,0x0a,0x7e] 0x7f,0xbe,0x0a,0x7e -# GFX11: v_fract_f16_e32 v5, exec_hi ; encoding: [0x7f,0xbe,0x0a,0x7e] +# 
GFX11-REAL16: v_fract_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xbe,0x0a,0x7e] +# GFX11-FAKE16: v_fract_f16_e32 v5, exec_hi ; encoding: [0x7f,0xbe,0x0a,0x7e] 0x7c,0xbe,0x0a,0x7e -# GFX11: v_fract_f16_e32 v5, null ; encoding: [0x7c,0xbe,0x0a,0x7e] +# GFX11-REAL16: v_fract_f16_e32 v5.l, null ; encoding: [0x7c,0xbe,0x0a,0x7e] +# GFX11-FAKE16: v_fract_f16_e32 v5, null ; encoding: [0x7c,0xbe,0x0a,0x7e] 0xc1,0xbe,0x0a,0x7e -# GFX11: v_fract_f16_e32 v5, -1 ; encoding: [0xc1,0xbe,0x0a,0x7e] +# GFX11-REAL16: v_fract_f16_e32 v5.l, -1 ; encoding: [0xc1,0xbe,0x0a,0x7e] +# GFX11-FAKE16: v_fract_f16_e32 v5, -1 ; encoding: [0xc1,0xbe,0x0a,0x7e] 0xf0,0xbe,0x0a,0x7e -# GFX11: v_fract_f16_e32 v5, 0.5 ; encoding: [0xf0,0xbe,0x0a,0x7e] +# GFX11-REAL16: v_fract_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xbe,0x0a,0x7e] +# GFX11-FAKE16: v_fract_f16_e32 v5, 0.5 ; encoding: [0xf0,0xbe,0x0a,0x7e] 0xfd,0xbe,0x0a,0x7e -# GFX11: v_fract_f16_e32 v5, src_scc ; encoding: [0xfd,0xbe,0x0a,0x7e] +# GFX11-REAL16: v_fract_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xbe,0x0a,0x7e] +# GFX11-FAKE16: v_fract_f16_e32 v5, src_scc ; encoding: [0xfd,0xbe,0x0a,0x7e] 0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00 -# GFX11: v_fract_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_fract_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_fract_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +0x81,0xbf,0x0a,0x7e +# GFX11-REAL16: v_fract_f16_e32 v5.l, v1.h ; encoding: [0x81,0xbf,0x0a,0x7e] +# GFX11-FAKE16: v_fract_f16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0x81,0xbf,0x0a,0x7e] + +0xff,0xbf,0x0a,0x7e +# GFX11-REAL16: v_fract_f16_e32 v5.l, v127.h ; encoding: [0xff,0xbf,0x0a,0x7e] +# GFX11-FAKE16: v_fract_f16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0xff,0xbf,0x0a,0x7e] + +0xf0,0xbe,0xfe,0x7e +# GFX11-REAL16: v_fract_f16_e32 
v127.l, 0.5 ; encoding: [0xf0,0xbe,0xfe,0x7e] +# GFX11-FAKE16: v_fract_f16_e32 v127, 0.5 ; encoding: [0xf0,0xbe,0xfe,0x7e] + +0xfd,0xbe,0x0a,0x7f +# GFX11-REAL16: v_fract_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xbe,0x0a,0x7f] + +0xff,0xbe,0xfe,0x7f,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_fract_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xbe,0xfe,0x7f,0x0b,0xfe,0x00,0x00] 0x01,0x41,0x0a,0x7e # GFX11: v_fract_f32_e32 v5, v1 ; encoding: [0x01,0x41,0x0a,0x7e] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt index d2e1e926cc19e..55a128f386b7c 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt @@ -1545,46 +1545,72 @@ # GFX11: v_floor_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x48,0xfe,0x7f,0xff,0x6f,0x3d,0x30] 0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX11: v_fract_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_fract_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX11: v_fract_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_fract_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX11: v_fract_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX11: v_fract_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX11: v_fract_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX11: v_fract_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX11: v_fract_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX11: v_fract_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX11: v_fract_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX11: v_fract_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX11: v_fract_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX11: v_fract_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l 
row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX11: v_fract_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x3d,0x30 -# GFX11: v_fract_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX11-REAL16: v_fract_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX11-FAKE16: v_fract_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] + +0xfa,0xbe,0xfe,0x7e,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_fract_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_fract_f16_dpp v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x5f,0x01,0x01] + +0xfa,0xbe,0x0a,0x7f,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_fract_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbe,0x0a,0x7f,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_mul_i32_i24_e32 v128, 1, v176 ; encoding: [0x81,0x60,0x01,0x13] + +0xfa,0xbe,0xfe,0x7f,0xff,0x6f,0x3d,0x30 +# GFX11-REAL16: v_fract_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7f,0xff,0x6f,0x3d,0x30] +# GFX11-FAKE16: 
v_lshlrev_b32_e32 v30, v255, v183 ; encoding: [0xff,0x6f,0x3d,0x30] 0xfa,0x40,0x0a,0x7e,0x01,0x1b,0x00,0xff # GFX11: v_fract_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt index 93fb5e2b4c01a..0a4d263862407 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt @@ -302,10 +302,23 @@ # GFX11: v_floor_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x48,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX11: v_fract_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_fract_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xea,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00 -# GFX11: v_fract_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_fract_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_fract_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +0xe9,0xbe,0xfe,0x7e,0x7f,0x77,0x39,0x05 +# GFX11-REAL16: v_fract_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0xfe,0x7e,0x7f,0x77,0x39,0x05] +# GFX11-FAKE16: v_fract_f16_dpp v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0xfe,0x7e,0x7f,0x77,0x39,0x05] + +0xe9,0xbe,0x0a,0x7f,0x81,0x77,0x39,0x05 +# GFX11-REAL16: v_fract_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7f,0x81,0x77,0x39,0x05] +# GFX11-FAKE16: v_dot2acc_f32_f16 v156, v129, v187 ; 
encoding: [0x81,0x77,0x39,0x05] + +0xea,0xbe,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_fract_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbe,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0x40,0x0a,0x7e,0x01,0x77,0x39,0x05 # GFX11: v_fract_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x40,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt index 74d875081d113..4e64ecaa85ecc 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt @@ -1659,46 +1659,72 @@ # GFX11: v_floor_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xa4,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_fract_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_fract_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX11: v_fract_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX11: v_fract_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX11: v_fract_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_fract_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX11: v_fract_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_fract_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX11: v_fract_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_fract_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# 
GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX11: v_fract_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX11: v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 -# GFX11: v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] 0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 -# 
GFX11: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] + +0x05,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_fract_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +0x05,0x08,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] + +0xff,0xc1,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_fract_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xa0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff # GFX11: v_fract_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0xa0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt index a4bdfe9f4a975..1d9edc8535d60 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt @@ -455,16 +455,32 @@ # GFX11: v_floor_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xa4,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_fract_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX11: v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX11: v_fract_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:4 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] 0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX11: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +0x05,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_fract_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +0x05,0x08,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +0xff,0xc1,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_fract_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0x05,0x00,0xa0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX11: v_fract_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt index 7c4f1634026fd..c3889208779f8 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt @@ -2037,49 +2037,76 @@ # GFX11: v_floor_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x9a,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] 0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00 -# GFX11: v_fract_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00] +# GFX11-REAL16: v_fract_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00] +# GFX11-FAKE16: v_fract_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00 -# GFX11: v_fract_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00] +# GFX11-REAL16: v_fract_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00] +# GFX11-FAKE16: v_fract_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00 -# GFX11: v_fract_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00] +# GFX11-REAL16: v_fract_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00] +# GFX11-FAKE16: v_fract_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00 -# GFX11: v_fract_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00] +# GFX11-REAL16: v_fract_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00] +# GFX11-FAKE16: v_fract_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00 -# GFX11: v_fract_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00] +# GFX11-REAL16: v_fract_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00] +# 
GFX11-FAKE16: v_fract_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00 -# GFX11: v_fract_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00] +# GFX11-REAL16: v_fract_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00] +# GFX11-FAKE16: v_fract_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00 -# GFX11: v_fract_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00] +# GFX11-REAL16: v_fract_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00] +# GFX11-FAKE16: v_fract_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00 -# GFX11: v_fract_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00] +# GFX11-REAL16: v_fract_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00] +# GFX11-FAKE16: v_fract_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00 -# GFX11: v_fract_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00] +# GFX11-REAL16: v_fract_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00] +# GFX11-FAKE16: v_fract_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00 -# GFX11: v_fract_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_fract_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_fract_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00 -# GFX11: v_fract_f16_e64 v5, null ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00] +# GFX11-REAL16: v_fract_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00] +# GFX11-FAKE16: v_fract_f16_e64 v5, 
null ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00 -# GFX11: v_fract_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00] +# GFX11-REAL16: v_fract_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00] +# GFX11-FAKE16: v_fract_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08 -# GFX11: v_fract_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08] +# GFX11-REAL16: v_fract_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08] +# GFX11-FAKE16: v_fract_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08] 0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10 -# GFX11: v_fract_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10] +# GFX11-REAL16: v_fract_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10] +# GFX11-FAKE16: v_fract_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10] 0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 -# GFX11: v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_fract_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +0x05,0x48,0xdf,0xd5,0x01,0x01,0x00,0x00 +# GFX11-REAL16: v_fract_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdf,0xd5,0x01,0x01,0x00,0x00] +# GFX11-FAKE16: v_fract_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x08,0xdf,0xd5,0xff,0x01,0x00,0x00 +# GFX11-REAL16: v_fract_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xdf,0xd5,0xff,0x01,0x00,0x00] +# GFX11-FAKE16: v_fract_f16_e64 v5, v255 ; encoding: 
[0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00] + +0xff,0xc1,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_fract_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] 0x05,0x00,0xa0,0xd5,0x01,0x01,0x00,0x00 # GFX11: v_fract_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa0,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt index 24dc882e8beb0..b9e8c46a084f4 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt @@ -1649,46 +1649,68 @@ # GFX12: v_floor_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x48,0xfe,0x7f,0xff,0x6f,0x3d,0x30] 0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX12: v_fract_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_fract_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX12: v_fract_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_fract_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX12: v_fract_f16_dpp v5, v1 row_mirror 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX12: v_fract_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX12: v_fract_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX12: v_fract_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX12: v_fract_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_fract_f16_dpp v5, v1 
row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX12: v_fract_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX12: v_fract_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX12: v_fract_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX12: v_fract_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX12: v_fract_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# 
GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX12: v_fract_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x3d,0x30 -# GFX12: v_fract_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-REAL16: v_fract_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-FAKE16: v_fract_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] + +0xfa,0xbe,0x0a,0x7f,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_fract_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbe,0x0a,0x7f,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_mul_i32_i24_e32 v128, 1, v176 ; encoding: [0x81,0x60,0x01,0x13] + +0xfa,0xbe,0xfe,0x7f,0xff,0x6f,0x3d,0x30 +# GFX12-REAL16: v_fract_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7f,0xff,0x6f,0x3d,0x30] +# GFX12-FAKE16: v_lshlrev_b32_e32 v30, v255, v183 ; encoding: [0xff,0x6f,0x3d,0x30] 0xfa,0x40,0x0a,0x7e,0x01,0x1b,0x00,0xff # GFX12: v_fract_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git 
a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt index 2eeb220b913fd..66cdd104850fd 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt @@ -315,10 +315,19 @@ # GFX12: v_floor_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x48,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX12: v_fract_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_fract_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xea,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00 -# GFX12: v_fract_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_fract_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_fract_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +0xe9,0xbe,0x0a,0x7f,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_fract_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7f,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_add_f64_e32 v[156:157], v[129:130], v[187:188] ; encoding: [0x81,0x77,0x39,0x05] + +0xea,0xbe,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_fract_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbe,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0x40,0x0a,0x7e,0x01,0x77,0x39,0x05 # GFX12: v_fract_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x40,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt index 661d072f46c1a..6c98db919a9d7 
100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt @@ -2089,50 +2089,78 @@ # GFX12: v_floor_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x9a,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] 0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00 -# GFX12: v_fract_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00] +# GFX12-REAL16: v_fract_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_fract_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00 -# GFX12: v_fract_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00] +# GFX12-REAL16: v_fract_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_fract_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00 -# GFX12: v_fract_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00] +# GFX12-REAL16: v_fract_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00] +# GFX12-FAKE16: v_fract_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00 -# GFX12: v_fract_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00] +# GFX12-REAL16: v_fract_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00] +# GFX12-FAKE16: v_fract_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00 -# GFX12: v_fract_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-REAL16: v_fract_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-FAKE16: v_fract_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00 -# GFX12: v_fract_f16_e64 v5, vcc_hi ; 
encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-REAL16: v_fract_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-FAKE16: v_fract_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00 -# GFX12: v_fract_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-REAL16: v_fract_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-FAKE16: v_fract_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00 -# GFX12: v_fract_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-REAL16: v_fract_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-FAKE16: v_fract_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00 -# GFX12: v_fract_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-REAL16: v_fract_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-FAKE16: v_fract_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00 -# GFX12: v_fract_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_fract_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_fract_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00 -# GFX12: v_fract_f16_e64 v5, null ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-REAL16: v_fract_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-FAKE16: v_fract_f16_e64 v5, null ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00 -# GFX12: v_fract_f16_e64 v5, -1 ; encoding: 
[0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-REAL16: v_fract_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-FAKE16: v_fract_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08 -# GFX12: v_fract_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-REAL16: v_fract_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-FAKE16: v_fract_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08] 0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10 -# GFX12: v_fract_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-REAL16: v_fract_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-FAKE16: v_fract_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10] 0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 -# GFX12: v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_fract_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +0x05,0x48,0xdf,0xd5,0x01,0x01,0x00,0x00 +# GFX12-REAL16: v_fract_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdf,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_fract_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x08,0xdf,0xd5,0xff,0x01,0x00,0x00 +# GFX12-REAL16: v_fract_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xdf,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_fract_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00] + +0xff,0xc1,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_fract_f16_e64 v255.h, -|0xfe0b| 
op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +# GFX11: v_fract_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdf,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xa0,0xd5,0x01,0x01,0x00,0x00 # GFX12: v_fract_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa0,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt index a1e431bc49d34..829e21f9b4b99 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt @@ -1665,47 +1665,74 @@ # GFX12: v_floor_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xa4,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_fract_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_fract_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_fract_f16_e64_dpp 
v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX12: v_fract_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX12: v_fract_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX12: v_fract_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_fract_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX12: v_fract_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_fract_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX12: v_fract_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_fract_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX12: v_fract_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX12: v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 -# GFX12: v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] 0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 -# GFX12: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +0x05,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_fract_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +0x05,0x08,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] + +0xff,0xc1,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_fract_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] + +# GFX11: 
v_fract_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xa0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff # GFX12: v_fract_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt index 405b716c110e1..c22c8745d86c2 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt @@ -473,17 +473,34 @@ # GFX12: v_floor_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xa4,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_fract_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX12: v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX12: v_fract_f16_e64_dpp v5, v1 mul:4 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] 0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX12: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +0x05,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_fract_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +0x05,0x08,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +0xff,0xc1,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_fract_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_fract_f16_e64_dpp v255, -|v255| clamp 
div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +# GFX11: v_fract_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xa0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX12: v_fract_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] From b71a6fd042173098977e97a47ee0bedb4040069a Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Fri, 3 Jan 2025 15:46:06 -0500 Subject: [PATCH 399/567] [AMDGPU][True16][MC] true16 for v_cvt_i32_i16 (#120645) Support true16 format for v_cvt_i32_i16 in MC --- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 2 +- llvm/test/MC/AMDGPU/gfx11_asm_vop1.s | 14 ++-- llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s | 65 +++++++++++-------- llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s | 21 ++++-- llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s | 18 +++++ .../MC/AMDGPU/gfx11_asm_vop1_t16_promote.s | 21 ++++-- .../AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s | 59 +++++++++-------- .../MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s | 15 +++-- .../test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s | 11 ++-- llvm/test/MC/AMDGPU/gfx12_asm_vop1.s | 14 ++-- llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s | 62 ++++++++++-------- llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s | 18 +++-- llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s | 18 +++++ .../MC/AMDGPU/gfx12_asm_vop1_t16_promote.s | 21 ++++-- .../test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s | 11 ++-- .../AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s | 59 +++++++++-------- .../MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s | 15 +++-- .../Disassembler/AMDGPU/gfx11_dasm_vop1.txt | 15 ++++- .../AMDGPU/gfx11_dasm_vop1_dpp16.txt | 54 +++++++++++---- .../AMDGPU/gfx11_dasm_vop1_dpp8.txt | 18 ++++- .../gfx11_dasm_vop3_dpp16_from_vop1.txt | 46 +++++++++---- .../AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt | 10 ++- .../AMDGPU/gfx11_dasm_vop3_from_vop1.txt | 
10 ++- .../AMDGPU/gfx12_dasm_vop1_dpp16.txt | 50 ++++++++++---- .../AMDGPU/gfx12_dasm_vop1_dpp8.txt | 14 +++- .../AMDGPU/gfx12_dasm_vop3_from_vop1.txt | 10 ++- .../gfx12_dasm_vop3_from_vop1_dpp16.txt | 46 +++++++++---- .../AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt | 10 ++- 28 files changed, 492 insertions(+), 235 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index b58b7a5fcdcd0..e1d97bd9f6399 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -1019,7 +1019,7 @@ defm V_SWAP_B16 : VOP1Only_Real_gfx11_gfx12<0x066>; defm V_PERMLANE64_B32 : VOP1Only_Real_gfx11_gfx12<0x067>; defm V_MOV_B16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x01c, "v_mov_b16">; defm V_NOT_B16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x069, "v_not_b16">; -defm V_CVT_I32_I16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x06a, "v_cvt_i32_i16">; +defm V_CVT_I32_I16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x06a, "v_cvt_i32_i16">; defm V_CVT_U32_U16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x06b, "v_cvt_u32_u16">; defm V_CVT_F16_U16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x050, "v_cvt_f16_u16">; diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s index 0d29ed985269a..ace776f789eba 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s @@ -1271,11 +1271,11 @@ v_cvt_i32_f64 v5, src_scc v_cvt_i32_f64 v255, 0xaf123456 // GFX11: v_cvt_i32_f64_e32 v255, 0xaf123456 ; encoding: [0xff,0x06,0xfe,0x7f,0x56,0x34,0x12,0xaf] -v_cvt_i32_i16 v5, v1 -// GFX11: v_cvt_i32_i16_e32 v5, v1 ; encoding: [0x01,0xd5,0x0a,0x7e] +v_cvt_i32_i16 v5, v1.l +// GFX11: v_cvt_i32_i16_e32 v5, v1.l ; encoding: [0x01,0xd5,0x0a,0x7e] -v_cvt_i32_i16 v5, v127 -// GFX11: v_cvt_i32_i16_e32 v5, v127 ; encoding: [0x7f,0xd5,0x0a,0x7e] +v_cvt_i32_i16 v5, v127.l +// GFX11: v_cvt_i32_i16_e32 v5, v127.l ; encoding: [0x7f,0xd5,0x0a,0x7e] v_cvt_i32_i16 v5, 
s1 // GFX11: v_cvt_i32_i16_e32 v5, s1 ; encoding: [0x01,0xd4,0x0a,0x7e] @@ -1316,6 +1316,12 @@ v_cvt_i32_i16 v5, src_scc v_cvt_i32_i16 v255, 0xfe0b // GFX11: v_cvt_i32_i16_e32 v255, 0xfe0b ; encoding: [0xff,0xd4,0xfe,0x7f,0x0b,0xfe,0x00,0x00] +v_cvt_i32_i16 v5, v1.h +// GFX11: v_cvt_i32_i16_e32 v5, v1.h ; encoding: [0x81,0xd5,0x0a,0x7e] + +v_cvt_i32_i16 v5, v127.h +// GFX11: v_cvt_i32_i16_e32 v5, v127.h ; encoding: [0xff,0xd5,0x0a,0x7e] + v_cvt_nearest_i32_f32 v5, v1 // GFX11: v_cvt_nearest_i32_f32_e32 v5, v1 ; encoding: [0x01,0x19,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s index d4fb880f25b55..93c120ac59477 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s @@ -926,47 +926,56 @@ v_cvt_i32_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cvt_i32_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cvt_i32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x10,0xfe,0x7f,0xff,0x6f,0x35,0x30] -v_cvt_i32_i16 v5, v1 quad_perm:[3,2,1,0] -// GFX11: v_cvt_i32_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff] +v_cvt_i32_i16 v5, v1.l quad_perm:[3,2,1,0] +// GFX11: v_cvt_i32_i16_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff] -v_cvt_i32_i16 v5, v1 quad_perm:[0,1,2,3] -// GFX11: v_cvt_i32_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff] +v_cvt_i32_i16 v5, v1.l quad_perm:[0,1,2,3] +// GFX11: v_cvt_i32_i16_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff] -v_cvt_i32_i16 v5, v1 row_mirror -// GFX11: v_cvt_i32_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff] +v_cvt_i32_i16 v5, v1.l row_mirror +// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff] -v_cvt_i32_i16 v5, v1 row_half_mirror -// GFX11: v_cvt_i32_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff] +v_cvt_i32_i16 v5, v1.l row_half_mirror +// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff] -v_cvt_i32_i16 v5, v1 row_shl:1 -// GFX11: v_cvt_i32_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff] +v_cvt_i32_i16 v5, v1.l row_shl:1 +// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff] -v_cvt_i32_i16 v5, v1 row_shl:15 -// GFX11: v_cvt_i32_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff] +v_cvt_i32_i16 v5, v1.l row_shl:15 +// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff] -v_cvt_i32_i16 v5, v1 row_shr:1 -// GFX11: v_cvt_i32_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff] +v_cvt_i32_i16 v5, v1.l row_shr:1 +// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff] -v_cvt_i32_i16 v5, v1 row_shr:15 -// GFX11: v_cvt_i32_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff] +v_cvt_i32_i16 v5, v1.l row_shr:15 +// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff] -v_cvt_i32_i16 v5, v1 row_ror:1 -// GFX11: v_cvt_i32_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff] +v_cvt_i32_i16 v5, v1.l row_ror:1 +// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff] -v_cvt_i32_i16 v5, v1 row_ror:15 -// GFX11: v_cvt_i32_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff] +v_cvt_i32_i16 v5, v1.l row_ror:15 +// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff] -v_cvt_i32_i16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cvt_i32_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff] +v_cvt_i32_i16 v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff] -v_cvt_i32_i16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cvt_i32_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01] +v_cvt_i32_i16 v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01] -v_cvt_i32_i16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cvt_i32_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x09,0x13] +v_cvt_i32_i16 v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x09,0x13] -v_cvt_i32_i16 v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cvt_i32_i16_dpp v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x05,0x30] +v_cvt_i32_i16 v255, v127.l 
row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX11: v_cvt_i32_i16_dpp v255, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x05,0x30] + +v_cvt_i32_i16 v5, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cvt_i32_i16_dpp v5, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x7f,0x5f,0x01,0x01] + +v_cvt_i32_i16 v5, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cvt_i32_i16_dpp v5, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x81,0x60,0x09,0x13] + +v_cvt_i32_i16 v255, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cvt_i32_i16_dpp v255, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0xff,0x6f,0x05,0x30] v_cvt_nearest_i32_f32 v5, v1 quad_perm:[3,2,1,0] // GFX11: v_cvt_nearest_i32_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s index b6094c5ea3bd6..2029baee77df9 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s @@ -218,14 +218,23 @@ v_cvt_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cvt_i32_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cvt_i32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x10,0xfe,0x7f,0xff,0x00,0x00,0x00] -v_cvt_i32_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cvt_i32_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_cvt_i32_i16 v5, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cvt_i32_i16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_cvt_i32_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cvt_i32_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: 
[0xea,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_cvt_i32_i16 v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cvt_i32_i16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_cvt_i32_i16 v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cvt_i32_i16_dpp v255, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00] +v_cvt_i32_i16 v255, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX11: v_cvt_i32_i16_dpp v255, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00] + +v_cvt_i32_i16 v5, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cvt_i32_i16_dpp v5, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x7f,0x77,0x39,0x05] + +v_cvt_i32_i16 v5, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cvt_i32_i16_dpp v5, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd4,0x0a,0x7e,0x81,0x77,0x39,0x05] + +v_cvt_i32_i16 v255, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cvt_i32_i16_dpp v255, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd4,0xfe,0x7f,0xff,0x00,0x00,0x00] v_cvt_nearest_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cvt_nearest_i32_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x18,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s index 98db7cc8bbc40..936cce46f2ebc 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s @@ -236,6 +236,24 @@ v_cvt_i32_i16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_cvt_i32_i16_e32 v5, v199 quad_perm:[3,2,1,0] // GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction +v_cvt_i32_i16_e32 v5.h, v199.h +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cvt_i32_i16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cvt_i32_i16_e32 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid 
operand for instruction + +v_cvt_i32_i16_e32 v5.l, v199.l +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cvt_i32_i16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cvt_i32_i16_e32 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + v_cvt_norm_i16_f16_e32 v128.h, 0xfe0b // GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s index 9de05d4a82465..1c8d7e43be081 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s @@ -662,14 +662,23 @@ v_cvt_i16_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] v_cvt_i16_f16 v5.l, v199.l quad_perm:[3,2,1,0] // GFX11: v_cvt_i16_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd3,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] -v_cvt_i32_i16 v5, v199 -// GFX11: v_cvt_i32_i16_e64 v5, v199 ; encoding: [0x05,0x00,0xea,0xd5,0xc7,0x01,0x00,0x00] +v_cvt_i32_i16 v5, v199.h +// GFX11: v_cvt_i32_i16_e64 v5, v199.h op_sel:[1,0] ; encoding: [0x05,0x08,0xea,0xd5,0xc7,0x01,0x00,0x00] -v_cvt_i32_i16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cvt_i32_i16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] +v_cvt_i32_i16 v5, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cvt_i32_i16_e64_dpp v5, v199.h op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xea,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] -v_cvt_i32_i16 v5, v199 quad_perm:[3,2,1,0] -// GFX11: v_cvt_i32_i16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] +v_cvt_i32_i16 v5, v199.h quad_perm:[3,2,1,0] +// GFX11: v_cvt_i32_i16_e64_dpp v5, v199.h op_sel:[1,0] quad_perm:[3,2,1,0] 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x08,0xea,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] + +v_cvt_i32_i16 v5, v199.l +// GFX11: v_cvt_i32_i16_e64 v5, v199.l ; encoding: [0x05,0x00,0xea,0xd5,0xc7,0x01,0x00,0x00] + +v_cvt_i32_i16 v5, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cvt_i32_i16_e64_dpp v5, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_cvt_i32_i16 v5, v199.l quad_perm:[3,2,1,0] +// GFX11: v_cvt_i32_i16_e64_dpp v5, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] v_cvt_norm_i16_f16 v128.h, 0xfe0b // GFX11: v_cvt_norm_i16_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xe3,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s index b674395fddf63..204d87c280525 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s @@ -967,47 +967,50 @@ v_cvt_i32_f32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cvt_i32_f32_e64_dpp v255, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cvt_i32_f32_e64_dpp v255, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x88,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x05,0x30] -v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[3,2,1,0] +// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] -// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[0,1,2,3] +// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_mirror -// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l row_mirror +// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_half_mirror -// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l row_half_mirror +// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_shl:1 -// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:1 +// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_shl:15 -// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:15 +// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_shr:1 -// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:1 +// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_shr:15 -// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:15 +// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_ror:1 -// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:1 +// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_ror:15 -// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:15 +// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cvt_i32_i16_e64_dpp 
v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] +v_cvt_i32_i16_e64_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] -v_cvt_i32_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13] +v_cvt_i32_i16_e64_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13] -v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] +v_cvt_i32_i16_e64_dpp v255, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cvt_i32_i16_e64_dpp v255, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] + +v_cvt_i32_i16_e64_dpp v255, v255.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: [0xff,0x08,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] v_cvt_nearest_i32_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX11: v_cvt_nearest_i32_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x8c,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s index a07db726574e5..d779b65bc0ba9 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s +++ 
b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s @@ -283,14 +283,17 @@ v_cvt_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cvt_i32_f32_e64_dpp v255, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cvt_i32_f32_e64_dpp v255, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x88,0xd5,0xe9,0x00,0x00,0x20,0xff,0x00,0x00,0x00] -v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +v_cvt_i32_i16_e64_dpp v255, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cvt_i32_i16_e64_dpp v255, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] + +v_cvt_i32_i16_e64_dpp v255, v255.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: [0xff,0x08,0xea,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] v_cvt_nearest_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cvt_nearest_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x8c,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s 
b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s index 964a19205df5c..7abc0185d6af6 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s @@ -1267,11 +1267,11 @@ v_cvt_i32_f64_e64 v5, -|src_scc| v_cvt_i32_f64_e64 v255, 0xaf123456 clamp // GFX11: v_cvt_i32_f64_e64 v255, 0xaf123456 clamp ; encoding: [0xff,0x80,0x83,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cvt_i32_i16_e64 v5, v1 -// GFX11: v_cvt_i32_i16_e64 v5, v1 ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00] +v_cvt_i32_i16_e64 v5, v1.l +// GFX11: v_cvt_i32_i16_e64 v5, v1.l ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00] -v_cvt_i32_i16_e64 v5, v255 -// GFX11: v_cvt_i32_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00] +v_cvt_i32_i16_e64 v5, v255.l +// GFX11: v_cvt_i32_i16_e64 v5, v255.l ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00] v_cvt_i32_i16_e64 v5, s1 // GFX11: v_cvt_i32_i16_e64 v5, s1 ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x00,0x00,0x00] @@ -1312,6 +1312,9 @@ v_cvt_i32_i16_e64 v5, src_scc v_cvt_i32_i16_e64 v255, 0xfe0b // GFX11: v_cvt_i32_i16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xea,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_cvt_i32_i16_e64 v5, v255.h +// GFX11: [0x05,0x08,0xea,0xd5,0xff,0x01,0x00,0x00] + v_cvt_nearest_i32_f32_e64 v5, v1 // GFX11: v_cvt_nearest_i32_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x8c,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s index 42b9dc464dd90..8f517ecdfc84a 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s @@ -1338,11 +1338,11 @@ v_cvt_i32_f64 v5, src_scc v_cvt_i32_f64 v255, 0xaf123456 // GFX12: v_cvt_i32_f64_e32 v255, 0xaf123456 ; encoding: [0xff,0x06,0xfe,0x7f,0x56,0x34,0x12,0xaf] -v_cvt_i32_i16 v5, v1 -// GFX12: v_cvt_i32_i16_e32 v5, v1 ; encoding: [0x01,0xd5,0x0a,0x7e] +v_cvt_i32_i16 v5, v1.l +// GFX12: v_cvt_i32_i16_e32 v5, v1.l ; encoding: 
[0x01,0xd5,0x0a,0x7e] -v_cvt_i32_i16 v5, v127 -// GFX12: v_cvt_i32_i16_e32 v5, v127 ; encoding: [0x7f,0xd5,0x0a,0x7e] +v_cvt_i32_i16 v5, v127.l +// GFX12: v_cvt_i32_i16_e32 v5, v127.l ; encoding: [0x7f,0xd5,0x0a,0x7e] v_cvt_i32_i16 v5, s1 // GFX12: v_cvt_i32_i16_e32 v5, s1 ; encoding: [0x01,0xd4,0x0a,0x7e] @@ -1384,6 +1384,12 @@ v_cvt_i32_i16 v5, src_scc v_cvt_i32_i16 v255, 0xfe0b // GFX12: v_cvt_i32_i16_e32 v255, 0xfe0b ; encoding: [0xff,0xd4,0xfe,0x7f,0x0b,0xfe,0x00,0x00] +v_cvt_i32_i16 v5, v1.h +// GFX12: v_cvt_i32_i16_e32 v5, v1.h ; encoding: [0x81,0xd5,0x0a,0x7e] + +v_cvt_i32_i16 v5, v127.h +// GFX12: v_cvt_i32_i16_e32 v5, v127.h ; encoding: [0xff,0xd5,0x0a,0x7e] + v_cvt_nearest_i32_f32 v5, v1 // GFX12: v_cvt_nearest_i32_f32_e32 v5, v1 ; encoding: [0x01,0x19,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s index 7cfc2c1d45285..914cfcbb229a3 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s @@ -970,47 +970,53 @@ v_cvt_i32_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cvt_i32_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cvt_i32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x10,0xfe,0x7f,0xff,0x6f,0x35,0x30] -v_cvt_i32_i16 v5, v1 quad_perm:[3,2,1,0] -// GFX12: v_cvt_i32_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff] +v_cvt_i32_i16 v5, v1.l quad_perm:[3,2,1,0] +// GFX12: v_cvt_i32_i16_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff] -v_cvt_i32_i16 v5, v1 quad_perm:[0,1,2,3] -// GFX12: v_cvt_i32_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff] +v_cvt_i32_i16 v5, v1.l quad_perm:[0,1,2,3] +// GFX12: v_cvt_i32_i16_dpp v5, v1.l quad_perm:[0,1,2,3] 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff] -v_cvt_i32_i16 v5, v1 row_mirror -// GFX12: v_cvt_i32_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff] +v_cvt_i32_i16 v5, v1.l row_mirror +// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff] -v_cvt_i32_i16 v5, v1 row_half_mirror -// GFX12: v_cvt_i32_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff] +v_cvt_i32_i16 v5, v1.l row_half_mirror +// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff] -v_cvt_i32_i16 v5, v1 row_shl:1 -// GFX12: v_cvt_i32_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff] +v_cvt_i32_i16 v5, v1.l row_shl:1 +// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff] -v_cvt_i32_i16 v5, v1 row_shl:15 -// GFX12: v_cvt_i32_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff] +v_cvt_i32_i16 v5, v1.l row_shl:15 +// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff] -v_cvt_i32_i16 v5, v1 row_shr:1 -// GFX12: v_cvt_i32_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff] +v_cvt_i32_i16 v5, v1.l row_shr:1 +// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff] -v_cvt_i32_i16 v5, v1 row_shr:15 -// GFX12: v_cvt_i32_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff] +v_cvt_i32_i16 v5, v1.l row_shr:15 +// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff] -v_cvt_i32_i16 v5, v1 row_ror:1 -// GFX12: v_cvt_i32_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff] +v_cvt_i32_i16 v5, v1.l row_ror:1 +// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff] -v_cvt_i32_i16 v5, v1 row_ror:15 -// GFX12: v_cvt_i32_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff] +v_cvt_i32_i16 v5, v1.l row_ror:15 +// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff] -v_cvt_i32_i16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cvt_i32_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff] +v_cvt_i32_i16 v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff] -v_cvt_i32_i16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cvt_i32_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01] +v_cvt_i32_i16 v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01] -v_cvt_i32_i16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cvt_i32_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x09,0x13] +v_cvt_i32_i16 v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x09,0x13] -v_cvt_i32_i16 v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 
fi:1 -// GFX12: v_cvt_i32_i16_dpp v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x05,0x30] +v_cvt_i32_i16 v255, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cvt_i32_i16_dpp v255, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x05,0x30] + +v_cvt_i32_i16 v5, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cvt_i32_i16_dpp v5, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x81,0x60,0x09,0x13] + +v_cvt_i32_i16 v255, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cvt_i32_i16_dpp v255, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0xff,0x6f,0x05,0x30] v_cvt_nearest_i32_f32 v5, v1 quad_perm:[3,2,1,0] // GFX12: v_cvt_nearest_i32_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s index ddb4029f10208..f1c4e863b1873 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s @@ -244,14 +244,20 @@ v_cvt_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cvt_i32_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cvt_i32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x10,0xfe,0x7f,0xff,0x00,0x00,0x00] -v_cvt_i32_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cvt_i32_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_cvt_i32_i16 v5, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cvt_i32_i16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_cvt_i32_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cvt_i32_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_cvt_i32_i16 
v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cvt_i32_i16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_cvt_i32_i16 v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cvt_i32_i16_dpp v255, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00] +v_cvt_i32_i16 v255, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cvt_i32_i16_dpp v255, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00] + +v_cvt_i32_i16 v5, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cvt_i32_i16_dpp v5, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd4,0x0a,0x7e,0x81,0x77,0x39,0x05] + +v_cvt_i32_i16 v255, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cvt_i32_i16_dpp v255, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd4,0xfe,0x7f,0xff,0x00,0x00,0x00] v_cvt_nearest_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cvt_nearest_i32_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x18,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s index 05a990eed89c8..eb7b86635f35d 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s @@ -251,6 +251,24 @@ v_cvt_i32_i16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_cvt_i32_i16_e32 v5, v199 quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction +v_cvt_i32_i16_e32 v5, v199.h +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_cvt_i32_i16_e32 v5, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_cvt_i32_i16_e32 v5, v199.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_cvt_i32_i16_e32 v5, v199.l +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_cvt_i32_i16_e32 v5, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:23: error: invalid 
operand for instruction + +v_cvt_i32_i16_e32 v5, v199.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction + v_cvt_norm_i16_f16_e32 v128.h, 0xfe0b // GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s index 96de27842c072..2f0c0a1192f2f 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s @@ -622,14 +622,23 @@ v_cvt_i16_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] v_cvt_i16_f16 v5.l, v199.l quad_perm:[3,2,1,0] // GFX12: v_cvt_i16_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd3,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] -v_cvt_i32_i16 v5, v199 -// GFX12: v_cvt_i32_i16_e64 v5, v199 ; encoding: [0x05,0x00,0xea,0xd5,0xc7,0x01,0x00,0x00] +v_cvt_i32_i16 v5, v199.h +// GFX12: v_cvt_i32_i16_e64 v5, v199.h op_sel:[1,0] ; encoding: [0x05,0x08,0xea,0xd5,0xc7,0x01,0x00,0x00] -v_cvt_i32_i16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cvt_i32_i16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] +v_cvt_i32_i16 v5, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cvt_i32_i16_e64_dpp v5, v199.h op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xea,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] -v_cvt_i32_i16 v5, v199 quad_perm:[3,2,1,0] -// GFX12: v_cvt_i32_i16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] +v_cvt_i32_i16 v5, v199.h quad_perm:[3,2,1,0] +// GFX12: v_cvt_i32_i16_e64_dpp v5, v199.h op_sel:[1,0] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x08,0xea,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] + +v_cvt_i32_i16 v5, v199.l +// GFX12: v_cvt_i32_i16_e64 v5, v199.l ; encoding: [0x05,0x00,0xea,0xd5,0xc7,0x01,0x00,0x00] + 
+v_cvt_i32_i16 v5, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cvt_i32_i16_e64_dpp v5, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_cvt_i32_i16 v5, v199.l quad_perm:[3,2,1,0] +// GFX12: v_cvt_i32_i16_e64_dpp v5, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] v_cvt_norm_i16_f16 v128.h, 0xfe0b // GFX12: v_cvt_norm_i16_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xe3,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s index 613a70f46800e..224f7f090a64f 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s @@ -1417,11 +1417,11 @@ v_cvt_i32_f64_e64 v5, -|src_scc| v_cvt_i32_f64_e64 v255, 0xaf123456 clamp // GFX12: v_cvt_i32_f64_e64 v255, 0xaf123456 clamp ; encoding: [0xff,0x80,0x83,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cvt_i32_i16_e64 v5, v1 -// GFX12: v_cvt_i32_i16_e64 v5, v1 ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00] +v_cvt_i32_i16_e64 v5, v1.l +// GFX12: v_cvt_i32_i16_e64 v5, v1.l ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00] -v_cvt_i32_i16_e64 v5, v255 -// GFX12: v_cvt_i32_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00] +v_cvt_i32_i16_e64 v5, v255.l +// GFX12: v_cvt_i32_i16_e64 v5, v255.l ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00] v_cvt_i32_i16_e64 v5, s1 // GFX12: v_cvt_i32_i16_e64 v5, s1 ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x00,0x00,0x00] @@ -1462,6 +1462,9 @@ v_cvt_i32_i16_e64 v5, src_scc v_cvt_i32_i16_e64 v255, 0xfe0b // GFX12: v_cvt_i32_i16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xea,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_cvt_i32_i16_e64 v5, v255.h +// GFX12: v_cvt_i32_i16_e64 v5, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xea,0xd5,0xff,0x01,0x00,0x00] + 
v_cvt_nearest_i32_f32_e64 v5, v1 // GFX12: v_cvt_nearest_i32_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x8c,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s index 2044058566052..0a8ce42e130c3 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s @@ -1000,47 +1000,50 @@ v_cvt_i32_f32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cvt_i32_f32_e64_dpp v255, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cvt_i32_f32_e64_dpp v255, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x88,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x05,0x30] -v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[3,2,1,0] +// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] -// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[0,1,2,3] +// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_mirror -// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l row_mirror +// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_half_mirror -// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l row_half_mirror +// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_shl:1 -// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:1 +// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_shl:15 -// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:15 +// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_shr:1 -// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:1 +// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_shr:15 -// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:15 +// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_ror:1 -// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:1 +// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_ror:15 -// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:15 +// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] +v_cvt_i32_i16_e64_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] -v_cvt_i32_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13] 
+v_cvt_i32_i16_e64_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13] -v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] +v_cvt_i32_i16_e64_dpp v255, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cvt_i32_i16_e64_dpp v255, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] + +v_cvt_i32_i16_e64_dpp v255, v255.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cvt_i32_i16_e64_dpp v255, v255.h op_sel:[1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x08,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] v_cvt_nearest_i32_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX12: v_cvt_nearest_i32_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x8c,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s index b0283c2df7169..930f8f8d56957 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s @@ -316,14 +316,17 @@ v_cvt_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cvt_i32_f32_e64_dpp v255, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cvt_i32_f32_e64_dpp v255, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x88,0xd5,0xe9,0x00,0x00,0x20,0xff,0x00,0x00,0x00] -v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +v_cvt_i32_i16_e64_dpp v255, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cvt_i32_i16_e64_dpp v255, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] + +v_cvt_i32_i16_e64_dpp v255, v255.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cvt_i32_i16_e64_dpp v255, v255.h op_sel:[1,0] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x08,0xea,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] v_cvt_nearest_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cvt_nearest_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x8c,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt index 67b39ee2fdea0..cc3b8fdd9093b 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt @@ -1300,10 +1300,12 @@ # GFX11: v_cvt_i32_f64_e32 v255, 0xaf123456 ; encoding: [0xff,0x06,0xfe,0x7f,0x56,0x34,0x12,0xaf] 0x01,0xd5,0x0a,0x7e -# GFX11: v_cvt_i32_i16_e32 v5, v1 ; encoding: [0x01,0xd5,0x0a,0x7e] +# 
GFX11-REAL16: v_cvt_i32_i16_e32 v5, v1.l ; encoding: [0x01,0xd5,0x0a,0x7e] +# GFX11-FAKE16: v_cvt_i32_i16_e32 v5, v1 ; encoding: [0x01,0xd5,0x0a,0x7e] 0x7f,0xd5,0x0a,0x7e -# GFX11: v_cvt_i32_i16_e32 v5, v127 ; encoding: [0x7f,0xd5,0x0a,0x7e] +# GFX11-REAL16: v_cvt_i32_i16_e32 v5, v127.l ; encoding: [0x7f,0xd5,0x0a,0x7e] +# GFX11-FAKE16: v_cvt_i32_i16_e32 v5, v127 ; encoding: [0x7f,0xd5,0x0a,0x7e] 0x01,0xd4,0x0a,0x7e # GFX11: v_cvt_i32_i16_e32 v5, s1 ; encoding: [0x01,0xd4,0x0a,0x7e] @@ -1344,6 +1346,15 @@ 0xff,0xd4,0xfe,0x7f,0x0b,0xfe,0x00,0x00 # GFX11: v_cvt_i32_i16_e32 v255, 0xfe0b ; encoding: [0xff,0xd4,0xfe,0x7f,0x0b,0xfe,0x00,0x00] +0x81,0xd5,0x0a,0x7e +# GFX11-REAL16: v_cvt_i32_i16_e32 v5, v1.h ; encoding: [0x81,0xd5,0x0a,0x7e] +# GFX11-FAKE16: v_cvt_i32_i16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0x81,0xd5,0x0a,0x7e] + +0xff,0xd5,0x0a,0x7e +# GFX11-REAL16: v_cvt_i32_i16_e32 v5, v127.h ; encoding: [0xff,0xd5,0x0a,0x7e] +# GFX11-FAKE16: v_cvt_i32_i16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0xff,0xd5,0x0a,0x7e] + + 0x01,0x19,0x0a,0x7e # GFX11: v_cvt_nearest_i32_f32_e32 v5, v1 ; encoding: [0x01,0x19,0x0a,0x7e] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt index 55a128f386b7c..ba9e8142942de 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt @@ -971,46 +971,72 @@ # GFX11: v_cvt_i32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x10,0xfe,0x7f,0xff,0x6f,0x3d,0x30] 0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX11: v_cvt_i32_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX11: v_cvt_i32_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX11: v_cvt_i32_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX11: v_cvt_i32_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX11: v_cvt_i32_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff] 
0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX11: v_cvt_i32_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX11: v_cvt_i32_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX11: v_cvt_i32_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX11: v_cvt_i32_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX11: v_cvt_i32_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX11: v_cvt_i32_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX11: v_cvt_i32_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX11: v_cvt_i32_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x0d,0x30 -# GFX11: v_cvt_i32_i16_dpp v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cvt_i32_i16_dpp v255, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x0d,0x30] + +0xfa,0xd4,0x0a,0x7e,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x7f,0x5f,0x01,0x01] + +0xfa,0xd4,0x0a,0x7e,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x81,0x60,0x01,0x13] + +0xfa,0xd4,0xfe,0x7f,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cvt_i32_i16_dpp v255, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v255, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0xff,0x6f,0x0d,0x30] 0xfa,0x18,0x0a,0x7e,0x01,0x1b,0x00,0xff # GFX11: v_cvt_nearest_i32_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt index 0a4d263862407..dda9dfcb35b1a 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt @@ -185,10 +185,24 @@ # GFX11: v_cvt_i32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x10,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX11: v_cvt_i32_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xea,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00 -# GFX11: v_cvt_i32_i16_dpp v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_cvt_i32_i16_dpp v255, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00] + +0xe9,0xd4,0x0a,0x7e,0x7f,0x77,0x39,0x05 +# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x7f,0x77,0x39,0x05] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x7f,0x77,0x39,0x05] + +0xe9,0xd4,0x0a,0x7e,0x81,0x77,0x39,0x05 +# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x81,0x77,0x39,0x05] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x81,0x77,0x39,0x05] + +0xea,0xd4,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cvt_i32_i16_dpp v255, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v255, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0x18,0x0a,0x7e,0x01,0x77,0x39,0x05 # GFX11: v_cvt_nearest_i32_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x18,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt index 4e64ecaa85ecc..0191f37c14e31 100644 --- 
a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt @@ -1025,46 +1025,64 @@ # GFX11: v_cvt_i32_f32_e64_dpp v255, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0x88,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 
0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: 
v_cvt_i32_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_share:0 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01 -# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] 0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30 -# GFX11: v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v255, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] + +0xff,0x08,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30 +# 
GFX11-REAL16: v_cvt_i32_i16_e64_dpp v255, v255.h op_sel:[1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x08,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] 0x05,0x00,0x8c,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff # GFX11: v_cvt_nearest_i32_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x8c,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt index 1d9edc8535d60..ab3788deeed3d 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt @@ -291,10 +291,16 @@ # GFX11: v_cvt_i32_f32_e64_dpp v255, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0x88,0xd5,0xea,0x00,0x00,0x20,0xff,0x00,0x00,0x00] 0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00 -# GFX11: v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v255, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: 
v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] + +0xff,0x08,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v255, v255.h op_sel:[1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x08,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] 0x05,0x00,0x8c,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cvt_nearest_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x8c,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt index c3889208779f8..2e741322eb122 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt @@ -1314,10 +1314,12 @@ # GFX11: v_cvt_i32_f64_e64 v255, 0xaf123456 clamp ; encoding: [0xff,0x80,0x83,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] 0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00 -# GFX11: v_cvt_i32_i16_e64 v5, v1 ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00] +# GFX11-REAL16: v_cvt_i32_i16_e64 v5, v1.l ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00] +# GFX11-FAKE16: v_cvt_i32_i16_e64 v5, v1 ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00 -# GFX11: v_cvt_i32_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00] +# GFX11-REAL16: v_cvt_i32_i16_e64 v5, v255.l ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00] +# GFX11-FAKE16: v_cvt_i32_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xea,0xd5,0x01,0x00,0x00,0x00 # GFX11: v_cvt_i32_i16_e64 v5, s1 ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x00,0x00,0x00] @@ -1358,6 +1360,10 @@ 
0xff,0x00,0xea,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX11: v_cvt_i32_i16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xea,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +0x05,0x08,0xea,0xd5,0xff,0x01,0x00,0x00 +# GFX11-REAL16: v_cvt_i32_i16_e64 v5, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xea,0xd5,0xff,0x01,0x00,0x00] +# GFX11-FAKE16: v_cvt_i32_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00] + 0x05,0x00,0x8c,0xd5,0x01,0x01,0x00,0x00 # GFX11: v_cvt_nearest_i32_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x8c,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt index b9e8c46a084f4..4d6e8ffbd9a27 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt @@ -1035,46 +1035,68 @@ # GFX12: v_cvt_i32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x10,0xfe,0x7f,0xff,0x6f,0x3d,0x30] 0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX12: v_cvt_i32_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX12: v_cvt_i32_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff] 
0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX12: v_cvt_i32_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX12: v_cvt_i32_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX12: v_cvt_i32_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX12: v_cvt_i32_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX12: v_cvt_i32_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_shr:1 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX12: v_cvt_i32_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX12: v_cvt_i32_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX12: v_cvt_i32_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX12: v_cvt_i32_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX12: 
v_cvt_i32_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX12: v_cvt_i32_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x0d,0x30 -# GFX12: v_cvt_i32_i16_dpp v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cvt_i32_i16_dpp v255, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x0d,0x30] + +0xfa,0xd4,0x0a,0x7e,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x81,0x60,0x01,0x13] + +0xfa,0xd4,0xfe,0x7f,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cvt_i32_i16_dpp v255, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v255, v255/*Invalid register, operand 
has 'VGPR_32_Lo128' register class*/ row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0xff,0x6f,0x0d,0x30] 0xfa,0x18,0x0a,0x7e,0x01,0x1b,0x00,0xff # GFX12: v_cvt_nearest_i32_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt index 66cdd104850fd..fcc1d3f97dcb1 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt @@ -198,10 +198,20 @@ # GFX12: v_cvt_i32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x10,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX12: v_cvt_i32_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xea,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00 -# GFX12: v_cvt_i32_i16_dpp v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cvt_i32_i16_dpp v255, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00] + +0xe9,0xd4,0x0a,0x7e,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x81,0x77,0x39,0x05] + +0xea,0xd4,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX12-REAL16: 
v_cvt_i32_i16_dpp v255, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v255, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0x18,0x0a,0x7e,0x01,0x77,0x39,0x05 # GFX12: v_cvt_nearest_i32_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x18,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt index 6c98db919a9d7..dad6b502e0bd0 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt @@ -1366,10 +1366,12 @@ # GFX12: v_cvt_i32_f64_e64 v255, 0xaf123456 clamp ; encoding: [0xff,0x80,0x83,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] 0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00 -# GFX12: v_cvt_i32_i16_e64 v5, v1 ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00] +# GFX12-REAL16: v_cvt_i32_i16_e64 v5, v1.l ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_cvt_i32_i16_e64 v5, v1 ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00 -# GFX12: v_cvt_i32_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00] +# GFX12-REAL16: v_cvt_i32_i16_e64 v5, v255.l ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_cvt_i32_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xea,0xd5,0x01,0x00,0x00,0x00 # GFX12: v_cvt_i32_i16_e64 v5, s1 ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x00,0x00,0x00] @@ -1410,6 +1412,10 @@ 0xff,0x00,0xea,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_cvt_i32_i16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xea,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +0x05,0x08,0xea,0xd5,0xff,0x01,0x00,0x00 +# GFX12-REAL16: v_cvt_i32_i16_e64 v5, v255.h 
op_sel:[1,0] ; encoding: [0x05,0x08,0xea,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_cvt_i32_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00] + 0x05,0x00,0x8c,0xd5,0x01,0x01,0x00,0x00 # GFX12: v_cvt_nearest_i32_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x8c,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt index 829e21f9b4b99..ccf5f4b21b73c 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt @@ -1055,46 +1055,64 @@ # GFX12: v_cvt_i32_f32_e64_dpp v255, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0x88,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 
0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: 
v_cvt_i32_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:15 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01 -# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] 0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30 -# GFX12: v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 
bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v255, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] + +0xff,0x08,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v255, v255.h op_sel:[1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x08,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] 0x05,0x00,0x8c,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cvt_nearest_i32_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x8c,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt index c22c8745d86c2..8018f80798573 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt @@ -321,10 +321,16 @@ # GFX12: v_cvt_i32_f32_e64_dpp v255, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0x88,0xd5,0xea,0x00,0x00,0x20,0xff,0x00,0x00,0x00] 0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 
+# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00 -# GFX12: v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v255, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] + +0xff,0x08,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v255, v255.h op_sel:[1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x08,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] 0x05,0x00,0x8c,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cvt_nearest_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x8c,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] From bf274b3d8044cab8478bef50ccf96313e4dbf21e Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Fri, 3 Jan 2025 15:46:41 -0500 Subject: [PATCH 400/567] [AMDGPU][True16][MC] true16 for v_cos_f16 (#120639) Support true16 format for v_cos_f16 in MC --- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 2 +- llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll | 32 ++++ llvm/test/MC/AMDGPU/gfx11_asm_vop1.s | 75 +++++---- llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s | 65 ++++---- llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s | 21 ++- llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s | 42 +++++ .../MC/AMDGPU/gfx11_asm_vop1_t16_promote.s | 154 +++++++++++++----- .../AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s | 65 ++++---- .../MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s | 25 ++- 
.../test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s | 69 ++++---- llvm/test/MC/AMDGPU/gfx12_asm_vop1.s | 72 ++++---- llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s | 62 +++---- llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s | 18 +- llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s | 42 +++++ .../MC/AMDGPU/gfx12_asm_vop1_t16_promote.s | 154 +++++++++++++----- .../test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s | 69 ++++---- .../AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s | 65 ++++---- .../MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s | 25 ++- .../Disassembler/AMDGPU/gfx11_dasm_vop1.txt | 63 +++++-- .../AMDGPU/gfx11_dasm_vop1_dpp16.txt | 54 ++++-- .../AMDGPU/gfx11_dasm_vop1_dpp8.txt | 17 +- .../gfx11_dasm_vop3_dpp16_from_vop1.txt | 54 ++++-- .../AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt | 24 ++- .../AMDGPU/gfx11_dasm_vop3_from_vop1.txt | 57 +++++-- .../AMDGPU/gfx12_dasm_vop1_dpp16.txt | 50 ++++-- .../AMDGPU/gfx12_dasm_vop1_dpp8.txt | 13 +- .../AMDGPU/gfx12_dasm_vop3_from_vop1.txt | 57 +++++-- .../gfx12_dasm_vop3_from_vop1_dpp16.txt | 54 ++++-- .../AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt | 24 ++- 29 files changed, 1062 insertions(+), 462 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index e1d97bd9f6399..fc22b539d7153 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -1046,7 +1046,7 @@ defm V_TRUNC_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05d, defm V_RNDNE_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05e, "v_rndne_f16">; defm V_FRACT_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05f, "v_fract_f16">; defm V_SIN_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x060, "v_sin_f16">; -defm V_COS_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x061, "v_cos_f16">; +defm V_COS_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x061, "v_cos_f16">; defm V_SAT_PK_U8_I16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x062, "v_sat_pk_u8_i16">; defm V_CVT_NORM_I16_F16 : 
VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x063, "v_cvt_norm_i16_f16">; defm V_CVT_NORM_U16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x064, "v_cvt_norm_u16_f16">; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll index 3ff759a5cdb94..867025adca944 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll @@ -4,6 +4,7 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX12 %s define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX6-LABEL: cos_f16: @@ -80,6 +81,19 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX11-NEXT: v_cos_f16_e32 v1, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: cos_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cos_f16_e32 v1, v1 +; GFX12-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm %a.val = load half, ptr addrspace(1) %a %r.val = call half @llvm.cos.f16(half %a.val) store half %r.val, ptr addrspace(1) %r @@ -188,6 +202,24 @@ define amdgpu_kernel void @cos_v2f16(ptr 
addrspace(1) %r, ptr addrspace(1) %a) { ; GFX11-NEXT: v_pack_b32_f16 v1, v1, v2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: cos_v2f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX12-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_mul_f16_e32 v2, 0.15915494, v2 +; GFX12-NEXT: v_cos_f16_e32 v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX12-NEXT: v_cos_f16_e32 v2, v2 +; GFX12-NEXT: v_pack_b32_f16 v1, v1, v2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm %a.val = load <2 x half>, ptr addrspace(1) %a %r.val = call <2 x half> @llvm.cos.v2f16(<2 x half> %a.val) store <2 x half> %r.val, ptr addrspace(1) %r diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s index ace776f789eba..40a6e434b438d 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s @@ -269,50 +269,65 @@ v_clz_i32_u32 v5, src_scc v_clz_i32_u32 v255, 0xaf123456 // GFX11: v_clz_i32_u32_e32 v255, 0xaf123456 ; encoding: [0xff,0x72,0xfe,0x7f,0x56,0x34,0x12,0xaf] -v_cos_f16 v5, v1 -// GFX11: v_cos_f16_e32 v5, v1 ; encoding: [0x01,0xc3,0x0a,0x7e] +v_cos_f16 v5.l, v1.l +// GFX11: v_cos_f16_e32 v5.l, v1.l ; encoding: [0x01,0xc3,0x0a,0x7e] -v_cos_f16 v5, v127 -// GFX11: v_cos_f16_e32 v5, v127 ; encoding: [0x7f,0xc3,0x0a,0x7e] +v_cos_f16 v5.l, v127.l +// GFX11: v_cos_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xc3,0x0a,0x7e] -v_cos_f16 v5, s1 -// GFX11: v_cos_f16_e32 v5, s1 ; encoding: [0x01,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, s1 +// GFX11: v_cos_f16_e32 v5.l, s1 ; encoding: [0x01,0xc2,0x0a,0x7e] -v_cos_f16 v5, s105 -// 
GFX11: v_cos_f16_e32 v5, s105 ; encoding: [0x69,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, s105 +// GFX11: v_cos_f16_e32 v5.l, s105 ; encoding: [0x69,0xc2,0x0a,0x7e] -v_cos_f16 v5, vcc_lo -// GFX11: v_cos_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, vcc_lo +// GFX11: v_cos_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xc2,0x0a,0x7e] -v_cos_f16 v5, vcc_hi -// GFX11: v_cos_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, vcc_hi +// GFX11: v_cos_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xc2,0x0a,0x7e] -v_cos_f16 v5, ttmp15 -// GFX11: v_cos_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, ttmp15 +// GFX11: v_cos_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xc2,0x0a,0x7e] -v_cos_f16 v5, m0 -// GFX11: v_cos_f16_e32 v5, m0 ; encoding: [0x7d,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, m0 +// GFX11: v_cos_f16_e32 v5.l, m0 ; encoding: [0x7d,0xc2,0x0a,0x7e] -v_cos_f16 v5, exec_lo -// GFX11: v_cos_f16_e32 v5, exec_lo ; encoding: [0x7e,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, exec_lo +// GFX11: v_cos_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xc2,0x0a,0x7e] -v_cos_f16 v5, exec_hi -// GFX11: v_cos_f16_e32 v5, exec_hi ; encoding: [0x7f,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, exec_hi +// GFX11: v_cos_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xc2,0x0a,0x7e] -v_cos_f16 v5, null -// GFX11: v_cos_f16_e32 v5, null ; encoding: [0x7c,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, null +// GFX11: v_cos_f16_e32 v5.l, null ; encoding: [0x7c,0xc2,0x0a,0x7e] -v_cos_f16 v5, -1 -// GFX11: v_cos_f16_e32 v5, -1 ; encoding: [0xc1,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, -1 +// GFX11: v_cos_f16_e32 v5.l, -1 ; encoding: [0xc1,0xc2,0x0a,0x7e] -v_cos_f16 v5, 0.5 -// GFX11: v_cos_f16_e32 v5, 0.5 ; encoding: [0xf0,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, 0.5 +// GFX11: v_cos_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xc2,0x0a,0x7e] -v_cos_f16 v5, src_scc -// GFX11: v_cos_f16_e32 v5, src_scc ; encoding: [0xfd,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, src_scc +// GFX11: v_cos_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xc2,0x0a,0x7e] 
-v_cos_f16 v127, 0xfe0b -// GFX11: v_cos_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +v_cos_f16 v127.l, 0xfe0b +// GFX11: v_cos_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +v_cos_f16 v5.l, v1.h +// GFX11: v_cos_f16_e32 v5.l, v1.h ; encoding: [0x81,0xc3,0x0a,0x7e] + +v_cos_f16 v5.l, v127.h +// GFX11: v_cos_f16_e32 v5.l, v127.h ; encoding: [0xff,0xc3,0x0a,0x7e] + +v_cos_f16 v127.l, 0.5 +// GFX11: v_cos_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xc2,0xfe,0x7e] + +v_cos_f16 v5.h, src_scc +// GFX11: v_cos_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xc2,0x0a,0x7f] + +v_cos_f16 v127.h, 0xfe0b +// GFX11: v_cos_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xc2,0xfe,0x7f,0x0b,0xfe,0x00,0x00] v_cos_f32 v5, v1 // GFX11: v_cos_f32_e32 v5, v1 ; encoding: [0x01,0x6d,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s index 93c120ac59477..706cb6e32f88a 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s @@ -212,47 +212,56 @@ v_clz_i32_u32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_clz_i32_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_clz_i32_u32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x72,0xfe,0x7f,0xff,0x6f,0x05,0x30] -v_cos_f16 v5, v1 quad_perm:[3,2,1,0] -// GFX11: v_cos_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff] +v_cos_f16 v5.l, v1.l quad_perm:[3,2,1,0] +// GFX11: v_cos_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff] -v_cos_f16 v5, v1 quad_perm:[0,1,2,3] -// GFX11: v_cos_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff] +v_cos_f16 v5.l, v1.l quad_perm:[0,1,2,3] +// GFX11: v_cos_f16_dpp v5.l, v1.l 
quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff] -v_cos_f16 v5, v1 row_mirror -// GFX11: v_cos_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff] +v_cos_f16 v5.l, v1.l row_mirror +// GFX11: v_cos_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff] -v_cos_f16 v5, v1 row_half_mirror -// GFX11: v_cos_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff] +v_cos_f16 v5.l, v1.l row_half_mirror +// GFX11: v_cos_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff] -v_cos_f16 v5, v1 row_shl:1 -// GFX11: v_cos_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff] +v_cos_f16 v5.l, v1.l row_shl:1 +// GFX11: v_cos_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff] -v_cos_f16 v5, v1 row_shl:15 -// GFX11: v_cos_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff] +v_cos_f16 v5.l, v1.l row_shl:15 +// GFX11: v_cos_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff] -v_cos_f16 v5, v1 row_shr:1 -// GFX11: v_cos_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff] +v_cos_f16 v5.l, v1.l row_shr:1 +// GFX11: v_cos_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff] -v_cos_f16 v5, v1 row_shr:15 -// GFX11: v_cos_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff] +v_cos_f16 v5.l, v1.l row_shr:15 +// GFX11: v_cos_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff] 
-v_cos_f16 v5, v1 row_ror:1 -// GFX11: v_cos_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff] +v_cos_f16 v5.l, v1.l row_ror:1 +// GFX11: v_cos_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff] -v_cos_f16 v5, v1 row_ror:15 -// GFX11: v_cos_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff] +v_cos_f16 v5.l, v1.l row_ror:15 +// GFX11: v_cos_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff] -v_cos_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cos_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff] +v_cos_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cos_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff] -v_cos_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cos_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01] +v_cos_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cos_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01] -v_cos_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cos_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x09,0x13] +v_cos_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX11: v_cos_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x09,0x13] -v_cos_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cos_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: 
[0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +v_cos_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX11: v_cos_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x35,0x30] + +v_cos_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cos_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x5f,0x01,0x01] + +v_cos_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cos_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc2,0x0a,0x7f,0x81,0x60,0x09,0x13] + +v_cos_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cos_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7f,0xff,0x6f,0x35,0x30] v_cos_f32 v5, v1 quad_perm:[3,2,1,0] // GFX11: v_cos_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s index 2029baee77df9..d7051aff42d77 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s @@ -50,14 +50,23 @@ v_clz_i32_u32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_clz_i32_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_clz_i32_u32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x72,0xfe,0x7f,0xff,0x00,0x00,0x00] -v_cos_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cos_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_cos_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cos_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_cos_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cos_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: 
[0xea,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_cos_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cos_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_cos_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cos_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +v_cos_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX11: v_cos_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +v_cos_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cos_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0xfe,0x7e,0x7f,0x77,0x39,0x05] + +v_cos_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cos_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc2,0x0a,0x7f,0x81,0x77,0x39,0x05] + +v_cos_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cos_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc2,0xfe,0x7f,0xff,0x00,0x00,0x00] v_cos_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cos_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6c,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s index 936cce46f2ebc..263ad4bf513a1 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s @@ -47,6 +47,12 @@ v_ceil_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0] v_cos_f16_e32 v128, 0xfe0b // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cos_f16_e32 v128.h, 0xfe0b +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_cos_f16_e32 v128.l, 0xfe0b +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + v_cos_f16_e32 v255, v1 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -56,6 +62,24 @@ v_cos_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0] v_cos_f16_e32 
v255, v1 quad_perm:[3,2,1,0] // GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction +v_cos_f16_e32 v255.h, v1.h +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_cos_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_cos_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_cos_f16_e32 v255.l, v1.l +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_cos_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_cos_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + v_cos_f16_e32 v5, v199 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -65,6 +89,24 @@ v_cos_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_cos_f16_e32 v5, v199 quad_perm:[3,2,1,0] // GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction +v_cos_f16_e32 v5.h, v199.h +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_cos_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_cos_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_cos_f16_e32 v5.l, v199.l +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_cos_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_cos_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + v_cvt_f16_f32_e32 v128, 0xaf123456 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s index 1c8d7e43be081..42c36538f2bf6 100644 --- 
a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s @@ -68,71 +68,137 @@ v_ceil_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_ceil_f16 v5, v199 quad_perm:[3,2,1,0] // GFX11: v_ceil_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] -v_cos_f16 v128, 0xfe0b -// GFX11: v_cos_f16_e64 v128, 0xfe0b ; encoding: [0x80,0x00,0xe1,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_cos_f16 v128.h, 0xfe0b +// GFX11: v_cos_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xe1,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_cos_f16 v255, -1 -// GFX11: v_cos_f16_e64 v255, -1 ; encoding: [0xff,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00] +v_cos_f16 v128.l, 0xfe0b +// GFX11: v_cos_f16_e64 v128.l, 0xfe0b ; encoding: [0x80,0x00,0xe1,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_cos_f16 v255, 0.5 -// GFX11: v_cos_f16_e64 v255, 0.5 ; encoding: [0xff,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x00] +v_cos_f16 v255.h, -1 +// GFX11: v_cos_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0xc1,0x00,0x00,0x00] -v_cos_f16 v255, exec_hi -// GFX11: v_cos_f16_e64 v255, exec_hi ; encoding: [0xff,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00] +v_cos_f16 v255.h, 0.5 +// GFX11: v_cos_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0xf0,0x00,0x00,0x00] -v_cos_f16 v255, exec_lo -// GFX11: v_cos_f16_e64 v255, exec_lo ; encoding: [0xff,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00] +v_cos_f16 v255.h, exec_hi +// GFX11: v_cos_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x7f,0x00,0x00,0x00] -v_cos_f16 v255, m0 -// GFX11: v_cos_f16_e64 v255, m0 ; encoding: [0xff,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00] +v_cos_f16 v255.h, exec_lo +// GFX11: v_cos_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x7e,0x00,0x00,0x00] -v_cos_f16 v255, null -// GFX11: v_cos_f16_e64 v255, null ; encoding: [0xff,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00] 
+v_cos_f16 v255.h, m0 +// GFX11: v_cos_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x7d,0x00,0x00,0x00] -v_cos_f16 v255, s1 -// GFX11: v_cos_f16_e64 v255, s1 ; encoding: [0xff,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00] +v_cos_f16 v255.h, null +// GFX11: v_cos_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x7c,0x00,0x00,0x00] -v_cos_f16 v255, s105 -// GFX11: v_cos_f16_e64 v255, s105 ; encoding: [0xff,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00] +v_cos_f16 v255.h, s1 +// GFX11: v_cos_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x01,0x00,0x00,0x00] -v_cos_f16 v255, src_scc -// GFX11: v_cos_f16_e64 v255, src_scc ; encoding: [0xff,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x00] +v_cos_f16 v255.h, s105 +// GFX11: v_cos_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x69,0x00,0x00,0x00] -v_cos_f16 v255, ttmp15 -// GFX11: v_cos_f16_e64 v255, ttmp15 ; encoding: [0xff,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00] +v_cos_f16 v255.h, src_scc +// GFX11: v_cos_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0xfd,0x00,0x00,0x00] -v_cos_f16 v255, v1 -// GFX11: v_cos_f16_e64 v255, v1 ; encoding: [0xff,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00] +v_cos_f16 v255.h, ttmp15 +// GFX11: v_cos_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x7b,0x00,0x00,0x00] -v_cos_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cos_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_cos_f16 v255.h, v1.h +// GFX11: v_cos_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe1,0xd5,0x01,0x01,0x00,0x00] -v_cos_f16 v255, v1 quad_perm:[3,2,1,0] -// GFX11: v_cos_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cos_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cos_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0xff,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_cos_f16 v255, v127 -// GFX11: v_cos_f16_e64 v255, v127 ; encoding: [0xff,0x00,0xe1,0xd5,0x7f,0x01,0x00,0x00] +v_cos_f16 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX11: v_cos_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_cos_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cos_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] +v_cos_f16 v255.h, v127.h +// GFX11: v_cos_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe1,0xd5,0x7f,0x01,0x00,0x00] -v_cos_f16 v255, v127 quad_perm:[3,2,1,0] -// GFX11: v_cos_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] +v_cos_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cos_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] -v_cos_f16 v255, vcc_hi -// GFX11: v_cos_f16_e64 v255, vcc_hi ; encoding: [0xff,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00] +v_cos_f16 v255.h, v127.h quad_perm:[3,2,1,0] +// GFX11: v_cos_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] -v_cos_f16 v255, vcc_lo -// GFX11: v_cos_f16_e64 v255, vcc_lo ; encoding: [0xff,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00] +v_cos_f16 v255.h, vcc_hi +// GFX11: v_cos_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x6b,0x00,0x00,0x00] -v_cos_f16 v5, v199 -// GFX11: v_cos_f16_e64 v5, v199 ; encoding: [0x05,0x00,0xe1,0xd5,0xc7,0x01,0x00,0x00] +v_cos_f16 v255.h, vcc_lo +// GFX11: v_cos_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x6a,0x00,0x00,0x00] -v_cos_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: 
v_cos_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] +v_cos_f16 v255.l, -1 +// GFX11: v_cos_f16_e64 v255.l, -1 ; encoding: [0xff,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00] -v_cos_f16 v5, v199 quad_perm:[3,2,1,0] -// GFX11: v_cos_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] +v_cos_f16 v255.l, 0.5 +// GFX11: v_cos_f16_e64 v255.l, 0.5 ; encoding: [0xff,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x00] + +v_cos_f16 v255.l, exec_hi +// GFX11: v_cos_f16_e64 v255.l, exec_hi ; encoding: [0xff,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00] + +v_cos_f16 v255.l, exec_lo +// GFX11: v_cos_f16_e64 v255.l, exec_lo ; encoding: [0xff,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00] + +v_cos_f16 v255.l, m0 +// GFX11: v_cos_f16_e64 v255.l, m0 ; encoding: [0xff,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00] + +v_cos_f16 v255.l, null +// GFX11: v_cos_f16_e64 v255.l, null ; encoding: [0xff,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00] + +v_cos_f16 v255.l, s1 +// GFX11: v_cos_f16_e64 v255.l, s1 ; encoding: [0xff,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00] + +v_cos_f16 v255.l, s105 +// GFX11: v_cos_f16_e64 v255.l, s105 ; encoding: [0xff,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00] + +v_cos_f16 v255.l, src_scc +// GFX11: v_cos_f16_e64 v255.l, src_scc ; encoding: [0xff,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x00] + +v_cos_f16 v255.l, ttmp15 +// GFX11: v_cos_f16_e64 v255.l, ttmp15 ; encoding: [0xff,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00] + +v_cos_f16 v255.l, v1.l +// GFX11: v_cos_f16_e64 v255.l, v1.l ; encoding: [0xff,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00] + +v_cos_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cos_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_cos_f16 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX11: v_cos_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0xff,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_cos_f16 v255.l, v127.l +// GFX11: v_cos_f16_e64 v255.l, v127.l ; encoding: [0xff,0x00,0xe1,0xd5,0x7f,0x01,0x00,0x00] + +v_cos_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cos_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] + +v_cos_f16 v255.l, v127.l quad_perm:[3,2,1,0] +// GFX11: v_cos_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] + +v_cos_f16 v255.l, vcc_hi +// GFX11: v_cos_f16_e64 v255.l, vcc_hi ; encoding: [0xff,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00] + +v_cos_f16 v255.l, vcc_lo +// GFX11: v_cos_f16_e64 v255.l, vcc_lo ; encoding: [0xff,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00] + +v_cos_f16 v5.h, v199.h +// GFX11: v_cos_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe1,0xd5,0xc7,0x01,0x00,0x00] + +v_cos_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cos_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_cos_f16 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX11: v_cos_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] + +v_cos_f16 v5.l, v199.l +// GFX11: v_cos_f16_e64 v5.l, v199.l ; encoding: [0x05,0x00,0xe1,0xd5,0xc7,0x01,0x00,0x00] + +v_cos_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cos_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_cos_f16 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX11: v_cos_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] v_cvt_f16_f32 v128.h, 0xaf123456 // GFX11: v_cvt_f16_f32_e64 v128.h, 0xaf123456 op_sel:[0,1] ; 
encoding: [0x80,0x40,0x8a,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s index 204d87c280525..874fb5bffa0ad 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s @@ -220,47 +220,56 @@ v_clz_i32_u32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_clz_i32_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_clz_i32_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xb9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] -v_cos_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX11: v_cos_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] +// GFX11: v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_cos_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] -// GFX11: v_cos_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] +// GFX11: v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] -v_cos_f16_e64_dpp v5, v1 row_mirror -// GFX11: v_cos_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_cos_f16_e64_dpp v5.l, v1.l row_mirror +// GFX11: v_cos_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] -v_cos_f16_e64_dpp v5, v1 row_half_mirror -// GFX11: v_cos_f16_e64_dpp v5, v1 
row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_cos_f16_e64_dpp v5.l, v1.l row_half_mirror +// GFX11: v_cos_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] -v_cos_f16_e64_dpp v5, v1 row_shl:1 -// GFX11: v_cos_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_cos_f16_e64_dpp v5.l, v1.l row_shl:1 +// GFX11: v_cos_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] -v_cos_f16_e64_dpp v5, v1 row_shl:15 -// GFX11: v_cos_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_cos_f16_e64_dpp v5.l, v1.l row_shl:15 +// GFX11: v_cos_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] -v_cos_f16_e64_dpp v5, v1 row_shr:1 -// GFX11: v_cos_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_cos_f16_e64_dpp v5.l, v1.l row_shr:1 +// GFX11: v_cos_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] -v_cos_f16_e64_dpp v5, v1 row_shr:15 -// GFX11: v_cos_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_cos_f16_e64_dpp v5.l, v1.l row_shr:15 +// GFX11: v_cos_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] -v_cos_f16_e64_dpp v5, v1 row_ror:1 -// GFX11: v_cos_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_cos_f16_e64_dpp v5.l, v1.l row_ror:1 +// GFX11: v_cos_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] -v_cos_f16_e64_dpp v5, v1 row_ror:15 -// GFX11: v_cos_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_cos_f16_e64_dpp v5.l, v1.l row_ror:15 +// GFX11: v_cos_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] -v_cos_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cos_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_cos_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cos_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] -v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +v_cos_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cos_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +v_cos_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cos_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: 
[0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +v_cos_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: [0x05,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +v_cos_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: [0x05,0x08,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +v_cos_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: [0xff,0xc1,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] v_cos_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX11: v_cos_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s index d779b65bc0ba9..8e6783e0f413c 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s @@ -61,17 +61,26 @@ v_clz_i32_u32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_clz_i32_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_clz_i32_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xb9,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] -v_cos_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cos_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_cos_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cos_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_cos_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cos_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +v_cos_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cos_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe1,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +v_cos_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cos_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe1,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xe1,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xe1,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +v_cos_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: [0x05,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +v_cos_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: [0x05,0x08,0xe1,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +v_cos_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: [0xff,0xc1,0xe1,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] v_cos_f32_e64_dpp v5, v1 
dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cos_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb6,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s index 7abc0185d6af6..3f9af472a6372 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s @@ -262,50 +262,59 @@ v_clz_i32_u32_e64 v5, src_scc v_clz_i32_u32_e64 v255, 0xaf123456 // GFX11: v_clz_i32_u32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xb9,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cos_f16_e64 v5, v1 -// GFX11: v_cos_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00] +v_cos_f16_e64 v5.l, v1.l +// GFX11: v_cos_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00] -v_cos_f16_e64 v5, v255 -// GFX11: v_cos_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00] +v_cos_f16_e64 v5.l, v255.l +// GFX11: v_cos_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00] -v_cos_f16_e64 v5, s1 -// GFX11: v_cos_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, s1 +// GFX11: v_cos_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00] -v_cos_f16_e64 v5, s105 -// GFX11: v_cos_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, s105 +// GFX11: v_cos_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00] -v_cos_f16_e64 v5, vcc_lo -// GFX11: v_cos_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, vcc_lo +// GFX11: v_cos_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00] -v_cos_f16_e64 v5, vcc_hi -// GFX11: v_cos_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, vcc_hi +// GFX11: v_cos_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00] 
-v_cos_f16_e64 v5, ttmp15 -// GFX11: v_cos_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, ttmp15 +// GFX11: v_cos_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00] -v_cos_f16_e64 v5, m0 -// GFX11: v_cos_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, m0 +// GFX11: v_cos_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00] -v_cos_f16_e64 v5, exec_lo -// GFX11: v_cos_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, exec_lo +// GFX11: v_cos_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00] -v_cos_f16_e64 v5, exec_hi -// GFX11: v_cos_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, exec_hi +// GFX11: v_cos_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00] -v_cos_f16_e64 v5, null -// GFX11: v_cos_f16_e64 v5, null ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, null +// GFX11: v_cos_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00] -v_cos_f16_e64 v5, -1 -// GFX11: v_cos_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, -1 +// GFX11: v_cos_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00] -v_cos_f16_e64 v5, 0.5 mul:2 -// GFX11: v_cos_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08] +v_cos_f16_e64 v5.l, 0.5 mul:2 +// GFX11: v_cos_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08] -v_cos_f16_e64 v5, src_scc mul:4 -// GFX11: v_cos_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10] +v_cos_f16_e64 v5.l, src_scc mul:4 +// GFX11: v_cos_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10] -v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 -// GFX11: v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 ; 
encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +v_cos_f16_e64 v255.l, -|0xfe0b| clamp div:2 +// GFX11: v_cos_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_cos_f16_e64 v5.h, v1.h +// GFX11: [0x05,0x48,0xe1,0xd5,0x01,0x01,0x00,0x00] + +v_cos_f16_e64 v5.l, v255.h +// GFX11: [0x05,0x08,0xe1,0xd5,0xff,0x01,0x00,0x00] + +v_cos_f16_e64 v255.h, -|0xfe0b| clamp div:2 +// GFX11: [0xff,0xc1,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] v_cos_f32_e64 v5, v1 // GFX11: v_cos_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xb6,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s index 8f517ecdfc84a..6c69f3fb78bc0 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s @@ -265,50 +265,62 @@ v_clz_i32_u32 v5, src_scc v_clz_i32_u32 v255, 0xaf123456 // GFX12: v_clz_i32_u32_e32 v255, 0xaf123456 ; encoding: [0xff,0x72,0xfe,0x7f,0x56,0x34,0x12,0xaf] -v_cos_f16 v5, v1 -// GFX12: v_cos_f16_e32 v5, v1 ; encoding: [0x01,0xc3,0x0a,0x7e] +v_cos_f16 v5.l, v1.l +// GFX12: v_cos_f16_e32 v5.l, v1.l ; encoding: [0x01,0xc3,0x0a,0x7e] -v_cos_f16 v5, v127 -// GFX12: v_cos_f16_e32 v5, v127 ; encoding: [0x7f,0xc3,0x0a,0x7e] +v_cos_f16 v5.l, v127.l +// GFX12: v_cos_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xc3,0x0a,0x7e] -v_cos_f16 v5, s1 -// GFX12: v_cos_f16_e32 v5, s1 ; encoding: [0x01,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, s1 +// GFX12: v_cos_f16_e32 v5.l, s1 ; encoding: [0x01,0xc2,0x0a,0x7e] -v_cos_f16 v5, s105 -// GFX12: v_cos_f16_e32 v5, s105 ; encoding: [0x69,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, s105 +// GFX12: v_cos_f16_e32 v5.l, s105 ; encoding: [0x69,0xc2,0x0a,0x7e] -v_cos_f16 v5, vcc_lo -// GFX12: v_cos_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, vcc_lo +// GFX12: v_cos_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xc2,0x0a,0x7e] -v_cos_f16 v5, vcc_hi -// GFX12: v_cos_f16_e32 v5, vcc_hi ; 
encoding: [0x6b,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, vcc_hi +// GFX12: v_cos_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xc2,0x0a,0x7e] -v_cos_f16 v5, ttmp15 -// GFX12: v_cos_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, ttmp15 +// GFX12: v_cos_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xc2,0x0a,0x7e] -v_cos_f16 v5, m0 -// GFX12: v_cos_f16_e32 v5, m0 ; encoding: [0x7d,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, m0 +// GFX12: v_cos_f16_e32 v5.l, m0 ; encoding: [0x7d,0xc2,0x0a,0x7e] -v_cos_f16 v5, exec_lo -// GFX12: v_cos_f16_e32 v5, exec_lo ; encoding: [0x7e,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, exec_lo +// GFX12: v_cos_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xc2,0x0a,0x7e] -v_cos_f16 v5, exec_hi -// GFX12: v_cos_f16_e32 v5, exec_hi ; encoding: [0x7f,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, exec_hi +// GFX12: v_cos_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xc2,0x0a,0x7e] -v_cos_f16 v5, null -// GFX12: v_cos_f16_e32 v5, null ; encoding: [0x7c,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, null +// GFX12: v_cos_f16_e32 v5.l, null ; encoding: [0x7c,0xc2,0x0a,0x7e] -v_cos_f16 v5, -1 -// GFX12: v_cos_f16_e32 v5, -1 ; encoding: [0xc1,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, -1 +// GFX12: v_cos_f16_e32 v5.l, -1 ; encoding: [0xc1,0xc2,0x0a,0x7e] -v_cos_f16 v5, 0.5 -// GFX12: v_cos_f16_e32 v5, 0.5 ; encoding: [0xf0,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, 0.5 +// GFX12: v_cos_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xc2,0x0a,0x7e] -v_cos_f16 v5, src_scc -// GFX12: v_cos_f16_e32 v5, src_scc ; encoding: [0xfd,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, src_scc +// GFX12: v_cos_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xc2,0x0a,0x7e] -v_cos_f16 v127, 0xfe0b -// GFX12: v_cos_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +v_cos_f16 v127.l, 0xfe0b +// GFX12: v_cos_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +v_cos_f16 v5.l, v1.h +// GFX12: v_cos_f16_e32 v5.l, v1.h ; encoding: [0x81,0xc3,0x0a,0x7e] + +v_cos_f16 v5.l, v127.h +// GFX12: v_cos_f16_e32 v5.l, v127.h ; encoding: 
[0xff,0xc3,0x0a,0x7e] + +v_cos_f16 v5.h, src_scc +// GFX12: v_cos_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xc2,0x0a,0x7f] + +v_cos_f16 v127.h, 0xfe0b +// GFX12: v_cos_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xc2,0xfe,0x7f,0x0b,0xfe,0x00,0x00] v_cos_f32 v5, v1 // GFX12: v_cos_f32_e32 v5, v1 ; encoding: [0x01,0x6d,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s index 914cfcbb229a3..05a5f8bd44b9c 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s @@ -214,47 +214,53 @@ v_clz_i32_u32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_clz_i32_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_clz_i32_u32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x72,0xfe,0x7f,0xff,0x6f,0x05,0x30] -v_cos_f16 v5, v1 quad_perm:[3,2,1,0] -// GFX12: v_cos_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff] +v_cos_f16 v5.l, v1.l quad_perm:[3,2,1,0] +// GFX12: v_cos_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff] -v_cos_f16 v5, v1 quad_perm:[0,1,2,3] -// GFX12: v_cos_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff] +v_cos_f16 v5.l, v1.l quad_perm:[0,1,2,3] +// GFX12: v_cos_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff] -v_cos_f16 v5, v1 row_mirror -// GFX12: v_cos_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff] +v_cos_f16 v5.l, v1.l row_mirror +// GFX12: v_cos_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff] -v_cos_f16 v5, v1 row_half_mirror -// GFX12: v_cos_f16_dpp v5, v1 row_half_mirror row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff] +v_cos_f16 v5.l, v1.l row_half_mirror +// GFX12: v_cos_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff] -v_cos_f16 v5, v1 row_shl:1 -// GFX12: v_cos_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff] +v_cos_f16 v5.l, v1.l row_shl:1 +// GFX12: v_cos_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff] -v_cos_f16 v5, v1 row_shl:15 -// GFX12: v_cos_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff] +v_cos_f16 v5.l, v1.l row_shl:15 +// GFX12: v_cos_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff] -v_cos_f16 v5, v1 row_shr:1 -// GFX12: v_cos_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff] +v_cos_f16 v5.l, v1.l row_shr:1 +// GFX12: v_cos_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff] -v_cos_f16 v5, v1 row_shr:15 -// GFX12: v_cos_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff] +v_cos_f16 v5.l, v1.l row_shr:15 +// GFX12: v_cos_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff] -v_cos_f16 v5, v1 row_ror:1 -// GFX12: v_cos_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff] +v_cos_f16 v5.l, v1.l row_ror:1 +// GFX12: v_cos_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff] -v_cos_f16 v5, v1 row_ror:15 -// GFX12: v_cos_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff] +v_cos_f16 v5.l, v1.l row_ror:15 +// GFX12: 
v_cos_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff] -v_cos_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cos_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff] +v_cos_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cos_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff] -v_cos_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cos_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01] +v_cos_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cos_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01] -v_cos_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cos_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x09,0x13] +v_cos_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cos_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x09,0x13] -v_cos_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cos_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +v_cos_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cos_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x35,0x30] + +v_cos_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cos_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc2,0x0a,0x7f,0x81,0x60,0x09,0x13] + +v_cos_f16 v127.h, 
-|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cos_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7f,0xff,0x6f,0x35,0x30] v_cos_f32 v5, v1 quad_perm:[3,2,1,0] // GFX12: v_cos_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s index f1c4e863b1873..bf03e7f8e518c 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s @@ -49,14 +49,20 @@ v_clz_i32_u32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_clz_i32_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_clz_i32_u32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x72,0xfe,0x7f,0xff,0x00,0x00,0x00] -v_cos_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cos_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_cos_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cos_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_cos_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cos_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_cos_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cos_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_cos_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cos_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +v_cos_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cos_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +v_cos_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cos_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc2,0x0a,0x7f,0x81,0x77,0x39,0x05] + 
+v_cos_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cos_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc2,0xfe,0x7f,0xff,0x00,0x00,0x00] v_cos_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cos_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6c,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s index eb7b86635f35d..f584b69c33ec8 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s @@ -26,6 +26,12 @@ v_ceil_f16_e32 v5, v199 quad_perm:[3,2,1,0] v_cos_f16_e32 v128, 0xfe0b // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cos_f16_e32 v128.h, 0xfe0b +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_cos_f16_e32 v128.l, 0xfe0b +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + v_cos_f16_e32 v255, v1 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -35,6 +41,24 @@ v_cos_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0] v_cos_f16_e32 v255, v1 quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction +v_cos_f16_e32 v255.h, v1.h +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_cos_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_cos_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_cos_f16_e32 v255.l, v1.l +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_cos_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_cos_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + v_cos_f16_e32 v5, v199 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -44,6 +68,24 @@ 
v_cos_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_cos_f16_e32 v5, v199 quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction +v_cos_f16_e32 v5.h, v199.h +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_cos_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_cos_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_cos_f16_e32 v5.l, v199.l +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_cos_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_cos_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + v_cvt_f16_f32_e32 v128.h, 0xaf123456 // GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s index 2f0c0a1192f2f..27e92b7e4f22b 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s @@ -67,71 +67,137 @@ v_ceil_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_ceil_f16 v5, v199 quad_perm:[3,2,1,0] // GFX12: v_ceil_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] -v_cos_f16 v128, 0xfe0b -// GFX12: v_cos_f16_e64 v128, 0xfe0b ; encoding: [0x80,0x00,0xe1,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_cos_f16 v128.h, 0xfe0b +// GFX12: v_cos_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xe1,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_cos_f16 v255, -1 -// GFX12: v_cos_f16_e64 v255, -1 ; encoding: [0xff,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00] +v_cos_f16 v128.l, 0xfe0b +// GFX12: v_cos_f16_e64 v128.l, 0xfe0b ; encoding: [0x80,0x00,0xe1,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] 
-v_cos_f16 v255, 0.5 -// GFX12: v_cos_f16_e64 v255, 0.5 ; encoding: [0xff,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x00] +v_cos_f16 v255.h, -1 +// GFX12: v_cos_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0xc1,0x00,0x00,0x00] -v_cos_f16 v255, exec_hi -// GFX12: v_cos_f16_e64 v255, exec_hi ; encoding: [0xff,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00] +v_cos_f16 v255.h, 0.5 +// GFX12: v_cos_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0xf0,0x00,0x00,0x00] -v_cos_f16 v255, exec_lo -// GFX12: v_cos_f16_e64 v255, exec_lo ; encoding: [0xff,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00] +v_cos_f16 v255.h, exec_hi +// GFX12: v_cos_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x7f,0x00,0x00,0x00] -v_cos_f16 v255, m0 -// GFX12: v_cos_f16_e64 v255, m0 ; encoding: [0xff,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00] +v_cos_f16 v255.h, exec_lo +// GFX12: v_cos_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x7e,0x00,0x00,0x00] -v_cos_f16 v255, null -// GFX12: v_cos_f16_e64 v255, null ; encoding: [0xff,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00] +v_cos_f16 v255.h, m0 +// GFX12: v_cos_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x7d,0x00,0x00,0x00] -v_cos_f16 v255, s1 -// GFX12: v_cos_f16_e64 v255, s1 ; encoding: [0xff,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00] +v_cos_f16 v255.h, null +// GFX12: v_cos_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x7c,0x00,0x00,0x00] -v_cos_f16 v255, s105 -// GFX12: v_cos_f16_e64 v255, s105 ; encoding: [0xff,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00] +v_cos_f16 v255.h, s1 +// GFX12: v_cos_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x01,0x00,0x00,0x00] -v_cos_f16 v255, src_scc -// GFX12: v_cos_f16_e64 v255, src_scc ; encoding: [0xff,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x00] +v_cos_f16 v255.h, s105 +// GFX12: v_cos_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x69,0x00,0x00,0x00] -v_cos_f16 v255, ttmp15 -// GFX12: v_cos_f16_e64 v255, ttmp15 ; 
encoding: [0xff,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00] +v_cos_f16 v255.h, src_scc +// GFX12: v_cos_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0xfd,0x00,0x00,0x00] -v_cos_f16 v255, v1 -// GFX12: v_cos_f16_e64 v255, v1 ; encoding: [0xff,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00] +v_cos_f16 v255.h, ttmp15 +// GFX12: v_cos_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x7b,0x00,0x00,0x00] -v_cos_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cos_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_cos_f16 v255.h, v1.h +// GFX12: v_cos_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe1,0xd5,0x01,0x01,0x00,0x00] -v_cos_f16 v255, v1 quad_perm:[3,2,1,0] -// GFX12: v_cos_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cos_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cos_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_cos_f16 v255, v127 -// GFX12: v_cos_f16_e64 v255, v127 ; encoding: [0xff,0x00,0xe1,0xd5,0x7f,0x01,0x00,0x00] +v_cos_f16 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX12: v_cos_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_cos_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cos_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] +v_cos_f16 v255.h, v127.h +// GFX12: v_cos_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe1,0xd5,0x7f,0x01,0x00,0x00] -v_cos_f16 v255, v127 quad_perm:[3,2,1,0] -// GFX12: v_cos_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] +v_cos_f16 
v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cos_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] -v_cos_f16 v255, vcc_hi -// GFX12: v_cos_f16_e64 v255, vcc_hi ; encoding: [0xff,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00] +v_cos_f16 v255.h, v127.h quad_perm:[3,2,1,0] +// GFX12: v_cos_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] -v_cos_f16 v255, vcc_lo -// GFX12: v_cos_f16_e64 v255, vcc_lo ; encoding: [0xff,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00] +v_cos_f16 v255.h, vcc_hi +// GFX12: v_cos_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x6b,0x00,0x00,0x00] -v_cos_f16 v5, v199 -// GFX12: v_cos_f16_e64 v5, v199 ; encoding: [0x05,0x00,0xe1,0xd5,0xc7,0x01,0x00,0x00] +v_cos_f16 v255.h, vcc_lo +// GFX12: v_cos_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x6a,0x00,0x00,0x00] -v_cos_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cos_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] +v_cos_f16 v255.l, -1 +// GFX12: v_cos_f16_e64 v255.l, -1 ; encoding: [0xff,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00] -v_cos_f16 v5, v199 quad_perm:[3,2,1,0] -// GFX12: v_cos_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] +v_cos_f16 v255.l, 0.5 +// GFX12: v_cos_f16_e64 v255.l, 0.5 ; encoding: [0xff,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x00] + +v_cos_f16 v255.l, exec_hi +// GFX12: v_cos_f16_e64 v255.l, exec_hi ; encoding: [0xff,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00] + +v_cos_f16 v255.l, exec_lo +// GFX12: v_cos_f16_e64 v255.l, exec_lo ; encoding: [0xff,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00] + +v_cos_f16 v255.l, m0 +// GFX12: v_cos_f16_e64 v255.l, m0 ; encoding: [0xff,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00] + +v_cos_f16 
v255.l, null +// GFX12: v_cos_f16_e64 v255.l, null ; encoding: [0xff,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00] + +v_cos_f16 v255.l, s1 +// GFX12: v_cos_f16_e64 v255.l, s1 ; encoding: [0xff,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00] + +v_cos_f16 v255.l, s105 +// GFX12: v_cos_f16_e64 v255.l, s105 ; encoding: [0xff,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00] + +v_cos_f16 v255.l, src_scc +// GFX12: v_cos_f16_e64 v255.l, src_scc ; encoding: [0xff,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x00] + +v_cos_f16 v255.l, ttmp15 +// GFX12: v_cos_f16_e64 v255.l, ttmp15 ; encoding: [0xff,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00] + +v_cos_f16 v255.l, v1.l +// GFX12: v_cos_f16_e64 v255.l, v1.l ; encoding: [0xff,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00] + +v_cos_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cos_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_cos_f16 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX12: v_cos_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_cos_f16 v255.l, v127.l +// GFX12: v_cos_f16_e64 v255.l, v127.l ; encoding: [0xff,0x00,0xe1,0xd5,0x7f,0x01,0x00,0x00] + +v_cos_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cos_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] + +v_cos_f16 v255.l, v127.l quad_perm:[3,2,1,0] +// GFX12: v_cos_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] + +v_cos_f16 v255.l, vcc_hi +// GFX12: v_cos_f16_e64 v255.l, vcc_hi ; encoding: [0xff,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00] + +v_cos_f16 v255.l, vcc_lo +// GFX12: v_cos_f16_e64 v255.l, vcc_lo ; encoding: [0xff,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00] + +v_cos_f16 v5.h, v199.h +// GFX12: v_cos_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe1,0xd5,0xc7,0x01,0x00,0x00] 
+ +v_cos_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cos_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_cos_f16 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX12: v_cos_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] + +v_cos_f16 v5.l, v199.l +// GFX12: v_cos_f16_e64 v5.l, v199.l ; encoding: [0x05,0x00,0xe1,0xd5,0xc7,0x01,0x00,0x00] + +v_cos_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cos_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_cos_f16 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX12: v_cos_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] v_cvt_f16_f32 v128.h, 0xaf123456 // GFX12: v_cvt_f16_f32_e64 v128.h, 0xaf123456 op_sel:[0,1] ; encoding: [0x80,0x40,0x8a,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s index 224f7f090a64f..0ba9874b1a22e 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s @@ -262,50 +262,59 @@ v_clz_i32_u32_e64 v5, src_scc v_clz_i32_u32_e64 v255, 0xaf123456 // GFX12: v_clz_i32_u32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xb9,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cos_f16_e64 v5, v1 -// GFX12: v_cos_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00] +v_cos_f16_e64 v5.l, v1.l +// GFX12: v_cos_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00] -v_cos_f16_e64 v5, v255 -// GFX12: v_cos_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00] +v_cos_f16_e64 v5.l, v255.l +// GFX12: v_cos_f16_e64 v5.l, v255.l ; encoding: 
[0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00] -v_cos_f16_e64 v5, s1 -// GFX12: v_cos_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, s1 +// GFX12: v_cos_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00] -v_cos_f16_e64 v5, s105 -// GFX12: v_cos_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, s105 +// GFX12: v_cos_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00] -v_cos_f16_e64 v5, vcc_lo -// GFX12: v_cos_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, vcc_lo +// GFX12: v_cos_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00] -v_cos_f16_e64 v5, vcc_hi -// GFX12: v_cos_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, vcc_hi +// GFX12: v_cos_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00] -v_cos_f16_e64 v5, ttmp15 -// GFX12: v_cos_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, ttmp15 +// GFX12: v_cos_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00] -v_cos_f16_e64 v5, m0 -// GFX12: v_cos_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, m0 +// GFX12: v_cos_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00] -v_cos_f16_e64 v5, exec_lo -// GFX12: v_cos_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, exec_lo +// GFX12: v_cos_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00] -v_cos_f16_e64 v5, exec_hi -// GFX12: v_cos_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, exec_hi +// GFX12: v_cos_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00] -v_cos_f16_e64 v5, null -// GFX12: v_cos_f16_e64 v5, null ; encoding: 
[0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, null +// GFX12: v_cos_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00] -v_cos_f16_e64 v5, -1 -// GFX12: v_cos_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, -1 +// GFX12: v_cos_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00] -v_cos_f16_e64 v5, 0.5 mul:2 -// GFX12: v_cos_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08] +v_cos_f16_e64 v5.l, 0.5 mul:2 +// GFX12: v_cos_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08] -v_cos_f16_e64 v5, src_scc mul:4 -// GFX12: v_cos_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10] +v_cos_f16_e64 v5.l, src_scc mul:4 +// GFX12: v_cos_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10] -v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 -// GFX12: v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +v_cos_f16_e64 v255.l, -|0xfe0b| clamp div:2 +// GFX12: v_cos_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_cos_f16_e64 v5.h, v1.h +// GFX12: v_cos_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe1,0xd5,0x01,0x01,0x00,0x00] + +v_cos_f16_e64 v5.l, v255.h +// GFX12: v_cos_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xe1,0xd5,0xff,0x01,0x00,0x00] + +v_cos_f16_e64 v255.h, -|0xfe0b| clamp div:2 +// GFX12: v_cos_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] v_cos_f32_e64 v5, v1 // GFX12: v_cos_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xb6,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s index 0a8ce42e130c3..197f02719905d 100644 --- 
a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s @@ -211,47 +211,56 @@ v_clz_i32_u32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_clz_i32_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_clz_i32_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xb9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] -v_cos_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX12: v_cos_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] +// GFX12: v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_cos_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] -// GFX12: v_cos_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] +// GFX12: v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] -v_cos_f16_e64_dpp v5, v1 row_mirror -// GFX12: v_cos_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_cos_f16_e64_dpp v5.l, v1.l row_mirror +// GFX12: v_cos_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] -v_cos_f16_e64_dpp v5, v1 row_half_mirror -// GFX12: v_cos_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_cos_f16_e64_dpp v5.l, v1.l row_half_mirror +// GFX12: v_cos_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf 
; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] -v_cos_f16_e64_dpp v5, v1 row_shl:1 -// GFX12: v_cos_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_cos_f16_e64_dpp v5.l, v1.l row_shl:1 +// GFX12: v_cos_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] -v_cos_f16_e64_dpp v5, v1 row_shl:15 -// GFX12: v_cos_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_cos_f16_e64_dpp v5.l, v1.l row_shl:15 +// GFX12: v_cos_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] -v_cos_f16_e64_dpp v5, v1 row_shr:1 -// GFX12: v_cos_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_cos_f16_e64_dpp v5.l, v1.l row_shr:1 +// GFX12: v_cos_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] -v_cos_f16_e64_dpp v5, v1 row_shr:15 -// GFX12: v_cos_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_cos_f16_e64_dpp v5.l, v1.l row_shr:15 +// GFX12: v_cos_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] -v_cos_f16_e64_dpp v5, v1 row_ror:1 -// GFX12: v_cos_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_cos_f16_e64_dpp v5.l, v1.l row_ror:1 +// GFX12: v_cos_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] -v_cos_f16_e64_dpp v5, v1 row_ror:15 -// GFX12: 
v_cos_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_cos_f16_e64_dpp v5.l, v1.l row_ror:15 +// GFX12: v_cos_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] -v_cos_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cos_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_cos_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cos_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] -v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +v_cos_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cos_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +v_cos_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cos_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: 
[0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +v_cos_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cos_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +v_cos_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cos_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x08,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +v_cos_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cos_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc1,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] v_cos_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX12: v_cos_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s index 930f8f8d56957..0dfc47b4e4020 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s @@ -52,17 +52,26 @@ v_clz_i32_u32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_clz_i32_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_clz_i32_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xb9,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] -v_cos_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: 
v_cos_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_cos_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cos_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_cos_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cos_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +v_cos_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cos_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe1,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +v_cos_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cos_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe1,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xe1,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xe1,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +v_cos_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cos_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +v_cos_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cos_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: 
[0x05,0x08,0xe1,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +v_cos_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cos_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc1,0xe1,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] v_cos_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cos_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb6,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt index cc3b8fdd9093b..8d86bafca059f 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt @@ -287,49 +287,82 @@ # GFX11: v_clz_i32_u32_e32 v255, 0xaf123456 ; encoding: [0xff,0x72,0xfe,0x7f,0x56,0x34,0x12,0xaf] 0x01,0xc3,0x0a,0x7e -# GFX11: v_cos_f16_e32 v5, v1 ; encoding: [0x01,0xc3,0x0a,0x7e] +# GFX11-REAL16: v_cos_f16_e32 v5.l, v1.l ; encoding: [0x01,0xc3,0x0a,0x7e] +# GFX11-FAKE16: v_cos_f16_e32 v5, v1 ; encoding: [0x01,0xc3,0x0a,0x7e] 0x7f,0xc3,0x0a,0x7e -# GFX11: v_cos_f16_e32 v5, v127 ; encoding: [0x7f,0xc3,0x0a,0x7e] +# GFX11-REAL16: v_cos_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xc3,0x0a,0x7e] +# GFX11-FAKE16: v_cos_f16_e32 v5, v127 ; encoding: [0x7f,0xc3,0x0a,0x7e] 0x01,0xc2,0x0a,0x7e -# GFX11: v_cos_f16_e32 v5, s1 ; encoding: [0x01,0xc2,0x0a,0x7e] +# GFX11-REAL16: v_cos_f16_e32 v5.l, s1 ; encoding: [0x01,0xc2,0x0a,0x7e] +# GFX11-FAKE16: v_cos_f16_e32 v5, s1 ; encoding: [0x01,0xc2,0x0a,0x7e] 0x69,0xc2,0x0a,0x7e -# GFX11: v_cos_f16_e32 v5, s105 ; encoding: [0x69,0xc2,0x0a,0x7e] +# GFX11-REAL16: v_cos_f16_e32 v5.l, s105 ; encoding: [0x69,0xc2,0x0a,0x7e] +# GFX11-FAKE16: v_cos_f16_e32 v5, s105 ; encoding: [0x69,0xc2,0x0a,0x7e] 0x6a,0xc2,0x0a,0x7e -# GFX11: v_cos_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xc2,0x0a,0x7e] +# GFX11-REAL16: v_cos_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xc2,0x0a,0x7e] 
+# GFX11-FAKE16: v_cos_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xc2,0x0a,0x7e] 0x6b,0xc2,0x0a,0x7e -# GFX11: v_cos_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xc2,0x0a,0x7e] +# GFX11-REAL16: v_cos_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xc2,0x0a,0x7e] +# GFX11-FAKE16: v_cos_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xc2,0x0a,0x7e] 0x7b,0xc2,0x0a,0x7e -# GFX11: v_cos_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xc2,0x0a,0x7e] +# GFX11-REAL16: v_cos_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xc2,0x0a,0x7e] +# GFX11-FAKE16: v_cos_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xc2,0x0a,0x7e] 0x7d,0xc2,0x0a,0x7e -# GFX11: v_cos_f16_e32 v5, m0 ; encoding: [0x7d,0xc2,0x0a,0x7e] +# GFX11-REAL16: v_cos_f16_e32 v5.l, m0 ; encoding: [0x7d,0xc2,0x0a,0x7e] +# GFX11-FAKE16: v_cos_f16_e32 v5, m0 ; encoding: [0x7d,0xc2,0x0a,0x7e] 0x7e,0xc2,0x0a,0x7e -# GFX11: v_cos_f16_e32 v5, exec_lo ; encoding: [0x7e,0xc2,0x0a,0x7e] +# GFX11-REAL16: v_cos_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xc2,0x0a,0x7e] +# GFX11-FAKE16: v_cos_f16_e32 v5, exec_lo ; encoding: [0x7e,0xc2,0x0a,0x7e] 0x7f,0xc2,0x0a,0x7e -# GFX11: v_cos_f16_e32 v5, exec_hi ; encoding: [0x7f,0xc2,0x0a,0x7e] +# GFX11-REAL16: v_cos_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xc2,0x0a,0x7e] +# GFX11-FAKE16: v_cos_f16_e32 v5, exec_hi ; encoding: [0x7f,0xc2,0x0a,0x7e] 0x7c,0xc2,0x0a,0x7e -# GFX11: v_cos_f16_e32 v5, null ; encoding: [0x7c,0xc2,0x0a,0x7e] +# GFX11-REAL16: v_cos_f16_e32 v5.l, null ; encoding: [0x7c,0xc2,0x0a,0x7e] +# GFX11-FAKE16: v_cos_f16_e32 v5, null ; encoding: [0x7c,0xc2,0x0a,0x7e] 0xc1,0xc2,0x0a,0x7e -# GFX11: v_cos_f16_e32 v5, -1 ; encoding: [0xc1,0xc2,0x0a,0x7e] +# GFX11-REAL16: v_cos_f16_e32 v5.l, -1 ; encoding: [0xc1,0xc2,0x0a,0x7e] +# GFX11-FAKE16: v_cos_f16_e32 v5, -1 ; encoding: [0xc1,0xc2,0x0a,0x7e] 0xf0,0xc2,0x0a,0x7e -# GFX11: v_cos_f16_e32 v5, 0.5 ; encoding: [0xf0,0xc2,0x0a,0x7e] +# GFX11-REAL16: v_cos_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xc2,0x0a,0x7e] +# GFX11-FAKE16: v_cos_f16_e32 v5, 0.5 ; encoding: [0xf0,0xc2,0x0a,0x7e] 
0xfd,0xc2,0x0a,0x7e -# GFX11: v_cos_f16_e32 v5, src_scc ; encoding: [0xfd,0xc2,0x0a,0x7e] +# GFX11-REAL16: v_cos_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xc2,0x0a,0x7e] +# GFX11-FAKE16: v_cos_f16_e32 v5, src_scc ; encoding: [0xfd,0xc2,0x0a,0x7e] 0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00 -# GFX11: v_cos_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cos_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +0x81,0xc3,0x0a,0x7e +# GFX11-REAL16: v_cos_f16_e32 v5.l, v1.h ; encoding: [0x81,0xc3,0x0a,0x7e] +# GFX11-FAKE16: v_cos_f16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0x81,0xc3,0x0a,0x7e] + +0xff,0xc3,0x0a,0x7e +# GFX11-REAL16: v_cos_f16_e32 v5.l, v127.h ; encoding: [0xff,0xc3,0x0a,0x7e] +# GFX11-FAKE16: v_cos_f16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0xff,0xc3,0x0a,0x7e] + +0xf0,0xc2,0xfe,0x7e +# GFX11-REAL16: v_cos_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xc2,0xfe,0x7e] +# GFX11-FAKE16: v_cos_f16_e32 v127, 0.5 ; encoding: [0xf0,0xc2,0xfe,0x7e] + +0xfd,0xc2,0x0a,0x7f +# GFX11-REAL16: v_cos_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xc2,0x0a,0x7f] + +0xff,0xc2,0xfe,0x7f,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cos_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xc2,0xfe,0x7f,0x0b,0xfe,0x00,0x00] 0x01,0x6d,0x0a,0x7e # GFX11: v_cos_f32_e32 v5, v1 ; encoding: [0x01,0x6d,0x0a,0x7e] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt index ba9e8142942de..f01ce5a31be3c 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt @@ -229,46 +229,72 @@ # GFX11: v_clz_i32_u32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0xfa,0x72,0xfe,0x7f,0xff,0x6f,0x0d,0x30] 0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX11: v_cos_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cos_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX11: v_cos_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cos_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX11: v_cos_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX11: v_cos_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX11: v_cos_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX11-REAL16: 
v_cos_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX11: v_cos_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX11: v_cos_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX11: v_cos_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX11: v_cos_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX11: 
v_cos_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX11: v_cos_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX11: v_cos_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX11: v_cos_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30 -# GFX11: v_cos_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX11-REAL16: v_cos_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX11-FAKE16: v_cos_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] + +0xfa,0xc2,0xfe,0x7e,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cos_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cos_f16_dpp v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x5f,0x01,0x01] + +0xfa,0xc2,0x0a,0x7f,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cos_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc2,0x0a,0x7f,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_mul_i32_i24_e32 v128, 1, v176 ; encoding: [0x81,0x60,0x01,0x13] + +0xfa,0xc2,0xfe,0x7f,0xff,0x6f,0x3d,0x30 +# GFX11-REAL16: v_cos_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7f,0xff,0x6f,0x3d,0x30] +# GFX11-FAKE16: v_lshlrev_b32_e32 v30, v255, v183 ; encoding: [0xff,0x6f,0x3d,0x30] 0xfa,0x6c,0x0a,0x7e,0x01,0x1b,0x00,0xff # GFX11: v_cos_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt index dda9dfcb35b1a..0f102c52a3666 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt @@ -43,10 +43,23 @@ # GFX11: v_clz_i32_u32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x72,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX11: v_cos_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cos_f16_dpp v5, v1 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xea,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00 -# GFX11: v_cos_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_cos_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +0xe9,0xc2,0xfe,0x7e,0x7f,0x77,0x39,0x05 +# GFX11-REAL16: v_cos_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0xfe,0x7e,0x7f,0x77,0x39,0x05] +# GFX11-FAKE16: v_cos_f16_dpp v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0xfe,0x7e,0x7f,0x77,0x39,0x05] + +0xe9,0xc2,0x0a,0x7f,0x81,0x77,0x39,0x05 +# GFX11-REAL16: v_cos_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7f,0x81,0x77,0x39,0x05] +# GFX11-FAKE16: v_dot2acc_f32_f16 v156, v129, v187 ; encoding: [0x81,0x77,0x39,0x05] + +0xea,0xc2,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cos_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc2,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0x6c,0x0a,0x7e,0x01,0x77,0x39,0x05 # GFX11: v_cos_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6c,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt index 0191f37c14e31..6e5239522df41 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt @@ -239,46 +239,72 @@ # GFX11: v_clz_i32_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xb9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_cos_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_cos_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX11: v_cos_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX11: v_cos_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX11: v_cos_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_cos_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX11: v_cos_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_cos_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: 
v_cos_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX11: v_cos_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_cos_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX11: v_cos_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX11: v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 
bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 -# GFX11: v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] 0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 -# GFX11: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] + +0x05,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_cos_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +0x05,0x08,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; 
encoding: [0x05,0x08,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] + +0xff,0xc1,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cos_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xb6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff # GFX11: v_cos_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt index ab3788deeed3d..d7f9e86d3ca00 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt @@ -61,16 +61,32 @@ # GFX11: v_clz_i32_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xb9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cos_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX11: v_cos_f16_e64_dpp v5, v1 mul:2 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX11: v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] 0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX11: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +0x05,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cos_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +0x05,0x08,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x08,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +0xff,0xc1,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cos_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0x05,0x00,0xb6,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cos_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb6,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt index 2e741322eb122..db5ba967d709b 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt @@ -281,49 +281,76 @@ # GFX11: v_clz_i32_u32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xb9,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] 0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00 -# GFX11: v_cos_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00] +# GFX11-REAL16: v_cos_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00 -# GFX11: v_cos_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00] +# GFX11-REAL16: v_cos_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00 -# GFX11: v_cos_f16_e64 v5, s1 ; encoding: 
[0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00] +# GFX11-REAL16: v_cos_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00 -# GFX11: v_cos_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00] +# GFX11-REAL16: v_cos_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00 -# GFX11: v_cos_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00] +# GFX11-REAL16: v_cos_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00 -# GFX11: v_cos_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00] +# GFX11-REAL16: v_cos_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00 -# GFX11: v_cos_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00] +# GFX11-REAL16: v_cos_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00 -# GFX11: v_cos_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00] +# GFX11-REAL16: v_cos_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00 -# GFX11: v_cos_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00] +# GFX11-REAL16: v_cos_f16_e64 v5.l, exec_lo 
; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00 -# GFX11: v_cos_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_cos_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00 -# GFX11: v_cos_f16_e64 v5, null ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00] +# GFX11-REAL16: v_cos_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_e64 v5, null ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00 -# GFX11: v_cos_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00] +# GFX11-REAL16: v_cos_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08 -# GFX11: v_cos_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08] +# GFX11-REAL16: v_cos_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08] +# GFX11-FAKE16: v_cos_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08] 0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10 -# GFX11: v_cos_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10] +# GFX11-REAL16: v_cos_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10] +# GFX11-FAKE16: v_cos_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10] 0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 -# GFX11: v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cos_f16_e64 
v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +0x05,0x48,0xe1,0xd5,0x01,0x01,0x00,0x00 +# GFX11-REAL16: v_cos_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe1,0xd5,0x01,0x01,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x08,0xe1,0xd5,0xff,0x01,0x00,0x00 +# GFX11-REAL16: v_cos_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xe1,0xd5,0xff,0x01,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00] + +0xff,0xc1,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cos_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] 0x05,0x00,0xb6,0xd5,0x01,0x01,0x00,0x00 # GFX11: v_cos_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xb6,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt index 4d6e8ffbd9a27..d37c9229f1666 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt @@ -237,46 +237,68 @@ # GFX12: v_clz_i32_u32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x72,0xfe,0x7f,0xff,0x6f,0x0d,0x30] 0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX12: v_cos_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# 
GFX12-FAKE16: v_cos_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX12: v_cos_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cos_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX12: v_cos_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX12: v_cos_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX12: v_cos_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX12: v_cos_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX12: v_cos_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX12: v_cos_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX12: v_cos_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX12: v_cos_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX12: v_cos_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX12: v_cos_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX12: v_cos_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30 -# GFX12: v_cos_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-REAL16: v_cos_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-FAKE16: v_cos_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] + +0xfa,0xc2,0x0a,0x7f,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cos_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0xfa,0xc2,0x0a,0x7f,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_mul_i32_i24_e32 v128, 1, v176 ; encoding: [0x81,0x60,0x01,0x13] + +0xfa,0xc2,0xfe,0x7f,0xff,0x6f,0x3d,0x30 +# GFX12-REAL16: v_cos_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7f,0xff,0x6f,0x3d,0x30] +# GFX12-FAKE16: v_lshlrev_b32_e32 v30, v255, v183 ; encoding: [0xff,0x6f,0x3d,0x30] 0xfa,0x6c,0x0a,0x7e,0x01,0x1b,0x00,0xff # GFX12: v_cos_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt index fcc1d3f97dcb1..f3dce5e6d5b93 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt @@ -44,10 +44,19 @@ # GFX12: v_clz_i32_u32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x72,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX12: v_cos_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cos_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xea,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00 -# GFX12: v_cos_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cos_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cos_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +0xe9,0xc2,0x0a,0x7f,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cos_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7f,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: 
v_add_f64_e32 v[156:157], v[129:130], v[187:188] ; encoding: [0x81,0x77,0x39,0x05] + +0xea,0xc2,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cos_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc2,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0x6c,0x0a,0x7e,0x01,0x77,0x39,0x05 # GFX12: v_cos_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6c,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt index dad6b502e0bd0..e4b619d87e400 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt @@ -279,49 +279,76 @@ # GFX12: v_clz_i32_u32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xb9,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] 0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00 -# GFX12: v_cos_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00] +# GFX12-REAL16: v_cos_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_cos_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00 -# GFX12: v_cos_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00] +# GFX12-REAL16: v_cos_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_cos_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00 -# GFX12: v_cos_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00] +# GFX12-REAL16: v_cos_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00] +# GFX12-FAKE16: v_cos_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00 -# GFX12: v_cos_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00] +# GFX12-REAL16: v_cos_f16_e64 v5.l, s105 ; encoding: 
[0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00] +# GFX12-FAKE16: v_cos_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00 -# GFX12: v_cos_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-REAL16: v_cos_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-FAKE16: v_cos_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00 -# GFX12: v_cos_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-REAL16: v_cos_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-FAKE16: v_cos_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00 -# GFX12: v_cos_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-REAL16: v_cos_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-FAKE16: v_cos_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00 -# GFX12: v_cos_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-REAL16: v_cos_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-FAKE16: v_cos_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00 -# GFX12: v_cos_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-REAL16: v_cos_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-FAKE16: v_cos_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00 -# GFX12: v_cos_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cos_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: 
v_cos_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00 -# GFX12: v_cos_f16_e64 v5, null ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-REAL16: v_cos_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-FAKE16: v_cos_f16_e64 v5, null ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00 -# GFX12: v_cos_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-REAL16: v_cos_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-FAKE16: v_cos_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08 -# GFX12: v_cos_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-REAL16: v_cos_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-FAKE16: v_cos_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08] 0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10 -# GFX12: v_cos_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-REAL16: v_cos_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-FAKE16: v_cos_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10] 0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 -# GFX12: v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cos_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +0x05,0x48,0xe1,0xd5,0x01,0x01,0x00,0x00 +# GFX12-REAL16: v_cos_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe1,0xd5,0x01,0x01,0x00,0x00] +# 
GFX12-FAKE16: v_cos_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x08,0xe1,0xd5,0xff,0x01,0x00,0x00 +# GFX12-REAL16: v_cos_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xe1,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_cos_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00] + +0xff,0xc1,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cos_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] 0x05,0x00,0xb6,0xd5,0x01,0x01,0x00,0x00 # GFX12: v_cos_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xb6,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt index ccf5f4b21b73c..b77cf5ab6efc1 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt @@ -227,46 +227,72 @@ # GFX12: v_clz_i32_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xb9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cos_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cos_f16_e64_dpp v5, v1 
quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX12: v_cos_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cos_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cos_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cos_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cos_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cos_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cos_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: 
v_cos_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cos_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cos_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX12: v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 -# GFX12: v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 
row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] 0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 -# GFX12: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] + +0x05,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cos_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +0x05,0x08,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] + +0xff,0xc1,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cos_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: 
v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xb6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cos_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt index 8018f80798573..50339f51c5629 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt @@ -49,16 +49,32 @@ # GFX12: v_clz_i32_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xb9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cos_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX12: v_cos_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX12: v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] 0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX12: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +0x05,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cos_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +0x05,0x08,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +0xff,0xc1,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cos_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0x05,0x00,0xb6,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cos_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb6,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] From 9f5cefebb482331796ceaebbfcebcd5aee1eb339 Mon Sep 17 00:00:00 2001 From: Krzysztof Drewniak Date: Fri, 3 Jan 2025 15:12:39 -0600 Subject: [PATCH 401/567] [mlir][Affine] Generalize the linearize(delinearize()) simplifications (#117637) The existing canonicalization patterns would only cancel out cases where the entire result list of an affine.delineraize_index was passed to an affine.lineraize_index and the basis elements matched exactly (except possibly for the outer bounds). This was correct, but limited, and left open many cases where a delinearize_index would take a series of divisions and modulos only for a subsequent linearize_index to use additions and multiplications to undo all that work. This sort of simplification is reasably easy to observe at the level of splititng and merging indexes, but difficult to perform once the underlying arithmetic operations have been created. Therefore, this commit generalizes the existing simplification logic. Now, any run of two or more delinearize_index results that appears within the argument list of a linearize_index operation with the same basis (or where they're both at the outermost position and so can be unbonded, or when `linearize_index disjoint` implies a bound not present on the `delinearize_index`) will be reduced to one signle delinearize_index output, whose basis element (that is, size or length) is equal to the product of the sizes that were simplified away. 
That is, we can now simplify %0:2 = affine.delinearize_index %n into (8, 8) : inde, index %1 = affine.linearize_index [%x, %0#0, %0#1, %y] by (3, 8, 8, 5) : index to the simpler %1 = affine.linearize_index [%x, %n, %y] by (3, 64, 5) : index This new pattern also works with dynamically-sized basis values. While I'm here, I fixed a bunch of typos in existing tests, and added a new getPaddedBasis() method to make processing potentially-underspecified basis elements simpler in some cases. --- .../mlir/Dialect/Affine/IR/AffineOps.td | 27 +- mlir/lib/Dialect/Affine/IR/AffineOps.cpp | 276 ++++++++++++++++-- mlir/test/Dialect/Affine/canonicalize.mlir | 245 +++++++++++++++- 3 files changed, 506 insertions(+), 42 deletions(-) diff --git a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td index f5ca24389065e..e2eab1fb2178e 100644 --- a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td +++ b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td @@ -1083,6 +1083,9 @@ def AffineDelinearizeIndexOp : Affine_Op<"delinearize_index", [Pure]> { %indices_2 = affine.apply #map2()[%linear_index] ``` + In other words, `%0:3 = affine.delinearize_index %x into (B, C)` produces + `%0 = {%x / (B * C), (%x mod (B * C)) / C, %x mod C}`. + The basis may either contain `N` or `N-1` elements, where `N` is the number of results. 
If there are N basis elements, the first one will not be used during computations, but may be used during analysis and canonicalization to eliminate terms from @@ -1098,7 +1101,12 @@ def AffineDelinearizeIndexOp : Affine_Op<"delinearize_index", [Pure]> { %0:3 = affine.delinearize_index %linear_index into (244, 244) : index, index ``` - Note that, due to the constraints of affine maps, all the basis elements must + Note that, for symmetry with `getPaddedBasis()`, if `hasOuterBound` is `true` + when one of the `OpFoldResult` builders is called but the first element of the + basis is `nullptr`, that first element is ignored and the builder proceeds as if + there was no outer bound. + + Due to the constraints of affine maps, all the basis elements must be strictly positive. A dynamic basis element being 0 or negative causes undefined behavior. }]; @@ -1136,6 +1144,11 @@ def AffineDelinearizeIndexOp : Affine_Op<"delinearize_index", [Pure]> { /// Return a vector that contains the basis of the operation, removing /// the outer bound if one is present. SmallVector getEffectiveBasis(); + + /// Return the vector with one basis element per result of the operation. If + /// there is no outer bound specified, the leading entry of this result will be + /// nullptr. + SmallVector getPaddedBasis(); }]; let hasVerifier = 1; @@ -1160,6 +1173,9 @@ def AffineLinearizeIndexOp : Affine_Op<"linearize_index", sum(i = 0 to N-1) %idx_i * product(j = i + 1 to N-1) B_j ``` + In other words, `%0 = affine.linearize_index [%z, %y, %x] by (Z, Y, X)` + gives `%0 = %x + %y * X + %z * X * Y`, or `%0 = %x + X * (%y + Y * (%z))`. + The basis may either have `N` or `N-1` elements, where `N` is the number of inputs to linearize_index. 
If `N` inputs are provided, the first one is not used in computation, but may be used during analysis or canonicalization as a bound @@ -1168,6 +1184,10 @@ def AffineLinearizeIndexOp : Affine_Op<"linearize_index", If all `N` basis elements are provided, the linearize_index operation is said to "have an outer bound". + As a convenience, and for symmetry with `getPaddedBasis()`, ifg the first + element of a set of `OpFoldResult`s passed to the builders of this operation is + `nullptr`, that element is ignored. + If the `disjoint` property is present, this is an optimization hint that, for all `i`, `0 <= %idx_i < B_i` - that is, no index affects any other index, except that `%idx_0` may be negative to make the index as a whole negative. @@ -1224,6 +1244,11 @@ def AffineLinearizeIndexOp : Affine_Op<"linearize_index", /// Return a vector that contains the basis of the operation, removing /// the outer bound if one is present. SmallVector getEffectiveBasis(); + + /// Return the vector with one basis element per index operand of the operation. + /// If there is no outer bound specified, the leading entry of this basis will be + /// nullptr. 
+ SmallVector getPaddedBasis(); }]; let hasVerifier = 1; diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp index dceebbfec586c..b45829bcf6d2c 100644 --- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp +++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp @@ -4520,6 +4520,10 @@ void AffineDelinearizeIndexOp::build(OpBuilder &odsBuilder, OperationState &odsState, Value linearIndex, ValueRange basis, bool hasOuterBound) { + if (hasOuterBound && !basis.empty() && basis.front() == nullptr) { + hasOuterBound = false; + basis = basis.drop_front(); + } SmallVector dynamicBasis; SmallVector staticBasis; dispatchIndexOpFoldResults(getAsOpFoldResult(basis), dynamicBasis, @@ -4533,6 +4537,10 @@ void AffineDelinearizeIndexOp::build(OpBuilder &odsBuilder, Value linearIndex, ArrayRef basis, bool hasOuterBound) { + if (hasOuterBound && !basis.empty() && basis.front() == OpFoldResult()) { + hasOuterBound = false; + basis = basis.drop_front(); + } SmallVector dynamicBasis; SmallVector staticBasis; dispatchIndexOpFoldResults(basis, dynamicBasis, staticBasis); @@ -4654,6 +4662,13 @@ SmallVector AffineDelinearizeIndexOp::getEffectiveBasis() { return getMixedValues(getStaticBasis(), getDynamicBasis(), builder); } +SmallVector AffineDelinearizeIndexOp::getPaddedBasis() { + SmallVector ret = getMixedBasis(); + if (!hasOuterBound()) + ret.insert(ret.begin(), OpFoldResult()); + return ret; +} + namespace { // Drops delinearization indices that correspond to unit-extent basis @@ -4672,25 +4687,27 @@ struct DropUnitExtentBasis return zero.value(); }; - bool hasOuterBound = delinearizeOp.hasOuterBound(); // Replace all indices corresponding to unit-extent basis with 0. // Remaining basis can be used to get a new `affine.delinearize_index` op. 
SmallVector newBasis; - for (auto [index, basis] : llvm::enumerate(delinearizeOp.getMixedBasis())) { - std::optional basisVal = getConstantIntValue(basis); + for (auto [index, basis] : + llvm::enumerate(delinearizeOp.getPaddedBasis())) { + std::optional basisVal = + basis ? getConstantIntValue(basis) : std::nullopt; if (basisVal && *basisVal == 1) - replacements[index + (hasOuterBound ? 0 : 1)] = getZero(); + replacements[index] = getZero(); else newBasis.push_back(basis); } - if (newBasis.size() == delinearizeOp.getStaticBasis().size()) + if (newBasis.size() == delinearizeOp.getNumResults()) return rewriter.notifyMatchFailure(delinearizeOp, "no unit basis elements"); - if (!newBasis.empty() || !hasOuterBound) { + if (!newBasis.empty()) { + // Will drop the leading nullptr from `basis` if there was no outer bound. auto newDelinearizeOp = rewriter.create( - loc, delinearizeOp.getLinearIndex(), newBasis, hasOuterBound); + loc, delinearizeOp.getLinearIndex(), newBasis); int newIndex = 0; // Map back the new delinearized indices to the values they replace. 
for (auto &replacement : replacements) { @@ -4871,6 +4888,8 @@ void AffineLinearizeIndexOp::build(OpBuilder &odsBuilder, OperationState &odsState, ValueRange multiIndex, ValueRange basis, bool disjoint) { + if (!basis.empty() && basis.front() == Value()) + basis = basis.drop_front(); SmallVector dynamicBasis; SmallVector staticBasis; dispatchIndexOpFoldResults(getAsOpFoldResult(basis), dynamicBasis, @@ -4883,6 +4902,8 @@ void AffineLinearizeIndexOp::build(OpBuilder &odsBuilder, ValueRange multiIndex, ArrayRef basis, bool disjoint) { + if (!basis.empty() && basis.front() == OpFoldResult()) + basis = basis.drop_front(); SmallVector dynamicBasis; SmallVector staticBasis; dispatchIndexOpFoldResults(basis, dynamicBasis, staticBasis); @@ -4965,7 +4986,14 @@ SmallVector AffineLinearizeIndexOp::getEffectiveBasis() { builder); } - return ::mlir::getMixedValues(getStaticBasis(), getDynamicBasis(), builder); + return getMixedValues(getStaticBasis(), getDynamicBasis(), builder); +} + +SmallVector AffineLinearizeIndexOp::getPaddedBasis() { + SmallVector ret = getMixedBasis(); + if (!hasOuterBound()) + ret.insert(ret.begin(), OpFoldResult()); + return ret; } namespace { @@ -5027,38 +5055,228 @@ struct DropLinearizeUnitComponentsIfDisjointOrZero final } }; -/// Cancel out linearize_index(delinearize_index(x, B), B). +/// Return the product of `terms`, creating an `affine.apply` if any of them are +/// non-constant values. If any of `terms` is `nullptr`, return `nullptr`. 
+static OpFoldResult computeProduct(Location loc, OpBuilder &builder, + ArrayRef terms) { + int64_t nDynamic = 0; + SmallVector dynamicPart; + AffineExpr result = builder.getAffineConstantExpr(1); + for (OpFoldResult term : terms) { + if (!term) + return term; + std::optional maybeConst = getConstantIntValue(term); + if (maybeConst) { + result = result * builder.getAffineConstantExpr(*maybeConst); + } else { + dynamicPart.push_back(term.get()); + result = result * builder.getAffineSymbolExpr(nDynamic++); + } + } + if (auto constant = dyn_cast(result)) + return getAsIndexOpFoldResult(builder.getContext(), constant.getValue()); + return builder.create(loc, result, dynamicPart).getResult(); +} + +/// If conseceutive outputs of a delinearize_index are linearized with the same +/// bounds, canonicalize away the redundant arithmetic. +/// +/// That is, if we have +/// ``` +/// %s:N = affine.delinearize_index %x into (...a, B1, B2, ... BK, ...b) +/// %t = affine.linearize_index [...c, %s#I, %s#(I + 1), ... %s#(I+K-1), ...d] +/// by (...e, B1, B2, ..., BK, ...f) +/// ``` /// -/// That is, rewrite +/// We can rewrite this to /// ``` -/// %0:N = affine.delinearize_index %x by (%b1, %b2, ... %bN) -/// %y = affine.linearize_index [%0#0, %0#1, ... %0#(N-1)] by (%b1, %b2, ... -/// %bN) +/// B = B1 * B2 ... BK +/// %sMerged:(N-K+1) affine.delinearize_index %x into (...a, B, ...b) +/// %t = affine.linearize_index [...c, %s#I, ...d] by (...e, B, ...f) /// ``` -/// to replacing `%y` with `%x`. -struct CancelLinearizeOfDelinearizeExact final +/// where we replace all results of %s unaffected by the change with results +/// from %sMerged. 
+/// +/// As a special case, if all results of the delinearize are merged in this way +/// we can replace those usages with %x, thus cancelling the delinearization +/// entirely, as in +/// ``` +/// %s:3 = affine.delinearize_index %x into (2, 4, 8) +/// %t = affine.linearize_index [%s#0, %s#1, %s#2, %c0] by (2, 4, 8, 16) +/// ``` +/// becoming `%t = affine.linearize_index [%x, %c0] by (64, 16)` +struct CancelLinearizeOfDelinearizePortion final : OpRewritePattern { using OpRewritePattern::OpRewritePattern; +private: + // Struct representing a case where the cancellation pattern + // applies. A `Match` means that `length` inputs to the linearize operation + // starting at `linStart` can be cancelled with `length` outputs of + // `delinearize`, starting from `delinStart`. + struct Match { + AffineDelinearizeIndexOp delinearize; + unsigned linStart = 0; + unsigned delinStart = 0; + unsigned length = 0; + }; + +public: LogicalResult matchAndRewrite(affine::AffineLinearizeIndexOp linearizeOp, PatternRewriter &rewriter) const override { - auto delinearizeOp = linearizeOp.getMultiIndex() - .front() - .getDefiningOp(); - if (!delinearizeOp) - return rewriter.notifyMatchFailure( - linearizeOp, "last entry doesn't come from a delinearize"); + SmallVector matches; + + const SmallVector linBasis = linearizeOp.getPaddedBasis(); + ArrayRef linBasisRef = linBasis; + + ValueRange multiIndex = linearizeOp.getMultiIndex(); + unsigned numLinArgs = multiIndex.size(); + unsigned linArgIdx = 0; + // We only want to replace one run from the same delinearize op per + // pattern invocation lest we run into invalidation issues. 
+ llvm::SmallPtrSet alreadyMatchedDelinearize; + while (linArgIdx < numLinArgs) { + auto asResult = dyn_cast(multiIndex[linArgIdx]); + if (!asResult) { + linArgIdx++; + continue; + } - if (linearizeOp.getEffectiveBasis() != delinearizeOp.getEffectiveBasis()) - return rewriter.notifyMatchFailure( - linearizeOp, "basis of linearize and delinearize don't match exactly " - "(excluding outer bounds)"); + auto delinearizeOp = + dyn_cast(asResult.getOwner()); + if (!delinearizeOp) { + linArgIdx++; + continue; + } + + /// Result 0 of the delinearize and argument 0 of the linearize can + /// leave their maximum value unspecified. However, even if this happens + /// we can still sometimes start the match process. Specifically, if + /// - The argument we're matching is result 0 and argument 0 (so the + /// bounds don't matter). For example, + /// + /// %0:2 = affine.delinearize_index %x into (8) : index, index + /// %1 = affine.linearize_index [%s#0, %s#1, ...] (8, ...) + /// allows cancellation + /// - The delinearization doesn't specify a bound, but the linearization + /// is `disjoint`, which asserts that the bound on the linearization is + /// correct. 
+ unsigned delinArgIdx = asResult.getResultNumber(); + SmallVector delinBasis = delinearizeOp.getPaddedBasis(); + OpFoldResult firstDelinBound = delinBasis[delinArgIdx]; + OpFoldResult firstLinBound = linBasis[linArgIdx]; + bool boundsMatch = firstDelinBound == firstLinBound; + bool bothAtFront = linArgIdx == 0 && delinArgIdx == 0; + bool knownByDisjoint = + linearizeOp.getDisjoint() && delinArgIdx == 0 && !firstDelinBound; + if (!boundsMatch && !bothAtFront && !knownByDisjoint) { + linArgIdx++; + continue; + } + + unsigned j = 1; + unsigned numDelinOuts = delinearizeOp.getNumResults(); + for (; j + linArgIdx < numLinArgs && j + delinArgIdx < numDelinOuts; + ++j) { + if (multiIndex[linArgIdx + j] != + delinearizeOp.getResult(delinArgIdx + j)) + break; + if (linBasis[linArgIdx + j] != delinBasis[delinArgIdx + j]) + break; + } + // If there're multiple matches against the same delinearize_index, + // only rewrite the first one we find to prevent invalidations. The next + // ones will be taken care of by subsequent pattern invocations. + if (j <= 1 || !alreadyMatchedDelinearize.insert(delinearizeOp).second) { + linArgIdx++; + continue; + } + matches.push_back(Match{delinearizeOp, linArgIdx, delinArgIdx, j}); + linArgIdx += j; + } - if (delinearizeOp.getResults() != linearizeOp.getMultiIndex()) + if (matches.empty()) return rewriter.notifyMatchFailure( - linearizeOp, "not all indices come from delinearize"); + linearizeOp, "no run of delinearize outputs to deal with"); + + // Record all the delinearize replacements so we can do them after creating + // the new linearization operation, since the new operation might use + // outputs of something we're replacing. 
+ SmallVector> delinearizeReplacements; + + SmallVector newIndex; + newIndex.reserve(numLinArgs); + SmallVector newBasis; + newBasis.reserve(numLinArgs); + unsigned prevMatchEnd = 0; + for (Match m : matches) { + unsigned gap = m.linStart - prevMatchEnd; + llvm::append_range(newIndex, multiIndex.slice(prevMatchEnd, gap)); + llvm::append_range(newBasis, linBasisRef.slice(prevMatchEnd, gap)); + // Update here so we don't forget this during early continues + prevMatchEnd = m.linStart + m.length; + + PatternRewriter::InsertionGuard g(rewriter); + rewriter.setInsertionPoint(m.delinearize); + + ArrayRef basisToMerge = + linBasisRef.slice(m.linStart, m.length); + // We use the slice from the linearize's basis above because of the + // "bounds inferred from `disjoint`" case above. + OpFoldResult newSize = + computeProduct(linearizeOp.getLoc(), rewriter, basisToMerge); + + // Trivial case where we can just skip past the delinearize all together + if (m.length == m.delinearize.getNumResults()) { + newIndex.push_back(m.delinearize.getLinearIndex()); + newBasis.push_back(newSize); + // Pad out set of replacements so we don't do anything with this one. + delinearizeReplacements.push_back(SmallVector()); + continue; + } + + SmallVector newDelinResults; + SmallVector newDelinBasis = m.delinearize.getPaddedBasis(); + newDelinBasis.erase(newDelinBasis.begin() + m.delinStart, + newDelinBasis.begin() + m.delinStart + m.length); + newDelinBasis.insert(newDelinBasis.begin() + m.delinStart, newSize); + auto newDelinearize = rewriter.create( + m.delinearize.getLoc(), m.delinearize.getLinearIndex(), + newDelinBasis); + + // Since there may be other uses of the indices we just merged together, + // create a residual affine.delinearize_index that delinearizes the + // merged output into its component parts. 
+ Value combinedElem = newDelinearize.getResult(m.delinStart); + auto residualDelinearize = rewriter.create( + m.delinearize.getLoc(), combinedElem, basisToMerge); + + // Swap all the uses of the unaffected delinearize outputs to the new + // delinearization so that the old code can be removed if this + // linearize_index is the only user of the merged results. + llvm::append_range(newDelinResults, + newDelinearize.getResults().take_front(m.delinStart)); + llvm::append_range(newDelinResults, residualDelinearize.getResults()); + llvm::append_range( + newDelinResults, + newDelinearize.getResults().drop_front(m.delinStart + 1)); + + delinearizeReplacements.push_back(newDelinResults); + newIndex.push_back(combinedElem); + newBasis.push_back(newSize); + } + llvm::append_range(newIndex, multiIndex.drop_front(prevMatchEnd)); + llvm::append_range(newBasis, linBasisRef.drop_front(prevMatchEnd)); + rewriter.replaceOpWithNewOp( + linearizeOp, newIndex, newBasis, linearizeOp.getDisjoint()); + + for (auto [m, newResults] : + llvm::zip_equal(matches, delinearizeReplacements)) { + if (newResults.empty()) + continue; + rewriter.replaceOp(m.delinearize, newResults); + } - rewriter.replaceOp(linearizeOp, delinearizeOp.getLinearIndex()); return success(); } }; @@ -5096,7 +5314,7 @@ struct DropLinearizeLeadingZero final void affine::AffineLinearizeIndexOp::getCanonicalizationPatterns( RewritePatternSet &patterns, MLIRContext *context) { - patterns.add(context); } diff --git a/mlir/test/Dialect/Affine/canonicalize.mlir b/mlir/test/Dialect/Affine/canonicalize.mlir index 717004eb50c0f..a9ac13ad71624 100644 --- a/mlir/test/Dialect/Affine/canonicalize.mlir +++ b/mlir/test/Dialect/Affine/canonicalize.mlir @@ -1917,12 +1917,12 @@ func.func @linearize_one_element_basis(%arg0: index, %arg1: index) -> index { // ----- -// CHECK-LABEL: func @cancel_linearize_denearize_exact( +// CHECK-LABEL: func @cancel_linearize_delinearize_exact( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, // CHECK-SAME: 
%[[ARG1:[a-zA-Z0-9]+]]: index, // CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: index) // CHECK: return %[[ARG0]] -func.func @cancel_linearize_denearize_exact(%arg0: index, %arg1: index, %arg2: index) -> index { +func.func @cancel_linearize_delinearize_exact(%arg0: index, %arg1: index, %arg2: index) -> index { %0:3 = affine.delinearize_index %arg0 into (%arg1, 4, %arg2) : index, index, index %1 = affine.linearize_index [%0#0, %0#1, %0#2] by (%arg1, 4, %arg2) : index return %1 : index @@ -1930,12 +1930,12 @@ func.func @cancel_linearize_denearize_exact(%arg0: index, %arg1: index, %arg2: i // ----- -// CHECK-LABEL: func @cancel_linearize_denearize_linearize_extra_bound( +// CHECK-LABEL: func @cancel_linearize_delinearize_linearize_extra_bound( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, // CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index, // CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: index) // CHECK: return %[[ARG0]] -func.func @cancel_linearize_denearize_linearize_extra_bound(%arg0: index, %arg1: index, %arg2: index) -> index { +func.func @cancel_linearize_delinearize_linearize_extra_bound(%arg0: index, %arg1: index, %arg2: index) -> index { %0:3 = affine.delinearize_index %arg0 into (4, %arg2) : index, index, index %1 = affine.linearize_index [%0#0, %0#1, %0#2] by (%arg1, 4, %arg2) : index return %1 : index @@ -1943,12 +1943,12 @@ func.func @cancel_linearize_denearize_linearize_extra_bound(%arg0: index, %arg1: // ----- -// CHECK-LABEL: func @cancel_linearize_denearize_delinearize_extra_bound( +// CHECK-LABEL: func @cancel_linearize_delinearize_delinearize_extra_bound( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, // CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index, // CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: index) // CHECK: return %[[ARG0]] -func.func @cancel_linearize_denearize_delinearize_extra_bound(%arg0: index, %arg1: index, %arg2: index) -> index { +func.func @cancel_linearize_delinearize_delinearize_extra_bound(%arg0: index, %arg1: index, %arg2: index) -> index { %0:3 = 
affine.delinearize_index %arg0 into (%arg1, 4, %arg2) : index, index, index %1 = affine.linearize_index [%0#0, %0#1, %0#2] by (4, %arg2) : index return %1 : index @@ -1956,31 +1956,252 @@ func.func @cancel_linearize_denearize_delinearize_extra_bound(%arg0: index, %arg // ----- +// CHECK-LABEL: func @cancel_linearize_delinearize_head( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index) +// CHECK: %[[DELIN:.+]]:2 = affine.delinearize_index %[[ARG0]] into (12, 8) +// CHECK: %[[LIN:.+]] = affine.linearize_index [%[[DELIN]]#0, %[[ARG1]]] by (12, 16) +// CHECK: return %[[LIN]] +func.func @cancel_linearize_delinearize_head(%arg0: index, %arg1: index) -> index { + %0:3 = affine.delinearize_index %arg0 into (3, 4, 8) : index, index, index + %1 = affine.linearize_index [%0#0, %0#1, %arg1] by (3, 4, 16) : index + return %1 : index +} + +// ----- + +// CHECK-LABEL: func @cancel_linearize_delinearize_head_delinearize_unbounded( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index) +// CHECK: %[[DELIN:.+]]:2 = affine.delinearize_index %[[ARG0]] into (12, 8) +// CHECK: %[[LIN:.+]] = affine.linearize_index [%[[DELIN]]#0, %[[ARG1]]] by (12, 16) +// CHECK: return %[[LIN]] +func.func @cancel_linearize_delinearize_head_delinearize_unbounded(%arg0: index, %arg1: index) -> index { + %0:3 = affine.delinearize_index %arg0 into (4, 8) : index, index, index + %1 = affine.linearize_index [%0#0, %0#1, %arg1] by (3, 4, 16) : index + return %1 : index +} + +// ----- + +// CHECK-LABEL: func @cancel_linearize_delinearize_head_linearize_unbounded( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index) +// CHECK: %[[DELIN:.+]]:2 = affine.delinearize_index %[[ARG0]] into (8) +// CHECK: %[[LIN:.+]] = affine.linearize_index [%[[DELIN]]#0, %[[ARG1]]] by (16) +// CHECK: return %[[LIN]] +func.func @cancel_linearize_delinearize_head_linearize_unbounded(%arg0: index, %arg1: index) -> 
index { + %0:3 = affine.delinearize_index %arg0 into (3, 4, 8) : index, index, index + %1 = affine.linearize_index [%0#0, %0#1, %arg1] by (4, 16) : index + return %1 : index +} + +// ----- + +// CHECK-LABEL: func @cancel_linearize_delinearize_head_both_unbounded( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index) +// CHECK: %[[DELIN:.+]]:2 = affine.delinearize_index %[[ARG0]] into (8) +// CHECK: %[[LIN:.+]] = affine.linearize_index [%[[DELIN]]#0, %[[ARG1]]] by (16) +// CHECK: return %[[LIN]] +func.func @cancel_linearize_delinearize_head_both_unbounded(%arg0: index, %arg1: index) -> index { + %0:3 = affine.delinearize_index %arg0 into (4, 8) : index, index, index + %1 = affine.linearize_index [%0#0, %0#1, %arg1] by (4, 16) : index + return %1 : index +} + +// ----- + +// CHECK-LABEL: func @cancel_linearize_delinearize_tail( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index) +// CHECK: %[[DELIN:.+]]:2 = affine.delinearize_index %[[ARG0]] into (3, 32) +// CHECK: %[[LIN:.+]] = affine.linearize_index [%[[ARG1]], %[[DELIN]]#1] by (5, 32) +// CHECK: return %[[LIN]] +func.func @cancel_linearize_delinearize_tail(%arg0: index, %arg1: index) -> index { + %0:3 = affine.delinearize_index %arg0 into (3, 4, 8) : index, index, index + %1 = affine.linearize_index [%arg1, %0#1, %0#2] by (5, 4, 8) : index + return %1 : index +} + +// ----- + +// CHECK-LABEL: func @cancel_linearize_delinearize_middle_exact( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG2:[a-zA-z0-9]+]]: index) +// CHECK: %[[LIN:.+]] = affine.linearize_index [%[[ARG1]], %[[ARG0]], %[[ARG2]]] by (9, 30, 7) +// CHECK: return %[[LIN]] +func.func @cancel_linearize_delinearize_middle_exact(%arg0: index, %arg1: index, %arg2: index) -> index { + %0:3 = affine.delinearize_index %arg0 into (2, 3, 5) : index, index, index + %1 = affine.linearize_index [%arg1, %0#0, %0#1, 
%0#2, %arg2] by (9, 2, 3, 5, 7) : index + return %1 : index +} + +// ----- + +// CHECK: #[[$MAP:.+]] = affine_map<()[s0, s1] -> ((s0 * s1) * 16)> + +// CHECK-LABEL: func @cancel_linearize_delinearize_middle_exact_dynamic_basis( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG2:[a-zA-z0-9]+]]: index) +// CHECK: %[[C1:.+]] = arith.constant 1 : index +// CHECK: %[[SIZEPROD:.+]] = affine.apply #[[$MAP]]()[%[[ARG1]], %[[ARG2]]] +// CHECK: %[[LIN:.+]] = affine.linearize_index [%[[C1]], %[[ARG0]], %[[C1]]] by (3, %[[SIZEPROD]], 4) +// CHECK: return %[[LIN]] +func.func @cancel_linearize_delinearize_middle_exact_dynamic_basis(%arg0: index, %arg1: index, %arg2: index) -> index { + %c1 = arith.constant 1 : index + %0:4 = affine.delinearize_index %arg0 into (2, %arg1, %arg2, 8) : index, index, index, index + %1 = affine.linearize_index [%c1, %0#0, %0#1, %0#2, %0#3, %c1] by (3, 2, %arg1, %arg2, 8, 4) : index + return %1 : index +} + +// ----- + +// CHECK-LABEL: func @cancel_linearize_delinearize_middle_exact_delinearize_unbounded_disjoint( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG2:[a-zA-z0-9]+]]: index) +// CHECK: %[[LIN:.+]] = affine.linearize_index disjoint [%[[ARG1]], %[[ARG0]], %[[ARG2]]] by (9, 30, 7) +// CHECK: return %[[LIN]] +func.func @cancel_linearize_delinearize_middle_exact_delinearize_unbounded_disjoint(%arg0: index, %arg1: index, %arg2: index) -> index { + %0:3 = affine.delinearize_index %arg0 into (3, 5) : index, index, index + %1 = affine.linearize_index disjoint [%arg1, %0#0, %0#1, %0#2, %arg2] by (9, 2, 3, 5, 7) : index + return %1 : index +} + +// ----- + +// Unlike in the test above, the linearize indices aren't asserted to be disjoint, so +// we can't know if the `2` from the basis is a correct bound. 
+// CHECK-LABEL: func @dont_cancel_linearize_delinearize_middle_exact_delinearize_unbounded( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG2:[a-zA-z0-9]+]]: index) +// CHECK: %[[DELIN:.+]]:2 = affine.delinearize_index %[[ARG0]] into (3) +// CHECK: %[[LIN:.+]] = affine.linearize_index [%[[ARG1]], %[[DELIN]]#0, %[[DELIN]]#1, %[[ARG2]]] by (9, 2, 3, 7) +// CHECK: return %[[LIN]] + +func.func @dont_cancel_linearize_delinearize_middle_exact_delinearize_unbounded(%arg0: index, %arg1: index, %arg2: index) -> index { + %0:2 = affine.delinearize_index %arg0 into (3) : index, index + %1 = affine.linearize_index [%arg1, %0#0, %0#1, %arg2] by (9, 2, 3, 7) : index + return %1 : index +} + +// ----- + +// The presence of a `disjoint` here tells us that the "unbounded" term on the +// delinearization can't have been above 2. +// CHECK-LABEL: func @cancel_linearize_delinearize_middle_delinearize_unbounded_disjoint_implied_bound( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG2:[a-zA-z0-9]+]]: index) +// CHECK: %[[DELIN:.+]]:2 = affine.delinearize_index %[[ARG0]] into (6, 5) +// CHECK: %[[LIN:.+]] = affine.linearize_index disjoint [%[[ARG1]], %[[DELIN]]#0, %[[ARG2]]] by (9, 6, 7) +// CHECK: return %[[LIN]] +func.func @cancel_linearize_delinearize_middle_delinearize_unbounded_disjoint_implied_bound(%arg0: index, %arg1: index, %arg2: index) -> index { + %0:3 = affine.delinearize_index %arg0 into (3, 5) : index, index, index + %1 = affine.linearize_index disjoint [%arg1, %0#0, %0#1, %arg2] by (9, 2, 3, 7) : index + return %1 : index +} + +// ----- + +// CHECK-LABEL: func @cancel_linearize_delinearize_multiple_matches( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index) +// CHECK: %[[C0:.+]] = arith.constant 0 +// CHECK: %[[DELIN:.+]]:4 = affine.delinearize_index %[[ARG0]] into (4, 16, 4, 64) +// CHECK: 
%[[LIN:.+]] = affine.linearize_index [%[[ARG1]], %[[DELIN]]#1, %[[C0]], %[[DELIN]]#3] by (4, 16, 4, 64) +// CHECK: return %[[LIN]] +func.func @cancel_linearize_delinearize_multiple_matches(%arg0: index, %arg1: index) -> index { + %c0 = arith.constant 0 : index + %0:7 = affine.delinearize_index %arg0 into (4, 4, 4, 4, 4, 4, 4) : index, index, index, index, index, index, index + %1 = affine.linearize_index [%arg1, %0#1, %0#2, %c0, %0#4, %0#5, %0#6] by (4, 4, 4, 4, 4, 4, 4) : index + return %1 : index +} + +// ----- + +// CHECK-LABEL: func @cancel_linearize_delinearize_multiple_delinearizes( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index) +// CHECK: %[[LIN:.+]] = affine.linearize_index [%[[ARG0]], %[[ARG1]]] by (32, 32) +// CHECK: return %[[LIN]] +func.func @cancel_linearize_delinearize_multiple_delinearizes(%arg0: index, %arg1: index) -> index { + %0:2 = affine.delinearize_index %arg0 into (4, 8) : index, index + %1:2 = affine.delinearize_index %arg1 into (2, 16) : index, index + %2 = affine.linearize_index [%0#0, %0#1, %1#0, %1#1] by (4, 8, 2, 16) : index + return %2 : index +} + +// ----- + // Don't cancel because the values from the delinearize aren't used in order -// CHECK-LABEL: func @no_cancel_linearize_denearize_permuted( +// CHECK-LABEL: func @no_cancel_linearize_delinearize_permuted( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, // CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index, // CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: index) // CHECK: %[[DELIN:.+]]:3 = affine.delinearize_index %[[ARG0]] into (%[[ARG1]], 4, %[[ARG2]]) -// CHECK: %[[LIN:.+]] = affine.linearize_index [%[[DELIN]]#0, %[[DELIN]]#2, %[[DELIN]]#1] by (%[[ARG1]], 4, %[[ARG2]]) +// CHECK: %[[LIN:.+]] = affine.linearize_index [%[[DELIN]]#0, %[[DELIN]]#2, %[[DELIN]]#1] by (%[[ARG1]], %[[ARG2]], 4) // CHECK: return %[[LIN]] -func.func @no_cancel_linearize_denearize_permuted(%arg0: index, %arg1: index, %arg2: index) -> index { +func.func 
@no_cancel_linearize_delinearize_permuted(%arg0: index, %arg1: index, %arg2: index) -> index { %0:3 = affine.delinearize_index %arg0 into (%arg1, 4, %arg2) : index, index, index - %1 = affine.linearize_index [%0#0, %0#2, %0#1] by (%arg1, 4, %arg2) : index + %1 = affine.linearize_index [%0#0, %0#2, %0#1] by (%arg1, %arg2, 4) : index + return %1 : index +} + +// ----- + +// CHECK: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 * 3)> +// But these cancel because they're a contiguous segment +// CHECK-LABEL: func @partial_cancel_linearize_delinearize_not_fully_permuted( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: index) +// CHECK: %[[SIZEPROD:.+]] = affine.apply #[[$MAP]]()[%[[ARG2]]] +// CHECK: %[[DELIN:.+]]:3 = affine.delinearize_index %[[ARG0]] into (%[[ARG1]], 4, %[[SIZEPROD]]) +// CHECK: %[[LIN:.+]] = affine.linearize_index [%[[DELIN]]#0, %[[DELIN]]#2, %[[DELIN]]#1] by (%[[ARG1]], %[[SIZEPROD]], 4) +// CHECK: return %[[LIN]] +func.func @partial_cancel_linearize_delinearize_not_fully_permuted(%arg0: index, %arg1: index, %arg2: index) -> index { + %0:4 = affine.delinearize_index %arg0 into (%arg1, 4, %arg2, 3) : index, index, index, index + %1 = affine.linearize_index [%0#0, %0#2, %0#3, %0#1] by (%arg1, %arg2, 3, 4) : index return %1 : index } // ----- +// Ensure we don't get SSA errors when creating new `affine.delinearize` operations. 
+// CHECK-LABEL: func @cancel_linearize_delinearize_placement +// CHECK-SAME: (%[[ARG0:.+]]: index) +// CHECK: %[[C0:.+]] = arith.constant 0 : index +// CHECK: %[[NEW_DELIN:.+]]:2 = affine.delinearize_index %[[ARG0]] into (8, 32) : index, index +// CHECK-NEXT: %[[DELIN_PART:.+]]:2 = affine.delinearize_index %[[NEW_DELIN]]#1 into (8, 4) : index, index +// CHECK-NEXT: %[[L1:.+]] = affine.linearize_index disjoint [%[[DELIN_PART]]#1, %[[NEW_DELIN]]#0, %[[C0]], %[[C0]]] by (4, 8, 4, 8) +// CHECK-NEXT: %[[L2:.+]] = affine.linearize_index disjoint [%[[NEW_DELIN]]#1, %[[C0]], %[[C0]]] by (32, 8, 4) +// CHECK-NEXT: %[[L3:.+]] = affine.linearize_index disjoint [%[[DELIN_PART]]#0, %[[NEW_DELIN]]#0, %[[C0]], %[[C0]]] by (8, 8, 4, 4) +// CHECK-NEXT: return %[[L1]], %[[L2]], %[[L3]] +func.func @cancel_linearize_delinearize_placement(%arg0: index) -> (index, index, index) { + %c0 = arith.constant 0 : index + %0:3 = affine.delinearize_index %arg0 into (8, 8, 4) : index, index, index + %1 = affine.linearize_index disjoint [%0#2, %0#0, %c0, %c0] by (4, 8, 4, 8) : index + %2 = affine.linearize_index disjoint [%0#1, %0#2, %c0, %c0] by (8, 4, 8, 4) : index + %3 = affine.linearize_index disjoint [%0#1, %0#0, %c0, %c0] by (8, 8, 4, 4) : index + return %1, %2, %3 : index, index, index +} + +// ----- + // Won't cancel because the linearize and delinearize are using a different basis -// CHECK-LABEL: func @no_cancel_linearize_denearize_different_basis( +// CHECK-LABEL: func @no_cancel_linearize_delinearize_different_basis( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, // CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index, // CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: index) // CHECK: %[[DELIN:.+]]:3 = affine.delinearize_index %[[ARG0]] into (%[[ARG1]], 4, %[[ARG2]]) // CHECK: %[[LIN:.+]] = affine.linearize_index [%[[DELIN]]#0, %[[DELIN]]#1, %[[DELIN]]#2] by (%[[ARG1]], 8, %[[ARG2]]) // CHECK: return %[[LIN]] -func.func @no_cancel_linearize_denearize_different_basis(%arg0: index, %arg1: index, %arg2: index) 
-> index { +func.func @no_cancel_linearize_delinearize_different_basis(%arg0: index, %arg1: index, %arg2: index) -> index { %0:3 = affine.delinearize_index %arg0 into (%arg1, 4, %arg2) : index, index, index %1 = affine.linearize_index [%0#0, %0#1, %0#2] by (%arg1, 8, %arg2) : index return %1 : index From 18b47373cb47f1f63ab1f6e126ccfb22cc52963c Mon Sep 17 00:00:00 2001 From: jmriesen <20286401+jmriesen@users.noreply.github.com> Date: Fri, 3 Jan 2025 15:18:39 -0600 Subject: [PATCH 402/567] Updating broken/outdated links in the ProgrammerManual (#119472) Fixes llvm/llvm-project#117897 --- llvm/docs/ProgrammersManual.rst | 6 +++--- llvm/include/llvm/IR/PassManager.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/docs/ProgrammersManual.rst b/llvm/docs/ProgrammersManual.rst index 98803ddffd082..e2829eb5a8846 100644 --- a/llvm/docs/ProgrammersManual.rst +++ b/llvm/docs/ProgrammersManual.rst @@ -3358,15 +3358,15 @@ the ``PassManager.h`` system, and there is a more detailed introduction to it by Sean Parent in several of his talks and papers: #. `Inheritance Is The Base Class of Evil - `_ + `_ - The GoingNative 2013 talk describing this technique, and probably the best place to start. #. `Value Semantics and Concepts-based Polymorphism `_ - The C++Now! 2012 talk describing this technique in more detail. #. `Sean Parent's Papers and Presentations - `_ - - A GitHub project full of links to slides, video, and sometimes code. + `_ + - Links to slides, videos, and sometimes code. 
When deciding between creating a type hierarchy (with either tagged or virtual dispatch) and using templates or concepts-based polymorphism, consider whether diff --git a/llvm/include/llvm/IR/PassManager.h b/llvm/include/llvm/IR/PassManager.h index 5dab9d0d0a797..b5230047b0e12 100644 --- a/llvm/include/llvm/IR/PassManager.h +++ b/llvm/include/llvm/IR/PassManager.h @@ -28,9 +28,9 @@ /// polymorphism as outlined in the "Value Semantics and Concept-based /// Polymorphism" talk (or its abbreviated sibling "Inheritance Is The Base /// Class of Evil") by Sean Parent: -/// * http://github.com/sean-parent/sean-parent.github.com/wiki/Papers-and-Presentations +/// * https://sean-parent.stlab.cc/papers-and-presentations /// * http://www.youtube.com/watch?v=_BpMYeUFXv8 -/// * http://channel9.msdn.com/Events/GoingNative/2013/Inheritance-Is-The-Base-Class-of-Evil +/// * https://learn.microsoft.com/en-us/shows/goingnative-2013/inheritance-base-class-of-evil /// //===----------------------------------------------------------------------===// From d85b22ed5dbb794835fd4b5166d5bb79ad9e09f2 Mon Sep 17 00:00:00 2001 From: kefan cao <45958009+caokefan@users.noreply.github.com> Date: Sat, 4 Jan 2025 05:32:02 +0800 Subject: [PATCH 403/567] [Clang][ASTMatcher] Add `dependentTemplateSpecializationType` matcher (#121435) Fixes https://github.com/llvm/llvm-project/issues/121307 --- clang/docs/LibASTMatchersReference.html | 11 +++++++++++ clang/docs/ReleaseNotes.rst | 2 ++ clang/include/clang/ASTMatchers/ASTMatchers.h | 12 ++++++++++++ clang/lib/ASTMatchers/ASTMatchersInternal.cpp | 2 ++ clang/lib/ASTMatchers/Dynamic/Registry.cpp | 1 + clang/unittests/AST/ASTImporterTest.cpp | 4 ---- .../unittests/ASTMatchers/ASTMatchersNodeTest.cpp | 15 +++++++++++++++ 7 files changed, 43 insertions(+), 4 deletions(-) diff --git a/clang/docs/LibASTMatchersReference.html b/clang/docs/LibASTMatchersReference.html index 8564f2650d205..fc55788801325 100644 --- a/clang/docs/LibASTMatchersReference.html +++ 
b/clang/docs/LibASTMatchersReference.html @@ -2546,6 +2546,17 @@

Node Matchers

};
+Matcher<Type>dependentTemplateSpecializationTypeMatcher<DependentTemplateSpecializationType>... +
Matches a dependent template specialization type.
+
+Example matches A::template B
+
+  template struct A;
+  template struct declToImport {
+    typename A::template B a;
+  };
+
+ Matcher<Type>deducedTemplateSpecializationTypeMatcher<DeducedTemplateSpecializationType>...
Matches C++17 deduced template specialization types, e.g. deduced class
 template types.
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 61d6aa2216cd0..5e75fc447636e 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -1114,6 +1114,8 @@ AST Matchers
 
 - Add ``dependentNameType`` matcher to match a dependent name type.
 
+- Add ``dependentTemplateSpecializationType`` matcher to match a dependent template specialization type.
+
 clang-format
 ------------
 
diff --git a/clang/include/clang/ASTMatchers/ASTMatchers.h b/clang/include/clang/ASTMatchers/ASTMatchers.h
index 9a046714068a5..dd0fedb2cda2d 100644
--- a/clang/include/clang/ASTMatchers/ASTMatchers.h
+++ b/clang/include/clang/ASTMatchers/ASTMatchers.h
@@ -7721,6 +7721,18 @@ AST_MATCHER_P(DecayedType, hasDecayedType, internal::Matcher,
 /// \endcode
 extern const AstTypeMatcher dependentNameType;
 
+/// Matches a dependent template specialization type
+///
+/// Example matches A::template B
+/// \code
+///   template struct A;
+///   template struct declToImport {
+///     typename A::template B a;
+///   };
+/// \endcode
+extern const AstTypeMatcher
+    dependentTemplateSpecializationType;
+
 /// Matches declarations whose declaration context, interpreted as a
 /// Decl, matches \c InnerMatcher.
 ///
diff --git a/clang/lib/ASTMatchers/ASTMatchersInternal.cpp b/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
index a47633bf4bae2..9c7943a98d652 100644
--- a/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
+++ b/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
@@ -1109,6 +1109,8 @@ const AstTypeMatcher templateTypeParmType;
 const AstTypeMatcher injectedClassNameType;
 const AstTypeMatcher decayedType;
 const AstTypeMatcher dependentNameType;
+const AstTypeMatcher
+    dependentTemplateSpecializationType;
 AST_TYPELOC_TRAVERSE_MATCHER_DEF(hasElementType,
                                  AST_POLYMORPHIC_SUPPORTED_TYPES(ArrayType,
                                                                  ComplexType));
diff --git a/clang/lib/ASTMatchers/Dynamic/Registry.cpp b/clang/lib/ASTMatchers/Dynamic/Registry.cpp
index bfdee412c5328..97e6bbc093fe4 100644
--- a/clang/lib/ASTMatchers/Dynamic/Registry.cpp
+++ b/clang/lib/ASTMatchers/Dynamic/Registry.cpp
@@ -224,6 +224,7 @@ RegistryMaps::RegistryMaps() {
   REGISTER_MATCHER(declRefExpr);
   REGISTER_MATCHER(dependentNameType);
   REGISTER_MATCHER(dependentScopeDeclRefExpr);
+  REGISTER_MATCHER(dependentTemplateSpecializationType);
   REGISTER_MATCHER(declStmt);
   REGISTER_MATCHER(declaratorDecl);
   REGISTER_MATCHER(decltypeType);
diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp
index ee1d896f1ca6d..d197d30df3adf 100644
--- a/clang/unittests/AST/ASTImporterTest.cpp
+++ b/clang/unittests/AST/ASTImporterTest.cpp
@@ -763,10 +763,6 @@ TEST_P(ImportType, ImportPackExpansion) {
                                    implicitCastExpr(has(declRefExpr()))))))));
 }
 
-const internal::VariadicDynCastAllOfMatcher
-    dependentTemplateSpecializationType;
-
 TEST_P(ImportType, ImportDependentTemplateSpecialization) {
   MatchVerifier Verifier;
   testImport("template"
diff --git a/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp b/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp
index b8521e2f95768..680e21840b7d3 100644
--- a/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp
+++ b/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp
@@ -1926,6 +1926,21 @@ TEST_P(ASTMatchersTest, DependentNameType) {
       dependentNameType()));
 }
 
+TEST_P(ASTMatchersTest, DependentTemplateSpecializationType) {
+  if (!GetParam().isCXX()) {
+    return;
+  }
+
+  EXPECT_TRUE(matches(
+      R"(
+        template struct A;
+        template struct declToImport {
+          typename A::template B a;
+        };
+      )",
+      dependentTemplateSpecializationType()));
+}
+
 TEST_P(ASTMatchersTest, RecordType) {
   EXPECT_TRUE(matches("struct S {}; struct S s;",
                       recordType(hasDeclaration(recordDecl(hasName("S"))))));

From d7acf03cecef0bc62240c97a890077755323424f Mon Sep 17 00:00:00 2001
From: Brox Chen 
Date: Fri, 3 Jan 2025 16:32:15 -0500
Subject: [PATCH 404/567] [AMDGPU][True16][MC] true16 for v_rndne_f16 (#120691)

Support true16 format for v_rndne_f16 in MC
---
 llvm/lib/Target/AMDGPU/VOP1Instructions.td    |   2 +-
 llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll     |  41 +++++
 llvm/test/MC/AMDGPU/gfx11_asm_vop1.s          |  75 +++++----
 llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s    |  65 ++++----
 llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s     |  21 ++-
 llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s  |  42 +++++
 .../MC/AMDGPU/gfx11_asm_vop1_t16_promote.s    | 154 +++++++++++++-----
 .../AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s   |  65 ++++----
 .../MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s |  25 ++-
 .../test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s |  69 ++++----
 llvm/test/MC/AMDGPU/gfx12_asm_vop1.s          |  72 ++++----
 llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s    |  62 +++----
 llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s     |  18 +-
 llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s  |  42 +++++
 .../MC/AMDGPU/gfx12_asm_vop1_t16_promote.s    | 154 +++++++++++++-----
 .../test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s |  69 ++++----
 .../AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s   |  65 ++++----
 .../MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s |  25 ++-
 .../Disassembler/AMDGPU/gfx11_dasm_vop1.txt   |  63 +++++--
 .../AMDGPU/gfx11_dasm_vop1_dpp16.txt          |  54 ++++--
 .../AMDGPU/gfx11_dasm_vop1_dpp8.txt           |  17 +-
 .../gfx11_dasm_vop3_dpp16_from_vop1.txt       |  54 ++++--
 .../AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt |  24 ++-
 .../AMDGPU/gfx11_dasm_vop3_from_vop1.txt      |  57 +++++--
 .../AMDGPU/gfx12_dasm_vop1_dpp16.txt          |  50 ++++--
 .../AMDGPU/gfx12_dasm_vop1_dpp8.txt           |  13 +-
 .../AMDGPU/gfx12_dasm_vop3_from_vop1.txt      |  57 +++++--
 .../gfx12_dasm_vop3_from_vop1_dpp16.txt       |  54 ++++--
 .../AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt |  24 ++-
 29 files changed, 1071 insertions(+), 462 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index fc22b539d7153..f0d2fe0f4f547 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -1043,7 +1043,7 @@ defm V_FLOOR_F16_fake16      : VOP1_Real_FULL_t16_gfx11_gfx12<0x05b, "v_floor_f1
 defm V_CEIL_F16_t16          : VOP1_Real_FULL_t16_gfx11_gfx12<0x05c, "v_ceil_f16">;
 defm V_CEIL_F16_fake16       : VOP1_Real_FULL_t16_gfx11_gfx12<0x05c, "v_ceil_f16">;
 defm V_TRUNC_F16             : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05d, "v_trunc_f16">;
-defm V_RNDNE_F16_fake16      : VOP1_Real_FULL_t16_gfx11_gfx12<0x05e, "v_rndne_f16">;
+defm V_RNDNE_F16             : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05e, "v_rndne_f16">;
 defm V_FRACT_F16             : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05f, "v_fract_f16">;
 defm V_SIN_F16               : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x060, "v_sin_f16">;
 defm V_COS_F16               : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x061, "v_cos_f16">;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
index 4de0c548ad381..795ed6d542a13 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
@@ -3,6 +3,7 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89,VI %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89,GFX9 %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12 %s
 
 declare half @llvm.rint.f16(half %a)
 declare <2 x half> @llvm.rint.v2f16(<2 x half> %a)
@@ -63,6 +64,24 @@ define amdgpu_kernel void @rint_f16(
 ; GFX11-NEXT:    v_rndne_f16_e32 v0, v0
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: rint_f16:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s6, -1
+; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s10, s6
+; GFX12-NEXT:    s_mov_b32 s11, s7
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s2
+; GFX12-NEXT:    s_mov_b32 s9, s3
+; GFX12-NEXT:    s_mov_b32 s4, s0
+; GFX12-NEXT:    buffer_load_u16 v0, off, s[8:11], null
+; GFX12-NEXT:    s_mov_b32 s5, s1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_rndne_f16_e32 v0, v0
+; GFX12-NEXT:    buffer_store_b16 v0, off, s[4:7], null
+; GFX12-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
 entry:
@@ -168,6 +187,28 @@ define amdgpu_kernel void @rint_v2f16(
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: rint_v2f16:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s6, -1
+; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s10, s6
+; GFX12-NEXT:    s_mov_b32 s11, s7
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s2
+; GFX12-NEXT:    s_mov_b32 s9, s3
+; GFX12-NEXT:    s_mov_b32 s4, s0
+; GFX12-NEXT:    buffer_load_b32 v0, off, s[8:11], null
+; GFX12-NEXT:    s_mov_b32 s5, s1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-NEXT:    v_rndne_f16_e32 v0, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_rndne_f16_e32 v1, v1
+; GFX12-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX12-NEXT:    buffer_store_b32 v0, off, s[4:7], null
+; GFX12-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
 entry:
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s
index 40a6e434b438d..2480be97a7a64 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s
@@ -3044,50 +3044,65 @@ v_readfirstlane_b32 ttmp15, v1
 v_readfirstlane_b32 null, v255
 // GFX11: v_readfirstlane_b32 null, v255          ; encoding: [0xff,0x05,0xf8,0x7e]
 
-v_rndne_f16 v5, v1
-// GFX11: v_rndne_f16_e32 v5, v1                  ; encoding: [0x01,0xbd,0x0a,0x7e]
+v_rndne_f16 v5.l, v1.l
+// GFX11: v_rndne_f16_e32 v5.l, v1.l              ; encoding: [0x01,0xbd,0x0a,0x7e]
 
-v_rndne_f16 v5, v127
-// GFX11: v_rndne_f16_e32 v5, v127                ; encoding: [0x7f,0xbd,0x0a,0x7e]
+v_rndne_f16 v5.l, v127.l
+// GFX11: v_rndne_f16_e32 v5.l, v127.l            ; encoding: [0x7f,0xbd,0x0a,0x7e]
 
-v_rndne_f16 v5, s1
-// GFX11: v_rndne_f16_e32 v5, s1                  ; encoding: [0x01,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, s1
+// GFX11: v_rndne_f16_e32 v5.l, s1                ; encoding: [0x01,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, s105
-// GFX11: v_rndne_f16_e32 v5, s105                ; encoding: [0x69,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, s105
+// GFX11: v_rndne_f16_e32 v5.l, s105              ; encoding: [0x69,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, vcc_lo
-// GFX11: v_rndne_f16_e32 v5, vcc_lo              ; encoding: [0x6a,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, vcc_lo
+// GFX11: v_rndne_f16_e32 v5.l, vcc_lo            ; encoding: [0x6a,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, vcc_hi
-// GFX11: v_rndne_f16_e32 v5, vcc_hi              ; encoding: [0x6b,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, vcc_hi
+// GFX11: v_rndne_f16_e32 v5.l, vcc_hi            ; encoding: [0x6b,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, ttmp15
-// GFX11: v_rndne_f16_e32 v5, ttmp15              ; encoding: [0x7b,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, ttmp15
+// GFX11: v_rndne_f16_e32 v5.l, ttmp15            ; encoding: [0x7b,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, m0
-// GFX11: v_rndne_f16_e32 v5, m0                  ; encoding: [0x7d,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, m0
+// GFX11: v_rndne_f16_e32 v5.l, m0                ; encoding: [0x7d,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, exec_lo
-// GFX11: v_rndne_f16_e32 v5, exec_lo             ; encoding: [0x7e,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, exec_lo
+// GFX11: v_rndne_f16_e32 v5.l, exec_lo           ; encoding: [0x7e,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, exec_hi
-// GFX11: v_rndne_f16_e32 v5, exec_hi             ; encoding: [0x7f,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, exec_hi
+// GFX11: v_rndne_f16_e32 v5.l, exec_hi           ; encoding: [0x7f,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, null
-// GFX11: v_rndne_f16_e32 v5, null                ; encoding: [0x7c,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, null
+// GFX11: v_rndne_f16_e32 v5.l, null              ; encoding: [0x7c,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, -1
-// GFX11: v_rndne_f16_e32 v5, -1                  ; encoding: [0xc1,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, -1
+// GFX11: v_rndne_f16_e32 v5.l, -1                ; encoding: [0xc1,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, 0.5
-// GFX11: v_rndne_f16_e32 v5, 0.5                 ; encoding: [0xf0,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, 0.5
+// GFX11: v_rndne_f16_e32 v5.l, 0.5               ; encoding: [0xf0,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, src_scc
-// GFX11: v_rndne_f16_e32 v5, src_scc             ; encoding: [0xfd,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, src_scc
+// GFX11: v_rndne_f16_e32 v5.l, src_scc           ; encoding: [0xfd,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v127, 0xfe0b
-// GFX11: v_rndne_f16_e32 v127, 0xfe0b            ; encoding: [0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+v_rndne_f16 v127.l, 0xfe0b
+// GFX11: v_rndne_f16_e32 v127.l, 0xfe0b          ; encoding: [0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_rndne_f16 v5.l, v1.h
+// GFX11: v_rndne_f16_e32 v5.l, v1.h              ; encoding: [0x81,0xbd,0x0a,0x7e]
+
+v_rndne_f16 v5.l, v127.h
+// GFX11: v_rndne_f16_e32 v5.l, v127.h            ; encoding: [0xff,0xbd,0x0a,0x7e]
+
+v_rndne_f16 v127.l, 0.5
+// GFX11: v_rndne_f16_e32 v127.l, 0.5             ; encoding: [0xf0,0xbc,0xfe,0x7e]
+
+v_rndne_f16 v5.h, src_scc
+// GFX11: v_rndne_f16_e32 v5.h, src_scc           ; encoding: [0xfd,0xbc,0x0a,0x7f]
+
+v_rndne_f16 v127.h, 0xfe0b
+// GFX11: v_rndne_f16_e32 v127.h, 0xfe0b          ; encoding: [0xff,0xbc,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 v_rndne_f32 v5, v1
 // GFX11: v_rndne_f32_e32 v5, v1                  ; encoding: [0x01,0x47,0x0a,0x7e]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s
index 706cb6e32f88a..0f77279397485 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s
@@ -2399,47 +2399,56 @@ v_rcp_iflag_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_rcp_iflag_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_rcp_iflag_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x56,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
-v_rndne_f16 v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_rndne_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_rndne_f16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_rndne_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
-v_rndne_f16 v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_rndne_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_rndne_f16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_rndne_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
-v_rndne_f16 v5, v1 row_mirror
-// GFX11: v_rndne_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_mirror
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_half_mirror
-// GFX11: v_rndne_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_half_mirror
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_shl:1
-// GFX11: v_rndne_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_shl:1
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_shl:15
-// GFX11: v_rndne_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_shl:15
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_shr:1
-// GFX11: v_rndne_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_shr:1
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_shr:15
-// GFX11: v_rndne_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_shr:15
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_ror:1
-// GFX11: v_rndne_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_ror:1
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_ror:15
-// GFX11: v_rndne_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_ror:15
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_rndne_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_rndne_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_rndne_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_rndne_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_rndne_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_rndne_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x09,0x13]
 
-v_rndne_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_rndne_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+v_rndne_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_rndne_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_rndne_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_rndne_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+
+v_rndne_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_rndne_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbc,0x0a,0x7f,0x81,0x60,0x09,0x13]
+
+v_rndne_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_rndne_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
 v_rndne_f32 v5, v1 quad_perm:[3,2,1,0]
 // GFX11: v_rndne_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s
index d7051aff42d77..4a89305a5b353 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s
@@ -593,14 +593,23 @@ v_rcp_iflag_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_rcp_iflag_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_rcp_iflag_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x56,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_rndne_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_rndne_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_rndne_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
-v_rndne_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_rndne_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_rndne_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_rndne_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
-v_rndne_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_rndne_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_rndne_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_rndne_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_rndne_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+
+v_rndne_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_rndne_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbc,0x0a,0x7f,0x81,0x77,0x39,0x05]
+
+v_rndne_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_rndne_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbc,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_rndne_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_rndne_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x46,0x0a,0x7e,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s
index 263ad4bf513a1..7d29adcd73ccc 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s
@@ -833,6 +833,12 @@ v_rcp_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
 v_rndne_f16_e32 v128, 0xfe0b
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
+v_rndne_f16_e32 v128.h, 0xfe0b
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v128.l, 0xfe0b
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
 v_rndne_f16_e32 v255, v1
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -842,6 +848,24 @@ v_rndne_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
 v_rndne_f16_e32 v255, v1 quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
 
+v_rndne_f16_e32 v255.h, v1.h
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.l, v1.l
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
 v_rndne_f16_e32 v5, v199
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -851,6 +875,24 @@ v_rndne_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_rndne_f16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
 
+v_rndne_f16_e32 v5.h, v199.h
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.l, v199.l
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
 v_rsq_f16_e32 v128.h, 0xfe0b
 // GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
 
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s
index 42c36538f2bf6..f2dbb782186f6 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s
@@ -1943,71 +1943,137 @@ v_rcp_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_rcp_f16 v5, v199 quad_perm:[3,2,1,0]
 // GFX11: v_rcp_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_rndne_f16 v128, 0xfe0b
-// GFX11: v_rndne_f16_e64 v128, 0xfe0b            ; encoding: [0x80,0x00,0xde,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+v_rndne_f16 v128.h, 0xfe0b
+// GFX11: v_rndne_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xde,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_rndne_f16 v255, -1
-// GFX11: v_rndne_f16_e64 v255, -1                ; encoding: [0xff,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
+v_rndne_f16 v128.l, 0xfe0b
+// GFX11: v_rndne_f16_e64 v128.l, 0xfe0b          ; encoding: [0x80,0x00,0xde,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_rndne_f16 v255, 0.5
-// GFX11: v_rndne_f16_e64 v255, 0.5               ; encoding: [0xff,0x00,0xde,0xd5,0xf0,0x00,0x00,0x00]
+v_rndne_f16 v255.h, -1
+// GFX11: v_rndne_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0xc1,0x00,0x00,0x00]
 
-v_rndne_f16 v255, exec_hi
-// GFX11: v_rndne_f16_e64 v255, exec_hi           ; encoding: [0xff,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
+v_rndne_f16 v255.h, 0.5
+// GFX11: v_rndne_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0xf0,0x00,0x00,0x00]
 
-v_rndne_f16 v255, exec_lo
-// GFX11: v_rndne_f16_e64 v255, exec_lo           ; encoding: [0xff,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
+v_rndne_f16 v255.h, exec_hi
+// GFX11: v_rndne_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7f,0x00,0x00,0x00]
 
-v_rndne_f16 v255, m0
-// GFX11: v_rndne_f16_e64 v255, m0                ; encoding: [0xff,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
+v_rndne_f16 v255.h, exec_lo
+// GFX11: v_rndne_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7e,0x00,0x00,0x00]
 
-v_rndne_f16 v255, null
-// GFX11: v_rndne_f16_e64 v255, null              ; encoding: [0xff,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
+v_rndne_f16 v255.h, m0
+// GFX11: v_rndne_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7d,0x00,0x00,0x00]
 
-v_rndne_f16 v255, s1
-// GFX11: v_rndne_f16_e64 v255, s1                ; encoding: [0xff,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
+v_rndne_f16 v255.h, null
+// GFX11: v_rndne_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7c,0x00,0x00,0x00]
 
-v_rndne_f16 v255, s105
-// GFX11: v_rndne_f16_e64 v255, s105              ; encoding: [0xff,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
+v_rndne_f16 v255.h, s1
+// GFX11: v_rndne_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x01,0x00,0x00,0x00]
 
-v_rndne_f16 v255, src_scc
-// GFX11: v_rndne_f16_e64 v255, src_scc           ; encoding: [0xff,0x00,0xde,0xd5,0xfd,0x00,0x00,0x00]
+v_rndne_f16 v255.h, s105
+// GFX11: v_rndne_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x69,0x00,0x00,0x00]
 
-v_rndne_f16 v255, ttmp15
-// GFX11: v_rndne_f16_e64 v255, ttmp15            ; encoding: [0xff,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
+v_rndne_f16 v255.h, src_scc
+// GFX11: v_rndne_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0xfd,0x00,0x00,0x00]
 
-v_rndne_f16 v255, v1
-// GFX11: v_rndne_f16_e64 v255, v1                ; encoding: [0xff,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+v_rndne_f16 v255.h, ttmp15
+// GFX11: v_rndne_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7b,0x00,0x00,0x00]
 
-v_rndne_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_rndne_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_rndne_f16 v255.h, v1.h
+// GFX11: v_rndne_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xde,0xd5,0x01,0x01,0x00,0x00]
 
-v_rndne_f16 v255, v1 quad_perm:[3,2,1,0]
-// GFX11: v_rndne_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_rndne_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_rndne_f16 v255, v127
-// GFX11: v_rndne_f16_e64 v255, v127              ; encoding: [0xff,0x00,0xde,0xd5,0x7f,0x01,0x00,0x00]
+v_rndne_f16 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_rndne_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_rndne_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+v_rndne_f16 v255.h, v127.h
+// GFX11: v_rndne_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xde,0xd5,0x7f,0x01,0x00,0x00]
 
-v_rndne_f16 v255, v127 quad_perm:[3,2,1,0]
-// GFX11: v_rndne_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+v_rndne_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xde,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
 
-v_rndne_f16 v255, vcc_hi
-// GFX11: v_rndne_f16_e64 v255, vcc_hi            ; encoding: [0xff,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+v_rndne_f16 v255.h, v127.h quad_perm:[3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xde,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
 
-v_rndne_f16 v255, vcc_lo
-// GFX11: v_rndne_f16_e64 v255, vcc_lo            ; encoding: [0xff,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+v_rndne_f16 v255.h, vcc_hi
+// GFX11: v_rndne_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x6b,0x00,0x00,0x00]
 
-v_rndne_f16 v5, v199
-// GFX11: v_rndne_f16_e64 v5, v199                ; encoding: [0x05,0x00,0xde,0xd5,0xc7,0x01,0x00,0x00]
+v_rndne_f16 v255.h, vcc_lo
+// GFX11: v_rndne_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x6a,0x00,0x00,0x00]
 
-v_rndne_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_rndne_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_rndne_f16 v255.l, -1
+// GFX11: v_rndne_f16_e64 v255.l, -1              ; encoding: [0xff,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
 
-v_rndne_f16 v5, v199 quad_perm:[3,2,1,0]
-// GFX11: v_rndne_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_rndne_f16 v255.l, 0.5
+// GFX11: v_rndne_f16_e64 v255.l, 0.5             ; encoding: [0xff,0x00,0xde,0xd5,0xf0,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, exec_hi
+// GFX11: v_rndne_f16_e64 v255.l, exec_hi         ; encoding: [0xff,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, exec_lo
+// GFX11: v_rndne_f16_e64 v255.l, exec_lo         ; encoding: [0xff,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, m0
+// GFX11: v_rndne_f16_e64 v255.l, m0              ; encoding: [0xff,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, null
+// GFX11: v_rndne_f16_e64 v255.l, null            ; encoding: [0xff,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, s1
+// GFX11: v_rndne_f16_e64 v255.l, s1              ; encoding: [0xff,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, s105
+// GFX11: v_rndne_f16_e64 v255.l, s105            ; encoding: [0xff,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, src_scc
+// GFX11: v_rndne_f16_e64 v255.l, src_scc         ; encoding: [0xff,0x00,0xde,0xd5,0xfd,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, ttmp15
+// GFX11: v_rndne_f16_e64 v255.l, ttmp15          ; encoding: [0xff,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, v1.l
+// GFX11: v_rndne_f16_e64 v255.l, v1.l            ; encoding: [0xff,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+
+v_rndne_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_rndne_f16 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_rndne_f16 v255.l, v127.l
+// GFX11: v_rndne_f16_e64 v255.l, v127.l          ; encoding: [0xff,0x00,0xde,0xd5,0x7f,0x01,0x00,0x00]
+
+v_rndne_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+
+v_rndne_f16 v255.l, v127.l quad_perm:[3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+
+v_rndne_f16 v255.l, vcc_hi
+// GFX11: v_rndne_f16_e64 v255.l, vcc_hi          ; encoding: [0xff,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, vcc_lo
+// GFX11: v_rndne_f16_e64 v255.l, vcc_lo          ; encoding: [0xff,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+
+v_rndne_f16 v5.h, v199.h
+// GFX11: v_rndne_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xde,0xd5,0xc7,0x01,0x00,0x00]
+
+v_rndne_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xde,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_rndne_f16 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xde,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+
+v_rndne_f16 v5.l, v199.l
+// GFX11: v_rndne_f16_e64 v5.l, v199.l            ; encoding: [0x05,0x00,0xde,0xd5,0xc7,0x01,0x00,0x00]
+
+v_rndne_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_rndne_f16 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
 v_rsq_f16 v128, 0xfe0b
 // GFX11: v_rsq_f16_e64 v128, 0xfe0b              ; encoding: [0x80,0x00,0xd6,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s
index 874fb5bffa0ad..b0a9478203a34 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s
@@ -2506,47 +2506,56 @@ v_rcp_iflag_f32_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 boun
 v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xab,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
-v_rndne_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_rndne_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_rndne_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_mirror
-// GFX11: v_rndne_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_mirror
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_half_mirror
-// GFX11: v_rndne_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_half_mirror
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_shl:1
-// GFX11: v_rndne_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_shl:1
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_shl:15
-// GFX11: v_rndne_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_shl:15
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_shr:1
-// GFX11: v_rndne_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_shr:1
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_shr:15
-// GFX11: v_rndne_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_shr:15
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_ror:1
-// GFX11: v_rndne_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_ror:1
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_ror:15
-// GFX11: v_rndne_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_ror:15
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_rndne_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+v_rndne_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
-v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+v_rndne_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
 
-v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+
+v_rndne_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: [0x05,0x48,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+v_rndne_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: [0x05,0x08,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+
+v_rndne_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: [0xff,0xc1,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
 v_rndne_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX11: v_rndne_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa3,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s
index 8e6783e0f413c..eae5d3e799ba7 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s
@@ -742,17 +742,26 @@ v_rcp_iflag_f32_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xab,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
-v_rndne_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_rndne_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_rndne_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+v_rndne_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
-v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xde,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+v_rndne_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xde,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
-v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xde,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xde,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+v_rndne_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x48,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+v_rndne_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: [0x05,0x08,0xde,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+v_rndne_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: [0xff,0xc1,0xde,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 v_rndne_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_rndne_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa3,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s
index 3f9af472a6372..9ecae211ecd86 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s
@@ -2977,50 +2977,59 @@ v_rcp_iflag_f32_e64 v5, src_scc mul:4
 v_rcp_iflag_f32_e64 v255, -|0xaf123456| clamp div:2
 // GFX11: v_rcp_iflag_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xab,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf]
 
-v_rndne_f16_e64 v5, v1
-// GFX11: v_rndne_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+v_rndne_f16_e64 v5.l, v1.l
+// GFX11: v_rndne_f16_e64 v5.l, v1.l              ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
 
-v_rndne_f16_e64 v5, v255
-// GFX11: v_rndne_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
+v_rndne_f16_e64 v5.l, v255.l
+// GFX11: v_rndne_f16_e64 v5.l, v255.l            ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
 
-v_rndne_f16_e64 v5, s1
-// GFX11: v_rndne_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, s1
+// GFX11: v_rndne_f16_e64 v5.l, s1                ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, s105
-// GFX11: v_rndne_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, s105
+// GFX11: v_rndne_f16_e64 v5.l, s105              ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, vcc_lo
-// GFX11: v_rndne_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, vcc_lo
+// GFX11: v_rndne_f16_e64 v5.l, vcc_lo            ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, vcc_hi
-// GFX11: v_rndne_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, vcc_hi
+// GFX11: v_rndne_f16_e64 v5.l, vcc_hi            ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, ttmp15
-// GFX11: v_rndne_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, ttmp15
+// GFX11: v_rndne_f16_e64 v5.l, ttmp15            ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, m0
-// GFX11: v_rndne_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, m0
+// GFX11: v_rndne_f16_e64 v5.l, m0                ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, exec_lo
-// GFX11: v_rndne_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, exec_lo
+// GFX11: v_rndne_f16_e64 v5.l, exec_lo           ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, exec_hi
-// GFX11: v_rndne_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, exec_hi
+// GFX11: v_rndne_f16_e64 v5.l, exec_hi           ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, null
-// GFX11: v_rndne_f16_e64 v5, null                ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, null
+// GFX11: v_rndne_f16_e64 v5.l, null              ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, -1
-// GFX11: v_rndne_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, -1
+// GFX11: v_rndne_f16_e64 v5.l, -1                ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, 0.5 mul:2
-// GFX11: v_rndne_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
+v_rndne_f16_e64 v5.l, 0.5 mul:2
+// GFX11: v_rndne_f16_e64 v5.l, 0.5 mul:2         ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
 
-v_rndne_f16_e64 v5, src_scc mul:4
-// GFX11: v_rndne_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
+v_rndne_f16_e64 v5.l, src_scc mul:4
+// GFX11: v_rndne_f16_e64 v5.l, src_scc mul:4     ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
 
-v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2
-// GFX11: v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+v_rndne_f16_e64 v255.l, -|0xfe0b| clamp div:2
+// GFX11: v_rndne_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+v_rndne_f16_e64 v5.h, v1.h
+// GFX11: [0x05,0x48,0xde,0xd5,0x01,0x01,0x00,0x00]
+
+v_rndne_f16_e64 v5.l, v255.h
+// GFX11: [0x05,0x08,0xde,0xd5,0xff,0x01,0x00,0x00]
+
+v_rndne_f16_e64 v255.h, -|0xfe0b| clamp div:2
+// GFX11: [0xff,0xc1,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 v_rndne_f32_e64 v5, v1
 // GFX11: v_rndne_f32_e64 v5, v1                  ; encoding: [0x05,0x00,0xa3,0xd5,0x01,0x01,0x00,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s
index 6c69f3fb78bc0..089ad41448f00 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s
@@ -3102,50 +3102,62 @@ v_readfirstlane_b32 ttmp15, v1
 v_readfirstlane_b32 null, v255
 // GFX12: v_readfirstlane_b32 null, v255 ; encoding: [0xff,0x05,0xf8,0x7e]
 
-v_rndne_f16 v5, v1
-// GFX12: v_rndne_f16_e32 v5, v1 ; encoding: [0x01,0xbd,0x0a,0x7e]
+v_rndne_f16 v5.l, v1.l
+// GFX12: v_rndne_f16_e32 v5.l, v1.l ; encoding: [0x01,0xbd,0x0a,0x7e]
 
-v_rndne_f16 v5, v127
-// GFX12: v_rndne_f16_e32 v5, v127 ; encoding: [0x7f,0xbd,0x0a,0x7e]
+v_rndne_f16 v5.l, v127.l
+// GFX12: v_rndne_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xbd,0x0a,0x7e]
 
-v_rndne_f16 v5, s1
-// GFX12: v_rndne_f16_e32 v5, s1 ; encoding: [0x01,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, s1
+// GFX12: v_rndne_f16_e32 v5.l, s1 ; encoding: [0x01,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, s105
-// GFX12: v_rndne_f16_e32 v5, s105 ; encoding: [0x69,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, s105
+// GFX12: v_rndne_f16_e32 v5.l, s105 ; encoding: [0x69,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, vcc_lo
-// GFX12: v_rndne_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, vcc_lo
+// GFX12: v_rndne_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, vcc_hi
-// GFX12: v_rndne_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, vcc_hi
+// GFX12: v_rndne_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, ttmp15
-// GFX12: v_rndne_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, ttmp15
+// GFX12: v_rndne_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, m0
-// GFX12: v_rndne_f16_e32 v5, m0 ; encoding: [0x7d,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, m0
+// GFX12: v_rndne_f16_e32 v5.l, m0 ; encoding: [0x7d,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, exec_lo
-// GFX12: v_rndne_f16_e32 v5, exec_lo ; encoding: [0x7e,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, exec_lo
+// GFX12: v_rndne_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, exec_hi
-// GFX12: v_rndne_f16_e32 v5, exec_hi ; encoding: [0x7f,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, exec_hi
+// GFX12: v_rndne_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, null
-// GFX12: v_rndne_f16_e32 v5, null ; encoding: [0x7c,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, null
+// GFX12: v_rndne_f16_e32 v5.l, null ; encoding: [0x7c,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, -1
-// GFX12: v_rndne_f16_e32 v5, -1 ; encoding: [0xc1,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, -1
+// GFX12: v_rndne_f16_e32 v5.l, -1 ; encoding: [0xc1,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, 0.5
-// GFX12: v_rndne_f16_e32 v5, 0.5 ; encoding: [0xf0,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, 0.5
+// GFX12: v_rndne_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, src_scc
-// GFX12: v_rndne_f16_e32 v5, src_scc ; encoding: [0xfd,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, src_scc
+// GFX12: v_rndne_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v127, 0xfe0b
-// GFX12: v_rndne_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+v_rndne_f16 v127.l, 0xfe0b
+// GFX12: v_rndne_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_rndne_f16 v5.l, v1.h
+// GFX12: v_rndne_f16_e32 v5.l, v1.h ; encoding: [0x81,0xbd,0x0a,0x7e]
+
+v_rndne_f16 v5.l, v127.h
+// GFX12: v_rndne_f16_e32 v5.l, v127.h ; encoding: [0xff,0xbd,0x0a,0x7e]
+
+v_rndne_f16 v5.h, src_scc
+// GFX12: v_rndne_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xbc,0x0a,0x7f]
+
+v_rndne_f16 v127.h, 0xfe0b
+// GFX12: v_rndne_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xbc,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 v_rndne_f32 v5, v1
 // GFX12: v_rndne_f32_e32 v5, v1 ; encoding: [0x01,0x47,0x0a,0x7e]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s
index 05a5f8bd44b9c..fc6b9f396a6a7 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s
@@ -2452,47 +2452,53 @@ v_rcp_iflag_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_rcp_iflag_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_rcp_iflag_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x56,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
-v_rndne_f16 v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_rndne_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_rndne_f16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_rndne_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
-v_rndne_f16 v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_rndne_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_rndne_f16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_rndne_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
-v_rndne_f16 v5, v1 row_mirror
-// GFX12: v_rndne_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_mirror
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_half_mirror
-// GFX12: v_rndne_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_half_mirror
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_shl:1
-// GFX12: v_rndne_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_shl:1
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_shl:15
-// GFX12: v_rndne_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_shl:15
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_shr:1
-// GFX12: v_rndne_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_shr:1
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_shr:15
-// GFX12: v_rndne_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_shr:15
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_ror:1
-// GFX12: v_rndne_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_ror:1
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_ror:15
-// GFX12: v_rndne_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_ror:15
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_rndne_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_rndne_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_rndne_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_rndne_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_rndne_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_rndne_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x09,0x13]
 
-v_rndne_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_rndne_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+v_rndne_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_rndne_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_rndne_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_rndne_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbc,0x0a,0x7f,0x81,0x60,0x09,0x13]
+
+v_rndne_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_rndne_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
 v_rndne_f32 v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_rndne_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s
index bf03e7f8e518c..a77b95e1ef0cd 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s
@@ -604,14 +604,20 @@ v_rcp_iflag_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_rcp_iflag_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_rcp_iflag_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x56,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_rndne_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_rndne_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_rndne_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
-v_rndne_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_rndne_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_rndne_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_rndne_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
-v_rndne_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_rndne_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_rndne_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_rndne_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_rndne_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_rndne_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbc,0x0a,0x7f,0x81,0x77,0x39,0x05]
+
+v_rndne_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_rndne_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbc,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_rndne_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_rndne_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x46,0x0a,0x7e,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s
index f584b69c33ec8..0be79d016b78f 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s
@@ -764,6 +764,12 @@ v_rcp_f16_e32 v5, v199 quad_perm:[3,2,1,0]
 v_rndne_f16_e32 v128, 0xfe0b
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
+v_rndne_f16_e32 v128.h, 0xfe0b
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v128.l, 0xfe0b
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
 v_rndne_f16_e32 v255, v1
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -773,6 +779,24 @@ v_rndne_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
 v_rndne_f16_e32 v255, v1 quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction
 
+v_rndne_f16_e32 v255.h, v1.h
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.l, v1.l
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
 v_rndne_f16_e32 v5, v199
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -782,6 +806,24 @@ v_rndne_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_rndne_f16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction
 
+v_rndne_f16_e32 v5.h, v199.h
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.l, v199.l
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
 v_rsq_f16_e32 v128, 0xfe0b
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s
index 27e92b7e4f22b..440c1f09f6012 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s
@@ -1903,71 +1903,137 @@ v_rcp_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_rcp_f16 v5, v199 quad_perm:[3,2,1,0]
 // GFX12: v_rcp_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_rndne_f16 v128, 0xfe0b
-// GFX12: v_rndne_f16_e64 v128, 0xfe0b            ; encoding: [0x80,0x00,0xde,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+v_rndne_f16 v128.h, 0xfe0b
+// GFX12: v_rndne_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xde,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_rndne_f16 v255, -1
-// GFX12: v_rndne_f16_e64 v255, -1                ; encoding: [0xff,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
+v_rndne_f16 v128.l, 0xfe0b
+// GFX12: v_rndne_f16_e64 v128.l, 0xfe0b          ; encoding: [0x80,0x00,0xde,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_rndne_f16 v255, 0.5
-// GFX12: v_rndne_f16_e64 v255, 0.5               ; encoding: [0xff,0x00,0xde,0xd5,0xf0,0x00,0x00,0x00]
+v_rndne_f16 v255.h, -1
+// GFX12: v_rndne_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0xc1,0x00,0x00,0x00]
 
-v_rndne_f16 v255, exec_hi
-// GFX12: v_rndne_f16_e64 v255, exec_hi           ; encoding: [0xff,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
+v_rndne_f16 v255.h, 0.5
+// GFX12: v_rndne_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0xf0,0x00,0x00,0x00]
 
-v_rndne_f16 v255, exec_lo
-// GFX12: v_rndne_f16_e64 v255, exec_lo           ; encoding: [0xff,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
+v_rndne_f16 v255.h, exec_hi
+// GFX12: v_rndne_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7f,0x00,0x00,0x00]
 
-v_rndne_f16 v255, m0
-// GFX12: v_rndne_f16_e64 v255, m0                ; encoding: [0xff,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
+v_rndne_f16 v255.h, exec_lo
+// GFX12: v_rndne_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7e,0x00,0x00,0x00]
 
-v_rndne_f16 v255, null
-// GFX12: v_rndne_f16_e64 v255, null              ; encoding: [0xff,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
+v_rndne_f16 v255.h, m0
+// GFX12: v_rndne_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7d,0x00,0x00,0x00]
 
-v_rndne_f16 v255, s1
-// GFX12: v_rndne_f16_e64 v255, s1                ; encoding: [0xff,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
+v_rndne_f16 v255.h, null
+// GFX12: v_rndne_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7c,0x00,0x00,0x00]
 
-v_rndne_f16 v255, s105
-// GFX12: v_rndne_f16_e64 v255, s105              ; encoding: [0xff,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
+v_rndne_f16 v255.h, s1
+// GFX12: v_rndne_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x01,0x00,0x00,0x00]
 
-v_rndne_f16 v255, src_scc
-// GFX12: v_rndne_f16_e64 v255, src_scc           ; encoding: [0xff,0x00,0xde,0xd5,0xfd,0x00,0x00,0x00]
+v_rndne_f16 v255.h, s105
+// GFX12: v_rndne_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x69,0x00,0x00,0x00]
 
-v_rndne_f16 v255, ttmp15
-// GFX12: v_rndne_f16_e64 v255, ttmp15            ; encoding: [0xff,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
+v_rndne_f16 v255.h, src_scc
+// GFX12: v_rndne_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0xfd,0x00,0x00,0x00]
 
-v_rndne_f16 v255, v1
-// GFX12: v_rndne_f16_e64 v255, v1                ; encoding: [0xff,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+v_rndne_f16 v255.h, ttmp15
+// GFX12: v_rndne_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7b,0x00,0x00,0x00]
 
-v_rndne_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_rndne_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_rndne_f16 v255.h, v1.h
+// GFX12: v_rndne_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xde,0xd5,0x01,0x01,0x00,0x00]
 
-v_rndne_f16 v255, v1 quad_perm:[3,2,1,0]
-// GFX12: v_rndne_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_rndne_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_rndne_f16 v255, v127
-// GFX12: v_rndne_f16_e64 v255, v127              ; encoding: [0xff,0x00,0xde,0xd5,0x7f,0x01,0x00,0x00]
+v_rndne_f16 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_rndne_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_rndne_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+v_rndne_f16 v255.h, v127.h
+// GFX12: v_rndne_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xde,0xd5,0x7f,0x01,0x00,0x00]
 
-v_rndne_f16 v255, v127 quad_perm:[3,2,1,0]
-// GFX12: v_rndne_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+v_rndne_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xde,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
 
-v_rndne_f16 v255, vcc_hi
-// GFX12: v_rndne_f16_e64 v255, vcc_hi            ; encoding: [0xff,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+v_rndne_f16 v255.h, v127.h quad_perm:[3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xde,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
 
-v_rndne_f16 v255, vcc_lo
-// GFX12: v_rndne_f16_e64 v255, vcc_lo            ; encoding: [0xff,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+v_rndne_f16 v255.h, vcc_hi
+// GFX12: v_rndne_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x6b,0x00,0x00,0x00]
 
-v_rndne_f16 v5, v199
-// GFX12: v_rndne_f16_e64 v5, v199                ; encoding: [0x05,0x00,0xde,0xd5,0xc7,0x01,0x00,0x00]
+v_rndne_f16 v255.h, vcc_lo
+// GFX12: v_rndne_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x6a,0x00,0x00,0x00]
 
-v_rndne_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_rndne_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_rndne_f16 v255.l, -1
+// GFX12: v_rndne_f16_e64 v255.l, -1              ; encoding: [0xff,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
 
-v_rndne_f16 v5, v199 quad_perm:[3,2,1,0]
-// GFX12: v_rndne_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_rndne_f16 v255.l, 0.5
+// GFX12: v_rndne_f16_e64 v255.l, 0.5             ; encoding: [0xff,0x00,0xde,0xd5,0xf0,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, exec_hi
+// GFX12: v_rndne_f16_e64 v255.l, exec_hi         ; encoding: [0xff,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, exec_lo
+// GFX12: v_rndne_f16_e64 v255.l, exec_lo         ; encoding: [0xff,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, m0
+// GFX12: v_rndne_f16_e64 v255.l, m0              ; encoding: [0xff,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, null
+// GFX12: v_rndne_f16_e64 v255.l, null            ; encoding: [0xff,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, s1
+// GFX12: v_rndne_f16_e64 v255.l, s1              ; encoding: [0xff,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, s105
+// GFX12: v_rndne_f16_e64 v255.l, s105            ; encoding: [0xff,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, src_scc
+// GFX12: v_rndne_f16_e64 v255.l, src_scc         ; encoding: [0xff,0x00,0xde,0xd5,0xfd,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, ttmp15
+// GFX12: v_rndne_f16_e64 v255.l, ttmp15          ; encoding: [0xff,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, v1.l
+// GFX12: v_rndne_f16_e64 v255.l, v1.l            ; encoding: [0xff,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+
+v_rndne_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_rndne_f16 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_rndne_f16 v255.l, v127.l
+// GFX12: v_rndne_f16_e64 v255.l, v127.l          ; encoding: [0xff,0x00,0xde,0xd5,0x7f,0x01,0x00,0x00]
+
+v_rndne_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+
+v_rndne_f16 v255.l, v127.l quad_perm:[3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+
+v_rndne_f16 v255.l, vcc_hi
+// GFX12: v_rndne_f16_e64 v255.l, vcc_hi          ; encoding: [0xff,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, vcc_lo
+// GFX12: v_rndne_f16_e64 v255.l, vcc_lo          ; encoding: [0xff,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+
+v_rndne_f16 v5.h, v199.h
+// GFX12: v_rndne_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xde,0xd5,0xc7,0x01,0x00,0x00]
+
+v_rndne_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xde,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_rndne_f16 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xde,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+
+v_rndne_f16 v5.l, v199.l
+// GFX12: v_rndne_f16_e64 v5.l, v199.l            ; encoding: [0x05,0x00,0xde,0xd5,0xc7,0x01,0x00,0x00]
+
+v_rndne_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_rndne_f16 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
 v_rsq_f16 v128, 0xfe0b
 // GFX12: v_rsq_f16_e64 v128, 0xfe0b              ; encoding: [0x80,0x00,0xd6,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s
index 0ba9874b1a22e..4824241735140 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s
@@ -3127,50 +3127,59 @@ v_rcp_iflag_f32_e64 v5, src_scc mul:4
 v_rcp_iflag_f32_e64 v255, -|0xaf123456| clamp div:2
 // GFX12: v_rcp_iflag_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xab,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf]
 
-v_rndne_f16_e64 v5, v1
-// GFX12: v_rndne_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+v_rndne_f16_e64 v5.l, v1.l
+// GFX12: v_rndne_f16_e64 v5.l, v1.l              ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
 
-v_rndne_f16_e64 v5, v255
-// GFX12: v_rndne_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
+v_rndne_f16_e64 v5.l, v255.l
+// GFX12: v_rndne_f16_e64 v5.l, v255.l            ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
 
-v_rndne_f16_e64 v5, s1
-// GFX12: v_rndne_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, s1
+// GFX12: v_rndne_f16_e64 v5.l, s1                ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, s105
-// GFX12: v_rndne_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, s105
+// GFX12: v_rndne_f16_e64 v5.l, s105              ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, vcc_lo
-// GFX12: v_rndne_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, vcc_lo
+// GFX12: v_rndne_f16_e64 v5.l, vcc_lo            ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, vcc_hi
-// GFX12: v_rndne_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, vcc_hi
+// GFX12: v_rndne_f16_e64 v5.l, vcc_hi            ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, ttmp15
-// GFX12: v_rndne_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, ttmp15
+// GFX12: v_rndne_f16_e64 v5.l, ttmp15            ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, m0
-// GFX12: v_rndne_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, m0
+// GFX12: v_rndne_f16_e64 v5.l, m0                ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, exec_lo
-// GFX12: v_rndne_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, exec_lo
+// GFX12: v_rndne_f16_e64 v5.l, exec_lo           ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, exec_hi
-// GFX12: v_rndne_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, exec_hi
+// GFX12: v_rndne_f16_e64 v5.l, exec_hi           ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, null
-// GFX12: v_rndne_f16_e64 v5, null                ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, null
+// GFX12: v_rndne_f16_e64 v5.l, null              ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, -1
-// GFX12: v_rndne_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, -1
+// GFX12: v_rndne_f16_e64 v5.l, -1                ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, 0.5 mul:2
-// GFX12: v_rndne_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
+v_rndne_f16_e64 v5.l, 0.5 mul:2
+// GFX12: v_rndne_f16_e64 v5.l, 0.5 mul:2         ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
 
-v_rndne_f16_e64 v5, src_scc mul:4
-// GFX12: v_rndne_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
+v_rndne_f16_e64 v5.l, src_scc mul:4
+// GFX12: v_rndne_f16_e64 v5.l, src_scc mul:4     ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
 
-v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2
-// GFX12: v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+v_rndne_f16_e64 v255.l, -|0xfe0b| clamp div:2
+// GFX12: v_rndne_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+v_rndne_f16_e64 v5.h, v1.h
+// GFX12: v_rndne_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xde,0xd5,0x01,0x01,0x00,0x00]
+
+v_rndne_f16_e64 v5.l, v255.h
+// GFX12: v_rndne_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xde,0xd5,0xff,0x01,0x00,0x00]
+
+v_rndne_f16_e64 v255.h, -|0xfe0b| clamp div:2
+// GFX12: v_rndne_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 v_rndne_f32_e64 v5, v1
 // GFX12: v_rndne_f32_e64 v5, v1                  ; encoding: [0x05,0x00,0xa3,0xd5,0x01,0x01,0x00,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s
index 197f02719905d..c09471033d144 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s
@@ -2377,47 +2377,56 @@ v_rcp_iflag_f32_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 boun
 v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xab,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
-v_rndne_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_rndne_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_rndne_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_mirror
-// GFX12: v_rndne_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_mirror
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_half_mirror
-// GFX12: v_rndne_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_half_mirror
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_shl:1
-// GFX12: v_rndne_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_shl:1
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_shl:15
-// GFX12: v_rndne_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_shl:15
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_shr:1
-// GFX12: v_rndne_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_shr:1
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_shr:15
-// GFX12: v_rndne_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_shr:15
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_ror:1
-// GFX12: v_rndne_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_ror:1
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_ror:15
-// GFX12: v_rndne_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_ror:15
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_rndne_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+v_rndne_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
-v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+v_rndne_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
 
-v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+
+v_rndne_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_rndne_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+v_rndne_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x08,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+
+v_rndne_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_rndne_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc1,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
 v_rndne_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_rndne_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa3,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s
index 0dfc47b4e4020..be3878878b13d 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s
@@ -712,17 +712,26 @@ v_rcp_iflag_f32_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xab,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
-v_rndne_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_rndne_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_rndne_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+v_rndne_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
-v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xde,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+v_rndne_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xde,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
-v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xde,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xde,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+v_rndne_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+v_rndne_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x08,0xde,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+v_rndne_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_rndne_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc1,0xde,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 v_rndne_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_rndne_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa3,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt
index 8d86bafca059f..a3886e6b3a68d 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt
@@ -3090,49 +3090,82 @@
 # GFX11: v_readfirstlane_b32 null, v255          ; encoding: [0xff,0x05,0xf8,0x7e]
 
 0x01,0xbd,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, v1                  ; encoding: [0x01,0xbd,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, v1.l              ; encoding: [0x01,0xbd,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, v1                  ; encoding: [0x01,0xbd,0x0a,0x7e]
 
 0x7f,0xbd,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, v127                ; encoding: [0x7f,0xbd,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, v127.l            ; encoding: [0x7f,0xbd,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, v127                ; encoding: [0x7f,0xbd,0x0a,0x7e]
 
 0x01,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, s1                  ; encoding: [0x01,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, s1                ; encoding: [0x01,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, s1                  ; encoding: [0x01,0xbc,0x0a,0x7e]
 
 0x69,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, s105                ; encoding: [0x69,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, s105              ; encoding: [0x69,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, s105                ; encoding: [0x69,0xbc,0x0a,0x7e]
 
 0x6a,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, vcc_lo              ; encoding: [0x6a,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, vcc_lo            ; encoding: [0x6a,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, vcc_lo              ; encoding: [0x6a,0xbc,0x0a,0x7e]
 
 0x6b,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, vcc_hi              ; encoding: [0x6b,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, vcc_hi            ; encoding: [0x6b,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, vcc_hi              ; encoding: [0x6b,0xbc,0x0a,0x7e]
 
 0x7b,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, ttmp15              ; encoding: [0x7b,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, ttmp15            ; encoding: [0x7b,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, ttmp15              ; encoding: [0x7b,0xbc,0x0a,0x7e]
 
 0x7d,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, m0                  ; encoding: [0x7d,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, m0                ; encoding: [0x7d,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, m0                  ; encoding: [0x7d,0xbc,0x0a,0x7e]
 
 0x7e,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, exec_lo             ; encoding: [0x7e,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, exec_lo           ; encoding: [0x7e,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, exec_lo             ; encoding: [0x7e,0xbc,0x0a,0x7e]
 
 0x7f,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, exec_hi             ; encoding: [0x7f,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, exec_hi           ; encoding: [0x7f,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, exec_hi             ; encoding: [0x7f,0xbc,0x0a,0x7e]
 
 0x7c,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, null                ; encoding: [0x7c,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, null              ; encoding: [0x7c,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, null                ; encoding: [0x7c,0xbc,0x0a,0x7e]
 
 0xc1,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, -1                  ; encoding: [0xc1,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, -1                ; encoding: [0xc1,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, -1                  ; encoding: [0xc1,0xbc,0x0a,0x7e]
 
 0xf0,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, 0.5                 ; encoding: [0xf0,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, 0.5               ; encoding: [0xf0,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, 0.5                 ; encoding: [0xf0,0xbc,0x0a,0x7e]
 
 0xfd,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, src_scc             ; encoding: [0xfd,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, src_scc           ; encoding: [0xfd,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, src_scc             ; encoding: [0xfd,0xbc,0x0a,0x7e]
 
 0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-# GFX11: v_rndne_f16_e32 v127, 0xfe0b            ; encoding: [0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e32 v127.l, 0xfe0b          ; encoding: [0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e32 v127, 0xfe0b            ; encoding: [0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+0x81,0xbd,0x0a,0x7e
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, v1.h              ; encoding: [0x81,0xbd,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0x81,0xbd,0x0a,0x7e]
+
+0xff,0xbd,0x0a,0x7e
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, v127.h            ; encoding: [0xff,0xbd,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0xff,0xbd,0x0a,0x7e]
+
+0xf0,0xbc,0xfe,0x7e
+# GFX11-REAL16: v_rndne_f16_e32 v127.l, 0.5             ; encoding: [0xf0,0xbc,0xfe,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v127, 0.5               ; encoding: [0xf0,0xbc,0xfe,0x7e]
+
+0xfd,0xbc,0x0a,0x7f
+# GFX11-REAL16: v_rndne_f16_e32 v5.h, src_scc           ; encoding: [0xfd,0xbc,0x0a,0x7f]
+
+0xff,0xbc,0xfe,0x7f,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_rndne_f16_e32 v127.h, 0xfe0b          ; encoding: [0xff,0xbc,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 0x01,0x47,0x0a,0x7e
 # GFX11: v_rndne_f32_e32 v5, v1                  ; encoding: [0x01,0x47,0x0a,0x7e]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt
index f01ce5a31be3c..9f857cd05696c 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt
@@ -2473,46 +2473,72 @@
 # GFX11: v_rcp_iflag_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x56,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX11: v_rndne_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX11: v_rndne_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x3d,0x30
-# GFX11: v_rndne_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX11-REAL16: v_rndne_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX11-FAKE16: v_rndne_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+
+0xfa,0xbc,0xfe,0x7e,0x7f,0x5f,0x01,0x01
+# GFX11-REAL16: v_rndne_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_rndne_f16_dpp v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+
+0xfa,0xbc,0x0a,0x7f,0x81,0x60,0x01,0x13
+# GFX11-REAL16: v_rndne_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbc,0x0a,0x7f,0x81,0x60,0x01,0x13]
+# GFX11-FAKE16: v_mul_i32_i24_e32 v128, 1, v176         ; encoding: [0x81,0x60,0x01,0x13]
+
+0xfa,0xbc,0xfe,0x7f,0xff,0x6f,0x3d,0x30
+# GFX11-REAL16: v_rndne_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
+# GFX11-FAKE16: v_lshlrev_b32_e32 v30, v255, v183       ; encoding: [0xff,0x6f,0x3d,0x30]
 
 0xfa,0x46,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX11: v_rndne_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt
index 0f102c52a3666..c45033916cd05 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt
@@ -509,10 +509,23 @@
 # GFX11: v_rcp_iflag_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x56,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX11: v_rndne_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00
-# GFX11: v_rndne_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+0xe9,0xbc,0xfe,0x7e,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_rndne_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_rndne_f16_dpp v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+
+0xe9,0xbc,0x0a,0x7f,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_rndne_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7f,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_dot2acc_f32_f16 v156, v129, v187      ; encoding: [0x81,0x77,0x39,0x05]
+
+0xea,0xbc,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_rndne_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbc,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0x46,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX11: v_rndne_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x46,0x0a,0x7e,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt
index 6e5239522df41..4f12775fb3796 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt
@@ -2615,46 +2615,72 @@
 # GFX11: v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xab,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX11: v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
-# GFX11: v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
 
 0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
-# GFX11: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+
+0x05,0x48,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+0x05,0x08,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+
+0xff,0xc1,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_rndne_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xa3,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX11: v_rndne_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa3,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt
index d7f9e86d3ca00..638daca3fdd4f 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt
@@ -731,16 +731,32 @@
 # GFX11: v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xab,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_rndne_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX11: v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX11: v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
 0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+0x05,0x48,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+0x05,0x08,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+0xff,0xc1,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_rndne_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xa3,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX11: v_rndne_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa3,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt
index db5ba967d709b..1b7677b8c088c 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt
@@ -3051,49 +3051,76 @@
 # GFX11: v_rcp_iflag_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xab,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, v1.l              ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, v255.l            ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, s1                ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, s105              ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, vcc_lo            ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, vcc_hi            ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, ttmp15            ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, m0                ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, exec_lo           ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, exec_hi           ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, null                ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, null              ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, null                ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, -1                ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08
-# GFX11: v_rndne_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, 0.5 mul:2         ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
 
 0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10
-# GFX11: v_rndne_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, src_scc mul:4     ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
 
 0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
-# GFX11: v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+0x05,0x48,0xde,0xd5,0x01,0x01,0x00,0x00
+# GFX11-REAL16: v_rndne_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xde,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+
+0x05,0x08,0xde,0xd5,0xff,0x01,0x00,0x00
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xde,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
+
+0xff,0xc1,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_rndne_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0xa3,0xd5,0x01,0x01,0x00,0x00
 # GFX11: v_rndne_f32_e64 v5, v1                  ; encoding: [0x05,0x00,0xa3,0xd5,0x01,0x01,0x00,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt
index d37c9229f1666..1635fdab66d86 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt
@@ -2581,46 +2581,68 @@
 # GFX12: v_rcp_iflag_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x56,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX12: v_rndne_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX12: v_rndne_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x3d,0x30
-# GFX12: v_rndne_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX12-REAL16: v_rndne_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX12-FAKE16: v_rndne_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+
+0xfa,0xbc,0x0a,0x7f,0x81,0x60,0x01,0x13
+# GFX12-REAL16: v_rndne_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbc,0x0a,0x7f,0x81,0x60,0x01,0x13]
+# GFX12-FAKE16: v_mul_i32_i24_e32 v128, 1, v176         ; encoding: [0x81,0x60,0x01,0x13]
+
+0xfa,0xbc,0xfe,0x7f,0xff,0x6f,0x3d,0x30
+# GFX12-REAL16: v_rndne_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
+# GFX12-FAKE16: v_lshlrev_b32_e32 v30, v255, v183       ; encoding: [0xff,0x6f,0x3d,0x30]
 
 0xfa,0x46,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX12: v_rndne_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt
index f3dce5e6d5b93..c1fa6aa634f49 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt
@@ -497,10 +497,19 @@
 # GFX12: v_rcp_iflag_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x56,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX12: v_rndne_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00
-# GFX12: v_rndne_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+0xe9,0xbc,0x0a,0x7f,0x81,0x77,0x39,0x05
+# GFX12-REAL16: v_rndne_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7f,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_add_f64_e32 v[156:157], v[129:130], v[187:188] ; encoding: [0x81,0x77,0x39,0x05]
+
+0xea,0xbc,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_rndne_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbc,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0x46,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX12: v_rndne_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x46,0x0a,0x7e,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt
index e4b619d87e400..43c18a7836687 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt
@@ -3097,49 +3097,76 @@
 # GFX12: v_rcp_iflag_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xab,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, v1.l              ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, v255.l            ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, s1                ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, s105              ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, vcc_lo            ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, vcc_hi            ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, ttmp15            ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, m0                ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, exec_lo           ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, exec_hi           ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, null                ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, null              ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, null                ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, -1                ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08
-# GFX12: v_rndne_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, 0.5 mul:2         ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
 
 0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10
-# GFX12: v_rndne_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, src_scc mul:4     ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
 
 0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
-# GFX12: v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+0x05,0x48,0xde,0xd5,0x01,0x01,0x00,0x00
+# GFX12-REAL16: v_rndne_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xde,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+
+0x05,0x08,0xde,0xd5,0xff,0x01,0x00,0x00
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xde,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
+
+0xff,0xc1,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
+# GFX12-REAL16: v_rndne_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0xa3,0xd5,0x01,0x01,0x00,0x00
 # GFX12: v_rndne_f32_e64 v5, v1                  ; encoding: [0x05,0x00,0xa3,0xd5,0x01,0x01,0x00,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt
index b77cf5ab6efc1..cc344f329c2d2 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt
@@ -2471,46 +2471,72 @@
 # GFX12: v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xab,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX12: v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
-# GFX12: v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
 
 0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
-# GFX12: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+
+0x05,0x48,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+0x05,0x08,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+
+0xff,0xc1,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
+# GFX12-REAL16: v_rndne_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xa3,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX12: v_rndne_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa3,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt
index 50339f51c5629..428349fec54fa 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt
@@ -707,16 +707,32 @@
 # GFX12: v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xab,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_rndne_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX12: v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX12: v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
 0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+0x05,0x48,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+0x05,0x08,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+0xff,0xc1,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_rndne_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xa3,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX12: v_rndne_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa3,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]

From 78f04477d658ae3de3cd416947c5bac65262a9ec Mon Sep 17 00:00:00 2001
From: Maksim Levental 
Date: Fri, 3 Jan 2025 13:52:07 -0800
Subject: [PATCH 405/567] [mlir][python] declare `_PyClassMethod_New` undefined
 at link time (#121597)

`NanobindAdaptors.h` uses `PyClassMethod_New` to build `pure_subclass`es
but nanobind doesn't declare this API as undefined in its linker flags.
So we need to declare it as such for downstream users that do not do
something like `-undefined dynamic_lookup`
---
 mlir/cmake/modules/AddMLIRPython.cmake | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/mlir/cmake/modules/AddMLIRPython.cmake b/mlir/cmake/modules/AddMLIRPython.cmake
index 9d4e06c7909c8..717a503468a85 100644
--- a/mlir/cmake/modules/AddMLIRPython.cmake
+++ b/mlir/cmake/modules/AddMLIRPython.cmake
@@ -683,6 +683,13 @@ function(add_mlir_python_extension libname extname)
           ${eh_rtti_enable}
       )
     endif()
+    
+    if(APPLE)
+      # NanobindAdaptors.h uses PyClassMethod_New to build `pure_subclass`es but nanobind
+      # doesn't declare this API as undefined in its linker flags. So we need to declare it as such
+      # for downstream users that do not do something like `-undefined dynamic_lookup`.
+      set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -Wl,-U -Wl,_PyClassMethod_New")
+    endif()
   endif()
 
   target_compile_options(${libname} PRIVATE ${eh_rtti_enable})

From 9165848c8285884938583f5c3a35c97ec03ee486 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
 =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
 =?UTF-8?q?=E3=83=B3=29?= 
Date: Fri, 3 Jan 2025 14:37:14 -0800
Subject: [PATCH 406/567] [flang][cuda] Sync global descriptor when nullifying
 pointer (#121595)

---
 .../flang/Optimizer/Builder/CUFCommon.h       |  6 ++++++
 flang/lib/Lower/Allocatable.cpp               | 19 ++-----------------
 flang/lib/Lower/Bridge.cpp                    |  2 ++
 flang/lib/Optimizer/Builder/CUFCommon.cpp     | 17 +++++++++++++++++
 flang/test/Lower/CUDA/cuda-pointer-sync.cuf   |  6 +++++-
 5 files changed, 32 insertions(+), 18 deletions(-)

diff --git a/flang/include/flang/Optimizer/Builder/CUFCommon.h b/flang/include/flang/Optimizer/Builder/CUFCommon.h
index df1b709dc8608..b99e330429622 100644
--- a/flang/include/flang/Optimizer/Builder/CUFCommon.h
+++ b/flang/include/flang/Optimizer/Builder/CUFCommon.h
@@ -15,6 +15,10 @@
 
 static constexpr llvm::StringRef cudaDeviceModuleName = "cuda_device_mod";
 
+namespace fir {
+class FirOpBuilder;
+} // namespace fir
+
 namespace cuf {
 
 /// Retrieve or create the CUDA Fortran GPU module in the given \p mod.
@@ -24,6 +28,8 @@ mlir::gpu::GPUModuleOp getOrCreateGPUModule(mlir::ModuleOp mod,
 bool isInCUDADeviceContext(mlir::Operation *op);
 bool isRegisteredDeviceGlobal(fir::GlobalOp op);
 
+void genPointerSync(const mlir::Value box, fir::FirOpBuilder &builder);
+
 } // namespace cuf
 
 #endif // FORTRAN_OPTIMIZER_TRANSFORMS_CUFCOMMON_H_
diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp
index 4c64870675816..5c63c79892f42 100644
--- a/flang/lib/Lower/Allocatable.cpp
+++ b/flang/lib/Lower/Allocatable.cpp
@@ -1088,22 +1088,6 @@ bool Fortran::lower::isArraySectionWithoutVectorSubscript(
          !Fortran::evaluate::HasVectorSubscript(expr);
 }
 
-static void genCUFPointerSync(const mlir::Value box,
-                              fir::FirOpBuilder &builder) {
-  if (auto declareOp = box.getDefiningOp<hlfir::DeclareOp>()) {
-    if (auto addrOfOp = declareOp.getMemref().getDefiningOp<fir::AddrOfOp>()) {
-      auto mod = addrOfOp->getParentOfType<mlir::ModuleOp>();
-      if (auto globalOp =
-              mod.lookupSymbol<fir::GlobalOp>(addrOfOp.getSymbol())) {
-        if (cuf::isRegisteredDeviceGlobal(globalOp)) {
-          builder.create<cuf::SyncDescriptorOp>(box.getLoc(),
-                                                addrOfOp.getSymbol());
-        }
-      }
-    }
-  }
-}
-
 void Fortran::lower::associateMutableBox(
     Fortran::lower::AbstractConverter &converter, mlir::Location loc,
     const fir::MutableBoxValue &box, const Fortran::lower::SomeExpr &source,
@@ -1111,12 +1095,13 @@ void Fortran::lower::associateMutableBox(
   fir::FirOpBuilder &builder = converter.getFirOpBuilder();
   if (Fortran::evaluate::UnwrapExpr(source)) {
     fir::factory::disassociateMutableBox(builder, loc, box);
+    cuf::genPointerSync(box.getAddr(), builder);
     return;
   }
   if (converter.getLoweringOptions().getLowerToHighLevelFIR()) {
     fir::ExtendedValue rhs = converter.genExprAddr(loc, source, stmtCtx);
     fir::factory::associateMutableBox(builder, loc, box, rhs, lbounds);
-    genCUFPointerSync(box.getAddr(), builder);
+    cuf::genPointerSync(box.getAddr(), builder);
     return;
   }
   // The right hand side is not be evaluated into a temp. Array sections can
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index c7e2635230e98..c7bf424815548 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -34,6 +34,7 @@
 #include "flang/Lower/StatementContext.h"
 #include "flang/Lower/Support/Utils.h"
 #include "flang/Optimizer/Builder/BoxValue.h"
+#include "flang/Optimizer/Builder/CUFCommon.h"
 #include "flang/Optimizer/Builder/Character.h"
 #include "flang/Optimizer/Builder/FIRBuilder.h"
 #include "flang/Optimizer/Builder/Runtime/Assign.h"
@@ -3952,6 +3953,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
       } else {
         fir::MutableBoxValue box = genExprMutableBox(loc, *expr);
         fir::factory::disassociateMutableBox(*builder, loc, box);
+        cuf::genPointerSync(box.getAddr(), *builder);
       }
     }
   }
diff --git a/flang/lib/Optimizer/Builder/CUFCommon.cpp b/flang/lib/Optimizer/Builder/CUFCommon.cpp
index 81a8a90ce394e..39848205f47af 100644
--- a/flang/lib/Optimizer/Builder/CUFCommon.cpp
+++ b/flang/lib/Optimizer/Builder/CUFCommon.cpp
@@ -7,7 +7,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Optimizer/Builder/CUFCommon.h"
+#include "flang/Optimizer/Builder/FIRBuilder.h"
 #include "flang/Optimizer/Dialect/CUF/CUFOps.h"
+#include "flang/Optimizer/HLFIR/HLFIROps.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/LLVMIR/NVVMDialect.h"
 
@@ -54,3 +56,18 @@ bool cuf::isRegisteredDeviceGlobal(fir::GlobalOp op) {
     return true;
   return false;
 }
+
+void cuf::genPointerSync(const mlir::Value box, fir::FirOpBuilder &builder) {
+  if (auto declareOp = box.getDefiningOp<hlfir::DeclareOp>()) {
+    if (auto addrOfOp = declareOp.getMemref().getDefiningOp<fir::AddrOfOp>()) {
+      auto mod = addrOfOp->getParentOfType<mlir::ModuleOp>();
+      if (auto globalOp =
+              mod.lookupSymbol<fir::GlobalOp>(addrOfOp.getSymbol())) {
+        if (cuf::isRegisteredDeviceGlobal(globalOp)) {
+          builder.create<cuf::SyncDescriptorOp>(box.getLoc(),
+                                                addrOfOp.getSymbol());
+        }
+      }
+    }
+  }
+}
diff --git a/flang/test/Lower/CUDA/cuda-pointer-sync.cuf b/flang/test/Lower/CUDA/cuda-pointer-sync.cuf
index e17869b2d6357..4c64f4bd34aa0 100644
--- a/flang/test/Lower/CUDA/cuda-pointer-sync.cuf
+++ b/flang/test/Lower/CUDA/cuda-pointer-sync.cuf
@@ -8,10 +8,14 @@ use devptr
 real, device, target, dimension(4) :: a_dev
 a_dev = 42.0
 dev_ptr => a_dev
+
+dev_ptr => null()
+
+nullify(dev_ptr)
 end
 
 ! CHECK: fir.global @_QMdevptrEdev_ptr {data_attr = #cuf.cuda<device>} : !fir.box<!fir.ptr<!fir.array<4xf32>>>
 ! CHECK-LABEL: func.func @_QQmain()
 ! CHECK: fir.embox
 ! CHECK: fir.store
-! CHECK: cuf.sync_descriptor @_QMdevptrEdev_ptr
+! CHECK-COUNT-3: cuf.sync_descriptor @_QMdevptrEdev_ptr

From 1b5deaeb2ad0a7ea643f24899e4aad9461d3d426 Mon Sep 17 00:00:00 2001
From: Tom Stellard 
Date: Fri, 3 Jan 2025 15:02:16 -0800
Subject: [PATCH 407/567] workflows/build-ci-container: Make sure to only test
 local containers (#120827)

The container test is run before we create the :latest tag, so we should
not try testing this, otherwise it will pull the :latest tag from the
github registry, and won't test the container we just built.
---
 .github/workflows/build-ci-container.yml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build-ci-container.yml b/.github/workflows/build-ci-container.yml
index 50729e0173506..4fa0713b381ce 100644
--- a/.github/workflows/build-ci-container.yml
+++ b/.github/workflows/build-ci-container.yml
@@ -59,8 +59,9 @@ jobs:
 
       - name: Test Container
         run: |
-          for image in ${{ steps.vars.outputs.container-name-tag }} ${{  steps.vars.outputs.container-name }}; do
-            podman run --rm -it $image /usr/bin/bash -x -c 'cd $HOME && printf '\''#include <iostream>\nint main(int argc, char **argv) { std::cout << "Hello\\n"; }'\'' | clang++ -x c++ - && ./a.out | grep Hello'
+          for image in ${{ steps.vars.outputs.container-name-tag }}; do
+            # Use --pull=never to ensure we are testing the just built image.
+            podman run --pull=never --rm -it $image /usr/bin/bash -x -c 'cd $HOME && printf '\''#include <iostream>\nint main(int argc, char **argv) { std::cout << "Hello\\n"; }'\'' | clang++ -x c++ - && ./a.out | grep Hello'
           done
 
   push-ci-container:

From 06cf4f970446ce3c4be0a7104115b82c2fae6448 Mon Sep 17 00:00:00 2001
From: Tom Stellard 
Date: Fri, 3 Jan 2025 15:06:35 -0800
Subject: [PATCH 408/567] workflows/new-issues: Use an llvmbot token to add
 labels (#120840)

There is a separate job that mentions teams based on the label added,
and this job won't run if we use the default github token.
---
 .github/workflows/new-issues.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/new-issues.yml b/.github/workflows/new-issues.yml
index ed15fdb9fba6e..3cac57e268513 100644
--- a/.github/workflows/new-issues.yml
+++ b/.github/workflows/new-issues.yml
@@ -15,7 +15,7 @@ jobs:
     steps:
       - uses: llvm/actions/issue-labeler@main
         with:
-          repo-token: ${{ secrets.GITHUB_TOKEN }}
+          repo-token: ${{ secrets.ISSUE_SUBSCRIBER_TOKEN }}
           configuration-path: .github/new-issues-labeler.yml
           include-title: 1
           include-body: 0

From dfa4312c9b092c23b9b2ec366a8851be729953c4 Mon Sep 17 00:00:00 2001
From: Tom Stellard 
Date: Fri, 3 Jan 2025 15:08:40 -0800
Subject: [PATCH 409/567] workflows/release-binaries: Replace some workflow
 interpolations with env vars (#120860)

This is recommended by the GitHub Actions security hardening guide:
https://docs.github.com/en/actions/security-for-github-actions/security-guides/security-hardening-for-github-actions#using-an-intermediate-environment-variable
---
 .github/workflows/release-binaries.yml | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml
index 1cde628d3f66c..fc5431c96bbf0 100644
--- a/.github/workflows/release-binaries.yml
+++ b/.github/workflows/release-binaries.yml
@@ -83,7 +83,7 @@ jobs:
         USER_TOKEN: ${{ secrets.RELEASE_TASKS_USER_TOKEN }}
       shell: bash
       run: |
-        ./llvm/utils/release/./github-upload-release.py --token "$GITHUB_TOKEN" --user ${{ github.actor }} --user-token "$USER_TOKEN" check-permissions
+        ./llvm/utils/release/./github-upload-release.py --token "$GITHUB_TOKEN" --user "$GITHUB_ACTOR" --user-token "$USER_TOKEN" check-permissions
 
     - name: Collect Variables
       id: vars
@@ -102,8 +102,8 @@ jobs:
           release_version="$trimmed"
           ref="llvmorg-$release_version"
         else
-          release_version="${{ (github.event_name == 'pull_request' && format('PR{0}', github.event.pull_request.number)) || 'CI'}}-${{ github.sha }}"
-          ref=${{ github.sha }}
+          release_version="${{ (github.event_name == 'pull_request' && format('PR{0}', github.event.pull_request.number)) || 'CI'}}-$GITHUB_SHA"
+          ref="$GITHUB_SHA"
         fi
         if [ -n "${{ inputs.upload }}" ]; then
           upload="${{ inputs.upload }}"
@@ -114,20 +114,20 @@ jobs:
         echo "ref=$ref" >> $GITHUB_OUTPUT
         echo "upload=$upload" >> $GITHUB_OUTPUT
 
-        release_binary_basename="LLVM-$release_version-${{ runner.os }}-${{ runner.arch }}"
+        release_binary_basename="LLVM-$release_version-$RUNNER_OS-$RUNNER_ARCH"
         echo "release-binary-basename=$release_binary_basename" >> $GITHUB_OUTPUT
         echo "release-binary-filename=$release_binary_basename.tar.xz" >> $GITHUB_OUTPUT
 
         # Detect necessary CMake flags
-        target="${{ runner.os }}-${{ runner.arch }}"
+        target="$RUNNER_OS-$RUNNER_ARCH"
         echo "enable-pgo=false" >> $GITHUB_OUTPUT
         target_cmake_flags="-DLLVM_RELEASE_ENABLE_PGO=OFF"
         # The macOS builds try to cross compile some libraries so we need to
         # add extra CMake args to disable them.
         # See https://github.com/llvm/llvm-project/issues/99767
-        if [ "${{ runner.os }}" = "macOS" ]; then
+        if [ "$RUNNER_OS" = "macOS" ]; then
           target_cmake_flags="$target_cmake_flags -DBOOTSTRAP_COMPILER_RT_ENABLE_IOS=OFF"
-          if [ "${{ runner.arch }}" = "ARM64" ]; then
+          if [ "$RUNNER_ARCH" = "ARM64" ]; then
             arches=arm64
           else
             arches=x86_64
@@ -137,7 +137,7 @@ jobs:
 
         build_flang="true"
 
-        if [ "${{ runner.os }}" = "Windows" ]; then
+        if [ "$RUNNER_OS" = "Windows" ]; then
           # The build times out on Windows, so we need to disable LTO.
           target_cmake_flags="$target_cmake_flags -DLLVM_RELEASE_ENABLE_LTO=OFF"
         fi

From ee1adc5aab4fb517314358ce03cfda426da9c4ce Mon Sep 17 00:00:00 2001
From: Adrian Prantl 
Date: Fri, 3 Jan 2025 15:26:40 -0800
Subject: [PATCH 410/567] [lldb] Add a return opcode to the formatter bytecode
 (#121602)

In LLVM we love our early exits and this opcode allows for simpler code
generation.
---
 lldb/docs/resources/formatterbytecode.rst              |  1 +
 lldb/examples/python/formatter_bytecode.py             |  4 ++++
 lldb/source/DataFormatters/FormatterBytecode.cpp       |  3 +++
 lldb/source/DataFormatters/FormatterBytecode.def       |  1 +
 lldb/unittests/DataFormatter/FormatterBytecodeTest.cpp | 10 ++++++++++
 5 files changed, 19 insertions(+)

diff --git a/lldb/docs/resources/formatterbytecode.rst b/lldb/docs/resources/formatterbytecode.rst
index 20e148363ef95..34fb0f7ee924c 100644
--- a/lldb/docs/resources/formatterbytecode.rst
+++ b/lldb/docs/resources/formatterbytecode.rst
@@ -75,6 +75,7 @@ These manipulate the control stack and program counter. Both `if` and `ifelse` e
  0x12      `ifelse`    `(UInt -> )` pop two blocks from the control stack, if
                        the top of the data stack is nonzero, execute the first,
                        otherwise the second.
+ 0x13      `return`    pop the entire control stack and return
 ========  ==========  ============================================================
 
 Literals for basic types
diff --git a/lldb/examples/python/formatter_bytecode.py b/lldb/examples/python/formatter_bytecode.py
index ccd0c68a75483..36a14be283f31 100644
--- a/lldb/examples/python/formatter_bytecode.py
+++ b/lldb/examples/python/formatter_bytecode.py
@@ -35,6 +35,7 @@ def define_opcode(n, mnemonic, name):
 define_opcode(0x10, "{", "begin")
 define_opcode(0x11, "if", "if")
 define_opcode(0x12, "ifelse", "ifelse")
+define_opcode(0x13, "return", "return")
 
 define_opcode(0x20, None, "lit_uint")
 define_opcode(0x21, None, "lit_int")
@@ -342,6 +343,9 @@ def next_byte():
             else:
                 frame.append(control.pop())
                 control.pop()
+        elif b == op_return:
+            control.clear()
+            return data[-1]
 
         # Literals.
         elif b == op_lit_uint:
diff --git a/lldb/source/DataFormatters/FormatterBytecode.cpp b/lldb/source/DataFormatters/FormatterBytecode.cpp
index e49c750678187..7f3dbe0dba37d 100644
--- a/lldb/source/DataFormatters/FormatterBytecode.cpp
+++ b/lldb/source/DataFormatters/FormatterBytecode.cpp
@@ -304,6 +304,9 @@ llvm::Error Interpret(std::vector &control,
       control.pop_back();
       activate_block();
       continue;
+    case op_return:
+      control.clear();
+      return pc.takeError();
 
     // Literals.
     case op_lit_uint:
diff --git a/lldb/source/DataFormatters/FormatterBytecode.def b/lldb/source/DataFormatters/FormatterBytecode.def
index c6645631fa006..29e0bee541c73 100644
--- a/lldb/source/DataFormatters/FormatterBytecode.def
+++ b/lldb/source/DataFormatters/FormatterBytecode.def
@@ -27,6 +27,7 @@ DEFINE_OPCODE(0x06, "rot",  rot)
 DEFINE_OPCODE(0x10, "{", begin)
 DEFINE_OPCODE(0x11, "if", if)
 DEFINE_OPCODE(0x12, "ifelse", ifelse)
+DEFINE_OPCODE(0x13, "return", return)
 
 DEFINE_OPCODE(0x20, nullptr, lit_uint)
 DEFINE_OPCODE(0x21, nullptr, lit_int)
diff --git a/lldb/unittests/DataFormatter/FormatterBytecodeTest.cpp b/lldb/unittests/DataFormatter/FormatterBytecodeTest.cpp
index 7307db650c162..5e980c3e1913c 100644
--- a/lldb/unittests/DataFormatter/FormatterBytecodeTest.cpp
+++ b/lldb/unittests/DataFormatter/FormatterBytecodeTest.cpp
@@ -97,6 +97,16 @@ TEST_F(FormatterBytecodeTest, ControlOps) {
                           data));
     ASSERT_EQ(data.Pop(), 42u);
   }
+  {
+    DataStack data;
+    ASSERT_TRUE(Interpret({op_lit_uint, 1, op_begin, 3, op_lit_uint, 42,
+                           op_return, op_if, op_lit_uint, 23},
+                          data));
+    ASSERT_EQ(data.Pop(), 42u);
+  }
+}
+
+TEST_F(FormatterBytecodeTest, ConversionOps) {
   {
     DataStack data(lldb::ValueObjectSP{});
     ASSERT_TRUE(Interpret({op_is_null}, data));

From b7637a855722b608ce2fb5aa860149db9b881197 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
 =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
 =?UTF-8?q?=E3=83=B3=29?= 
Date: Fri, 3 Jan 2025 15:27:41 -0800
Subject: [PATCH 411/567] [flang][cuda] Set PINNED variable to false in
 ALLOCATE (#121593)

When `PINNED=` is used with variables that don't have the `PINNED`
attribute, the logical value must be set to false when host allocation
is performed.
---
 flang/lib/Lower/Allocatable.cpp            | 33 +++++++++++++++++-----
 flang/test/Lower/CUDA/cuda-allocatable.cuf | 27 ++++++++++++++++++
 2 files changed, 53 insertions(+), 7 deletions(-)

diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp
index 5c63c79892f42..dc135543fafc7 100644
--- a/flang/lib/Lower/Allocatable.cpp
+++ b/flang/lib/Lower/Allocatable.cpp
@@ -454,6 +454,19 @@ class AllocateStmtHelper {
                                                    alloc.getSymbol());
   }
 
+  void setPinnedToFalse() {
+    if (!pinnedExpr)
+      return;
+    Fortran::lower::StatementContext stmtCtx;
+    mlir::Value pinned =
+        fir::getBase(converter.genExprAddr(loc, *pinnedExpr, stmtCtx));
+    mlir::Location loc = pinned.getLoc();
+    mlir::Value falseValue = builder.createBool(loc, false);
+    mlir::Value falseConv = builder.createConvert(
+        loc, fir::unwrapRefType(pinned.getType()), falseValue);
+    builder.create(loc, falseConv, pinned);
+  }
+
   void genSimpleAllocation(const Allocation &alloc,
                            const fir::MutableBoxValue &box) {
     bool isCudaSymbol = Fortran::semantics::HasCUDAAttr(alloc.getSymbol());
@@ -469,6 +482,7 @@ class AllocateStmtHelper {
       // can be validated.
       genInlinedAllocation(alloc, box);
       postAllocationAction(alloc);
+      setPinnedToFalse();
       return;
     }
 
@@ -482,11 +496,13 @@ class AllocateStmtHelper {
     genSetDeferredLengthParameters(alloc, box);
     genAllocateObjectBounds(alloc, box);
     mlir::Value stat;
-    if (!isCudaSymbol)
+    if (!isCudaSymbol) {
       stat = genRuntimeAllocate(builder, loc, box, errorManager);
-    else
+      setPinnedToFalse();
+    } else {
       stat =
           genCudaAllocate(builder, loc, box, errorManager, alloc.getSymbol());
+    }
     fir::factory::syncMutableBoxFromIRBox(builder, loc, box);
     postAllocationAction(alloc);
     errorManager.assignStat(builder, loc, stat);
@@ -616,13 +632,16 @@ class AllocateStmtHelper {
       genSetDeferredLengthParameters(alloc, box);
     genAllocateObjectBounds(alloc, box);
     mlir::Value stat;
-    if (Fortran::semantics::HasCUDAAttr(alloc.getSymbol()))
+    if (Fortran::semantics::HasCUDAAttr(alloc.getSymbol())) {
       stat =
           genCudaAllocate(builder, loc, box, errorManager, alloc.getSymbol());
-    else if (isSource)
-      stat = genRuntimeAllocateSource(builder, loc, box, exv, errorManager);
-    else
-      stat = genRuntimeAllocate(builder, loc, box, errorManager);
+    } else {
+      if (isSource)
+        stat = genRuntimeAllocateSource(builder, loc, box, exv, errorManager);
+      else
+        stat = genRuntimeAllocate(builder, loc, box, errorManager);
+      setPinnedToFalse();
+    }
     fir::factory::syncMutableBoxFromIRBox(builder, loc, box);
     postAllocationAction(alloc);
     errorManager.assignStat(builder, loc, stat);
diff --git a/flang/test/Lower/CUDA/cuda-allocatable.cuf b/flang/test/Lower/CUDA/cuda-allocatable.cuf
index 6479425c58d8b..8b287f859aa76 100644
--- a/flang/test/Lower/CUDA/cuda-allocatable.cuf
+++ b/flang/test/Lower/CUDA/cuda-allocatable.cuf
@@ -196,3 +196,30 @@ end subroutine
 ! CHECK: %[[BOX:.*]] = fir.load %[[A]]#1 : !fir.ref>>>
 ! CHECK: %[[BOXADDR:.*]] = fir.box_addr %[[BOX]] : (!fir.box>>) -> !fir.heap>
 ! CHECK: fir.freemem %[[BOXADDR]] : !fir.heap>
+
+subroutine setpinned()
+  integer, allocatable :: i(:)
+  logical :: plog
+  allocate(i(10), pinned=plog)
+end
+
+! CHECK-LABEL: func.func @_QPsetpinned()  
+! CHECK: %[[PLOG:.*]] = fir.alloca !fir.logical<4> {bindc_name = "plog", uniq_name = "_QFsetpinnedEplog"}
+! CHECK: %[[PLOG_DECL:.*]]:2 = hlfir.declare %[[PLOG]] {uniq_name = "_QFsetpinnedEplog"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>)
+! CHECK: %[[FALSE:.*]] = arith.constant false
+! CHECK: %[[FALSE_CONV:.*]] = fir.convert %[[FALSE]] : (i1) -> !fir.logical<4>
+! CHECK: fir.store %[[FALSE_CONV]] to %[[PLOG_DECL]]#1 : !fir.ref>
+
+subroutine setpinnedpointer()
+  integer, pointer :: i(:)
+  logical :: plog
+  allocate(i(10), pinned=plog)
+end
+
+! CHECK-LABEL: func.func @_QPsetpinnedpointer()
+! CHECK: %[[PLOG:.*]] = fir.alloca !fir.logical<4> {bindc_name = "plog", uniq_name = "_QFsetpinnedpointerEplog"}
+! CHECK: %[[PLOG_DECL:.*]]:2 = hlfir.declare %[[PLOG]] {uniq_name = "_QFsetpinnedpointerEplog"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>)
+! CHECK: fir.call @_FortranAPointerAllocate
+! CHECK: %[[FALSE:.*]] = arith.constant false
+! CHECK: %[[FALSE_CONV:.*]] = fir.convert %[[FALSE]] : (i1) -> !fir.logical<4>
+! CHECK: fir.store %[[FALSE_CONV]] to %[[PLOG_DECL]]#1 : !fir.ref>

From 54246a39e4cb06cec7d4bafb014e3cad73b1e4df Mon Sep 17 00:00:00 2001
From: Craig Topper 
Date: Fri, 3 Jan 2025 16:47:08 -0800
Subject: [PATCH 412/567] [RISCV] Pass VSETVLIInfo by const reference. NFC

---
 llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index 75985832594d4..1fd130d7e040e 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -627,7 +627,7 @@ class VSETVLIInfo {
     return MI;
   }
 
-  void setAVL(VSETVLIInfo Info) {
+  void setAVL(const VSETVLIInfo &Info) {
     assert(Info.isValid());
     if (Info.isUnknown())
       setUnknown();
@@ -1223,7 +1223,8 @@ bool RISCVInsertVSETVLI::needVSETVLI(const DemandedFields &Used,
 // If we don't use LMUL or the SEW/LMUL ratio, then adjust LMUL so that we
 // maintain the SEW/LMUL ratio. This allows us to eliminate VL toggles in more
 // places.
-static VSETVLIInfo adjustIncoming(VSETVLIInfo PrevInfo, VSETVLIInfo NewInfo,
+static VSETVLIInfo adjustIncoming(const VSETVLIInfo &PrevInfo,
+                                  const VSETVLIInfo &NewInfo,
                                   DemandedFields &Demanded) {
   VSETVLIInfo Info = NewInfo;
 

From 82c0f68c041229eb48a7d018f7aa81d576d456a9 Mon Sep 17 00:00:00 2001
From: "Mikhail R. Gadelha" 
Date: Fri, 3 Jan 2025 22:03:15 -0300
Subject: [PATCH 413/567] [libc] Remove assert to fix rv32 buildbot

---
 libc/src/unistd/linux/dup2.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/libc/src/unistd/linux/dup2.cpp b/libc/src/unistd/linux/dup2.cpp
index c7c7c1a8ca786..7ffc151a053c9 100644
--- a/libc/src/unistd/linux/dup2.cpp
+++ b/libc/src/unistd/linux/dup2.cpp
@@ -32,7 +32,6 @@ LLVM_LIBC_FUNCTION(int, dup2, (int oldfd, int newfd)) {
     int ret = LIBC_NAMESPACE::syscall_impl(SYS_fcntl, oldfd, F_GETFD);
 #elif defined(SYS_fcntl64)
     // Same as fcntl but can handle large offsets
-    static_assert(sizeof(off_t) == 8);
     int ret = LIBC_NAMESPACE::syscall_impl(SYS_fcntl64, oldfd, F_GETFD);
 #else
 #error "SYS_fcntl and SYS_fcntl64 syscalls not available."

From e3dafa88a8f651825ac65aad9b273983598279dd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
 =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
 =?UTF-8?q?=E3=83=B3=29?= 
Date: Fri, 3 Jan 2025 17:35:41 -0800
Subject: [PATCH 414/567] [flang][cuda] Allow GOTO, EXIT, CYCLE and SELECT CASE
 in device procedures (#121612)

---
 flang/lib/Semantics/check-cuda.cpp | 23 +++++++++++++
 flang/test/Semantics/cuf09.cuf     | 53 ++++++++++++++++++++++++++++++
 2 files changed, 76 insertions(+)

diff --git a/flang/lib/Semantics/check-cuda.cpp b/flang/lib/Semantics/check-cuda.cpp
index d497ac20e7017..d8a5639227648 100644
--- a/flang/lib/Semantics/check-cuda.cpp
+++ b/flang/lib/Semantics/check-cuda.cpp
@@ -302,6 +302,14 @@ template  class DeviceContextChecker {
             [&](const common::Indirection &x) {
               Check(x.value());
             },
+            [&](const common::Indirection &x) {
+              const auto &caseList{
+                  std::get>(
+                      x.value().t)};
+              for (const parser::CaseConstruct::Case &c : caseList) {
+                Check(std::get(c.t));
+              }
+            },
             [&](const auto &x) {
               if (auto source{parser::GetSource(x)}) {
                 context_.Say(*source,
@@ -347,9 +355,24 @@ template  class DeviceContextChecker {
           hostArray->name());
     }
   }
+  void ErrorInCUFKernel(parser::CharBlock source) {
+    if (IsCUFKernelDo) {
+      context_.Say(
+          source, "Statement may not appear in cuf kernel code"_err_en_US);
+    }
+  }
   void Check(const parser::ActionStmt &stmt, const parser::CharBlock &source) {
     common::visit(
         common::visitors{
+            [&](const common::Indirection &) {
+              ErrorInCUFKernel(source);
+            },
+            [&](const common::Indirection &) {
+              ErrorInCUFKernel(source);
+            },
+            [&](const common::Indirection &) {
+              ErrorInCUFKernel(source);
+            },
             [&](const common::Indirection &) { return; },
             [&](const common::Indirection &) {},
             [&](const common::Indirection &x) {
diff --git a/flang/test/Semantics/cuf09.cuf b/flang/test/Semantics/cuf09.cuf
index 3307e2a862672..06c9070fcbcd0 100644
--- a/flang/test/Semantics/cuf09.cuf
+++ b/flang/test/Semantics/cuf09.cuf
@@ -54,6 +54,59 @@ module m
     print*,threadIdx%x
     stop ! ok
   end subroutine
+
+  attributes(global) subroutine cycletest()
+    integer :: i
+    do i = 1, 10
+      cycle ! ok
+    end do
+  end subroutine
+
+  attributes(global) subroutine gototest()
+    integer :: i
+    goto 10
+    10 print *, "X is negative!" 
+  end subroutine
+
+  attributes(global) subroutine exittest()
+    integer :: i
+    do i = 1, 10
+      if (i == 1) then
+        exit ! ok
+      end if
+    end do
+  end subroutine
+
+  attributes(global) subroutine selectcasetest()
+    integer :: i
+    select case(i)
+    case (1)
+      print*,'main'
+    case default
+      print*, 'default'
+    end select
+  end subroutine
+
+  subroutine host()
+    integer :: i
+    !$cuf kernel do
+    do i = 1, 10
+      !ERROR: Statement may not appear in cuf kernel code
+      cycle
+    end do
+
+    !$cuf kernel do
+    do i = 1, 10
+      if (i == 1) then
+        !ERROR: Statement may not appear in cuf kernel code
+        exit ! ok
+      end if
+
+      !ERROR: Statement may not appear in cuf kernel code
+      goto 10
+      10 print *, "X is negative!"
+    end do
+  end subroutine
 end
 
 program main

From 7c86ab8a18897c434fdb1ee3cd5ff2a71e6aae5a Mon Sep 17 00:00:00 2001
From: Slava Zakharin 
Date: Fri, 3 Jan 2025 18:25:31 -0800
Subject: [PATCH 415/567] [flang] Fixed the missing dependency. (#121370)

My local build with the shared libraries is broken.
I suppose this was introduced by #120374.

`flang/include/flang/Evaluate/constant.h` ends up being included
by `MapInfoFinalization.cpp` via `flang/Lower/DirectivesCommon.h`.
The undefined references are related to `ConstantBase` classes.
---
 flang/lib/Optimizer/OpenMP/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/flang/lib/Optimizer/OpenMP/CMakeLists.txt b/flang/lib/Optimizer/OpenMP/CMakeLists.txt
index 4f23b2b970fa4..026889cca238a 100644
--- a/flang/lib/Optimizer/OpenMP/CMakeLists.txt
+++ b/flang/lib/Optimizer/OpenMP/CMakeLists.txt
@@ -22,6 +22,7 @@ add_flang_library(FlangOpenMPTransforms
   FIRDialectSupport
   FIRSupport
   FortranCommon
+  FortranEvaluate
   MLIRFuncDialect
   MLIROpenMPDialect
   HLFIRDialect

From a2b9058c392995660956e56c2ac8695a44dc2e4e Mon Sep 17 00:00:00 2001
From: Craig Topper 
Date: Fri, 3 Jan 2025 19:04:13 -0800
Subject: [PATCH 416/567] [RISCV] Reduce size of CSR lookup tables. NFC
 (#121606)

Instead of storing 3 different names in each row of the table, use a
separate row for each name and use a flag to indicate what type of name
it is. The AltName and DeprecatedName weren't used often enough to
justify storing them as a possibility for every register.

This reduces the .rodata size by 27k and reduces the number of dynamic
relocations since we now only need 1 lookup by name function. The lookup
by name function each contained a ~400 entry table of const char*
pointing to constant strings. Each of those requires a dynamic
relocation.

I also capitalized IsRV32Only in the C++ code to match coding
standards.
---
 .../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 25 ++++++++-----
 .../Target/RISCV/MCTargetDesc/RISCVBaseInfo.h |  8 ++--
 .../RISCV/MCTargetDesc/RISCVInstPrinter.cpp   |  2 +
 llvm/lib/Target/RISCV/RISCVSystemOperands.td  | 37 +++++++------------
 4 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 4c1fd5aa41e2b..2205c67c2d21b 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -1915,6 +1915,8 @@ ParseStatus RISCVAsmParser::parseCSRSystemRegister(OperandVector &Operands) {
         // Accept an immediate representing a named Sys Reg if it satisfies the
         // the required features.
         for (auto &Reg : Range) {
+          if (Reg.IsAltName || Reg.IsDeprecatedName)
+            continue;
           if (Reg.haveRequiredFeatures(STI->getFeatureBits()))
             return RISCVOperand::createSysReg(Reg.Name, S, Imm);
         }
@@ -1952,22 +1954,27 @@ ParseStatus RISCVAsmParser::parseCSRSystemRegister(OperandVector &Operands) {
       return ParseStatus::Failure;
 
     const auto *SysReg = RISCVSysReg::lookupSysRegByName(Identifier);
-    if (!SysReg)
-      SysReg = RISCVSysReg::lookupSysRegByAltName(Identifier);
-    if (!SysReg)
-      if ((SysReg = RISCVSysReg::lookupSysRegByDeprecatedName(Identifier)))
-        Warning(S, "'" + Identifier + "' is a deprecated alias for '" +
-                       SysReg->Name + "'");
-
-    // Accept a named Sys Reg if the required features are present.
+
     if (SysReg) {
+      if (SysReg->IsDeprecatedName) {
+        // Lookup the undeprecated name.
+        auto Range = RISCVSysReg::lookupSysRegByEncoding(SysReg->Encoding);
+        for (auto &Reg : Range) {
+          if (Reg.IsAltName || Reg.IsDeprecatedName)
+            continue;
+          Warning(S, "'" + Identifier + "' is a deprecated alias for '" +
+                         Reg.Name + "'");
+        }
+      }
+
+      // Accept a named Sys Reg if the required features are present.
       const auto &FeatureBits = getSTI().getFeatureBits();
       if (!SysReg->haveRequiredFeatures(FeatureBits)) {
         const auto *Feature = llvm::find_if(RISCVFeatureKV, [&](auto Feature) {
           return SysReg->FeaturesRequired[Feature.Value];
         });
         auto ErrorMsg = std::string("system register '") + SysReg->Name + "' ";
-        if (SysReg->isRV32Only && FeatureBits[RISCV::Feature64Bit]) {
+        if (SysReg->IsRV32Only && FeatureBits[RISCV::Feature64Bit]) {
           ErrorMsg += "is RV32 only";
           if (Feature != std::end(RISCVFeatureKV))
             ErrorMsg += " and ";
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
index 7fb5fc7a83130..1c1a8b8009d2c 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
@@ -454,8 +454,6 @@ int getLoadFPImm(APFloat FPImm);
 namespace RISCVSysReg {
 struct SysReg {
   const char Name[32];
-  const char AltName[32];
-  const char DeprecatedName[32];
   unsigned Encoding;
   // FIXME: add these additional fields when needed.
   // Privilege Access: Read, Write, Read-Only.
@@ -467,11 +465,13 @@ struct SysReg {
   // Register number without the privilege bits.
   // unsigned Number;
   FeatureBitset FeaturesRequired;
-  bool isRV32Only;
+  bool IsRV32Only;
+  bool IsAltName;
+  bool IsDeprecatedName;
 
   bool haveRequiredFeatures(const FeatureBitset &ActiveFeatures) const {
     // Not in 32-bit mode.
-    if (isRV32Only && ActiveFeatures[RISCV::Feature64Bit])
+    if (IsRV32Only && ActiveFeatures[RISCV::Feature64Bit])
       return false;
     // No required feature associated with the system register.
     if (FeaturesRequired.none())
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
index d36c0d7238cdc..d5254719b3839 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
@@ -121,6 +121,8 @@ void RISCVInstPrinter::printCSRSystemRegister(const MCInst *MI, unsigned OpNo,
   unsigned Imm = MI->getOperand(OpNo).getImm();
   auto Range = RISCVSysReg::lookupSysRegByEncoding(Imm);
   for (auto &Reg : Range) {
+    if (Reg.IsAltName || Reg.IsDeprecatedName)
+      continue;
     if (Reg.haveRequiredFeatures(STI.getFeatureBits())) {
       markup(O, Markup::Register) << Reg.Name;
       return;
diff --git a/llvm/lib/Target/RISCV/RISCVSystemOperands.td b/llvm/lib/Target/RISCV/RISCVSystemOperands.td
index 72275daa1b8d1..39853cf13a920 100644
--- a/llvm/lib/Target/RISCV/RISCVSystemOperands.td
+++ b/llvm/lib/Target/RISCV/RISCVSystemOperands.td
@@ -19,12 +19,6 @@ include "llvm/TableGen/SearchableTable.td"
 
 class SysReg op> {
   string Name = name;
-  // A maximum of one alias is supported right now.
-  string AltName = name;
-  // A maximum of one deprecated name is supported right now.  Unlike the
-  // `AltName` alias, a `DeprecatedName` generates a diagnostic when the name is
-  // used to encourage software to migrate away from the name.
-  string DeprecatedName = "";
   bits<12> Encoding = op;
   // FIXME: add these additional fields when needed.
   // Privilege Access: Read and Write = 0, 1, 2; Read-Only = 3.
@@ -37,14 +31,16 @@ class SysReg op> {
   // bits<6> Number = op{5 - 0};
   code FeaturesRequired = [{ {} }];
   bit isRV32Only = 0;
+  bit isAltName = 0;
+  bit isDeprecatedName = 0;
 }
 
 def SysRegsList : GenericTable {
   let FilterClass = "SysReg";
   // FIXME: add "ReadWrite", "Mode", "Extra", "Number" fields when needed.
   let Fields = [
-    "Name", "AltName", "DeprecatedName", "Encoding", "FeaturesRequired",
-    "isRV32Only",
+    "Name", "Encoding", "FeaturesRequired",
+    "isRV32Only", "isAltName", "isDeprecatedName"
   ];
 
   let PrimaryKey = [ "Encoding" ];
@@ -57,16 +53,6 @@ def lookupSysRegByName : SearchIndex {
   let Key = [ "Name" ];
 }
 
-def lookupSysRegByAltName : SearchIndex {
-  let Table = SysRegsList;
-  let Key = [ "AltName" ];
-}
-
-def lookupSysRegByDeprecatedName : SearchIndex {
-  let Table = SysRegsList;
-  let Key = [ "DeprecatedName" ];
-}
-
 // The following CSR encodings match those given in Tables 2.2,
 // 2.3, 2.4, 2.5 and 2.6 in the RISC-V Instruction Set Manual
 // Volume II: Privileged Architecture.
@@ -123,15 +109,17 @@ def : SysReg<"senvcfg", 0x10A>;
 def : SysReg<"sscratch", 0x140>;
 def : SysReg<"sepc", 0x141>;
 def : SysReg<"scause", 0x142>;
-let DeprecatedName = "sbadaddr" in
 def : SysReg<"stval", 0x143>;
+let isDeprecatedName = 1 in
+def : SysReg<"sbadaddr", 0x143>;
 def : SysReg<"sip", 0x144>;
 
 //===----------------------------------------------------------------------===//
 // Supervisor Protection and Translation
 //===----------------------------------------------------------------------===//
-let DeprecatedName = "sptbr" in
 def : SysReg<"satp", 0x180>;
+let isDeprecatedName = 1 in
+def : SysReg<"sptbr", 0x180>;
 
 //===----------------------------------------------------------------------===//
 // Quality-of-Service(QoS) Identifiers (Ssqosid)
@@ -245,8 +233,9 @@ def : SysReg<"mstatush", 0x310>;
 def : SysReg<"mscratch", 0x340>;
 def : SysReg<"mepc", 0x341>;
 def : SysReg<"mcause", 0x342>;
-let DeprecatedName = "mbadaddr" in
 def : SysReg<"mtval", 0x343>;
+let isDeprecatedName = 1 in
+def : SysReg<"mbadaddr", 0x343>;
 def : SysReg<"mip", 0x344>;
 def : SysReg<"mtinst", 0x34A>;
 def : SysReg<"mtval2", 0x34B>;
@@ -298,8 +287,9 @@ foreach i = 3...31 in
 //===----------------------------------------------------------------------===//
 // Machine Counter Setup
 //===----------------------------------------------------------------------===//
-let AltName = "mucounteren" in // Privileged spec v1.9.1 Name
 def : SysReg<"mcountinhibit", 0x320>;
+let isAltName = 1 in
+def : SysReg<"mucounteren", 0x320>;
 
 // mhpmevent3-mhpmevent31 at 0x323-0x33F.
 foreach i = 3...31 in
@@ -336,8 +326,9 @@ def : SysReg<"dpc", 0x7B1>;
 
 // "dscratch" is an alternative name for "dscratch0" which appeared in earlier
 // drafts of the RISC-V debug spec
-let AltName = "dscratch" in
 def : SysReg<"dscratch0", 0x7B2>;
+let isAltName = 1 in
+def : SysReg<"dscratch", 0x7B2>;
 def : SysReg<"dscratch1", 0x7B3>;
 
 //===----------------------------------------------------------------------===//

From dc3cd2e95ee56cdb75f4d0d0742626f912b5c6f3 Mon Sep 17 00:00:00 2001
From: Chandler Carruth 
Date: Fri, 3 Jan 2025 19:23:42 -0800
Subject: [PATCH 417/567] Factor common code for quoting a builtin name
 (#120835)

This shows up in several places in order to match the quoting of other
uses of the same diagnostic. Handling it centrally simplifies the code
and reduces changes if the storage for builtin names changes.

This refactoring is extracted out of #120534 as requested in code
review.
---
 clang/include/clang/Basic/Builtins.h     |  3 +++
 clang/lib/AST/ByteCode/InterpBuiltin.cpp |  4 ++--
 clang/lib/AST/ExprConstant.cpp           | 15 +++++++--------
 clang/lib/Basic/Builtins.cpp             |  4 ++++
 4 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/clang/include/clang/Basic/Builtins.h b/clang/include/clang/Basic/Builtins.h
index e27d8ccce7366..63559d977ce6b 100644
--- a/clang/include/clang/Basic/Builtins.h
+++ b/clang/include/clang/Basic/Builtins.h
@@ -102,6 +102,9 @@ class Context {
   /// e.g. "__builtin_abs".
   llvm::StringRef getName(unsigned ID) const { return getRecord(ID).Name; }
 
+  /// Return a quoted name for the specified builtin for use in diagnostics.
+  std::string getQuotedName(unsigned ID) const;
+
   /// Get the type descriptor string for the specified builtin.
   const char *getTypeString(unsigned ID) const { return getRecord(ID).Type; }
 
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 731c9290993f1..0d52083b06946 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -155,7 +155,7 @@ static void diagnoseNonConstexprBuiltin(InterpState &S, CodePtr OpPC,
   if (S.getLangOpts().CPlusPlus11)
     S.CCEDiag(Loc, diag::note_constexpr_invalid_function)
         << /*isConstexpr=*/0 << /*isConstructor=*/0
-        << ("'" + S.getASTContext().BuiltinInfo.getName(ID) + "'").str();
+        << S.getASTContext().BuiltinInfo.getQuotedName(ID);
   else
     S.CCEDiag(Loc, diag::note_invalid_subexpr_in_const_expr);
 }
@@ -1977,7 +1977,7 @@ static bool interp__builtin_memcmp(InterpState &S, CodePtr OpPC,
                   !isOneByteCharacterType(PtrB.getType()))) {
     S.FFDiag(S.Current->getSource(OpPC),
              diag::note_constexpr_memcmp_unsupported)
-        << ("'" + ASTCtx.BuiltinInfo.getName(ID) + "'").str() << PtrA.getType()
+        << ASTCtx.BuiltinInfo.getQuotedName(ID) << PtrA.getType()
         << PtrB.getType();
     return false;
   }
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index dd75dca647540..e220f69b3a4f5 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -9858,7 +9858,7 @@ bool PointerExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E,
     if (Info.getLangOpts().CPlusPlus11)
       Info.CCEDiag(E, diag::note_constexpr_invalid_function)
           << /*isConstexpr*/ 0 << /*isConstructor*/ 0
-          << ("'" + Info.Ctx.BuiltinInfo.getName(BuiltinOp) + "'").str();
+          << Info.Ctx.BuiltinInfo.getQuotedName(BuiltinOp);
     else
       Info.CCEDiag(E, diag::note_invalid_subexpr_in_const_expr);
     [[fallthrough]];
@@ -9903,8 +9903,7 @@ bool PointerExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E,
     // FIXME: We can compare the bytes in the correct order.
     if (IsRawByte && !isOneByteCharacterType(CharTy)) {
       Info.FFDiag(E, diag::note_constexpr_memchr_unsupported)
-          << ("'" + Info.Ctx.BuiltinInfo.getName(BuiltinOp) + "'").str()
-          << CharTy;
+          << Info.Ctx.BuiltinInfo.getQuotedName(BuiltinOp) << CharTy;
       return false;
     }
     // Figure out what value we're actually looking for (after converting to
@@ -9966,7 +9965,7 @@ bool PointerExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E,
     if (Info.getLangOpts().CPlusPlus11)
       Info.CCEDiag(E, diag::note_constexpr_invalid_function)
           << /*isConstexpr*/ 0 << /*isConstructor*/ 0
-          << ("'" + Info.Ctx.BuiltinInfo.getName(BuiltinOp) + "'").str();
+          << Info.Ctx.BuiltinInfo.getQuotedName(BuiltinOp);
     else
       Info.CCEDiag(E, diag::note_invalid_subexpr_in_const_expr);
     [[fallthrough]];
@@ -13241,7 +13240,7 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E,
     if (Info.getLangOpts().CPlusPlus11)
       Info.CCEDiag(E, diag::note_constexpr_invalid_function)
           << /*isConstexpr*/ 0 << /*isConstructor*/ 0
-          << ("'" + Info.Ctx.BuiltinInfo.getName(BuiltinOp) + "'").str();
+          << Info.Ctx.BuiltinInfo.getQuotedName(BuiltinOp);
     else
       Info.CCEDiag(E, diag::note_invalid_subexpr_in_const_expr);
     [[fallthrough]];
@@ -13266,7 +13265,7 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E,
     if (Info.getLangOpts().CPlusPlus11)
       Info.CCEDiag(E, diag::note_constexpr_invalid_function)
           << /*isConstexpr*/ 0 << /*isConstructor*/ 0
-          << ("'" + Info.Ctx.BuiltinInfo.getName(BuiltinOp) + "'").str();
+          << Info.Ctx.BuiltinInfo.getQuotedName(BuiltinOp);
     else
       Info.CCEDiag(E, diag::note_invalid_subexpr_in_const_expr);
     [[fallthrough]];
@@ -13321,8 +13320,8 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E,
         !(isOneByteCharacterType(CharTy1) && isOneByteCharacterType(CharTy2))) {
       // FIXME: Consider using our bit_cast implementation to support this.
       Info.FFDiag(E, diag::note_constexpr_memcmp_unsupported)
-          << ("'" + Info.Ctx.BuiltinInfo.getName(BuiltinOp) + "'").str()
-          << CharTy1 << CharTy2;
+          << Info.Ctx.BuiltinInfo.getQuotedName(BuiltinOp) << CharTy1
+          << CharTy2;
       return false;
     }
 
diff --git a/clang/lib/Basic/Builtins.cpp b/clang/lib/Basic/Builtins.cpp
index 8dd1888db2988..588183788de32 100644
--- a/clang/lib/Basic/Builtins.cpp
+++ b/clang/lib/Basic/Builtins.cpp
@@ -163,6 +163,10 @@ void Builtin::Context::initializeBuiltins(IdentifierTable &Table,
   }
 }
 
+std::string Builtin::Context::getQuotedName(unsigned ID) const {
+  return (llvm::Twine("'") + getName(ID) + "'").str();
+}
+
 unsigned Builtin::Context::getRequiredVectorWidth(unsigned ID) const {
   const char *WidthPos = ::strchr(getRecord(ID).Attributes, 'V');
   if (!WidthPos)

From 34f0611bc36db40789823030a3748a8595198719 Mon Sep 17 00:00:00 2001
From: Owen Pan 
Date: Fri, 3 Jan 2025 20:09:39 -0800
Subject: [PATCH 418/567] [clang-format][doc] Minor cleanup

---
 clang/docs/ClangFormatStyleOptions.rst | 9 +++++----
 clang/include/clang/Format/Format.h    | 9 +++++----
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst
index 7bfaee4e2d35b..637ec23e0abaf 100644
--- a/clang/docs/ClangFormatStyleOptions.rst
+++ b/clang/docs/ClangFormatStyleOptions.rst
@@ -4661,12 +4661,13 @@ the configuration (without a prefix: ``Auto``).
 .. _KeepEmptyLinesAtEOF:
 
 **KeepEmptyLinesAtEOF** (``Boolean``) :versionbadge:`clang-format 17` :ref:`¶ `
-  This option is deprecated. See ``AtEndOfFile`` of ``KeepEmptyLines``.
+  This option is **deprecated**. See ``AtEndOfFile`` of ``KeepEmptyLines``.
 
 .. _KeepEmptyLinesAtTheStartOfBlocks:
 
 **KeepEmptyLinesAtTheStartOfBlocks** (``Boolean``) :versionbadge:`clang-format 3.7` :ref:`¶ `
-  This option is deprecated. See ``AtStartOfBlock`` of ``KeepEmptyLines``.
+  This option is **deprecated**. See ``AtStartOfBlock`` of
+  ``KeepEmptyLines``.
 
 .. _KeepFormFeed:
 
@@ -6730,8 +6731,8 @@ the configuration (without a prefix: ``Auto``).
 .. _TemplateNames:
 
 **TemplateNames** (``List of Strings``) :versionbadge:`clang-format 20` :ref:`¶ `
-  A vector of non-keyword identifiers that should be interpreted as
-  template names.
+  A vector of non-keyword identifiers that should be interpreted as template
+  names.
 
   A ``<`` after a template name is annotated as a template opener instead of
   a binary operator.
diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h
index 9b7a633e0a146..8d41077549690 100644
--- a/clang/include/clang/Format/Format.h
+++ b/clang/include/clang/Format/Format.h
@@ -3203,11 +3203,12 @@ struct FormatStyle {
   /// \version 19
   KeepEmptyLinesStyle KeepEmptyLines;
 
-  /// This option is deprecated. See ``AtEndOfFile`` of ``KeepEmptyLines``.
+  /// This option is **deprecated**. See ``AtEndOfFile`` of ``KeepEmptyLines``.
   /// \version 17
   // bool KeepEmptyLinesAtEOF;
 
-  /// This option is deprecated. See ``AtStartOfBlock`` of ``KeepEmptyLines``.
+  /// This option is **deprecated**. See ``AtStartOfBlock`` of
+  /// ``KeepEmptyLines``.
   /// \version 3.7
   // bool KeepEmptyLinesAtTheStartOfBlocks;
 
@@ -5042,8 +5043,8 @@ struct FormatStyle {
   /// \version 3.7
   unsigned TabWidth;
 
-  /// A vector of non-keyword identifiers that should be interpreted as
-  /// template names.
+  /// A vector of non-keyword identifiers that should be interpreted as template
+  /// names.
   ///
   /// A ``<`` after a template name is annotated as a template opener instead of
   /// a binary operator.

From aa0f3343a60c6132d9f6adfb8f62234a95519918 Mon Sep 17 00:00:00 2001
From: Craig Topper 
Date: Fri, 3 Jan 2025 19:51:28 -0800
Subject: [PATCH 419/567] [TableGen] Add 'final' to all of the *Init classes.

Classes that used TrailingObjects were already 'final'. Add to the
rest for consistency.
---
 llvm/include/llvm/TableGen/Record.h | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/llvm/include/llvm/TableGen/Record.h b/llvm/include/llvm/TableGen/Record.h
index 81a9257425783..3402f1957a1c1 100644
--- a/llvm/include/llvm/TableGen/Record.h
+++ b/llvm/include/llvm/TableGen/Record.h
@@ -448,7 +448,7 @@ class TypedInit : public Init {
 };
 
 /// '?' - Represents an uninitialized value.
-class UnsetInit : public Init {
+class UnsetInit final : public Init {
   friend detail::RecordKeeperImpl;
 
   /// The record keeper that initialized this Init.
@@ -486,7 +486,7 @@ class UnsetInit : public Init {
 
 // Represent an argument.
 using ArgAuxType = std::variant;
-class ArgumentInit : public Init, public FoldingSetNode {
+class ArgumentInit final : public Init, public FoldingSetNode {
 public:
   enum Kind {
     Positional,
@@ -638,7 +638,7 @@ class BitsInit final : public TypedInit,
 };
 
 /// '7' - Represent an initialization by a literal integer value.
-class IntInit : public TypedInit {
+class IntInit final : public TypedInit {
   int64_t Value;
 
   explicit IntInit(RecordKeeper &RK, int64_t V)
@@ -669,7 +669,7 @@ class IntInit : public TypedInit {
 };
 
 /// "anonymous_n" - Represent an anonymous record name
-class AnonymousNameInit : public TypedInit {
+class AnonymousNameInit final : public TypedInit {
   unsigned Value;
 
   explicit AnonymousNameInit(RecordKeeper &RK, unsigned V)
@@ -699,7 +699,7 @@ class AnonymousNameInit : public TypedInit {
 };
 
 /// "foo" - Represent an initialization by a string value.
-class StringInit : public TypedInit {
+class StringInit final : public TypedInit {
 public:
   enum StringFormat {
     SF_String, // Format as "text"
@@ -845,7 +845,7 @@ class OpInit : public TypedInit {
 
 /// !op (X) - Transform an init.
 ///
-class UnOpInit : public OpInit, public FoldingSetNode {
+class UnOpInit final : public OpInit, public FoldingSetNode {
 public:
   enum UnaryOp : uint8_t {
     TOLOWER,
@@ -908,7 +908,7 @@ class UnOpInit : public OpInit, public FoldingSetNode {
 };
 
 /// !op (X, Y) - Combine two inits.
-class BinOpInit : public OpInit, public FoldingSetNode {
+class BinOpInit final : public OpInit, public FoldingSetNode {
 public:
   enum BinaryOp : uint8_t {
     ADD,
@@ -995,7 +995,7 @@ class BinOpInit : public OpInit, public FoldingSetNode {
 };
 
 /// !op (X, Y, Z) - Combine two inits.
-class TernOpInit : public OpInit, public FoldingSetNode {
+class TernOpInit final : public OpInit, public FoldingSetNode {
 public:
   enum TernaryOp : uint8_t {
     SUBST,
@@ -1144,7 +1144,7 @@ class CondOpInit final : public TypedInit,
 };
 
 /// !foldl (a, b, expr, start, lst) - Fold over a list.
-class FoldOpInit : public TypedInit, public FoldingSetNode {
+class FoldOpInit final : public TypedInit, public FoldingSetNode {
 private:
   const Init *Start, *List, *A, *B, *Expr;
 
@@ -1179,7 +1179,7 @@ class FoldOpInit : public TypedInit, public FoldingSetNode {
 };
 
 /// !isa(expr) - Dynamically determine the type of an expression.
-class IsAOpInit : public TypedInit, public FoldingSetNode {
+class IsAOpInit final : public TypedInit, public FoldingSetNode {
 private:
   const RecTy *CheckType;
   const Init *Expr;
@@ -1213,7 +1213,7 @@ class IsAOpInit : public TypedInit, public FoldingSetNode {
 
 /// !exists(expr) - Dynamically determine if a record of `type` named
 /// `expr` exists.
-class ExistsOpInit : public TypedInit, public FoldingSetNode {
+class ExistsOpInit final : public TypedInit, public FoldingSetNode {
 private:
   const RecTy *CheckType;
   const Init *Expr;
@@ -1246,7 +1246,7 @@ class ExistsOpInit : public TypedInit, public FoldingSetNode {
 };
 
 /// 'Opcode' - Represent a reference to an entire variable object.
-class VarInit : public TypedInit {
+class VarInit final : public TypedInit {
   const Init *VarName;
 
   explicit VarInit(const Init *VN, const RecTy *T)
@@ -1320,7 +1320,7 @@ class VarBitInit final : public TypedInit {
 };
 
 /// AL - Represent a reference to a 'def' in the description
-class DefInit : public TypedInit {
+class DefInit final : public TypedInit {
   friend class Record;
 
   const Record *Def;
@@ -1409,7 +1409,7 @@ class VarDefInit final
 };
 
 /// X.Y - Represent a reference to a subfield of a variable
-class FieldInit : public TypedInit {
+class FieldInit final : public TypedInit {
   const Init *Rec;             // Record we are referring to
   const StringInit *FieldName; // Field we are accessing
 

From 2d424765f496410d6ab95a80c90d2eda933d66d4 Mon Sep 17 00:00:00 2001
From: Matthias Springer 
Date: Sat, 4 Jan 2025 09:12:03 +0100
Subject: [PATCH 420/567] [mlir][IR][NFC] `DominanceInfo`: Share same impl for
 block/op dominance (#115587)

The `properlyDominates` implementations for blocks and ops are very
similar. This commit replaces them with a single implementation that
operates on block iterators. That implementation can be used to
implement both `properlyDominates` variants.

Before:
```c++
template 
bool DominanceInfoBase::properlyDominatesImpl(Block *a,
                                                         Block *b) const;
template 
bool DominanceInfoBase::properlyDominatesImpl(
    Operation *a, Operation *b, bool enclosingOpOk) const;
```

After:
```c++
template 
bool DominanceInfoBase::properlyDominatesImpl(
    Block *aBlock, Block::iterator aIt, Block *bBlock, Block::iterator bIt,
    bool enclosingOk) const;
```

Note: A subsequent commit will add a new public `properlyDominates`
overload that accepts block iterators. That functionality can then be
used to find a valid insertion point at which a range of values is
defined (by utilizing post dominance).
---
 mlir/include/mlir/IR/Dominance.h |  28 +++----
 mlir/lib/IR/Dominance.cpp        | 124 ++++++++++++++++++++-----------
 2 files changed, 92 insertions(+), 60 deletions(-)

diff --git a/mlir/include/mlir/IR/Dominance.h b/mlir/include/mlir/IR/Dominance.h
index 16d17b9c0f3d0..63504cad211a4 100644
--- a/mlir/include/mlir/IR/Dominance.h
+++ b/mlir/include/mlir/IR/Dominance.h
@@ -113,12 +113,12 @@ class DominanceInfoBase {
   llvm::PointerIntPair
   getDominanceInfo(Region *region, bool needsDomTree) const;
 
-  /// Return "true" if the specified block A properly (post)dominates block B.
-  bool properlyDominatesImpl(Block *a, Block *b) const;
-
-  /// Return "true" if the specified op A properly (post)dominates op B.
-  bool properlyDominatesImpl(Operation *a, Operation *b,
-                             bool enclosingOpOk = true) const;
+  /// Return "true" if block iterator A properly (post)dominates block iterator
+  /// B. If `enclosingOk` is set, A is considered to (post)dominate B if A
+  /// encloses B.
+  bool properlyDominatesImpl(Block *aBlock, Block::iterator aIt, Block *bBlock,
+                             Block::iterator bIt,
+                             bool enclosingOk = true) const;
 
   /// A mapping of regions to their base dominator tree and a cached
   /// "hasSSADominance" bit. This map does not contain dominator trees for
@@ -151,9 +151,7 @@ class DominanceInfo : public detail::DominanceInfoBase {
   /// The `enclosingOpOk` flag says whether we should return true if the B op
   /// is enclosed by a region on A.
   bool properlyDominates(Operation *a, Operation *b,
-                         bool enclosingOpOk = true) const {
-    return super::properlyDominatesImpl(a, b, enclosingOpOk);
-  }
+                         bool enclosingOpOk = true) const;
 
   /// Return true if operation A dominates operation B, i.e. if A and B are the
   /// same operation or A properly dominates B.
@@ -188,9 +186,7 @@ class DominanceInfo : public detail::DominanceInfoBase {
   /// Graph regions have only a single block. To be consistent with "proper
   /// dominance" of ops, the single block is considered to properly dominate
   /// itself in a graph region.
-  bool properlyDominates(Block *a, Block *b) const {
-    return super::properlyDominatesImpl(a, b);
-  }
+  bool properlyDominates(Block *a, Block *b) const;
 };
 
 /// A class for computing basic postdominance information.
@@ -200,9 +196,7 @@ class PostDominanceInfo : public detail::DominanceInfoBase {
 
   /// Return true if operation A properly postdominates operation B.
   bool properlyPostDominates(Operation *a, Operation *b,
-                             bool enclosingOpOk = true) const {
-    return super::properlyDominatesImpl(a, b, enclosingOpOk);
-  }
+                             bool enclosingOpOk = true) const;
 
   /// Return true if operation A postdominates operation B.
   bool postDominates(Operation *a, Operation *b) const {
@@ -210,9 +204,7 @@ class PostDominanceInfo : public detail::DominanceInfoBase {
   }
 
   /// Return true if the specified block A properly postdominates block B.
-  bool properlyPostDominates(Block *a, Block *b) const {
-    return super::properlyDominatesImpl(a, b);
-  }
+  bool properlyPostDominates(Block *a, Block *b) const;
 
   /// Return true if the specified block A postdominates block B.
   bool postDominates(Block *a, Block *b) const {
diff --git a/mlir/lib/IR/Dominance.cpp b/mlir/lib/IR/Dominance.cpp
index 406e0f2d62d64..1c54e09d29b9b 100644
--- a/mlir/lib/IR/Dominance.cpp
+++ b/mlir/lib/IR/Dominance.cpp
@@ -213,61 +213,73 @@ DominanceInfoBase::findNearestCommonDominator(Block *a,
   return getDomTree(a->getParent()).findNearestCommonDominator(a, b);
 }
 
-/// Return true if the specified block A properly dominates block B.
-template 
-bool DominanceInfoBase::properlyDominatesImpl(Block *a,
-                                                         Block *b) const {
-  assert(a && b && "null blocks not allowed");
+/// Returns the given block iterator if it lies within the given region.
+/// Otherwise, finds the ancestor of the given block iterator that
+/// lies within the given region. Returns an "empty" iterator if the latter
+/// fails.
+///
+/// Note: This is a variant of Region::findAncestorOpInRegion that operates on
+/// block iterators instead of ops.
+static std::pair
+findAncestorIteratorInRegion(Region *r, Block *b, Block::iterator it) {
+  // Case 1: The iterator lies within the given region.
+  if (b->getParent() == r)
+    return std::make_pair(b, it);
+
+  // Otherwise: Find ancestor iterator. Bail if we run out of parent ops.
+  Operation *parentOp = b->getParentOp();
+  if (!parentOp)
+    return std::make_pair(static_cast(nullptr), Block::iterator());
+  Operation *op = r->findAncestorOpInRegion(*parentOp);
+  if (!op)
+    return std::make_pair(static_cast(nullptr), Block::iterator());
+  return std::make_pair(op->getBlock(), op->getIterator());
+}
 
-  // A block dominates, but does not properly dominate, itself unless this
-  // is a graph region.
+/// Given two iterators into the same block, return "true" if `a` is before `b`.
+/// Note: This is a variant of Operation::isBeforeInBlock that operates on
+/// block iterators instead of ops.
+static bool isBeforeInBlock(Block *block, Block::iterator a,
+                            Block::iterator b) {
   if (a == b)
-    return !hasSSADominance(a);
-
-  // If both blocks are not in the same region, `a` properly dominates `b` if
-  // `b` is defined in an operation region that (recursively) ends up being
-  // dominated by `a`. Walk up the list of containers enclosing B.
-  Region *regionA = a->getParent();
-  if (regionA != b->getParent()) {
-    b = regionA ? regionA->findAncestorBlockInRegion(*b) : nullptr;
-    // If we could not find a valid block b then it is a not a dominator.
-    if (!b)
-      return false;
-
-    // Check to see if the ancestor of `b` is the same block as `a`.  A properly
-    // dominates B if it contains an op that contains the B block.
-    if (a == b)
-      return true;
-  }
-
-  // Otherwise, they are two different blocks in the same region, use DomTree.
-  return getDomTree(regionA).properlyDominates(a, b);
+    return false;
+  if (a == block->end())
+    return false;
+  if (b == block->end())
+    return true;
+  return a->isBeforeInBlock(&*b);
 }
 
 template 
 bool DominanceInfoBase::properlyDominatesImpl(
-    Operation *a, Operation *b, bool enclosingOpOk) const {
-  Block *aBlock = a->getBlock(), *bBlock = b->getBlock();
-  assert(aBlock && bBlock && "operations must be in a block");
+    Block *aBlock, Block::iterator aIt, Block *bBlock, Block::iterator bIt,
+    bool enclosingOk) const {
+  assert(aBlock && bBlock && "expected non-null blocks");
 
-  // An operation (pos)dominates, but does not properly (pos)dominate, itself
-  // unless this is a graph region.
-  if (a == b)
+  // A block iterator (post)dominates, but does not properly (post)dominate,
+  // itself unless this is a graph region.
+  if (aBlock == bBlock && aIt == bIt)
     return !hasSSADominance(aBlock);
 
-  // If these ops are in different regions, then normalize one into the other.
+  // If the iterators are in different regions, then normalize one into the
+  // other.
   Region *aRegion = aBlock->getParent();
   if (aRegion != bBlock->getParent()) {
-    // Scoot up b's region tree until we find an operation in A's region that
+    // Scoot up b's region tree until we find a location in A's region that
     // encloses it.  If this fails, then we know there is no (post)dom relation.
-    b = aRegion ? aRegion->findAncestorOpInRegion(*b) : nullptr;
-    if (!b)
+    if (!aRegion) {
+      bBlock = nullptr;
+      bIt = Block::iterator();
+    } else {
+      std::tie(bBlock, bIt) =
+          findAncestorIteratorInRegion(aRegion, bBlock, bIt);
+    }
+    if (!bBlock)
       return false;
-    bBlock = b->getBlock();
-    assert(bBlock->getParent() == aRegion);
+    assert(bBlock->getParent() == aRegion && "expected block in regionA");
 
     // If 'a' encloses 'b', then we consider it to (post)dominate.
-    if (a == b && enclosingOpOk)
+    if (aBlock == bBlock && aIt == bIt && enclosingOk)
       return true;
   }
 
@@ -279,9 +291,9 @@ bool DominanceInfoBase::properlyDominatesImpl(
     if (!hasSSADominance(aBlock))
       return true;
     if constexpr (IsPostDom) {
-      return b->isBeforeInBlock(a);
+      return isBeforeInBlock(aBlock, bIt, aIt);
     } else {
-      return a->isBeforeInBlock(b);
+      return isBeforeInBlock(aBlock, aIt, bIt);
     }
   }
 
@@ -309,6 +321,18 @@ template class detail::DominanceInfoBase;
 // DominanceInfo
 //===----------------------------------------------------------------------===//
 
+bool DominanceInfo::properlyDominates(Operation *a, Operation *b,
+                                      bool enclosingOpOk) const {
+  return super::properlyDominatesImpl(a->getBlock(), a->getIterator(),
+                                      b->getBlock(), b->getIterator(),
+                                      enclosingOpOk);
+}
+
+bool DominanceInfo::properlyDominates(Block *a, Block *b) const {
+  return super::properlyDominatesImpl(a, a->begin(), b, b->begin(),
+                                      /*enclosingOk=*/true);
+}
+
 /// Return true if the `a` value properly dominates operation `b`, i.e if the
 /// operation that defines `a` properlyDominates `b` and the operation that
 /// defines `a` does not contain `b`.
@@ -322,3 +346,19 @@ bool DominanceInfo::properlyDominates(Value a, Operation *b) const {
   // `b`, but `a` does not itself enclose `b` in one of its regions.
   return properlyDominates(a.getDefiningOp(), b, /*enclosingOpOk=*/false);
 }
+
+//===----------------------------------------------------------------------===//
+// PostDominanceInfo
+//===----------------------------------------------------------------------===//
+
+bool PostDominanceInfo::properlyPostDominates(Operation *a, Operation *b,
+                                              bool enclosingOpOk) const {
+  return super::properlyDominatesImpl(a->getBlock(), a->getIterator(),
+                                      b->getBlock(), b->getIterator(),
+                                      enclosingOpOk);
+}
+
+bool PostDominanceInfo::properlyPostDominates(Block *a, Block *b) const {
+  return super::properlyDominatesImpl(a, a->end(), b, b->end(),
+                                      /*enclosingOk=*/true);
+}

From 95c5c5d4badf7c2128d098be325356e15c2197be Mon Sep 17 00:00:00 2001
From: Matthias Springer 
Date: Sat, 4 Jan 2025 09:23:15 +0100
Subject: [PATCH 421/567] [mlir][Transforms][NFC] Use `DominanceInfo` to
 compute materialization insertion point (#120746)

In the dialect conversion driver, use `DominanceInfo` to compute a
suitable insertion point for N:1 source materializations.
---
 mlir/include/mlir/IR/Dominance.h              | 23 ++++++
 .../Transforms/Utils/DialectConversion.cpp    | 70 +++++--------------
 2 files changed, 42 insertions(+), 51 deletions(-)

diff --git a/mlir/include/mlir/IR/Dominance.h b/mlir/include/mlir/IR/Dominance.h
index 63504cad211a4..9e1254c1dfe1e 100644
--- a/mlir/include/mlir/IR/Dominance.h
+++ b/mlir/include/mlir/IR/Dominance.h
@@ -187,6 +187,17 @@ class DominanceInfo : public detail::DominanceInfoBase {
   /// dominance" of ops, the single block is considered to properly dominate
   /// itself in a graph region.
   bool properlyDominates(Block *a, Block *b) const;
+
+  bool properlyDominates(Block *aBlock, Block::iterator aIt, Block *bBlock,
+                         Block::iterator bIt, bool enclosingOk = true) const {
+    return super::properlyDominatesImpl(aBlock, aIt, bBlock, bIt, enclosingOk);
+  }
+
+  bool dominates(Block *aBlock, Block::iterator aIt, Block *bBlock,
+                 Block::iterator bIt, bool enclosingOk = true) const {
+    return (aBlock == bBlock && aIt == bIt) ||
+           super::properlyDominatesImpl(aBlock, aIt, bBlock, bIt, enclosingOk);
+  }
 };
 
 /// A class for computing basic postdominance information.
@@ -210,6 +221,18 @@ class PostDominanceInfo : public detail::DominanceInfoBase {
   bool postDominates(Block *a, Block *b) const {
     return a == b || properlyPostDominates(a, b);
   }
+
+  bool properlyPostDominates(Block *aBlock, Block::iterator aIt, Block *bBlock,
+                             Block::iterator bIt,
+                             bool enclosingOk = true) const {
+    return super::properlyDominatesImpl(aBlock, aIt, bBlock, bIt, enclosingOk);
+  }
+
+  bool postDominates(Block *aBlock, Block::iterator aIt, Block *bBlock,
+                     Block::iterator bIt, bool enclosingOk = true) const {
+    return (aBlock == bBlock && aIt == bIt) ||
+           super::properlyDominatesImpl(aBlock, aIt, bBlock, bIt, enclosingOk);
+  }
 };
 
 } // namespace mlir
diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp
index 6c3863e4c7f66..1e689cd96ae71 100644
--- a/mlir/lib/Transforms/Utils/DialectConversion.cpp
+++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp
@@ -54,55 +54,6 @@ static void logFailure(llvm::ScopedPrinter &os, StringRef fmt, Args &&...args) {
   });
 }
 
-/// Given two insertion points in the same block, choose the later one.
-static OpBuilder::InsertPoint
-chooseLaterInsertPointInBlock(OpBuilder::InsertPoint a,
-                              OpBuilder::InsertPoint b) {
-  assert(a.getBlock() == b.getBlock() && "expected same block");
-  Block *block = a.getBlock();
-  if (a.getPoint() == block->begin())
-    return b;
-  if (b.getPoint() == block->begin())
-    return a;
-  if (a.getPoint()->isBeforeInBlock(&*b.getPoint()))
-    return b;
-  return a;
-}
-
-/// Helper function that chooses the insertion point among the two given ones
-/// that is later.
-// TODO: Extend DominanceInfo API to work with block iterators.
-static OpBuilder::InsertPoint chooseLaterInsertPoint(OpBuilder::InsertPoint a,
-                                                     OpBuilder::InsertPoint b) {
-  // Case 1: Fast path: Same block. This is the most common case.
-  if (LLVM_LIKELY(a.getBlock() == b.getBlock()))
-    return chooseLaterInsertPointInBlock(a, b);
-
-  // Case 2: Different block, but same region.
-  if (a.getBlock()->getParent() == b.getBlock()->getParent()) {
-    DominanceInfo domInfo;
-    if (domInfo.properlyDominates(a.getBlock(), b.getBlock()))
-      return b;
-    if (domInfo.properlyDominates(b.getBlock(), a.getBlock()))
-      return a;
-    // Neither of the two blocks dominante each other.
-    llvm_unreachable("unable to find valid insertion point");
-  }
-
-  // Case 3: b's region contains a: choose a.
-  if (b.getBlock()->getParent()->findAncestorOpInRegion(
-          *a.getPoint()->getParentOp()))
-    return a;
-
-  // Case 4: a's region contains b: choose b.
-  if (a.getBlock()->getParent()->findAncestorOpInRegion(
-          *b.getPoint()->getParentOp()))
-    return b;
-
-  // Neither of the two operations contain each other.
-  llvm_unreachable("unable to find valid insertion point");
-}
-
 /// Helper function that computes an insertion point where the given value is
 /// defined and can be used without a dominance violation.
 static OpBuilder::InsertPoint computeInsertPoint(Value value) {
@@ -117,9 +68,26 @@ static OpBuilder::InsertPoint computeInsertPoint(Value value) {
 /// defined and can be used without a dominance violation.
 static OpBuilder::InsertPoint computeInsertPoint(ArrayRef vals) {
   assert(!vals.empty() && "expected at least one value");
+  DominanceInfo domInfo;
   OpBuilder::InsertPoint pt = computeInsertPoint(vals.front());
-  for (Value v : vals.drop_front())
-    pt = chooseLaterInsertPoint(pt, computeInsertPoint(v));
+  for (Value v : vals.drop_front()) {
+    // Choose the "later" insertion point.
+    OpBuilder::InsertPoint nextPt = computeInsertPoint(v);
+    if (domInfo.dominates(pt.getBlock(), pt.getPoint(), nextPt.getBlock(),
+                          nextPt.getPoint())) {
+      // pt is before nextPt => choose nextPt.
+      pt = nextPt;
+    } else {
+#ifndef NDEBUG
+      // nextPt should be before pt => choose pt.
+      // If pt and nextPt have no dominance relationship, then there is no
+      // valid insertion point at which all given values are defined.
+      bool dom = domInfo.dominates(nextPt.getBlock(), nextPt.getPoint(),
+                                   pt.getBlock(), pt.getPoint());
+      assert(dom && "unable to find valid insertion point");
+#endif // NDEBUG
+    }
+  }
   return pt;
 }
 

From fac46469977da9c4e9c6eeaac21103c971190577 Mon Sep 17 00:00:00 2001
From: Yingwei Zheng 
Date: Sat, 4 Jan 2025 17:23:57 +0800
Subject: [PATCH 422/567] [InstCombine] Check no wrap flags before folding icmp
 of GEPs with same indices (#121628)

Alive2: https://alive2.llvm.org/ce/z/Dr3Sbe
Closes https://github.com/llvm/llvm-project/issues/121581.
---
 .../InstCombine/InstCombineCompares.cpp       |  6 ++-
 llvm/test/Transforms/InstCombine/icmp-gep.ll  | 48 +++++++++++++++++++
 2 files changed, 52 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index d6fdade25559f..8b23583c51063 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -747,6 +747,8 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
                         ConstantExpr::getPointerBitCastOrAddrSpaceCast(
                             cast(RHS), Base->getType()));
   } else if (GEPOperator *GEPRHS = dyn_cast(RHS)) {
+    GEPNoWrapFlags NW = GEPLHS->getNoWrapFlags() & GEPRHS->getNoWrapFlags();
+
     // If the base pointers are different, but the indices are the same, just
     // compare the base pointer.
     if (PtrBase != GEPRHS->getOperand(0)) {
@@ -764,7 +766,8 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
 
       // If all indices are the same, just compare the base pointers.
       Type *BaseType = GEPLHS->getOperand(0)->getType();
-      if (IndicesTheSame && CmpInst::makeCmpResultType(BaseType) == I.getType())
+      if (IndicesTheSame &&
+          CmpInst::makeCmpResultType(BaseType) == I.getType() && CanFold(NW))
         return new ICmpInst(Cond, GEPLHS->getOperand(0), GEPRHS->getOperand(0));
 
       // If we're comparing GEPs with two base pointers that only differ in type
@@ -804,7 +807,6 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
       return transformToIndexedCompare(GEPLHS, RHS, Cond, DL, *this);
     }
 
-    GEPNoWrapFlags NW = GEPLHS->getNoWrapFlags() & GEPRHS->getNoWrapFlags();
     if (GEPLHS->getNumOperands() == GEPRHS->getNumOperands() &&
         GEPLHS->getSourceElementType() == GEPRHS->getSourceElementType()) {
       // If the GEPs only differ by one index, compare it.
diff --git a/llvm/test/Transforms/InstCombine/icmp-gep.ll b/llvm/test/Transforms/InstCombine/icmp-gep.ll
index f9b90c224d832..7f8f1ae73948d 100644
--- a/llvm/test/Transforms/InstCombine/icmp-gep.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-gep.ll
@@ -709,3 +709,51 @@ define i1 @pointer_icmp_aligned_with_offset_negative(ptr align 8 %a, ptr align 8
   %cmp = icmp eq ptr %gep, %a2
   ret i1 %cmp
 }
+
+define i1 @gep_diff_base_same_indices(ptr %x, ptr %y, i64 %z) {
+; CHECK-LABEL: @gep_diff_base_same_indices(
+; CHECK-NEXT:    [[X:%.*]] = getelementptr i8, ptr [[X1:%.*]], i64 [[Z:%.*]]
+; CHECK-NEXT:    [[Y:%.*]] = getelementptr i8, ptr [[Y1:%.*]], i64 [[Z]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult ptr [[X]], [[Y]]
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %gep1 = getelementptr i8, ptr %x, i64 %z
+  %gep2 = getelementptr i8, ptr %y, i64 %z
+  %cmp = icmp ult ptr %gep1, %gep2
+  ret i1 %cmp
+}
+
+define i1 @gep_diff_base_same_indices_nuw(ptr %x, ptr %y, i64 %z) {
+; CHECK-LABEL: @gep_diff_base_same_indices_nuw(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult ptr [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %gep1 = getelementptr nuw i8, ptr %x, i64 %z
+  %gep2 = getelementptr nuw i8, ptr %y, i64 %z
+  %cmp = icmp ult ptr %gep1, %gep2
+  ret i1 %cmp
+}
+
+define i1 @gep_diff_base_same_indices_nusw(ptr %x, ptr %y, i64 %z) {
+; CHECK-LABEL: @gep_diff_base_same_indices_nusw(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult ptr [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %gep1 = getelementptr nusw i8, ptr %x, i64 %z
+  %gep2 = getelementptr nusw i8, ptr %y, i64 %z
+  %cmp = icmp ult ptr %gep1, %gep2
+  ret i1 %cmp
+}
+
+define i1 @gep_diff_base_same_indices_nuw_nusw(ptr %x, ptr %y, i64 %z) {
+; CHECK-LABEL: @gep_diff_base_same_indices_nuw_nusw(
+; CHECK-NEXT:    [[X:%.*]] = getelementptr nuw i8, ptr [[X1:%.*]], i64 [[Z:%.*]]
+; CHECK-NEXT:    [[Y:%.*]] = getelementptr nusw i8, ptr [[Y1:%.*]], i64 [[Z]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult ptr [[X]], [[Y]]
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %gep1 = getelementptr nuw i8, ptr %x, i64 %z
+  %gep2 = getelementptr nusw i8, ptr %y, i64 %z
+  %cmp = icmp ult ptr %gep1, %gep2
+  ret i1 %cmp
+}

From 2529a8df53af9bc6cecfd6c83404ffa5e89e3370 Mon Sep 17 00:00:00 2001
From: Chandler Carruth 
Date: Sat, 4 Jan 2025 02:23:54 -0800
Subject: [PATCH 423/567] Mechanically port bulk of x86 builtins to TableGen
 (#120831)

The goal is to make incremental (if small) progress towards fully
TableGen'ed builtins, and to unblock #120534 by gaining access to more
powerful TableGen-based representations.

The bulk `.td` file addition was generated with the help of a very rough
Python script. That script made no attempt to be robust or reusable, it
specifically handled only the cases in the X86 `.def` file.

Four entries from the `.def` file were not handled automatically as they
used `BUILTIN` rather than `TARGET_BUILTIN`. These were ported by hand
to an empty-feature `TargetBuiltin` entry, which seems like a better
match.

For all the automatically ported entries, the results were compared by
sorting and diffing the `.def` file and the generated `.inc` file. The
only differences were:

- Different horizontal whitespace

- Additional entries that had already been ported to the `.td` file.

- More systematically using `Oi` instead of `LLi` for the type `long
  long int` in the fully general `__builtin_ia32_...` builtins for OpenCL
  support. The `.def` file was only partially moved to this it seems, and
  the systematic migration has updated a few missed builtins.
---
 clang/include/clang/Basic/BuiltinsBase.td     |   13 +-
 clang/include/clang/Basic/BuiltinsX86.def     | 2225 -------
 clang/include/clang/Basic/BuiltinsX86.td      | 5390 +++++++++++++++++
 clang/include/clang/Basic/TargetBuiltins.h    |    2 -
 clang/lib/Basic/Targets/X86.cpp               |    8 -
 clang/utils/TableGen/ClangBuiltinsEmitter.cpp |   28 +-
 6 files changed, 5427 insertions(+), 2239 deletions(-)
 delete mode 100644 clang/include/clang/Basic/BuiltinsX86.def

diff --git a/clang/include/clang/Basic/BuiltinsBase.td b/clang/include/clang/Basic/BuiltinsBase.td
index cff182f3f282c..1a1096d41da40 100644
--- a/clang/include/clang/Basic/BuiltinsBase.td
+++ b/clang/include/clang/Basic/BuiltinsBase.td
@@ -88,6 +88,8 @@ class Builtin {
   // On some platforms, some functions are actually macros. In that case we need
   // to #undef them.
   bit RequiresUndef = 0;
+  // Enables builtins to generate `long long` outside of OpenCL and `long` inside.
+  bit EnableOpenCLLong = 0;
 }
 
 class CustomEntry {
@@ -95,9 +97,6 @@ class CustomEntry {
 }
 
 class AtomicBuiltin : Builtin;
-class TargetBuiltin : Builtin {
-  string Features = "";
-}
 
 class LibBuiltin : Builtin {
   string Header = header;
@@ -122,6 +121,14 @@ class OCL_DSELangBuiltin : LangBuiltin<"OCL_DSE">;
 class OCL_GASLangBuiltin : LangBuiltin<"OCL_GAS">;
 class OCLLangBuiltin : LangBuiltin<"ALL_OCL_LANGUAGES">;
 
+class TargetBuiltin : Builtin {
+  string Features = "";
+}
+class TargetLibBuiltin : TargetBuiltin {
+  string Header;
+  string Languages = "ALL_LANGUAGES";
+}
+
 class Template substitutions,
                list affixes,
                bit as_prefix = 0> {
diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
deleted file mode 100644
index 352b3a9ec594a..0000000000000
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ /dev/null
@@ -1,2225 +0,0 @@
-//===--- BuiltinsX86.def - X86 Builtin function database --------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the X86-specific builtin function database.  Users of
-// this file must define the BUILTIN macro to make use of this information.
-//
-//===----------------------------------------------------------------------===//
-
-// The format of this database matches clang/Basic/Builtins.def.
-
-// FIXME: Ideally we would be able to pull this information from what
-// LLVM already knows about X86 builtins. We need to match the LLVM
-// definition anyway, since code generation will lower to the
-// intrinsic if one exists.
-
-#if defined(BUILTIN) && !defined(TARGET_BUILTIN)
-#   define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) BUILTIN(ID, TYPE, ATTRS)
-#endif
-
-#if defined(BUILTIN) && !defined(TARGET_HEADER_BUILTIN)
-#  define TARGET_HEADER_BUILTIN(ID, TYPE, ATTRS, HEADER, LANG, FEATURE) BUILTIN(ID, TYPE, ATTRS)
-#endif
-
-// MMX
-//
-// All MMX instructions will be generated via builtins. Any MMX vector
-// types (<1 x i64>, <2 x i32>, etc.) that aren't used by these builtins will be
-// expanded by the back-end.
-// FIXME: _mm_prefetch must be a built-in because it takes a compile-time constant
-// argument and our prior approach of using a #define to the current built-in
-// doesn't work in the presence of re-declaration of _mm_prefetch for windows.
-TARGET_BUILTIN(_mm_prefetch, "vcC*i", "nc", "mmx")
-
-// SSE intrinsics.
-
-TARGET_BUILTIN(__builtin_ia32_ldmxcsr, "vUi", "n", "sse")
-TARGET_HEADER_BUILTIN(_mm_setcsr, "vUi", "nh",XMMINTRIN_H, ALL_LANGUAGES, "sse")
-TARGET_BUILTIN(__builtin_ia32_stmxcsr, "Ui", "n", "sse")
-TARGET_HEADER_BUILTIN(_mm_getcsr, "Ui", "nh", XMMINTRIN_H, ALL_LANGUAGES, "sse")
-TARGET_BUILTIN(__builtin_ia32_cvtss2si, "iV4f", "ncV:128:", "sse")
-TARGET_BUILTIN(__builtin_ia32_cvttss2si, "iV4f", "ncV:128:", "sse")
-TARGET_BUILTIN(__builtin_ia32_movmskps, "iV4f", "nV:128:", "sse")
-TARGET_BUILTIN(__builtin_ia32_sfence, "v", "n", "sse")
-TARGET_HEADER_BUILTIN(_mm_sfence, "v", "nh", XMMINTRIN_H, ALL_LANGUAGES, "sse")
-TARGET_BUILTIN(__builtin_ia32_rcpps, "V4fV4f", "ncV:128:", "sse")
-TARGET_BUILTIN(__builtin_ia32_rcpss, "V4fV4f", "ncV:128:", "sse")
-TARGET_BUILTIN(__builtin_ia32_rsqrtps, "V4fV4f", "ncV:128:", "sse")
-TARGET_BUILTIN(__builtin_ia32_rsqrtss, "V4fV4f", "ncV:128:", "sse")
-TARGET_BUILTIN(__builtin_ia32_sqrtps, "V4fV4f", "ncV:128:", "sse")
-TARGET_BUILTIN(__builtin_ia32_sqrtss, "V4fV4f", "ncV:128:", "sse")
-TARGET_BUILTIN(__builtin_ia32_shufps, "V4fV4fV4fIi", "ncV:128:", "sse")
-
-TARGET_BUILTIN(__builtin_ia32_maskmovdqu, "vV16cV16cc*", "nV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_movmskpd, "iV2d", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_pmovmskb128, "iV16c", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_movnti, "vi*i", "n", "sse2")
-TARGET_BUILTIN(__builtin_ia32_pshufd, "V4iV4iIi", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_pshuflw, "V8sV8sIi", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_pshufhw, "V8sV8sIi", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psadbw128, "V2OiV16cV16c", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_sqrtpd, "V2dV2d", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_sqrtsd, "V2dV2d", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_shufpd, "V2dV2dV2dIi", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2dq, "V2OiV2d", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2ps, "V4fV2d", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2dq, "V4iV2d", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_cvtsd2si, "iV2d", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_cvttsd2si, "iV2d", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_cvtsd2ss, "V4fV4fV2d", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_cvtps2dq, "V4iV4f", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_cvttps2dq, "V4iV4f", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_clflush, "vvC*", "n", "sse2")
-TARGET_HEADER_BUILTIN(_mm_clflush, "vvC*", "nh", EMMINTRIN_H, ALL_LANGUAGES, "sse2")
-TARGET_BUILTIN(__builtin_ia32_lfence, "v", "n", "sse2")
-TARGET_HEADER_BUILTIN(_mm_lfence, "v", "nh", EMMINTRIN_H, ALL_LANGUAGES, "sse2")
-TARGET_BUILTIN(__builtin_ia32_mfence, "v", "n", "sse2")
-TARGET_HEADER_BUILTIN(_mm_mfence, "v", "nh", EMMINTRIN_H, ALL_LANGUAGES, "sse2")
-TARGET_BUILTIN(__builtin_ia32_pause, "v", "n", "")
-TARGET_HEADER_BUILTIN(_mm_pause, "v", "nh", EMMINTRIN_H, ALL_LANGUAGES, "")
-TARGET_BUILTIN(__builtin_ia32_pmuludq128, "V2OiV4iV4i", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psraw128, "V8sV8sV8s", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psrad128, "V4iV4iV4i", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psrlw128, "V8sV8sV8s", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psrld128, "V4iV4iV4i", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psrlq128, "V2OiV2OiV2Oi", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psllw128, "V8sV8sV8s", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_pslld128, "V4iV4iV4i", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psllq128, "V2OiV2OiV2Oi", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psllwi128, "V8sV8si", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_pslldi128, "V4iV4ii", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psllqi128, "V2OiV2Oii", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psrlwi128, "V8sV8si", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psrldi128, "V4iV4ii", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psrlqi128, "V2OiV2Oii", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psrawi128, "V8sV8si", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psradi128, "V4iV4ii", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_pmaddwd128, "V4iV8sV8s", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_pslldqi128_byteshift, "V2OiV2OiIi", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psrldqi128_byteshift, "V2OiV2OiIi", "ncV:128:", "sse2")
-
-TARGET_BUILTIN(__builtin_ia32_monitor, "vvC*UiUi", "n", "sse3")
-TARGET_BUILTIN(__builtin_ia32_mwait, "vUiUi", "n", "sse3")
-TARGET_BUILTIN(__builtin_ia32_lddqu, "V16ccC*", "nV:128:", "sse3")
-
-TARGET_BUILTIN(__builtin_ia32_palignr128, "V16cV16cV16cIi", "ncV:128:", "ssse3")
-
-TARGET_BUILTIN(__builtin_ia32_insertps128, "V4fV4fV4fIc", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_pblendvb128, "V16cV16cV16cV16c", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_pblendw128, "V8sV8sV8sIi", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_blendpd, "V2dV2dV2dIi", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_blendps, "V4fV4fV4fIi", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_blendvpd, "V2dV2dV2dV2d", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_blendvps, "V4fV4fV4fV4f", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_packusdw128, "V8sV4iV4i", "ncV:128:", "sse4.1")
-
-TARGET_BUILTIN(__builtin_ia32_pmuldq128, "V2OiV4iV4i", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_roundps, "V4fV4fIi", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_roundss, "V4fV4fV4fIi", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_roundsd, "V2dV2dV2dIi", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_roundpd, "V2dV2dIi", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_dpps, "V4fV4fV4fIc", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_dppd, "V2dV2dV2dIc", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_ptestz128, "iV2OiV2Oi", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_ptestc128, "iV2OiV2Oi", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_ptestnzc128, "iV2OiV2Oi", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_mpsadbw128, "V16cV16cV16cIc", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_phminposuw128, "V8sV8s", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_vec_ext_v16qi, "cV16cIi", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_vec_set_v16qi, "V16cV16ccIi", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_vec_set_v4si, "V4iV4iiIi", "ncV:128:", "sse4.1")
-
-// SSE 4.2
-TARGET_BUILTIN(__builtin_ia32_pcmpistrm128, "V16cV16cV16cIc", "ncV:128:", "sse4.2")
-TARGET_BUILTIN(__builtin_ia32_pcmpistri128, "iV16cV16cIc", "ncV:128:", "sse4.2")
-TARGET_BUILTIN(__builtin_ia32_pcmpestrm128, "V16cV16ciV16ciIc", "ncV:128:", "sse4.2")
-TARGET_BUILTIN(__builtin_ia32_pcmpestri128, "iV16ciV16ciIc","ncV:128:", "sse4.2")
-
-TARGET_BUILTIN(__builtin_ia32_pcmpistria128, "iV16cV16cIc","ncV:128:", "sse4.2")
-TARGET_BUILTIN(__builtin_ia32_pcmpistric128, "iV16cV16cIc","ncV:128:", "sse4.2")
-TARGET_BUILTIN(__builtin_ia32_pcmpistrio128, "iV16cV16cIc","ncV:128:", "sse4.2")
-TARGET_BUILTIN(__builtin_ia32_pcmpistris128, "iV16cV16cIc","ncV:128:", "sse4.2")
-TARGET_BUILTIN(__builtin_ia32_pcmpistriz128, "iV16cV16cIc","ncV:128:", "sse4.2")
-TARGET_BUILTIN(__builtin_ia32_pcmpestria128, "iV16ciV16ciIc","ncV:128:", "sse4.2")
-TARGET_BUILTIN(__builtin_ia32_pcmpestric128, "iV16ciV16ciIc","ncV:128:", "sse4.2")
-TARGET_BUILTIN(__builtin_ia32_pcmpestrio128, "iV16ciV16ciIc","ncV:128:", "sse4.2")
-TARGET_BUILTIN(__builtin_ia32_pcmpestris128, "iV16ciV16ciIc","ncV:128:", "sse4.2")
-TARGET_BUILTIN(__builtin_ia32_pcmpestriz128, "iV16ciV16ciIc","ncV:128:", "sse4.2")
-
-TARGET_BUILTIN(__builtin_ia32_crc32qi, "UiUiUc", "nc", "crc32")
-TARGET_BUILTIN(__builtin_ia32_crc32hi, "UiUiUs", "nc", "crc32")
-TARGET_BUILTIN(__builtin_ia32_crc32si, "UiUiUi", "nc", "crc32")
-
-// SSE4a
-TARGET_BUILTIN(__builtin_ia32_extrqi, "V2OiV2OiIcIc", "ncV:128:", "sse4a")
-TARGET_BUILTIN(__builtin_ia32_extrq, "V2OiV2OiV16c", "ncV:128:", "sse4a")
-TARGET_BUILTIN(__builtin_ia32_insertqi, "V2OiV2OiV2OiIcIc", "ncV:128:", "sse4a")
-TARGET_BUILTIN(__builtin_ia32_insertq, "V2OiV2OiV2Oi", "ncV:128:", "sse4a")
-TARGET_BUILTIN(__builtin_ia32_movntsd, "vd*V2d", "nV:128:", "sse4a")
-TARGET_BUILTIN(__builtin_ia32_movntss, "vf*V4f", "nV:128:", "sse4a")
-
-// AES
-TARGET_BUILTIN(__builtin_ia32_aesenc128, "V2OiV2OiV2Oi", "ncV:128:", "aes")
-TARGET_BUILTIN(__builtin_ia32_aesenclast128, "V2OiV2OiV2Oi", "ncV:128:", "aes")
-TARGET_BUILTIN(__builtin_ia32_aesdec128, "V2OiV2OiV2Oi", "ncV:128:", "aes")
-TARGET_BUILTIN(__builtin_ia32_aesdeclast128, "V2OiV2OiV2Oi", "ncV:128:", "aes")
-TARGET_BUILTIN(__builtin_ia32_aesimc128, "V2OiV2Oi", "ncV:128:", "aes")
-TARGET_BUILTIN(__builtin_ia32_aeskeygenassist128, "V2OiV2OiIc", "ncV:128:", "aes")
-
-// VAES
-TARGET_BUILTIN(__builtin_ia32_aesenc256, "V4OiV4OiV4Oi", "ncV:256:", "vaes")
-TARGET_BUILTIN(__builtin_ia32_aesenc512, "V8OiV8OiV8Oi", "ncV:512:", "avx512f,evex512,vaes")
-TARGET_BUILTIN(__builtin_ia32_aesenclast256, "V4OiV4OiV4Oi", "ncV:256:", "vaes")
-TARGET_BUILTIN(__builtin_ia32_aesenclast512, "V8OiV8OiV8Oi", "ncV:512:", "avx512f,evex512,vaes")
-TARGET_BUILTIN(__builtin_ia32_aesdec256, "V4OiV4OiV4Oi", "ncV:256:", "vaes")
-TARGET_BUILTIN(__builtin_ia32_aesdec512, "V8OiV8OiV8Oi", "ncV:512:", "avx512f,evex512,vaes")
-TARGET_BUILTIN(__builtin_ia32_aesdeclast256, "V4OiV4OiV4Oi", "ncV:256:", "vaes")
-TARGET_BUILTIN(__builtin_ia32_aesdeclast512, "V8OiV8OiV8Oi", "ncV:512:", "avx512f,evex512,vaes")
-
-// GFNI
-TARGET_BUILTIN(__builtin_ia32_vgf2p8affineinvqb_v16qi, "V16cV16cV16cIc", "ncV:128:", "gfni")
-TARGET_BUILTIN(__builtin_ia32_vgf2p8affineinvqb_v32qi, "V32cV32cV32cIc", "ncV:256:", "avx,gfni")
-TARGET_BUILTIN(__builtin_ia32_vgf2p8affineinvqb_v64qi, "V64cV64cV64cIc", "ncV:512:", "avx512f,evex512,gfni")
-TARGET_BUILTIN(__builtin_ia32_vgf2p8affineqb_v16qi, "V16cV16cV16cIc", "ncV:128:", "gfni")
-TARGET_BUILTIN(__builtin_ia32_vgf2p8affineqb_v32qi, "V32cV32cV32cIc", "ncV:256:", "avx,gfni")
-TARGET_BUILTIN(__builtin_ia32_vgf2p8affineqb_v64qi, "V64cV64cV64cIc", "ncV:512:", "avx512f,evex512,gfni")
-TARGET_BUILTIN(__builtin_ia32_vgf2p8mulb_v16qi, "V16cV16cV16c", "ncV:128:", "gfni")
-TARGET_BUILTIN(__builtin_ia32_vgf2p8mulb_v32qi, "V32cV32cV32c", "ncV:256:", "avx,gfni")
-TARGET_BUILTIN(__builtin_ia32_vgf2p8mulb_v64qi, "V64cV64cV64c", "ncV:512:", "avx512f,evex512,gfni")
-
-// CLMUL
-TARGET_BUILTIN(__builtin_ia32_pclmulqdq128, "V2OiV2OiV2OiIc", "ncV:128:", "pclmul")
-
-// VPCLMULQDQ
-TARGET_BUILTIN(__builtin_ia32_pclmulqdq256, "V4OiV4OiV4OiIc", "ncV:256:", "vpclmulqdq")
-TARGET_BUILTIN(__builtin_ia32_pclmulqdq512, "V8OiV8OiV8OiIc", "ncV:512:", "avx512f,evex512,vpclmulqdq")
-
-// AVX
-TARGET_BUILTIN(__builtin_ia32_vpermilvarpd, "V2dV2dV2Oi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vpermilvarps, "V4fV4fV4i", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vpermilvarpd256, "V4dV4dV4Oi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vpermilvarps256, "V8fV8fV8i", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_blendpd256, "V4dV4dV4dIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_blendps256, "V8fV8fV8fIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_blendvpd256, "V4dV4dV4dV4d", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_blendvps256, "V8fV8fV8fV8f", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_shufpd256, "V4dV4dV4dIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_shufps256, "V8fV8fV8fIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_dpps256, "V8fV8fV8fIc", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_cmppd256, "V4dV4dV4dIc", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_cmpps256, "V8fV8fV8fIc", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vextractf128_pd256, "V2dV4dIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vextractf128_ps256, "V4fV8fIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vextractf128_si256, "V4iV8iIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2ps256, "V4fV4d", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_cvtps2dq256, "V8iV8f", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2dq256, "V4iV4d", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2dq256, "V4iV4d", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_cvttps2dq256, "V8iV8f", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vperm2f128_pd256, "V4dV4dV4dIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vperm2f128_ps256, "V8fV8fV8fIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vperm2f128_si256, "V8iV8iV8iIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vpermilpd, "V2dV2dIi", "ncV:128:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vpermilps, "V4fV4fIi", "ncV:128:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vpermilpd256, "V4dV4dIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vpermilps256, "V8fV8fIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vinsertf128_pd256, "V4dV4dV2dIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vinsertf128_ps256, "V8fV8fV4fIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vinsertf128_si256, "V8iV8iV4iIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_sqrtpd256, "V4dV4d", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_sqrtps256, "V8fV8f", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_rsqrtps256, "V8fV8f", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_rcpps256, "V8fV8f", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_roundpd256, "V4dV4dIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_roundps256, "V8fV8fIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vtestzpd, "iV2dV2d", "ncV:128:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vtestcpd, "iV2dV2d", "ncV:128:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vtestnzcpd, "iV2dV2d", "ncV:128:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vtestzps, "iV4fV4f", "ncV:128:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vtestcps, "iV4fV4f", "ncV:128:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vtestnzcps, "iV4fV4f", "ncV:128:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vtestzpd256, "iV4dV4d", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vtestcpd256, "iV4dV4d", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vtestnzcpd256, "iV4dV4d", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vtestzps256, "iV8fV8f", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vtestcps256, "iV8fV8f", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vtestnzcps256, "iV8fV8f", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_ptestz256, "iV4OiV4Oi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_ptestc256, "iV4OiV4Oi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_ptestnzc256, "iV4OiV4Oi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_movmskpd256, "iV4d", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_movmskps256, "iV8f", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vzeroall, "v", "n", "avx")
-TARGET_BUILTIN(__builtin_ia32_vzeroupper, "v", "n", "avx")
-TARGET_BUILTIN(__builtin_ia32_lddqu256, "V32ccC*", "nV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_maskloadpd, "V2dV2dC*V2Oi", "nV:128:", "avx")
-TARGET_BUILTIN(__builtin_ia32_maskloadps, "V4fV4fC*V4i", "nV:128:", "avx")
-TARGET_BUILTIN(__builtin_ia32_maskloadpd256, "V4dV4dC*V4Oi", "nV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_maskloadps256, "V8fV8fC*V8i", "nV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_maskstorepd, "vV2d*V2OiV2d", "nV:128:", "avx")
-TARGET_BUILTIN(__builtin_ia32_maskstoreps, "vV4f*V4iV4f", "nV:128:", "avx")
-TARGET_BUILTIN(__builtin_ia32_maskstorepd256, "vV4d*V4OiV4d", "nV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_maskstoreps256, "vV8f*V8iV8f", "nV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vec_ext_v32qi, "cV32cIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vec_ext_v16hi, "sV16sIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vec_ext_v8si, "iV8iIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vec_set_v32qi, "V32cV32ccIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vec_set_v16hi, "V16sV16ssIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vec_set_v8si, "V8iV8iiIi", "ncV:256:", "avx")
-
-// AVX2
-TARGET_BUILTIN(__builtin_ia32_mpsadbw256, "V32cV32cV32cIc", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_packsswb256, "V32cV16sV16s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_packssdw256, "V16sV8iV8i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_packuswb256, "V32cV16sV16s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_packusdw256, "V16sV8iV8i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_palignr256, "V32cV32cV32cIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pavgb256, "V32cV32cV32c", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pavgw256, "V16sV16sV16s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pblendvb256, "V32cV32cV32cV32c", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pblendw256, "V16sV16sV16sIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_phaddw256, "V16sV16sV16s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_phaddd256, "V8iV8iV8i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_phaddsw256, "V16sV16sV16s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_phsubw256, "V16sV16sV16s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_phsubd256, "V8iV8iV8i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_phsubsw256, "V16sV16sV16s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pmaddubsw256, "V16sV32cV32c", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pmaddwd256, "V8iV16sV16s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pmovmskb256, "iV32c", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pmuldq256, "V4OiV8iV8i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pmulhrsw256, "V16sV16sV16s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pmulhuw256, "V16sV16sV16s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pmulhw256, "V16sV16sV16s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pmuludq256, "V4OiV8iV8i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psadbw256, "V4OiV32cV32c", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pshufb256, "V32cV32cV32c", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pshufd256, "V8iV8iIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pshuflw256, "V16sV16sIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pshufhw256, "V16sV16sIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psignb256, "V32cV32cV32c", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psignw256, "V16sV16sV16s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psignd256, "V8iV8iV8i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psllwi256, "V16sV16si", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psllw256, "V16sV16sV8s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pslldqi256_byteshift, "V4OiV4OiIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pslldi256, "V8iV8ii", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pslld256, "V8iV8iV4i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psllqi256, "V4OiV4Oii", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psllq256, "V4OiV4OiV2Oi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrawi256, "V16sV16si", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psraw256, "V16sV16sV8s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psradi256, "V8iV8ii", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrad256, "V8iV8iV4i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrldqi256_byteshift, "V4OiV4OiIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrlwi256, "V16sV16si", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrlw256, "V16sV16sV8s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrldi256, "V8iV8ii", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrld256, "V8iV8iV4i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrlqi256, "V4OiV4Oii", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrlq256, "V4OiV4OiV2Oi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pblendd128, "V4iV4iV4iIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pblendd256, "V8iV8iV8iIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_permvarsi256, "V8iV8iV8i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_permdf256, "V4dV4dIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_permvarsf256, "V8fV8fV8i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_permti256, "V4OiV4OiV4OiIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_permdi256, "V4OiV4OiIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_extract128i256, "V2OiV4OiIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_insert128i256, "V4OiV4OiV2OiIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_maskloadd256, "V8iV8iC*V8i", "nV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_maskloadq256, "V4OiV4OiC*V4Oi", "nV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_maskloadd, "V4iV4iC*V4i", "nV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_maskloadq, "V2OiV2OiC*V2Oi", "nV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_maskstored256, "vV8i*V8iV8i", "nV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_maskstoreq256, "vV4Oi*V4OiV4Oi", "nV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_maskstored, "vV4i*V4iV4i", "nV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_maskstoreq, "vV2Oi*V2OiV2Oi", "nV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psllv8si, "V8iV8iV8i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psllv4si, "V4iV4iV4i", "ncV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psllv4di, "V4OiV4OiV4Oi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psllv2di, "V2OiV2OiV2Oi", "ncV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrav8si, "V8iV8iV8i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrav4si, "V4iV4iV4i", "ncV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrlv8si, "V8iV8iV8i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrlv4si, "V4iV4iV4i", "ncV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrlv4di, "V4OiV4OiV4Oi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrlv2di, "V2OiV2OiV2Oi", "ncV:128:", "avx2")
-
-// GATHER
-TARGET_BUILTIN(__builtin_ia32_gatherd_pd, "V2dV2ddC*V4iV2dIc", "nV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherd_pd256, "V4dV4ddC*V4iV4dIc", "nV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherq_pd, "V2dV2ddC*V2OiV2dIc", "nV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherq_pd256, "V4dV4ddC*V4OiV4dIc", "nV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherd_ps, "V4fV4ffC*V4iV4fIc", "nV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherd_ps256, "V8fV8ffC*V8iV8fIc", "nV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherq_ps, "V4fV4ffC*V2OiV4fIc", "nV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherq_ps256, "V4fV4ffC*V4OiV4fIc", "nV:256:", "avx2")
-
-TARGET_BUILTIN(__builtin_ia32_gatherd_q, "V2OiV2OiOiC*V4iV2OiIc", "nV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherd_q256, "V4OiV4OiOiC*V4iV4OiIc", "nV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherq_q, "V2OiV2OiOiC*V2OiV2OiIc", "nV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherq_q256, "V4OiV4OiOiC*V4OiV4OiIc", "nV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherd_d, "V4iV4iiC*V4iV4iIc", "nV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherd_d256, "V8iV8iiC*V8iV8iIc", "nV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherq_d, "V4iV4iiC*V2OiV4iIc", "nV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherq_d256, "V4iV4iiC*V4OiV4iIc", "nV:256:", "avx2")
-
-// F16C
-TARGET_BUILTIN(__builtin_ia32_vcvtps2ph, "V8sV4fIi", "ncV:128:", "f16c")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2ph256, "V8sV8fIi", "ncV:256:", "f16c")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2ps, "V4fV8s", "ncV:128:", "f16c")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2ps256, "V8fV8s", "ncV:256:", "f16c")
-
-// RDRAND
-TARGET_BUILTIN(__builtin_ia32_rdrand16_step, "UiUs*", "n", "rdrnd")
-TARGET_BUILTIN(__builtin_ia32_rdrand32_step, "UiUi*", "n", "rdrnd")
-
-// FXSR
-TARGET_BUILTIN(__builtin_ia32_fxrstor, "vv*", "n", "fxsr")
-TARGET_BUILTIN(__builtin_ia32_fxsave, "vv*", "n", "fxsr")
-
-// XSAVE
-TARGET_BUILTIN(__builtin_ia32_xsave, "vv*UOi", "n", "xsave")
-TARGET_BUILTIN(__builtin_ia32_xrstor, "vv*UOi", "n", "xsave")
-TARGET_BUILTIN(__builtin_ia32_xgetbv, "UOiUi", "n", "xsave")
-TARGET_HEADER_BUILTIN(_xgetbv, "UWiUi", "nh", IMMINTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_BUILTIN(__builtin_ia32_xsetbv, "vUiUOi", "n", "xsave")
-TARGET_HEADER_BUILTIN(_xsetbv, "vUiUWi", "nh", IMMINTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_BUILTIN(__builtin_ia32_xsaveopt, "vv*UOi", "n", "xsaveopt")
-TARGET_BUILTIN(__builtin_ia32_xrstors, "vv*UOi", "n", "xsaves")
-TARGET_BUILTIN(__builtin_ia32_xsavec, "vv*UOi", "n", "xsavec")
-TARGET_BUILTIN(__builtin_ia32_xsaves, "vv*UOi", "n", "xsaves")
-
-// SHSTK
-TARGET_BUILTIN(__builtin_ia32_incsspd, "vUi", "n", "shstk")
-TARGET_BUILTIN(__builtin_ia32_rdsspd, "UiUi", "n", "shstk")
-TARGET_BUILTIN(__builtin_ia32_saveprevssp, "v", "n", "shstk")
-TARGET_BUILTIN(__builtin_ia32_rstorssp, "vv*", "n", "shstk")
-TARGET_BUILTIN(__builtin_ia32_wrssd, "vUiv*", "n", "shstk")
-TARGET_BUILTIN(__builtin_ia32_wrussd, "vUiv*", "n", "shstk")
-TARGET_BUILTIN(__builtin_ia32_setssbsy, "v", "n", "shstk")
-TARGET_BUILTIN(__builtin_ia32_clrssbsy, "vv*", "n", "shstk")
-
-//CLFLUSHOPT
-TARGET_BUILTIN(__builtin_ia32_clflushopt, "vvC*", "n", "clflushopt")
-
-//CLWB
-TARGET_BUILTIN(__builtin_ia32_clwb, "vvC*", "n", "clwb")
-
-//WB[NO]INVD
-TARGET_BUILTIN(__builtin_ia32_wbinvd, "v", "n", "")
-TARGET_BUILTIN(__builtin_ia32_wbnoinvd, "v", "n", "wbnoinvd")
-
-// ADX
-TARGET_BUILTIN(__builtin_ia32_addcarryx_u32, "UcUcUiUiUi*", "nE", "")
-TARGET_BUILTIN(__builtin_ia32_subborrow_u32, "UcUcUiUiUi*", "nE", "")
-
-// RDSEED
-TARGET_BUILTIN(__builtin_ia32_rdseed16_step, "UiUs*", "n", "rdseed")
-TARGET_BUILTIN(__builtin_ia32_rdseed32_step, "UiUi*", "n", "rdseed")
-
-// LZCNT
-TARGET_BUILTIN(__builtin_ia32_lzcnt_u16, "UsUs", "ncE", "lzcnt")
-TARGET_BUILTIN(__builtin_ia32_lzcnt_u32, "UiUi", "ncE", "lzcnt")
-
-// BMI
-TARGET_BUILTIN(__builtin_ia32_bextr_u32, "UiUiUi", "ncE", "bmi")
-TARGET_BUILTIN(__builtin_ia32_tzcnt_u16, "UsUs", "ncE", "")
-TARGET_BUILTIN(__builtin_ia32_tzcnt_u32, "UiUi", "ncE", "")
-
-// BMI2
-TARGET_BUILTIN(__builtin_ia32_bzhi_si, "UiUiUi", "ncE", "bmi2")
-TARGET_BUILTIN(__builtin_ia32_pdep_si, "UiUiUi", "ncE", "bmi2")
-TARGET_BUILTIN(__builtin_ia32_pext_si, "UiUiUi", "ncE", "bmi2")
-
-// TBM
-TARGET_BUILTIN(__builtin_ia32_bextri_u32, "UiUiIUi", "ncE", "tbm")
-
-// LWP
-TARGET_BUILTIN(__builtin_ia32_llwpcb, "vv*", "n", "lwp")
-TARGET_BUILTIN(__builtin_ia32_slwpcb, "v*", "n", "lwp")
-TARGET_BUILTIN(__builtin_ia32_lwpins32, "UcUiUiIUi", "n", "lwp")
-TARGET_BUILTIN(__builtin_ia32_lwpval32, "vUiUiIUi", "n", "lwp")
-
-// SHA
-TARGET_BUILTIN(__builtin_ia32_sha1rnds4, "V4iV4iV4iIc", "ncV:128:", "sha")
-TARGET_BUILTIN(__builtin_ia32_sha1nexte, "V4iV4iV4i", "ncV:128:", "sha")
-TARGET_BUILTIN(__builtin_ia32_sha1msg1, "V4iV4iV4i", "ncV:128:", "sha")
-TARGET_BUILTIN(__builtin_ia32_sha1msg2, "V4iV4iV4i", "ncV:128:", "sha")
-TARGET_BUILTIN(__builtin_ia32_sha256rnds2, "V4iV4iV4iV4i", "ncV:128:", "sha")
-TARGET_BUILTIN(__builtin_ia32_sha256msg1, "V4iV4iV4i", "ncV:128:", "sha")
-TARGET_BUILTIN(__builtin_ia32_sha256msg2, "V4iV4iV4i", "ncV:128:", "sha")
-
-// FMA
-TARGET_BUILTIN(__builtin_ia32_vfmaddps, "V4fV4fV4fV4f", "ncV:128:", "fma|fma4")
-TARGET_BUILTIN(__builtin_ia32_vfmaddpd, "V2dV2dV2dV2d", "ncV:128:", "fma|fma4")
-TARGET_BUILTIN(__builtin_ia32_vfmaddss3, "V4fV4fV4fV4f", "ncV:128:", "fma")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsd3, "V2dV2dV2dV2d", "ncV:128:", "fma")
-TARGET_BUILTIN(__builtin_ia32_vfmaddss, "V4fV4fV4fV4f", "ncV:128:", "fma4")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsd, "V2dV2dV2dV2d", "ncV:128:", "fma4")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubps, "V4fV4fV4fV4f", "ncV:128:", "fma|fma4")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd, "V2dV2dV2dV2d", "ncV:128:", "fma|fma4")
-TARGET_BUILTIN(__builtin_ia32_vfmaddps256, "V8fV8fV8fV8f", "ncV:256:", "fma|fma4")
-TARGET_BUILTIN(__builtin_ia32_vfmaddpd256, "V4dV4dV4dV4d", "ncV:256:", "fma|fma4")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubps256, "V8fV8fV8fV8f", "ncV:256:", "fma|fma4")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd256, "V4dV4dV4dV4d", "ncV:256:", "fma|fma4")
-
-TARGET_BUILTIN(__builtin_ia32_vfmaddpd512_mask, "V8dV8dV8dV8dUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddpd512_maskz, "V8dV8dV8dV8dUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddpd512_mask3, "V8dV8dV8dV8dUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmsubpd512_mask3, "V8dV8dV8dV8dUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddps512_mask, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddps512_maskz, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddps512_mask3, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmsubps512_mask3, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd512_mask, "V8dV8dV8dV8dUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd512_maskz, "V8dV8dV8dV8dUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd512_mask3, "V8dV8dV8dV8dUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmsubaddpd512_mask3, "V8dV8dV8dV8dUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubps512_mask, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubps512_maskz, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubps512_mask3, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmsubaddps512_mask3, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512f,evex512")
-
-// XOP
-TARGET_BUILTIN(__builtin_ia32_vpmacssww, "V8sV8sV8sV8s", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpmacsww, "V8sV8sV8sV8s", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpmacsswd, "V4iV8sV8sV4i", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpmacswd, "V4iV8sV8sV4i", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpmacssdd, "V4iV4iV4iV4i", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpmacsdd, "V4iV4iV4iV4i", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpmacssdql, "V2OiV4iV4iV2Oi", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpmacsdql, "V2OiV4iV4iV2Oi", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpmacssdqh, "V2OiV4iV4iV2Oi", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpmacsdqh, "V2OiV4iV4iV2Oi", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpmadcsswd, "V4iV8sV8sV4i", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpmadcswd, "V4iV8sV8sV4i", "ncV:128:", "xop")
-
-TARGET_BUILTIN(__builtin_ia32_vphaddbw, "V8sV16c", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphaddbd, "V4iV16c", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphaddbq, "V2OiV16c", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphaddwd, "V4iV8s", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphaddwq, "V2OiV8s", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphadddq, "V2OiV4i", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphaddubw, "V8sV16c", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphaddubd, "V4iV16c", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphaddubq, "V2OiV16c", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphadduwd, "V4iV8s", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphadduwq, "V2OiV8s", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphaddudq, "V2OiV4i", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphsubbw, "V8sV16c", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphsubwd, "V4iV8s", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphsubdq, "V2OiV4i", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpperm, "V16cV16cV16cV16c", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vprotb, "V16cV16cV16c", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vprotw, "V8sV8sV8s", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vprotd, "V4iV4iV4i", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vprotq, "V2OiV2OiV2Oi", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vprotbi, "V16cV16cIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vprotwi, "V8sV8sIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vprotdi, "V4iV4iIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vprotqi, "V2OiV2OiIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpshlb, "V16cV16cV16c", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpshlw, "V8sV8sV8s", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpshld, "V4iV4iV4i", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpshlq, "V2OiV2OiV2Oi", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpshab, "V16cV16cV16c", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpshaw, "V8sV8sV8s", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpshad, "V4iV4iV4i", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpshaq, "V2OiV2OiV2Oi", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpcomub, "V16cV16cV16cIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpcomuw, "V8sV8sV8sIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpcomud, "V4iV4iV4iIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpcomuq, "V2OiV2OiV2OiIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpcomb, "V16cV16cV16cIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpcomw, "V8sV8sV8sIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpcomd, "V4iV4iV4iIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpcomq, "V2OiV2OiV2OiIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpermil2pd, "V2dV2dV2dV2OiIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpermil2pd256, "V4dV4dV4dV4OiIc", "ncV:256:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpermil2ps, "V4fV4fV4fV4iIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpermil2ps256, "V8fV8fV8fV8iIc", "ncV:256:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vfrczss, "V4fV4f", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vfrczsd, "V2dV2d", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vfrczps, "V4fV4f", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vfrczpd, "V2dV2d", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vfrczps256, "V8fV8f", "ncV:256:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vfrczpd256, "V4dV4d", "ncV:256:", "xop")
-
-TARGET_BUILTIN(__builtin_ia32_xbegin, "i", "n", "rtm")
-TARGET_BUILTIN(__builtin_ia32_xend, "v", "n", "rtm")
-TARGET_BUILTIN(__builtin_ia32_xabort, "vIc", "n", "rtm")
-TARGET_BUILTIN(__builtin_ia32_xtest, "i", "n", "rtm")
-
-BUILTIN(__builtin_ia32_rdpmc, "UOii", "")
-BUILTIN(__builtin_ia32_rdtsc, "UOi", "")
-BUILTIN(__rdtsc, "UOi", "")
-BUILTIN(__builtin_ia32_rdtscp, "UOiUi*", "")
-
-TARGET_BUILTIN(__builtin_ia32_rdpid, "Ui", "n", "rdpid")
-TARGET_BUILTIN(__builtin_ia32_rdpru, "ULLii", "n", "rdpru")
-
-// PKU
-TARGET_BUILTIN(__builtin_ia32_rdpkru, "Ui", "n", "pku")
-TARGET_BUILTIN(__builtin_ia32_wrpkru, "vUi", "n", "pku")
-
-// AVX-512
-TARGET_BUILTIN(__builtin_ia32_sqrtpd512, "V8dV8dIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_sqrtps512, "V16fV16fIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_rsqrt14sd_mask, "V2dV2dV2dV2dUc", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_rsqrt14ss_mask, "V4fV4fV4fV4fUc", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_rsqrt14pd512_mask, "V8dV8dV8dUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_rsqrt14ps512_mask, "V16fV16fV16fUs", "ncV:512:", "avx512f,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_rcp14sd_mask, "V2dV2dV2dV2dUc", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_rcp14ss_mask, "V4fV4fV4fV4fUc", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_rcp14pd512_mask, "V8dV8dV8dUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_rcp14ps512_mask, "V16fV16fV16fUs", "ncV:512:", "avx512f,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_cvttps2dq512_mask, "V16iV16fV16iUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvttps2udq512_mask, "V16iV16fV16iUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2dq512_mask, "V8iV8dV8iUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2udq512_mask, "V8iV8dV8iUcIi", "ncV:512:", "avx512f,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_cmpps512_mask,   "UsV16fV16fIiUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cmpps256_mask,   "UcV8fV8fIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cmpps128_mask,   "UcV4fV4fIiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cmppd512_mask, "UcV8dV8dIiUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cmppd256_mask, "UcV4dV4dIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cmppd128_mask, "UcV2dV2dIiUc", "ncV:128:", "avx512vl")
-
-TARGET_BUILTIN(__builtin_ia32_rndscaleps_mask, "V16fV16fIiV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_rndscalepd_mask, "V8dV8dIiV8dUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtps2dq512_mask, "V16iV16fV16iUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2dq512_mask, "V8iV8dV8iUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtps2udq512_mask, "V16iV16fV16iUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2udq512_mask, "V8iV8dV8iUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_minps512, "V16fV16fV16fIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_minpd512, "V8dV8dV8dIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_maxps512, "V16fV16fV16fIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_maxpd512, "V8dV8dV8dIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtdq2ps512_mask, "V16fV16iV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtudq2ps512_mask, "V16fV16iV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2ps512_mask, "V8fV8dV8fUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2ph512_mask, "V16sV16fIiV16sUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2ps512_mask, "V16fV16sV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmuldq512, "V8OiV16iV16i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmuludq512, "V8OiV16iV16i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_loaddqusi512_mask, "V16iiC*V16iUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_loaddqudi512_mask, "V8OiOiC*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_loadups512_mask, "V16ffC*V16fUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_loadaps512_mask, "V16fV16fC*V16fUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_loadupd512_mask, "V8ddC*V8dUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_loadapd512_mask, "V8dV8dC*V8dUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_storedqudi512_mask, "vOi*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_storedqusi512_mask, "vi*V16iUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_storeupd512_mask, "vd*V8dUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_storeapd512_mask, "vV8d*V8dUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_storeups512_mask, "vf*V16fUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_storeaps512_mask, "vV16f*V16fUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_alignq512, "V8OiV8OiV8OiIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_alignd512, "V16iV16iV16iIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_alignd128, "V4iV4iV4iIi", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_alignd256, "V8iV8iV8iIi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_alignq128, "V2OiV2OiV2OiIi", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_alignq256, "V4OiV4OiV4OiIi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_extractf64x4_mask, "V4dV8dIiV4dUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_extractf32x4_mask, "V4fV16fIiV4fUc", "ncV:512:", "avx512f,evex512")
-
-// AVX-VNNI and AVX512-VNNI
-TARGET_BUILTIN(__builtin_ia32_vpdpbusd128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni|avxvnni")
-TARGET_BUILTIN(__builtin_ia32_vpdpbusd256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni")
-TARGET_BUILTIN(__builtin_ia32_vpdpbusd512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpdpbusds128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni|avxvnni")
-TARGET_BUILTIN(__builtin_ia32_vpdpbusds256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni")
-TARGET_BUILTIN(__builtin_ia32_vpdpbusds512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpdpwssd128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni|avxvnni")
-TARGET_BUILTIN(__builtin_ia32_vpdpwssd256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni")
-TARGET_BUILTIN(__builtin_ia32_vpdpwssd512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpdpwssds128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni|avxvnni")
-TARGET_BUILTIN(__builtin_ia32_vpdpwssds256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni")
-TARGET_BUILTIN(__builtin_ia32_vpdpwssds512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni,evex512")
-
-// AVX-VNNI-INT8
-TARGET_BUILTIN(__builtin_ia32_vpdpbssd128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpbssd256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpbssds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpbssds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpbsud128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpbsud256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpbsuds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpbsuds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpbuud128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpbuud256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpbuuds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpbuuds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
-
-// MOVRS
-TARGET_BUILTIN(__builtin_ia32_prefetchrs, "vvC*", "nc", "movrs")
-
-TARGET_BUILTIN(__builtin_ia32_gather3div2df, "V2dV2dvC*V2OiUcIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3div2di, "V2OiV2OivC*V2OiUcIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3div4df, "V4dV4dvC*V4OiUcIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3div4di, "V4OiV4OivC*V4OiUcIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3div4sf, "V4fV4fvC*V2OiUcIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3div4si, "V4iV4ivC*V2OiUcIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3div8sf, "V4fV4fvC*V4OiUcIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3div8si, "V4iV4ivC*V4OiUcIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3siv2df, "V2dV2dvC*V4iUcIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3siv2di, "V2OiV2OivC*V4iUcIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3siv4df, "V4dV4dvC*V4iUcIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3siv4di, "V4OiV4OivC*V4iUcIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3siv4sf, "V4fV4fvC*V4iUcIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3siv4si, "V4iV4ivC*V4iUcIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3siv8sf, "V8fV8fvC*V8iUcIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3siv8si, "V8iV8ivC*V8iUcIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gathersiv8df, "V8dV8dvC*V8iUcIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_gathersiv16sf, "V16fV16fvC*V16iUsIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_gatherdiv8df, "V8dV8dvC*V8OiUcIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_gatherdiv16sf, "V8fV8fvC*V8OiUcIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_gathersiv8di, "V8OiV8OivC*V8iUcIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_gathersiv16si, "V16iV16ivC*V16iUsIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_gatherdiv8di, "V8OiV8OivC*V8OiUcIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_gatherdiv16si, "V8iV8ivC*V8OiUcIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_scattersiv8df, "vv*UcV8iV8dIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_scattersiv16sf, "vv*UsV16iV16fIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv8df,  "vv*UcV8OiV8dIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv16sf, "vv*UcV8OiV8fIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_scattersiv8di,  "vv*UcV8iV8OiIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_scattersiv16si, "vv*UsV16iV16iIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv8di,  "vv*UcV8OiV8OiIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv16si, "vv*UcV8OiV8iIi", "nV:512:", "avx512f,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_knotqi, "UcUc", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_knothi, "UsUs", "nc", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_knotsi, "UiUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_knotdi, "UOiUOi", "nc", "avx512bw")
-
-TARGET_BUILTIN(__builtin_ia32_cmpb128_mask, "UsV16cV16cIiUs", "ncV:128:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_cmpd128_mask, "UcV4iV4iIiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cmpq128_mask, "UcV2OiV2OiIiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cmpw128_mask, "UcV8sV8sIiUc", "ncV:128:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_cmpb256_mask, "UiV32cV32cIiUi", "ncV:256:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_cmpd256_mask, "UcV8iV8iIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cmpq256_mask, "UcV4OiV4OiIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cmpw256_mask, "UsV16sV16sIiUs", "ncV:256:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_cmpb512_mask, "UOiV64cV64cIiUOi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_cmpd512_mask, "UsV16iV16iIiUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cmpq512_mask, "UcV8OiV8OiIiUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cmpw512_mask, "UiV32sV32sIiUi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_ucmpb128_mask, "UsV16cV16cIiUs", "ncV:128:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_ucmpd128_mask, "UcV4iV4iIiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_ucmpq128_mask, "UcV2OiV2OiIiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_ucmpw128_mask, "UcV8sV8sIiUc", "ncV:128:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_ucmpb256_mask, "UiV32cV32cIiUi", "ncV:256:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_ucmpd256_mask, "UcV8iV8iIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_ucmpq256_mask, "UcV4OiV4OiIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_ucmpw256_mask, "UsV16sV16sIiUs", "ncV:256:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_ucmpb512_mask, "UOiV64cV64cIiUOi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_ucmpd512_mask, "UsV16iV16iIiUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_ucmpq512_mask, "UcV8OiV8OiIiUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_ucmpw512_mask, "UiV32sV32sIiUi", "ncV:512:", "avx512bw,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_packssdw512, "V32sV16iV16i", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_packsswb512, "V64cV32sV32s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_packusdw512, "V32sV16iV16i", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_packuswb512, "V64cV32sV32s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_pavgb512, "V64cV64cV64c", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_pavgw512, "V32sV32sV32s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_pshufb512, "V64cV64cV64c", "ncV:512:", "avx512bw,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_vpconflictdi_128, "V2OiV2Oi", "ncV:128:", "avx512cd,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpconflictdi_256, "V4OiV4Oi", "ncV:256:", "avx512cd,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpconflictsi_128, "V4iV4i", "ncV:128:", "avx512cd,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpconflictsi_256, "V8iV8i", "ncV:256:", "avx512cd,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpconflictdi_512, "V8OiV8Oi", "ncV:512:", "avx512cd,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpconflictsi_512, "V16iV16i", "ncV:512:", "avx512cd,evex512")
-TARGET_BUILTIN(__builtin_ia32_vplzcntd_512, "V16iV16i", "ncV:512:", "avx512cd,evex512")
-TARGET_BUILTIN(__builtin_ia32_vplzcntq_512, "V8OiV8Oi", "ncV:512:", "avx512cd,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_vpshufbitqmb128_mask, "UsV16cV16cUs", "ncV:128:", "avx512vl,avx512bitalg")
-TARGET_BUILTIN(__builtin_ia32_vpshufbitqmb256_mask, "UiV32cV32cUi", "ncV:256:", "avx512vl,avx512bitalg")
-TARGET_BUILTIN(__builtin_ia32_vpshufbitqmb512_mask, "UOiV64cV64cUOi", "ncV:512:", "avx512bitalg,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_pmulhrsw512, "V32sV32sV32s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmulhuw512, "V32sV32sV32s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmulhw512, "V32sV32sV32s", "ncV:512:", "avx512bw,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_addpd512, "V8dV8dV8dIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_addps512, "V16fV16fV16fIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_divpd512, "V8dV8dV8dIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_divps512, "V16fV16fV16fIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_mulpd512, "V8dV8dV8dIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_mulps512, "V16fV16fV16fIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_subpd512, "V8dV8dV8dIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_subps512, "V16fV16fV16fIi", "ncV:512:", "avx512f,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_pmaddubsw512, "V32sV64cV64c", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmaddwd512, "V16iV32sV32s", "ncV:512:", "avx512bw,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_addss_round_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_divss_round_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_mulss_round_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_subss_round_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_maxss_round_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_minss_round_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_addsd_round_mask, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_divsd_round_mask, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_mulsd_round_mask, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_subsd_round_mask, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_maxsd_round_mask, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_minsd_round_mask, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512f")
-
-TARGET_BUILTIN(__builtin_ia32_compressdf128_mask, "V2dV2dV2dUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_compressdf256_mask, "V4dV4dV4dUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_compressdi128_mask, "V2OiV2OiV2OiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_compressdi256_mask, "V4OiV4OiV4OiUc", "ncV:256:", "avx512vl")
-
-TARGET_BUILTIN(__builtin_ia32_compresshi128_mask, "V8sV8sV8sUc", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_compresshi256_mask, "V16sV16sV16sUs", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_compressqi128_mask, "V16cV16cV16cUs", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_compressqi256_mask, "V32cV32cV32cUi", "ncV:256:", "avx512vl,avx512vbmi2")
-
-TARGET_BUILTIN(__builtin_ia32_compresssf128_mask, "V4fV4fV4fUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_compresssf256_mask, "V8fV8fV8fUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_compresssi128_mask, "V4iV4iV4iUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_compresssi256_mask, "V8iV8iV8iUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_compressstoredf128_mask, "vV2d*V2dUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_compressstoredf256_mask, "vV4d*V4dUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_compressstoredi128_mask, "vV2Oi*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_compressstoredi256_mask, "vV4Oi*V4OiUc", "nV:256:", "avx512vl")
-
-TARGET_BUILTIN(__builtin_ia32_compressstorehi128_mask, "vV8s*V8sUc", "nV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_compressstorehi256_mask, "vV16s*V16sUs", "nV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_compressstoreqi128_mask, "vV16c*V16cUs", "nV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_compressstoreqi256_mask, "vV32c*V32cUi", "nV:256:", "avx512vl,avx512vbmi2")
-
-TARGET_BUILTIN(__builtin_ia32_compressstoresf128_mask, "vV4f*V4fUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_compressstoresf256_mask, "vV8f*V8fUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_compressstoresi128_mask, "vV4i*V4iUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_compressstoresi256_mask, "vV8i*V8iUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2dq128_mask, "V4iV2dV4iUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2ps_mask, "V4fV2dV4fUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2udq128_mask, "V4iV2dV4iUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2udq256_mask, "V4iV4dV4iUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtps2udq128_mask, "V4iV4fV4iUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtps2udq256_mask, "V8iV8fV8iUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2dq128_mask, "V4iV2dV4iUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2udq128_mask, "V4iV2dV4iUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2udq256_mask, "V4iV4dV4iUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvttps2udq128_mask, "V4iV4fV4iUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvttps2udq256_mask, "V8iV8fV8iUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expanddf128_mask, "V2dV2dV2dUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expanddf256_mask, "V4dV4dV4dUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expanddi128_mask, "V2OiV2OiV2OiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expanddi256_mask, "V4OiV4OiV4OiUc", "ncV:256:", "avx512vl")
-
-TARGET_BUILTIN(__builtin_ia32_expandhi128_mask, "V8sV8sV8sUc", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_expandhi256_mask, "V16sV16sV16sUs", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_expandqi128_mask, "V16cV16cV16cUs", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_expandqi256_mask, "V32cV32cV32cUi", "ncV:256:", "avx512vl,avx512vbmi2")
-
-TARGET_BUILTIN(__builtin_ia32_expandloaddf128_mask, "V2dV2dC*V2dUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expandloaddf256_mask, "V4dV4dC*V4dUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expandloaddi128_mask, "V4iV2OiC*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expandloaddi256_mask, "V4OiV4OiC*V4OiUc", "nV:256:", "avx512vl")
-
-TARGET_BUILTIN(__builtin_ia32_expandloadhi128_mask, "V8sV8sC*V8sUc", "nV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_expandloadhi256_mask, "V16sV16sC*V16sUs", "nV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_expandloadqi128_mask, "V16cV16cC*V16cUs", "nV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_expandloadqi256_mask, "V32cV32cC*V32cUi", "nV:256:", "avx512vl,avx512vbmi2")
-
-TARGET_BUILTIN(__builtin_ia32_expandloadsf128_mask, "V4fV4fC*V4fUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expandloadsf256_mask, "V8fV8fC*V8fUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expandloadsi128_mask, "V4iV4iC*V4iUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expandloadsi256_mask, "V8iV8iC*V8iUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expandsf128_mask, "V4fV4fV4fUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expandsf256_mask, "V8fV8fV8fUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expandsi128_mask, "V4iV4iV4iUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expandsi256_mask, "V8iV8iV8iUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_getexppd128_mask, "V2dV2dV2dUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_getexppd256_mask, "V4dV4dV4dUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_getexpps128_mask, "V4fV4fV4fUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_getexpps256_mask, "V8fV8fV8fUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rndscalepd_128_mask, "V2dV2dIiV2dUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rndscalepd_256_mask, "V4dV4dIiV4dUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rndscaleps_128_mask, "V4fV4fIiV4fUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rndscaleps_256_mask, "V8fV8fIiV8fUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scalefpd128_mask, "V2dV2dV2dV2dUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scalefpd256_mask, "V4dV4dV4dV4dUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scalefps128_mask, "V4fV4fV4fV4fUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scalefps256_mask, "V8fV8fV8fV8fUc", "ncV:256:", "avx512vl")
-
-TARGET_BUILTIN(__builtin_ia32_scatterdiv2df, "vv*UcV2OiV2dIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv2di, "vv*UcV2OiV2OiIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv4df, "vv*UcV4OiV4dIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv4di, "vv*UcV4OiV4OiIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv4sf, "vv*UcV2OiV4fIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv4si, "vv*UcV2OiV4iIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv8sf, "vv*UcV4OiV4fIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv8si, "vv*UcV4OiV4iIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scattersiv2df, "vv*UcV4iV2dIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scattersiv2di, "vv*UcV4iV2OiIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scattersiv4df, "vv*UcV4iV4dIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scattersiv4di, "vv*UcV4iV4OiIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scattersiv4sf, "vv*UcV4iV4fIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scattersiv4si, "vv*UcV4iV4iIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scattersiv8sf, "vv*UcV8iV8fIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scattersiv8si, "vv*UcV8iV8iIi", "nV:256:", "avx512vl")
-
-TARGET_BUILTIN(__builtin_ia32_vpermi2vard128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpermi2vard256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpermi2vard512, "V16iV16iV16iV16i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varpd128, "V2dV2dV2OiV2d", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varpd256, "V4dV4dV4OiV4d", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varpd512, "V8dV8dV8OiV8d", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varps128, "V4fV4fV4iV4f", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varps256, "V8fV8fV8iV8f", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varps512, "V16fV16fV16iV16f", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varq128, "V2OiV2OiV2OiV2Oi", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varq256, "V4OiV4OiV4OiV4Oi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varq512, "V8OiV8OiV8OiV8Oi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varqi128, "V16cV16cV16cV16c", "ncV:128:", "avx512vbmi,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varqi256, "V32cV32cV32cV32c", "ncV:256:", "avx512vbmi,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varqi512, "V64cV64cV64cV64c", "ncV:512:", "avx512vbmi,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varhi128, "V8sV8sV8sV8s", "ncV:128:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varhi256, "V16sV16sV16sV16s", "ncV:256:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varhi512, "V32sV32sV32sV32s", "ncV:512:", "avx512bw,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_vpshldd128, "V4iV4iV4iIi", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshldd256, "V8iV8iV8iIi", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshldd512, "V16iV16iV16iIi", "ncV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpshldq128, "V2OiV2OiV2OiIi", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshldq256, "V4OiV4OiV4OiIi", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshldq512, "V8OiV8OiV8OiIi", "ncV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpshldw128, "V8sV8sV8sIi", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshldw256, "V16sV16sV16sIi", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshldw512, "V32sV32sV32sIi", "ncV:512:", "avx512vbmi2,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_vpshldvd128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshldvd256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshldvd512, "V16iV16iV16iV16i", "ncV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpshldvq128, "V2OiV2OiV2OiV2Oi", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshldvq256, "V4OiV4OiV4OiV4Oi", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshldvq512, "V8OiV8OiV8OiV8Oi", "ncV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpshldvw128, "V8sV8sV8sV8s", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshldvw256, "V16sV16sV16sV16s", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshldvw512, "V32sV32sV32sV32s", "ncV:512:", "avx512vbmi2,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_vpshrdvd128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdvd256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdvd512, "V16iV16iV16iV16i", "ncV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpshrdvq128, "V2OiV2OiV2OiV2Oi", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdvq256, "V4OiV4OiV4OiV4Oi", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdvq512, "V8OiV8OiV8OiV8Oi", "ncV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpshrdvw128, "V8sV8sV8sV8s", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdvw256, "V16sV16sV16sV16s", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdvw512, "V32sV32sV32sV32s", "ncV:512:", "avx512vbmi2,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_vpshrdd128, "V4iV4iV4iIi", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdd256, "V8iV8iV8iIi", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdd512, "V16iV16iV16iIi", "ncV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpshrdq128, "V2OiV2OiV2OiIi", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdq256, "V4OiV4OiV4OiIi", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdq512, "V8OiV8OiV8OiIi", "ncV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpshrdw128, "V8sV8sV8sIi", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdw256, "V16sV16sV16sIi", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdw512, "V32sV32sV32sIi", "ncV:512:", "avx512vbmi2,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_pmovswb512_mask, "V32cV32sV32cUi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovuswb512_mask, "V32cV32sV32cUi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovwb512_mask, "V32cV32sV32cUi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2qq128_mask, "V2OiV2dV2OiUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2qq256_mask, "V4OiV4dV4OiUc", "ncV:256:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2uqq128_mask, "V2OiV2dV2OiUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2uqq256_mask, "V4OiV4dV4OiUc", "ncV:256:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvtps2qq128_mask, "V2OiV4fV2OiUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvtps2qq256_mask, "V4OiV4fV4OiUc", "ncV:256:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvtps2uqq128_mask, "V2OiV4fV2OiUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvtps2uqq256_mask, "V4OiV4fV4OiUc", "ncV:256:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvtqq2ps128_mask, "V4fV2OiV4fUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2qq128_mask, "V2OiV2dV2OiUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2qq256_mask, "V4OiV4dV4OiUc", "ncV:256:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2uqq128_mask, "V2OiV2dV2OiUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2uqq256_mask, "V4OiV4dV4OiUc", "ncV:256:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvttps2qq128_mask, "V2OiV4fV2OiUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvttps2qq256_mask, "V4OiV4fV4OiUc", "ncV:256:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvttps2uqq128_mask, "V2OiV4fV2OiUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvttps2uqq256_mask, "V4OiV4fV4OiUc", "ncV:256:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvtuqq2ps128_mask, "V4fV2OiV4fUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_rangepd128_mask, "V2dV2dV2dIiV2dUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_rangepd256_mask, "V4dV4dV4dIiV4dUc", "ncV:256:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_rangeps128_mask, "V4fV4fV4fIiV4fUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_rangeps256_mask, "V8fV8fV8fIiV8fUc", "ncV:256:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_rangesd128_round_mask, "V2dV2dV2dV2dUcIiIi", "ncV:128:", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_rangess128_round_mask, "V4fV4fV4fV4fUcIiIi", "ncV:128:", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_reducepd128_mask, "V2dV2dIiV2dUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_reducepd256_mask, "V4dV4dIiV4dUc", "ncV:256:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_reduceps128_mask, "V4fV4fIiV4fUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_reduceps256_mask, "V8fV8fIiV8fUc", "ncV:256:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_reducesd_mask, "V2dV2dV2dV2dUcIiIi", "ncV:128:", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_reducess_mask, "V4fV4fV4fV4fUcIiIi", "ncV:128:", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_pmovswb128_mask, "V16cV8sV16cUc", "ncV:128:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pmovswb256_mask, "V16cV16sV16cUs", "ncV:256:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pmovuswb128_mask, "V16cV8sV16cUc", "ncV:128:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pmovuswb256_mask, "V16cV16sV16cUs", "ncV:256:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pmovwb128_mask, "V16cV8sV16cUc", "ncV:128:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2qq512_mask, "V8OiV8dV8OiUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2uqq512_mask, "V8OiV8dV8OiUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtps2qq512_mask, "V8OiV8fV8OiUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtps2uqq512_mask, "V8OiV8fV8OiUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtqq2pd512_mask, "V8dV8OiV8dUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtqq2ps512_mask, "V8fV8OiV8fUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2qq512_mask, "V8OiV8dV8OiUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2uqq512_mask, "V8OiV8dV8OiUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvttps2qq512_mask, "V8OiV8fV8OiUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvttps2uqq512_mask, "V8OiV8fV8OiUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtuqq2pd512_mask, "V8dV8OiV8dUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtuqq2ps512_mask, "V8fV8OiV8fUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_rangepd512_mask, "V8dV8dV8dIiV8dUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_rangeps512_mask, "V16fV16fV16fIiV16fUsIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_reducepd512_mask, "V8dV8dIiV8dUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduceps512_mask, "V16fV16fIiV16fUsIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_prold512, "V16iV16iIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_prolq512, "V8OiV8OiIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_prold128, "V4iV4iIi", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prold256, "V8iV8iIi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prolq128, "V2OiV2OiIi", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prolq256, "V4OiV4OiIi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prolvd512, "V16iV16iV16i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_prolvq512, "V8OiV8OiV8Oi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_prord512, "V16iV16iIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_prorq512, "V8OiV8OiIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_prolvd128, "V4iV4iV4i", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prolvd256, "V8iV8iV8i", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prolvq128, "V2OiV2OiV2Oi", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prolvq256, "V4OiV4OiV4Oi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prord128, "V4iV4iIi", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prord256, "V8iV8iIi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prorq128, "V2OiV2OiIi", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prorq256, "V4OiV4OiIi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prorvd512, "V16iV16iV16i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_prorvq512, "V8OiV8OiV8Oi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_prorvd128, "V4iV4iV4i", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prorvd256, "V8iV8iV8i", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prorvq128, "V2OiV2OiV2Oi", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prorvq256, "V4OiV4OiV4Oi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pshufhw512, "V32sV32sIi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_pshuflw512, "V32sV32sIi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_psllv32hi, "V32sV32sV32s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_psllw512, "V32sV32sV8s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_psllwi512, "V32sV32si", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_psllv16hi, "V16sV16sV16s", "ncV:256:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_psllv8hi, "V8sV8sV8s", "ncV:128:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pslldi512, "V16iV16ii", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psllqi512, "V8OiV8Oii", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrlv32hi, "V32sV32sV32s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrlv16hi, "V16sV16sV16s", "ncV:256:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_psrlv8hi, "V8sV8sV8s", "ncV:128:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_psrldi512, "V16iV16ii", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrlqi512, "V8OiV8Oii", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrav32hi, "V32sV32sV32s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrav16hi, "V16sV16sV16s", "ncV:256:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_psrav8hi, "V8sV8sV8s", "ncV:128:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_psravq128, "V2OiV2OiV2Oi", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_psravq256, "V4OiV4OiV4Oi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_psraw512, "V32sV32sV8s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrawi512, "V32sV32si", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrlw512, "V32sV32sV8s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrlwi512, "V32sV32si", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_pslldqi512_byteshift, "V8OiV8OiIi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrldqi512_byteshift, "V8OiV8OiIi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_movdqa32load128_mask, "V4iV4iC*V4iUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_movdqa32load256_mask, "V8iV8iC*V8iUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_movdqa32load512_mask, "V16iV16iC*V16iUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_movdqa32store512_mask, "vV16i*V16iUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_movdqa64load512_mask, "V8OiV8OiC*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_movdqa64store512_mask, "vV8Oi*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_movdqa32store128_mask, "vV4i*V4iUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_movdqa32store256_mask, "vV8i*V8iUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_movdqa64load128_mask, "V2OiV2OiC*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_movdqa64load256_mask, "V4OiV4OiC*V4OiUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_movdqa64store128_mask, "vV2Oi*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_movdqa64store256_mask, "vV4Oi*V4OiUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpmadd52huq512, "V8OiV8OiV8OiV8Oi", "ncV:512:", "avx512ifma,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpmadd52luq512, "V8OiV8OiV8OiV8Oi", "ncV:512:", "avx512ifma,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpmadd52huq128, "V2OiV2OiV2OiV2Oi", "ncV:128:", "avx512ifma,avx512vl|avxifma")
-TARGET_BUILTIN(__builtin_ia32_vpmadd52huq256, "V4OiV4OiV4OiV4Oi", "ncV:256:", "avx512ifma,avx512vl|avxifma")
-TARGET_BUILTIN(__builtin_ia32_vpmadd52luq128, "V2OiV2OiV2OiV2Oi", "ncV:128:", "avx512ifma,avx512vl|avxifma")
-TARGET_BUILTIN(__builtin_ia32_vpmadd52luq256, "V4OiV4OiV4OiV4Oi", "ncV:256:", "avx512ifma,avx512vl|avxifma")
-TARGET_BUILTIN(__builtin_ia32_vcomisd, "iV2dV2dIiIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vcomiss, "iV4fV4fIiIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_kunpckdi, "UOiUOiUOi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kunpcksi, "UiUiUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_loaddquhi512_mask, "V32sV32sC*V32sUi", "nV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_loaddquqi512_mask, "V64cV64cC*V64cUOi", "nV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_fixupimmpd512_mask, "V8dV8dV8dV8OiIiUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_fixupimmpd512_maskz, "V8dV8dV8dV8OiIiUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_fixupimmps512_mask, "V16fV16fV16fV16iIiUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_fixupimmps512_maskz, "V16fV16fV16fV16iIiUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_fixupimmsd_mask, "V2dV2dV2dV2OiIiUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_fixupimmsd_maskz, "V2dV2dV2dV2OiIiUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_fixupimmss_mask, "V4fV4fV4fV4iIiUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_fixupimmss_maskz, "V4fV4fV4fV4iIiUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_getexpsd128_round_mask, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_getexpss128_round_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_getmantsd_round_mask, "V2dV2dV2dIiV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_getmantss_round_mask, "V4fV4fV4fIiV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_loaddquhi128_mask, "V8sV8sC*V8sUc", "nV:128:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loaddquhi256_mask, "V16sV16sC*V16sUs", "nV:256:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loaddquqi128_mask, "V16cV16cC*V16cUs", "nV:128:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loaddquqi256_mask, "V32cV32cC*V32cUi", "nV:256:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fixupimmpd128_mask, "V2dV2dV2dV2OiIiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fixupimmpd128_maskz, "V2dV2dV2dV2OiIiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fixupimmpd256_mask, "V4dV4dV4dV4OiIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fixupimmpd256_maskz, "V4dV4dV4dV4OiIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fixupimmps128_mask, "V4fV4fV4fV4iIiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fixupimmps128_maskz, "V4fV4fV4fV4iIiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fixupimmps256_mask, "V8fV8fV8fV8iIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fixupimmps256_maskz, "V8fV8fV8fV8iIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loadapd128_mask, "V2dV2dC*V2dUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loadsd128_mask, "V2dV2dC*V2dUc", "nV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_loadapd256_mask, "V4dV4dC*V4dUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loadaps128_mask, "V4fV4fC*V4fUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loadss128_mask, "V4fV4fC*V4fUc", "nV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_loadaps256_mask, "V8fV8fC*V8fUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loaddqudi128_mask, "V2OiV2OiC*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loaddqudi256_mask, "V4OiV4OiC*V4OiUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loaddqusi128_mask, "V4iV4iC*V4iUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loaddqusi256_mask, "V8iV8iC*V8iUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loadupd128_mask, "V2dV2dC*V2dUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loadupd256_mask, "V4dV4dC*V4dUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loadups128_mask, "V4fV4fC*V4fUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loadups256_mask, "V8fV8fC*V8fUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_storedquhi512_mask, "vV32s*V32sUi", "nV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_storedquqi512_mask, "vV64c*V64cUOi", "nV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_storedquhi128_mask, "vV8s*V8sUc", "nV:128:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_storedquhi256_mask, "vV16s*V16sUs", "nV:256:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_storedquqi128_mask, "vV16c*V16cUs", "nV:128:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_storedquqi256_mask, "vV32c*V32cUi", "nV:256:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_storeapd128_mask, "vV2d*V2dUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_storesd128_mask, "vV2d*V2dUc", "nV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_storeapd256_mask, "vV4d*V4dUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_storeaps128_mask, "vV4f*V4fUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_storess128_mask, "vV4f*V4fUc", "nV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_storeaps256_mask, "vV8f*V8fUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_storedqudi128_mask, "vV2Oi*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_storedqudi256_mask, "vV4Oi*V4OiUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_storedqusi128_mask, "vV4i*V4iUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_storedqusi256_mask, "vV8i*V8iUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_storeupd128_mask, "vV2d*V2dUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_storeupd256_mask, "vV4d*V4dUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_storeups128_mask, "vV4f*V4fUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_storeups256_mask, "vV8f*V8fUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rcp14pd128_mask, "V2dV2dV2dUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rcp14pd256_mask, "V4dV4dV4dUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rcp14ps128_mask, "V4fV4fV4fUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rcp14ps256_mask, "V8fV8fV8fUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vplzcntd_128, "V4iV4i", "ncV:128:", "avx512cd,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vplzcntd_256, "V8iV8i", "ncV:256:", "avx512cd,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vplzcntq_128, "V2OiV2Oi", "ncV:128:", "avx512cd,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vplzcntq_256, "V4OiV4Oi", "ncV:256:", "avx512cd,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtsd2si32, "iV2dIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vcvtsd2usi32, "UiV2dIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vcvtss2si32, "iV4fIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vcvtss2usi32, "UiV4fIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vcvttsd2si32, "iV2dIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vcvttsd2usi32, "UiV2dIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vcvttss2si32, "iV4fIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vcvttss2usi32, "UiV4fIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vpermilpd512, "V8dV8dIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpermilps512, "V16fV16fIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpermilvarpd512, "V8dV8dV8Oi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpermilvarps512, "V16fV16fV16i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_rndscalesd_round_mask, "V2dV2dV2dV2dUcIiIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_rndscaless_round_mask, "V4fV4fV4fV4fUcIiIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_scalefpd512_mask, "V8dV8dV8dV8dUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_scalefps512_mask, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_scalefsd_round_mask, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_scalefss_round_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_psradi512, "V16iV16ii", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psraqi512, "V8OiV8Oii", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psraq128, "V2OiV2OiV2Oi", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_psraq256, "V4OiV4OiV2Oi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_psraqi128, "V2OiV2Oii", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_psraqi256, "V4OiV4Oii", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pslld512, "V16iV16iV4i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psllq512, "V8OiV8OiV2Oi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psllv16si, "V16iV16iV16i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psllv8di, "V8OiV8OiV8Oi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrad512, "V16iV16iV4i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psraq512, "V8OiV8OiV2Oi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrav16si, "V16iV16iV16i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrav8di, "V8OiV8OiV8Oi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrld512, "V16iV16iV4i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrlq512, "V8OiV8OiV2Oi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrlv16si, "V16iV16iV16i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrlv8di, "V8OiV8OiV8Oi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pternlogd512_mask, "V16iV16iV16iV16iIiUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pternlogd512_maskz, "V16iV16iV16iV16iIiUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pternlogq512_mask, "V8OiV8OiV8OiV8OiIiUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pternlogq512_maskz, "V8OiV8OiV8OiV8OiIiUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pternlogd128_mask, "V4iV4iV4iV4iIiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pternlogd128_maskz, "V4iV4iV4iV4iIiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pternlogd256_mask, "V8iV8iV8iV8iIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pternlogd256_maskz, "V8iV8iV8iV8iIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pternlogq128_mask, "V2OiV2OiV2OiV2OiIiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pternlogq128_maskz, "V2OiV2OiV2OiV2OiIiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pternlogq256_mask, "V4OiV4OiV4OiV4OiIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pternlogq256_maskz, "V4OiV4OiV4OiV4OiIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_shuf_f32x4, "V16fV16fV16fIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_shuf_f64x2, "V8dV8dV8dIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_shuf_i32x4, "V16iV16iV16iIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_shuf_i64x2, "V8OiV8OiV8OiIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_shufpd512, "V8dV8dV8dIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_shufps512, "V16fV16fV16fIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_shuf_f32x4_256, "V8fV8fV8fIi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_shuf_f64x2_256, "V4dV4dV4dIi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_shuf_i32x4_256, "V8iV8iV8iIi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_shuf_i64x2_256, "V4OiV4OiV4OiIi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_sqrtsd_round_mask, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_sqrtss_round_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_rsqrt14pd128_mask, "V2dV2dV2dUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rsqrt14pd256_mask, "V4dV4dV4dUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rsqrt14ps128_mask, "V4fV4fV4fUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rsqrt14ps256_mask, "V8fV8fV8fUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtb2mask512, "UOiV64c", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtmask2b512, "V64cUOi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtmask2w512, "V32sUi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtd2mask512, "UsV16i", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtmask2d512, "V16iUs", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtmask2q512, "V8OiUc", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtq2mask512, "UcV8Oi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtb2mask128, "UsV16c", "ncV:128:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtb2mask256, "UiV32c", "ncV:256:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtmask2b128, "V16cUs", "ncV:128:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtmask2b256, "V32cUi", "ncV:256:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtmask2w128, "V8sUc", "ncV:128:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtmask2w256, "V16sUs", "ncV:256:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtd2mask128, "UcV4i", "ncV:128:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtd2mask256, "UcV8i", "ncV:256:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtmask2d128, "V4iUc", "ncV:128:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtmask2d256, "V8iUc", "ncV:256:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtmask2q128, "V2OiUc", "ncV:128:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtmask2q256, "V4OiUc", "ncV:256:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtq2mask128, "UcV2Oi", "ncV:128:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtq2mask256, "UcV4Oi", "ncV:256:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsdb512_mask, "V16cV16iV16cUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovsdb512mem_mask, "vV16c*V16iUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovswb512mem_mask, "vV32c*V32sUi", "nV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovsdw512_mask, "V16sV16iV16sUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovsdw512mem_mask, "vV16s*V16iUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovsqb512_mask, "V16cV8OiV16cUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovsqb512mem_mask, "vV16c*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovsqd512_mask, "V8iV8OiV8iUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovsqd512mem_mask, "vV8i*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovsqw512_mask, "V8sV8OiV8sUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovsqw512mem_mask, "vV8s*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovsdb128_mask, "V16cV4iV16cUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsdb128mem_mask, "vV16c*V4iUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovswb128mem_mask, "vV16c*V8sUc", "nV:128:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pmovsdb256_mask, "V16cV8iV16cUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsdb256mem_mask, "vV16c*V8iUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovswb256mem_mask, "vV16c*V16sUs", "nV:256:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pmovsdw128_mask, "V8sV4iV8sUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsdw128mem_mask, "vV8s*V4iUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsdw256_mask, "V8sV8iV8sUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsdw256mem_mask, "vV8s*V8iUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsqb128_mask, "V16cV2OiV16cUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsqb128mem_mask, "vV16c*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsqb256_mask, "V16cV4OiV16cUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsqb256mem_mask, "vV16c*V4OiUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsqd128_mask, "V4iV2OiV4iUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsqd128mem_mask, "vV4i*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsqd256_mask, "V4iV4OiV4iUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsqd256mem_mask, "vV4i*V4OiUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsqw128_mask, "V8sV2OiV8sUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsqw128mem_mask, "vV8s*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsqw256_mask, "V8sV4OiV8sUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsqw256mem_mask, "vV8s*V4OiUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusdb512_mask, "V16cV16iV16cUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovusdb512mem_mask, "vV16c*V16iUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovuswb512mem_mask, "vV32c*V32sUi", "nV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovusdw512_mask, "V16sV16iV16sUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovusdw512mem_mask, "vV16s*V16iUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovusqb512_mask, "V16cV8OiV16cUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovusqb512mem_mask, "vV16c*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovusqd512_mask, "V8iV8OiV8iUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovusqd512mem_mask, "vV8i*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovusqw512_mask, "V8sV8OiV8sUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovusqw512mem_mask, "vV8s*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovusdb128_mask, "V16cV4iV16cUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusdb128mem_mask, "vV16c*V4iUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovuswb128mem_mask, "vV16c*V8sUc", "nV:128:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pmovusdb256_mask, "V16cV8iV16cUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusdb256mem_mask, "vV16c*V8iUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovuswb256mem_mask, "vV16c*V16sUs", "nV:256:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pmovusdw128_mask, "V8sV4iV8sUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusdw128mem_mask, "vV8s*V4iUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusdw256_mask, "V8sV8iV8sUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusdw256mem_mask, "vV8s*V8iUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusqb128_mask, "V16cV2OiV16cUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusqb128mem_mask, "vV16c*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusqb256_mask, "V16cV4OiV16cUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusqb256mem_mask, "vV16c*V4OiUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusqd128_mask, "V4iV2OiV4iUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusqd128mem_mask, "vV4i*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusqd256_mask, "V4iV4OiV4iUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusqd256mem_mask, "vV4i*V4OiUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusqw128_mask, "V8sV2OiV8sUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusqw128mem_mask, "vV8s*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusqw256_mask, "V8sV4OiV8sUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusqw256mem_mask, "vV8s*V4OiUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovdb512_mask, "V16cV16iV16cUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovdb512mem_mask, "vV16c*V16iUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovwb512mem_mask, "vV32c*V32sUi", "nV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovdw512_mask, "V16sV16iV16sUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovdw512mem_mask, "vV16s*V16iUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovqb512_mask, "V16cV8OiV16cUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovqb512mem_mask, "vV16c*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovqd512_mask, "V8iV8OiV8iUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovqd512mem_mask, "vV8i*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovqw512_mask, "V8sV8OiV8sUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovqw512mem_mask, "vV8s*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovdb128_mask, "V16cV4iV16cUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovwb128mem_mask, "vV16c*V8sUc", "nV:128:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pmovdb128mem_mask, "vV16c*V4iUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovdb256_mask, "V16cV8iV16cUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovdb256mem_mask, "vV16c*V8iUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovwb256mem_mask, "vV16c*V16sUs", "nV:256:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pmovdw128_mask, "V8sV4iV8sUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovdw128mem_mask, "vV8s*V4iUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovdw256_mask, "V8sV8iV8sUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovdw256mem_mask, "vV8s*V8iUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovqb128_mask, "V16cV2OiV16cUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovqb128mem_mask, "vV16c*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovqb256_mask, "V16cV4OiV16cUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovqb256mem_mask, "vV16c*V4OiUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovqd128_mask, "V4iV2OiV4iUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovqd128mem_mask, "vV4i*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovqd256mem_mask, "vV4i*V4OiUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovqw128_mask, "V8sV2OiV8sUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovqw128mem_mask, "vV8s*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovqw256_mask, "V8sV4OiV8sUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovqw256mem_mask, "vV8s*V4OiUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_extractf32x8_mask, "V8fV16fIiV8fUc", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_extractf64x2_512_mask, "V2dV8dIiV2dUc", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_extracti32x8_mask, "V8iV16iIiV8iUc", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_extracti64x2_512_mask, "V2OiV8OiIiV2OiUc", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_extracti32x4_mask, "V4iV16iIiV4iUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_extracti64x4_mask, "V4OiV8OiIiV4OiUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_extractf64x2_256_mask, "V2dV4dIiV2dUc", "ncV:256:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_extracti64x2_256_mask, "V2OiV4OiIiV2OiUc", "ncV:256:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_extractf32x4_256_mask, "V4fV8fIiV4fUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_extracti32x4_256_mask, "V4iV8iIiV4iUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_insertf32x8, "V16fV16fV8fIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_insertf64x2_512, "V8dV8dV2dIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_inserti32x8, "V16iV16iV8iIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_inserti64x2_512, "V8OiV8OiV2OiIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_insertf64x4, "V8dV8dV4dIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_inserti64x4, "V8OiV8OiV4OiIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_insertf64x2_256, "V4dV4dV2dIi", "ncV:256:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_inserti64x2_256, "V4OiV4OiV2OiIi", "ncV:256:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_insertf32x4_256, "V8fV8fV4fIi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_inserti32x4_256, "V8iV8iV4iIi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_insertf32x4, "V16fV16fV4fIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_inserti32x4, "V16iV16iV4iIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_getmantpd128_mask, "V2dV2dIiV2dUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_getmantpd256_mask, "V4dV4dIiV4dUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_getmantps128_mask, "V4fV4fIiV4fUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_getmantps256_mask, "V8fV8fIiV8fUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_getmantpd512_mask, "V8dV8dIiV8dUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_getmantps512_mask, "V16fV16fIiV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_getexppd512_mask, "V8dV8dV8dUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_getexpps512_mask, "V16fV16fV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddss3_mask,  "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vfmaddss3_maskz, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vfmaddss3_mask3, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsd3_mask,  "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsd3_maskz, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsd3_mask3, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vfmsubsd3_mask3, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vfmsubss3_mask3, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_permdf512, "V8dV8dIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_permdi512, "V8OiV8OiIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_permvarhi512, "V32sV32sV32s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_permvardf512, "V8dV8dV8Oi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_permvardi512, "V8OiV8OiV8Oi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_permvarsf512, "V16fV16fV16i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_permvarsi512, "V16iV16iV16i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_permvarqi512, "V64cV64cV64c", "ncV:512:", "avx512vbmi,evex512")
-TARGET_BUILTIN(__builtin_ia32_permvarqi128, "V16cV16cV16c", "ncV:128:", "avx512vbmi,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_permvarqi256, "V32cV32cV32c", "ncV:256:", "avx512vbmi,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_permvarhi128, "V8sV8sV8s", "ncV:128:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_permvarhi256, "V16sV16sV16s", "ncV:256:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_permvardf256, "V4dV4dV4Oi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_permvardi256, "V4OiV4OiV4Oi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fpclasspd128_mask, "UcV2dIiUc", "ncV:128:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fpclasspd256_mask, "UcV4dIiUc", "ncV:256:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fpclassps128_mask, "UcV4fIiUc", "ncV:128:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fpclassps256_mask, "UcV8fIiUc", "ncV:256:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fpclassps512_mask, "UsV16fIiUs", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_fpclasspd512_mask, "UcV8dIiUc", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_fpclasssd_mask, "UcV2dIiUc", "ncV:128:", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_fpclassss_mask, "UcV4fIiUc", "ncV:128:", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_kaddqi, "UcUcUc", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_kaddhi, "UsUsUs", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_kaddsi, "UiUiUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kadddi, "UOiUOiUOi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kandqi, "UcUcUc", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_kandhi, "UsUsUs", "nc", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_kandsi, "UiUiUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kanddi, "UOiUOiUOi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kandnqi, "UcUcUc", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_kandnhi, "UsUsUs", "nc", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_kandnsi, "UiUiUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kandndi, "UOiUOiUOi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_korqi, "UcUcUc", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_korhi, "UsUsUs", "nc", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_korsi, "UiUiUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kordi, "UOiUOiUOi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kortestcqi, "iUcUc", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_kortestzqi, "iUcUc", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_kortestchi, "iUsUs", "nc", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_kortestzhi, "iUsUs", "nc", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_kortestcsi, "iUiUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kortestzsi, "iUiUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kortestcdi, "iUOiUOi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kortestzdi, "iUOiUOi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_ktestcqi, "iUcUc", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_ktestzqi, "iUcUc", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_ktestchi, "iUsUs", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_ktestzhi, "iUsUs", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_ktestcsi, "iUiUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_ktestzsi, "iUiUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_ktestcdi, "iUOiUOi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_ktestzdi, "iUOiUOi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kunpckhi, "UsUsUs", "nc", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_kxnorqi, "UcUcUc", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_kxnorhi, "UsUsUs", "nc", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_kxnorsi, "UiUiUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kxnordi, "UOiUOiUOi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kxorqi, "UcUcUc", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_kxorhi, "UsUsUs", "nc", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_kxorsi, "UiUiUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kxordi, "UOiUOiUOi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kshiftliqi, "UcUcIUi", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_kshiftlihi, "UsUsIUi", "nc", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_kshiftlisi, "UiUiIUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kshiftlidi, "UOiUOiIUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kshiftriqi, "UcUcIUi", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_kshiftrihi, "UsUsIUi", "nc", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_kshiftrisi, "UiUiIUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kshiftridi, "UOiUOiIUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kmovb, "UcUc", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_kmovw, "UsUs", "nc", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_kmovd, "UiUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kmovq, "UOiUOi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_palignr512, "V64cV64cV64cIi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_dbpsadbw128, "V8sV16cV16cIi", "ncV:128:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_dbpsadbw256, "V16sV32cV32cIi", "ncV:256:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_dbpsadbw512, "V32sV64cV64cIi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_psadbw512, "V8OiV64cV64c", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_compressdf512_mask, "V8dV8dV8dUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_compressdi512_mask, "V8OiV8OiV8OiUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_compresshi512_mask, "V32sV32sV32sUi", "ncV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_compressqi512_mask, "V64cV64cV64cUOi", "ncV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_compresssf512_mask, "V16fV16fV16fUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_compresssi512_mask, "V16iV16iV16iUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cmpsd_mask, "UcV2dV2dIiUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_cmpss_mask, "UcV4fV4fIiUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_pshufd512, "V16iV16iIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_expanddf512_mask, "V8dV8dV8dUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_expanddi512_mask, "V8OiV8OiV8OiUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_expandhi512_mask, "V32sV32sV32sUi", "ncV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_expandqi512_mask, "V64cV64cV64cUOi", "ncV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_expandloaddf512_mask, "V8dV8dC*V8dUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_expandloaddi512_mask, "V8OiV8OiC*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_expandloadhi512_mask, "V32sV32sC*V32sUi", "nV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_expandloadqi512_mask, "V64cV64cC*V64cUOi", "nV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_expandloadsf512_mask, "V16fV16fC*V16fUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_expandloadsi512_mask, "V16iV16iC*V16iUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_expandsf512_mask, "V16fV16fV16fUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_expandsi512_mask, "V16iV16iV16iUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtps2pd512_mask, "V8dV8fV8dUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_compressstoredf512_mask, "vV8d*V8dUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_compressstoredi512_mask, "vV8Oi*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_compressstorehi512_mask, "vV32s*V32sUi", "nV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_compressstoreqi512_mask, "vV64c*V64cUOi", "nV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_compressstoresf512_mask, "vV16f*V16fUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_compressstoresi512_mask, "vV16i*V16iUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2ps_mask, "V4fV8sV4fUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2ps256_mask, "V8fV8sV8fUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2ph_mask, "V8sV4fIiV8sUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2ph256_mask, "V8sV8fIiV8sUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtw2mask512, "UiV32s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtw2mask128, "UcV8s", "ncV:128:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtw2mask256, "UsV16s", "ncV:256:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtsd2ss_round_mask, "V4fV4fV2dV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_cvtsi2ss32, "V4fV4fiIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_cvtss2sd_round_mask, "V2dV2dV4fV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_cvtusi2ss32, "V4fV4fUiIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vpmultishiftqb512, "V64cV64cV64c", "ncV:512:", "avx512vbmi,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpmultishiftqb128, "V16cV16cV16c", "ncV:128:", "avx512vbmi,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpmultishiftqb256, "V32cV32cV32c", "ncV:256:", "avx512vbmi,avx512vl")
-
-// bf16 intrinsics
-TARGET_BUILTIN(__builtin_ia32_cvtne2ps2bf16_128, "V8yV4fV4f", "ncV:128:", "avx512bf16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtne2ps2bf16_256, "V16yV8fV8f", "ncV:256:", "avx512bf16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtne2ps2bf16_512, "V32yV16fV16f", "ncV:512:", "avx512bf16,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtneps2bf16_128_mask, "V8yV4fV8yUc", "ncV:128:", "avx512bf16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtneps2bf16_256_mask, "V8yV8fV8yUc", "ncV:256:", "avx512bf16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtneps2bf16_512_mask, "V16yV16fV16yUs", "ncV:512:", "avx512bf16,evex512")
-TARGET_BUILTIN(__builtin_ia32_dpbf16ps_128, "V4fV4fV8yV8y", "ncV:128:", "avx512bf16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_dpbf16ps_256, "V8fV8fV16yV16y", "ncV:256:", "avx512bf16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_dpbf16ps_512, "V16fV16fV32yV32y", "ncV:512:", "avx512bf16,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtsbf162ss_32, "fy", "nc", "avx512bf16")
-
-TARGET_BUILTIN(__builtin_ia32_vp2intersect_q_512, "vV8OiV8OiUc*Uc*", "nV:512:", "avx512vp2intersect,evex512")
-TARGET_BUILTIN(__builtin_ia32_vp2intersect_q_256, "vV4OiV4OiUc*Uc*", "nV:256:", "avx512vp2intersect,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vp2intersect_q_128, "vV2OiV2OiUc*Uc*", "nV:128:", "avx512vp2intersect,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vp2intersect_d_512, "vV16iV16iUs*Us*", "nV:512:", "avx512vp2intersect,evex512")
-TARGET_BUILTIN(__builtin_ia32_vp2intersect_d_256, "vV8iV8iUc*Uc*", "nV:256:", "avx512vp2intersect,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vp2intersect_d_128, "vV4iV4iUc*Uc*", "nV:128:", "avx512vp2intersect,avx512vl")
-
-// AVX512 fp16 intrinsics
-TARGET_BUILTIN(__builtin_ia32_vcomish,       "iV8xV8xIiIi",    "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_addph512,      "V32xV32xV32xIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_subph512,      "V32xV32xV32xIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_mulph512,      "V32xV32xV32xIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_divph512,      "V32xV32xV32xIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_maxph512,      "V32xV32xV32xIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_minph512,      "V32xV32xV32xIi", "ncV:512:", "avx512fp16,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_minph256,      "V16xV16xV16x", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_minph128,      "V8xV8xV8x", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_maxph256,      "V16xV16xV16x", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_maxph128,      "V8xV8xV8x", "ncV:128:", "avx512fp16,avx512vl")
-
-TARGET_BUILTIN(__builtin_ia32_addsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_divsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_mulsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_subsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_maxsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_minsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_cmpph512_mask, "UiV32xV32xIiUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_cmpph256_mask, "UsV16xV16xIiUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cmpph128_mask, "UcV8xV8xIiUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cmpsh_mask, "UcV8xV8xIiUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_loadsh128_mask, "V8xV8xC*V8xUc", "nV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_storesh128_mask, "vV8x*V8xUc", "nV:128:", "avx512fp16")
-
-TARGET_BUILTIN(__builtin_ia32_rcpph128_mask, "V8xV8xV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rcpph256_mask, "V16xV16xV16xUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rcpph512_mask, "V32xV32xV32xUi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_rsqrtph128_mask, "V8xV8xV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rsqrtph256_mask, "V16xV16xV16xUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rsqrtph512_mask, "V32xV32xV32xUi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_getmantph128_mask, "V8xV8xIiV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_getmantph256_mask, "V16xV16xIiV16xUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_getmantph512_mask, "V32xV32xIiV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_getexpph128_mask, "V8xV8xV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_getexpph256_mask, "V16xV16xV16xUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_getexpph512_mask, "V32xV32xV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_scalefph128_mask, "V8xV8xV8xV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scalefph256_mask, "V16xV16xV16xV16xUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scalefph512_mask, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_rndscaleph_128_mask, "V8xV8xIiV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rndscaleph_256_mask, "V16xV16xIiV16xUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rndscaleph_mask, "V32xV32xIiV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduceph128_mask, "V8xV8xIiV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_reduceph256_mask, "V16xV16xIiV16xUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_reduceph512_mask, "V32xV32xIiV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_rcpsh_mask, "V8xV8xV8xV8xUc", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_rsqrtsh_mask, "V8xV8xV8xV8xUc", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_getmantsh_round_mask, "V8xV8xV8xIiV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_getexpsh128_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_scalefsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_rndscalesh_round_mask, "V8xV8xV8xV8xUcIiIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_reducesh_mask, "V8xV8xV8xV8xUcIiIi", "ncV:128:", "avx512fp16")
-
-TARGET_BUILTIN(__builtin_ia32_sqrtph, "V8xV8x", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_sqrtph256, "V16xV16x", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_sqrtph512, "V32xV32xIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_sqrtsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_fpclassph128_mask, "UcV8xIiUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fpclassph256_mask, "UsV16xIiUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fpclassph512_mask, "UiV32xIiUi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_fpclasssh_mask, "UcV8xIiUc", "ncV:128:", "avx512fp16")
-
-TARGET_BUILTIN(__builtin_ia32_vcvtpd2ph128_mask, "V8xV2dV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtpd2ph256_mask, "V8xV4dV8xUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtpd2ph512_mask, "V8xV8dV8xUcIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2pd128_mask, "V2dV8xV2dUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2pd256_mask, "V4dV8xV4dUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2pd512_mask, "V8dV8xV8dUcIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtsh2ss_round_mask, "V4fV4fV8xV4fUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vcvtss2sh_round_mask, "V8xV8xV4fV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vcvtsd2sh_round_mask, "V8xV8xV2dV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vcvtsh2sd_round_mask, "V2dV2dV8xV2dUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2w128_mask, "V8sV8xV8sUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2w256_mask, "V16sV16xV16sUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2w512_mask, "V32sV32xV32sUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2w128_mask, "V8sV8xV8sUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2w256_mask, "V16sV16xV16sUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2w512_mask, "V32sV32xV32sUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtw2ph128_mask, "V8xV8sV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtw2ph256_mask, "V16xV16sV16xUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtw2ph512_mask, "V32xV32sV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2uw128_mask, "V8UsV8xV8UsUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2uw256_mask, "V16UsV16xV16UsUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2uw512_mask, "V32UsV32xV32UsUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2uw128_mask, "V8UsV8xV8UsUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2uw256_mask, "V16UsV16xV16UsUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2uw512_mask, "V32UsV32xV32UsUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtuw2ph128_mask, "V8xV8UsV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtuw2ph256_mask, "V16xV16UsV16xUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtuw2ph512_mask, "V32xV32UsV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2dq128_mask, "V4iV8xV4iUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2dq256_mask, "V8iV8xV8iUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2dq512_mask, "V16iV16xV16iUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2udq128_mask, "V4UiV8xV4UiUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2udq256_mask, "V8UiV8xV8UiUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2udq512_mask, "V16UiV16xV16UiUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtdq2ph128_mask, "V8xV4iV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtdq2ph256_mask, "V8xV8iV8xUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtdq2ph512_mask, "V16xV16iV16xUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtudq2ph128_mask, "V8xV4UiV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtudq2ph256_mask, "V8xV8UiV8xUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtudq2ph512_mask, "V16xV16UiV16xUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2dq128_mask, "V4iV8xV4iUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2dq256_mask, "V8iV8xV8iUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2dq512_mask, "V16iV16xV16iUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2udq128_mask, "V4UiV8xV4UiUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2udq256_mask, "V8UiV8xV8UiUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2udq512_mask, "V16UiV16xV16UiUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtqq2ph128_mask, "V8xV2OiV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtqq2ph256_mask, "V8xV4OiV8xUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtqq2ph512_mask, "V8xV8OiV8xUcIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2qq128_mask, "V2OiV8xV2OiUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2qq256_mask, "V4OiV8xV4OiUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2qq512_mask, "V8OiV8xV8OiUcIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtuqq2ph128_mask, "V8xV2UOiV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtuqq2ph256_mask, "V8xV4UOiV8xUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtuqq2ph512_mask, "V8xV8UOiV8xUcIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2uqq128_mask, "V2UOiV8xV2UOiUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2uqq256_mask, "V4UOiV8xV4UOiUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2uqq512_mask, "V8UOiV8xV8UOiUcIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2qq128_mask, "V2OiV8xV2OiUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2qq256_mask, "V4OiV8xV4OiUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2qq512_mask, "V8OiV8xV8OiUcIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2uqq128_mask, "V2UOiV8xV2UOiUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2uqq256_mask, "V4UOiV8xV4UOiUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2uqq512_mask, "V8UOiV8xV8UOiUcIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtsh2si32, "iV8xIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vcvtsh2usi32, "UiV8xIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vcvtusi2sh, "V8xV8xUiIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vcvtsi2sh, "V8xV8xiIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vcvttsh2si32, "iV8xIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vcvttsh2usi32, "UiV8xIi", "ncV:128:", "avx512fp16")
-
-TARGET_BUILTIN(__builtin_ia32_vcvtph2psx128_mask, "V4fV8xV4fUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2psx256_mask, "V8fV8xV8fUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2psx512_mask, "V16fV16xV16fUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2phx128_mask, "V8xV4fV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2phx256_mask, "V8xV8fV8xUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2phx512_mask, "V16xV16fV16xUsIi", "ncV:512:", "avx512fp16,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_vfmaddph, "V8xV8xV8xV8x", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfmaddph256, "V16xV16xV16xV16x", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfmaddph512_mask,  "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddph512_mask3, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddph512_maskz, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubph, "V8xV8xV8xV8x", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubph256, "V16xV16xV16xV16x", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubph512_mask, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubph512_maskz, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubph512_mask3, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_vfmsubaddph512_mask3, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmsubph512_mask3, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_vfmaddsh3_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsh3_maskz, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsh3_mask3, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vfmsubsh3_mask3, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
-
-TARGET_BUILTIN(__builtin_ia32_vfmaddcph128_mask,  "V4fV4fV4fV4fUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfmaddcph128_maskz,  "V4fV4fV4fV4fUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfmaddcph256_mask,  "V8fV8fV8fV8fUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfmaddcph256_maskz,  "V8fV8fV8fV8fUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfmaddcph512_mask,  "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddcph512_maskz,  "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddcph512_mask3,  "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcph128_mask,  "V4fV4fV4fV4fUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcph128_maskz,  "V4fV4fV4fV4fUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcph256_mask,  "V8fV8fV8fV8fUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcph256_maskz,  "V8fV8fV8fV8fUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcph512_mask,  "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcph512_maskz,  "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcph512_mask3,  "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddcsh_mask,   "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vfmaddcsh_maskz,   "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcsh_mask,  "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcsh_maskz,  "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vfmaddcsh_round_mask,  "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vfmaddcsh_round_mask3,  "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcsh_round_mask,  "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcsh_round_mask3,  "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
-
-TARGET_BUILTIN(__builtin_ia32_vfmulcsh_mask,   "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vfcmulcsh_mask,  "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vfmulcph128_mask,  "V4fV4fV4fV4fUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfmulcph256_mask,  "V8fV8fV8fV8fUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfmulcph512_mask,  "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfcmulcph128_mask,  "V4fV4fV4fV4fUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfcmulcph256_mask,  "V8fV8fV8fV8fUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfcmulcph512_mask,  "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16,evex512")
-
-// generic select intrinsics
-TARGET_BUILTIN(__builtin_ia32_selectb_128, "V16cUsV16cV16c", "ncV:128:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectb_256, "V32cUiV32cV32c", "ncV:256:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectb_512, "V64cUOiV64cV64c", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_selectw_128, "V8sUcV8sV8s", "ncV:128:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectw_256, "V16sUsV16sV16s", "ncV:256:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectw_512, "V32sUiV32sV32s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_selectd_128, "V4iUcV4iV4i", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectd_256, "V8iUcV8iV8i", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectd_512, "V16iUsV16iV16i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_selectph_128, "V8xUcV8xV8x", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectph_256, "V16xUsV16xV16x", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectph_512, "V32xUiV32xV32x", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_selectpbf_128, "V8yUcV8yV8y", "ncV:128:", "avx512bf16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectpbf_256, "V16yUsV16yV16y", "ncV:256:", "avx512bf16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectpbf_512, "V32yUiV32yV32y", "ncV:512:", "avx512bf16,evex512")
-TARGET_BUILTIN(__builtin_ia32_selectq_128, "V2OiUcV2OiV2Oi", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectq_256, "V4OiUcV4OiV4Oi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectq_512, "V8OiUcV8OiV8Oi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_selectps_128, "V4fUcV4fV4f", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectps_256, "V8fUcV8fV8f", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectps_512, "V16fUsV16fV16f", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_selectpd_128, "V2dUcV2dV2d", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectpd_256, "V4dUcV4dV4d", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectpd_512, "V8dUcV8dV8d", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_selectsh_128, "V8xUcV8xV8x", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_selectsbf_128, "V8yUcV8yV8y", "ncV:128:", "avx512bf16")
-TARGET_BUILTIN(__builtin_ia32_selectss_128, "V4fUcV4fV4f", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_selectsd_128, "V2dUcV2dV2d", "ncV:128:", "avx512f")
-
-// generic reduction intrinsics
-TARGET_BUILTIN(__builtin_ia32_reduce_fadd_pd512, "ddV8d", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduce_fadd_ps512, "ffV16f", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduce_fadd_ph512, "xxV32x", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduce_fadd_ph256, "xxV16x", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_reduce_fadd_ph128, "xxV8x", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmax_pd512, "dV8d", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmax_ps512, "fV16f", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmax_ph512, "xV32x", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmax_ph256, "xV16x", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmax_ph128, "xV8x", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmin_pd512, "dV8d", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmin_ps512, "fV16f", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmin_ph512, "xV32x", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmin_ph256, "xV16x", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmin_ph128, "xV8x", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmul_pd512, "ddV8d", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmul_ps512, "ffV16f", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmul_ph512, "xxV32x", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmul_ph256, "xxV16x", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmul_ph128, "xxV8x", "ncV:128:", "avx512fp16,avx512vl")
-
-// MONITORX/MWAITX
-TARGET_BUILTIN(__builtin_ia32_monitorx, "vvC*UiUi", "n", "mwaitx")
-TARGET_BUILTIN(__builtin_ia32_mwaitx, "vUiUiUi", "n", "mwaitx")
-
-// WAITPKG
-TARGET_BUILTIN(__builtin_ia32_umonitor, "vvC*", "n", "waitpkg")
-TARGET_BUILTIN(__builtin_ia32_umwait, "UcUiUiUi", "n", "waitpkg")
-TARGET_BUILTIN(__builtin_ia32_tpause, "UcUiUiUi", "n", "waitpkg")
-
-// CLZERO
-TARGET_BUILTIN(__builtin_ia32_clzero, "vv*", "n", "clzero")
-
-// CLDEMOTE
-TARGET_BUILTIN(__builtin_ia32_cldemote, "vvC*", "n", "cldemote")
-
-// Direct Move
-TARGET_BUILTIN(__builtin_ia32_directstore_u32, "vUi*Ui", "n", "movdiri")
-TARGET_BUILTIN(__builtin_ia32_movdir64b, "vv*vC*", "n", "movdir64b")
-
-// PTWRITE
-TARGET_BUILTIN(__builtin_ia32_ptwrite32, "vUi", "n", "ptwrite")
-
-// INVPCID
-TARGET_BUILTIN(__builtin_ia32_invpcid, "vUiv*", "nc", "invpcid")
-
-// ENQCMD
-TARGET_BUILTIN(__builtin_ia32_enqcmd, "Ucv*vC*", "n", "enqcmd")
-TARGET_BUILTIN(__builtin_ia32_enqcmds, "Ucv*vC*", "n", "enqcmd")
-
-// KEY LOCKER
-TARGET_BUILTIN(__builtin_ia32_loadiwkey, "vV2OiV2OiV2OiUi", "nV:128:", "kl")
-TARGET_BUILTIN(__builtin_ia32_encodekey128_u32, "UiUiV2Oiv*", "nV:128:", "kl")
-TARGET_BUILTIN(__builtin_ia32_encodekey256_u32, "UiUiV2OiV2Oiv*", "nV:128:", "kl")
-TARGET_BUILTIN(__builtin_ia32_aesenc128kl_u8, "UcV2Oi*V2OivC*", "nV:128:", "kl")
-TARGET_BUILTIN(__builtin_ia32_aesenc256kl_u8, "UcV2Oi*V2OivC*", "nV:128:", "kl")
-TARGET_BUILTIN(__builtin_ia32_aesdec128kl_u8, "UcV2Oi*V2OivC*", "nV:128:", "kl")
-TARGET_BUILTIN(__builtin_ia32_aesdec256kl_u8, "UcV2Oi*V2OivC*", "nV:128:", "kl")
-TARGET_BUILTIN(__builtin_ia32_aesencwide128kl_u8, "UcV2Oi*V2OiC*vC*", "nV:128:", "kl,widekl")
-TARGET_BUILTIN(__builtin_ia32_aesencwide256kl_u8, "UcV2Oi*V2OiC*vC*", "nV:128:", "kl,widekl")
-TARGET_BUILTIN(__builtin_ia32_aesdecwide128kl_u8, "UcV2Oi*V2OiC*vC*", "nV:128:", "kl,widekl")
-TARGET_BUILTIN(__builtin_ia32_aesdecwide256kl_u8, "UcV2Oi*V2OiC*vC*", "nV:128:", "kl,widekl")
-
-// SERIALIZE
-TARGET_BUILTIN(__builtin_ia32_serialize, "v", "n", "serialize")
-
-// TSXLDTRK
-TARGET_BUILTIN(__builtin_ia32_xsusldtrk, "v", "n", "tsxldtrk")
-TARGET_BUILTIN(__builtin_ia32_xresldtrk, "v", "n", "tsxldtrk")
-
-// RAO-INT
-TARGET_BUILTIN(__builtin_ia32_aadd32, "vv*Si", "n", "raoint")
-TARGET_BUILTIN(__builtin_ia32_aand32, "vv*Si", "n", "raoint")
-TARGET_BUILTIN(__builtin_ia32_aor32, "vv*Si", "n", "raoint")
-TARGET_BUILTIN(__builtin_ia32_axor32, "vv*Si", "n", "raoint")
-
-// MSVC
-TARGET_HEADER_BUILTIN(_BitScanForward, "UcUNi*UNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(_BitScanReverse, "UcUNi*UNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-
-TARGET_HEADER_BUILTIN(_ReadWriteBarrier, "v", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(_ReadBarrier,      "v", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(_WriteBarrier,     "v", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-
-TARGET_HEADER_BUILTIN(__cpuid,   "vi*i",  "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(__cpuidex, "vi*ii", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-
-TARGET_HEADER_BUILTIN(__emul,  "LLiii",    "nch", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(__emulu, "ULLiUiUi", "nch", INTRIN_H, ALL_MS_LANGUAGES, "")
-
-TARGET_HEADER_BUILTIN(_AddressOfReturnAddress, "v*", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-
-TARGET_HEADER_BUILTIN(__stosb, "vUc*Ucz", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(__int2c, "v",       "nhr", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(__ud2,   "v",       "nhr", INTRIN_H, ALL_MS_LANGUAGES, "")
-
-TARGET_HEADER_BUILTIN(__readfsbyte,  "UcUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(__readfsword,  "UsUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(__readfsdword, "UNiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(__readfsqword, "ULLiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-
-TARGET_HEADER_BUILTIN(__readgsbyte,  "UcUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(__readgsword,  "UsUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(__readgsdword, "UNiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(__readgsqword, "ULLiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-
-// AVX10.2 VNNI FP16
-TARGET_BUILTIN(__builtin_ia32_vdpphps128, "V4fV4fV8xV8x", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vdpphps256, "V8fV8fV16xV16x", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vdpphps512, "V16fV16fV32xV32x", "ncV:512:", "avx10.2-512")
-
-// AVX10.2 VNNI INT8
-TARGET_BUILTIN(__builtin_ia32_vpdpbssd512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vpdpbssds512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vpdpbsud512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vpdpbsuds512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vpdpbuud512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vpdpbuuds512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
-
-// AVX10.2 VNNI INT16
-TARGET_BUILTIN(__builtin_ia32_vpdpwsud512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vpdpwsuds512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vpdpwusd512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vpdpwusds512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vpdpwuud512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vpdpwuuds512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
-
-// AVX10.2 VMPSADBW
-TARGET_BUILTIN(__builtin_ia32_mpsadbw512, "V32sV64cV64cIc", "ncV:512:", "avx10.2-512")
-
-// AVX10.2 YMM Rounding
-TARGET_BUILTIN(__builtin_ia32_vaddpd256_round, "V4dV4dV4dIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vaddph256_round, "V16xV16xV16xIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vaddps256_round, "V8fV8fV8fIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcmppd256_round_mask, "UcV4dV4dIiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcmpph256_round_mask, "UsV16xV16xIiUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcmpps256_round_mask, "UcV8fV8fIiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtdq2ph256_round_mask, "V8xV8iV8xUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtdq2ps256_round_mask, "V8fV8iV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtpd2dq256_round_mask, "V4iV4dV4iUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtpd2ph256_round_mask, "V8xV4dV8xUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtpd2ps256_round_mask, "V4fV4dV4fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtpd2qq256_round_mask, "V4LLiV4dV4LLiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtpd2udq256_round_mask, "V4UiV4dV4UiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtpd2uqq256_round_mask, "V4ULLiV4dV4ULLiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2dq256_round_mask, "V8iV8xV8iUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2pd256_round_mask, "V4dV8xV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2psx256_round_mask, "V8fV8xV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2qq256_round_mask, "V4LLiV8xV4LLiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2udq256_round_mask, "V8UiV8xV8UiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2uqq256_round_mask, "V4ULLiV8xV4ULLiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2uw256_round_mask, "V16UsV16xV16UsUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2w256_round_mask, "V16sV16xV16sUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2dq256_round_mask, "V8iV8fV8iUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2pd256_round_mask, "V4dV4fV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2phx256_round_mask, "V8xV8fV8xUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2qq256_round_mask, "V4LLiV4fV4LLiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2udq256_round_mask, "V8UiV8fV8UiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2uqq256_round_mask, "V4ULLiV4fV4ULLiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtqq2pd256_round_mask, "V4dV4LLiV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtqq2ph256_round_mask, "V8xV4LLiV8xUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtqq2ps256_round_mask, "V4fV4LLiV4fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2dq256_round_mask, "V4iV4dV4iUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2qq256_round_mask, "V4LLiV4dV4LLiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2udq256_round_mask, "V4UiV4dV4UiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2uqq256_round_mask, "V4ULLiV4dV4ULLiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2dq256_round_mask, "V8iV8xV8iUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2qq256_round_mask, "V4LLiV8xV4LLiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2udq256_round_mask, "V8UiV8xV8UiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2uqq256_round_mask, "V4ULLiV8xV4ULLiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2uw256_round_mask, "V16UsV16xV16UsUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2w256_round_mask, "V16sV16xV16sUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2dq256_round_mask, "V8iV8fV8iUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2qq256_round_mask, "V4LLiV4fV4LLiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2udq256_round_mask, "V8UiV8fV8UiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2uqq256_round_mask, "V4ULLiV4fV4ULLiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtudq2ph256_round_mask, "V8xV8UiV8xUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtudq2ps256_round_mask, "V8fV8UiV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtuqq2pd256_round_mask, "V4dV4ULLiV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtuqq2ph256_round_mask, "V8xV4ULLiV8xUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtuqq2ps256_round_mask, "V4fV4ULLiV4fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtuw2ph256_round_mask, "V16xV16UsV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtw2ph256_round_mask, "V16xV16sV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vdivpd256_round, "V4dV4dV4dIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vdivph256_round, "V16xV16xV16xIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vdivps256_round, "V8fV8fV8fIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcph256_round_mask, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcph256_round_maskz, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcph256_round_mask3, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfcmulcph256_round_mask, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfixupimmpd256_round_mask, "V4dV4dV4dV4LLiIiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfixupimmpd256_round_maskz, "V4dV4dV4dV4LLiIiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfixupimmps256_round_mask, "V8fV8fV8fV8iIiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfixupimmps256_round_maskz, "V8fV8fV8fV8iIiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddpd256_round_mask, "V4dV4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddpd256_round_maskz, "V4dV4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddpd256_round_mask3, "V4dV4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddph256_round_mask, "V16xV16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddph256_round_maskz, "V16xV16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddph256_round_mask3, "V16xV16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddps256_round_mask, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddps256_round_maskz, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddps256_round_mask3, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddcph256_round_mask, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddcph256_round_maskz, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddcph256_round_mask3, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd256_round_mask, "V4dV4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd256_round_maskz, "V4dV4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd256_round_mask3, "V4dV4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubph256_round_mask, "V16xV16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubph256_round_maskz, "V16xV16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubph256_round_mask3, "V16xV16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubps256_round_mask, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubps256_round_maskz, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubps256_round_mask3, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmsubpd256_round_mask3, "V4dV4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmsubph256_round_mask3, "V16xV16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmsubps256_round_mask3, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmsubaddpd256_round_mask3, "V4dV4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmsubaddph256_round_mask3, "V16xV16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmsubaddps256_round_mask3, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmulcph256_round_mask, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vgetexppd256_round_mask, "V4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vgetexpph256_round_mask, "V16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vgetexpps256_round_mask, "V8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vgetmantpd256_round_mask, "V4dV4dIiV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vgetmantph256_round_mask, "V16xV16xIiV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vgetmantps256_round_mask, "V8fV8fIiV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vmaxpd256_round, "V4dV4dV4dIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vmaxph256_round, "V16xV16xV16xIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vmaxps256_round, "V8fV8fV8fIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminpd256_round, "V4dV4dV4dIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminph256_round, "V16xV16xV16xIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminps256_round, "V8fV8fV8fIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vmulpd256_round, "V4dV4dV4dIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vmulph256_round, "V16xV16xV16xIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vmulps256_round, "V8fV8fV8fIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vrangepd256_round_mask, "V4dV4dV4dIiV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vrangeps256_round_mask, "V8fV8fV8fIiV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vreducepd256_round_mask, "V4dV4dIiV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vreduceph256_round_mask, "V16xV16xIiV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vreduceps256_round_mask, "V8fV8fIiV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vrndscalepd256_round_mask, "V4dV4dIiV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vrndscaleph256_round_mask, "V16xV16xIiV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vrndscaleps256_round_mask, "V8fV8fIiV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vscalefpd256_round_mask, "V4dV4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vscalefph256_round_mask, "V16xV16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vscalefps256_round_mask, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vsqrtpd256_round, "V4dV4dIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vsqrtph256_round, "V16xV16xIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vsqrtps256_round, "V8fV8fIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vsubpd256_round, "V4dV4dV4dIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vsubph256_round, "V16xV16xV16xIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vsubps256_round, "V8fV8fV8fIi", "nV:256:", "avx10.2-256")
-
-// AVX-VNNI-INT16
-TARGET_BUILTIN(__builtin_ia32_vpdpwsud128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpwsud256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpwsuds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpwsuds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpwusd128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpwusd256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpwusds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpwusds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpwuud128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpwuud256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpwuuds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpwuuds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
-
-// AVX10.2 SATCVT-DS
-TARGET_BUILTIN(__builtin_ia32_vcvttsd2sis32, "iV2dIi", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttsd2usis32, "UiV2dIi", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttss2sis32, "iV4fIi", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttss2usis32, "UiV4fIi", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2dqs128_mask, "V4iV2dV4iUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2dqs256_round_mask, "V4iV4dV4iUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2dqs512_round_mask, "V8iV8dV8iUcIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2udqs128_mask, "V4iV2dV4iUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2udqs256_round_mask, "V4iV4dV4iUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2udqs512_round_mask, "V8iV8dV8iUcIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2qqs128_mask,  "V2OiV2dV2OiUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2qqs256_round_mask, "V4OiV4dV4OiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2qqs512_round_mask, "V8OiV8dV8OiUcIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2uqqs128_mask, "V2OiV2dV2OiUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2uqqs256_round_mask, "V4OiV4dV4OiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2uqqs512_round_mask, "V8OiV8dV8OiUcIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2dqs128_mask, "V4iV4fV4iUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2dqs256_round_mask, "V8iV8fV8iUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2dqs512_round_mask, "V16iV16fV16iUsIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2udqs128_mask, "V4iV4fV4iUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2udqs256_round_mask, "V8iV8fV8iUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2udqs512_round_mask, "V16iV16fV16iUsIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2qqs128_mask, "V2OiV4fV2OiUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2qqs256_round_mask, "V4OiV4fV4OiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2qqs512_round_mask, "V8OiV8fV8OiUcIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2uqqs128_mask, "V2OiV4fV2OiUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2uqqs256_round_mask, "V4OiV4fV4OiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2uqqs512_round_mask, "V8OiV8fV8OiUcIi", "nV:512:", "avx10.2-512")
-
-// AVX-NE-CONVERT
-TARGET_BUILTIN(__builtin_ia32_vbcstnebf162ps128, "V4fyC*", "nV:128:", "avxneconvert")
-TARGET_BUILTIN(__builtin_ia32_vbcstnebf162ps256, "V8fyC*", "nV:256:", "avxneconvert")
-TARGET_BUILTIN(__builtin_ia32_vbcstnesh2ps128, "V4fxC*", "nV:128:", "avxneconvert")
-TARGET_BUILTIN(__builtin_ia32_vbcstnesh2ps256, "V8fxC*", "nV:256:", "avxneconvert")
-TARGET_BUILTIN(__builtin_ia32_vcvtneebf162ps128, "V4fV8yC*", "nV:128:", "avxneconvert")
-TARGET_BUILTIN(__builtin_ia32_vcvtneebf162ps256, "V8fV16yC*", "nV:256:", "avxneconvert")
-TARGET_BUILTIN(__builtin_ia32_vcvtneeph2ps128, "V4fV8xC*", "nV:128:", "avxneconvert")
-TARGET_BUILTIN(__builtin_ia32_vcvtneeph2ps256, "V8fV16xC*", "nV:256:", "avxneconvert")
-TARGET_BUILTIN(__builtin_ia32_vcvtneobf162ps128, "V4fV8yC*", "nV:128:", "avxneconvert")
-TARGET_BUILTIN(__builtin_ia32_vcvtneobf162ps256, "V8fV16yC*", "nV:256:", "avxneconvert")
-TARGET_BUILTIN(__builtin_ia32_vcvtneoph2ps128, "V4fV8xC*", "nV:128:", "avxneconvert")
-TARGET_BUILTIN(__builtin_ia32_vcvtneoph2ps256, "V8fV16xC*", "nV:256:", "avxneconvert")
-TARGET_BUILTIN(__builtin_ia32_vcvtneps2bf16128, "V8yV4f", "nV:128:", "avx512bf16,avx512vl|avxneconvert")
-TARGET_BUILTIN(__builtin_ia32_vcvtneps2bf16256, "V8yV8f", "nV:256:", "avx512bf16,avx512vl|avxneconvert")
-
-// SHA512
-TARGET_BUILTIN(__builtin_ia32_vsha512msg1, "V4ULLiV4ULLiV2ULLi", "nV:256:", "sha512")
-TARGET_BUILTIN(__builtin_ia32_vsha512msg2, "V4ULLiV4ULLiV4ULLi", "nV:256:", "sha512")
-TARGET_BUILTIN(__builtin_ia32_vsha512rnds2, "V4ULLiV4ULLiV4ULLiV2ULLi", "nV:256:", "sha512")
-
-TARGET_HEADER_BUILTIN(_InterlockedAnd64,         "WiWiD*Wi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(_InterlockedDecrement64,   "WiWiD*",   "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(_InterlockedExchange64,    "WiWiD*Wi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(_InterlockedExchangeAdd64, "WiWiD*Wi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(_InterlockedExchangeSub64, "WiWiD*Wi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(_InterlockedIncrement64,   "WiWiD*",   "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(_InterlockedOr64,          "WiWiD*Wi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(_InterlockedXor64,         "WiWiD*Wi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-
-// SM3
-TARGET_BUILTIN(__builtin_ia32_vsm3msg1, "V4UiV4UiV4UiV4Ui", "nV:128:", "sm3")
-TARGET_BUILTIN(__builtin_ia32_vsm3msg2, "V4UiV4UiV4UiV4Ui", "nV:128:", "sm3")
-TARGET_BUILTIN(__builtin_ia32_vsm3rnds2, "V4UiV4UiV4UiV4UiIUi", "nV:128:", "sm3")
-
-// SM4
-TARGET_BUILTIN(__builtin_ia32_vsm4key4128, "V4UiV4UiV4Ui", "nV:128:", "sm4")
-TARGET_BUILTIN(__builtin_ia32_vsm4key4256, "V8UiV8UiV8Ui", "nV:256:", "sm4")
-TARGET_BUILTIN(__builtin_ia32_vsm4rnds4128, "V4UiV4UiV4Ui", "nV:128:", "sm4")
-TARGET_BUILTIN(__builtin_ia32_vsm4rnds4256, "V8UiV8UiV8Ui", "nV:256:", "sm4")
-
-// SM4_EVEX
-TARGET_BUILTIN(__builtin_ia32_vsm4key4512, "V16UiV16UiV16Ui", "nV:512:", "avx10.2-512,sm4")
-TARGET_BUILTIN(__builtin_ia32_vsm4rnds4512, "V16UiV16UiV16Ui", "nV:512:", "avx10.2-512,sm4")
-
-// AVX10 MINMAX
-TARGET_BUILTIN(__builtin_ia32_vminmaxnepbf16128, "V8yV8yV8yIi", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminmaxnepbf16256, "V16yV16yV16yIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminmaxnepbf16512, "V32yV32yV32yIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vminmaxpd128_mask, "V2dV2dV2dIiV2dUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminmaxpd256_round_mask, "V4dV4dV4dIiV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminmaxpd512_round_mask, "V8dV8dV8dIiV8dUcIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vminmaxph128_mask, "V8xV8xV8xIiV8xUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminmaxph256_round_mask, "V16xV16xV16xIiV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminmaxph512_round_mask, "V32xV32xV32xIiV32xUiIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vminmaxps128_mask, "V4fV4fV4fIiV4fUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminmaxps256_round_mask, "V8fV8fV8fIiV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminmaxps512_round_mask, "V16fV16fV16fIiV16fUsIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vminmaxsd_round_mask, "V2dV2dV2dIiV2dUcIi", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminmaxsh_round_mask, "V8xV8xV8xIiV8xUcIi", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminmaxss_round_mask, "V4fV4fV4fIiV4fUcIi", "nV:128:", "avx10.2-256")
-
-// AVX10.2 SATCVT
-TARGET_BUILTIN(__builtin_ia32_vcvtnebf162ibs128, "V8UsV8y", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtnebf162ibs256, "V16UsV16y", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtnebf162ibs512, "V32UsV32y", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtnebf162iubs128, "V8UsV8y", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtnebf162iubs256, "V16UsV16y", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtnebf162iubs512, "V32UsV32y", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2ibs128_mask, "V8UsV8xV8UsUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2ibs256_mask, "V16UsV16xV16UsUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2ibs512_mask, "V32UsV32xV32UsUiIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2iubs128_mask, "V8UsV8xV8UsUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2iubs256_mask, "V16UsV16xV16UsUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2iubs512_mask, "V32UsV32xV32UsUiIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2ibs128_mask, "V4UiV4fV4UiUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2ibs256_mask, "V8UiV8fV8UiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2ibs512_mask, "V16UiV16fV16UiUsIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2iubs128_mask, "V4UiV4fV4UiUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2iubs256_mask, "V8UiV8fV8UiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2iubs512_mask, "V16UiV16fV16UiUsIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvttnebf162ibs128, "V8UsV8y", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttnebf162ibs256, "V16UsV16y", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttnebf162ibs512, "V32UsV32y", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvttnebf162iubs128, "V8UsV8y", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttnebf162iubs256, "V16UsV16y", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttnebf162iubs512, "V32UsV32y", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2ibs128_mask, "V8UsV8xV8UsUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2ibs256_mask, "V16UsV16xV16UsUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2ibs512_mask, "V32UsV32xV32UsUiIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2iubs128_mask, "V8UsV8xV8UsUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2iubs256_mask, "V16UsV16xV16UsUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2iubs512_mask, "V32UsV32xV32UsUiIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2ibs128_mask, "V4UiV4fV4UiUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2ibs256_mask, "V8UiV8fV8UiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2ibs512_mask, "V16UiV16fV16UiUsIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2iubs128_mask, "V4UiV4fV4UiUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2iubs256_mask, "V8UiV8fV8UiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2iubs512_mask, "V16UiV16fV16UiUsIi", "nV:512:", "avx10.2-512")
-
-// AVX10.2 CONVERT
-TARGET_BUILTIN(__builtin_ia32_vcvt2ps2phx128_mask, "V8xV4fV4fV8xUc", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvt2ps2phx256_mask, "V16xV8fV8fV16xUsIi", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvt2ps2phx512_mask, "V32xV16fV16fV32xUiIi", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2bf8_128_mask, "V16cV16cV8xV16cUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2bf8_256_mask, "V16cV32cV16xV16cUs", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2bf8_512_mask, "V32cV64cV32xV32cUi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2bf8s_128_mask, "V16cV16cV8xV16cUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2bf8s_256_mask, "V16cV32cV16xV16cUs", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2bf8s_512_mask, "V32cV64cV32xV32cUi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2hf8_128_mask, "V16cV16cV8xV16cUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2hf8_256_mask, "V16cV32cV16xV16cUs", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2hf8_512_mask, "V32cV64cV32xV32cUi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2hf8s_128_mask, "V16cV16cV8xV16cUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2hf8s_256_mask, "V16cV32cV16xV16cUs", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2hf8s_512_mask, "V32cV64cV32xV32cUi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2bf8_128, "V16cV8xV8x", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2bf8_256, "V32cV16xV16x", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2bf8_512, "V64cV32xV32x", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2bf8s_128, "V16cV8xV8x", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2bf8s_256, "V32cV16xV16x", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2bf8s_512, "V64cV32xV32x", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2hf8_128, "V16cV8xV8x", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2hf8_256, "V32cV16xV16x", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2hf8_512, "V64cV32xV32x", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2hf8s_128, "V16cV8xV8x", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2hf8s_256, "V32cV16xV16x", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2hf8s_512, "V64cV32xV32x", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvthf8_2ph128_mask, "V8xV16cV8xUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvthf8_2ph256_mask, "V16xV16cV16xUs", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvthf8_2ph512_mask, "V32xV32cV32xUi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtneph2bf8_128_mask, "V16cV8xV16cUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtneph2bf8_256_mask, "V16cV16xV16cUs", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtneph2bf8_512_mask, "V32cV32xV32cUi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtneph2bf8s_128_mask, "V16cV8xV16cUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtneph2bf8s_256_mask, "V16cV16xV16cUs", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtneph2bf8s_512_mask, "V32cV32xV32cUi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8_128_mask, "V16cV8xV16cUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8_256_mask, "V16cV16xV16cUs", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8_512_mask, "V32cV32xV32cUi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8s_128_mask, "V16cV8xV16cUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8s_256_mask, "V16cV16xV16cUs", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8s_512_mask, "V32cV32xV32cUi", "nV:512:", "avx10.2-512")
-
-// AVX10.2 BF16
-TARGET_BUILTIN(__builtin_ia32_loadsbf16128_mask, "V8yV8yC*V8yUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_storesbf16128_mask, "vV8y*V8yUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vaddnepbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vaddnepbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vaddnepbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vdivnepbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vdivnepbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vdivnepbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vmaxpbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vmaxpbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vmaxpbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vminpbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminpbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminpbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vmulnepbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vmulnepbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vmulnepbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vsubnepbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vsubnepbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vsubnepbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcomsbf16eq, "iV8yV8y", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcomsbf16lt, "iV8yV8y", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcomsbf16neq, "iV8yV8y", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcomsbf16ge, "iV8yV8y", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcomsbf16gt, "iV8yV8y", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcomsbf16le, "iV8yV8y", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcmppbf16512_mask,"UiV32yV32yIiUi", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcmppbf16256_mask,"UsV16yV16yIiUs", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcmppbf16128_mask,"UcV8yV8yIiUc", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfpclasspbf16128_mask, "UcV8yIiUc", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfpclasspbf16256_mask, "UsV16yIiUs", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfpclasspbf16512_mask, "UiV32yIiUi", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vscalefpbf16128_mask, "V8yV8yV8yV8yUc", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vscalefpbf16256_mask, "V16yV16yV16yV16yUs", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vscalefpbf16512_mask, "V32yV32yV32yV32yUi", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vrcppbf16128_mask, "V8yV8yV8yUc", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vrcppbf16256_mask, "V16yV16yV16yUs", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vrcppbf16512_mask, "V32yV32yV32yUi", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vgetexppbf16128_mask, "V8yV8yV8yUc", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vgetexppbf16256_mask, "V16yV16yV16yUs", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vgetexppbf16512_mask, "V32yV32yV32yUi", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vrsqrtpbf16128_mask, "V8yV8yV8yUc", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vrsqrtpbf16256_mask, "V16yV16yV16yUs", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vrsqrtpbf16512_mask, "V32yV32yV32yUi", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vreducenepbf16128_mask, "V8yV8yIiV8yUc", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vreducenepbf16256_mask, "V16yV16yIiV16yUs", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vreducenepbf16512_mask, "V32yV32yIiV32yUi", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vrndscalenepbf16_128_mask, "V8yV8yIiV8yUc", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vrndscalenepbf16_256_mask, "V16yV16yIiV16yUs", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vrndscalenepbf16_mask, "V32yV32yIiV32yUi", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vgetmantpbf16128_mask, "V8yV8yIiV8yUc", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vgetmantpbf16256_mask, "V16yV16yIiV16yUs", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vgetmantpbf16512_mask, "V32yV32yIiV32yUi", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vsqrtnepbf16, "V8yV8y", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vsqrtnepbf16256, "V16yV16y", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vsqrtnepbf16512, "V32yV32y", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddnepbh512, "V32yV32yV32yV32y", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddnepbh256, "V16yV16yV16yV16y", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddnepbh128, "V8yV8yV8yV8y", "ncV:128:", "avx10.2-256")
-
-#undef BUILTIN
-#undef TARGET_BUILTIN
-#undef TARGET_HEADER_BUILTIN
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index cf8d2771310e3..73678bc868bfd 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -15,8 +15,26 @@ include "clang/Basic/BuiltinsBase.td"
 class X86Builtin : TargetBuiltin {
   let Spellings = ["__builtin_ia32_" # NAME];
   let Prototype = prototype;
+  let EnableOpenCLLong = 1;
 }
 
+class X86NoPrefixBuiltin : TargetBuiltin {
+  let Spellings = [NAME];
+  let Prototype = prototype;
+}
+
+class X86LibBuiltin : TargetLibBuiltin {
+  let Spellings = [NAME];
+  let Prototype = prototype;
+}
+
+def rdpmc : X86Builtin<"unsigned long long int(int)">;
+def rdtsc : X86Builtin<"unsigned long long int()">;
+def __rdtsc : X86NoPrefixBuiltin<"unsigned long long int()"> {
+  let EnableOpenCLLong = 1;
+}
+def rdtscp : X86Builtin<"unsigned long long int(unsigned int*)">;
+
 // Undefined Values
 def undef128 : X86Builtin<"_Vector<2, double>()"> {
   let Attributes = [Const, NoThrow, RequiredVectorWidth<128>];
@@ -135,3 +153,5375 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<256>], Features = "avx" in
     def Op#ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>)">;
   }
 }
+
+
+// Mechanically ported builtins from the original `.def` file.
+//
+// TODO: Build structured ways of synthesizing relevant groups and improve the
+// organization of the builtins below this point (and move them above it). The
+// current formulation is based on what was easiest to recognize from the
+// pre-TableGen version.
+
+let Features = "mmx", Attributes = [NoThrow, Const] in {
+  def _mm_prefetch : X86NoPrefixBuiltin<"void(char const *, int)">;
+}
+
+let Features = "sse", Attributes = [NoThrow] in {
+  def ldmxcsr : X86Builtin<"void(unsigned int)">;
+}
+
+let Features = "sse", Header = "xmmintrin.h", Attributes = [NoThrow, RequireDeclaration] in {
+  def _mm_setcsr : X86LibBuiltin<"void(unsigned int)">;
+}
+
+let Features = "sse", Attributes = [NoThrow] in {
+  def stmxcsr : X86Builtin<"unsigned int()">;
+}
+
+let Features = "sse", Header = "xmmintrin.h", Attributes = [NoThrow, RequireDeclaration] in {
+  def _mm_getcsr : X86LibBuiltin<"unsigned int()">;
+}
+
+let Features = "sse", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtss2si : X86Builtin<"int(_Vector<4, float>)">;
+  def cvttss2si : X86Builtin<"int(_Vector<4, float>)">;
+}
+
+let Features = "sse", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def movmskps : X86Builtin<"int(_Vector<4, float>)">;
+}
+
+let Features = "sse", Attributes = [NoThrow] in {
+  def sfence : X86Builtin<"void()">;
+}
+
+let Features = "sse", Header = "xmmintrin.h", Attributes = [NoThrow, RequireDeclaration] in {
+  def _mm_sfence : X86LibBuiltin<"void()">;
+}
+
+let Features = "sse", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rcpps : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
+  def rcpss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
+  def rsqrtps : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
+  def rsqrtss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
+  def sqrtps : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
+  def sqrtss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
+  def shufps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
+}
+
+let Features = "sse2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def maskmovdqu : X86Builtin<"void(_Vector<16, char>, _Vector<16, char>, char *)">;
+}
+
+let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def movmskpd : X86Builtin<"int(_Vector<2, double>)">;
+  def pmovmskb128 : X86Builtin<"int(_Vector<16, char>)">;
+}
+
+let Features = "sse2", Attributes = [NoThrow] in {
+  def movnti : X86Builtin<"void(int *, int)">;
+}
+
+let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pshufd : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Constant int)">;
+  def pshuflw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">;
+  def pshufhw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">;
+  def psadbw128 : X86Builtin<"_Vector<2, long long int>(_Vector<16, char>, _Vector<16, char>)">;
+  def sqrtpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">;
+  def sqrtsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">;
+  def shufpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
+  def cvtpd2dq : X86Builtin<"_Vector<2, long long int>(_Vector<2, double>)">;
+  def cvtpd2ps : X86Builtin<"_Vector<4, float>(_Vector<2, double>)">;
+  def cvttpd2dq : X86Builtin<"_Vector<4, int>(_Vector<2, double>)">;
+  def cvtsd2si : X86Builtin<"int(_Vector<2, double>)">;
+  def cvttsd2si : X86Builtin<"int(_Vector<2, double>)">;
+  def cvtsd2ss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<2, double>)">;
+  def cvtps2dq : X86Builtin<"_Vector<4, int>(_Vector<4, float>)">;
+  def cvttps2dq : X86Builtin<"_Vector<4, int>(_Vector<4, float>)">;
+}
+
+let Features = "sse2", Attributes = [NoThrow] in {
+  def clflush : X86Builtin<"void(void const *)">;
+}
+
+let Features = "sse2", Header = "emmintrin.h", Attributes = [NoThrow, RequireDeclaration] in {
+  def _mm_clflush : X86LibBuiltin<"void(void const *)">;
+}
+
+let Features = "sse2", Attributes = [NoThrow] in {
+  def lfence : X86Builtin<"void()">;
+}
+
+let Features = "sse2", Header = "emmintrin.h", Attributes = [NoThrow, RequireDeclaration] in {
+  def _mm_lfence : X86LibBuiltin<"void()">;
+}
+
+let Features = "sse2", Attributes = [NoThrow] in {
+  def mfence : X86Builtin<"void()">;
+}
+
+let Features = "sse2", Header = "emmintrin.h", Attributes = [NoThrow, RequireDeclaration] in {
+  def _mm_mfence : X86LibBuiltin<"void()">;
+}
+
+let Attributes = [NoThrow] in {
+  def pause : X86Builtin<"void()">;
+}
+
+let Header = "emmintrin.h", Attributes = [NoThrow, RequireDeclaration] in {
+  def _mm_pause : X86LibBuiltin<"void()">;
+}
+
+let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmuludq128 : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>)">;
+  def psraw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
+  def psrad128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+  def psrlw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
+  def psrld128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+  def psrlq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+  def psllw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
+  def pslld128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+  def psllq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+  def psllwi128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, int)">;
+  def pslldi128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, int)">;
+  def psllqi128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, int)">;
+  def psrlwi128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, int)">;
+  def psrldi128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, int)">;
+  def psrlqi128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, int)">;
+  def psrawi128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, int)">;
+  def psradi128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, int)">;
+  def pmaddwd128 : X86Builtin<"_Vector<4, int>(_Vector<8, short>, _Vector<8, short>)">;
+  def pslldqi128_byteshift : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Constant int)">;
+  def psrldqi128_byteshift : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Constant int)">;
+}
+
+let Features = "sse3", Attributes = [NoThrow] in {
+  def monitor : X86Builtin<"void(void const *, unsigned int, unsigned int)">;
+  def mwait : X86Builtin<"void(unsigned int, unsigned int)">;
+}
+
+let Features = "sse3", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def lddqu : X86Builtin<"_Vector<16, char>(char const *)">;
+}
+
+let Features = "ssse3", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def palignr128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Constant int)">;
+}
+
+let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def insertps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant char)">;
+  def pblendvb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Vector<16, char>)">;
+  def pblendw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Constant int)">;
+  def blendpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
+  def blendps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
+  def blendvpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
+  def blendvps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
+  def packusdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">;
+  def pmuldq128 : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>)">;
+  def roundps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int)">;
+  def roundss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
+  def roundsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
+  def roundpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Constant int)">;
+  def dpps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant char)">;
+  def dppd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant char)">;
+  def ptestz128 : X86Builtin<"int(_Vector<2, long long int>, _Vector<2, long long int>)">;
+  def ptestc128 : X86Builtin<"int(_Vector<2, long long int>, _Vector<2, long long int>)">;
+  def ptestnzc128 : X86Builtin<"int(_Vector<2, long long int>, _Vector<2, long long int>)">;
+  def mpsadbw128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
+  def phminposuw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>)">;
+  def vec_ext_v16qi : X86Builtin<"char(_Vector<16, char>, _Constant int)">;
+  def vec_set_v16qi : X86Builtin<"_Vector<16, char>(_Vector<16, char>, char, _Constant int)">;
+  def vec_set_v4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, int, _Constant int)">;
+}
+
+let Features = "sse4.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pcmpistrm128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
+  def pcmpistri128 : X86Builtin<"int(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
+  def pcmpestrm128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, int, _Vector<16, char>, int, _Constant char)">;
+  def pcmpestri128 : X86Builtin<"int(_Vector<16, char>, int, _Vector<16, char>, int, _Constant char)">;
+  def pcmpistria128 : X86Builtin<"int(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
+  def pcmpistric128 : X86Builtin<"int(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
+  def pcmpistrio128 : X86Builtin<"int(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
+  def pcmpistris128 : X86Builtin<"int(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
+  def pcmpistriz128 : X86Builtin<"int(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
+  def pcmpestria128 : X86Builtin<"int(_Vector<16, char>, int, _Vector<16, char>, int, _Constant char)">;
+  def pcmpestric128 : X86Builtin<"int(_Vector<16, char>, int, _Vector<16, char>, int, _Constant char)">;
+  def pcmpestrio128 : X86Builtin<"int(_Vector<16, char>, int, _Vector<16, char>, int, _Constant char)">;
+  def pcmpestris128 : X86Builtin<"int(_Vector<16, char>, int, _Vector<16, char>, int, _Constant char)">;
+  def pcmpestriz128 : X86Builtin<"int(_Vector<16, char>, int, _Vector<16, char>, int, _Constant char)">;
+}
+
+let Features = "crc32", Attributes = [NoThrow, Const] in {
+  def crc32qi : X86Builtin<"unsigned int(unsigned int, unsigned char)">;
+  def crc32hi : X86Builtin<"unsigned int(unsigned int, unsigned short)">;
+  def crc32si : X86Builtin<"unsigned int(unsigned int, unsigned int)">;
+}
+
+let Features = "sse4a", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def extrqi : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Constant char, _Constant char)">;
+  def extrq : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<16, char>)">;
+  def insertqi : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Constant char, _Constant char)">;
+  def insertq : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "sse4a", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def movntsd : X86Builtin<"void(double *, _Vector<2, double>)">;
+  def movntss : X86Builtin<"void(float *, _Vector<4, float>)">;
+}
+
+let Features = "aes", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def aesenc128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+  def aesenclast128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+  def aesdec128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+  def aesdeclast128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+  def aesimc128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>)">;
+  def aeskeygenassist128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Constant char)">;
+}
+
+let Features = "vaes", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def aesenc256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512f,evex512,vaes", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def aesenc512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>)">;
+}
+
+let Features = "vaes", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def aesenclast256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512f,evex512,vaes", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def aesenclast512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>)">;
+}
+
+let Features = "vaes", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def aesdec256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512f,evex512,vaes", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def aesdec512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>)">;
+}
+
+let Features = "vaes", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def aesdeclast256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512f,evex512,vaes", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def aesdeclast512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>)">;
+}
+
+let Features = "gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vgf2p8affineinvqb_v16qi : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
+}
+
+let Features = "avx,gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vgf2p8affineinvqb_v32qi : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant char)">;
+}
+
+let Features = "avx512f,evex512,gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vgf2p8affineinvqb_v64qi : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, _Constant char)">;
+}
+
+let Features = "gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vgf2p8affineqb_v16qi : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
+}
+
+let Features = "avx,gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vgf2p8affineqb_v32qi : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant char)">;
+}
+
+let Features = "avx512f,evex512,gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vgf2p8affineqb_v64qi : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, _Constant char)">;
+}
+
+let Features = "gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vgf2p8mulb_v16qi : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
+}
+
+let Features = "avx,gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vgf2p8mulb_v32qi : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;
+}
+
+let Features = "avx512f,evex512,gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vgf2p8mulb_v64qi : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
+}
+
+let Features = "pclmul", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pclmulqdq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Constant char)">;
+}
+
+let Features = "vpclmulqdq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pclmulqdq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant char)">;
+}
+
+let Features = "avx512f,evex512,vpclmulqdq", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pclmulqdq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Constant char)">;
+}
+
+let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpermilvarpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, long long int>)">;
+  def vpermilvarps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, int>)">;
+  def vpermilvarpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, long long int>)">;
+  def vpermilvarps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>)">;
+  def blendpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
+  def blendps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
+  def blendvpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>)">;
+  def blendvps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>)">;
+  def shufpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
+  def shufps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
+  def dpps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant char)">;
+  def cmppd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant char)">;
+  def cmpps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant char)">;
+  def vextractf128_pd256 : X86Builtin<"_Vector<2, double>(_Vector<4, double>, _Constant int)">;
+  def vextractf128_ps256 : X86Builtin<"_Vector<4, float>(_Vector<8, float>, _Constant int)">;
+  def vextractf128_si256 : X86Builtin<"_Vector<4, int>(_Vector<8, int>, _Constant int)">;
+  def cvtpd2ps256 : X86Builtin<"_Vector<4, float>(_Vector<4, double>)">;
+  def cvtps2dq256 : X86Builtin<"_Vector<8, int>(_Vector<8, float>)">;
+  def cvttpd2dq256 : X86Builtin<"_Vector<4, int>(_Vector<4, double>)">;
+  def cvtpd2dq256 : X86Builtin<"_Vector<4, int>(_Vector<4, double>)">;
+  def cvttps2dq256 : X86Builtin<"_Vector<8, int>(_Vector<8, float>)">;
+  def vperm2f128_pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
+  def vperm2f128_ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
+  def vperm2f128_si256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Constant int)">;
+}
+
+let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpermilpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Constant int)">;
+  def vpermilps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int)">;
+}
+
+let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpermilpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">;
+  def vpermilps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Constant int)">;
+  def vinsertf128_pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<2, double>, _Constant int)">;
+  def vinsertf128_ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<4, float>, _Constant int)">;
+  def vinsertf128_si256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>, _Constant int)">;
+  def sqrtpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>)">;
+  def sqrtps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
+  def rsqrtps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
+  def rcpps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
+  def roundpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">;
+  def roundps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Constant int)">;
+}
+
+let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vtestzpd : X86Builtin<"int(_Vector<2, double>, _Vector<2, double>)">;
+  def vtestcpd : X86Builtin<"int(_Vector<2, double>, _Vector<2, double>)">;
+  def vtestnzcpd : X86Builtin<"int(_Vector<2, double>, _Vector<2, double>)">;
+  def vtestzps : X86Builtin<"int(_Vector<4, float>, _Vector<4, float>)">;
+  def vtestcps : X86Builtin<"int(_Vector<4, float>, _Vector<4, float>)">;
+  def vtestnzcps : X86Builtin<"int(_Vector<4, float>, _Vector<4, float>)">;
+}
+
+let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vtestzpd256 : X86Builtin<"int(_Vector<4, double>, _Vector<4, double>)">;
+  def vtestcpd256 : X86Builtin<"int(_Vector<4, double>, _Vector<4, double>)">;
+  def vtestnzcpd256 : X86Builtin<"int(_Vector<4, double>, _Vector<4, double>)">;
+  def vtestzps256 : X86Builtin<"int(_Vector<8, float>, _Vector<8, float>)">;
+  def vtestcps256 : X86Builtin<"int(_Vector<8, float>, _Vector<8, float>)">;
+  def vtestnzcps256 : X86Builtin<"int(_Vector<8, float>, _Vector<8, float>)">;
+  def ptestz256 : X86Builtin<"int(_Vector<4, long long int>, _Vector<4, long long int>)">;
+  def ptestc256 : X86Builtin<"int(_Vector<4, long long int>, _Vector<4, long long int>)">;
+  def ptestnzc256 : X86Builtin<"int(_Vector<4, long long int>, _Vector<4, long long int>)">;
+  def movmskpd256 : X86Builtin<"int(_Vector<4, double>)">;
+  def movmskps256 : X86Builtin<"int(_Vector<8, float>)">;
+}
+
+let Features = "avx", Attributes = [NoThrow] in {
+  def vzeroall : X86Builtin<"void()">;
+  def vzeroupper : X86Builtin<"void()">;
+}
+
+let Features = "avx", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def lddqu256 : X86Builtin<"_Vector<32, char>(char const *)">;
+}
+
+let Features = "avx", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def maskloadpd : X86Builtin<"_Vector<2, double>(_Vector<2, double const *>, _Vector<2, long long int>)">;
+  def maskloadps : X86Builtin<"_Vector<4, float>(_Vector<4, float const *>, _Vector<4, int>)">;
+}
+
+let Features = "avx", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def maskloadpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double const *>, _Vector<4, long long int>)">;
+  def maskloadps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float const *>, _Vector<8, int>)">;
+}
+
+let Features = "avx", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def maskstorepd : X86Builtin<"void(_Vector<2, double *>, _Vector<2, long long int>, _Vector<2, double>)">;
+  def maskstoreps : X86Builtin<"void(_Vector<4, float *>, _Vector<4, int>, _Vector<4, float>)">;
+}
+
+let Features = "avx", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def maskstorepd256 : X86Builtin<"void(_Vector<4, double *>, _Vector<4, long long int>, _Vector<4, double>)">;
+  def maskstoreps256 : X86Builtin<"void(_Vector<8, float *>, _Vector<8, int>, _Vector<8, float>)">;
+}
+
+let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vec_ext_v32qi : X86Builtin<"char(_Vector<32, char>, _Constant int)">;
+  def vec_ext_v16hi : X86Builtin<"short(_Vector<16, short>, _Constant int)">;
+  def vec_ext_v8si : X86Builtin<"int(_Vector<8, int>, _Constant int)">;
+  def vec_set_v32qi : X86Builtin<"_Vector<32, char>(_Vector<32, char>, char, _Constant int)">;
+  def vec_set_v16hi : X86Builtin<"_Vector<16, short>(_Vector<16, short>, short, _Constant int)">;
+  def vec_set_v8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, int, _Constant int)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def mpsadbw256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant char)">;
+  def packsswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">;
+  def packssdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">;
+  def packuswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">;
+  def packusdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">;
+  def palignr256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant int)">;
+  def pavgb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;
+  def pavgw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+  def pblendvb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Vector<32, char>)">;
+  def pblendw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Constant int)">;
+  def phaddw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+  def phaddd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
+  def phaddsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+  def phsubw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+  def phsubd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
+  def phsubsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+  def pmaddubsw256 : X86Builtin<"_Vector<16, short>(_Vector<32, char>, _Vector<32, char>)">;
+  def pmaddwd256 : X86Builtin<"_Vector<8, int>(_Vector<16, short>, _Vector<16, short>)">;
+  def pmovmskb256 : X86Builtin<"int(_Vector<32, char>)">;
+  def pmuldq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">;
+  def pmulhrsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+  def pmulhuw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+  def pmulhw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+  def pmuludq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">;
+  def psadbw256 : X86Builtin<"_Vector<4, long long int>(_Vector<32, char>, _Vector<32, char>)">;
+  def pshufb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;
+  def pshufd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Constant int)">;
+  def pshuflw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Constant int)">;
+  def pshufhw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Constant int)">;
+  def psignb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;
+  def psignw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+  def psignd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
+  def psllwi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, int)">;
+  def psllw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<8, short>)">;
+  def pslldqi256_byteshift : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Constant int)">;
+  def pslldi256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, int)">;
+  def pslld256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>)">;
+  def psllqi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, int)">;
+  def psllq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>)">;
+  def psrawi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, int)">;
+  def psraw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<8, short>)">;
+  def psradi256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, int)">;
+  def psrad256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>)">;
+  def psrldqi256_byteshift : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Constant int)">;
+  def psrlwi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, int)">;
+  def psrlw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<8, short>)">;
+  def psrldi256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, int)">;
+  def psrld256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>)">;
+  def psrlqi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, int)">;
+  def psrlq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>)">;
+  def pblendd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Constant int)">;
+  def pblendd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Constant int)">;
+  def permvarsi256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
+  def permdf256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">;
+  def permvarsf256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>)">;
+  def permti256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int)">;
+  def permdi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Constant int)">;
+  def extract128i256 : X86Builtin<"_Vector<2, long long int>(_Vector<4, long long int>, _Constant int)">;
+  def insert128i256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>, _Constant int)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def maskloadd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int const *>, _Vector<8, int>)">;
+  def maskloadq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int const *>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def maskloadd : X86Builtin<"_Vector<4, int>(_Vector<4, int const *>, _Vector<4, int>)">;
+  def maskloadq : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int const *>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def maskstored256 : X86Builtin<"void(_Vector<8, int *>, _Vector<8, int>, _Vector<8, int>)">;
+  def maskstoreq256 : X86Builtin<"void(_Vector<4, long long int *>, _Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def maskstored : X86Builtin<"void(_Vector<4, int *>, _Vector<4, int>, _Vector<4, int>)">;
+  def maskstoreq : X86Builtin<"void(_Vector<2, long long int *>, _Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def psllv8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def psllv4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def psllv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def psllv2di : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def psrav8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def psrav4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def psrlv8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def psrlv4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def psrlv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def psrlv2di : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def gatherd_pd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, double const *, _Vector<4, int>, _Vector<2, double>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def gatherd_pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, double const *, _Vector<4, int>, _Vector<4, double>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def gatherq_pd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, double const *, _Vector<2, long long int>, _Vector<2, double>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def gatherq_pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, double const *, _Vector<4, long long int>, _Vector<4, double>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def gatherd_ps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, float const *, _Vector<4, int>, _Vector<4, float>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def gatherd_ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, float const *, _Vector<8, int>, _Vector<8, float>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def gatherq_ps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, float const *, _Vector<2, long long int>, _Vector<4, float>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def gatherq_ps256 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, float const *, _Vector<4, long long int>, _Vector<4, float>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def gatherd_q : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, long long int const *, _Vector<4, int>, _Vector<2, long long int>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def gatherd_q256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, long long int const *, _Vector<4, int>, _Vector<4, long long int>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def gatherq_q : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, long long int const *, _Vector<2, long long int>, _Vector<2, long long int>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def gatherq_q256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, long long int const *, _Vector<4, long long int>, _Vector<4, long long int>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def gatherd_d : X86Builtin<"_Vector<4, int>(_Vector<4, int>, int const *, _Vector<4, int>, _Vector<4, int>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def gatherd_d256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, int const *, _Vector<8, int>, _Vector<8, int>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def gatherq_d : X86Builtin<"_Vector<4, int>(_Vector<4, int>, int const *, _Vector<2, long long int>, _Vector<4, int>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def gatherq_d256 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, int const *, _Vector<4, long long int>, _Vector<4, int>, _Constant char)">;
+}
+
+let Features = "f16c", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtps2ph : X86Builtin<"_Vector<8, short>(_Vector<4, float>, _Constant int)">;
+}
+
+let Features = "f16c", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtps2ph256 : X86Builtin<"_Vector<8, short>(_Vector<8, float>, _Constant int)">;
+}
+
+let Features = "f16c", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtph2ps : X86Builtin<"_Vector<4, float>(_Vector<8, short>)">;
+}
+
+let Features = "f16c", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtph2ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, short>)">;
+}
+
+let Features = "rdrnd", Attributes = [NoThrow] in {
+  def rdrand16_step : X86Builtin<"unsigned int(unsigned short *)">;
+  def rdrand32_step : X86Builtin<"unsigned int(unsigned int *)">;
+}
+
+let Features = "fxsr", Attributes = [NoThrow] in {
+  def fxrstor : X86Builtin<"void(void *)">;
+  def fxsave : X86Builtin<"void(void *)">;
+}
+
+let Features = "xsave", Attributes = [NoThrow] in {
+  def xsave : X86Builtin<"void(void *, unsigned long long int)">;
+  def xrstor : X86Builtin<"void(void *, unsigned long long int)">;
+  def xgetbv : X86Builtin<"unsigned long long int(unsigned int)">;
+}
+
+let Header = "immintrin.h", Languages = "ALL_MS_LANGUAGES", Attributes = [NoThrow, RequireDeclaration] in {
+  def _xgetbv : X86LibBuiltin<"uint64_t(unsigned int)">;
+}
+
+let Features = "xsave", Attributes = [NoThrow] in {
+  def xsetbv : X86Builtin<"void(unsigned int, unsigned long long int)">;
+}
+
+let Header = "immintrin.h", Languages = "ALL_MS_LANGUAGES", Attributes = [NoThrow, RequireDeclaration] in {
+  def _xsetbv : X86LibBuiltin<"void(unsigned int, uint64_t)">;
+}
+
+let Features = "xsaveopt", Attributes = [NoThrow] in {
+  def xsaveopt : X86Builtin<"void(void *, unsigned long long int)">;
+}
+
+let Features = "xsaves", Attributes = [NoThrow] in {
+  def xrstors : X86Builtin<"void(void *, unsigned long long int)">;
+}
+
+let Features = "xsavec", Attributes = [NoThrow] in {
+  def xsavec : X86Builtin<"void(void *, unsigned long long int)">;
+}
+
+let Features = "xsaves", Attributes = [NoThrow] in {
+  def xsaves : X86Builtin<"void(void *, unsigned long long int)">;
+}
+
+// SHSTK (CET shadow stack): shadow-stack pointer manipulation and
+// shadow-stack writes. None of these are Const -- they all have side
+// effects on processor state.
+let Features = "shstk", Attributes = [NoThrow] in {
+  def incsspd : X86Builtin<"void(unsigned int)">;
+  def rdsspd : X86Builtin<"unsigned int(unsigned int)">;
+  def saveprevssp : X86Builtin<"void()">;
+  def rstorssp : X86Builtin<"void(void *)">;
+  def wrssd : X86Builtin<"void(unsigned int, void *)">;
+  def wrussd : X86Builtin<"void(unsigned int, void *)">;
+  def setssbsy : X86Builtin<"void()">;
+  def clrssbsy : X86Builtin<"void(void *)">;
+}
+
+// Cache-line maintenance: optimized flush and writeback take a const
+// pointer (the pointed-to data is only read).
+let Features = "clflushopt", Attributes = [NoThrow] in {
+  def clflushopt : X86Builtin<"void(void const *)">;
+}
+
+let Features = "clwb", Attributes = [NoThrow] in {
+  def clwb : X86Builtin<"void(void const *)">;
+}
+
+// wbinvd has no Features gate: it is available on every x86 target.
+let Attributes = [NoThrow] in {
+  def wbinvd : X86Builtin<"void()">;
+}
+
+let Features = "wbnoinvd", Attributes = [NoThrow] in {
+  def wbnoinvd : X86Builtin<"void()">;
+}
+
+// Add-with-carry / subtract-with-borrow: Constexpr allows these to be
+// evaluated in constant expressions; the carry/borrow out is returned
+// and the sum/difference is written through the pointer.
+let Attributes = [NoThrow, Constexpr] in {
+  def addcarryx_u32 : X86Builtin<"unsigned char(unsigned char, unsigned int, unsigned int, unsigned int *)">;
+  def subborrow_u32 : X86Builtin<"unsigned char(unsigned char, unsigned int, unsigned int, unsigned int *)">;
+}
+
+// RDSEED: same out-parameter/status-return shape as the RDRAND steps.
+let Features = "rdseed", Attributes = [NoThrow] in {
+  def rdseed16_step : X86Builtin<"unsigned int(unsigned short *)">;
+  def rdseed32_step : X86Builtin<"unsigned int(unsigned int *)">;
+}
+
+// Bit-manipulation builtins. All are pure (Const) and usable in
+// constant expressions (Constexpr).
+let Features = "lzcnt", Attributes = [NoThrow, Const, Constexpr] in {
+  def lzcnt_u16 : X86Builtin<"unsigned short(unsigned short)">;
+  def lzcnt_u32 : X86Builtin<"unsigned int(unsigned int)">;
+}
+
+let Features = "bmi", Attributes = [NoThrow, Const, Constexpr] in {
+  def bextr_u32 : X86Builtin<"unsigned int(unsigned int, unsigned int)">;
+}
+
+// tzcnt intentionally has no Features gate (it decays to BSF-compatible
+// behavior, presumably -- confirm against the header docs).
+let Attributes = [NoThrow, Const, Constexpr] in {
+  def tzcnt_u16 : X86Builtin<"unsigned short(unsigned short)">;
+  def tzcnt_u32 : X86Builtin<"unsigned int(unsigned int)">;
+}
+
+let Features = "bmi2", Attributes = [NoThrow, Const, Constexpr] in {
+  def bzhi_si : X86Builtin<"unsigned int(unsigned int, unsigned int)">;
+  def pdep_si : X86Builtin<"unsigned int(unsigned int, unsigned int)">;
+  def pext_si : X86Builtin<"unsigned int(unsigned int, unsigned int)">;
+}
+
+// TBM bextri: the control operand is `_Constant`, i.e. it must be a
+// compile-time constant (it becomes an instruction immediate).
+let Features = "tbm", Attributes = [NoThrow, Const, Constexpr] in {
+  def bextri_u32 : X86Builtin<"unsigned int(unsigned int, _Constant unsigned int)">;
+}
+
+// LWP (Lightweight Profiling): stateful, hence no Const.
+let Features = "lwp", Attributes = [NoThrow] in {
+  def llwpcb : X86Builtin<"void(void *)">;
+  def slwpcb : X86Builtin<"void *()">;
+  def lwpins32 : X86Builtin<"unsigned char(unsigned int, unsigned int, _Constant unsigned int)">;
+  def lwpval32 : X86Builtin<"void(unsigned int, unsigned int, _Constant unsigned int)">;
+}
+
+// SHA-NI: SHA-1/SHA-256 round and message-schedule helpers on v4i32.
+// sha1rnds4 takes a _Constant immediate selecting the round function.
+let Features = "sha", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def sha1rnds4 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Constant char)">;
+  def sha1nexte : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+  def sha1msg1 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+  def sha1msg2 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+  def sha256rnds2 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+  def sha256msg1 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+  def sha256msg2 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+}
+
+// Fused multiply-add. "fma|fma4" means EITHER feature enables the
+// builtin; the *ss3/*sd3 scalar forms are FMA3-only while vfmaddss/sd
+// are FMA4-only (the two ISAs differ in scalar lane semantics,
+// presumably -- hence the separate records).
+let Features = "fma|fma4", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vfmaddps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
+  def vfmaddpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
+}
+
+let Features = "fma", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vfmaddss3 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
+  def vfmaddsd3 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
+}
+
+let Features = "fma4", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vfmaddss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
+  def vfmaddsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
+}
+
+let Features = "fma|fma4", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vfmaddsubps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
+  def vfmaddsubpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
+}
+
+let Features = "fma|fma4", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vfmaddps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>)">;
+  def vfmaddpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>)">;
+  def vfmaddsubps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>)">;
+  def vfmaddsubpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>)">;
+}
+
+// AVX-512 512-bit FMA: each op comes in _mask/_maskz/_mask3 flavors
+// (merge-, zero-, and operand-3-masking); the unsigned char/short is the
+// per-element write mask and the trailing _Constant int is an immediate
+// (rounding control, presumably -- confirm against avx512fintrin.h).
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vfmaddpd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Vector<8, double>, unsigned char, _Constant int)">;
+  def vfmaddpd512_maskz : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Vector<8, double>, unsigned char, _Constant int)">;
+  def vfmaddpd512_mask3 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Vector<8, double>, unsigned char, _Constant int)">;
+  def vfmsubpd512_mask3 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Vector<8, double>, unsigned char, _Constant int)">;
+  def vfmaddps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def vfmaddps512_maskz : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def vfmaddps512_mask3 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def vfmsubps512_mask3 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def vfmaddsubpd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Vector<8, double>, unsigned char, _Constant int)">;
+  def vfmaddsubpd512_maskz : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Vector<8, double>, unsigned char, _Constant int)">;
+  def vfmaddsubpd512_mask3 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Vector<8, double>, unsigned char, _Constant int)">;
+  def vfmsubaddpd512_mask3 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Vector<8, double>, unsigned char, _Constant int)">;
+  def vfmaddsubps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def vfmaddsubps512_maskz : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def vfmaddsubps512_mask3 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def vfmsubaddps512_mask3 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+}
+
+// AMD XOP. Groups, by naming convention: vpmacs* multiply-accumulate
+// (widening where the result element is wider than the inputs, e.g.
+// v8i16 x v8i16 -> v4i32); vphadd*/vphsub* horizontal add/sub with
+// widening (the 'u' forms take unsigned sources); vpperm byte permute;
+// vprot* rotates and vpshl*/vpsha* shifts (per-element vector counts,
+// or a _Constant immediate for the *i rotate forms); vpcom*/vpcomu*
+// compares whose _Constant char selects the predicate; vpermil2*
+// two-source permutes with an immediate control.
+let Features = "xop", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpmacssww : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Vector<8, short>)">;
+  def vpmacsww : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Vector<8, short>)">;
+  def vpmacsswd : X86Builtin<"_Vector<4, int>(_Vector<8, short>, _Vector<8, short>, _Vector<4, int>)">;
+  def vpmacswd : X86Builtin<"_Vector<4, int>(_Vector<8, short>, _Vector<8, short>, _Vector<4, int>)">;
+  def vpmacssdd : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+  def vpmacsdd : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+  def vpmacssdql : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>, _Vector<2, long long int>)">;
+  def vpmacsdql : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>, _Vector<2, long long int>)">;
+  def vpmacssdqh : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>, _Vector<2, long long int>)">;
+  def vpmacsdqh : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>, _Vector<2, long long int>)">;
+  def vpmadcsswd : X86Builtin<"_Vector<4, int>(_Vector<8, short>, _Vector<8, short>, _Vector<4, int>)">;
+  def vpmadcswd : X86Builtin<"_Vector<4, int>(_Vector<8, short>, _Vector<8, short>, _Vector<4, int>)">;
+  def vphaddbw : X86Builtin<"_Vector<8, short>(_Vector<16, char>)">;
+  def vphaddbd : X86Builtin<"_Vector<4, int>(_Vector<16, char>)">;
+  def vphaddbq : X86Builtin<"_Vector<2, long long int>(_Vector<16, char>)">;
+  def vphaddwd : X86Builtin<"_Vector<4, int>(_Vector<8, short>)">;
+  def vphaddwq : X86Builtin<"_Vector<2, long long int>(_Vector<8, short>)">;
+  def vphadddq : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>)">;
+  def vphaddubw : X86Builtin<"_Vector<8, short>(_Vector<16, char>)">;
+  def vphaddubd : X86Builtin<"_Vector<4, int>(_Vector<16, char>)">;
+  def vphaddubq : X86Builtin<"_Vector<2, long long int>(_Vector<16, char>)">;
+  def vphadduwd : X86Builtin<"_Vector<4, int>(_Vector<8, short>)">;
+  def vphadduwq : X86Builtin<"_Vector<2, long long int>(_Vector<8, short>)">;
+  def vphaddudq : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>)">;
+  def vphsubbw : X86Builtin<"_Vector<8, short>(_Vector<16, char>)">;
+  def vphsubwd : X86Builtin<"_Vector<4, int>(_Vector<8, short>)">;
+  def vphsubdq : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>)">;
+  def vpperm : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Vector<16, char>)">;
+  def vprotb : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
+  def vprotw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
+  def vprotd : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+  def vprotq : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+  def vprotbi : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Constant char)">;
+  def vprotwi : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant char)">;
+  def vprotdi : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Constant char)">;
+  def vprotqi : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Constant char)">;
+  def vpshlb : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
+  def vpshlw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
+  def vpshld : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+  def vpshlq : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+  def vpshab : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
+  def vpshaw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
+  def vpshad : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+  def vpshaq : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+  def vpcomub : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
+  def vpcomuw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Constant char)">;
+  def vpcomud : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Constant char)">;
+  def vpcomuq : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Constant char)">;
+  def vpcomb : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
+  def vpcomw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Constant char)">;
+  def vpcomd : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Constant char)">;
+  def vpcomq : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Constant char)">;
+  def vpermil2pd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, long long int>, _Constant char)">;
+}
+
+let Features = "xop", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpermil2pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, long long int>, _Constant char)">;
+}
+
+let Features = "xop", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpermil2ps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, int>, _Constant char)">;
+}
+
+let Features = "xop", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpermil2ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, int>, _Constant char)">;
+}
+
+// vfrcz*: extract-fraction ops; unary, same type in and out.
+let Features = "xop", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vfrczss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
+  def vfrczsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">;
+  def vfrczps : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
+  def vfrczpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">;
+}
+
+let Features = "xop", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vfrczps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
+  def vfrczpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>)">;
+}
+
+// RTM (TSX restricted transactional memory): transaction begin/end/
+// abort/test. xabort's _Constant char is the abort-status immediate.
+let Features = "rtm", Attributes = [NoThrow] in {
+  def xbegin : X86Builtin<"int()">;
+  def xend : X86Builtin<"void()">;
+  def xabort : X86Builtin<"void(_Constant char)">;
+  def xtest : X86Builtin<"int()">;
+}
+
+let Features = "rdpid", Attributes = [NoThrow] in {
+  def rdpid : X86Builtin<"unsigned int()">;
+}
+
+// EnableOpenCLLong = 0: opt this record out of the OpenCL long-type
+// handling the emitter otherwise applies, presumably -- confirm against
+// the X86Builtin TableGen class definition.
+let Features = "rdpru", Attributes = [NoThrow], EnableOpenCLLong = 0 in {
+  def rdpru : X86Builtin<"unsigned long long int(int)">;
+}
+
+// PKU: protection-key register read/write.
+let Features = "pku", Attributes = [NoThrow] in {
+  def rdpkru : X86Builtin<"unsigned int()">;
+  def wrpkru : X86Builtin<"void(unsigned int)">;
+}
+
+// AVX-512 math. "avx512f,evex512" is a conjunction: both features are
+// required (512-bit EVEX encodings). Trailing _Constant int immediates
+// on the 512-bit ops are rounding/SAE controls, presumably -- confirm
+// against avx512fintrin.h.
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def sqrtpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Constant int)">;
+  def sqrtps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Constant int)">;
+}
+
+// *_mask scalar/packed approximations: (src ops..., passthrough, mask).
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rsqrt14sd_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char)">;
+  def rsqrt14ss_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def rsqrt14pd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, unsigned char)">;
+  def rsqrt14ps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, unsigned short)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rcp14sd_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char)">;
+  def rcp14ss_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char)">;
+}
+
+// Truncating conversions and mask-producing compares: cmpps512_mask
+// returns the comparison result as a 16-bit mask value.
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def rcp14pd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, unsigned char)">;
+  def rcp14ps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, unsigned short)">;
+  def cvttps2dq512_mask : X86Builtin<"_Vector<16, int>(_Vector<16, float>, _Vector<16, int>, unsigned short, _Constant int)">;
+  def cvttps2udq512_mask : X86Builtin<"_Vector<16, int>(_Vector<16, float>, _Vector<16, int>, unsigned short, _Constant int)">;
+  def cvttpd2dq512_mask : X86Builtin<"_Vector<8, int>(_Vector<8, double>, _Vector<8, int>, unsigned char, _Constant int)">;
+  def cvttpd2udq512_mask : X86Builtin<"_Vector<8, int>(_Vector<8, double>, _Vector<8, int>, unsigned char, _Constant int)">;
+  def cmpps512_mask : X86Builtin<"unsigned short(_Vector<16, float>, _Vector<16, float>, _Constant int, unsigned short, _Constant int)">;
+}
+
+// AVX512VL compare variants (no SAE immediate at 128/256 bits).
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cmpps256_mask : X86Builtin<"unsigned char(_Vector<8, float>, _Vector<8, float>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cmpps128_mask : X86Builtin<"unsigned char(_Vector<4, float>, _Vector<4, float>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def cmppd512_mask : X86Builtin<"unsigned char(_Vector<8, double>, _Vector<8, double>, _Constant int, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cmppd256_mask : X86Builtin<"unsigned char(_Vector<4, double>, _Vector<4, double>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cmppd128_mask : X86Builtin<"unsigned char(_Vector<2, double>, _Vector<2, double>, _Constant int, unsigned char)">;
+}
+
+// Rounding-scale, conversions, min/max and widening multiplies.
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def rndscaleps_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Constant int, _Vector<16, float>, unsigned short, _Constant int)">;
+  def rndscalepd_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Constant int, _Vector<8, double>, unsigned char, _Constant int)">;
+  def cvtps2dq512_mask : X86Builtin<"_Vector<16, int>(_Vector<16, float>, _Vector<16, int>, unsigned short, _Constant int)">;
+  def cvtpd2dq512_mask : X86Builtin<"_Vector<8, int>(_Vector<8, double>, _Vector<8, int>, unsigned char, _Constant int)">;
+  def cvtps2udq512_mask : X86Builtin<"_Vector<16, int>(_Vector<16, float>, _Vector<16, int>, unsigned short, _Constant int)">;
+  def cvtpd2udq512_mask : X86Builtin<"_Vector<8, int>(_Vector<8, double>, _Vector<8, int>, unsigned char, _Constant int)">;
+  def minps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int)">;
+  def minpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Constant int)">;
+  def maxps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int)">;
+  def maxpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Constant int)">;
+  def cvtdq2ps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, int>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def cvtudq2ps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, int>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def cvtpd2ps512_mask : X86Builtin<"_Vector<8, float>(_Vector<8, double>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vcvtps2ph512_mask : X86Builtin<"_Vector<16, short>(_Vector<16, float>, _Constant int, _Vector<16, short>, unsigned short)">;
+  def vcvtph2ps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, short>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def pmuldq512 : X86Builtin<"_Vector<8, long long int>(_Vector<16, int>, _Vector<16, int>)">;
+  def pmuludq512 : X86Builtin<"_Vector<8, long long int>(_Vector<16, int>, _Vector<16, int>)">;
+}
+
+// Masked loads/stores (memory access, so Const is absent). Shape:
+// load(ptr, passthrough, mask) / store(ptr, value, mask).
+// NOTE(review): the aligned forms spell the pointer as
+// `_Vector<16, float const *>` -- i.e. the `const *` applies to the
+// whole vector type (pointer to const vector), unlike the unaligned
+// forms which take a plain element pointer; confirm against the
+// prototype-string grammar.
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def loaddqusi512_mask : X86Builtin<"_Vector<16, int>(int const *, _Vector<16, int>, unsigned short)">;
+  def loaddqudi512_mask : X86Builtin<"_Vector<8, long long int>(long long int const *, _Vector<8, long long int>, unsigned char)">;
+  def loadups512_mask : X86Builtin<"_Vector<16, float>(float const *, _Vector<16, float>, unsigned short)">;
+  def loadaps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float const *>, _Vector<16, float>, unsigned short)">;
+  def loadupd512_mask : X86Builtin<"_Vector<8, double>(double const *, _Vector<8, double>, unsigned char)">;
+  def loadapd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double const *>, _Vector<8, double>, unsigned char)">;
+  def storedqudi512_mask : X86Builtin<"void(long long int *, _Vector<8, long long int>, unsigned char)">;
+  def storedqusi512_mask : X86Builtin<"void(int *, _Vector<16, int>, unsigned short)">;
+  def storeupd512_mask : X86Builtin<"void(double *, _Vector<8, double>, unsigned char)">;
+  def storeapd512_mask : X86Builtin<"void(_Vector<8, double *>, _Vector<8, double>, unsigned char)">;
+  def storeups512_mask : X86Builtin<"void(float *, _Vector<16, float>, unsigned short)">;
+  def storeaps512_mask : X86Builtin<"void(_Vector<16, float *>, _Vector<16, float>, unsigned short)">;
+}
+
+// valign / extract with immediate control, at each vector width.
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def alignq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Constant int)">;
+  def alignd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def alignd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def alignd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def alignq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def alignq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def extractf64x4_mask : X86Builtin<"_Vector<4, double>(_Vector<8, double>, _Constant int, _Vector<4, double>, unsigned char)">;
+  def extractf32x4_mask : X86Builtin<"_Vector<4, float>(_Vector<16, float>, _Constant int, _Vector<4, float>, unsigned char)">;
+}
+
+// VNNI dot-product accumulate (vpdp*): signature is always
+// (accumulator, a, b) -> accumulator, element type int at every width.
+// The Features string mixes AND (comma) and OR (|):
+// "avx512vl,avx512vnni|avxvnni" admits either AVX512VL+AVX512VNNI or
+// the VEX-encoded AVXVNNI. One `let` group per record keeps each
+// width's RequiredVectorWidth distinct.
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpdpbusd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpdpbusd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx512vnni,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpdpbusd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+}
+
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpdpbusds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpdpbusds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx512vnni,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpdpbusds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+}
+
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpdpwssd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpdpwssd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx512vnni,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpdpwssd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+}
+
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpdpwssds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpdpwssds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx512vnni,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpdpwssds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+}
+
+// AVX-VNNI-INT8 (also reachable via AVX10.2/256): signed/unsigned
+// byte-source dot products, saturating ('s' suffix) and non-saturating.
+let Features = "avxvnniint8|avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpdpbssd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avxvnniint8|avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpdpbssd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avxvnniint8|avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpdpbssds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avxvnniint8|avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpdpbssds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avxvnniint8|avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpdpbsud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avxvnniint8|avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpdpbsud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avxvnniint8|avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpdpbsuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avxvnniint8|avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpdpbsuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avxvnniint8|avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpdpbuud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avxvnniint8|avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpdpbuud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avxvnniint8|avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpdpbuuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avxvnniint8|avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpdpbuuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+// MOVRS read-shared prefetch hint; Const despite touching memory
+// because a prefetch has no architecturally visible effect.
+let Features = "movrs", Attributes = [NoThrow, Const] in {
+  def prefetchrs : X86Builtin<"void(void const *)">;
+}
+
+// AVX-512 gathers. Naming: gather3div* use 64-bit (qword) indices,
+// gather3siv*/gathersiv* use 32-bit (dword) indices. Shape is
+// (passthrough, base ptr, index vector, mask, _Constant scale imm).
+// Memory ops, so Const is absent.
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def gather3div2df : X86Builtin<"_Vector<2, double>(_Vector<2, double>, void const *, _Vector<2, long long int>, unsigned char, _Constant int)">;
+  def gather3div2di : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, void const *, _Vector<2, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def gather3div4df : X86Builtin<"_Vector<4, double>(_Vector<4, double>, void const *, _Vector<4, long long int>, unsigned char, _Constant int)">;
+  def gather3div4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, void const *, _Vector<4, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def gather3div4sf : X86Builtin<"_Vector<4, float>(_Vector<4, float>, void const *, _Vector<2, long long int>, unsigned char, _Constant int)">;
+  def gather3div4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, void const *, _Vector<2, long long int>, unsigned char, _Constant int)">;
+}
+
+// Qword indices with 32-bit elements: only as many elements as indices
+// are gathered, so the result stays a 4-element vector.
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def gather3div8sf : X86Builtin<"_Vector<4, float>(_Vector<4, float>, void const *, _Vector<4, long long int>, unsigned char, _Constant int)">;
+  def gather3div8si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, void const *, _Vector<4, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def gather3siv2df : X86Builtin<"_Vector<2, double>(_Vector<2, double>, void const *, _Vector<4, int>, unsigned char, _Constant int)">;
+  def gather3siv2di : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, void const *, _Vector<4, int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def gather3siv4df : X86Builtin<"_Vector<4, double>(_Vector<4, double>, void const *, _Vector<4, int>, unsigned char, _Constant int)">;
+  def gather3siv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, void const *, _Vector<4, int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def gather3siv4sf : X86Builtin<"_Vector<4, float>(_Vector<4, float>, void const *, _Vector<4, int>, unsigned char, _Constant int)">;
+  def gather3siv4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, void const *, _Vector<4, int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def gather3siv8sf : X86Builtin<"_Vector<8, float>(_Vector<8, float>, void const *, _Vector<8, int>, unsigned char, _Constant int)">;
+  def gather3siv8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, void const *, _Vector<8, int>, unsigned char, _Constant int)">;
+}
+
+// 512-bit gathers and scatters. Scatter shape is
+// (base ptr, mask, index vector, value vector, _Constant scale imm).
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def gathersiv8df : X86Builtin<"_Vector<8, double>(_Vector<8, double>, void const *, _Vector<8, int>, unsigned char, _Constant int)">;
+  def gathersiv16sf : X86Builtin<"_Vector<16, float>(_Vector<16, float>, void const *, _Vector<16, int>, unsigned short, _Constant int)">;
+  def gatherdiv8df : X86Builtin<"_Vector<8, double>(_Vector<8, double>, void const *, _Vector<8, long long int>, unsigned char, _Constant int)">;
+  def gatherdiv16sf : X86Builtin<"_Vector<8, float>(_Vector<8, float>, void const *, _Vector<8, long long int>, unsigned char, _Constant int)">;
+  def gathersiv8di : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, void const *, _Vector<8, int>, unsigned char, _Constant int)">;
+  def gathersiv16si : X86Builtin<"_Vector<16, int>(_Vector<16, int>, void const *, _Vector<16, int>, unsigned short, _Constant int)">;
+  def gatherdiv8di : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, void const *, _Vector<8, long long int>, unsigned char, _Constant int)">;
+  def gatherdiv16si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, void const *, _Vector<8, long long int>, unsigned char, _Constant int)">;
+  def scattersiv8df : X86Builtin<"void(void *, unsigned char, _Vector<8, int>, _Vector<8, double>, _Constant int)">;
+  def scattersiv16sf : X86Builtin<"void(void *, unsigned short, _Vector<16, int>, _Vector<16, float>, _Constant int)">;
+  def scatterdiv8df : X86Builtin<"void(void *, unsigned char, _Vector<8, long long int>, _Vector<8, double>, _Constant int)">;
+  def scatterdiv16sf : X86Builtin<"void(void *, unsigned char, _Vector<8, long long int>, _Vector<8, float>, _Constant int)">;
+  def scattersiv8di : X86Builtin<"void(void *, unsigned char, _Vector<8, int>, _Vector<8, long long int>, _Constant int)">;
+  def scattersiv16si : X86Builtin<"void(void *, unsigned short, _Vector<16, int>, _Vector<16, int>, _Constant int)">;
+  def scatterdiv8di : X86Builtin<"void(void *, unsigned char, _Vector<8, long long int>, _Vector<8, long long int>, _Constant int)">;
+  def scatterdiv16si : X86Builtin<"void(void *, unsigned char, _Vector<8, long long int>, _Vector<8, int>, _Constant int)">;
+}
+
+// Mask-register NOT, one per mask width: 8-bit needs AVX512DQ, 16-bit
+// AVX512F, 32/64-bit AVX512BW. No RequiredVectorWidth -- these operate
+// on mask values, not vectors.
+let Features = "avx512dq", Attributes = [NoThrow, Const] in {
+  def knotqi : X86Builtin<"unsigned char(unsigned char)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const] in {
+  def knothi : X86Builtin<"unsigned short(unsigned short)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const] in {
+  def knotsi : X86Builtin<"unsigned int(unsigned int)">;
+  def knotdi : X86Builtin<"unsigned long long int(unsigned long long int)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cmpb128_mask : X86Builtin<"unsigned short(_Vector<16, char>, _Vector<16, char>, _Constant int, unsigned short)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cmpd128_mask : X86Builtin<"unsigned char(_Vector<4, int>, _Vector<4, int>, _Constant int, unsigned char)">;
+  def cmpq128_mask : X86Builtin<"unsigned char(_Vector<2, long long int>, _Vector<2, long long int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cmpw128_mask : X86Builtin<"unsigned char(_Vector<8, short>, _Vector<8, short>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cmpb256_mask : X86Builtin<"unsigned int(_Vector<32, char>, _Vector<32, char>, _Constant int, unsigned int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cmpd256_mask : X86Builtin<"unsigned char(_Vector<8, int>, _Vector<8, int>, _Constant int, unsigned char)">;
+  def cmpq256_mask : X86Builtin<"unsigned char(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cmpw256_mask : X86Builtin<"unsigned short(_Vector<16, short>, _Vector<16, short>, _Constant int, unsigned short)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def cmpb512_mask : X86Builtin<"unsigned long long int(_Vector<64, char>, _Vector<64, char>, _Constant int, unsigned long long int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def cmpd512_mask : X86Builtin<"unsigned short(_Vector<16, int>, _Vector<16, int>, _Constant int, unsigned short)">;
+  def cmpq512_mask : X86Builtin<"unsigned char(_Vector<8, long long int>, _Vector<8, long long int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def cmpw512_mask : X86Builtin<"unsigned int(_Vector<32, short>, _Vector<32, short>, _Constant int, unsigned int)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def ucmpb128_mask : X86Builtin<"unsigned short(_Vector<16, char>, _Vector<16, char>, _Constant int, unsigned short)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def ucmpd128_mask : X86Builtin<"unsigned char(_Vector<4, int>, _Vector<4, int>, _Constant int, unsigned char)">;
+  def ucmpq128_mask : X86Builtin<"unsigned char(_Vector<2, long long int>, _Vector<2, long long int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def ucmpw128_mask : X86Builtin<"unsigned char(_Vector<8, short>, _Vector<8, short>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def ucmpb256_mask : X86Builtin<"unsigned int(_Vector<32, char>, _Vector<32, char>, _Constant int, unsigned int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def ucmpd256_mask : X86Builtin<"unsigned char(_Vector<8, int>, _Vector<8, int>, _Constant int, unsigned char)">;
+  def ucmpq256_mask : X86Builtin<"unsigned char(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def ucmpw256_mask : X86Builtin<"unsigned short(_Vector<16, short>, _Vector<16, short>, _Constant int, unsigned short)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def ucmpb512_mask : X86Builtin<"unsigned long long int(_Vector<64, char>, _Vector<64, char>, _Constant int, unsigned long long int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def ucmpd512_mask : X86Builtin<"unsigned short(_Vector<16, int>, _Vector<16, int>, _Constant int, unsigned short)">;
+  def ucmpq512_mask : X86Builtin<"unsigned char(_Vector<8, long long int>, _Vector<8, long long int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def ucmpw512_mask : X86Builtin<"unsigned int(_Vector<32, short>, _Vector<32, short>, _Constant int, unsigned int)">;
+  def packssdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">;
+  def packsswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">;
+  def packusdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">;
+  def packuswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">;
+  def pavgb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
+  def pavgw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">;
+  def pshufb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
+}
+
+let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpconflictdi_128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>)">;
+}
+
+let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpconflictdi_256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>)">;
+}
+
+let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpconflictsi_128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>)">;
+}
+
+let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpconflictsi_256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>)">;
+}
+
+let Features = "avx512cd,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpconflictdi_512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>)">;
+  def vpconflictsi_512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>)">;
+  def vplzcntd_512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>)">;
+  def vplzcntq_512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>)">;
+}
+
+let Features = "avx512vl,avx512bitalg", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpshufbitqmb128_mask : X86Builtin<"unsigned short(_Vector<16, char>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx512vl,avx512bitalg", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpshufbitqmb256_mask : X86Builtin<"unsigned int(_Vector<32, char>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx512bitalg,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpshufbitqmb512_mask : X86Builtin<"unsigned long long int(_Vector<64, char>, _Vector<64, char>, unsigned long long int)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmulhrsw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">;
+  def pmulhuw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">;
+  def pmulhw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def addpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Constant int)">;
+  def addps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int)">;
+  def divpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Constant int)">;
+  def divps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int)">;
+  def mulpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Constant int)">;
+  def mulps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int)">;
+  def subpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Constant int)">;
+  def subps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmaddubsw512 : X86Builtin<"_Vector<32, short>(_Vector<64, char>, _Vector<64, char>)">;
+  def pmaddwd512 : X86Builtin<"_Vector<16, int>(_Vector<32, short>, _Vector<32, short>)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def addss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def divss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def mulss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def subss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def maxss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def minss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def addsd_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int)">;
+  def divsd_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int)">;
+  def mulsd_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int)">;
+  def subsd_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int)">;
+  def maxsd_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int)">;
+  def minsd_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def compressdf128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def compressdf256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def compressdi128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def compressdi256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def compresshi128_mask : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def compresshi256_mask : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def compressqi128_mask : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def compressqi256_mask : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def compresssf128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def compresssf256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def compresssi128_mask : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def compresssi256_mask : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def compressstoredf128_mask : X86Builtin<"void(_Vector<2, double *>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def compressstoredf256_mask : X86Builtin<"void(_Vector<4, double *>, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def compressstoredi128_mask : X86Builtin<"void(_Vector<2, long long int *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def compressstoredi256_mask : X86Builtin<"void(_Vector<4, long long int *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def compressstorehi128_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def compressstorehi256_mask : X86Builtin<"void(_Vector<16, short *>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def compressstoreqi128_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def compressstoreqi256_mask : X86Builtin<"void(_Vector<32, char *>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def compressstoresf128_mask : X86Builtin<"void(_Vector<4, float *>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def compressstoresf256_mask : X86Builtin<"void(_Vector<8, float *>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def compressstoresi128_mask : X86Builtin<"void(_Vector<4, int *>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def compressstoresi256_mask : X86Builtin<"void(_Vector<8, int *>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtpd2dq128_mask : X86Builtin<"_Vector<4, int>(_Vector<2, double>, _Vector<4, int>, unsigned char)">;
+  def cvtpd2ps_mask : X86Builtin<"_Vector<4, float>(_Vector<2, double>, _Vector<4, float>, unsigned char)">;
+  def cvtpd2udq128_mask : X86Builtin<"_Vector<4, int>(_Vector<2, double>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtpd2udq256_mask : X86Builtin<"_Vector<4, int>(_Vector<4, double>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtps2udq128_mask : X86Builtin<"_Vector<4, int>(_Vector<4, float>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtps2udq256_mask : X86Builtin<"_Vector<8, int>(_Vector<8, float>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvttpd2dq128_mask : X86Builtin<"_Vector<4, int>(_Vector<2, double>, _Vector<4, int>, unsigned char)">;
+  def cvttpd2udq128_mask : X86Builtin<"_Vector<4, int>(_Vector<2, double>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvttpd2udq256_mask : X86Builtin<"_Vector<4, int>(_Vector<4, double>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvttps2udq128_mask : X86Builtin<"_Vector<4, int>(_Vector<4, float>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvttps2udq256_mask : X86Builtin<"_Vector<8, int>(_Vector<8, float>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def expanddf128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def expanddf256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def expanddi128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def expanddi256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def expandhi128_mask : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def expandhi256_mask : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def expandqi128_mask : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def expandqi256_mask : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def expandloaddf128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double const *>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def expandloaddf256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double const *>, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def expandloaddi128_mask : X86Builtin<"_Vector<4, int>(_Vector<2, long long int const *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def expandloaddi256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int const *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def expandloadhi128_mask : X86Builtin<"_Vector<8, short>(_Vector<8, short const *>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def expandloadhi256_mask : X86Builtin<"_Vector<16, short>(_Vector<16, short const *>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def expandloadqi128_mask : X86Builtin<"_Vector<16, char>(_Vector<16, char const *>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def expandloadqi256_mask : X86Builtin<"_Vector<32, char>(_Vector<32, char const *>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def expandloadsf128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float const *>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def expandloadsf256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float const *>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def expandloadsi128_mask : X86Builtin<"_Vector<4, int>(_Vector<4, int const *>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def expandloadsi256_mask : X86Builtin<"_Vector<8, int>(_Vector<8, int const *>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def expandsf128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def expandsf256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def expandsi128_mask : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def expandsi256_mask : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def getexppd128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def getexppd256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def getexpps128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def getexpps256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rndscalepd_128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Constant int, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def rndscalepd_256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rndscaleps_128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def rndscaleps_256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Constant int, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def scalefpd128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def scalefpd256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def scalefps128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def scalefps256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def scatterdiv2df : X86Builtin<"void(void *, unsigned char, _Vector<2, long long int>, _Vector<2, double>, _Constant int)">;
+  def scatterdiv2di : X86Builtin<"void(void *, unsigned char, _Vector<2, long long int>, _Vector<2, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def scatterdiv4df : X86Builtin<"void(void *, unsigned char, _Vector<4, long long int>, _Vector<4, double>, _Constant int)">;
+  def scatterdiv4di : X86Builtin<"void(void *, unsigned char, _Vector<4, long long int>, _Vector<4, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def scatterdiv4sf : X86Builtin<"void(void *, unsigned char, _Vector<2, long long int>, _Vector<4, float>, _Constant int)">;
+  def scatterdiv4si : X86Builtin<"void(void *, unsigned char, _Vector<2, long long int>, _Vector<4, int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def scatterdiv8sf : X86Builtin<"void(void *, unsigned char, _Vector<4, long long int>, _Vector<4, float>, _Constant int)">;
+  def scatterdiv8si : X86Builtin<"void(void *, unsigned char, _Vector<4, long long int>, _Vector<4, int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def scattersiv2df : X86Builtin<"void(void *, unsigned char, _Vector<4, int>, _Vector<2, double>, _Constant int)">;
+  def scattersiv2di : X86Builtin<"void(void *, unsigned char, _Vector<4, int>, _Vector<2, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def scattersiv4df : X86Builtin<"void(void *, unsigned char, _Vector<4, int>, _Vector<4, double>, _Constant int)">;
+  def scattersiv4di : X86Builtin<"void(void *, unsigned char, _Vector<4, int>, _Vector<4, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def scattersiv4sf : X86Builtin<"void(void *, unsigned char, _Vector<4, int>, _Vector<4, float>, _Constant int)">;
+  def scattersiv4si : X86Builtin<"void(void *, unsigned char, _Vector<4, int>, _Vector<4, int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def scattersiv8sf : X86Builtin<"void(void *, unsigned char, _Vector<8, int>, _Vector<8, float>, _Constant int)">;
+  def scattersiv8si : X86Builtin<"void(void *, unsigned char, _Vector<8, int>, _Vector<8, int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // VPERMI2VAR family: two-table permute; the middle index vector selects elements from either data operand (see Intel SDM VPERMI2D/Q/PS/PD/B/W)
+  def vpermi2vard128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpermi2vard256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 512-bit forms need the evex512 feature in addition to avx512f
+  def vpermi2vard512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // pd/ps variants: data operands are FP, the index operand stays integer
+  def vpermi2varpd128 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, long long int>, _Vector<2, double>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpermi2varpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, long long int>, _Vector<4, double>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpermi2varpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, long long int>, _Vector<8, double>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpermi2varps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, int>, _Vector<4, float>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpermi2varps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>, _Vector<8, float>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpermi2varps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpermi2varq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpermi2varq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpermi2varq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>)">;
+}
+
+let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // byte-granularity permute needs AVX512VBMI
+  def vpermi2varqi128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Vector<16, char>)">;
+}
+
+let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpermi2varqi256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Vector<32, char>)">;
+}
+
+let Features = "avx512vbmi,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpermi2varqi512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, _Vector<64, char>)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // word-granularity permute needs AVX512BW
+  def vpermi2varhi128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Vector<8, short>)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpermi2varhi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Vector<16, short>)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpermi2varhi512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>, _Vector<32, short>)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // VPSHLD: concatenate two sources and shift left by an immediate (_Constant int) count (AVX512-VBMI2)
+  def vpshldd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Constant int)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpshldd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Constant int)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpshldd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Constant int)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpshldq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpshldq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpshldq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpshldw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Constant int)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpshldw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Constant int)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpshldw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>, _Constant int)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // VPSHLDV: same concatenate-shift-left, but with per-element variable counts in the third vector operand
+  def vpshldvd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpshldvd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpshldvd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpshldvq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpshldvq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpshldvq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpshldvw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Vector<8, short>)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpshldvw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Vector<16, short>)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpshldvw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>, _Vector<32, short>)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // VPSHRDV: right-shift counterpart of VPSHLDV, per-element variable counts
+  def vpshrdvd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpshrdvd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpshrdvd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpshrdvq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpshrdvq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpshrdvq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpshrdvw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Vector<8, short>)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpshrdvw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Vector<16, short>)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpshrdvw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>, _Vector<32, short>)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // VPSHRD: concatenate-and-shift-right by an immediate count
+  def vpshrdd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Constant int)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpshrdd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Constant int)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpshrdd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Constant int)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpshrdq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpshrdq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpshrdq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpshrdw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Constant int)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpshrdw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Constant int)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpshrdw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>, _Constant int)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // masked word->byte narrowing: 32x short in, 32x char out; second arg is the passthrough, last is the 32-bit lane mask
+  def pmovswb512_mask : X86Builtin<"_Vector<32, char>(_Vector<32, short>, _Vector<32, char>, unsigned int)">;
+  def pmovuswb512_mask : X86Builtin<"_Vector<32, char>(_Vector<32, short>, _Vector<32, char>, unsigned int)">;
+  def pmovwb512_mask : X86Builtin<"_Vector<32, char>(_Vector<32, short>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // AVX512DQ+VL masked FP <-> 64-bit integer conversions; "t" variants truncate, "u" variants are unsigned; signature pattern is (src, passthrough, mask)
+  def cvtpd2qq128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<2, double>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtpd2qq256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, double>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtpd2uqq128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<2, double>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtpd2uqq256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, double>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // ps->qq widens lane size, so the 128-bit form consumes only the low 2 of the 4 floats
+  def cvtps2qq128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<4, float>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtps2qq256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, float>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtps2uqq128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<4, float>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtps2uqq256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, float>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtqq2ps128_mask : X86Builtin<"_Vector<4, float>(_Vector<2, long long int>, _Vector<4, float>, unsigned char)">;
+  def cvttpd2qq128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<2, double>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvttpd2qq256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, double>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvttpd2uqq128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<2, double>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvttpd2uqq256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, double>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvttps2qq128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<4, float>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvttps2qq256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, float>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvttps2uqq128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<4, float>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvttps2uqq256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, float>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // cvtuqq2ps128 shares this let with the VRANGE group; VRANGE's _Constant int is the imm8 selecting the range operation (min/max/abs variants per Intel SDM)
+  def cvtuqq2ps128_mask : X86Builtin<"_Vector<4, float>(_Vector<2, long long int>, _Vector<4, float>, unsigned char)">;
+  def rangepd128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def rangepd256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rangeps128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def rangeps256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // scalar "_round" forms carry a trailing _Constant int; presumably the SAE/rounding control — consistent with the 512-bit group below
+  def rangesd128_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int, _Constant int)">;
+  def rangess128_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int, _Constant int)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // VREDUCE*: the _Constant int imm8 controls the reduction transformation (see Intel SDM VREDUCEPD/PS)
+  def reducepd128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Constant int, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def reducepd256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def reduceps128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def reduceps256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Constant int, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // scalar variants take two trailing _Constant ints (imm8 plus rounding/SAE control)
+  def reducesd_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int, _Constant int)">;
+  def reducess_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int, _Constant int)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // VL word->byte narrowing: result is always a 16-byte vector; the 128-bit source fills only the low half, mask width matches the source element count
+  def pmovswb128_mask : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovswb256_mask : X86Builtin<"_Vector<16, char>(_Vector<16, short>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovuswb128_mask : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovuswb256_mask : X86Builtin<"_Vector<16, char>(_Vector<16, short>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovwb128_mask : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512dq,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 512-bit DQ conversions and range/reduce; unlike the VL forms above, each takes a trailing _Constant int — presumably the embedded rounding/SAE control available only at 512 bits (TODO confirm against clang CodeGen)
+  def cvtpd2qq512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, double>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+  def cvtpd2uqq512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, double>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+  def cvtps2qq512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, float>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+  def cvtps2uqq512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, float>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+  def cvtqq2pd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, long long int>, _Vector<8, double>, unsigned char, _Constant int)">;
+  def cvtqq2ps512_mask : X86Builtin<"_Vector<8, float>(_Vector<8, long long int>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def cvttpd2qq512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, double>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+  def cvttpd2uqq512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, double>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+  def cvttps2qq512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, float>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+  def cvttps2uqq512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, float>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+  def cvtuqq2pd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, long long int>, _Vector<8, double>, unsigned char, _Constant int)">;
+  def cvtuqq2ps512_mask : X86Builtin<"_Vector<8, float>(_Vector<8, long long int>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def rangepd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Constant int, _Vector<8, double>, unsigned char, _Constant int)">;
+  def rangeps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int, _Vector<16, float>, unsigned short, _Constant int)">;
+  def reducepd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Constant int, _Vector<8, double>, unsigned char, _Constant int)">;
+  def reduceps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Constant int, _Vector<16, float>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // rotate family: prol/pror = rotate left/right by immediate, prolv/prorv = per-element variable counts
+  def prold512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Constant int)">;
+  def prolq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def prold128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def prold256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def prolq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def prolq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Constant int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def prolvd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>)">;
+  def prolvq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>)">;
+  def prord512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Constant int)">;
+  def prorq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def prolvd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def prolvd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def prolvq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def prolvq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def prord128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def prord256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def prorq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def prorq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Constant int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def prorvd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>)">;
+  def prorvq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def prorvd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def prorvd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def prorvq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def prorvq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // shift family: *v = per-element variable counts, plain = count in a 128-bit xmm, *i = scalar int count
+  def pshufhw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Constant int)">;
+  def pshuflw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Constant int)">;
+  def psllv32hi : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">;
+  def psllw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<8, short>)">;
+  def psllwi512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, int)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def psllv16hi : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def psllv8hi : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pslldi512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, int)">;
+  def psllqi512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, int)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def psrlv32hi : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def psrlv16hi : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def psrlv8hi : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def psrldi512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, int)">;
+  def psrlqi512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, int)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def psrav32hi : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def psrav16hi : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def psrav8hi : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def psravq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def psravq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // _byteshift variants shift whole 128-bit lanes by an immediate byte count
+  def psraw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<8, short>)">;
+  def psrawi512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, int)">;
+  def psrlw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<8, short>)">;
+  def psrlwi512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, int)">;
+  def pslldqi512_byteshift : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Constant int)">;
+  def psrldqi512_byteshift : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in { // masked aligned load/store; note: no Const attribute anywhere in this group because these access memory through the pointer operand
+  def movdqa32load128_mask : X86Builtin<"_Vector<4, int>(_Vector<4, int const *>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def movdqa32load256_mask : X86Builtin<"_Vector<8, int>(_Vector<8, int const *>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def movdqa32load512_mask : X86Builtin<"_Vector<16, int>(_Vector<16, int const *>, _Vector<16, int>, unsigned short)">;
+  def movdqa32store512_mask : X86Builtin<"void(_Vector<16, int *>, _Vector<16, int>, unsigned short)">;
+  def movdqa64load512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int const *>, _Vector<8, long long int>, unsigned char)">;
+  def movdqa64store512_mask : X86Builtin<"void(_Vector<8, long long int *>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def movdqa32store128_mask : X86Builtin<"void(_Vector<4, int *>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def movdqa32store256_mask : X86Builtin<"void(_Vector<8, int *>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def movdqa64load128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int const *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def movdqa64load256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int const *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def movdqa64store128_mask : X86Builtin<"void(_Vector<2, long long int *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def movdqa64store256_mask : X86Builtin<"void(_Vector<4, long long int *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512ifma,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // IFMA: 52-bit integer fused multiply-add (high/low halves of the product)
+  def vpmadd52huq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>)">;
+  def vpmadd52luq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>)">;
+}
+
+let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // '|' separates alternative feature sets: either avx512ifma+avx512vl or avxifma enables the builtin
+  def vpmadd52huq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpmadd52huq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpmadd52luq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpmadd52luq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+// Scalar compare builtins (vcomisd/vcomiss): two immediate operands — per the
+// corresponding intrinsics these are the comparison predicate and the rounding
+// control. The kunpck builtins combine two integer mask values and operate on
+// plain integers, so they carry no RequiredVectorWidth.
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcomisd : X86Builtin<"int(_Vector<2, double>, _Vector<2, double>, _Constant int, _Constant int)">;
+  def vcomiss : X86Builtin<"int(_Vector<4, float>, _Vector<4, float>, _Constant int, _Constant int)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const] in {
+  def kunpckdi : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">;
+  def kunpcksi : X86Builtin<"unsigned int(unsigned int, unsigned int)">;
+}
+
+// 512-bit masked unaligned loads of word/byte vectors (avx512bw), plus the
+// fixupimm family. Packed 512-bit fixupimm has _mask (merge) and _maskz
+// (zeroing) variants with a trailing _Constant int rounding operand; the
+// scalar fixupimm/getexp/getmant forms below likewise end in a rounding
+// immediate. Mask integer width always matches the element count.
+let Features = "avx512bw,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def loaddquhi512_mask : X86Builtin<"_Vector<32, short>(_Vector<32, short const *>, _Vector<32, short>, unsigned int)">;
+  def loaddquqi512_mask : X86Builtin<"_Vector<64, char>(_Vector<64, char const *>, _Vector<64, char>, unsigned long long int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def fixupimmpd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Vector<8, long long int>, _Constant int, unsigned char, _Constant int)">;
+  def fixupimmpd512_maskz : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Vector<8, long long int>, _Constant int, unsigned char, _Constant int)">;
+  def fixupimmps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, int>, _Constant int, unsigned short, _Constant int)">;
+  def fixupimmps512_maskz : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, int>, _Constant int, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def fixupimmsd_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, long long int>, _Constant int, unsigned char, _Constant int)">;
+  def fixupimmsd_maskz : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, long long int>, _Constant int, unsigned char, _Constant int)">;
+  def fixupimmss_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, int>, _Constant int, unsigned char, _Constant int)">;
+  def fixupimmss_maskz : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, int>, _Constant int, unsigned char, _Constant int)">;
+  def getexpsd128_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int)">;
+  def getexpss128_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def getmantsd_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int, _Vector<2, double>, unsigned char, _Constant int)">;
+  def getmantss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int, _Vector<4, float>, unsigned char, _Constant int)">;
+}
+
+// 128/256-bit (VL) masked unaligned loads of word/byte vectors, and the VL
+// packed fixupimm variants. Note the VL fixupimm forms have no trailing
+// rounding immediate, unlike their 512-bit counterparts above.
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def loaddquhi128_mask : X86Builtin<"_Vector<8, short>(_Vector<8, short const *>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def loaddquhi256_mask : X86Builtin<"_Vector<16, short>(_Vector<16, short const *>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def loaddquqi128_mask : X86Builtin<"_Vector<16, char>(_Vector<16, char const *>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def loaddquqi256_mask : X86Builtin<"_Vector<32, char>(_Vector<32, char const *>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def fixupimmpd128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, long long int>, _Constant int, unsigned char)">;
+  def fixupimmpd128_maskz : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, long long int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def fixupimmpd256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, long long int>, _Constant int, unsigned char)">;
+  def fixupimmpd256_maskz : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, long long int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def fixupimmps128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, int>, _Constant int, unsigned char)">;
+  def fixupimmps128_maskz : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def fixupimmps256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, int>, _Constant int, unsigned char)">;
+  def fixupimmps256_maskz : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, int>, _Constant int, unsigned char)">;
+}
+
+// Masked floating-point / integer load builtins: loadapd/loadaps ("a" in the
+// mnemonic = aligned form), loadsd/loadss scalar merge loads (avx512f, not VL),
+// and the unaligned loaddqu*/loadupd/loadups forms for 128/256-bit vectors.
+// All follow the (const source pointer, vector, 8-bit mask) shape.
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def loadapd128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double const *>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def loadsd128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double const *>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def loadapd256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double const *>, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def loadaps128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float const *>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def loadss128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float const *>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def loadaps256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float const *>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def loaddqudi128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int const *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def loaddqudi256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int const *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def loaddqusi128_mask : X86Builtin<"_Vector<4, int>(_Vector<4, int const *>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def loaddqusi256_mask : X86Builtin<"_Vector<8, int>(_Vector<8, int const *>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def loadupd128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double const *>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def loadupd256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double const *>, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def loadups128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float const *>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def loadups256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float const *>, _Vector<8, float>, unsigned char)">;
+}
+
+// Masked store builtins mirroring the load group above: storedqu* unaligned
+// word/byte/int stores, storeapd/storeaps aligned forms, storesd/storess
+// scalar forms (avx512f), and storeupd/storeups unaligned forms. All return
+// void and take (destination pointer, vector, mask); mask integer width
+// matches the element count (e.g. unsigned long long for 64 bytes).
+let Features = "avx512bw,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def storedquhi512_mask : X86Builtin<"void(_Vector<32, short *>, _Vector<32, short>, unsigned int)">;
+  def storedquqi512_mask : X86Builtin<"void(_Vector<64, char *>, _Vector<64, char>, unsigned long long int)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def storedquhi128_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def storedquhi256_mask : X86Builtin<"void(_Vector<16, short *>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def storedquqi128_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def storedquqi256_mask : X86Builtin<"void(_Vector<32, char *>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def storeapd128_mask : X86Builtin<"void(_Vector<2, double *>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def storesd128_mask : X86Builtin<"void(_Vector<2, double *>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def storeapd256_mask : X86Builtin<"void(_Vector<4, double *>, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def storeaps128_mask : X86Builtin<"void(_Vector<4, float *>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def storess128_mask : X86Builtin<"void(_Vector<4, float *>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def storeaps256_mask : X86Builtin<"void(_Vector<8, float *>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def storedqudi128_mask : X86Builtin<"void(_Vector<2, long long int *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def storedqudi256_mask : X86Builtin<"void(_Vector<4, long long int *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def storedqusi128_mask : X86Builtin<"void(_Vector<4, int *>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def storedqusi256_mask : X86Builtin<"void(_Vector<8, int *>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def storeupd128_mask : X86Builtin<"void(_Vector<2, double *>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def storeupd256_mask : X86Builtin<"void(_Vector<4, double *>, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def storeups128_mask : X86Builtin<"void(_Vector<4, float *>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def storeups256_mask : X86Builtin<"void(_Vector<8, float *>, _Vector<8, float>, unsigned char)">;
+}
+
+// rcp14 masked reciprocal-approximation builtins (VL), vplzcnt leading-zero
+// count builtins (avx512cd+avx512vl), and the scalar convert family
+// (vcvt[t]s{d,s}2[u]si32) whose trailing _Constant int is the rounding
+// control immediate of the corresponding intrinsics.
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rcp14pd128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def rcp14pd256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rcp14ps128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def rcp14ps256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vplzcntd_128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>)">;
+}
+
+let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vplzcntd_256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>)">;
+}
+
+let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vplzcntq_128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>)">;
+}
+
+let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vplzcntq_256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtsd2si32 : X86Builtin<"int(_Vector<2, double>, _Constant int)">;
+  def vcvtsd2usi32 : X86Builtin<"unsigned int(_Vector<2, double>, _Constant int)">;
+  def vcvtss2si32 : X86Builtin<"int(_Vector<4, float>, _Constant int)">;
+  def vcvtss2usi32 : X86Builtin<"unsigned int(_Vector<4, float>, _Constant int)">;
+  def vcvttsd2si32 : X86Builtin<"int(_Vector<2, double>, _Constant int)">;
+  def vcvttsd2usi32 : X86Builtin<"unsigned int(_Vector<2, double>, _Constant int)">;
+  def vcvttss2si32 : X86Builtin<"int(_Vector<4, float>, _Constant int)">;
+  def vcvttss2usi32 : X86Builtin<"unsigned int(_Vector<4, float>, _Constant int)">;
+}
+
+// 512-bit vpermil (immediate and variable-control permutes), scalar rndscale
+// and scalef with rounding immediates, and the arithmetic right-shift family.
+// Note: the immediate-count shifts (psradi512/psraqi*) take a plain int count,
+// while the vector-count psraq128/psraq256 take a 128-bit count vector — the
+// 256-bit form deliberately taking _Vector<2, long long int>, matching the
+// hardware shift-count-from-xmm convention.
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpermilpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Constant int)">;
+  def vpermilps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Constant int)">;
+  def vpermilvarpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, long long int>)">;
+  def vpermilvarps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, int>)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rndscalesd_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int, _Constant int)">;
+  def rndscaless_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int, _Constant int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def scalefpd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Vector<8, double>, unsigned char, _Constant int)">;
+  def scalefps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def scalefsd_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int)">;
+  def scalefss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def psradi512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, int)">;
+  def psraqi512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def psraq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def psraq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def psraqi128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def psraqi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, int)">;
+}
+
+// 512-bit shift builtins — the psll/psra/psrl forms take a 128-bit count
+// vector, the psllv/psrav/psrlv variable forms take a full-width per-element
+// count — plus pternlog (three source vectors, an immediate, and a write
+// mask; _mask merges, _maskz zeroes) and the shuf_* / shufpd / shufps
+// immediate-controlled shuffles for 512- and 256-bit vectors.
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pslld512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<4, int>)">;
+  def psllq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<2, long long int>)">;
+  def psllv16si : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>)">;
+  def psllv8di : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>)">;
+  def psrad512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<4, int>)">;
+  def psraq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<2, long long int>)">;
+  def psrav16si : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>)">;
+  def psrav8di : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>)">;
+  def psrld512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<4, int>)">;
+  def psrlq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<2, long long int>)">;
+  def psrlv16si : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>)">;
+  def psrlv8di : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>)">;
+  def pternlogd512_mask : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>, _Constant int, unsigned short)">;
+  def pternlogd512_maskz : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>, _Constant int, unsigned short)">;
+  def pternlogq512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>, _Constant int, unsigned char)">;
+  def pternlogq512_maskz : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pternlogd128_mask : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>, _Constant int, unsigned char)">;
+  def pternlogd128_maskz : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pternlogd256_mask : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>, _Constant int, unsigned char)">;
+  def pternlogd256_maskz : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pternlogq128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>, _Constant int, unsigned char)">;
+  def pternlogq128_maskz : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pternlogq256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>, _Constant int, unsigned char)">;
+  def pternlogq256_maskz : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def shuf_f32x4 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int)">;
+  def shuf_f64x2 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Constant int)">;
+  def shuf_i32x4 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Constant int)">;
+  def shuf_i64x2 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Constant int)">;
+  def shufpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Constant int)">;
+  def shufps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def shuf_f32x4_256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
+  def shuf_f64x2_256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
+  def shuf_i32x4_256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Constant int)">;
+  def shuf_i64x2_256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int)">;
+}
+
+// Scalar masked sqrt with a rounding immediate, and the rsqrt14 masked
+// reciprocal-square-root-approximation builtins for 128/256-bit vectors,
+// mirroring the rcp14 group above.
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def sqrtsd_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int)">;
+  def sqrtss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rsqrt14pd128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def rsqrt14pd256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rsqrt14ps128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def rsqrt14ps256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, unsigned char)">;
+}
+
+// Conversions between vectors and integer mask values: cvt{b,d,q}2mask*
+// produce an integer mask from a vector, cvtmask2{b,w,d,q}* expand an integer
+// mask back into a vector. The integer type's bit width equals the vector's
+// element count in every case (e.g. 64-element char vector <-> unsigned long
+// long). Byte/word forms need avx512bw, dword/qword forms need avx512dq.
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def cvtb2mask512 : X86Builtin<"unsigned long long int(_Vector<64, char>)">;
+  def cvtmask2b512 : X86Builtin<"_Vector<64, char>(unsigned long long int)">;
+  def cvtmask2w512 : X86Builtin<"_Vector<32, short>(unsigned int)">;
+}
+
+let Features = "avx512dq,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def cvtd2mask512 : X86Builtin<"unsigned short(_Vector<16, int>)">;
+  def cvtmask2d512 : X86Builtin<"_Vector<16, int>(unsigned short)">;
+  def cvtmask2q512 : X86Builtin<"_Vector<8, long long int>(unsigned char)">;
+  def cvtq2mask512 : X86Builtin<"unsigned char(_Vector<8, long long int>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtb2mask128 : X86Builtin<"unsigned short(_Vector<16, char>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtb2mask256 : X86Builtin<"unsigned int(_Vector<32, char>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtmask2b128 : X86Builtin<"_Vector<16, char>(unsigned short)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtmask2b256 : X86Builtin<"_Vector<32, char>(unsigned int)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtmask2w128 : X86Builtin<"_Vector<8, short>(unsigned char)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtmask2w256 : X86Builtin<"_Vector<16, short>(unsigned short)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtd2mask128 : X86Builtin<"unsigned char(_Vector<4, int>)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtd2mask256 : X86Builtin<"unsigned char(_Vector<8, int>)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtmask2d128 : X86Builtin<"_Vector<4, int>(unsigned char)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtmask2d256 : X86Builtin<"_Vector<8, int>(unsigned char)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtmask2q128 : X86Builtin<"_Vector<2, long long int>(unsigned char)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtmask2q256 : X86Builtin<"_Vector<4, long long int>(unsigned char)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtq2mask128 : X86Builtin<"unsigned char(_Vector<2, long long int>)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtq2mask256 : X86Builtin<"unsigned char(_Vector<4, long long int>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovsdb512_mask : X86Builtin<"_Vector<16, char>(_Vector<16, int>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovsdb512mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<16, int>, unsigned short)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovswb512mem_mask : X86Builtin<"void(_Vector<32, char *>, _Vector<32, short>, unsigned int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovsdw512_mask : X86Builtin<"_Vector<16, short>(_Vector<16, int>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovsdw512mem_mask : X86Builtin<"void(_Vector<16, short *>, _Vector<16, int>, unsigned short)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovsqb512_mask : X86Builtin<"_Vector<16, char>(_Vector<8, long long int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovsqb512mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovsqd512_mask : X86Builtin<"_Vector<8, int>(_Vector<8, long long int>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovsqd512mem_mask : X86Builtin<"void(_Vector<8, int *>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovsqw512_mask : X86Builtin<"_Vector<8, short>(_Vector<8, long long int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovsqw512mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovsdb128_mask : X86Builtin<"_Vector<16, char>(_Vector<4, int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovsdb128mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovswb128mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovsdb256_mask : X86Builtin<"_Vector<16, char>(_Vector<8, int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovsdb256mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<8, int>, unsigned char)">;
+}
+
+// 128-/256-bit (AVX512VL) narrowing builtins, pmovs* family — presumably the
+// VPMOVS* signed-saturating down-conversions; TODO confirm instruction mapping.
+// Pattern throughout: the register forms return the narrowed vector and are
+// Const; the "mem" forms take a pointer as the first argument, return void,
+// and store the narrowed result, so they are NoThrow but not Const.
+// The final integer argument is the write-mask (8 or 16 bits per the type).
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovswb256mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovsdw128_mask : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovsdw128mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovsdw256_mask : X86Builtin<"_Vector<8, short>(_Vector<8, int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovsdw256mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovsqb128_mask : X86Builtin<"_Vector<16, char>(_Vector<2, long long int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovsqb128mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovsqb256_mask : X86Builtin<"_Vector<16, char>(_Vector<4, long long int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovsqb256mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovsqd128_mask : X86Builtin<"_Vector<4, int>(_Vector<2, long long int>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovsqd128mem_mask : X86Builtin<"void(_Vector<4, int *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovsqd256_mask : X86Builtin<"_Vector<4, int>(_Vector<4, long long int>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovsqd256mem_mask : X86Builtin<"void(_Vector<4, int *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovsqw128_mask : X86Builtin<"_Vector<8, short>(_Vector<2, long long int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovsqw128mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovsqw256_mask : X86Builtin<"_Vector<8, short>(_Vector<4, long long int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovsqw256mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+// 512-bit narrowing builtins, pmovus* family — presumably the VPMOVUS*
+// unsigned-saturating down-conversions; TODO confirm instruction mapping.
+// Register forms (Const) return the narrowed vector with a passthru operand;
+// "mem" forms store through the pointer argument and return void. The trailing
+// integer is the write-mask, sized to the element count (uchar/ushort/uint).
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovusdb512_mask : X86Builtin<"_Vector<16, char>(_Vector<16, int>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovusdb512mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<16, int>, unsigned short)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovuswb512mem_mask : X86Builtin<"void(_Vector<32, char *>, _Vector<32, short>, unsigned int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovusdw512_mask : X86Builtin<"_Vector<16, short>(_Vector<16, int>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovusdw512mem_mask : X86Builtin<"void(_Vector<16, short *>, _Vector<16, int>, unsigned short)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovusqb512_mask : X86Builtin<"_Vector<16, char>(_Vector<8, long long int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovusqb512mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovusqd512_mask : X86Builtin<"_Vector<8, int>(_Vector<8, long long int>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovusqd512mem_mask : X86Builtin<"void(_Vector<8, int *>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovusqw512_mask : X86Builtin<"_Vector<8, short>(_Vector<8, long long int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovusqw512mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<8, long long int>, unsigned char)">;
+}
+
+// 128-/256-bit (AVX512VL) pmovus* narrowing builtins — same register/mem
+// pairing as the 512-bit group above. Byte-element forms additionally require
+// avx512bw. Narrow results are zero-padded into a full 128-bit vector type
+// (e.g. _Vector<16, char> for a 4-element result) — presumably matching the
+// instruction's xmm destination; TODO confirm.
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovusdb128_mask : X86Builtin<"_Vector<16, char>(_Vector<4, int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovusdb128mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovuswb128mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovusdb256_mask : X86Builtin<"_Vector<16, char>(_Vector<8, int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovusdb256mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovuswb256mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovusdw128_mask : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovusdw128mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovusdw256_mask : X86Builtin<"_Vector<8, short>(_Vector<8, int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovusdw256mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovusqb128_mask : X86Builtin<"_Vector<16, char>(_Vector<2, long long int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovusqb128mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovusqb256_mask : X86Builtin<"_Vector<16, char>(_Vector<4, long long int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovusqb256mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovusqd128_mask : X86Builtin<"_Vector<4, int>(_Vector<2, long long int>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovusqd128mem_mask : X86Builtin<"void(_Vector<4, int *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovusqd256_mask : X86Builtin<"_Vector<4, int>(_Vector<4, long long int>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovusqd256mem_mask : X86Builtin<"void(_Vector<4, int *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovusqw128_mask : X86Builtin<"_Vector<8, short>(_Vector<2, long long int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovusqw128mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovusqw256_mask : X86Builtin<"_Vector<8, short>(_Vector<4, long long int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovusqw256mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+// 512-bit plain (non-saturating) pmov* narrowing builtins — presumably the
+// VPMOV* truncating down-conversions; TODO confirm instruction mapping.
+// Same register/mem split as the saturating families above.
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovdb512_mask : X86Builtin<"_Vector<16, char>(_Vector<16, int>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovdb512mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<16, int>, unsigned short)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovwb512mem_mask : X86Builtin<"void(_Vector<32, char *>, _Vector<32, short>, unsigned int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovdw512_mask : X86Builtin<"_Vector<16, short>(_Vector<16, int>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovdw512mem_mask : X86Builtin<"void(_Vector<16, short *>, _Vector<16, int>, unsigned short)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovqb512_mask : X86Builtin<"_Vector<16, char>(_Vector<8, long long int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovqb512mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovqd512_mask : X86Builtin<"_Vector<8, int>(_Vector<8, long long int>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovqd512mem_mask : X86Builtin<"void(_Vector<8, int *>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovqw512_mask : X86Builtin<"_Vector<8, short>(_Vector<8, long long int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovqw512mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<8, long long int>, unsigned char)">;
+}
+
+// 128-/256-bit (AVX512VL) plain pmov* truncation builtins.
+// NOTE(review): unlike the other element combinations, only the "mem" form of
+// pmovqd256 is declared in this group (no pmovqd256_mask register form) —
+// presumably the register form is expressible without a builtin; confirm
+// against the intrinsic headers.
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovdb128_mask : X86Builtin<"_Vector<16, char>(_Vector<4, int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovwb128mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovdb128mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovdb256_mask : X86Builtin<"_Vector<16, char>(_Vector<8, int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovdb256mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovwb256mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovdw128_mask : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovdw128mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovdw256_mask : X86Builtin<"_Vector<8, short>(_Vector<8, int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovdw256mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovqb128_mask : X86Builtin<"_Vector<16, char>(_Vector<2, long long int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovqb128mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovqb256_mask : X86Builtin<"_Vector<16, char>(_Vector<4, long long int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovqb256mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovqd128_mask : X86Builtin<"_Vector<4, int>(_Vector<2, long long int>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovqd128mem_mask : X86Builtin<"void(_Vector<4, int *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovqd256mem_mask : X86Builtin<"void(_Vector<4, int *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovqw128_mask : X86Builtin<"_Vector<8, short>(_Vector<2, long long int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovqw128mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovqw256_mask : X86Builtin<"_Vector<8, short>(_Vector<4, long long int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovqw256mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+// Subvector extract/insert builtins. The _Constant int argument is the
+// immediate selecting which lane of the wider vector is read or written.
+// Masked extract forms carry a passthru vector plus a write-mask; the insert
+// forms are unmasked. Feature gating follows the element/width combination
+// (avx512dq for the x8/x2 float/int64 forms, avx512f for x4/x4 forms,
+// avx512vl for the 256-bit variants).
+let Features = "avx512dq,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def extractf32x8_mask : X86Builtin<"_Vector<8, float>(_Vector<16, float>, _Constant int, _Vector<8, float>, unsigned char)">;
+  def extractf64x2_512_mask : X86Builtin<"_Vector<2, double>(_Vector<8, double>, _Constant int, _Vector<2, double>, unsigned char)">;
+  def extracti32x8_mask : X86Builtin<"_Vector<8, int>(_Vector<16, int>, _Constant int, _Vector<8, int>, unsigned char)">;
+  def extracti64x2_512_mask : X86Builtin<"_Vector<2, long long int>(_Vector<8, long long int>, _Constant int, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def extracti32x4_mask : X86Builtin<"_Vector<4, int>(_Vector<16, int>, _Constant int, _Vector<4, int>, unsigned char)">;
+  def extracti64x4_mask : X86Builtin<"_Vector<4, long long int>(_Vector<8, long long int>, _Constant int, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def extractf64x2_256_mask : X86Builtin<"_Vector<2, double>(_Vector<4, double>, _Constant int, _Vector<2, double>, unsigned char)">;
+  def extracti64x2_256_mask : X86Builtin<"_Vector<2, long long int>(_Vector<4, long long int>, _Constant int, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def extractf32x4_256_mask : X86Builtin<"_Vector<4, float>(_Vector<8, float>, _Constant int, _Vector<4, float>, unsigned char)">;
+  def extracti32x4_256_mask : X86Builtin<"_Vector<4, int>(_Vector<8, int>, _Constant int, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512dq,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def insertf32x8 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<8, float>, _Constant int)">;
+  def insertf64x2_512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<2, double>, _Constant int)">;
+  def inserti32x8 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<8, int>, _Constant int)">;
+  def inserti64x2_512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<2, long long int>, _Constant int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def insertf64x4 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<4, double>, _Constant int)">;
+  def inserti64x4 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<4, long long int>, _Constant int)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def insertf64x2_256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<2, double>, _Constant int)">;
+  def inserti64x2_256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def insertf32x4_256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<4, float>, _Constant int)">;
+  def inserti32x4_256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>, _Constant int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def insertf32x4 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<4, float>, _Constant int)">;
+  def inserti32x4 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<4, int>, _Constant int)">;
+}
+
+// getmant/getexp builtins. All are masked (passthru + write-mask). The
+// 512-bit forms take an extra trailing _Constant int — presumably the
+// rounding/SAE immediate; TODO confirm against the intrinsic headers.
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def getmantpd128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Constant int, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def getmantpd256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def getmantps128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def getmantps256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Constant int, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def getmantpd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Constant int, _Vector<8, double>, unsigned char, _Constant int)">;
+  def getmantps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Constant int, _Vector<16, float>, unsigned short, _Constant int)">;
+  def getexppd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, unsigned char, _Constant int)">;
+  def getexpps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+}
+
+// Masked scalar FMA builtins (ss = float element, sd = double element), each
+// in _mask / _maskz / _mask3 flavors — presumably selecting which operand
+// supplies the passthru (merge vs zero vs operand-3 masking); TODO confirm.
+// The final _Constant int is presumably the rounding immediate.
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vfmaddss3_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vfmaddss3_maskz : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vfmaddss3_mask3 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vfmaddsd3_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int)">;
+  def vfmaddsd3_maskz : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int)">;
+  def vfmaddsd3_mask3 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int)">;
+  def vfmsubsd3_mask3 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int)">;
+  def vfmsubss3_mask3 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+}
+
+// Permute builtins: permd*/permdi* take an immediate control (_Constant int);
+// permvar* take a vector of per-element indices. Element-size variants are
+// gated on the matching feature (avx512bw for 16-bit, avx512vbmi for 8-bit).
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def permdf512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Constant int)">;
+  def permdi512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Constant int)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def permvarhi512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def permvardf512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, long long int>)">;
+  def permvardi512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>)">;
+  def permvarsf512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, int>)">;
+  def permvarsi512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>)">;
+}
+
+let Features = "avx512vbmi,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def permvarqi512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
+}
+
+let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def permvarqi128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
+}
+
+let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def permvarqi256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def permvarhi128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def permvarhi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def permvardf256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, long long int>)">;
+  def permvardi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+// fpclass builtins: classify FP elements against the _Constant int category
+// immediate and return the result as mask bits (unsigned char/short, one bit
+// per element, ANDed with the incoming mask argument — presumably; confirm).
+// The scalar sd/ss forms operate on the low element only.
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def fpclasspd128_mask : X86Builtin<"unsigned char(_Vector<2, double>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def fpclasspd256_mask : X86Builtin<"unsigned char(_Vector<4, double>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def fpclassps128_mask : X86Builtin<"unsigned char(_Vector<4, float>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def fpclassps256_mask : X86Builtin<"unsigned char(_Vector<8, float>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512dq,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def fpclassps512_mask : X86Builtin<"unsigned short(_Vector<16, float>, _Constant int, unsigned short)">;
+  def fpclasspd512_mask : X86Builtin<"unsigned char(_Vector<8, double>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def fpclasssd_mask : X86Builtin<"unsigned char(_Vector<2, double>, _Constant int, unsigned char)">;
+  def fpclassss_mask : X86Builtin<"unsigned char(_Vector<4, float>, _Constant int, unsigned char)">;
+}
+
+// Opmask (k-register) builtins. Mask width is encoded in the suffix and the
+// scalar types used: qi = 8-bit (unsigned char), hi = 16-bit (unsigned short),
+// si = 32-bit (unsigned int), di = 64-bit (unsigned long long). Feature gating
+// follows the ISA: avx512dq for 8-bit masks, avx512f for 16-bit, avx512bw for
+// 32-/64-bit. None of these require a vector width (no RequiredVectorWidth).
+let Features = "avx512dq", Attributes = [NoThrow, Const] in {
+  def kaddqi : X86Builtin<"unsigned char(unsigned char, unsigned char)">;
+  def kaddhi : X86Builtin<"unsigned short(unsigned short, unsigned short)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const] in {
+  def kaddsi : X86Builtin<"unsigned int(unsigned int, unsigned int)">;
+  def kadddi : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const] in {
+  def kandqi : X86Builtin<"unsigned char(unsigned char, unsigned char)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const] in {
+  def kandhi : X86Builtin<"unsigned short(unsigned short, unsigned short)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const] in {
+  def kandsi : X86Builtin<"unsigned int(unsigned int, unsigned int)">;
+  def kanddi : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const] in {
+  def kandnqi : X86Builtin<"unsigned char(unsigned char, unsigned char)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const] in {
+  def kandnhi : X86Builtin<"unsigned short(unsigned short, unsigned short)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const] in {
+  def kandnsi : X86Builtin<"unsigned int(unsigned int, unsigned int)">;
+  def kandndi : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const] in {
+  def korqi : X86Builtin<"unsigned char(unsigned char, unsigned char)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const] in {
+  def korhi : X86Builtin<"unsigned short(unsigned short, unsigned short)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const] in {
+  def korsi : X86Builtin<"unsigned int(unsigned int, unsigned int)">;
+  def kordi : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">;
+}
+
+// kortest*/ktest* return an int flag result (carry/zero tests) rather than a
+// mask value.
+let Features = "avx512dq", Attributes = [NoThrow, Const] in {
+  def kortestcqi : X86Builtin<"int(unsigned char, unsigned char)">;
+  def kortestzqi : X86Builtin<"int(unsigned char, unsigned char)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const] in {
+  def kortestchi : X86Builtin<"int(unsigned short, unsigned short)">;
+  def kortestzhi : X86Builtin<"int(unsigned short, unsigned short)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const] in {
+  def kortestcsi : X86Builtin<"int(unsigned int, unsigned int)">;
+  def kortestzsi : X86Builtin<"int(unsigned int, unsigned int)">;
+  def kortestcdi : X86Builtin<"int(unsigned long long int, unsigned long long int)">;
+  def kortestzdi : X86Builtin<"int(unsigned long long int, unsigned long long int)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const] in {
+  def ktestcqi : X86Builtin<"int(unsigned char, unsigned char)">;
+  def ktestzqi : X86Builtin<"int(unsigned char, unsigned char)">;
+  def ktestchi : X86Builtin<"int(unsigned short, unsigned short)">;
+  def ktestzhi : X86Builtin<"int(unsigned short, unsigned short)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const] in {
+  def ktestcsi : X86Builtin<"int(unsigned int, unsigned int)">;
+  def ktestzsi : X86Builtin<"int(unsigned int, unsigned int)">;
+  def ktestcdi : X86Builtin<"int(unsigned long long int, unsigned long long int)">;
+  def ktestzdi : X86Builtin<"int(unsigned long long int, unsigned long long int)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const] in {
+  def kunpckhi : X86Builtin<"unsigned short(unsigned short, unsigned short)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const] in {
+  def kxnorqi : X86Builtin<"unsigned char(unsigned char, unsigned char)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const] in {
+  def kxnorhi : X86Builtin<"unsigned short(unsigned short, unsigned short)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const] in {
+  def kxnorsi : X86Builtin<"unsigned int(unsigned int, unsigned int)">;
+  def kxnordi : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const] in {
+  def kxorqi : X86Builtin<"unsigned char(unsigned char, unsigned char)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const] in {
+  def kxorhi : X86Builtin<"unsigned short(unsigned short, unsigned short)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const] in {
+  def kxorsi : X86Builtin<"unsigned int(unsigned int, unsigned int)">;
+  def kxordi : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">;
+}
+
+// kshift* take an immediate shift amount (_Constant unsigned int).
+let Features = "avx512dq", Attributes = [NoThrow, Const] in {
+  def kshiftliqi : X86Builtin<"unsigned char(unsigned char, _Constant unsigned int)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const] in {
+  def kshiftlihi : X86Builtin<"unsigned short(unsigned short, _Constant unsigned int)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const] in {
+  def kshiftlisi : X86Builtin<"unsigned int(unsigned int, _Constant unsigned int)">;
+  def kshiftlidi : X86Builtin<"unsigned long long int(unsigned long long int, _Constant unsigned int)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const] in {
+  def kshiftriqi : X86Builtin<"unsigned char(unsigned char, _Constant unsigned int)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const] in {
+  def kshiftrihi : X86Builtin<"unsigned short(unsigned short, _Constant unsigned int)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const] in {
+  def kshiftrisi : X86Builtin<"unsigned int(unsigned int, _Constant unsigned int)">;
+  def kshiftridi : X86Builtin<"unsigned long long int(unsigned long long int, _Constant unsigned int)">;
+}
+
+// kmov* move a mask value to/from a GPR-sized scalar.
+let Features = "avx512dq", Attributes = [NoThrow, Const] in {
+  def kmovb : X86Builtin<"unsigned char(unsigned char)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const] in {
+  def kmovw : X86Builtin<"unsigned short(unsigned short)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const] in {
+  def kmovd : X86Builtin<"unsigned int(unsigned int)">;
+  def kmovq : X86Builtin<"unsigned long long int(unsigned long long int)">;
+}
+
+// Byte-alignment and byte-difference builtins: palignr512 takes an immediate
+// shift count; dbpsadbw* take an immediate byte-selection control; psadbw512
+// produces per-qword sums of absolute byte differences (no immediate).
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def palignr512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, _Constant int)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def dbpsadbw128 : X86Builtin<"_Vector<8, short>(_Vector<16, char>, _Vector<16, char>, _Constant int)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def dbpsadbw256 : X86Builtin<"_Vector<16, short>(_Vector<32, char>, _Vector<32, char>, _Constant int)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def dbpsadbw512 : X86Builtin<"_Vector<32, short>(_Vector<64, char>, _Vector<64, char>, _Constant int)">;
+  def psadbw512 : X86Builtin<"_Vector<8, long long int>(_Vector<64, char>, _Vector<64, char>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def compressdf512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, unsigned char)">;
+  def compressdi512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def compresshi512_mask : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>, unsigned int)">;
+  def compressqi512_mask : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, unsigned long long int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def compresssf512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, unsigned short)">;
+  def compresssi512_mask : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, unsigned short)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cmpsd_mask : X86Builtin<"unsigned char(_Vector<2, double>, _Vector<2, double>, _Constant int, unsigned char, _Constant int)">;
+  def cmpss_mask : X86Builtin<"unsigned char(_Vector<4, float>, _Vector<4, float>, _Constant int, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pshufd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Constant int)">;
+  def expanddf512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, unsigned char)">;
+  def expanddi512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def expandhi512_mask : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>, unsigned int)">;
+  def expandqi512_mask : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, unsigned long long int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def expandloaddf512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double const *>, _Vector<8, double>, unsigned char)">;
+  def expandloaddi512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int const *>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def expandloadhi512_mask : X86Builtin<"_Vector<32, short>(_Vector<32, short const *>, _Vector<32, short>, unsigned int)">;
+  def expandloadqi512_mask : X86Builtin<"_Vector<64, char>(_Vector<64, char const *>, _Vector<64, char>, unsigned long long int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def expandloadsf512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float const *>, _Vector<16, float>, unsigned short)">;
+  def expandloadsi512_mask : X86Builtin<"_Vector<16, int>(_Vector<16, int const *>, _Vector<16, int>, unsigned short)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def expandsf512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, unsigned short)">;
+  def expandsi512_mask : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, unsigned short)">;
+  def cvtps2pd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, float>, _Vector<8, double>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def compressstoredf512_mask : X86Builtin<"void(_Vector<8, double *>, _Vector<8, double>, unsigned char)">;
+  def compressstoredi512_mask : X86Builtin<"void(_Vector<8, long long int *>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def compressstorehi512_mask : X86Builtin<"void(_Vector<32, short *>, _Vector<32, short>, unsigned int)">;
+  def compressstoreqi512_mask : X86Builtin<"void(_Vector<64, char *>, _Vector<64, char>, unsigned long long int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def compressstoresf512_mask : X86Builtin<"void(_Vector<16, float *>, _Vector<16, float>, unsigned short)">;
+  def compressstoresi512_mask : X86Builtin<"void(_Vector<16, int *>, _Vector<16, int>, unsigned short)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtph2ps_mask : X86Builtin<"_Vector<4, float>(_Vector<8, short>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtph2ps256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, short>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtps2ph_mask : X86Builtin<"_Vector<8, short>(_Vector<4, float>, _Constant int, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtps2ph256_mask : X86Builtin<"_Vector<8, short>(_Vector<8, float>, _Constant int, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def cvtw2mask512 : X86Builtin<"unsigned int(_Vector<32, short>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtw2mask128 : X86Builtin<"unsigned char(_Vector<8, short>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtw2mask256 : X86Builtin<"unsigned short(_Vector<16, short>)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtsd2ss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<2, double>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def cvtsi2ss32 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, int, _Constant int)">;
+  def cvtss2sd_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<4, float>, _Vector<2, double>, unsigned char, _Constant int)">;
+  def cvtusi2ss32 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512vbmi,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpmultishiftqb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
+}
+
+let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpmultishiftqb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
+}
+
+let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpmultishiftqb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;
+}
+
+let Features = "avx512bf16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtne2ps2bf16_128 : X86Builtin<"_Vector<8, __bf16>(_Vector<4, float>, _Vector<4, float>)">;
+}
+
+let Features = "avx512bf16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtne2ps2bf16_256 : X86Builtin<"_Vector<16, __bf16>(_Vector<8, float>, _Vector<8, float>)">;
+}
+
+let Features = "avx512bf16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def cvtne2ps2bf16_512 : X86Builtin<"_Vector<32, __bf16>(_Vector<16, float>, _Vector<16, float>)">;
+}
+
+let Features = "avx512bf16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtneps2bf16_128_mask : X86Builtin<"_Vector<8, __bf16>(_Vector<4, float>, _Vector<8, __bf16>, unsigned char)">;
+}
+
+let Features = "avx512bf16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtneps2bf16_256_mask : X86Builtin<"_Vector<8, __bf16>(_Vector<8, float>, _Vector<8, __bf16>, unsigned char)">;
+}
+
+let Features = "avx512bf16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def cvtneps2bf16_512_mask : X86Builtin<"_Vector<16, __bf16>(_Vector<16, float>, _Vector<16, __bf16>, unsigned short)">;
+}
+
+let Features = "avx512bf16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def dpbf16ps_128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<8, __bf16>, _Vector<8, __bf16>)">;
+}
+
+let Features = "avx512bf16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def dpbf16ps_256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<16, __bf16>, _Vector<16, __bf16>)">;
+}
+
+let Features = "avx512bf16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def dpbf16ps_512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<32, __bf16>, _Vector<32, __bf16>)">;
+}
+
+let Features = "avx512bf16", Attributes = [NoThrow, Const] in {
+  def cvtsbf162ss_32 : X86Builtin<"float(__bf16)">;
+}
+
+let Features = "avx512vp2intersect,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vp2intersect_q_512 : X86Builtin<"void(_Vector<8, long long int>, _Vector<8, long long int>, unsigned char *, unsigned char *)">;
+}
+
+let Features = "avx512vp2intersect,avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vp2intersect_q_256 : X86Builtin<"void(_Vector<4, long long int>, _Vector<4, long long int>, unsigned char *, unsigned char *)">;
+}
+
+let Features = "avx512vp2intersect,avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vp2intersect_q_128 : X86Builtin<"void(_Vector<2, long long int>, _Vector<2, long long int>, unsigned char *, unsigned char *)">;
+}
+
+let Features = "avx512vp2intersect,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vp2intersect_d_512 : X86Builtin<"void(_Vector<16, int>, _Vector<16, int>, unsigned short *, unsigned short *)">;
+}
+
+let Features = "avx512vp2intersect,avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vp2intersect_d_256 : X86Builtin<"void(_Vector<8, int>, _Vector<8, int>, unsigned char *, unsigned char *)">;
+}
+
+let Features = "avx512vp2intersect,avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vp2intersect_d_128 : X86Builtin<"void(_Vector<4, int>, _Vector<4, int>, unsigned char *, unsigned char *)">;
+}
+
+let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcomish : X86Builtin<"int(_Vector<8, _Float16>, _Vector<8, _Float16>, _Constant int, _Constant int)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def addph512 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Constant int)">;
+  def subph512 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Constant int)">;
+  def mulph512 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Constant int)">;
+  def divph512 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Constant int)">;
+  def maxph512 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Constant int)">;
+  def minph512 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def minph256 : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def minph128 : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def maxph256 : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def maxph128 : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>)">;
+}
+
+let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def addsh_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def divsh_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def mulsh_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def subsh_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def maxsh_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def minsh_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def cmpph512_mask : X86Builtin<"unsigned int(_Vector<32, _Float16>, _Vector<32, _Float16>, _Constant int, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cmpph256_mask : X86Builtin<"unsigned short(_Vector<16, _Float16>, _Vector<16, _Float16>, _Constant int, unsigned short)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cmpph128_mask : X86Builtin<"unsigned char(_Vector<8, _Float16>, _Vector<8, _Float16>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cmpsh_mask : X86Builtin<"unsigned char(_Vector<8, _Float16>, _Vector<8, _Float16>, _Constant int, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def loadsh128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16 const *>, _Vector<8, _Float16>, unsigned char)">;
+  def storesh128_mask : X86Builtin<"void(_Vector<8, _Float16 *>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rcpph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def rcpph256_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def rcpph512_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, unsigned int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rsqrtph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def rsqrtph256_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def rsqrtph512_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, unsigned int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def getmantph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Constant int, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def getmantph256_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Constant int, _Vector<16, _Float16>, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def getmantph512_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Constant int, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def getexpph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def getexpph256_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def getexpph512_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def scalefph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def scalefph256_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def scalefph512_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rndscaleph_128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Constant int, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def rndscaleph_256_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Constant int, _Vector<16, _Float16>, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def rndscaleph_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Constant int, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def reduceph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Constant int, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def reduceph256_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Constant int, _Vector<16, _Float16>, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def reduceph512_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Constant int, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rcpsh_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char)">;
+  def rsqrtsh_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char)">;
+  def getmantsh_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Constant int, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def getexpsh128_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def scalefsh_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def rndscalesh_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int, _Constant int)">;
+  def reducesh_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def sqrtph : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def sqrtph256 : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def sqrtph512 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Constant int)">;
+}
+
+let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def sqrtsh_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def fpclassph128_mask : X86Builtin<"unsigned char(_Vector<8, _Float16>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def fpclassph256_mask : X86Builtin<"unsigned short(_Vector<16, _Float16>, _Constant int, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def fpclassph512_mask : X86Builtin<"unsigned int(_Vector<32, _Float16>, _Constant int, unsigned int)">;
+}
+
+let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def fpclasssh_mask : X86Builtin<"unsigned char(_Vector<8, _Float16>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtpd2ph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<2, double>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtpd2ph256_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<4, double>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtpd2ph512_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, double>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtph2pd128_mask : X86Builtin<"_Vector<2, double>(_Vector<8, _Float16>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtph2pd256_mask : X86Builtin<"_Vector<4, double>(_Vector<8, _Float16>, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtph2pd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, _Float16>, _Vector<8, double>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtsh2ss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<8, _Float16>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vcvtss2sh_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<4, float>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def vcvtsd2sh_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<2, double>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def vcvtsh2sd_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<8, _Float16>, _Vector<2, double>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtph2w128_mask : X86Builtin<"_Vector<8, short>(_Vector<8, _Float16>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtph2w256_mask : X86Builtin<"_Vector<16, short>(_Vector<16, _Float16>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtph2w512_mask : X86Builtin<"_Vector<32, short>(_Vector<32, _Float16>, _Vector<32, short>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvttph2w128_mask : X86Builtin<"_Vector<8, short>(_Vector<8, _Float16>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvttph2w256_mask : X86Builtin<"_Vector<16, short>(_Vector<16, _Float16>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvttph2w512_mask : X86Builtin<"_Vector<32, short>(_Vector<32, _Float16>, _Vector<32, short>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtw2ph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, short>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtw2ph256_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, short>, _Vector<16, _Float16>, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtw2ph512_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, short>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtph2uw128_mask : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, _Float16>, _Vector<8, unsigned short>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtph2uw256_mask : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, _Float16>, _Vector<16, unsigned short>, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtph2uw512_mask : X86Builtin<"_Vector<32, unsigned short>(_Vector<32, _Float16>, _Vector<32, unsigned short>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvttph2uw128_mask : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, _Float16>, _Vector<8, unsigned short>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvttph2uw256_mask : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, _Float16>, _Vector<16, unsigned short>, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvttph2uw512_mask : X86Builtin<"_Vector<32, unsigned short>(_Vector<32, _Float16>, _Vector<32, unsigned short>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtuw2ph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, unsigned short>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtuw2ph256_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, unsigned short>, _Vector<16, _Float16>, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtuw2ph512_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, unsigned short>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtph2dq128_mask : X86Builtin<"_Vector<4, int>(_Vector<8, _Float16>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtph2dq256_mask : X86Builtin<"_Vector<8, int>(_Vector<8, _Float16>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtph2dq512_mask : X86Builtin<"_Vector<16, int>(_Vector<16, _Float16>, _Vector<16, int>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtph2udq128_mask : X86Builtin<"_Vector<4, unsigned int>(_Vector<8, _Float16>, _Vector<4, unsigned int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtph2udq256_mask : X86Builtin<"_Vector<8, unsigned int>(_Vector<8, _Float16>, _Vector<8, unsigned int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtph2udq512_mask : X86Builtin<"_Vector<16, unsigned int>(_Vector<16, _Float16>, _Vector<16, unsigned int>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtdq2ph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<4, int>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtdq2ph256_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, int>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtdq2ph512_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, int>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtudq2ph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<4, unsigned int>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtudq2ph256_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, unsigned int>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtudq2ph512_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, unsigned int>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvttph2dq128_mask : X86Builtin<"_Vector<4, int>(_Vector<8, _Float16>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvttph2dq256_mask : X86Builtin<"_Vector<8, int>(_Vector<8, _Float16>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvttph2dq512_mask : X86Builtin<"_Vector<16, int>(_Vector<16, _Float16>, _Vector<16, int>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvttph2udq128_mask : X86Builtin<"_Vector<4, unsigned int>(_Vector<8, _Float16>, _Vector<4, unsigned int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvttph2udq256_mask : X86Builtin<"_Vector<8, unsigned int>(_Vector<8, _Float16>, _Vector<8, unsigned int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvttph2udq512_mask : X86Builtin<"_Vector<16, unsigned int>(_Vector<16, _Float16>, _Vector<16, unsigned int>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtqq2ph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<2, long long int>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtqq2ph256_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<4, long long int>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtqq2ph512_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, long long int>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtph2qq128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<8, _Float16>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtph2qq256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<8, _Float16>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtph2qq512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, _Float16>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtuqq2ph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<2, unsigned long long int>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtuqq2ph256_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<4, unsigned long long int>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtuqq2ph512_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, unsigned long long int>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtph2uqq128_mask : X86Builtin<"_Vector<2, unsigned long long int>(_Vector<8, _Float16>, _Vector<2, unsigned long long int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtph2uqq256_mask : X86Builtin<"_Vector<4, unsigned long long int>(_Vector<8, _Float16>, _Vector<4, unsigned long long int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtph2uqq512_mask : X86Builtin<"_Vector<8, unsigned long long int>(_Vector<8, _Float16>, _Vector<8, unsigned long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvttph2qq128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<8, _Float16>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvttph2qq256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<8, _Float16>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvttph2qq512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, _Float16>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvttph2uqq128_mask : X86Builtin<"_Vector<2, unsigned long long int>(_Vector<8, _Float16>, _Vector<2, unsigned long long int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvttph2uqq256_mask : X86Builtin<"_Vector<4, unsigned long long int>(_Vector<8, _Float16>, _Vector<4, unsigned long long int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvttph2uqq512_mask : X86Builtin<"_Vector<8, unsigned long long int>(_Vector<8, _Float16>, _Vector<8, unsigned long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtsh2si32 : X86Builtin<"int(_Vector<8, _Float16>, _Constant int)">;
+  def vcvtsh2usi32 : X86Builtin<"unsigned int(_Vector<8, _Float16>, _Constant int)">;
+  def vcvtusi2sh : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, unsigned int, _Constant int)">;
+  def vcvtsi2sh : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, int, _Constant int)">;
+  def vcvttsh2si32 : X86Builtin<"int(_Vector<8, _Float16>, _Constant int)">;
+  def vcvttsh2usi32 : X86Builtin<"unsigned int(_Vector<8, _Float16>, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtph2psx128_mask : X86Builtin<"_Vector<4, float>(_Vector<8, _Float16>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtph2psx256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, _Float16>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtph2psx512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, _Float16>, _Vector<16, float>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtps2phx128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<4, float>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtps2phx256_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, float>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtps2phx512_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, float>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vfmaddph : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vfmaddph256 : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<16, _Float16>)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vfmaddph512_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+  def vfmaddph512_mask3 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+  def vfmaddph512_maskz : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vfmaddsubph : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vfmaddsubph256 : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<16, _Float16>)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vfmaddsubph512_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+  def vfmaddsubph512_maskz : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+  def vfmaddsubph512_mask3 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+  def vfmsubaddph512_mask3 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+  def vfmsubph512_mask3 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vfmaddsh3_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def vfmaddsh3_maskz : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def vfmaddsh3_mask3 : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def vfmsubsh3_mask3 : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vfmaddcph128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char)">;
+  def vfmaddcph128_maskz : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vfmaddcph256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char)">;
+  def vfmaddcph256_maskz : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vfmaddcph512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def vfmaddcph512_maskz : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def vfmaddcph512_mask3 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vfcmaddcph128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char)">;
+  def vfcmaddcph128_maskz : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vfcmaddcph256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char)">;
+  def vfcmaddcph256_maskz : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vfcmaddcph512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def vfcmaddcph512_maskz : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def vfcmaddcph512_mask3 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vfmaddcsh_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vfmaddcsh_maskz : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vfcmaddcsh_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vfcmaddcsh_maskz : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vfmaddcsh_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vfmaddcsh_round_mask3 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vfcmaddcsh_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vfcmaddcsh_round_mask3 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vfmulcsh_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vfcmulcsh_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vfmulcph128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vfmulcph256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vfmulcph512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vfcmulcph128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vfcmulcph256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vfcmulcph512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def selectb_128 : X86Builtin<"_Vector<16, char>(unsigned short, _Vector<16, char>, _Vector<16, char>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def selectb_256 : X86Builtin<"_Vector<32, char>(unsigned int, _Vector<32, char>, _Vector<32, char>)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def selectb_512 : X86Builtin<"_Vector<64, char>(unsigned long long int, _Vector<64, char>, _Vector<64, char>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def selectw_128 : X86Builtin<"_Vector<8, short>(unsigned char, _Vector<8, short>, _Vector<8, short>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def selectw_256 : X86Builtin<"_Vector<16, short>(unsigned short, _Vector<16, short>, _Vector<16, short>)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def selectw_512 : X86Builtin<"_Vector<32, short>(unsigned int, _Vector<32, short>, _Vector<32, short>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def selectd_128 : X86Builtin<"_Vector<4, int>(unsigned char, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def selectd_256 : X86Builtin<"_Vector<8, int>(unsigned char, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def selectd_512 : X86Builtin<"_Vector<16, int>(unsigned short, _Vector<16, int>, _Vector<16, int>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def selectph_128 : X86Builtin<"_Vector<8, _Float16>(unsigned char, _Vector<8, _Float16>, _Vector<8, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def selectph_256 : X86Builtin<"_Vector<16, _Float16>(unsigned short, _Vector<16, _Float16>, _Vector<16, _Float16>)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def selectph_512 : X86Builtin<"_Vector<32, _Float16>(unsigned int, _Vector<32, _Float16>, _Vector<32, _Float16>)">;
+}
+
+let Features = "avx512bf16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def selectpbf_128 : X86Builtin<"_Vector<8, __bf16>(unsigned char, _Vector<8, __bf16>, _Vector<8, __bf16>)">;
+}
+
+let Features = "avx512bf16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def selectpbf_256 : X86Builtin<"_Vector<16, __bf16>(unsigned short, _Vector<16, __bf16>, _Vector<16, __bf16>)">;
+}
+
+let Features = "avx512bf16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def selectpbf_512 : X86Builtin<"_Vector<32, __bf16>(unsigned int, _Vector<32, __bf16>, _Vector<32, __bf16>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def selectq_128 : X86Builtin<"_Vector<2, long long int>(unsigned char, _Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def selectq_256 : X86Builtin<"_Vector<4, long long int>(unsigned char, _Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def selectq_512 : X86Builtin<"_Vector<8, long long int>(unsigned char, _Vector<8, long long int>, _Vector<8, long long int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def selectps_128 : X86Builtin<"_Vector<4, float>(unsigned char, _Vector<4, float>, _Vector<4, float>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def selectps_256 : X86Builtin<"_Vector<8, float>(unsigned char, _Vector<8, float>, _Vector<8, float>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def selectps_512 : X86Builtin<"_Vector<16, float>(unsigned short, _Vector<16, float>, _Vector<16, float>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def selectpd_128 : X86Builtin<"_Vector<2, double>(unsigned char, _Vector<2, double>, _Vector<2, double>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def selectpd_256 : X86Builtin<"_Vector<4, double>(unsigned char, _Vector<4, double>, _Vector<4, double>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def selectpd_512 : X86Builtin<"_Vector<8, double>(unsigned char, _Vector<8, double>, _Vector<8, double>)">;
+}
+
+let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def selectsh_128 : X86Builtin<"_Vector<8, _Float16>(unsigned char, _Vector<8, _Float16>, _Vector<8, _Float16>)">;
+}
+
+let Features = "avx512bf16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def selectsbf_128 : X86Builtin<"_Vector<8, __bf16>(unsigned char, _Vector<8, __bf16>, _Vector<8, __bf16>)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def selectss_128 : X86Builtin<"_Vector<4, float>(unsigned char, _Vector<4, float>, _Vector<4, float>)">;
+  def selectsd_128 : X86Builtin<"_Vector<2, double>(unsigned char, _Vector<2, double>, _Vector<2, double>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def reduce_fadd_pd512 : X86Builtin<"double(double, _Vector<8, double>)">;
+  def reduce_fadd_ps512 : X86Builtin<"float(float, _Vector<16, float>)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def reduce_fadd_ph512 : X86Builtin<"_Float16(_Float16, _Vector<32, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def reduce_fadd_ph256 : X86Builtin<"_Float16(_Float16, _Vector<16, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def reduce_fadd_ph128 : X86Builtin<"_Float16(_Float16, _Vector<8, _Float16>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def reduce_fmax_pd512 : X86Builtin<"double(_Vector<8, double>)">;
+  def reduce_fmax_ps512 : X86Builtin<"float(_Vector<16, float>)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def reduce_fmax_ph512 : X86Builtin<"_Float16(_Vector<32, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def reduce_fmax_ph256 : X86Builtin<"_Float16(_Vector<16, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def reduce_fmax_ph128 : X86Builtin<"_Float16(_Vector<8, _Float16>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def reduce_fmin_pd512 : X86Builtin<"double(_Vector<8, double>)">;
+  def reduce_fmin_ps512 : X86Builtin<"float(_Vector<16, float>)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def reduce_fmin_ph512 : X86Builtin<"_Float16(_Vector<32, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def reduce_fmin_ph256 : X86Builtin<"_Float16(_Vector<16, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def reduce_fmin_ph128 : X86Builtin<"_Float16(_Vector<8, _Float16>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def reduce_fmul_pd512 : X86Builtin<"double(double, _Vector<8, double>)">;
+  def reduce_fmul_ps512 : X86Builtin<"float(float, _Vector<16, float>)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def reduce_fmul_ph512 : X86Builtin<"_Float16(_Float16, _Vector<32, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def reduce_fmul_ph256 : X86Builtin<"_Float16(_Float16, _Vector<16, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def reduce_fmul_ph128 : X86Builtin<"_Float16(_Float16, _Vector<8, _Float16>)">;
+}
+
+let Features = "mwaitx", Attributes = [NoThrow] in {
+  def monitorx : X86Builtin<"void(void const *, unsigned int, unsigned int)">;
+  def mwaitx : X86Builtin<"void(unsigned int, unsigned int, unsigned int)">;
+}
+
+let Features = "waitpkg", Attributes = [NoThrow] in {
+  def umonitor : X86Builtin<"void(void const *)">;
+  def umwait : X86Builtin<"unsigned char(unsigned int, unsigned int, unsigned int)">;
+  def tpause : X86Builtin<"unsigned char(unsigned int, unsigned int, unsigned int)">;
+}
+
+let Features = "clzero", Attributes = [NoThrow] in {
+  def clzero : X86Builtin<"void(void *)">;
+}
+
+let Features = "cldemote", Attributes = [NoThrow] in {
+  def cldemote : X86Builtin<"void(void const *)">;
+}
+
+let Features = "movdiri", Attributes = [NoThrow] in {
+  def directstore_u32 : X86Builtin<"void(unsigned int *, unsigned int)">;
+}
+
+let Features = "movdir64b", Attributes = [NoThrow] in {
+  def movdir64b : X86Builtin<"void(void *, void const *)">;
+}
+
+let Features = "ptwrite", Attributes = [NoThrow] in {
+  def ptwrite32 : X86Builtin<"void(unsigned int)">;
+}
+
+let Features = "invpcid", Attributes = [NoThrow, Const] in {
+  def invpcid : X86Builtin<"void(unsigned int, void *)">;
+}
+
+let Features = "enqcmd", Attributes = [NoThrow] in {
+  def enqcmd : X86Builtin<"unsigned char(void *, void const *)">;
+  def enqcmds : X86Builtin<"unsigned char(void *, void const *)">;
+}
+
+let Features = "kl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def loadiwkey : X86Builtin<"void(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>, unsigned int)">;
+  def encodekey128_u32 : X86Builtin<"unsigned int(unsigned int, _Vector<2, long long int>, void *)">;
+  def encodekey256_u32 : X86Builtin<"unsigned int(unsigned int, _Vector<2, long long int>, _Vector<2, long long int>, void *)">;
+  def aesenc128kl_u8 : X86Builtin<"unsigned char(_Vector<2, long long int *>, _Vector<2, long long int>, void const *)">;
+  def aesenc256kl_u8 : X86Builtin<"unsigned char(_Vector<2, long long int *>, _Vector<2, long long int>, void const *)">;
+  def aesdec128kl_u8 : X86Builtin<"unsigned char(_Vector<2, long long int *>, _Vector<2, long long int>, void const *)">;
+  def aesdec256kl_u8 : X86Builtin<"unsigned char(_Vector<2, long long int *>, _Vector<2, long long int>, void const *)">;
+}
+
+let Features = "kl,widekl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def aesencwide128kl_u8 : X86Builtin<"unsigned char(_Vector<2, long long int *>, _Vector<2, long long int const *>, void const *)">;
+  def aesencwide256kl_u8 : X86Builtin<"unsigned char(_Vector<2, long long int *>, _Vector<2, long long int const *>, void const *)">;
+  def aesdecwide128kl_u8 : X86Builtin<"unsigned char(_Vector<2, long long int *>, _Vector<2, long long int const *>, void const *)">;
+  def aesdecwide256kl_u8 : X86Builtin<"unsigned char(_Vector<2, long long int *>, _Vector<2, long long int const *>, void const *)">;
+}
+
+let Features = "serialize", Attributes = [NoThrow] in {
+  def serialize : X86Builtin<"void()">;
+}
+
+let Features = "tsxldtrk", Attributes = [NoThrow] in {
+  def xsusldtrk : X86Builtin<"void()">;
+  def xresldtrk : X86Builtin<"void()">;
+}
+
+let Features = "raoint", Attributes = [NoThrow] in {
+  def aadd32 : X86Builtin<"void(void *, signed int)">;
+  def aand32 : X86Builtin<"void(void *, signed int)">;
+  def aor32 : X86Builtin<"void(void *, signed int)">;
+  def axor32 : X86Builtin<"void(void *, signed int)">;
+}
+
+let Header = "intrin.h", Languages = "ALL_MS_LANGUAGES", Attributes = [NoThrow, RequireDeclaration] in {
+  def _BitScanForward : X86LibBuiltin<"unsigned char(msuint32_t *, msuint32_t)">;
+  def _BitScanReverse : X86LibBuiltin<"unsigned char(msuint32_t *, msuint32_t)">;
+  def _ReadWriteBarrier : X86LibBuiltin<"void()">;
+  def _ReadBarrier : X86LibBuiltin<"void()">;
+  def _WriteBarrier : X86LibBuiltin<"void()">;
+  def __cpuid : X86LibBuiltin<"void(int *, int)">;
+  def __cpuidex : X86LibBuiltin<"void(int *, int, int)">;
+}
+
+let Header = "intrin.h", Languages = "ALL_MS_LANGUAGES", Attributes = [NoThrow, Const, RequireDeclaration] in {
+  def __emul : X86LibBuiltin<"long long int(int, int)">;
+  def __emulu : X86LibBuiltin<"unsigned long long int(unsigned int, unsigned int)">;
+}
+
+let Header = "intrin.h", Languages = "ALL_MS_LANGUAGES", Attributes = [NoThrow, RequireDeclaration] in {
+  def _AddressOfReturnAddress : X86LibBuiltin<"void *()">;
+  def __stosb : X86LibBuiltin<"void(unsigned char *, unsigned char, size_t)">;
+}
+
+let Header = "intrin.h", Languages = "ALL_MS_LANGUAGES", Attributes = [NoThrow, RequireDeclaration, NoReturn] in {
+  def __int2c : X86LibBuiltin<"void()">;
+  def __ud2 : X86LibBuiltin<"void()">;
+}
+
+let Header = "intrin.h", Languages = "ALL_MS_LANGUAGES", Attributes = [NoThrow, RequireDeclaration] in {
+  def __readfsbyte : X86LibBuiltin<"unsigned char(msuint32_t)">;
+  def __readfsword : X86LibBuiltin<"unsigned short(msuint32_t)">;
+  def __readfsdword : X86LibBuiltin<"msuint32_t(msuint32_t)">;
+  def __readfsqword : X86LibBuiltin<"unsigned long long int(msuint32_t)">;
+  def __readgsbyte : X86LibBuiltin<"unsigned char(msuint32_t)">;
+  def __readgsword : X86LibBuiltin<"unsigned short(msuint32_t)">;
+  def __readgsdword : X86LibBuiltin<"msuint32_t(msuint32_t)">;
+  def __readgsqword : X86LibBuiltin<"unsigned long long int(msuint32_t)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vdpphps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<8, _Float16>, _Vector<8, _Float16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vdpphps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<16, _Float16>, _Vector<16, _Float16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vdpphps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<32, _Float16>, _Vector<32, _Float16>)">;
+  def vpdpbssd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+  def vpdpbssds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+  def vpdpbsud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+  def vpdpbsuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+  def vpdpbuud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+  def vpdpbuuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vpdpwsud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+  def vpdpwsuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+  def vpdpwusd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+  def vpdpwusds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+  def vpdpwuud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+  def vpdpwuuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def mpsadbw512 : X86Builtin<"_Vector<32, short>(_Vector<64, char>, _Vector<64, char>, _Constant char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vaddpd256_round : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
+  def vaddph256_round : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Constant int)">;
+  def vaddps256_round : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
+  def vcmppd256_round_mask : X86Builtin<"unsigned char(_Vector<4, double>, _Vector<4, double>, _Constant int, unsigned char, _Constant int)">;
+  def vcmpph256_round_mask : X86Builtin<"unsigned short(_Vector<16, _Float16>, _Vector<16, _Float16>, _Constant int, unsigned short, _Constant int)">;
+  def vcmpps256_round_mask : X86Builtin<"unsigned char(_Vector<8, float>, _Vector<8, float>, _Constant int, unsigned char, _Constant int)">;
+  def vcvtdq2ph256_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, int>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def vcvtdq2ps256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, int>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vcvtpd2dq256_round_mask : X86Builtin<"_Vector<4, int>(_Vector<4, double>, _Vector<4, int>, unsigned char, _Constant int)">;
+  def vcvtpd2ph256_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<4, double>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def vcvtpd2ps256_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, double>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vcvtpd2qq256_round_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, double>, _Vector<4, long long int>, unsigned char, _Constant int)">;
+  def vcvtpd2udq256_round_mask : X86Builtin<"_Vector<4, unsigned int>(_Vector<4, double>, _Vector<4, unsigned int>, unsigned char, _Constant int)">;
+  def vcvtpd2uqq256_round_mask : X86Builtin<"_Vector<4, unsigned long long int>(_Vector<4, double>, _Vector<4, unsigned long long int>, unsigned char, _Constant int)">;
+  def vcvtph2dq256_round_mask : X86Builtin<"_Vector<8, int>(_Vector<8, _Float16>, _Vector<8, int>, unsigned char, _Constant int)">;
+  def vcvtph2pd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<8, _Float16>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vcvtph2psx256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, _Float16>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vcvtph2qq256_round_mask : X86Builtin<"_Vector<4, long long int>(_Vector<8, _Float16>, _Vector<4, long long int>, unsigned char, _Constant int)">;
+  def vcvtph2udq256_round_mask : X86Builtin<"_Vector<8, unsigned int>(_Vector<8, _Float16>, _Vector<8, unsigned int>, unsigned char, _Constant int)">;
+  def vcvtph2uqq256_round_mask : X86Builtin<"_Vector<4, unsigned long long int>(_Vector<8, _Float16>, _Vector<4, unsigned long long int>, unsigned char, _Constant int)">;
+  def vcvtph2uw256_round_mask : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, _Float16>, _Vector<16, unsigned short>, unsigned short, _Constant int)">;
+  def vcvtph2w256_round_mask : X86Builtin<"_Vector<16, short>(_Vector<16, _Float16>, _Vector<16, short>, unsigned short, _Constant int)">;
+  def vcvtps2dq256_round_mask : X86Builtin<"_Vector<8, int>(_Vector<8, float>, _Vector<8, int>, unsigned char, _Constant int)">;
+  def vcvtps2pd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, float>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vcvtps2phx256_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, float>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def vcvtps2qq256_round_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, float>, _Vector<4, long long int>, unsigned char, _Constant int)">;
+  def vcvtps2udq256_round_mask : X86Builtin<"_Vector<8, unsigned int>(_Vector<8, float>, _Vector<8, unsigned int>, unsigned char, _Constant int)">;
+  def vcvtps2uqq256_round_mask : X86Builtin<"_Vector<4, unsigned long long int>(_Vector<4, float>, _Vector<4, unsigned long long int>, unsigned char, _Constant int)">;
+  def vcvtqq2pd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, long long int>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vcvtqq2ph256_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<4, long long int>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def vcvtqq2ps256_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, long long int>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vcvttpd2dq256_round_mask : X86Builtin<"_Vector<4, int>(_Vector<4, double>, _Vector<4, int>, unsigned char, _Constant int)">;
+  def vcvttpd2qq256_round_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, double>, _Vector<4, long long int>, unsigned char, _Constant int)">;
+  def vcvttpd2udq256_round_mask : X86Builtin<"_Vector<4, unsigned int>(_Vector<4, double>, _Vector<4, unsigned int>, unsigned char, _Constant int)">;
+  def vcvttpd2uqq256_round_mask : X86Builtin<"_Vector<4, unsigned long long int>(_Vector<4, double>, _Vector<4, unsigned long long int>, unsigned char, _Constant int)">;
+  def vcvttph2dq256_round_mask : X86Builtin<"_Vector<8, int>(_Vector<8, _Float16>, _Vector<8, int>, unsigned char, _Constant int)">;
+  def vcvttph2qq256_round_mask : X86Builtin<"_Vector<4, long long int>(_Vector<8, _Float16>, _Vector<4, long long int>, unsigned char, _Constant int)">;
+  def vcvttph2udq256_round_mask : X86Builtin<"_Vector<8, unsigned int>(_Vector<8, _Float16>, _Vector<8, unsigned int>, unsigned char, _Constant int)">;
+  def vcvttph2uqq256_round_mask : X86Builtin<"_Vector<4, unsigned long long int>(_Vector<8, _Float16>, _Vector<4, unsigned long long int>, unsigned char, _Constant int)">;
+  def vcvttph2uw256_round_mask : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, _Float16>, _Vector<16, unsigned short>, unsigned short, _Constant int)">;
+  def vcvttph2w256_round_mask : X86Builtin<"_Vector<16, short>(_Vector<16, _Float16>, _Vector<16, short>, unsigned short, _Constant int)">;
+  def vcvttps2dq256_round_mask : X86Builtin<"_Vector<8, int>(_Vector<8, float>, _Vector<8, int>, unsigned char, _Constant int)">;
+  def vcvttps2qq256_round_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, float>, _Vector<4, long long int>, unsigned char, _Constant int)">;
+  def vcvttps2udq256_round_mask : X86Builtin<"_Vector<8, unsigned int>(_Vector<8, float>, _Vector<8, unsigned int>, unsigned char, _Constant int)">;
+  def vcvttps2uqq256_round_mask : X86Builtin<"_Vector<4, unsigned long long int>(_Vector<4, float>, _Vector<4, unsigned long long int>, unsigned char, _Constant int)">;
+  def vcvtudq2ph256_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, unsigned int>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def vcvtudq2ps256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, unsigned int>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vcvtuqq2pd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, unsigned long long int>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vcvtuqq2ph256_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<4, unsigned long long int>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def vcvtuqq2ps256_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, unsigned long long int>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vcvtuw2ph256_round_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, unsigned short>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vcvtw2ph256_round_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, short>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vdivpd256_round : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
+  def vdivph256_round : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Constant int)">;
+  def vdivps256_round : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
+  def vfcmaddcph256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfcmaddcph256_round_maskz : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfcmaddcph256_round_mask3 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfcmulcph256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfixupimmpd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, long long int>, _Constant int, unsigned char, _Constant int)">;
+  def vfixupimmpd256_round_maskz : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, long long int>, _Constant int, unsigned char, _Constant int)">;
+  def vfixupimmps256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, int>, _Constant int, unsigned char, _Constant int)">;
+  def vfixupimmps256_round_maskz : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, int>, _Constant int, unsigned char, _Constant int)">;
+  def vfmaddpd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vfmaddpd256_round_maskz : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vfmaddpd256_round_mask3 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vfmaddph256_round_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vfmaddph256_round_maskz : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vfmaddph256_round_mask3 : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vfmaddps256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfmaddps256_round_maskz : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfmaddps256_round_mask3 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfmaddcph256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfmaddcph256_round_maskz : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfmaddcph256_round_mask3 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfmaddsubpd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vfmaddsubpd256_round_maskz : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vfmaddsubpd256_round_mask3 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vfmaddsubph256_round_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vfmaddsubph256_round_maskz : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vfmaddsubph256_round_mask3 : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vfmaddsubps256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfmaddsubps256_round_maskz : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfmaddsubps256_round_mask3 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfmsubpd256_round_mask3 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vfmsubph256_round_mask3 : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vfmsubps256_round_mask3 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfmsubaddpd256_round_mask3 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vfmsubaddph256_round_mask3 : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vfmsubaddps256_round_mask3 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfmulcph256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vgetexppd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vgetexpph256_round_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vgetexpps256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vgetmantpd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vgetmantph256_round_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Constant int, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vgetmantps256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Constant int, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vmaxpd256_round : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
+  def vmaxph256_round : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Constant int)">;
+  def vmaxps256_round : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
+  def vminpd256_round : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
+  def vminph256_round : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Constant int)">;
+  def vminps256_round : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
+  def vmulpd256_round : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
+  def vmulph256_round : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Constant int)">;
+  def vmulps256_round : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
+  def vrangepd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vrangeps256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vreducepd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vreduceph256_round_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Constant int, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vreduceps256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Constant int, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vrndscalepd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vrndscaleph256_round_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Constant int, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vrndscaleps256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Constant int, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vscalefpd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vscalefph256_round_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vscalefps256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vsqrtpd256_round : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">;
+  def vsqrtph256_round : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Constant int)">;
+  def vsqrtps256_round : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Constant int)">;
+  def vsubpd256_round : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
+  def vsubph256_round : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Constant int)">;
+  def vsubps256_round : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
+}
+
+let Features = "avxvnniint16|avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vpdpwsud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avxvnniint16|avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vpdpwsud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avxvnniint16|avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vpdpwsuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avxvnniint16|avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vpdpwsuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avxvnniint16|avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vpdpwusd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avxvnniint16|avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vpdpwusd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avxvnniint16|avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vpdpwusds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avxvnniint16|avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vpdpwusds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avxvnniint16|avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vpdpwuud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avxvnniint16|avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vpdpwuud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avxvnniint16|avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vpdpwuuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avxvnniint16|avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vpdpwuuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvttsd2sis32 : X86Builtin<"int(_Vector<2, double>, _Constant int)">;
+  def vcvttsd2usis32 : X86Builtin<"unsigned int(_Vector<2, double>, _Constant int)">;
+  def vcvttss2sis32 : X86Builtin<"int(_Vector<4, float>, _Constant int)">;
+  def vcvttss2usis32 : X86Builtin<"unsigned int(_Vector<4, float>, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttpd2dqs128_mask : X86Builtin<"_Vector<4, int>(_Vector<2, double>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttpd2dqs256_round_mask : X86Builtin<"_Vector<4, int>(_Vector<4, double>, _Vector<4, int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttpd2dqs512_round_mask : X86Builtin<"_Vector<8, int>(_Vector<8, double>, _Vector<8, int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttpd2udqs128_mask : X86Builtin<"_Vector<4, int>(_Vector<2, double>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttpd2udqs256_round_mask : X86Builtin<"_Vector<4, int>(_Vector<4, double>, _Vector<4, int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttpd2udqs512_round_mask : X86Builtin<"_Vector<8, int>(_Vector<8, double>, _Vector<8, int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttpd2qqs128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<2, double>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttpd2qqs256_round_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, double>, _Vector<4, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttpd2qqs512_round_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, double>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttpd2uqqs128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<2, double>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttpd2uqqs256_round_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, double>, _Vector<4, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttpd2uqqs512_round_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, double>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttps2dqs128_mask : X86Builtin<"_Vector<4, int>(_Vector<4, float>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttps2dqs256_round_mask : X86Builtin<"_Vector<8, int>(_Vector<8, float>, _Vector<8, int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttps2dqs512_round_mask : X86Builtin<"_Vector<16, int>(_Vector<16, float>, _Vector<16, int>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttps2udqs128_mask : X86Builtin<"_Vector<4, int>(_Vector<4, float>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttps2udqs256_round_mask : X86Builtin<"_Vector<8, int>(_Vector<8, float>, _Vector<8, int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttps2udqs512_round_mask : X86Builtin<"_Vector<16, int>(_Vector<16, float>, _Vector<16, int>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttps2qqs128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<4, float>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttps2qqs256_round_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, float>, _Vector<4, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttps2qqs512_round_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, float>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttps2uqqs128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<4, float>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttps2uqqs256_round_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, float>, _Vector<4, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttps2uqqs512_round_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, float>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vbcstnebf162ps128 : X86Builtin<"_Vector<4, float>(__bf16 const *)">;
+}
+
+let Features = "avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vbcstnebf162ps256 : X86Builtin<"_Vector<8, float>(__bf16 const *)">;
+}
+
+let Features = "avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vbcstnesh2ps128 : X86Builtin<"_Vector<4, float>(_Float16 const *)">;
+}
+
+let Features = "avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vbcstnesh2ps256 : X86Builtin<"_Vector<8, float>(_Float16 const *)">;
+}
+
+let Features = "avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtneebf162ps128 : X86Builtin<"_Vector<4, float>(_Vector<8, __bf16 const *>)">;
+}
+
+let Features = "avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtneebf162ps256 : X86Builtin<"_Vector<8, float>(_Vector<16, __bf16 const *>)">;
+}
+
+let Features = "avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtneeph2ps128 : X86Builtin<"_Vector<4, float>(_Vector<8, _Float16 const *>)">;
+}
+
+let Features = "avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtneeph2ps256 : X86Builtin<"_Vector<8, float>(_Vector<16, _Float16 const *>)">;
+}
+
+let Features = "avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtneobf162ps128 : X86Builtin<"_Vector<4, float>(_Vector<8, __bf16 const *>)">;
+}
+
+let Features = "avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtneobf162ps256 : X86Builtin<"_Vector<8, float>(_Vector<16, __bf16 const *>)">;
+}
+
+let Features = "avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtneoph2ps128 : X86Builtin<"_Vector<4, float>(_Vector<8, _Float16 const *>)">;
+}
+
+let Features = "avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtneoph2ps256 : X86Builtin<"_Vector<8, float>(_Vector<16, _Float16 const *>)">;
+}
+
+let Features = "avx512bf16,avx512vl|avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtneps2bf16128 : X86Builtin<"_Vector<8, __bf16>(_Vector<4, float>)">;
+}
+
+let Features = "avx512bf16,avx512vl|avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtneps2bf16256 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, float>)">;
+}
+
+let Features = "sha512", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vsha512msg1 : X86Builtin<"_Vector<4, unsigned long long int>(_Vector<4, unsigned long long int>, _Vector<2, unsigned long long int>)">;
+  def vsha512msg2 : X86Builtin<"_Vector<4, unsigned long long int>(_Vector<4, unsigned long long int>, _Vector<4, unsigned long long int>)">;
+  def vsha512rnds2 : X86Builtin<"_Vector<4, unsigned long long int>(_Vector<4, unsigned long long int>, _Vector<4, unsigned long long int>, _Vector<2, unsigned long long int>)">;
+}
+
+let Header = "intrin.h", Languages = "ALL_MS_LANGUAGES", Attributes = [NoThrow, RequireDeclaration] in {
+  def _InterlockedAnd64 : X86LibBuiltin<"int64_t(int64_t volatile *, int64_t)">;
+  def _InterlockedDecrement64 : X86LibBuiltin<"int64_t(int64_t volatile *)">;
+  def _InterlockedExchange64 : X86LibBuiltin<"int64_t(int64_t volatile *, int64_t)">;
+  def _InterlockedExchangeAdd64 : X86LibBuiltin<"int64_t(int64_t volatile *, int64_t)">;
+  def _InterlockedExchangeSub64 : X86LibBuiltin<"int64_t(int64_t volatile *, int64_t)">;
+  def _InterlockedIncrement64 : X86LibBuiltin<"int64_t(int64_t volatile *)">;
+  def _InterlockedOr64 : X86LibBuiltin<"int64_t(int64_t volatile *, int64_t)">;
+  def _InterlockedXor64 : X86LibBuiltin<"int64_t(int64_t volatile *, int64_t)">;
+}
+
+let Features = "sm3", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vsm3msg1 : X86Builtin<"_Vector<4, unsigned int>(_Vector<4, unsigned int>, _Vector<4, unsigned int>, _Vector<4, unsigned int>)">;
+  def vsm3msg2 : X86Builtin<"_Vector<4, unsigned int>(_Vector<4, unsigned int>, _Vector<4, unsigned int>, _Vector<4, unsigned int>)">;
+  def vsm3rnds2 : X86Builtin<"_Vector<4, unsigned int>(_Vector<4, unsigned int>, _Vector<4, unsigned int>, _Vector<4, unsigned int>, _Constant unsigned int)">;
+}
+
+let Features = "sm4", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vsm4key4128 : X86Builtin<"_Vector<4, unsigned int>(_Vector<4, unsigned int>, _Vector<4, unsigned int>)">;
+}
+
+let Features = "sm4", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vsm4key4256 : X86Builtin<"_Vector<8, unsigned int>(_Vector<8, unsigned int>, _Vector<8, unsigned int>)">;
+}
+
+let Features = "sm4", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vsm4rnds4128 : X86Builtin<"_Vector<4, unsigned int>(_Vector<4, unsigned int>, _Vector<4, unsigned int>)">;
+}
+
+let Features = "sm4", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vsm4rnds4256 : X86Builtin<"_Vector<8, unsigned int>(_Vector<8, unsigned int>, _Vector<8, unsigned int>)">;
+}
+
+let Features = "avx10.2-512,sm4", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vsm4key4512 : X86Builtin<"_Vector<16, unsigned int>(_Vector<16, unsigned int>, _Vector<16, unsigned int>)">;
+  def vsm4rnds4512 : X86Builtin<"_Vector<16, unsigned int>(_Vector<16, unsigned int>, _Vector<16, unsigned int>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vminmaxnepbf16128 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vminmaxnepbf16256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vminmaxnepbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vminmaxpd128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vminmaxpd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int, _Vector<4, double>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vminmaxpd512_round_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Constant int, _Vector<8, double>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vminmaxph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Constant int, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vminmaxph256_round_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Constant int, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vminmaxph512_round_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Constant int, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vminmaxps128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vminmaxps256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int, _Vector<8, float>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vminmaxps512_round_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int, _Vector<16, float>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vminmaxsd_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int, _Vector<2, double>, unsigned char, _Constant int)">;
+  def vminmaxsh_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Constant int, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def vminmaxss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vcvtnebf162ibs128 : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtnebf162ibs256 : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, __bf16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtnebf162ibs512 : X86Builtin<"_Vector<32, unsigned short>(_Vector<32, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtnebf162iubs128 : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtnebf162iubs256 : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, __bf16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtnebf162iubs512 : X86Builtin<"_Vector<32, unsigned short>(_Vector<32, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtph2ibs128_mask : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, _Float16>, _Vector<8, unsigned short>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtph2ibs256_mask : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, _Float16>, _Vector<16, unsigned short>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtph2ibs512_mask : X86Builtin<"_Vector<32, unsigned short>(_Vector<32, _Float16>, _Vector<32, unsigned short>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtph2iubs128_mask : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, _Float16>, _Vector<8, unsigned short>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtph2iubs256_mask : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, _Float16>, _Vector<16, unsigned short>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtph2iubs512_mask : X86Builtin<"_Vector<32, unsigned short>(_Vector<32, _Float16>, _Vector<32, unsigned short>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtps2ibs128_mask : X86Builtin<"_Vector<4, unsigned int>(_Vector<4, float>, _Vector<4, unsigned int>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtps2ibs256_mask : X86Builtin<"_Vector<8, unsigned int>(_Vector<8, float>, _Vector<8, unsigned int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtps2ibs512_mask : X86Builtin<"_Vector<16, unsigned int>(_Vector<16, float>, _Vector<16, unsigned int>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtps2iubs128_mask : X86Builtin<"_Vector<4, unsigned int>(_Vector<4, float>, _Vector<4, unsigned int>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtps2iubs256_mask : X86Builtin<"_Vector<8, unsigned int>(_Vector<8, float>, _Vector<8, unsigned int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtps2iubs512_mask : X86Builtin<"_Vector<16, unsigned int>(_Vector<16, float>, _Vector<16, unsigned int>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttnebf162ibs128 : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttnebf162ibs256 : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, __bf16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttnebf162ibs512 : X86Builtin<"_Vector<32, unsigned short>(_Vector<32, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttnebf162iubs128 : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttnebf162iubs256 : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, __bf16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttnebf162iubs512 : X86Builtin<"_Vector<32, unsigned short>(_Vector<32, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttph2ibs128_mask : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, _Float16>, _Vector<8, unsigned short>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttph2ibs256_mask : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, _Float16>, _Vector<16, unsigned short>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttph2ibs512_mask : X86Builtin<"_Vector<32, unsigned short>(_Vector<32, _Float16>, _Vector<32, unsigned short>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttph2iubs128_mask : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, _Float16>, _Vector<8, unsigned short>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttph2iubs256_mask : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, _Float16>, _Vector<16, unsigned short>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttph2iubs512_mask : X86Builtin<"_Vector<32, unsigned short>(_Vector<32, _Float16>, _Vector<32, unsigned short>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttps2ibs128_mask : X86Builtin<"_Vector<4, unsigned int>(_Vector<4, float>, _Vector<4, unsigned int>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttps2ibs256_mask : X86Builtin<"_Vector<8, unsigned int>(_Vector<8, float>, _Vector<8, unsigned int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttps2ibs512_mask : X86Builtin<"_Vector<16, unsigned int>(_Vector<16, float>, _Vector<16, unsigned int>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttps2iubs128_mask : X86Builtin<"_Vector<4, unsigned int>(_Vector<4, float>, _Vector<4, unsigned int>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttps2iubs256_mask : X86Builtin<"_Vector<8, unsigned int>(_Vector<8, float>, _Vector<8, unsigned int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttps2iubs512_mask : X86Builtin<"_Vector<16, unsigned int>(_Vector<16, float>, _Vector<16, unsigned int>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvt2ps2phx128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<4, float>, _Vector<4, float>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvt2ps2phx256_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<8, float>, _Vector<8, float>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvt2ps2phx512_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<16, float>, _Vector<16, float>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtbiasph2bf8_128_mask : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<8, _Float16>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtbiasph2bf8_256_mask : X86Builtin<"_Vector<16, char>(_Vector<32, char>, _Vector<16, _Float16>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtbiasph2bf8_512_mask : X86Builtin<"_Vector<32, char>(_Vector<64, char>, _Vector<32, _Float16>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtbiasph2bf8s_128_mask : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<8, _Float16>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtbiasph2bf8s_256_mask : X86Builtin<"_Vector<16, char>(_Vector<32, char>, _Vector<16, _Float16>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtbiasph2bf8s_512_mask : X86Builtin<"_Vector<32, char>(_Vector<64, char>, _Vector<32, _Float16>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtbiasph2hf8_128_mask : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<8, _Float16>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtbiasph2hf8_256_mask : X86Builtin<"_Vector<16, char>(_Vector<32, char>, _Vector<16, _Float16>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtbiasph2hf8_512_mask : X86Builtin<"_Vector<32, char>(_Vector<64, char>, _Vector<32, _Float16>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtbiasph2hf8s_128_mask : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<8, _Float16>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtbiasph2hf8s_256_mask : X86Builtin<"_Vector<16, char>(_Vector<32, char>, _Vector<16, _Float16>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtbiasph2hf8s_512_mask : X86Builtin<"_Vector<32, char>(_Vector<64, char>, _Vector<32, _Float16>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtne2ph2bf8_128 : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<8, _Float16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtne2ph2bf8_256 : X86Builtin<"_Vector<32, char>(_Vector<16, _Float16>, _Vector<16, _Float16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtne2ph2bf8_512 : X86Builtin<"_Vector<64, char>(_Vector<32, _Float16>, _Vector<32, _Float16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtne2ph2bf8s_128 : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<8, _Float16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtne2ph2bf8s_256 : X86Builtin<"_Vector<32, char>(_Vector<16, _Float16>, _Vector<16, _Float16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtne2ph2bf8s_512 : X86Builtin<"_Vector<64, char>(_Vector<32, _Float16>, _Vector<32, _Float16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtne2ph2hf8_128 : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<8, _Float16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtne2ph2hf8_256 : X86Builtin<"_Vector<32, char>(_Vector<16, _Float16>, _Vector<16, _Float16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtne2ph2hf8_512 : X86Builtin<"_Vector<64, char>(_Vector<32, _Float16>, _Vector<32, _Float16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtne2ph2hf8s_128 : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<8, _Float16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtne2ph2hf8s_256 : X86Builtin<"_Vector<32, char>(_Vector<16, _Float16>, _Vector<16, _Float16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtne2ph2hf8s_512 : X86Builtin<"_Vector<64, char>(_Vector<32, _Float16>, _Vector<32, _Float16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvthf8_2ph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<16, char>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvthf8_2ph256_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, char>, _Vector<16, _Float16>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvthf8_2ph512_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, char>, _Vector<32, _Float16>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtneph2bf8_128_mask : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtneph2bf8_256_mask : X86Builtin<"_Vector<16, char>(_Vector<16, _Float16>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtneph2bf8_512_mask : X86Builtin<"_Vector<32, char>(_Vector<32, _Float16>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtneph2bf8s_128_mask : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtneph2bf8s_256_mask : X86Builtin<"_Vector<16, char>(_Vector<16, _Float16>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtneph2bf8s_512_mask : X86Builtin<"_Vector<32, char>(_Vector<32, _Float16>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtneph2hf8_128_mask : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtneph2hf8_256_mask : X86Builtin<"_Vector<16, char>(_Vector<16, _Float16>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtneph2hf8_512_mask : X86Builtin<"_Vector<32, char>(_Vector<32, _Float16>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtneph2hf8s_128_mask : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtneph2hf8s_256_mask : X86Builtin<"_Vector<16, char>(_Vector<16, _Float16>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtneph2hf8s_512_mask : X86Builtin<"_Vector<32, char>(_Vector<32, _Float16>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def loadsbf16128_mask : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16 const *>, _Vector<8, __bf16>, unsigned char)">;
+  def storesbf16128_mask : X86Builtin<"void(_Vector<8, __bf16 *>, _Vector<8, __bf16>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vaddnepbf16128 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vaddnepbf16256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vaddnepbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vdivnepbf16128 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vdivnepbf16256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vdivnepbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vmaxpbf16128 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vmaxpbf16256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vmaxpbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vminpbf16128 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vminpbf16256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vminpbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vmulnepbf16128 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vmulnepbf16256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vmulnepbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vsubnepbf16128 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vsubnepbf16256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vsubnepbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcomsbf16eq : X86Builtin<"int(_Vector<8, __bf16>, _Vector<8, __bf16>)">;
+  def vcomsbf16lt : X86Builtin<"int(_Vector<8, __bf16>, _Vector<8, __bf16>)">;
+  def vcomsbf16neq : X86Builtin<"int(_Vector<8, __bf16>, _Vector<8, __bf16>)">;
+  def vcomsbf16ge : X86Builtin<"int(_Vector<8, __bf16>, _Vector<8, __bf16>)">;
+  def vcomsbf16gt : X86Builtin<"int(_Vector<8, __bf16>, _Vector<8, __bf16>)">;
+  def vcomsbf16le : X86Builtin<"int(_Vector<8, __bf16>, _Vector<8, __bf16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcmppbf16512_mask : X86Builtin<"unsigned int(_Vector<32, __bf16>, _Vector<32, __bf16>, _Constant int, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcmppbf16256_mask : X86Builtin<"unsigned short(_Vector<16, __bf16>, _Vector<16, __bf16>, _Constant int, unsigned short)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcmppbf16128_mask : X86Builtin<"unsigned char(_Vector<8, __bf16>, _Vector<8, __bf16>, _Constant int, unsigned char)">;
+  def vfpclasspbf16128_mask : X86Builtin<"unsigned char(_Vector<8, __bf16>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vfpclasspbf16256_mask : X86Builtin<"unsigned short(_Vector<16, __bf16>, _Constant int, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vfpclasspbf16512_mask : X86Builtin<"unsigned int(_Vector<32, __bf16>, _Constant int, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vscalefpbf16128_mask : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>, _Vector<8, __bf16>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vscalefpbf16256_mask : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>, _Vector<16, __bf16>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vscalefpbf16512_mask : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>, _Vector<32, __bf16>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vrcppbf16128_mask : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vrcppbf16256_mask : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vrcppbf16512_mask : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vgetexppbf16128_mask : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vgetexppbf16256_mask : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vgetexppbf16512_mask : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vrsqrtpbf16128_mask : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vrsqrtpbf16256_mask : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vrsqrtpbf16512_mask : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vreducenepbf16128_mask : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Constant int, _Vector<8, __bf16>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vreducenepbf16256_mask : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Constant int, _Vector<16, __bf16>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vreducenepbf16512_mask : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Constant int, _Vector<32, __bf16>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vrndscalenepbf16_128_mask : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Constant int, _Vector<8, __bf16>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vrndscalenepbf16_256_mask : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Constant int, _Vector<16, __bf16>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vrndscalenepbf16_mask : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Constant int, _Vector<32, __bf16>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vgetmantpbf16128_mask : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Constant int, _Vector<8, __bf16>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vgetmantpbf16256_mask : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Constant int, _Vector<16, __bf16>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vgetmantpbf16512_mask : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Constant int, _Vector<32, __bf16>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vsqrtnepbf16 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vsqrtnepbf16256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vsqrtnepbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>)">;
+  def vfmaddnepbh512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>, _Vector<32, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vfmaddnepbh256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>, _Vector<16, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vfmaddnepbh128 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>, _Vector<8, __bf16>)">;
+}
diff --git a/clang/include/clang/Basic/TargetBuiltins.h b/clang/include/clang/Basic/TargetBuiltins.h
index a14fd2c4b224d..556332dd4b217 100644
--- a/clang/include/clang/Basic/TargetBuiltins.h
+++ b/clang/include/clang/Basic/TargetBuiltins.h
@@ -124,8 +124,6 @@ namespace clang {
   enum {
     LastTIBuiltin = clang::Builtin::FirstTSBuiltin - 1,
 #define BUILTIN(ID, TYPE, ATTRS) BI##ID,
-#include "clang/Basic/BuiltinsX86.def"
-#define BUILTIN(ID, TYPE, ATTRS) BI##ID,
 #include "clang/Basic/BuiltinsX86.inc"
     FirstX86_64Builtin,
     LastX86CommonBuiltin = FirstX86_64Builtin - 1,
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index 1b16888a0711b..7e5a5c78aa6b5 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -24,14 +24,6 @@ namespace clang {
 namespace targets {
 
 static constexpr Builtin::Info BuiltinInfoX86[] = {
-#define BUILTIN(ID, TYPE, ATTRS)                                               \
-  {#ID, TYPE, ATTRS, nullptr, HeaderDesc::NO_HEADER, ALL_LANGUAGES},
-#define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE)                               \
-  {#ID, TYPE, ATTRS, FEATURE, HeaderDesc::NO_HEADER, ALL_LANGUAGES},
-#define TARGET_HEADER_BUILTIN(ID, TYPE, ATTRS, HEADER, LANGS, FEATURE)         \
-  {#ID, TYPE, ATTRS, FEATURE, HeaderDesc::HEADER, LANGS},
-#include "clang/Basic/BuiltinsX86.def"
-
 #define BUILTIN(ID, TYPE, ATTRS)                                               \
   {#ID, TYPE, ATTRS, nullptr, HeaderDesc::NO_HEADER, ALL_LANGUAGES},
 #define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE)                               \
diff --git a/clang/utils/TableGen/ClangBuiltinsEmitter.cpp b/clang/utils/TableGen/ClangBuiltinsEmitter.cpp
index 6c3604adc92b9..94cc218376002 100644
--- a/clang/utils/TableGen/ClangBuiltinsEmitter.cpp
+++ b/clang/utils/TableGen/ClangBuiltinsEmitter.cpp
@@ -25,12 +25,14 @@ enum class BuiltinType {
   LibBuiltin,
   LangBuiltin,
   TargetBuiltin,
+  TargetLibBuiltin,
 };
 
 class PrototypeParser {
 public:
   PrototypeParser(StringRef Substitution, const Record *Builtin)
-      : Loc(Builtin->getFieldLoc("Prototype")), Substitution(Substitution) {
+      : Loc(Builtin->getFieldLoc("Prototype")), Substitution(Substitution),
+        EnableOpenCLLong(Builtin->getValueAsBit("EnableOpenCLLong")) {
     ParsePrototype(Builtin->getValueAsString("Prototype"));
   }
 
@@ -108,9 +110,15 @@ class PrototypeParser {
     } else if (T.consume_back("&")) {
       ParseType(T);
       Type += "&";
+    } else if (EnableOpenCLLong && T.consume_front("long long")) {
+      Type += "O";
+      ParseType(T);
     } else if (T.consume_front("long")) {
       Type += "L";
       ParseType(T);
+    } else if (T.consume_front("signed")) {
+      Type += "S";
+      ParseType(T);
     } else if (T.consume_front("unsigned")) {
       Type += "U";
       ParseType(T);
@@ -155,6 +163,7 @@ class PrototypeParser {
                                .Case("__fp16", "h")
                                .Case("__int128_t", "LLLi")
                                .Case("_Float16", "x")
+                               .Case("__bf16", "y")
                                .Case("bool", "b")
                                .Case("char", "c")
                                .Case("constant_CFString", "F")
@@ -194,6 +203,7 @@ class PrototypeParser {
 private:
   SMLoc Loc;
   StringRef Substitution;
+  bool EnableOpenCLLong;
   std::string Type;
 };
 
@@ -262,6 +272,9 @@ void EmitBuiltinDef(raw_ostream &OS, StringRef Substitution,
   case BuiltinType::TargetBuiltin:
     OS << "TARGET_BUILTIN";
     break;
+  case BuiltinType::TargetLibBuiltin:
+    OS << "TARGET_HEADER_BUILTIN";
+    break;
   }
 
   OS << "(" << Spelling;
@@ -279,6 +292,12 @@ void EmitBuiltinDef(raw_ostream &OS, StringRef Substitution,
     OS << ", " << Builtin->getValueAsString("Languages");
     break;
   }
+  case BuiltinType::TargetLibBuiltin: {
+    OS << ", ";
+    HeaderNameParser{Builtin}.Print(OS);
+    OS << ", " << Builtin->getValueAsString("Languages");
+    [[fallthrough]];
+  }
   case BuiltinType::TargetBuiltin:
     OS << ", \"" << Builtin->getValueAsString("Features") << "\"";
     break;
@@ -331,6 +350,8 @@ void EmitBuiltin(raw_ostream &OS, const Record *Builtin) {
         BT = BuiltinType::AtomicBuiltin;
       } else if (Builtin->isSubClassOf("LangBuiltin")) {
         BT = BuiltinType::LangBuiltin;
+      } else if (Builtin->isSubClassOf("TargetLibBuiltin")) {
+        BT = BuiltinType::TargetLibBuiltin;
       } else if (Builtin->isSubClassOf("TargetBuiltin")) {
         BT = BuiltinType::TargetBuiltin;
       } else if (Builtin->isSubClassOf("LibBuiltin")) {
@@ -367,6 +388,10 @@ void clang::EmitClangBuiltins(const RecordKeeper &Records, raw_ostream &OS) {
 #if defined(BUILTIN) && !defined(TARGET_BUILTIN)
 #  define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) BUILTIN(ID, TYPE, ATTRS)
 #endif
+
+#if defined(BUILTIN) && !defined(TARGET_HEADER_BUILTIN)
+#  define TARGET_HEADER_BUILTIN(ID, TYPE, ATTRS, HEADER, LANG, FEATURE) BUILTIN(ID, TYPE, ATTRS)
+#endif
 )c++";
 
   // AtomicBuiltins are order dependent
@@ -390,5 +415,6 @@ void clang::EmitClangBuiltins(const RecordKeeper &Records, raw_ostream &OS) {
 #undef LIBBUILTIN
 #undef LANGBUILTIN
 #undef TARGET_BUILTIN
+#undef TARGET_HEADER_BUILTIN
 )c++";
 }

From 95db1116c5718004e0bd7c3b79d39987fdbbff32 Mon Sep 17 00:00:00 2001
From: David CARLIER 
Date: Sat, 4 Jan 2025 10:52:41 +0000
Subject: [PATCH 424/567] [compiler-rt][rtsan] intercept setbuf, setvbuf,
 setlinebuf and setbuffer (#121616)

---
 .../lib/rtsan/rtsan_interceptors_posix.cpp    | 35 +++++++++++++
 .../tests/rtsan_test_interceptors_posix.cpp   | 50 +++++++++++++++++++
 2 files changed, 85 insertions(+)

diff --git a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp
index 9f89ab6bf1fc7..227d077290af7 100644
--- a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp
+++ b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp
@@ -325,6 +325,37 @@ INTERCEPTOR(FILE *, fmemopen, void *buf, size_t size, const char *mode) {
 #define RTSAN_MAYBE_INTERCEPT_FMEMOPEN
 #endif
 
+#if SANITIZER_INTERCEPT_SETVBUF
+INTERCEPTOR(void, setbuf, FILE *stream, char *buf) {
+  __rtsan_notify_intercepted_call("setbuf");
+  return REAL(setbuf)(stream, buf);
+}
+
+INTERCEPTOR(void, setbuffer, FILE *stream, char *buf, size_t size) {
+  __rtsan_notify_intercepted_call("setbuffer");
+  return REAL(setbuffer)(stream, buf, size);
+}
+
+INTERCEPTOR(void, setlinebuf, FILE *stream) {
+  __rtsan_notify_intercepted_call("setlinebuf");
+  return REAL(setlinebuf)(stream);
+}
+
+INTERCEPTOR(int, setvbuf, FILE *stream, char *buf, int mode, size_t size) {
+  __rtsan_notify_intercepted_call("setvbuf");
+  return REAL(setvbuf)(stream, buf, mode, size);
+}
+#define RTSAN_MAYBE_INTERCEPT_SETBUF INTERCEPT_FUNCTION(setbuf)
+#define RTSAN_MAYBE_INTERCEPT_SETBUFFER INTERCEPT_FUNCTION(setbuffer)
+#define RTSAN_MAYBE_INTERCEPT_SETLINEBUF INTERCEPT_FUNCTION(setlinebuf)
+#define RTSAN_MAYBE_INTERCEPT_SETVBUF INTERCEPT_FUNCTION(setvbuf)
+#else
+#define RTSAN_MAYBE_INTERCEPT_SETBUF
+#define RTSAN_MAYBE_INTERCEPT_SETBUFFER
+#define RTSAN_MAYBE_INTERCEPT_SETLINEBUF
+#define RTSAN_MAYBE_INTERCEPT_SETVBUF
+#endif
+
 INTERCEPTOR(int, puts, const char *s) {
   __rtsan_notify_intercepted_call("puts");
   return REAL(puts)(s);
@@ -986,6 +1017,10 @@ void __rtsan::InitializeInterceptors() {
   RTSAN_MAYBE_INTERCEPT_FOPENCOOKIE;
   RTSAN_MAYBE_INTERCEPT_OPEN_MEMSTREAM;
   RTSAN_MAYBE_INTERCEPT_FMEMOPEN;
+  RTSAN_MAYBE_INTERCEPT_SETBUF;
+  RTSAN_MAYBE_INTERCEPT_SETBUFFER;
+  RTSAN_MAYBE_INTERCEPT_SETLINEBUF;
+  RTSAN_MAYBE_INTERCEPT_SETVBUF;
   INTERCEPT_FUNCTION(lseek);
   RTSAN_MAYBE_INTERCEPT_LSEEK64;
   INTERCEPT_FUNCTION(dup);
diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp
index 5adbf0fb63de8..2947510b2cfde 100644
--- a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp
+++ b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp
@@ -403,6 +403,56 @@ TEST_F(RtsanFileTest, FmemOpenDiesWhenRealtime) {
 }
 #endif
 
+#if SANITIZER_INTERCEPT_SETVBUF
+TEST_F(RtsanFileTest, SetbufDieWhenRealtime) {
+  char buffer[BUFSIZ];
+  FILE *f = fopen(GetTemporaryFilePath(), "w");
+  EXPECT_THAT(f, Ne(nullptr));
+
+  auto Func = [&f, &buffer]() { setbuf(f, buffer); };
+
+  ExpectRealtimeDeath(Func, "setbuf");
+  ExpectNonRealtimeSurvival(Func);
+}
+
+TEST_F(RtsanFileTest, SetbufferDieWhenRealtime) {
+  char buffer[1024];
+  size_t size = sizeof(buffer);
+  FILE *f = fopen(GetTemporaryFilePath(), "w");
+  EXPECT_THAT(f, Ne(nullptr));
+
+  auto Func = [&f, &buffer, &size]() { setbuffer(f, buffer, size); };
+
+  ExpectRealtimeDeath(Func, "setbuffer");
+  ExpectNonRealtimeSurvival(Func);
+}
+
+TEST_F(RtsanFileTest, SetvbufDieWhenRealtime) {
+  char buffer[1024];
+  size_t size = sizeof(buffer);
+  FILE *f = fopen(GetTemporaryFilePath(), "w");
+  EXPECT_THAT(f, Ne(nullptr));
+
+  auto Func = [&f, &buffer, &size]() {
+    int r = setvbuf(f, buffer, _IOFBF, size);
+    EXPECT_THAT(r, Eq(0));
+  };
+
+  ExpectRealtimeDeath(Func, "setvbuf");
+  ExpectNonRealtimeSurvival(Func);
+}
+
+TEST_F(RtsanFileTest, SetlinebufDieWhenRealtime) {
+  FILE *f = fopen(GetTemporaryFilePath(), "w");
+  EXPECT_THAT(f, Ne(nullptr));
+
+  auto Func = [&f]() { setlinebuf(f); };
+
+  ExpectRealtimeDeath(Func, "setlinebuf");
+  ExpectNonRealtimeSurvival(Func);
+}
+#endif
+
 class RtsanOpenedFileTest : public RtsanFileTest {
 protected:
   void SetUp() override {

From c7fa3cf620f62d87dc7753f5d341ae3f63da87f4 Mon Sep 17 00:00:00 2001
From: David CARLIER 
Date: Sat, 4 Jan 2025 11:35:31 +0000
Subject: [PATCH 425/567] =?UTF-8?q?Revert=20"[compiler-rt][rtsan]=20interc?=
 =?UTF-8?q?ept=20setbuf,=20setvbuf,=20setlinebuf=20an=E2=80=A6=20(#121639)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…d setbuffer (#121616)"

This reverts commit 95db1116c5718004e0bd7c3b79d39987fdbbff32.
---
 .../lib/rtsan/rtsan_interceptors_posix.cpp    | 35 -------------
 .../tests/rtsan_test_interceptors_posix.cpp   | 50 -------------------
 2 files changed, 85 deletions(-)

diff --git a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp
index 227d077290af7..9f89ab6bf1fc7 100644
--- a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp
+++ b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp
@@ -325,37 +325,6 @@ INTERCEPTOR(FILE *, fmemopen, void *buf, size_t size, const char *mode) {
 #define RTSAN_MAYBE_INTERCEPT_FMEMOPEN
 #endif
 
-#if SANITIZER_INTERCEPT_SETVBUF
-INTERCEPTOR(void, setbuf, FILE *stream, char *buf) {
-  __rtsan_notify_intercepted_call("setbuf");
-  return REAL(setbuf)(stream, buf);
-}
-
-INTERCEPTOR(void, setbuffer, FILE *stream, char *buf, size_t size) {
-  __rtsan_notify_intercepted_call("setbuffer");
-  return REAL(setbuffer)(stream, buf, size);
-}
-
-INTERCEPTOR(void, setlinebuf, FILE *stream) {
-  __rtsan_notify_intercepted_call("setlinebuf");
-  return REAL(setlinebuf)(stream);
-}
-
-INTERCEPTOR(int, setvbuf, FILE *stream, char *buf, int mode, size_t size) {
-  __rtsan_notify_intercepted_call("setvbuf");
-  return REAL(setvbuf)(stream, buf, mode, size);
-}
-#define RTSAN_MAYBE_INTERCEPT_SETBUF INTERCEPT_FUNCTION(setbuf)
-#define RTSAN_MAYBE_INTERCEPT_SETBUFFER INTERCEPT_FUNCTION(setbuffer)
-#define RTSAN_MAYBE_INTERCEPT_SETLINEBUF INTERCEPT_FUNCTION(setlinebuf)
-#define RTSAN_MAYBE_INTERCEPT_SETVBUF INTERCEPT_FUNCTION(setvbuf)
-#else
-#define RTSAN_MAYBE_INTERCEPT_SETBUF
-#define RTSAN_MAYBE_INTERCEPT_SETBUFFER
-#define RTSAN_MAYBE_INTERCEPT_SETLINEBUF
-#define RTSAN_MAYBE_INTERCEPT_SETVBUF
-#endif
-
 INTERCEPTOR(int, puts, const char *s) {
   __rtsan_notify_intercepted_call("puts");
   return REAL(puts)(s);
@@ -1017,10 +986,6 @@ void __rtsan::InitializeInterceptors() {
   RTSAN_MAYBE_INTERCEPT_FOPENCOOKIE;
   RTSAN_MAYBE_INTERCEPT_OPEN_MEMSTREAM;
   RTSAN_MAYBE_INTERCEPT_FMEMOPEN;
-  RTSAN_MAYBE_INTERCEPT_SETBUF;
-  RTSAN_MAYBE_INTERCEPT_SETBUFFER;
-  RTSAN_MAYBE_INTERCEPT_SETLINEBUF;
-  RTSAN_MAYBE_INTERCEPT_SETVBUF;
   INTERCEPT_FUNCTION(lseek);
   RTSAN_MAYBE_INTERCEPT_LSEEK64;
   INTERCEPT_FUNCTION(dup);
diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp
index 2947510b2cfde..5adbf0fb63de8 100644
--- a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp
+++ b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp
@@ -403,56 +403,6 @@ TEST_F(RtsanFileTest, FmemOpenDiesWhenRealtime) {
 }
 #endif
 
-#if SANITIZER_INTERCEPT_SETVBUF
-TEST_F(RtsanFileTest, SetbufDieWhenRealtime) {
-  char buffer[BUFSIZ];
-  FILE *f = fopen(GetTemporaryFilePath(), "w");
-  EXPECT_THAT(f, Ne(nullptr));
-
-  auto Func = [&f, &buffer]() { setbuf(f, buffer); };
-
-  ExpectRealtimeDeath(Func, "setbuf");
-  ExpectNonRealtimeSurvival(Func);
-}
-
-TEST_F(RtsanFileTest, SetbufferDieWhenRealtime) {
-  char buffer[1024];
-  size_t size = sizeof(buffer);
-  FILE *f = fopen(GetTemporaryFilePath(), "w");
-  EXPECT_THAT(f, Ne(nullptr));
-
-  auto Func = [&f, &buffer, &size]() { setbuffer(f, buffer, size); };
-
-  ExpectRealtimeDeath(Func, "setbuffer");
-  ExpectNonRealtimeSurvival(Func);
-}
-
-TEST_F(RtsanFileTest, SetvbufDieWhenRealtime) {
-  char buffer[1024];
-  size_t size = sizeof(buffer);
-  FILE *f = fopen(GetTemporaryFilePath(), "w");
-  EXPECT_THAT(f, Ne(nullptr));
-
-  auto Func = [&f, &buffer, &size]() {
-    int r = setvbuf(f, buffer, _IOFBF, size);
-    EXPECT_THAT(r, Eq(0));
-  };
-
-  ExpectRealtimeDeath(Func, "setvbuf");
-  ExpectNonRealtimeSurvival(Func);
-}
-
-TEST_F(RtsanFileTest, SetlinebufDieWhenRealtime) {
-  FILE *f = fopen(GetTemporaryFilePath(), "w");
-  EXPECT_THAT(f, Ne(nullptr));
-
-  auto Func = [&f]() { setlinebuf(f); };
-
-  ExpectRealtimeDeath(Func, "setlinebuf");
-  ExpectNonRealtimeSurvival(Func);
-}
-#endif
-
 class RtsanOpenedFileTest : public RtsanFileTest {
 protected:
   void SetUp() override {

From 914fd916d5456e15cf9baaf617edaac6b7334d09 Mon Sep 17 00:00:00 2001
From: Nikolas Klauser 
Date: Sat, 4 Jan 2025 14:49:22 +0100
Subject: [PATCH 426/567] [libc++][NFC] Simplify basic_ostream by combining
 operator<<(Arithmetic) (#121011)

The bodies of all the `operator<<` for arithmetic types have very
similar or even identical bodies. This introduces two new functions to
avoid all the duplication.
---
 libcxx/include/__ostream/basic_ostream.h | 284 +++++------------------
 1 file changed, 62 insertions(+), 222 deletions(-)

diff --git a/libcxx/include/__ostream/basic_ostream.h b/libcxx/include/__ostream/basic_ostream.h
index cf4d26167aebd..97226476e5ef0 100644
--- a/libcxx/include/__ostream/basic_ostream.h
+++ b/libcxx/include/__ostream/basic_ostream.h
@@ -88,6 +88,55 @@ class _LIBCPP_TEMPLATE_VIS basic_ostream : virtual public basic_ios<_CharT, _Tra
     return *this;
   }
 
+  template 
+  _LIBCPP_HIDE_FROM_ABI basic_ostream& __put_num(_Tp __value) {
+#  if _LIBCPP_HAS_EXCEPTIONS
+    try {
+#  endif // _LIBCPP_HAS_EXCEPTIONS
+      sentry __s(*this);
+      if (__s) {
+        using _Fp          = num_put >;
+        const _Fp& __facet = std::use_facet<_Fp>(this->getloc());
+        if (__facet.put(*this, *this, this->fill(), __value).failed())
+          this->setstate(ios_base::badbit | ios_base::failbit);
+      }
+#  if _LIBCPP_HAS_EXCEPTIONS
+    } catch (...) {
+      this->__set_badbit_and_consider_rethrow();
+    }
+#  endif // _LIBCPP_HAS_EXCEPTIONS
+    return *this;
+  }
+
+  template 
+  _LIBCPP_HIDE_FROM_ABI basic_ostream& __put_num_integer_promote(_Tp __value) {
+#  if _LIBCPP_HAS_EXCEPTIONS
+    try {
+#  endif // _LIBCPP_HAS_EXCEPTIONS
+      sentry __s(*this);
+      if (__s) {
+        ios_base::fmtflags __flags = ios_base::flags() & ios_base::basefield;
+
+        using _Fp          = num_put >;
+        const _Fp& __facet = std::use_facet<_Fp>(this->getloc());
+        if (__facet
+                .put(*this,
+                     *this,
+                     this->fill(),
+                     __flags == ios_base::oct || __flags == ios_base::hex
+                         ? static_cast<__copy_unsigned_t<_Tp, long> >(std::__to_unsigned_like(__value))
+                         : static_cast<__copy_unsigned_t<_Tp, long> >(__value))
+                .failed())
+          this->setstate(ios_base::badbit | ios_base::failbit);
+      }
+#  if _LIBCPP_HAS_EXCEPTIONS
+    } catch (...) {
+      this->__set_badbit_and_consider_rethrow();
+    }
+#  endif // _LIBCPP_HAS_EXCEPTIONS
+    return *this;
+  }
+
   basic_ostream& operator<<(bool __n);
   basic_ostream& operator<<(short __n);
   basic_ostream& operator<<(unsigned short __n);
@@ -225,276 +274,67 @@ basic_ostream<_CharT, _Traits>::operator<<(basic_streambuf
 basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(bool __n) {
-#  if _LIBCPP_HAS_EXCEPTIONS
-  try {
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), __n).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#  if _LIBCPP_HAS_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-  return *this;
+  return __put_num(__n);
 }
 
 template 
 basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(short __n) {
-#  if _LIBCPP_HAS_EXCEPTIONS
-  try {
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      ios_base::fmtflags __flags = ios_base::flags() & ios_base::basefield;
-      typedef num_put > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this,
-                  *this,
-                  this->fill(),
-                  __flags == ios_base::oct || __flags == ios_base::hex
-                      ? static_cast(static_cast(__n))
-                      : static_cast(__n))
-              .failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#  if _LIBCPP_HAS_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-  return *this;
+  return __put_num_integer_promote(__n);
 }
 
 template 
 basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(unsigned short __n) {
-#  if _LIBCPP_HAS_EXCEPTIONS
-  try {
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), static_cast(__n)).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#  if _LIBCPP_HAS_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-  return *this;
+  return __put_num_integer_promote(__n);
 }
 
 template 
 basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(int __n) {
-#  if _LIBCPP_HAS_EXCEPTIONS
-  try {
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      ios_base::fmtflags __flags = ios_base::flags() & ios_base::basefield;
-      typedef num_put > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this,
-                  *this,
-                  this->fill(),
-                  __flags == ios_base::oct || __flags == ios_base::hex
-                      ? static_cast(static_cast(__n))
-                      : static_cast(__n))
-              .failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#  if _LIBCPP_HAS_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-  return *this;
+  return __put_num_integer_promote(__n);
 }
 
 template 
 basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(unsigned int __n) {
-#  if _LIBCPP_HAS_EXCEPTIONS
-  try {
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), static_cast(__n)).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#  if _LIBCPP_HAS_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-  return *this;
+  return __put_num_integer_promote(__n);
 }
 
 template 
 basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(long __n) {
-#  if _LIBCPP_HAS_EXCEPTIONS
-  try {
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), __n).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#  if _LIBCPP_HAS_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-  return *this;
+  return __put_num(__n);
 }
 
 template 
 basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(unsigned long __n) {
-#  if _LIBCPP_HAS_EXCEPTIONS
-  try {
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), __n).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#  if _LIBCPP_HAS_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-  return *this;
+  return __put_num(__n);
 }
 
 template 
 basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(long long __n) {
-#  if _LIBCPP_HAS_EXCEPTIONS
-  try {
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), __n).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#  if _LIBCPP_HAS_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-  return *this;
+  return __put_num(__n);
 }
 
 template 
 basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(unsigned long long __n) {
-#  if _LIBCPP_HAS_EXCEPTIONS
-  try {
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), __n).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#  if _LIBCPP_HAS_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-  return *this;
+  return __put_num(__n);
 }
 
 template 
 basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(float __n) {
-#  if _LIBCPP_HAS_EXCEPTIONS
-  try {
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), static_cast(__n)).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#  if _LIBCPP_HAS_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-  return *this;
+  return *this << static_cast(__n);
 }
 
 template 
 basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(double __n) {
-#  if _LIBCPP_HAS_EXCEPTIONS
-  try {
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), __n).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#  if _LIBCPP_HAS_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-  return *this;
+  return __put_num(__n);
 }
 
 template 
 basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(long double __n) {
-#  if _LIBCPP_HAS_EXCEPTIONS
-  try {
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), __n).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#  if _LIBCPP_HAS_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-  return *this;
+  return __put_num(__n);
 }
 
 template 
 basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(const void* __n) {
-#  if _LIBCPP_HAS_EXCEPTIONS
-  try {
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), __n).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#  if _LIBCPP_HAS_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-  return *this;
+  return __put_num(__n);
 }
 
 template 

From c9d61cde2ba3521c7604c8ee0c3e1ba4dfc4d406 Mon Sep 17 00:00:00 2001
From: Matthias Springer 
Date: Sat, 4 Jan 2025 15:16:35 +0100
Subject: [PATCH 427/567] [mlir][Transforms][NFC] Delete unused
 `nTo1TempMaterializations` (#121647)

`nTo1TempMaterializations` is no longer used since the conversion value
mapping supports 1:N mappings.
---
 mlir/lib/Transforms/Utils/DialectConversion.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp
index 1e689cd96ae71..0e577d2d39de3 100644
--- a/mlir/lib/Transforms/Utils/DialectConversion.cpp
+++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp
@@ -1040,10 +1040,6 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener {
   DenseMap
       unresolvedMaterializations;
 
-  /// A set of all N:1 materializations that were added to work around
-  /// incomplete 1:N support in the dialect conversion driver.
-  DenseSet nTo1TempMaterializations;
-
   /// The current type converter, or nullptr if no type converter is currently
   /// active.
   const TypeConverter *currentTypeConverter = nullptr;
@@ -1180,7 +1176,6 @@ void UnresolvedMaterializationRewrite::rollback() {
   if (!mappedValues.empty())
     rewriterImpl.mapping.erase(mappedValues);
   rewriterImpl.unresolvedMaterializations.erase(getOperation());
-  rewriterImpl.nTo1TempMaterializations.erase(getOperation());
   op->erase();
 }
 

From 47ac7fa8619c1f1e29ee4aafded2ae990ffa319e Mon Sep 17 00:00:00 2001
From: Florian Hahn 
Date: Sat, 4 Jan 2025 14:50:04 +0000
Subject: [PATCH 428/567] [LV] Add tests with wide inductions and live-in step.

Also regenerate check lines and simplify existing tests and names.
---
 .../LoopVectorize/induction-step.ll           | 513 +++++++++++++-----
 1 file changed, 387 insertions(+), 126 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/induction-step.ll b/llvm/test/Transforms/LoopVectorize/induction-step.ll
index ecb00d4724488..f553864b5fc38 100644
--- a/llvm/test/Transforms/LoopVectorize/induction-step.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction-step.ll
@@ -1,21 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=8 -S | FileCheck %s
 
 ; int int_inc;
 ;
-;int induction_with_global(int init, int *restrict A, int N) {
+;void induction_with_global(int init, int *restrict A, int N) {
 ;  int x = init;
 ;  for (int i=0;i poison, i32 %init, i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP5]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i32 [[DOTCAST]], [[TMP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[INIT]], [[TMP3]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> poison, i32 [[INIT]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <8 x i32> poison, i32 [[TMP0]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT3:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT2]], <8 x i32> poison, <8 x i32> zeroinitializer
@@ -24,53 +42,56 @@
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP0]], 8
 ; CHECK-NEXT:    [[DOTSPLATINSERT5:%.*]] = insertelement <8 x i32> poison, i32 [[TMP7]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT6:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT5]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    br label %vector.body
-; CHECK:       vector.body:
-; CHECK-NEXT:    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; CHECK-NEXT:    %vec.ind = phi <8 x i32> [ [[INDUCTION4]], %vector.ph ], [ %vec.ind.next, %vector.body ]
-; CHECK:         [[TMP8:%.*]] = add i64 %index, 0
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP8]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ [[INDUCTION4]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; CHECK-NEXT:    store <8 x i32> %vec.ind, ptr [[TMP10]], align 4
-; CHECK:         %index.next = add nuw i64 %index, 8
-; CHECK-NEXT:    %vec.ind.next = add <8 x i32> %vec.ind, [[DOTSPLAT6]]
-; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-
-@int_inc = common global i32 0, align 4
-
-define i32 @induction_with_global(i32 %init, ptr noalias nocapture %A, i32 %N) {
+; CHECK-NEXT:    store <8 x i32> [[VEC_IND]], ptr [[TMP10]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], [[DOTSPLAT6]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[TMP4]], %[[MIDDLE_BLOCK]] ], [ [[INIT]], %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[X_05:%.*]] = phi i32 [ [[BC_RESUME_VAL5]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 [[X_05]], ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ADD]] = add nsw i32 [[TMP0]], [[X_05]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
-  %cmp4 = icmp sgt i32 %N, 0
-  br i1 %cmp4, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
   %0 = load i32, ptr @int_inc, align 4
-  %1 = mul i32 %0, %N
   br label %for.body
 
-for.body:                                         ; preds = %for.body, %for.body.lr.ph
-  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
-  %x.05 = phi i32 [ %init, %for.body.lr.ph ], [ %add, %for.body ]
-  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+for.body:                                         ; preds = %for.body, %entry
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %x.05 = phi i32 [ %init, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %iv
   store i32 %x.05, ptr %arrayidx, align 4
   %add = add nsw i32 %0, %x.05
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %N
-  br i1 %exitcond, label %for.end.loopexit, label %for.body
-
-for.end.loopexit:                                 ; preds = %for.body
-  %2 = add i32 %1, %init
-  br label %for.end
-
-for.end:                                          ; preds = %for.end.loopexit, %entry
-  %x.0.lcssa = phi i32 [ %init, %entry ], [ %2, %for.end.loopexit ]
-  ret i32 %x.0.lcssa
-}
+  %iv.next = add nuw nsw i64 %iv, 1
+  %iv.next.trunc = trunc i64 %iv.next to i32
+  %exitcond = icmp eq i32 %iv.next.trunc, %N
+  br i1 %exitcond, label %exit, label %for.body
 
+exit:
+  ret void
+}
 
 ;int induction_with_loop_inv(int init, int *restrict A, int N, int M) {
 ;  int x = init;
@@ -83,82 +104,123 @@ for.end:                                          ; preds = %for.end.loopexit, %
 ;  return x;
 ;}
 
-; CHECK-LABEL: @induction_with_loop_inv(
-; CHECK:       vector.ph:
-; CHECK:         [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> poison, i32 %x.011, i64 0
+define i32 @induction_with_loop_inv(i32 %init, ptr noalias nocapture %A, i32 %N, i32 %M) {
+; CHECK-LABEL: define i32 @induction_with_loop_inv(
+; CHECK-SAME: i32 [[INIT:%.*]], ptr noalias nocapture [[A:%.*]], i32 [[N:%.*]], i32 [[M:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP3]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = add nuw nsw i64 [[TMP11]], 1
+; CHECK-NEXT:    br label %[[OUTER_HEADER:.*]]
+; CHECK:       [[OUTER_HEADER]]:
+; CHECK-NEXT:    [[INDVARS_IV15:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT16:%.*]], %[[OUTER_LATCH:.*]] ]
+; CHECK-NEXT:    [[J_012:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC5:%.*]], %[[OUTER_LATCH]] ]
+; CHECK-NEXT:    [[X_011:%.*]] = phi i32 [ [[INIT]], %[[ENTRY]] ], [ [[X_0_LCSSA:%.*]], %[[OUTER_LATCH]] ]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[DOTCAST]], [[J_012]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[X_011]], [[TMP1]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> poison, i32 [[X_011]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <8 x i32> poison, i32 %j.012, i64 0
+; CHECK-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <8 x i32> poison, i32 [[J_012]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT3:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT2]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul <8 x i32> , [[DOTSPLAT3]]
 ; CHECK-NEXT:    [[INDUCTION4:%.*]] = add <8 x i32> [[DOTSPLAT]], [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 %j.012, 8
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[J_012]], 8
 ; CHECK-NEXT:    [[DOTSPLATINSERT5:%.*]] = insertelement <8 x i32> poison, i32 [[TMP5]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT6:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT5]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    br label %vector.body
-; CHECK:       vector.body:
-; CHECK-NEXT:    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; CHECK-NEXT:    %vec.ind = phi <8 x i32> [ [[INDUCTION4]], %vector.ph ], [ %vec.ind.next, %vector.body ]
-; CHECK:         [[TMP6:%.*]] = add i64 %index, 0
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ [[INDUCTION4]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; CHECK-NEXT:    store <8 x i32> %vec.ind, ptr [[TMP8]], align 4
-; CHECK:         %index.next = add nuw i64 %index, 8
-; CHECK-NEXT:    %vec.ind.next = add <8 x i32> %vec.ind, [[DOTSPLAT6]]
-; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
-
-define i32 @induction_with_loop_inv(i32 %init, ptr noalias nocapture %A, i32 %N, i32 %M) {
+; CHECK-NEXT:    store <8 x i32> [[VEC_IND]], ptr [[TMP8]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], [[DOTSPLAT6]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[INNER_EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_HEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[X_011]], %[[OUTER_HEADER]] ]
+; CHECK-NEXT:    br label %[[INNER:.*]]
+; CHECK:       [[INNER]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[INNER]] ]
+; CHECK-NEXT:    [[X_18:%.*]] = phi i32 [ [[BC_RESUME_VAL5]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[INNER]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 [[X_18]], ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ADD]] = add nsw i32 [[X_18]], [[J_012]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[INNER_EXIT]], label %[[INNER]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       [[INNER_EXIT]]:
+; CHECK-NEXT:    [[TMP9:%.*]] = add i32 [[X_011]], [[INDVARS_IV15]]
+; CHECK-NEXT:    br label %[[OUTER_LATCH]]
+; CHECK:       [[OUTER_LATCH]]:
+; CHECK-NEXT:    [[X_0_LCSSA]] = phi i32 [ [[TMP9]], %[[INNER_EXIT]] ]
+; CHECK-NEXT:    [[INC5]] = add nuw nsw i32 [[J_012]], 1
+; CHECK-NEXT:    [[INDVARS_IV_NEXT16]] = add i32 [[INDVARS_IV15]], [[N]]
+; CHECK-NEXT:    [[EXITCOND17:%.*]] = icmp eq i32 [[INC5]], [[M]]
+; CHECK-NEXT:    br i1 [[EXITCOND17]], label %[[EXIT:.*]], label %[[OUTER_HEADER]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret i32 [[X_0_LCSSA]]
+;
 entry:
-  %cmp10 = icmp sgt i32 %M, 0
-  br i1 %cmp10, label %for.cond1.preheader.lr.ph, label %for.end6
-
-for.cond1.preheader.lr.ph:                        ; preds = %entry
-  %cmp27 = icmp sgt i32 %N, 0
-  br label %for.cond1.preheader
-
-for.cond1.preheader:                              ; preds = %for.inc4, %for.cond1.preheader.lr.ph
-  %indvars.iv15 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next16, %for.inc4 ]
-  %j.012 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %inc5, %for.inc4 ]
-  %x.011 = phi i32 [ %init, %for.cond1.preheader.lr.ph ], [ %x.1.lcssa, %for.inc4 ]
-  br i1 %cmp27, label %for.body3.preheader, label %for.inc4
-
-for.body3.preheader:                              ; preds = %for.cond1.preheader
-  br label %for.body3
-
-for.body3:                                        ; preds = %for.body3.preheader, %for.body3
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 0, %for.body3.preheader ]
-  %x.18 = phi i32 [ %add, %for.body3 ], [ %x.011, %for.body3.preheader ]
-  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+  br label %outer.header
+
+outer.header:
+  %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv.next, %outer.latch ]
+  %j.012 = phi i32 [ 0, %entry ], [ %inc5, %outer.latch ]
+  %x.011 = phi i32 [ %init, %entry ], [ %x.1.lcssa, %outer.latch ]
+  br label %inner
+
+inner:
+  %iv = phi i64 [ 0, %outer.header ], [ %iv.next, %inner ]
+  %x.18 = phi i32 [ %x.011, %outer.header ], [ %add, %inner ]
+  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %iv
   store i32 %x.18, ptr %arrayidx, align 4
   %add = add nsw i32 %x.18, %j.012
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %N
-  br i1 %exitcond, label %for.inc4.loopexit, label %for.body3
+  %iv.next = add nuw nsw i64 %iv, 1
+  %iv.next.trunc = trunc i64 %iv.next to i32
+  %inner.ec = icmp eq i32 %iv.next.trunc, %N
+  br i1 %inner.ec, label %inner.exit, label %inner
 
-for.inc4.loopexit:                                ; preds = %for.body3
-  %0 = add i32 %x.011, %indvars.iv15
-  br label %for.inc4
+inner.exit:
+  %add.ivs  = add i32 %x.011, %outer.iv
+  br label %outer.latch
 
-for.inc4:                                         ; preds = %for.inc4.loopexit, %for.cond1.preheader
-  %x.1.lcssa = phi i32 [ %x.011, %for.cond1.preheader ], [ %0, %for.inc4.loopexit ]
+outer.latch:
+  %x.1.lcssa = phi i32 [ %add.ivs, %inner.exit ]
   %inc5 = add nuw nsw i32 %j.012, 1
-  %indvars.iv.next16 = add i32 %indvars.iv15, %N
-  %exitcond17 = icmp eq i32 %inc5, %M
-  br i1 %exitcond17, label %for.end6.loopexit, label %for.cond1.preheader
-
-for.end6.loopexit:                                ; preds = %for.inc4
-  %x.1.lcssa.lcssa = phi i32 [ %x.1.lcssa, %for.inc4 ]
-  br label %for.end6
+  %outer.iv.next = add i32 %outer.iv, %N
+  %outer.ec = icmp eq i32 %inc5, %M
+  br i1 %outer.ec, label %exit, label %outer.header
 
-for.end6:                                         ; preds = %for.end6.loopexit, %entry
-  %x.0.lcssa = phi i32 [ %init, %entry ], [ %x.1.lcssa.lcssa, %for.end6.loopexit ]
-  ret i32 %x.0.lcssa
+exit:
+  ret i32 %x.1.lcssa
 }
 
-
-; CHECK-LABEL: @non_primary_iv_loop_inv_trunc(
-; CHECK:       vector.ph:
-; CHECK:         [[TMP3:%.*]] = trunc i64 %step to i32
+define void @non_primary_iv_loop_inv_trunc(ptr %a, i64 %n, i64 %step) {
+; CHECK-LABEL: define void @non_primary_iv_loop_inv_trunc(
+; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i64 [[STEP:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[SMAX]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[SMAX]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 [[N_VEC]], [[STEP]]
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[STEP]] to i32
 ; CHECK-NEXT:    [[DOTSPLATINSERT5:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT6:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT5]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul <8 x i32> , [[DOTSPLAT6]]
@@ -166,19 +228,38 @@ for.end6:                                         ; preds = %for.end6.loopexit,
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP3]], 8
 ; CHECK-NEXT:    [[DOTSPLATINSERT8:%.*]] = insertelement <8 x i32> poison, i32 [[TMP5]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT9:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT8]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    br label %vector.body
-; CHECK:       vector.body:
-; CHECK-NEXT:    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; CHECK:         [[VEC_IND10:%.*]] = phi <8 x i32> [ [[INDUCTION7]], %vector.ph ], [ [[VEC_IND_NEXT11:%.*]], %vector.body ]
-; CHECK:         [[TMP6:%.*]] = add i64 %index, 0
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND10:%.*]] = phi <8 x i32> [ [[INDUCTION7]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT11:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
 ; CHECK-NEXT:    store <8 x i32> [[VEC_IND10]], ptr [[TMP8]], align 4
-; CHECK-NEXT:    %index.next = add nuw i64 %index, 8
-; CHECK:         [[VEC_IND_NEXT11]] = add <8 x i32> [[VEC_IND10]], [[DOTSPLAT9]]
-; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
-
-define void @non_primary_iv_loop_inv_trunc(ptr %a, i64 %n, i64 %step) {
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT11]] = add <8 x i32> [[VEC_IND10]], [[DOTSPLAT9]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i64 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]]
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[J]] to i32
+; CHECK-NEXT:    store i32 [[TMP3]], ptr [[TMP0]], align 4
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[J_NEXT]] = add nuw nsw i64 [[J]], [[STEP]]
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[COND]], label %[[FOR_BODY]], label %[[FOR_END]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.body
 
@@ -197,22 +278,43 @@ for.end:
   ret void
 }
 
-; CHECK-LABEL: @iv_no_binary_op_in_descriptor(
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ , %vector.ph ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+
+define void @iv_no_binary_op_in_descriptor(i1 %c, ptr %dst) {
+; CHECK-LABEL: define void @iv_no_binary_op_in_descriptor(
+; CHECK-SAME: i1 [[C:%.*]], ptr [[DST:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[DST:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    store <8 x i64> [[VEC_IND]], ptr [[TMP2]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP3]], label %middle.block, label [[VECTOR_BODY]]
-
-define void @iv_no_binary_op_in_descriptor(i1 %c, ptr %dst) {
+; CHECK-NEXT:    br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
+; CHECK:       [[LOOP_HEADER]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT_P:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i64 [[IV]], ptr [[GEP]], align 8
+; CHECK-NEXT:    [[IV_NEXT:%.*]] = add i64 [[IV]], 1
+; CHECK-NEXT:    br label %[[LOOP_LATCH]]
+; CHECK:       [[LOOP_LATCH]]:
+; CHECK-NEXT:    [[IV_NEXT_P]] = phi i64 [ [[IV_NEXT]], %[[LOOP_HEADER]] ]
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT_P]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %loop.header
 
@@ -231,3 +333,162 @@ loop.latch:
 exit:
   ret void
 }
+
+define void @wide_add_induction_step_live_in(ptr %dst, i64 %N, i16 %off) {
+; CHECK-LABEL: define void @wide_add_induction_step_live_in(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]], i16 [[OFF:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[O_1:%.*]] = add i16 [[OFF]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i16
+; CHECK-NEXT:    [[TMP0:%.*]] = mul i16 [[DOTCAST]], [[O_1]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[O_1]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = mul <8 x i16> , [[DOTSPLAT]]
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <8 x i16> zeroinitializer, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i16 [[O_1]], 8
+; CHECK-NEXT:    [[DOTSPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP2]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT2:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[O_1]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i16> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = add <8 x i16> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    store <8 x i16> [[TMP4]], ptr [[TMP6]], align 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i16> [[VEC_IND]], [[DOTSPLAT2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i16 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_2:%.*]] = phi i16 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[ADD]] = add i16 [[IV_2]], [[O_1]]
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i16 [[ADD]], ptr [[GEP_DST]], align 2
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %o.1 = add i16 %off, 2
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.2 = phi i16 [ 0, %entry ], [ %add, %loop ]
+  %add = add i16 %iv.2, %o.1
+  %gep.dst = getelementptr inbounds i16, ptr %dst, i64 %iv
+  store i16 %add, ptr %gep.dst, align 2
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %N
+  br i1 %ec , label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @wide_sub_induction_step_live_in(ptr %dst, i64 %N, i16 %off) {
+; CHECK-LABEL: define void @wide_sub_induction_step_live_in(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]], i16 [[OFF:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[O_1:%.*]] = add i16 [[OFF]], 2
+; CHECK-NEXT:    [[TMP0:%.*]] = sub i16 -2, [[OFF]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i16
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i16 [[DOTCAST]], [[TMP0]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <8 x i16> , [[DOTSPLAT]]
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <8 x i16> zeroinitializer, [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i16 [[TMP0]], 8
+; CHECK-NEXT:    [[DOTSPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP3]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT2:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[O_1]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i16> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = sub <8 x i16> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    store <8 x i16> [[TMP5]], ptr [[TMP7]], align 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i16> [[VEC_IND]], [[DOTSPLAT2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i16 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_2:%.*]] = phi i16 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[SUB:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[SUB]] = sub i16 [[IV_2]], [[O_1]]
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i16 [[SUB]], ptr [[GEP_DST]], align 2
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %o.1 = add i16 %off, 2
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.2 = phi i16 [ 0, %entry ], [ %sub, %loop ]
+  %sub = sub i16 %iv.2, %o.1
+  %gep.dst = getelementptr inbounds i16, ptr %dst, i64 %iv
+  store i16 %sub, ptr %gep.dst, align 2
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %N
+  br i1 %ec , label %exit, label %loop
+
+exit:
+  ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+;.

From da2a9ede81a88bea0bba28a543441197772e4727 Mon Sep 17 00:00:00 2001
From: Vlad Serebrennikov 
Date: Sat, 4 Jan 2025 18:53:01 +0400
Subject: [PATCH 429/567] [clang] Update C++ DR status page (#121642)

This patch brings our C++ DR status page up to date with WG21 updates.

[CWG1223](https://cplusplus.github.io/CWG/issues/1223.html) "Syntactic
disambiguation and _trailing-return-types_" is resolved by
[P2915R0](https://wg21.link/p2915r0) "Proposed resolution for CWG1223".
Both the test and the paper were written by @cor3ntin, so I presume no
updates are needed.

[CWG2819](https://cplusplus.github.io/CWG/issues/2819.html) "Cast from
null pointer value in a constant expression" was revisited and marked as
not a DR, so I updated the test to ensure that the example is not
accepted in C++23 and earlier modes. CC @offsetof.

Tentative resolutions to the following issues were simply promoted to
actual resolutions, so tests don't require any meaningful changes:
- [CWG2913](https://cplusplus.github.io/CWG/issues/2913.html) "Grammar
for deduction-guide has requires-clause in the wrong position"
(@zyn0217)
- [CWG2915](https://cplusplus.github.io/CWG/issues/2915.html) "Explicit
object parameters of type void" (@MitalAshok)
- [CWG2922](https://cplusplus.github.io/CWG/issues/2922.html) "constexpr
placement-new is too permissive" (@cor3ntin)

As a drive-by fix, I updated the `make_cxx_dr_status` script to
accommodate `C++23 onwards` and `C++26 onwards` statuses, which are
useful for Core issues that are not DRs.
---
 clang/test/CXX/drs/cwg12xx.cpp |   2 +-
 clang/test/CXX/drs/cwg28xx.cpp |  25 +-
 clang/test/CXX/drs/cwg29xx.cpp |   6 +-
 clang/www/cxx_dr_status.html   | 534 ++++++++++++++++++++++++---------
 clang/www/make_cxx_dr_status   |   6 +
 5 files changed, 413 insertions(+), 160 deletions(-)

diff --git a/clang/test/CXX/drs/cwg12xx.cpp b/clang/test/CXX/drs/cwg12xx.cpp
index cdfbc6d672658..951c71a9832de 100644
--- a/clang/test/CXX/drs/cwg12xx.cpp
+++ b/clang/test/CXX/drs/cwg12xx.cpp
@@ -32,7 +32,7 @@ namespace cwg1213 { // cwg1213: 7
 }
 
 #if __cplusplus >= 201103L
-namespace cwg1223 { // cwg1223: 17 drafting 2023-05-12
+namespace cwg1223 { // cwg1223: 17
 struct M;
 template 
 struct V;
diff --git a/clang/test/CXX/drs/cwg28xx.cpp b/clang/test/CXX/drs/cwg28xx.cpp
index ff625a4a985bc..40e2b25eedde0 100644
--- a/clang/test/CXX/drs/cwg28xx.cpp
+++ b/clang/test/CXX/drs/cwg28xx.cpp
@@ -1,9 +1,9 @@
 // RUN: %clang_cc1 -std=c++98 -pedantic-errors -verify=expected,cxx98 %s
-// RUN: %clang_cc1 -std=c++11 -pedantic-errors -verify=expected %s
-// RUN: %clang_cc1 -std=c++14 -pedantic-errors -verify=expected %s
-// RUN: %clang_cc1 -std=c++17 -pedantic-errors -verify=expected %s
-// RUN: %clang_cc1 -std=c++20 -pedantic-errors -verify=expected,since-cxx20 %s
-// RUN: %clang_cc1 -std=c++23 -pedantic-errors -verify=expected,since-cxx20,since-cxx23 %s
+// RUN: %clang_cc1 -std=c++11 -pedantic-errors -verify=expected,cxx11-23 %s
+// RUN: %clang_cc1 -std=c++14 -pedantic-errors -verify=expected,cxx11-23 %s
+// RUN: %clang_cc1 -std=c++17 -pedantic-errors -verify=expected,cxx11-23 %s
+// RUN: %clang_cc1 -std=c++20 -pedantic-errors -verify=expected,cxx11-23,since-cxx20 %s
+// RUN: %clang_cc1 -std=c++23 -pedantic-errors -verify=expected,cxx11-23,since-cxx20,since-cxx23 %s
 // RUN: %clang_cc1 -std=c++2c -pedantic-errors -verify=expected,since-cxx20,since-cxx23,since-cxx26 %s
 
 
@@ -47,12 +47,17 @@ void f() {
 #endif
 } // namespace cwg2813
 
-namespace cwg2819 { // cwg2819: 19 tentatively ready 2023-12-01
-
-#if __cpp_constexpr >= 202306L
+namespace cwg2819 { // cwg2819: 19 c++26
+#if __cplusplus >= 201103L
+  // CWG 2024-04-19: This issue is not a DR.
   constexpr void* p = nullptr;
-  constexpr int* q = static_cast(p);
-  static_assert(q == nullptr);
+  constexpr int* q = static_cast(p); // #cwg2819-q
+  // cxx11-23-error@-1 {{constexpr variable 'q' must be initialized by a constant expression}}
+  //   cxx11-23-note@-2 {{cast from 'void *' is not allowed in a constant expression}}
+  static_assert(q == nullptr, "");
+  // cxx11-23-error@-1 {{static assertion expression is not an integral constant expression}}
+  //   cxx11-23-note@-2 {{initializer of 'q' is not a constant expression}}
+  //   cxx11-23-note@#cwg2819-q {{declared here}}
 #endif
 }
 
diff --git a/clang/test/CXX/drs/cwg29xx.cpp b/clang/test/CXX/drs/cwg29xx.cpp
index 9629bdd41a2a5..2aa52ad98ada8 100644
--- a/clang/test/CXX/drs/cwg29xx.cpp
+++ b/clang/test/CXX/drs/cwg29xx.cpp
@@ -6,7 +6,7 @@
 // RUN: %clang_cc1 -std=c++23 -pedantic-errors -verify=expected %s
 // RUN: %clang_cc1 -std=c++2c -pedantic-errors -verify=expected %s
 
-namespace cwg2913 { // cwg2913: 20 tentatively ready 2024-08-16
+namespace cwg2913 { // cwg2913: 20
 
 #if __cplusplus >= 202002L
 
@@ -26,7 +26,7 @@ R(T, T) requires true -> R; // expected-error {{expected function body after
 
 } // namespace cwg2913
 
-namespace cwg2915 { // cwg2915: 20 tentatively ready 2024-08-16
+namespace cwg2915 { // cwg2915: 20
 #if __cplusplus >= 202302L
 struct A {
   void f(this void); // expected-error {{explicit object parameter cannot have 'void' type}}
@@ -61,7 +61,7 @@ void *operator new(std::size_t, void *p) { return p; }
 void* operator new[] (std::size_t, void* p) {return p;}
 
 
-namespace cwg2922 { // cwg2922: 20 tentatively ready 2024-07-10
+namespace cwg2922 { // cwg2922: 20
 union U { int a, b; };
 constexpr U nondeterministic(bool i) {
   if(i) {
diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html
index 386c57250b7db..c069e155fd547 100755
--- a/clang/www/cxx_dr_status.html
+++ b/clang/www/cxx_dr_status.html
@@ -1442,7 +1442,7 @@ 

C++ defect report implementation status

233 - DR + DRWP References vs pointers in UDC overload resolution Unknown @@ -7153,15 +7153,11 @@

C++ defect report implementation status

Unnecessary restriction on auto array types Unknown - + 1223 - drafting + DRWP Syntactic disambiguation and trailing-return-types - -
- Not resolved - Clang 17 implements 2023-05-12 resolution -
+ Clang 17 1224 @@ -8945,11 +8941,11 @@

C++ defect report implementation status

Alias template specialization vs pack expansion Unknown - + 1521 - drafting + dup T{expr} with reference types - Not resolved + Unknown 1522 @@ -11545,11 +11541,11 @@

C++ defect report implementation status

Constant expressions and library undefined behavior Unknown - + 1953 - open + DR Data races and common initial sequence - Not resolved + Unknown 1954 @@ -11619,7 +11615,7 @@

C++ defect report implementation status

1965 - drafting + open Explicit casts to reference types Not resolved @@ -12693,7 +12689,7 @@

C++ defect report implementation status

2144 - DR + DRWP Function/variable declaration ambiguity Unknown @@ -13525,11 +13521,11 @@

C++ defect report implementation status

Consistency with mismatched aligned/non-over-aligned allocation/deallocation functions Unknown - + 2283 - drafting + DR Missing complete type requirements - Not resolved + Unknown 2284 @@ -15183,7 +15179,7 @@

C++ defect report implementation status

2557 - drafting + review Class member access referring to an unrelated class Not resolved @@ -15207,7 +15203,7 @@

C++ defect report implementation status

2561 - DR + DRWP Conversion to function pointer for lambda with explicit object parameter No @@ -15373,7 +15369,7 @@

C++ defect report implementation status

2588 - DR + DRWP friend declarations and module linkage Unknown @@ -16213,7 +16209,7 @@

C++ defect report implementation status

2728 - DR + DRWP Evaluation of conversions in a delete-expression Unknown @@ -16734,11 +16730,11 @@

C++ defect report implementation status

Alignment requirement of incomplete class type Unknown - + 2815 - tentatively ready + DR Overload resolution for references/pointers to noexcept functions - Not resolved + Unknown 2816 @@ -16754,15 +16750,15 @@

C++ defect report implementation status

2818 - DR + DRWP Use of predefined reserved identifiers Unknown 2819 - accepted + WP Cast from null pointer value in a constant expression - Clang 19 + Clang 19 (C++26 onwards) 2820 @@ -16862,7 +16858,7 @@

C++ defect report implementation status

2836 - DR + DRWP Conversion rank of long double and extended floating-point types Unknown @@ -16904,7 +16900,7 @@

C++ defect report implementation status

2843 - review + drafting Undated reference to Unicode makes C++ a moving target Not resolved @@ -16998,13 +16994,13 @@

C++ defect report implementation status

2858 - accepted + WP Declarative nested-name-specifiers and pack-index-specifiers Clang 19 2859 - DR + DRWP Value-initialization with multiple default constructors Unknown @@ -17016,7 +17012,7 @@

C++ defect report implementation status

2861 - DR + DRWP dynamic_cast on bad pointer value Unknown @@ -17034,13 +17030,13 @@

C++ defect report implementation status

2864 - DR + DRWP Narrowing floating-point conversions Unknown 2865 - DR + DRWP Regression on result of conditional operator Unknown @@ -17052,7 +17048,7 @@

C++ defect report implementation status

2867 - DR + DRWP Order of initialization for structured bindings Unknown @@ -17064,25 +17060,25 @@

C++ defect report implementation status

2869 - DR + DRWP this in local classes Unknown 2870 - DR + DRWP Combining absent encoding-prefixes Unknown 2871 - DR + DRWP User-declared constructor templates inhibiting default constructors Unknown 2872 - DR + DRWP Linkage and unclear "can be referred to" Unknown @@ -17094,7 +17090,7 @@

C++ defect report implementation status

2874 - DR + DRWP Qualified declarations of partial specializations Unknown @@ -17106,13 +17102,13 @@

C++ defect report implementation status

2876 - accepted + WP Disambiguation of T x = delete("text") Unknown 2877 - DR + DRWP Type-only lookup for using-enum-declarator Clang 19 @@ -17122,33 +17118,33 @@

C++ defect report implementation status

C-style casts to reference types Not resolved - + 2879 - review + DR Undesired outcomes with const_cast - Not resolved + Unknown 2880 - accepted + WP Accessibility check for destructor of incomplete class type Unknown 2881 - DR + DRWP Type restrictions for the explicit object parameter of a lambda Clang 19 2882 - DR + DRWP Unclear treatment of conversion to void Clang 2.7 2883 - DR + DRWP Definition of "odr-usable" ignores lambda scopes No @@ -17170,13 +17166,13 @@

C++ defect report implementation status

2886 - DR + DRWP Temporaries and trivial potentially-throwing special member functions Clang 9 2887 - DR + DRWP Missing compatibility entries for xvalues Unknown @@ -17192,21 +17188,21 @@

C++ defect report implementation status

Requiring an accessible destructor for destroying operator delete Not resolved - + 2890 - review + DR Defining members of local classes - Not resolved + Unknown 2891 - DR + DRWP Normative status of implementation limits Unknown 2892 - DR + DRWP Unclear usual arithmetic conversions Unknown @@ -17216,15 +17212,15 @@

C++ defect report implementation status

Instantiations in discarded if constexpr substatements Unknown - + 2894 - review + DR Functional casts create prvalues of reference type - Not resolved + Unknown 2895 - DR + DRWP Initialization should ignore the destination type's cv-qualification Unknown @@ -17246,11 +17242,11 @@

C++ defect report implementation status

Clarify implicit conversion sequence from cv T to T Not resolved - + 2899 - tentatively ready + DR Bad value representations should cause undefined behavior - Not resolved + Unknown 2900 @@ -17258,11 +17254,11 @@

C++ defect report implementation status

Deduction of non-type template arguments with placeholder types Not resolved - + 2901 - tentatively ready + DR Unclear semantics for near-match aliased access - Not resolved + Unknown 2902 @@ -17272,7 +17268,7 @@

C++ defect report implementation status

2903 - tentatively ready + drafting Can we omit the template disambiguator in nested-name-specifiers in type-only contexts? Not resolved @@ -17282,47 +17278,47 @@

C++ defect report implementation status

Introducing template-names Not resolved - + 2905 - tentatively ready + DR Value-dependence of noexcept-expression - Not resolved + Unknown - + 2906 - tentatively ready + DR Lvalue-to-rvalue conversion of class types for conditional operator - Not resolved + Unknown - + 2907 - tentatively ready + DR Constant lvalue-to-rvalue conversion on uninitialized std::nullptr_t - Not resolved + Unknown - + 2908 - tentatively ready + DR Counting physical source lines for __LINE__ - Not resolved + Unknown - + 2909 - review + DR Subtle difference between constant-initialized and constexpr - Not resolved + Unknown - + 2910 - tentatively ready + DR Effect of requirement-parameter-lists on odr-usability - Not resolved + Unknown - + 2911 - tentatively ready + DR Unclear meaning of expressions "appearing within" subexpressions - Not resolved + Unknown 2912 @@ -17330,15 +17326,11 @@

C++ defect report implementation status

Too-large value for size in array new Not resolved - + 2913 - tentatively ready + DR Grammar for deduction-guide has requires-clause in the wrong position - -
- Not resolved - Clang 20 implements 2024-08-16 resolution -
+ Clang 20 2914 @@ -17346,15 +17338,11 @@

C++ defect report implementation status

Unclear order of initialization of static and thread-local variables Not resolved - + 2915 - tentatively ready + DR Explicit object parameters of type void - -
- Not resolved - Clang 20 implements 2024-08-16 resolution -
+ Clang 20 2916 @@ -17372,17 +17360,17 @@

C++ defect report implementation status

Clang 20 implements 2024-07-30 resolution - + 2918 - tentatively ready + DR Consideration of constraints for address of overloaded function - Not resolved + Unknown - + 2919 - tentatively ready + DR Conversion function candidates for initialization of const lvalue reference - Not resolved + Unknown 2920 @@ -17390,33 +17378,29 @@

C++ defect report implementation status

The template keyword for base classes Not resolved - + 2921 - tentatively ready + DR Exporting redeclarations of entities not attached to a named module - Not resolved + Unknown - + 2922 - tentatively ready + DR constexpr placement-new is too permissive - -
- Not resolved - Clang 20 implements 2024-07-10 resolution -
+ Clang 20 2923 - tentatively ready + review Note about infinite loops and execution steps Not resolved - + 2924 - review + DR Undefined behavior during constant evaluation - Not resolved + Unknown 2925 @@ -17426,15 +17410,15 @@

C++ defect report implementation status

2926 - open + drafting Lookup context for dependent qualified names Not resolved - + 2927 - tentatively ready + DR Unclear status of translation unit with module keyword - Not resolved + Unknown 2928 @@ -17444,21 +17428,21 @@

C++ defect report implementation status

2929 - tentatively ready + review Lifetime of trivially-destructible static or thread-local objects Not resolved - + 2930 - tentatively ready + DR Unclear term "copy/move operation" in specification of copy elision - Not resolved + Unknown - + 2931 - tentatively ready + DR Restrictions on operator functions that are explicit object member functions - Not resolved + Unknown 2932 @@ -17466,11 +17450,11 @@

C++ defect report implementation status

Value range of empty enumeration Not resolved - + 2933 - open + DR Dangling references - Not resolved + Unknown 2934 @@ -17484,17 +17468,17 @@

C++ defect report implementation status

Destroying the coroutine state when initial-await-resume-called is false Not resolved - + 2936 - open + DR Local classes of templated functions should be part of the current instantiation - Not resolved + Unknown - + 2937 - open + DR Grammar for preprocessing-file has no normative effect - Not resolved + Unknown 2938 @@ -17502,10 +17486,268 @@

C++ defect report implementation status

Inheriting linkage from a previous declaration Not resolved - + 2939 - open + DR Do not allow reinterpret_cast from prvalue to rvalue reference + Unknown + + + 2940 + review + Definition of "object" + Not resolved + + + 2941 + open + Lifetime extension for function-style cast to reference type + Not resolved + + + 2942 + open + Packs in a function's parameter-type-list + Not resolved + + + 2943 + open + Discarding a void return value + Not resolved + + + 2944 + DR + Unsequenced throw-expressions + Unknown + + + 2945 + open + Redundant constraints on matching function template declarations + Not resolved + + + 2946 + open + Dependent call equivalence in non-ADL cases + Not resolved + + + 2947 + open + Limiting macro expansion in pp-module + Not resolved + + + 2948 + open + Late ambiguity for partial template specialization + Not resolved + + + 2949 + open + Treatment of ellipsis during partial ordering + Not resolved + + + 2950 + open + Value preservation in enumeration vs. integer bit-fields + Not resolved + + + 2951 + open + Distinguishing a primary template + Not resolved + + + 2952 + open + Vacuous initialization for subobjects + Not resolved + + + 2953 + open + Value representation for non-trivially-copyable types + Not resolved + + + 2954 + NAD + Simultaneous modifications of an atomic object + Unknown + + + 2955 + open + Unify rules about conflicting unordered accesses + Not resolved + + + 2956 + open + Missing allowance for pseudo-destructors in qualified lookup + Not resolved + + + 2957 + open + Evaluating a reference member should constitute access + Not resolved + + + 2958 + open + Overload resolution involving lvalue transformation and qualification conversion + Not resolved + + + 2959 + open + Naming enumerators in class member access expressions + Not resolved + + + 2960 + open + Introduce discontiguous object lifetime + Not resolved + + + 2961 + open + Checking of ill-formed types in constraint-expressions + Not resolved + + + 2962 + open + Evaluation 
of destructor call for variable with constant destruction + Not resolved + + + 2963 + open + Paradoxical variable-or-function declaration + Not resolved + + + 2964 + open + Reading "invalid pointer values" + Not resolved + + + 2965 + open + Generic lambdas do not have a template parameter scope + Not resolved + + + 2966 + open + Alignment and value representation of std::nullptr_t + Not resolved + + + 2967 + open + Explicit conversion functions + Not resolved + + + 2968 + open + Name lookup result for typedef-name vs. class-name + Not resolved + + + 2969 + open + Scopes in the function-try-block of a constructor + Not resolved + + + 2970 + open + Races with volatile sig_atomic_t bit-fields + Not resolved + + + 2971 + open + Specializations for a class are not decl-reachable + Not resolved + + + 2972 + open + Declarative nested-name-specifier naming a partial specialization + Not resolved + + + 2973 + open + Does an alias-declaration introduce a name for linkage purposes? + Not resolved + + + 2974 + open + Non-deduced context for qualified-id naming a template + Not resolved + + + 2975 + open + Effect of concept template-head on parameter mappings + Not resolved + + + 2976 + open + Transferring control out of a function + Not resolved + + + 2977 + open + Initialization with string literals + Not resolved + + + 2978 + open + Deduction involving reference to similar types + Not resolved + + + 2979 + open + Duplicate declarations of enumerations in class scope + Not resolved + + + 2980 + open + Constraints on template template parameters + Not resolved + + + 2981 + open + Usual arithmetic conversions and result types + Not resolved + + + 2982 + open + Deduction in type-constraints Not resolved diff --git a/clang/www/make_cxx_dr_status b/clang/www/make_cxx_dr_status index f9a35c61c12de..e0885fdbd2d3c 100755 --- a/clang/www/make_cxx_dr_status +++ b/clang/www/make_cxx_dr_status @@ -169,6 +169,12 @@ def availability(issue): elif status.endswith(' c++20'): status = 
status[:-6] avail_suffix = ' (C++20 onwards)' + elif status.endswith(' c++23'): + status = status[:-6] + avail_suffix = ' (C++23 onwards)' + elif status.endswith(' c++26'): + status = status[:-6] + avail_suffix = ' (C++26 onwards)' if status == 'unknown': avail = 'Unknown' avail_style = 'unknown' From 4a7c0b8afe6bf616cd6bb4f13b5b706a43c10e74 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sat, 4 Jan 2025 15:09:03 +0000 Subject: [PATCH 430/567] [LV] Add X86-specific induction step tests. Adds additional test coverage for induction codegen. --- .../LoopVectorize/X86/induction-step.ll | 155 ++++++++++++++++++ 1 file changed, 155 insertions(+) create mode 100644 llvm/test/Transforms/LoopVectorize/X86/induction-step.ll diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll new file mode 100644 index 0000000000000..6aac11a579719 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll @@ -0,0 +1,155 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=loop-vectorize -force-vector-width=4 -S %s | FileCheck %s + +target triple = "x86_64-unknown-linux-gnu" + +define i16 @wide_add_induction_step_live_in(ptr %dst, i64 %N, i16 %off) { +; CHECK-LABEL: @wide_add_induction_step_live_in( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[O_1:%.*]] = add i16 [[OFF:%.*]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i16 +; CHECK-NEXT: [[TMP0:%.*]] = mul i16 [[DOTCAST]], [[O_1]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[O_1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> 
[[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i16> splat (i16 4), [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[O_1]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i16> [[DOTSPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i16> , [[DOTSPLAT]] +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i16> zeroinitializer, [[TMP2]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i16> poison, i16 [[O_1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT1]], <4 x i16> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i16> [[VEC_IND]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i16> [[STEP_ADD]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[DST:%.*]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 4 +; CHECK-NEXT: store <4 x i16> [[TMP4]], ptr [[TMP6]], align 2 +; CHECK-NEXT: store <4 x i16> [[TMP9]], ptr [[TMP8]], align 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: 
middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i16 [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV_2:%.*]] = phi i16 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[ADD]] = add i16 [[IV_2]], [[O_1]] +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store i16 [[ADD]], ptr [[GEP_DST]], align 2 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i16 [ [[ADD]], [[LOOP]] ], [ [[TMP0]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i16 [[ADD_LCSSA]] +; +entry: + %o.1 = add i16 %off, 2 + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %iv.2 = phi i16 [ 0, %entry ], [ %add, %loop ] + %add = add i16 %iv.2, %o.1 + %gep.dst = getelementptr inbounds i16, ptr %dst, i64 %iv + store i16 %add, ptr %gep.dst, align 2 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %N + br i1 %ec , label %exit, label %loop + +exit: + ret i16 %add +} + +define i16 @wide_sub_induction_step_live_in(ptr %dst, i64 %N, i16 %off) { +; CHECK-LABEL: @wide_sub_induction_step_live_in( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[O_1:%.*]] = add i16 [[OFF:%.*]], 2 +; CHECK-NEXT: [[TMP0:%.*]] = sub i16 -2, [[OFF]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label 
[[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i16 +; CHECK-NEXT: [[TMP1:%.*]] = mul i16 [[DOTCAST]], [[TMP0]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i16> splat (i16 4), [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i16> [[DOTSPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i16> , [[DOTSPLAT]] +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i16> zeroinitializer, [[TMP3]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i16> poison, i16 [[O_1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT1]], <4 x i16> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = sub <4 x i16> [[VEC_IND]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP10:%.*]] = sub <4 x i16> [[STEP_ADD]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[DST:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 4 +; CHECK-NEXT: 
store <4 x i16> [[TMP5]], ptr [[TMP7]], align 2 +; CHECK-NEXT: store <4 x i16> [[TMP10]], ptr [[TMP9]], align 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], [[TMP2]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i16 [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV_2:%.*]] = phi i16 [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ], [ [[SUB:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[SUB]] = sub i16 [[IV_2]], [[O_1]] +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store i16 [[SUB]], ptr [[GEP_DST]], align 2 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[SUB_LCSSA:%.*]] = phi i16 [ [[SUB]], [[LOOP]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i16 [[SUB_LCSSA]] +; +entry: + %o.1 = add i16 %off, 2 + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %iv.2 = phi i16 [ 0, %entry ], [ %sub, %loop ] + %sub = sub i16 %iv.2, %o.1 + %gep.dst = getelementptr inbounds i16, ptr %dst, i64 %iv + store i16 %sub, ptr %gep.dst, align 2 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %N + br i1 %ec , label %exit, label 
%loop + +exit: + ret i16 %sub +} From 24c2ba07ce65a5bf7d1113e05c517169d950b663 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Sun, 5 Jan 2025 01:20:54 +0800 Subject: [PATCH 431/567] [InstCombine] Drop NSW when converting `shl X, BW - 1` back into mul (#121633) `X < bool { + auto MatchShiftOrMulXC = [](Value *Op, Value *&V, APInt &C, + bool &PreserveNSW) -> bool { const APInt *Tmp = nullptr; if ((!V && match(Op, m_Mul(m_Value(V), m_APInt(Tmp)))) || (V && match(Op, m_Mul(m_Specific(V), m_APInt(Tmp))))) C = *Tmp; else if ((!V && match(Op, m_Shl(m_Value(V), m_APInt(Tmp)))) || - (V && match(Op, m_Shl(m_Specific(V), m_APInt(Tmp))))) + (V && match(Op, m_Shl(m_Specific(V), m_APInt(Tmp))))) { C = APInt(Tmp->getBitWidth(), 1) << *Tmp; + // We cannot preserve NSW when shifting by BW - 1. + PreserveNSW = Tmp->ult(Tmp->getBitWidth() - 1); + } if (Tmp != nullptr) return true; @@ -2095,7 +2099,9 @@ static Instruction *simplifyIRemMulShl(BinaryOperator &I, return false; }; - if (MatchShiftOrMulXC(Op0, X, Y) && MatchShiftOrMulXC(Op1, X, Z)) { + bool Op0PreserveNSW = true, Op1PreserveNSW = true; + if (MatchShiftOrMulXC(Op0, X, Y, Op0PreserveNSW) && + MatchShiftOrMulXC(Op1, X, Z, Op1PreserveNSW)) { // pass } else if (MatchShiftCX(Op0, Y, X) && MatchShiftCX(Op1, Z, X)) { ShiftByX = true; @@ -2108,7 +2114,7 @@ static Instruction *simplifyIRemMulShl(BinaryOperator &I, OverflowingBinaryOperator *BO0 = cast(Op0); // TODO: We may be able to deduce more about nsw/nuw of BO0/BO1 based on Y >= // Z or Z >= Y. - bool BO0HasNSW = BO0->hasNoSignedWrap(); + bool BO0HasNSW = Op0PreserveNSW && BO0->hasNoSignedWrap(); bool BO0HasNUW = BO0->hasNoUnsignedWrap(); bool BO0NoWrap = IsSRem ? 
BO0HasNSW : BO0HasNUW; @@ -2131,7 +2137,7 @@ static Instruction *simplifyIRemMulShl(BinaryOperator &I, }; OverflowingBinaryOperator *BO1 = cast(Op1); - bool BO1HasNSW = BO1->hasNoSignedWrap(); + bool BO1HasNSW = Op1PreserveNSW && BO1->hasNoSignedWrap(); bool BO1HasNUW = BO1->hasNoUnsignedWrap(); bool BO1NoWrap = IsSRem ? BO1HasNSW : BO1HasNUW; // (rem (mul X, Y), (mul nuw/nsw X, Z)) diff --git a/llvm/test/Transforms/InstCombine/rem-mul-shl.ll b/llvm/test/Transforms/InstCombine/rem-mul-shl.ll index e7d6cc7102c71..920497c07e380 100644 --- a/llvm/test/Transforms/InstCombine/rem-mul-shl.ll +++ b/llvm/test/Transforms/InstCombine/rem-mul-shl.ll @@ -372,6 +372,32 @@ define <2 x i8> @srem_XY_XZ_with_CY_gt_CZ_no_nuw_out(<2 x i8> %X) { ret <2 x i8> %r } +define i8 @srem_XY_XZ_with_CY_gt_CZ_drop_nsw(i8 noundef %X) { +; CHECK-LABEL: @srem_XY_XZ_with_CY_gt_CZ_drop_nsw( +; CHECK-NEXT: [[BO0:%.*]] = mul nsw i8 [[X:%.*]], 127 +; CHECK-NEXT: [[BO1:%.*]] = shl nsw i8 [[X]], 7 +; CHECK-NEXT: [[R:%.*]] = srem i8 [[BO1]], [[BO0]] +; CHECK-NEXT: ret i8 [[R]] +; + %BO0 = mul nsw i8 %X, 127 + %BO1 = shl nsw i8 %X, 7 + %r = srem i8 %BO1, %BO0 + ret i8 %r +} + +define i8 @srem_XY_XZ_with_CY_gt_CZ_drop_nsw_commuted(i8 noundef %X) { +; CHECK-LABEL: @srem_XY_XZ_with_CY_gt_CZ_drop_nsw_commuted( +; CHECK-NEXT: [[BO0:%.*]] = mul nsw i8 [[X:%.*]], 127 +; CHECK-NEXT: [[BO1:%.*]] = shl nsw i8 [[X]], 7 +; CHECK-NEXT: [[R:%.*]] = srem i8 [[BO0]], [[BO1]] +; CHECK-NEXT: ret i8 [[R]] +; + %BO0 = mul nsw i8 %X, 127 + %BO1 = shl nsw i8 %X, 7 + %r = srem i8 %BO0, %BO1 + ret i8 %r +} + define i8 @srem_XY_XZ_with_CY_gt_CZ_fail_missing_flag1(i8 %X) { ; CHECK-LABEL: @srem_XY_XZ_with_CY_gt_CZ_fail_missing_flag1( ; CHECK-NEXT: [[BO0:%.*]] = mul nuw nsw i8 [[X:%.*]], 10 From ce6251540d7af30585d4ca753ca2a0ab34d32be2 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 4 Jan 2025 16:29:30 +0000 Subject: [PATCH 432/567] [X86] vector overflow tests - regenerate VPTERNLOG comments --- 
llvm/test/CodeGen/X86/vec_saddo.ll | 6 +++--- llvm/test/CodeGen/X86/vec_ssubo.ll | 6 +++--- llvm/test/CodeGen/X86/vec_uaddo.ll | 6 +++--- llvm/test/CodeGen/X86/vec_usubo.ll | 6 +++--- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll index 460c5fe11f82a..78dd2cf783ef8 100644 --- a/llvm/test/CodeGen/X86/vec_saddo.ll +++ b/llvm/test/CodeGen/X86/vec_saddo.ll @@ -517,7 +517,7 @@ define <16 x i32> @saddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm1 ; AVX512-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 ; AVX512-NEXT: kxorw %k1, %k0, %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512-NEXT: vmovdqa64 %zmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<16 x i32>, <16 x i1>} @llvm.sadd.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1) @@ -647,7 +647,7 @@ define <16 x i32> @saddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpaddsb %xmm1, %xmm0, %xmm2 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpcmpneqb %xmm2, %xmm1, %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512-NEXT: vmovdqa %xmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<16 x i8>, <16 x i1>} @llvm.sadd.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1) @@ -993,7 +993,7 @@ define <4 x i32> @saddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpslld $8, %xmm1, %xmm0 ; AVX512-NEXT: vpsrad $8, %xmm0, %xmm0 ; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vpextrd $3, %xmm1, %eax ; AVX512-NEXT: movw %ax, 9(%rdi) ; AVX512-NEXT: vpextrd $2, %xmm1, %ecx diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll index d06993da6365d..746c09e5e70db 100644 
--- a/llvm/test/CodeGen/X86/vec_ssubo.ll +++ b/llvm/test/CodeGen/X86/vec_ssubo.ll @@ -522,7 +522,7 @@ define <16 x i32> @ssubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin ; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm1 ; AVX512-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 ; AVX512-NEXT: kxorw %k1, %k0, %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512-NEXT: vmovdqa64 %zmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<16 x i32>, <16 x i1>} @llvm.ssub.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1) @@ -652,7 +652,7 @@ define <16 x i32> @ssubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpsubsb %xmm1, %xmm0, %xmm2 ; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpcmpneqb %xmm2, %xmm1, %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512-NEXT: vmovdqa %xmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<16 x i8>, <16 x i1>} @llvm.ssub.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1) @@ -1010,7 +1010,7 @@ define <4 x i32> @ssubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpslld $8, %xmm1, %xmm0 ; AVX512-NEXT: vpsrad $8, %xmm0, %xmm0 ; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vpextrd $3, %xmm1, %eax ; AVX512-NEXT: movw %ax, 9(%rdi) ; AVX512-NEXT: vpextrd $2, %xmm1, %ecx diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll index bac118095331c..be7888cd76a6b 100644 --- a/llvm/test/CodeGen/X86/vec_uaddo.ll +++ b/llvm/test/CodeGen/X86/vec_uaddo.ll @@ -604,7 +604,7 @@ define <16 x i32> @uaddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm1 ; AVX512-NEXT: vpcmpltud %zmm0, %zmm1, %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; 
AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512-NEXT: vmovdqa64 %zmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<16 x i32>, <16 x i1>} @llvm.uadd.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1) @@ -730,7 +730,7 @@ define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpcmpltub %xmm0, %xmm1, %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512-NEXT: vmovdqa %xmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<16 x i8>, <16 x i1>} @llvm.uadd.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1) @@ -1046,7 +1046,7 @@ define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm0 ; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vpextrd $3, %xmm1, %eax ; AVX512-NEXT: movw %ax, 9(%rdi) ; AVX512-NEXT: vpextrd $2, %xmm1, %ecx diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll index ab75ada72f256..ceb1ad13bc153 100644 --- a/llvm/test/CodeGen/X86/vec_usubo.ll +++ b/llvm/test/CodeGen/X86/vec_usubo.ll @@ -647,7 +647,7 @@ define <16 x i32> @usubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin ; AVX512: # %bb.0: ; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm1 ; AVX512-NEXT: vpcmpnleud %zmm0, %zmm1, %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512-NEXT: vmovdqa64 %zmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<16 x i32>, <16 x i1>} @llvm.usub.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1) @@ -773,7 +773,7 @@ define <16 x i32> @usubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; AVX512: # %bb.0: ; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: 
vpcmpnleub %xmm0, %xmm1, %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512-NEXT: vmovdqa %xmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<16 x i8>, <16 x i1>} @llvm.usub.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1) @@ -1093,7 +1093,7 @@ define <4 x i32> @usubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm0 ; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vpextrd $3, %xmm1, %eax ; AVX512-NEXT: movw %ax, 9(%rdi) ; AVX512-NEXT: vpextrd $2, %xmm1, %ecx From 9a95c097d0466c594f40a4ba9ced8a155574fdff Mon Sep 17 00:00:00 2001 From: Alexey Samsonov Date: Sat, 4 Jan 2025 10:02:48 -0800 Subject: [PATCH 433/567] [libc] Remove some unused includes from headers under src/math/generic. (#121632) These were indicated by Clang include-cleaner. 
--- libc/src/math/generic/CMakeLists.txt | 3 --- libc/src/math/generic/exp10f_impl.h | 3 --- libc/src/math/generic/range_reduction_double_common.h | 1 - libc/src/math/generic/sincosf16_utils.h | 2 -- utils/bazel/llvm-project-overlay/libc/BUILD.bazel | 4 ---- 5 files changed, 13 deletions(-) diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index b3d4612915197..382f5b362e2eb 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -358,7 +358,6 @@ add_header_library( HDRS sincosf16_utils.h DEPENDS - libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.polyeval libc.src.__support.FPUtil.nearest_integer libc.src.__support.common @@ -1702,8 +1701,6 @@ add_header_library( libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.multiply_add - libc.src.__support.FPUtil.nearest_integer - libc.src.__support.FPUtil.polyeval libc.src.__support.FPUtil.rounding_mode libc.src.__support.macros.optimization libc.src.__support.common diff --git a/libc/src/math/generic/exp10f_impl.h b/libc/src/math/generic/exp10f_impl.h index d741318382e1f..975fd01a0a25c 100644 --- a/libc/src/math/generic/exp10f_impl.h +++ b/libc/src/math/generic/exp10f_impl.h @@ -10,12 +10,9 @@ #define LLVM_LIBC_SRC_MATH_GENERIC_EXP10F_IMPL_H #include "explogxf.h" -#include "src/__support/FPUtil/BasicOperations.h" #include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/__support/FPUtil/PolyEval.h" #include "src/__support/FPUtil/multiply_add.h" -#include "src/__support/FPUtil/nearest_integer.h" #include "src/__support/FPUtil/rounding_mode.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" diff --git a/libc/src/math/generic/range_reduction_double_common.h b/libc/src/math/generic/range_reduction_double_common.h index bcab82f6c9c3a..06aeb49495ad2 100644 --- a/libc/src/math/generic/range_reduction_double_common.h +++ 
b/libc/src/math/generic/range_reduction_double_common.h @@ -9,7 +9,6 @@ #ifndef LLVM_LIBC_SRC_MATH_GENERIC_RANGE_REDUCTION_DOUBLE_COMMON_H #define LLVM_LIBC_SRC_MATH_GENERIC_RANGE_REDUCTION_DOUBLE_COMMON_H -#include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/double_double.h" #include "src/__support/FPUtil/dyadic_float.h" #include "src/__support/FPUtil/multiply_add.h" diff --git a/libc/src/math/generic/sincosf16_utils.h b/libc/src/math/generic/sincosf16_utils.h index 5e5edd4a8c85b..87b1dde560c5e 100644 --- a/libc/src/math/generic/sincosf16_utils.h +++ b/libc/src/math/generic/sincosf16_utils.h @@ -9,9 +9,7 @@ #ifndef LLVM_LIBC_SRC_MATH_GENERIC_SINCOSF16_UTILS_H #define LLVM_LIBC_SRC_MATH_GENERIC_SINCOSF16_UTILS_H -#include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/PolyEval.h" -#include "src/__support/FPUtil/cast.h" #include "src/__support/FPUtil/nearest_integer.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 91c7db9029a66..15fa4123b75fe 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -1797,7 +1797,6 @@ libc_support_library( hdrs = ["src/math/generic/sincosf16_utils.h"], deps = [ ":__support_common", - ":__support_fputil_fp_bits", ":__support_fputil_nearest_integer", ":__support_fputil_polyeval", ], @@ -1846,11 +1845,8 @@ libc_support_library( name = "exp10f_impl", hdrs = ["src/math/generic/exp10f_impl.h"], deps = [ - ":__support_fputil_basic_operations", ":__support_fputil_fma", ":__support_fputil_multiply_add", - ":__support_fputil_nearest_integer", - ":__support_fputil_polyeval", ":__support_fputil_rounding_mode", ":__support_macros_optimization", ":common_constants", From c19e0d63b45f9c97157060c662396820ce2a1621 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 4 Jan 2025 10:56:45 -0800 Subject: 
[PATCH 434/567] [gcov,test] Update exit-block.ll now that exit block is always the second Follow-up to 82fecab85ae2d72ffac0e44749d99f12d6f71cc0 --- llvm/test/Transforms/GCOVProfiling/exit-block.ll | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/llvm/test/Transforms/GCOVProfiling/exit-block.ll b/llvm/test/Transforms/GCOVProfiling/exit-block.ll index 567e22222f580..1840f045b3ffe 100644 --- a/llvm/test/Transforms/GCOVProfiling/exit-block.ll +++ b/llvm/test/Transforms/GCOVProfiling/exit-block.ll @@ -3,13 +3,9 @@ ; RUN: echo '!19 = !{!"%/t/exit-block.ll", !0}' > %t/1 ; RUN: cat %s %t/1 > %t/2 -; By default, the exit block is the second. +; The exit block is the second. ; RUN: opt -passes=insert-gcov-profiling -disable-output %t/2 -; RUN: llvm-cov gcov -n -dump %t/exit-block.gcno 2>&1 | FileCheck --check-prefixes=CHECK,EXIT-SECOND %s - -; But we can optionally emit it last, to match GCC<4.8 (r189778). -; RUN: opt -passes=insert-gcov-profiling -default-gcov-version='407*' -disable-output %t/2 -; RUN: llvm-cov gcov -n -dump %t/exit-block.gcno 2>&1 | FileCheck --check-prefixes=CHECK,EXIT-SECOND %s +; RUN: llvm-cov gcov -n -dump %t/exit-block.gcno 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -66,10 +62,7 @@ attributes #2 = { nounwind } ; There should be no destination edges for the exit block. ; CHECK: Block : 1 Counter : 0 -; EXIT-LAST: Destination Edges -; EXIT-SECOND-NOT: Destination Edges ; CHECK: Block : 2 Counter : 0 ; CHECK: Block : 4 Counter : 0 -; EXIT-LAST-NOT: Destination Edges -; EXIT-SECOND: Destination Edges +; CHECK: Destination Edges ; CHECK-NOT: Block : From b95cce99049d6b79c418c9981dc39ede2850994e Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 31 Dec 2024 12:08:43 +0000 Subject: [PATCH 435/567] [VPlan] Update wide induction inc recipes to use same step as Wide IV. 
Update wide induction increments to use the same step as the corresponding wide induction. This enables detecting induction increments directly in VPlan and removes redundant splats. --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 14 ++++++++++++++ .../Transforms/LoopVectorize/X86/induction-step.ll | 6 ++---- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 7ef5295bb1276..5b75f6b26b6c5 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9311,6 +9311,20 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { "VPBasicBlock"); RecipeBuilder.fixHeaderPhis(); + // Update wide induction increments to use the same step as the corresponding + // wide induction. This enables detecting induction increments directly in + // VPlan and removes redundant splats. + for (const auto &[Phi, ID] : Legal->getInductionVars()) { + auto *IVInc = cast( + Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())); + if (IVInc->getOperand(0) != Phi || IVInc->getOpcode() != Instruction::Add) + continue; + VPWidenInductionRecipe *WideIV = + cast(RecipeBuilder.getRecipe(Phi)); + VPRecipeBase *R = RecipeBuilder.getRecipe(IVInc); + R->setOperand(1, WideIV->getStepValue()); + } + if (auto *UncountableExitingBlock = Legal->getUncountableEarlyExitingBlock()) { VPlanTransforms::handleUncountableEarlyExit( diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll index 6aac11a579719..f6a9767c7f87d 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll @@ -21,16 +21,14 @@ define i16 @wide_add_induction_step_live_in(ptr %dst, i64 %N, i16 %off) { ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i16> [[DOTSPLATINSERT]], <4 x i16> poison, <4 x i32> 
zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i16> , [[DOTSPLAT]] ; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i16> zeroinitializer, [[TMP2]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i16> poison, i16 [[O_1]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT1]], <4 x i16> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i16> [[VEC_IND]], [[BROADCAST_SPLAT2]] -; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i16> [[STEP_ADD]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i16> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i16> [[STEP_ADD]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[DST:%.*]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 4 From 1fa0036226d0ffad624bfb43595d00885db546b9 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 4 Jan 2025 11:49:12 -0800 Subject: [PATCH 436/567] [AArch64] Remove one unnecesssary include of AArch64GenSystemOperands.inc. NFC GET_PRCTX_DECL and GET_PRCTX_IMPl don't exist in AArch64GenSystemOperands.inc so this include does nothing. It looks like it was removed in 2050e7ebe18cc4cf906d9b54d17ee885cd868327. 
--- llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp | 7 ------- llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h | 8 -------- 2 files changed, 15 deletions(-) diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp index d83c22e717950..49ce0a58f4167 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp +++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp @@ -66,13 +66,6 @@ namespace llvm { } } -namespace llvm { - namespace AArch64PRCTX { -#define GET_PRCTX_IMPL -#include "AArch64GenSystemOperands.inc" - } -} - namespace llvm { namespace AArch64PRFM { #define GET_PRFM_IMPL diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h index e0ccba4d6a59e..e7db9077b6439 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -731,14 +731,6 @@ namespace AArch64TLBI { #include "AArch64GenSystemOperands.inc" } -namespace AArch64PRCTX { - struct PRCTX : SysAliasReg { - using SysAliasReg::SysAliasReg; - }; - #define GET_PRCTX_DECL - #include "AArch64GenSystemOperands.inc" -} - namespace AArch64II { /// Target Operand Flag enum. enum TOF { From 59354a865fe408749634456e10bd76a50d785c2b Mon Sep 17 00:00:00 2001 From: David CARLIER Date: Sat, 4 Jan 2025 20:15:05 +0000 Subject: [PATCH 437/567] [compiler-rt][rtsan] intercept fflush. 
(#121643) --- .../lib/rtsan/rtsan_interceptors_posix.cpp | 13 +++++++++ .../tests/rtsan_test_interceptors_posix.cpp | 28 +++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp index 9f89ab6bf1fc7..f1fe20b255d9c 100644 --- a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp @@ -292,6 +292,18 @@ INTERCEPTOR(int, fputs, const char *s, FILE *stream) { return REAL(fputs)(s, stream); } +INTERCEPTOR(int, fflush, FILE *stream) { + __rtsan_notify_intercepted_call("fflush"); + return REAL(fflush)(stream); +} + +#if SANITIZER_APPLE +INTERCEPTOR(int, fpurge, FILE *stream) { + __rtsan_notify_intercepted_call("fpurge"); + return REAL(fpurge)(stream); +} +#endif + INTERCEPTOR(FILE *, fdopen, int fd, const char *mode) { __rtsan_notify_intercepted_call("fdopen"); return REAL(fdopen)(fd, mode); @@ -981,6 +993,7 @@ void __rtsan::InitializeInterceptors() { RTSAN_MAYBE_INTERCEPT_CREAT64; INTERCEPT_FUNCTION(puts); INTERCEPT_FUNCTION(fputs); + INTERCEPT_FUNCTION(fflush); INTERCEPT_FUNCTION(fdopen); INTERCEPT_FUNCTION(freopen); RTSAN_MAYBE_INTERCEPT_FOPENCOOKIE; diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp index 5adbf0fb63de8..15dfc1af01625 100644 --- a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp @@ -604,6 +604,34 @@ TEST_F(RtsanOpenedFileTest, FputsDiesWhenRealtime) { ExpectNonRealtimeSurvival(Func); } +TEST_F(RtsanFileTest, FflushDiesWhenRealtime) { + FILE *f = fopen(GetTemporaryFilePath(), "w"); + EXPECT_THAT(f, Ne(nullptr)); + int written = fwrite("abc", 1, 3, f); + EXPECT_THAT(written, Eq(3)); + auto Func = [&f]() { + int res = fflush(f); + EXPECT_THAT(res, Eq(0)); + }; + ExpectRealtimeDeath(Func, "fflush"); + 
ExpectNonRealtimeSurvival(Func); +} + +#if SANITIZER_APPLE +TEST_F(RtsanFileTest, FpurgeDiesWhenRealtime) { + FILE *f = fopen(GetTemporaryFilePath(), "w"); + EXPECT_THAT(f, Ne(nullptr)); + int written = fwrite("abc", 1, 3, f); + EXPECT_THAT(written, Eq(3)); + auto Func = [&f]() { + int res = fpurge(f); + EXPECT_THAT(res, Eq(0)); + }; + ExpectRealtimeDeath(Func, "fpurge"); + ExpectNonRealtimeSurvival(Func); +} +#endif + TEST_F(RtsanOpenedFileTest, ReadDiesWhenRealtime) { auto Func = [this]() { char c{}; From afef716e839bf7dd96ebce5264779b1d316db58e Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Sat, 4 Jan 2025 21:28:59 +0100 Subject: [PATCH 438/567] [mlir][Transforms] Fix build after #116524 (part 2) (#121662) Since #116524, an integration test started to become flaky (failure rate ~15%). ``` bin/mlir-opt mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block_matmul.mlir --sparsifier="enable-arm-sve=true enable-runtime-library=false vl=2 reassociate-fp-reductions=true enable-index-optimizations=true" | mlir-cpu-runner --march=aarch64 --mattr="+sve" -e main -entry-point-result=void -shared-libs=./lib/libmlir_runner_utils.so,./lib/libmlir_c_runner_utils.so | bin/FileCheck mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block_matmul.mlir # executed command: bin/mlir-opt mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block_matmul.mlir '--sparsifier=enable-arm-sve=true enable-runtime-library=false vl=2 reassociate-fp-reductions=true enable-index-optimizations=true' # .---command stderr------------ # | mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block_matmul.mlir:71:10: error: null operand found # | %0 = linalg.generic #trait_mul # | ^ # | mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block_matmul.mlir:71:10: note: see current operation: %70 = "arith.mulf"(<>, %69) <{fastmath = #arith.fastmath}> : (<>, vector<[2]xf64>) -> vector<[2]xf64> # `----------------------------- # error: command failed with exit status: 1 ``` I traced 
the issue back to the `DenseMap mapping;` data structure: previously, some `mapping.erase(foo)` calls were unsuccessful (returning `false`), even though the `mapping` contains `foo` as a key. --- mlir/lib/Transforms/Utils/DialectConversion.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 0e577d2d39de3..48b8c727a7828 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -103,8 +103,8 @@ namespace { /// Helper class to make it possible to use `ValueVector` as a key in DenseMap. struct ValueVectorMapInfo { - static ValueVector getEmptyKey() { return ValueVector{}; } - static ValueVector getTombstoneKey() { return ValueVector{}; } + static ValueVector getEmptyKey() { return ValueVector{Value()}; } + static ValueVector getTombstoneKey() { return ValueVector{Value(), Value()}; } static ::llvm::hash_code getHashValue(const ValueVector &val) { return ::llvm::hash_combine_range(val.begin(), val.end()); } From fd38a95586477f8f60330ef723406d69b33b91f6 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 4 Jan 2025 12:31:31 -0800 Subject: [PATCH 439/567] [TargetParser] Use StringRef::split that takes a char separator instead of StringRef separator. NFC --- llvm/lib/TargetParser/Host.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index 45b4cafc99598..9d1b7b8b0e7cd 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -173,7 +173,7 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) { // Read 32 lines from /proc/cpuinfo, which should contain the CPU part line // in all cases. SmallVector Lines; - ProcCpuinfoContent.split(Lines, "\n"); + ProcCpuinfoContent.split(Lines, '\n'); // Look for the CPU implementer line. 
StringRef Implementer; @@ -436,7 +436,7 @@ StringRef sys::detail::getHostCPUNameForS390x(StringRef ProcCpuinfoContent) { // The "processor 0:" line comes after a fair amount of other information, // including a cache breakdown, but this should be plenty. SmallVector Lines; - ProcCpuinfoContent.split(Lines, "\n"); + ProcCpuinfoContent.split(Lines, '\n'); // Look for the CPU features. SmallVector CPUFeatures; @@ -478,7 +478,7 @@ StringRef sys::detail::getHostCPUNameForS390x(StringRef ProcCpuinfoContent) { StringRef sys::detail::getHostCPUNameForRISCV(StringRef ProcCpuinfoContent) { // There are 24 lines in /proc/cpuinfo SmallVector Lines; - ProcCpuinfoContent.split(Lines, "\n"); + ProcCpuinfoContent.split(Lines, '\n'); // Look for uarch line to determine cpu name StringRef UArch; @@ -1630,7 +1630,7 @@ StringRef sys::getHostCPUName() { #if defined(__linux__) StringRef sys::detail::getHostCPUNameForSPARC(StringRef ProcCpuinfoContent) { SmallVector Lines; - ProcCpuinfoContent.split(Lines, "\n"); + ProcCpuinfoContent.split(Lines, '\n'); // Look for cpu line to determine cpu name StringRef Cpu; @@ -1970,7 +1970,7 @@ const StringMap sys::getHostCPUFeatures() { return Features; SmallVector Lines; - P->getBuffer().split(Lines, "\n"); + P->getBuffer().split(Lines, '\n'); SmallVector CPUFeatures; From c56b74315f57acb1b285ddc77b07031b773549b7 Mon Sep 17 00:00:00 2001 From: Sergei Barannikov Date: Sun, 5 Jan 2025 00:11:24 +0300 Subject: [PATCH 440/567] [TableGen][GISel] Reuse `importNodeRenderer` for `OperandWithDefaultOps` (#121285) This avoids some code duplication (handling `Register`, `zero_reg` and immediate operands). 
--- .../GlobalISelEmitter/undef-tied-input.td | 17 +++- llvm/utils/TableGen/GlobalISelEmitter.cpp | 82 +++++++------------ 2 files changed, 43 insertions(+), 56 deletions(-) diff --git a/llvm/test/TableGen/GlobalISelEmitter/undef-tied-input.td b/llvm/test/TableGen/GlobalISelEmitter/undef-tied-input.td index a2ee3dc311772..323aea9e396d1 100644 --- a/llvm/test/TableGen/GlobalISelEmitter/undef-tied-input.td +++ b/llvm/test/TableGen/GlobalISelEmitter/undef-tied-input.td @@ -1,14 +1,25 @@ -// RUN: llvm-tblgen -gen-global-isel -I %p/../../../include -I %p/../Common %s | FileCheck %s +// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns \ +// RUN: -I %p/../../../include -I %p/../Common %s 2> %t | FileCheck %s +// RUN: FileCheck -check-prefix=ERR %s < %t include "llvm/Target/Target.td" include "GlobalISelEmitterCommon.td" -def undef_tied : OperandWithDefaultOps { +def undef_tied_1 : OperandWithDefaultOps { let MIOperandInfo = (ops GPR32:$inactive); } +def undef_tied_2 : OperandWithDefaultOps { + let MIOperandInfo = (ops GPR32:$inactive); +} + +let Constraints = "$opt.inactive = $rd" in +def I1 : I<(outs GPR32:$rd), (ins GPR32:$rs, undef_tied_1:$opt), + [(set GPR32:$rd, (abs i32:$rs))]>; + +// ERR: [[#@LINE+2]]:5: warning: Skipped pattern: unsupported type let Constraints = "$opt.inactive = $rd" in -def I1 : I<(outs GPR32:$rd), (ins GPR32:$rs, undef_tied:$opt), +def I2 : I<(outs GPR32:$rd), (ins GPR32:$rs, undef_tied_2:$opt), [(set GPR32:$rd, (abs i32:$rs))]>; // CHECK-LABEL: // (abs:{ *:[i32] } i32:{ *:[i32] }:$rs) => (I1:{ *:[i32] } i32:{ *:[i32] }:$rs) diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp index 092cdd4ad5b43..9f6d3a506dceb 100644 --- a/llvm/utils/TableGen/GlobalISelEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp @@ -418,7 +418,8 @@ class GlobalISelEmitter final : public GlobalISelMatchTableExecutorEmitter { const TreePatternNode &N) const; Error importLeafNodeRenderer(RuleMatcher &M, 
BuildMIAction &MIBuilder, - const TreePatternNode &N) const; + const TreePatternNode &N, + action_iterator InsertPt) const; Error importXFormNodeRenderer(RuleMatcher &M, BuildMIAction &MIBuilder, const TreePatternNode &N) const; @@ -431,9 +432,6 @@ class GlobalISelEmitter final : public GlobalISelMatchTableExecutorEmitter { const TreePatternNode &N, action_iterator &InsertPt) const; - Error importDefaultOperandRenderers(action_iterator InsertPt, RuleMatcher &M, - BuildMIAction &DstMIBuilder, - const DAGDefaultOperand &DefaultOp) const; Error importImplicitDefRenderers(BuildMIAction &DstMIBuilder, ArrayRef ImplicitDefs) const; @@ -1291,7 +1289,8 @@ Error GlobalISelEmitter::importNamedNodeRenderer( // Equivalent of MatcherGen::EmitResultLeafAsOperand. Error GlobalISelEmitter::importLeafNodeRenderer( - RuleMatcher &M, BuildMIAction &MIBuilder, const TreePatternNode &N) const { + RuleMatcher &M, BuildMIAction &MIBuilder, const TreePatternNode &N, + action_iterator InsertPt) const { if (const auto *II = dyn_cast(N.getLeafValue())) { MIBuilder.addRenderer(II->getValue()); return Error::success(); @@ -1300,11 +1299,29 @@ Error GlobalISelEmitter::importLeafNodeRenderer( if (const auto *DI = dyn_cast(N.getLeafValue())) { const Record *R = DI->getDef(); - if (R->isSubClassOf("Register")) { + if (R->isSubClassOf("Register") || R->getName() == "zero_reg") { MIBuilder.addRenderer(Target, R); return Error::success(); } + if (R->getName() == "undef_tied_input") { + std::optional OpTyOrNone = MVTToLLT(N.getSimpleType(0)); + if (!OpTyOrNone) + return failedImport("unsupported type"); + + unsigned TempRegID = M.allocateTempRegID(); + M.insertAction(InsertPt, *OpTyOrNone, TempRegID); + + auto I = M.insertAction( + InsertPt, M.allocateOutputInsnID(), + &Target.getInstruction(RK.getDef("IMPLICIT_DEF"))); + auto &ImpDefBuilder = static_cast(**I); + ImpDefBuilder.addRenderer(TempRegID, /*IsDef=*/true); + + MIBuilder.addRenderer(TempRegID); + return Error::success(); + } + if 
(R->isSubClassOf("SubRegIndex")) { const CodeGenSubRegIndex *SubRegIndex = CGRegs.getSubRegIdx(R); MIBuilder.addRenderer(SubRegIndex->EnumValue); @@ -1386,7 +1403,7 @@ Error GlobalISelEmitter::importNodeRenderer(RuleMatcher &M, return importNamedNodeRenderer(M, MIBuilder, N); if (N.isLeaf()) - return importLeafNodeRenderer(M, MIBuilder, N); + return importLeafNodeRenderer(M, MIBuilder, N, InsertPt); if (N.getOperator()->isSubClassOf("SDNodeXForm")) return importXFormNodeRenderer(M, MIBuilder, N); @@ -1707,11 +1724,11 @@ Expected GlobalISelEmitter::importExplicitUseRenderers( // This is a predicate or optional def operand which the pattern has not // overridden, or which we aren't letting it override; emit the 'default // ops' operands. - - const Record *OperandNode = DstI->Operands[InstOpNo].Rec; - if (auto Error = importDefaultOperandRenderers( - InsertPt, M, DstMIBuilder, CGP.getDefaultOperand(OperandNode))) - return std::move(Error); + for (const TreePatternNode &OpNode : + make_pointee_range(CGP.getDefaultOperand(OperandNode).DefaultOps)) { + if (Error Err = importNodeRenderer(M, DstMIBuilder, OpNode, InsertPt)) + return Err; + } ++NumDefaultOps; continue; @@ -1734,47 +1751,6 @@ Expected GlobalISelEmitter::importExplicitUseRenderers( return InsertPt; } -Error GlobalISelEmitter::importDefaultOperandRenderers( - action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder, - const DAGDefaultOperand &DefaultOp) const { - for (const auto &Op : DefaultOp.DefaultOps) { - const auto &N = *Op; - if (!N.isLeaf()) - return failedImport("Could not add default op"); - - const auto *DefaultOp = N.getLeafValue(); - - if (const DefInit *DefaultDefOp = dyn_cast(DefaultOp)) { - std::optional OpTyOrNone = MVTToLLT(N.getSimpleType(0)); - auto *Def = DefaultDefOp->getDef(); - if (Def->getName() == "undef_tied_input") { - unsigned TempRegID = M.allocateTempRegID(); - M.insertAction(InsertPt, *OpTyOrNone, - TempRegID); - InsertPt = M.insertAction( - InsertPt, 
M.allocateOutputInsnID(), - &Target.getInstruction(RK.getDef("IMPLICIT_DEF"))); - BuildMIAction &IDMIBuilder = - *static_cast(InsertPt->get()); - IDMIBuilder.addRenderer(TempRegID, /*IsDef=*/true); - DstMIBuilder.addRenderer(TempRegID); - } else { - DstMIBuilder.addRenderer(Target, Def); - } - continue; - } - - if (const IntInit *DefaultIntOp = dyn_cast(DefaultOp)) { - DstMIBuilder.addRenderer(DefaultIntOp->getValue()); - continue; - } - - return failedImport("Could not add default op"); - } - - return Error::success(); -} - Error GlobalISelEmitter::importImplicitDefRenderers( BuildMIAction &DstMIBuilder, ArrayRef ImplicitDefs) const { if (!ImplicitDefs.empty()) From f855ceeefc97220a052cc76a52a45c6907eac1f8 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Sat, 4 Jan 2025 13:04:35 -0600 Subject: [PATCH 441/567] [libc][NFC] use `__has_builtin` instead of checking macros. --- libc/shared/rpc_util.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libc/shared/rpc_util.h b/libc/shared/rpc_util.h index 9406de59f63b7..687814b7ff2ae 100644 --- a/libc/shared/rpc_util.h +++ b/libc/shared/rpc_util.h @@ -152,10 +152,10 @@ template class optional { /// Suspend the thread briefly to assist the thread scheduler during busy loops. RPC_ATTRS void sleep_briefly() { -#if defined(__NVPTX__) && defined(RPC_TARGET_IS_GPU) +#if __has_builtin(__nvvm_reflect) if (__nvvm_reflect("__CUDA_ARCH") >= 700) asm("nanosleep.u32 64;" ::: "memory"); -#elif defined(__AMDGPU__) && defined(RPC_TARGET_IS_GPU) +#elif __has_builtin(__builtin_amdgcn_s_sleep) __builtin_amdgcn_s_sleep(2); #elif __has_builtin(__builtin_ia32_pause) __builtin_ia32_pause(); From d1d400372adc9ae78d8ee9c2387b2c6b062b0dc0 Mon Sep 17 00:00:00 2001 From: David CARLIER Date: Sat, 4 Jan 2025 21:46:08 +0000 Subject: [PATCH 442/567] [compiler-rt][rtsan] fix unit tests by sanitizer-aarch64-linux report. 
(#121666) --- compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp index 15dfc1af01625..d9872c54b2614 100644 --- a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp @@ -361,10 +361,10 @@ TEST_F(RtsanFileTest, FopenCookieDieWhenRealtime) { FILE *fp; size_t read; } fh = {f, 0}; - auto CookieRead = [this](void *cookie, char *buf, size_t size) { + auto CookieRead = [](void *cookie, char *buf, size_t size) { fholder *p = reinterpret_cast(cookie); p->read = fread(static_cast(buf), 1, size, p->fp); - EXPECT_NE(0, p->read); + EXPECT_NE(0u, p->read); }; cookie_io_functions_t funcs = {(cookie_read_function_t *)&CookieRead, nullptr, nullptr, nullptr}; From 7db0a606a294bc788563b8363261efa0c13e3062 Mon Sep 17 00:00:00 2001 From: Billy Laws Date: Sat, 4 Jan 2025 21:55:12 +0000 Subject: [PATCH 443/567] [objcopy][COFF] Do not strip .rdata section with --only-keep-debug (#121653) When not in MinGW mode, the PE debug directory is placed in .rdata by the linker instead of .buildid. In addition to .buildid always explicitly preserve the section containing the debug directory to avoid causing errors later in patchDebugDirectory. 
--- llvm/lib/ObjCopy/COFF/COFFObjcopy.cpp | 10 ++- .../COFF/Inputs/i386-debug-rdata.yaml | 63 +++++++++++++++++++ .../COFF/only-keep-debug-rdata.test | 45 +++++++++++++ 3 files changed, 117 insertions(+), 1 deletion(-) create mode 100644 llvm/test/tools/llvm-objcopy/COFF/Inputs/i386-debug-rdata.yaml create mode 100644 llvm/test/tools/llvm-objcopy/COFF/only-keep-debug-rdata.test diff --git a/llvm/lib/ObjCopy/COFF/COFFObjcopy.cpp b/llvm/lib/ObjCopy/COFF/COFFObjcopy.cpp index 782d5b2f70c3e..cebcb823e6895 100644 --- a/llvm/lib/ObjCopy/COFF/COFFObjcopy.cpp +++ b/llvm/lib/ObjCopy/COFF/COFFObjcopy.cpp @@ -183,10 +183,18 @@ static Error handleArgs(const CommonConfig &Config, }); if (Config.OnlyKeepDebug) { + const data_directory *DebugDir = + Obj.DataDirectories.size() > DEBUG_DIRECTORY + ? &Obj.DataDirectories[DEBUG_DIRECTORY] + : nullptr; // For --only-keep-debug, we keep all other sections, but remove their // content. The VirtualSize field in the section header is kept intact. - Obj.truncateSections([](const Section &Sec) { + Obj.truncateSections([DebugDir](const Section &Sec) { return !isDebugSection(Sec) && Sec.Name != ".buildid" && + !(DebugDir && DebugDir->Size > 0 && + DebugDir->RelativeVirtualAddress >= Sec.Header.VirtualAddress && + DebugDir->RelativeVirtualAddress < + Sec.Header.VirtualAddress + Sec.Header.SizeOfRawData) && ((Sec.Header.Characteristics & (IMAGE_SCN_CNT_CODE | IMAGE_SCN_CNT_INITIALIZED_DATA)) != 0); }); diff --git a/llvm/test/tools/llvm-objcopy/COFF/Inputs/i386-debug-rdata.yaml b/llvm/test/tools/llvm-objcopy/COFF/Inputs/i386-debug-rdata.yaml new file mode 100644 index 0000000000000..02a6e9db19c19 --- /dev/null +++ b/llvm/test/tools/llvm-objcopy/COFF/Inputs/i386-debug-rdata.yaml @@ -0,0 +1,63 @@ +--- !COFF +OptionalHeader: + AddressOfEntryPoint: 4096 + ImageBase: 268435456 + SectionAlignment: 4096 + FileAlignment: 512 + MajorOperatingSystemVersion: 6 + MinorOperatingSystemVersion: 0 + MajorImageVersion: 0 + MinorImageVersion: 0 + 
MajorSubsystemVersion: 6 + MinorSubsystemVersion: 0 + Subsystem: IMAGE_SUBSYSTEM_WINDOWS_CUI + DLLCharacteristics: [ ] + SizeOfStackReserve: 1048576 + SizeOfStackCommit: 4096 + SizeOfHeapReserve: 1048576 + SizeOfHeapCommit: 4096 + Debug: + RelativeVirtualAddress: 8192 + Size: 28 +header: + Machine: IMAGE_FILE_MACHINE_I386 + Characteristics: [ IMAGE_FILE_EXECUTABLE_IMAGE, IMAGE_FILE_32BIT_MACHINE, IMAGE_FILE_DLL ] +sections: + - Name: .text + Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ ] + VirtualAddress: 4096 + VirtualSize: 18 + SectionData: 5589E58B45108B450C8B450831C05DC20C00 + SizeOfRawData: 512 + - Name: .rdata + Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ ] + VirtualAddress: 8192 + VirtualSize: 109 + SectionData: 000000008D6978670000000002000000510000001C2000001C060000525344538B301061671ED0994C4C44205044422E010000002F686F6D652F6D652F446F63756D656E74732F6C6C766D2D6D696E67772F6C6C766D2D70726F6A6563742F6C6C766D2F746573742E70646200 + SizeOfRawData: 512 + - Name: .debug_abbrev + Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ] + VirtualAddress: 12288 + VirtualSize: 78 + SectionData: 011101250E1305030E10171B0E110112060000022E011101120640186E0E030E3A0B3B0B2719360B49133F1900000305000218030E3A0B3B0B49130000042400030E3E0B0B0B0000050F00000000 + SizeOfRawData: 512 + - Name: .debug_info + Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ] + VirtualAddress: 16384 + VirtualSize: 116 + SectionData: 700000000400000000000401000000001D006E000000000000007500000000100010120000000200100010120000000155A5000000BC0000000101B16B00000003029108D70000000101720000000302910CD500000001016B00000003029110D30000000101720000000004CF00000005040500 + SizeOfRawData: 512 + - Name: .debug_line + Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ] + VirtualAddress: 20480 + VirtualSize: 60 + 
SectionData: 3800000004001E000000010101FB0E0D00010101010000000100000100746573742E6300000000000005020010001001053D0ABA060B2E0204000101 + SizeOfRawData: 512 + - Name: .debug_str + Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ] + VirtualAddress: 24576 + VirtualSize: 217 + SectionData: 636C616E672076657273696F6E2032302E302E30676974202868747470733A2F2F6769746875622E636F6D2F62796C6177732F6C6C766D2D70726F6A6563742E67697420393963353263306236613662396366303765383365656265393364323831333635656165383732332900746573742E63002F686F6D652F6D652F446F63756D656E74732F6C6C766D2D6D696E67772F6C6C766D2D70726F6A6563742F6C6C766D005F5F446C6C4D61696E43525453746172747570403132005F446C6C4D61696E4352545374617274757000696E7400630062006100 + SizeOfRawData: 512 +symbols: [] +... diff --git a/llvm/test/tools/llvm-objcopy/COFF/only-keep-debug-rdata.test b/llvm/test/tools/llvm-objcopy/COFF/only-keep-debug-rdata.test new file mode 100644 index 0000000000000..affd4b65009f4 --- /dev/null +++ b/llvm/test/tools/llvm-objcopy/COFF/only-keep-debug-rdata.test @@ -0,0 +1,45 @@ +RUN: yaml2obj %p/Inputs/i386-debug-rdata.yaml -o %t.in.exe + +RUN: llvm-objcopy --only-keep-debug %t.in.exe %t.out.exe +RUN: llvm-readobj --sections %t.out.exe | FileCheck %s + +Check that all non-debug/rodata (which contains the debug directory in this case) +sections with IMAGE_SCN_CNT_CODE or IMAGE_SCN_CNT_INITIALIZED_DATA are truncated, +and no others. 
+ +CHECK: Section { +CHECK-NEXT: Number: 1 +CHECK-NEXT: Name: .text (2E 74 65 78 74 00 00 00) +CHECK-NEXT: VirtualSize: 0x12 +CHECK-NEXT: VirtualAddress: 0x1000 +CHECK-NEXT: RawDataSize: 0 +CHECK: Section { +CHECK-NEXT: Number: 2 +CHECK-NEXT: Name: .rdata (2E 72 64 61 74 61 00 00) +CHECK-NEXT: VirtualSize: 0x6D +CHECK-NEXT: VirtualAddress: 0x2000 +CHECK-NEXT: RawDataSize: 512 +CHECK: Section { +CHECK-NEXT: Number: 3 +CHECK-NEXT: Name: .debug_abbrev (2F 34 00 00 00 00 00 00) +CHECK-NEXT: VirtualSize: 0x4E +CHECK-NEXT: VirtualAddress: 0x3000 +CHECK-NEXT: RawDataSize: 512 +CHECK: Section { +CHECK-NEXT: Number: 4 +CHECK-NEXT: Name: .debug_info (2F 32 39 00 00 00 00 00) +CHECK-NEXT: VirtualSize: 0x74 +CHECK-NEXT: VirtualAddress: 0x4000 +CHECK-NEXT: RawDataSize: 512 +CHECK: Section { +CHECK-NEXT: Number: 5 +CHECK-NEXT: Name: .debug_line (2F 34 31 00 00 00 00 00) +CHECK-NEXT: VirtualSize: 0x3C +CHECK-NEXT: VirtualAddress: 0x5000 +CHECK-NEXT: RawDataSize: 512 +CHECK: Section { +CHECK-NEXT: Number: 6 +CHECK-NEXT: Name: .debug_str (2F 31 38 00 00 00 00 00) +CHECK-NEXT: VirtualSize: 0xD9 +CHECK-NEXT: VirtualAddress: 0x6000 +CHECK-NEXT: RawDataSize: 512 From 7a761100960c0c9e2b2fa8a9ee233b137270bd73 Mon Sep 17 00:00:00 2001 From: Zhengxing li Date: Sat, 4 Jan 2025 14:02:39 -0800 Subject: [PATCH 444/567] [HLSL][SPIR-V] implement SV_GroupID semantic lowering (#121521) The HLSL SV_GroupID semantic attribute is lowered into @llvm.spv.group.id intrinsic in LLVM IR for SPIR-V target. In the SPIR-V backend, this is now translated to a `WorkgroupId` builtin variable. 
Fixes #118700 which's a follow-up work to #70120 --- clang/lib/CodeGen/CGHLSLRuntime.cpp | 2 +- clang/lib/CodeGen/CGHLSLRuntime.h | 1 + .../CodeGenHLSL/semantics/SV_GroupID.hlsl | 34 ++++++------ llvm/include/llvm/IR/IntrinsicsSPIRV.td | 1 + .../Target/SPIRV/SPIRVInstructionSelector.cpp | 8 +++ .../SPIRV/hlsl-intrinsics/SV_GroupID.ll | 52 +++++++++++++++++++ 6 files changed, 82 insertions(+), 16 deletions(-) create mode 100644 llvm/test/CodeGen/SPIRV/hlsl-intrinsics/SV_GroupID.ll diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp index c354e58e15f4b..5679bd7158179 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.cpp +++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp @@ -395,7 +395,7 @@ llvm::Value *CGHLSLRuntime::emitInputSemantic(IRBuilder<> &B, return buildVectorInput(B, GroupThreadIDIntrinsic, Ty); } if (D.hasAttr()) { - llvm::Function *GroupIDIntrinsic = CGM.getIntrinsic(Intrinsic::dx_group_id); + llvm::Function *GroupIDIntrinsic = CGM.getIntrinsic(getGroupIdIntrinsic()); return buildVectorInput(B, GroupIDIntrinsic, Ty); } assert(false && "Unhandled parameter attribute"); diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h index edb87f9d5efdf..3d5724118611c 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.h +++ b/clang/lib/CodeGen/CGHLSLRuntime.h @@ -87,6 +87,7 @@ class CGHLSLRuntime { GENERATE_HLSL_INTRINSIC_FUNCTION(Radians, radians) GENERATE_HLSL_INTRINSIC_FUNCTION(ThreadId, thread_id) GENERATE_HLSL_INTRINSIC_FUNCTION(GroupThreadId, thread_id_in_group) + GENERATE_HLSL_INTRINSIC_FUNCTION(GroupId, group_id) GENERATE_HLSL_INTRINSIC_FUNCTION(FDot, fdot) GENERATE_HLSL_INTRINSIC_FUNCTION(SDot, sdot) GENERATE_HLSL_INTRINSIC_FUNCTION(UDot, udot) diff --git a/clang/test/CodeGenHLSL/semantics/SV_GroupID.hlsl b/clang/test/CodeGenHLSL/semantics/SV_GroupID.hlsl index 5e09f0fe06d4e..3aa054afc9045 100644 --- a/clang/test/CodeGenHLSL/semantics/SV_GroupID.hlsl +++ b/clang/test/CodeGenHLSL/semantics/SV_GroupID.hlsl @@ -1,32 
+1,36 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL -DTARGET=dx +// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV -DTARGET=spv -// Make sure SV_GroupID translated into dx.group.id. +// Make sure SV_GroupID translated into dx.group.id for directx target and spv.group.id for spirv target. -// CHECK: define void @foo() -// CHECK: %[[#ID:]] = call i32 @llvm.dx.group.id(i32 0) -// CHECK: call void @{{.*}}foo{{.*}}(i32 %[[#ID]]) +// CHECK: define void @foo() +// CHECK: %[[#ID:]] = call i32 @llvm.[[TARGET]].group.id(i32 0) +// CHECK-DXIL: call void @{{.*}}foo{{.*}}(i32 %[[#ID]]) +// CHECK-SPIRV: call spir_func void @{{.*}}foo{{.*}}(i32 %[[#ID]]) [shader("compute")] [numthreads(8,8,1)] void foo(uint Idx : SV_GroupID) {} -// CHECK: define void @bar() -// CHECK: %[[#ID_X:]] = call i32 @llvm.dx.group.id(i32 0) -// CHECK: %[[#ID_X_:]] = insertelement <2 x i32> poison, i32 %[[#ID_X]], i64 0 -// CHECK: %[[#ID_Y:]] = call i32 @llvm.dx.group.id(i32 1) -// CHECK: %[[#ID_XY:]] = insertelement <2 x i32> %[[#ID_X_]], i32 %[[#ID_Y]], i64 1 -// CHECK: call void @{{.*}}bar{{.*}}(<2 x i32> %[[#ID_XY]]) +// CHECK: define void @bar() +// CHECK: %[[#ID_X:]] = call i32 @llvm.[[TARGET]].group.id(i32 0) +// CHECK: %[[#ID_X_:]] = insertelement <2 x i32> poison, i32 %[[#ID_X]], i64 0 +// CHECK: %[[#ID_Y:]] = call i32 @llvm.[[TARGET]].group.id(i32 1) +// CHECK: %[[#ID_XY:]] = insertelement <2 x i32> %[[#ID_X_]], i32 %[[#ID_Y]], i64 1 +// CHECK-DXIL: call void @{{.*}}bar{{.*}}(<2 x i32> %[[#ID_XY]]) +// CHECK-SPIRV: call spir_func void @{{.*}}bar{{.*}}(<2 x i32> %[[#ID_XY]]) 
[shader("compute")] [numthreads(8,8,1)] void bar(uint2 Idx : SV_GroupID) {} // CHECK: define void @test() -// CHECK: %[[#ID_X:]] = call i32 @llvm.dx.group.id(i32 0) +// CHECK: %[[#ID_X:]] = call i32 @llvm.[[TARGET]].group.id(i32 0) // CHECK: %[[#ID_X_:]] = insertelement <3 x i32> poison, i32 %[[#ID_X]], i64 0 -// CHECK: %[[#ID_Y:]] = call i32 @llvm.dx.group.id(i32 1) +// CHECK: %[[#ID_Y:]] = call i32 @llvm.[[TARGET]].group.id(i32 1) // CHECK: %[[#ID_XY:]] = insertelement <3 x i32> %[[#ID_X_]], i32 %[[#ID_Y]], i64 1 -// CHECK: %[[#ID_Z:]] = call i32 @llvm.dx.group.id(i32 2) +// CHECK: %[[#ID_Z:]] = call i32 @llvm.[[TARGET]].group.id(i32 2) // CHECK: %[[#ID_XYZ:]] = insertelement <3 x i32> %[[#ID_XY]], i32 %[[#ID_Z]], i64 2 -// CHECK: call void @{{.*}}test{{.*}}(<3 x i32> %[[#ID_XYZ]]) +// CHECK-DXIL: call void @{{.*}}test{{.*}}(<3 x i32> %[[#ID_XYZ]]) +// CHECK-SPIRV: call spir_func void @{{.*}}test{{.*}}(<3 x i32> %[[#ID_XYZ]]) [shader("compute")] [numthreads(8,8,1)] void test(uint3 Idx : SV_GroupID) {} diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td index bcff0f20b985d..8ebce408ff138 100644 --- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td +++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td @@ -59,6 +59,7 @@ let TargetPrefix = "spv" in { // The following intrinsic(s) are mirrored from IntrinsicsDirectX.td for HLSL support. 
def int_spv_thread_id : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrWillReturn]>; + def int_spv_group_id : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrWillReturn]>; def int_spv_thread_id_in_group : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrWillReturn]>; def int_spv_all : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_any_ty], [IntrNoMem]>; def int_spv_any : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_any_ty], [IntrNoMem]>; diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 289d5f3166487..0fa0986a10c69 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -2881,6 +2881,14 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, // translated to a `LocalInvocationId` builtin variable return loadVec3BuiltinInputID(SPIRV::BuiltIn::LocalInvocationId, ResVReg, ResType, I); + case Intrinsic::spv_group_id: + // The HLSL SV_GroupId semantic is lowered to + // llvm.spv.group.id intrinsic in LLVM IR for SPIR-V backend. 
+ // + // In SPIR-V backend, llvm.spv.group.id is now translated to a `WorkgroupId` + // builtin variable + return loadVec3BuiltinInputID(SPIRV::BuiltIn::WorkgroupId, ResVReg, ResType, + I); case Intrinsic::spv_fdot: return selectFloatDot(ResVReg, ResType, I); case Intrinsic::spv_udot: diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/SV_GroupID.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/SV_GroupID.ll new file mode 100644 index 0000000000000..92947f7865ced --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/SV_GroupID.ll @@ -0,0 +1,52 @@ +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-vulkan-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-vulkan-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: %[[#int:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#v3int:]] = OpTypeVector %[[#int]] 3 +; CHECK-DAG: %[[#ptr_Input_v3int:]] = OpTypePointer Input %[[#v3int]] +; CHECK-DAG: %[[#tempvar:]] = OpUndef %[[#v3int]] +; CHECK-DAG: %[[#WorkgroupId:]] = OpVariable %[[#ptr_Input_v3int]] Input + +; CHECK-DAG: OpEntryPoint GLCompute {{.*}} %[[#WorkgroupId]] +; CHECK-DAG: OpName %[[#WorkgroupId]] "__spirv_BuiltInWorkgroupId" +; CHECK-DAG: OpDecorate %[[#WorkgroupId]] LinkageAttributes "__spirv_BuiltInWorkgroupId" Import +; CHECK-DAG: OpDecorate %[[#WorkgroupId]] BuiltIn WorkgroupId + +target triple = "spirv-unknown-vulkan-library" + +declare void @group_id_user(<3 x i32>) + +; Function Attrs: convergent noinline norecurse +define void @main() #1 { +entry: + +; CHECK: %[[#load:]] = OpLoad %[[#v3int]] %[[#WorkgroupId]] +; CHECK: %[[#load0:]] = OpCompositeExtract %[[#int]] %[[#load]] 0 + %1 = call i32 @llvm.spv.group.id(i32 0) + +; CHECK: %[[#tempvar:]] = OpCompositeInsert %[[#v3int]] %[[#load0]] %[[#tempvar]] + %2 = insertelement <3 x i32> poison, i32 %1, i64 0 + +; CHECK: %[[#load:]] = OpLoad %[[#v3int]] %[[#WorkgroupId]] +; CHECK: %[[#load1:]] = OpCompositeExtract %[[#int]] %[[#load]] 1 + %3 = call i32 @llvm.spv.group.id(i32 1) + +; 
CHECK: %[[#tempvar:]] = OpCompositeInsert %[[#v3int]] %[[#load1]] %[[#tempvar]] 1 + %4 = insertelement <3 x i32> %2, i32 %3, i64 1 + +; CHECK: %[[#load:]] = OpLoad %[[#v3int]] %[[#WorkgroupId]] +; CHECK: %[[#load2:]] = OpCompositeExtract %[[#int]] %[[#load]] 2 + %5 = call i32 @llvm.spv.group.id(i32 2) + +; CHECK: %[[#tempvar:]] = OpCompositeInsert %[[#v3int]] %[[#load2]] %[[#tempvar]] 2 + %6 = insertelement <3 x i32> %4, i32 %5, i64 2 + + call spir_func void @group_id_user(<3 x i32> %6) + ret void +} + +; Function Attrs: nounwind willreturn memory(none) +declare i32 @llvm.spv.group.id(i32) #3 + +attributes #1 = { convergent noinline norecurse "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +attributes #3 = { nounwind willreturn memory(none) } From a738d81cd2822698539b0482af48d49d91ea5a2e Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Sat, 4 Jan 2025 23:10:41 +0100 Subject: [PATCH 445/567] [libc] Improve qsort (with build fix) (#121482) --- libc/fuzzing/stdlib/CMakeLists.txt | 6 +- libc/fuzzing/stdlib/heap_sort_fuzz.cpp | 29 ++- .../{qsort_fuzz.cpp => quick_sort_fuzz.cpp} | 29 ++- libc/src/stdlib/heap_sort.h | 12 +- libc/src/stdlib/qsort.cpp | 10 +- libc/src/stdlib/qsort_data.h | 171 +++++++++------ libc/src/stdlib/qsort_pivot.h | 85 ++++++++ libc/src/stdlib/qsort_r.cpp | 11 +- libc/src/stdlib/qsort_util.h | 47 +++- libc/src/stdlib/quick_sort.h | 203 +++++++++++++----- libc/test/src/stdlib/CMakeLists.txt | 18 +- libc/test/src/stdlib/SortingTest.h | 199 +++++++++-------- libc/test/src/stdlib/heap_sort_test.cpp | 18 +- libc/test/src/stdlib/qsort_r_test.cpp | 4 +- libc/test/src/stdlib/qsort_test.cpp | 17 -- libc/test/src/stdlib/quick_sort_test.cpp | 19 +- .../libc/test/src/stdlib/BUILD.bazel | 16 +- 17 files changed, 569 insertions(+), 325 deletions(-) rename libc/fuzzing/stdlib/{qsort_fuzz.cpp => quick_sort_fuzz.cpp} (62%) create mode 100644 libc/src/stdlib/qsort_pivot.h delete mode 100644 
libc/test/src/stdlib/qsort_test.cpp diff --git a/libc/fuzzing/stdlib/CMakeLists.txt b/libc/fuzzing/stdlib/CMakeLists.txt index 9b3298cfc55a7..3dbd640a67dbd 100644 --- a/libc/fuzzing/stdlib/CMakeLists.txt +++ b/libc/fuzzing/stdlib/CMakeLists.txt @@ -1,9 +1,9 @@ add_libc_fuzzer( - qsort_fuzz + quick_sort_fuzz SRCS - qsort_fuzz.cpp + quick_sort_fuzz.cpp DEPENDS - libc.src.stdlib.qsort + libc.src.stdlib.qsort_util ) add_libc_fuzzer( diff --git a/libc/fuzzing/stdlib/heap_sort_fuzz.cpp b/libc/fuzzing/stdlib/heap_sort_fuzz.cpp index 876c5f9975d4d..6b00306ec7dc1 100644 --- a/libc/fuzzing/stdlib/heap_sort_fuzz.cpp +++ b/libc/fuzzing/stdlib/heap_sort_fuzz.cpp @@ -10,21 +10,10 @@ /// //===----------------------------------------------------------------------===// -#include "src/stdlib/heap_sort.h" +#include "src/stdlib/qsort_util.h" #include -static int int_compare(const void *l, const void *r) { - int li = *reinterpret_cast(l); - int ri = *reinterpret_cast(r); - if (li == ri) - return 0; - if (li > ri) - return 1; - return -1; -} - extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { - const size_t array_size = size / sizeof(int); if (array_size == 0) return 0; @@ -34,14 +23,22 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { for (size_t i = 0; i < array_size; ++i) array[i] = data_as_int[i]; - auto arr = LIBC_NAMESPACE::internal::Array( - reinterpret_cast(array), array_size, sizeof(int), int_compare); + const auto is_less = [](const void *a_ptr, + const void *b_ptr) noexcept -> bool { + const int &a = *static_cast(a_ptr); + const int &b = *static_cast(b_ptr); + + return a < b; + }; - LIBC_NAMESPACE::internal::heap_sort(arr); + constexpr bool USE_QUICKSORT = false; + LIBC_NAMESPACE::internal::unstable_sort_impl( + array, array_size, sizeof(int), is_less); - for (size_t i = 0; i < array_size - 1; ++i) + for (size_t i = 0; i < array_size - 1; ++i) { if (array[i] > array[i + 1]) __builtin_trap(); + } delete[] array; return 0; diff 
--git a/libc/fuzzing/stdlib/qsort_fuzz.cpp b/libc/fuzzing/stdlib/quick_sort_fuzz.cpp similarity index 62% rename from libc/fuzzing/stdlib/qsort_fuzz.cpp rename to libc/fuzzing/stdlib/quick_sort_fuzz.cpp index 5d5053cff5c58..6371e851d2fc3 100644 --- a/libc/fuzzing/stdlib/qsort_fuzz.cpp +++ b/libc/fuzzing/stdlib/quick_sort_fuzz.cpp @@ -1,4 +1,4 @@ -//===-- qsort_fuzz.cpp ----------------------------------------------------===// +//===-- quick_sort_fuzz.cpp------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,24 +6,13 @@ // //===----------------------------------------------------------------------===// /// -/// Fuzzing test for llvm-libc qsort implementation. +/// Fuzzing test for llvm-libc quick_sort implementation. /// //===----------------------------------------------------------------------===// -#include "src/stdlib/qsort.h" +#include "src/stdlib/qsort_util.h" #include -static int int_compare(const void *l, const void *r) { - int li = *reinterpret_cast(l); - int ri = *reinterpret_cast(r); - if (li == ri) - return 0; - else if (li > ri) - return 1; - else - return -1; -} - extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { const size_t array_size = size / sizeof(int); if (array_size == 0) @@ -34,7 +23,17 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { for (size_t i = 0; i < array_size; ++i) array[i] = data_as_int[i]; - LIBC_NAMESPACE::qsort(array, array_size, sizeof(int), int_compare); + const auto is_less = [](const void *a_ptr, + const void *b_ptr) noexcept -> bool { + const int &a = *static_cast(a_ptr); + const int &b = *static_cast(b_ptr); + + return a < b; + }; + + constexpr bool USE_QUICKSORT = true; + LIBC_NAMESPACE::internal::unstable_sort_impl( + array, array_size, sizeof(int), is_less); for (size_t i = 0; i < array_size - 1; ++i) { if (array[i] > 
array[i + 1]) diff --git a/libc/src/stdlib/heap_sort.h b/libc/src/stdlib/heap_sort.h index ccb9ec5f82149..b9699776df89c 100644 --- a/libc/src/stdlib/heap_sort.h +++ b/libc/src/stdlib/heap_sort.h @@ -18,11 +18,12 @@ namespace internal { // A simple in-place heapsort implementation. // Follow the implementation in https://en.wikipedia.org/wiki/Heapsort. -LIBC_INLINE void heap_sort(const Array &array) { - size_t end = array.size(); +template +LIBC_INLINE void heap_sort(const A &array, const F &is_less) { + size_t end = array.len(); size_t start = end / 2; - auto left_child = [](size_t i) -> size_t { return 2 * i + 1; }; + const auto left_child = [](size_t i) -> size_t { return 2 * i + 1; }; while (end > 1) { if (start > 0) { @@ -40,12 +41,11 @@ LIBC_INLINE void heap_sort(const Array &array) { while (left_child(root) < end) { size_t child = left_child(root); // If there are two children, set child to the greater. - if (child + 1 < end && - array.elem_compare(child, array.get(child + 1)) < 0) + if ((child + 1 < end) && is_less(array.get(child), array.get(child + 1))) ++child; // If the root is less than the greater child - if (array.elem_compare(root, array.get(child)) >= 0) + if (!is_less(array.get(root), array.get(child))) break; // Swap the root with the greater child and continue sifting down. 
diff --git a/libc/src/stdlib/qsort.cpp b/libc/src/stdlib/qsort.cpp index 65a63c239f5c0..0bf5fc7980527 100644 --- a/libc/src/stdlib/qsort.cpp +++ b/libc/src/stdlib/qsort.cpp @@ -18,14 +18,12 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(void, qsort, (void *array, size_t array_size, size_t elem_size, int (*compare)(const void *, const void *))) { - if (array == nullptr || array_size == 0 || elem_size == 0) - return; - internal::Comparator c(compare); - auto arr = internal::Array(reinterpret_cast(array), array_size, - elem_size, c); + const auto is_less = [compare](const void *a, const void *b) -> bool { + return compare(a, b) < 0; + }; - internal::sort(arr); + internal::unstable_sort(array, array_size, elem_size, is_less); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/qsort_data.h b/libc/src/stdlib/qsort_data.h index c529d55ca46ff..aa6d9bbc123de 100644 --- a/libc/src/stdlib/qsort_data.h +++ b/libc/src/stdlib/qsort_data.h @@ -17,91 +17,122 @@ namespace LIBC_NAMESPACE_DECL { namespace internal { -using Compare = int(const void *, const void *); -using CompareWithState = int(const void *, const void *, void *); - -enum class CompType { COMPARE, COMPARE_WITH_STATE }; - -struct Comparator { - union { - Compare *comp_func; - CompareWithState *comp_func_r; - }; - const CompType comp_type; - - void *arg; - - Comparator(Compare *func) - : comp_func(func), comp_type(CompType::COMPARE), arg(nullptr) {} - - Comparator(CompareWithState *func, void *arg_val) - : comp_func_r(func), comp_type(CompType::COMPARE_WITH_STATE), - arg(arg_val) {} - -#if defined(__clang__) - // Recent upstream changes to -fsanitize=function find more instances of - // function type mismatches. One case is with the comparator passed to this - // class. Libraries will tend to pass comparators that take pointers to - // varying types while this comparator expects to accept const void pointers. 
- // Ideally those tools would pass a function that strictly accepts const - // void*s to avoid UB, or would use qsort_r to pass their own comparator. - [[clang::no_sanitize("function")]] -#endif - int comp_vals(const void *a, const void *b) const { - if (comp_type == CompType::COMPARE) { - return comp_func(a, b); - } else { - return comp_func_r(a, b, arg); +class ArrayGenericSize { + cpp::byte *array_base; + size_t array_len; + size_t elem_size; + + LIBC_INLINE cpp::byte *get_internal(size_t i) const { + return array_base + (i * elem_size); + } + +public: + LIBC_INLINE ArrayGenericSize(void *a, size_t s, size_t e) + : array_base(reinterpret_cast(a)), array_len(s), + elem_size(e) {} + + static constexpr bool has_fixed_size() { return false; } + + LIBC_INLINE void *get(size_t i) const { return get_internal(i); } + + LIBC_INLINE void swap(size_t i, size_t j) const { + // It's possible to use 8 byte blocks with `uint64_t`, but that + // generates more machine code as the remainder loop gets + // unrolled, plus 4 byte operations are more likely to be + // efficient on a wider variety of hardware. On x86 LLVM tends + // to unroll the block loop again into 2 16 byte swaps per + // iteration which is another reason that 4 byte blocks yields + // good performance even for big types. 
+ using block_t = uint32_t; + constexpr size_t BLOCK_SIZE = sizeof(block_t); + + alignas(block_t) cpp::byte tmp_block[BLOCK_SIZE]; + + cpp::byte *elem_i = get_internal(i); + cpp::byte *elem_j = get_internal(j); + + const size_t elem_size_rem = elem_size % BLOCK_SIZE; + const cpp::byte *elem_i_block_end = elem_i + (elem_size - elem_size_rem); + + while (elem_i != elem_i_block_end) { + __builtin_memcpy(tmp_block, elem_i, BLOCK_SIZE); + __builtin_memcpy(elem_i, elem_j, BLOCK_SIZE); + __builtin_memcpy(elem_j, tmp_block, BLOCK_SIZE); + + elem_i += BLOCK_SIZE; + elem_j += BLOCK_SIZE; + } + + for (size_t n = 0; n < elem_size_rem; ++n) { + cpp::byte tmp = elem_i[n]; + elem_i[n] = elem_j[n]; + elem_j[n] = tmp; } } + + LIBC_INLINE size_t len() const { return array_len; } + + // Make an Array starting at index |i| and length |s|. + LIBC_INLINE ArrayGenericSize make_array(size_t i, size_t s) const { + return ArrayGenericSize(get_internal(i), s, elem_size); + } + + // Reset this Array to point at a different interval of the same + // items starting at index |i|. + LIBC_INLINE void reset_bounds(size_t i, size_t s) { + array_base = get_internal(i); + array_len = s; + } }; -class Array { - uint8_t *array; - size_t array_size; - size_t elem_size; - Comparator compare; +// Having a specialized Array type for sorting that knows at +// compile-time what the size of the element is, allows for much more +// efficient swapping and for cheaper offset calculations. 
+template class ArrayFixedSize { + cpp::byte *array_base; + size_t array_len; -public: - Array(uint8_t *a, size_t s, size_t e, Comparator c) - : array(a), array_size(s), elem_size(e), compare(c) {} - - uint8_t *get(size_t i) const { return array + i * elem_size; } - - void swap(size_t i, size_t j) const { - uint8_t *elem_i = get(i); - uint8_t *elem_j = get(j); - for (size_t b = 0; b < elem_size; ++b) { - uint8_t temp = elem_i[b]; - elem_i[b] = elem_j[b]; - elem_j[b] = temp; - } + LIBC_INLINE cpp::byte *get_internal(size_t i) const { + return array_base + (i * ELEM_SIZE); } - int elem_compare(size_t i, const uint8_t *other) const { - // An element must compare equal to itself so we don't need to consult the - // user provided comparator. - if (get(i) == other) - return 0; - return compare.comp_vals(get(i), other); +public: + LIBC_INLINE ArrayFixedSize(void *a, size_t s) + : array_base(reinterpret_cast(a)), array_len(s) {} + + // Beware this function is used a heuristic for cheap to swap types, so + // instantiating `ArrayFixedSize` with `ELEM_SIZE > 100` is probably a bad + // idea perf wise. + static constexpr bool has_fixed_size() { return true; } + + LIBC_INLINE void *get(size_t i) const { return get_internal(i); } + + LIBC_INLINE void swap(size_t i, size_t j) const { + alignas(32) cpp::byte tmp[ELEM_SIZE]; + + cpp::byte *elem_i = get_internal(i); + cpp::byte *elem_j = get_internal(j); + + __builtin_memcpy(tmp, elem_i, ELEM_SIZE); + __builtin_memmove(elem_i, elem_j, ELEM_SIZE); + __builtin_memcpy(elem_j, tmp, ELEM_SIZE); } - size_t size() const { return array_size; } + LIBC_INLINE size_t len() const { return array_len; } - // Make an Array starting at index |i| and size |s|. - LIBC_INLINE Array make_array(size_t i, size_t s) const { - return Array(get(i), s, elem_size, compare); + // Make an Array starting at index |i| and length |s|. 
+ LIBC_INLINE ArrayFixedSize make_array(size_t i, size_t s) const { + return ArrayFixedSize(get_internal(i), s); } - // Reset this Array to point at a different interval of the same items. - LIBC_INLINE void reset_bounds(uint8_t *a, size_t s) { - array = a; - array_size = s; + // Reset this Array to point at a different interval of the same + // items starting at index |i|. + LIBC_INLINE void reset_bounds(size_t i, size_t s) { + array_base = get_internal(i); + array_len = s; } }; -using SortingRoutine = void(const Array &); - } // namespace internal } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/qsort_pivot.h b/libc/src/stdlib/qsort_pivot.h new file mode 100644 index 0000000000000..b7e1b4294f6d6 --- /dev/null +++ b/libc/src/stdlib/qsort_pivot.h @@ -0,0 +1,85 @@ +//===-- Implementation header for qsort utilities ---------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDLIB_QSORT_PIVOT_H +#define LLVM_LIBC_SRC_STDLIB_QSORT_PIVOT_H + +#include + +namespace LIBC_NAMESPACE_DECL { +namespace internal { + +// Recursively select a pseudomedian if above this threshold. +constexpr size_t PSEUDO_MEDIAN_REC_THRESHOLD = 64; + +// Selects a pivot from `array`. Algorithm taken from glidesort by Orson Peters. +// +// This chooses a pivot by sampling an adaptive amount of points, approximating +// the quality of a median of sqrt(n) elements. 
+template +size_t choose_pivot(const A &array, const F &is_less) { + const size_t len = array.len(); + + if (len < 8) { + return 0; + } + + const size_t len_div_8 = len / 8; + + const size_t a = 0; // [0, floor(n/8)) + const size_t b = len_div_8 * 4; // [4*floor(n/8), 5*floor(n/8)) + const size_t c = len_div_8 * 7; // [7*floor(n/8), 8*floor(n/8)) + + if (len < PSEUDO_MEDIAN_REC_THRESHOLD) + return median3(array, a, b, c, is_less); + else + return median3_rec(array, a, b, c, len_div_8, is_less); +} + +// Calculates an approximate median of 3 elements from sections a, b, c, or +// recursively from an approximation of each, if they're large enough. By +// dividing the size of each section by 8 when recursing we have logarithmic +// recursion depth and overall sample from f(n) = 3*f(n/8) -> f(n) = +// O(n^(log(3)/log(8))) ~= O(n^0.528) elements. +template +size_t median3_rec(const A &array, size_t a, size_t b, size_t c, size_t n, + const F &is_less) { + if (n * 8 >= PSEUDO_MEDIAN_REC_THRESHOLD) { + const size_t n8 = n / 8; + a = median3_rec(array, a, a + (n8 * 4), a + (n8 * 7), n8, is_less); + b = median3_rec(array, b, b + (n8 * 4), b + (n8 * 7), n8, is_less); + c = median3_rec(array, c, c + (n8 * 4), c + (n8 * 7), n8, is_less); + } + return median3(array, a, b, c, is_less); +} + +/// Calculates the median of 3 elements. +template +size_t median3(const A &array, size_t a, size_t b, size_t c, const F &is_less) { + const void *a_ptr = array.get(a); + const void *b_ptr = array.get(b); + const void *c_ptr = array.get(c); + + const bool x = is_less(a_ptr, b_ptr); + const bool y = is_less(a_ptr, c_ptr); + if (x == y) { + // If x=y=0 then b, c <= a. In this case we want to return max(b, c). + // If x=y=1 then a < b, c. In this case we want to return min(b, c). + // By toggling the outcome of b < c using XOR x we get this behavior. + const bool z = is_less(b_ptr, c_ptr); + return z ^ x ? c : b; + } else { + // Either c <= a < b or b <= a < c, thus a is our median. 
+ return a; + } +} + +} // namespace internal +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDLIB_QSORT_PIVOT_H diff --git a/libc/src/stdlib/qsort_r.cpp b/libc/src/stdlib/qsort_r.cpp index bf61a40e84734..4e60998b6a6df 100644 --- a/libc/src/stdlib/qsort_r.cpp +++ b/libc/src/stdlib/qsort_r.cpp @@ -19,13 +19,12 @@ LLVM_LIBC_FUNCTION(void, qsort_r, (void *array, size_t array_size, size_t elem_size, int (*compare)(const void *, const void *, void *), void *arg)) { - if (array == nullptr || array_size == 0 || elem_size == 0) - return; - internal::Comparator c(compare, arg); - auto arr = internal::Array(reinterpret_cast(array), array_size, - elem_size, c); - internal::sort(arr); + const auto is_less = [compare, arg](const void *a, const void *b) -> bool { + return compare(a, b, arg) < 0; + }; + + internal::unstable_sort(array, array_size, elem_size, is_less); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/qsort_util.h b/libc/src/stdlib/qsort_util.h index d42adde06d976..7882b829d3274 100644 --- a/libc/src/stdlib/qsort_util.h +++ b/libc/src/stdlib/qsort_util.h @@ -27,11 +27,48 @@ namespace LIBC_NAMESPACE_DECL { namespace internal { -#if LIBC_QSORT_IMPL == LIBC_QSORT_QUICK_SORT -constexpr auto sort = quick_sort; -#elif LIBC_QSORT_IMPL == LIBC_QSORT_HEAP_SORT -constexpr auto sort = heap_sort; -#endif +template +LIBC_INLINE void unstable_sort_impl(void *array, size_t array_len, + size_t elem_size, const F &is_less) { + if (array == nullptr || array_len == 0 || elem_size == 0) + return; + + if constexpr (USE_QUICKSORT) { + switch (elem_size) { + case 4: { + auto arr_fixed_size = internal::ArrayFixedSize<4>(array, array_len); + quick_sort(arr_fixed_size, is_less); + return; + } + case 8: { + auto arr_fixed_size = internal::ArrayFixedSize<8>(array, array_len); + quick_sort(arr_fixed_size, is_less); + return; + } + case 16: { + auto arr_fixed_size = internal::ArrayFixedSize<16>(array, array_len); + quick_sort(arr_fixed_size, is_less); + return; 
+ } + default: + auto arr_generic_size = + internal::ArrayGenericSize(array, array_len, elem_size); + quick_sort(arr_generic_size, is_less); + return; + } + } else { + auto arr_generic_size = + internal::ArrayGenericSize(array, array_len, elem_size); + heap_sort(arr_generic_size, is_less); + } +} + +template +LIBC_INLINE void unstable_sort(void *array, size_t array_len, size_t elem_size, + const F &is_less) { +#define USE_QUICK_SORT ((LIBC_QSORT_IMPL) == (LIBC_QSORT_QUICK_SORT)) + unstable_sort_impl(array, array_len, elem_size, is_less); +} } // namespace internal } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/quick_sort.h b/libc/src/stdlib/quick_sort.h index 82b90a7d511d9..9ab2830250018 100644 --- a/libc/src/stdlib/quick_sort.h +++ b/libc/src/stdlib/quick_sort.h @@ -9,84 +9,175 @@ #ifndef LLVM_LIBC_SRC_STDLIB_QUICK_SORT_H #define LLVM_LIBC_SRC_STDLIB_QUICK_SORT_H -#include "src/__support/macros/attributes.h" +#include "src/__support/CPP/bit.h" +#include "src/__support/CPP/cstddef.h" #include "src/__support/macros/config.h" -#include "src/stdlib/qsort_data.h" +#include "src/stdlib/qsort_pivot.h" #include namespace LIBC_NAMESPACE_DECL { namespace internal { -// A simple quicksort implementation using the Hoare partition scheme. -LIBC_INLINE size_t partition(const Array &array) { - const size_t array_size = array.size(); - size_t pivot_index = array_size / 2; - uint8_t *pivot = array.get(pivot_index); - size_t i = 0; - size_t j = array_size - 1; +// Branchless Lomuto partition based on the implementation by Lukas +// Bergdoll and Orson Peters +// https://github.com/Voultapher/sort-research-rs/blob/main/writeup/lomcyc_partition/text.md. +// Simplified to avoid having to stack allocate. 
+template +LIBC_INLINE size_t partition_lomuto_branchless(const A &array, + const void *pivot, + const F &is_less) { + const size_t array_len = array.len(); + + size_t left = 0; + size_t right = 0; + + while (right < array_len) { + const bool right_is_lt = is_less(array.get(right), pivot); + array.swap(left, right); + left += static_cast(right_is_lt); + right += 1; + } + + return left; +} + +// Optimized for large types that are expensive to move. Not optimized +// for integers. It's possible to use a cyclic permutation here for +// large types as done in ipnsort but the advantages of this are limited +// as `is_less` is a small wrapper around a call to a function pointer +// and won't incur much binary-size overhead. The other reason to use +// cyclic permutation is to have more efficient swapping, but we don't +// know the element size so this isn't applicable here either. +template +LIBC_INLINE size_t partition_hoare_branchy(const A &array, const void *pivot, + const F &is_less) { + const size_t array_len = array.len(); + + size_t left = 0; + size_t right = array_len; while (true) { - int compare_i, compare_j; - - while ((compare_i = array.elem_compare(i, pivot)) < 0) - ++i; - while ((compare_j = array.elem_compare(j, pivot)) > 0) - --j; - - // At some point i will crossover j so we will definitely break out of - // this while loop. - if (i >= j) - return j + 1; - - array.swap(i, j); - - // The pivot itself might have got swapped so we will update the pivot. - if (i == pivot_index) { - pivot = array.get(j); - pivot_index = j; - } else if (j == pivot_index) { - pivot = array.get(i); - pivot_index = i; + while (left < right && is_less(array.get(left), pivot)) + ++left; + + while (true) { + --right; + if (left >= right || is_less(array.get(right), pivot)) { + break; + } } - if (compare_i == 0 && compare_j == 0) { - // If we do not move the pointers, we will end up with an - // infinite loop as i and j will be stuck without advancing. 
- ++i; - --j; - } + if (left >= right) + break; + + array.swap(left, right); + ++left; + } + + return left; +} + +template +LIBC_INLINE size_t partition(const A &array, size_t pivot_index, + const F &is_less) { + // Place the pivot at the beginning of the array. + if (pivot_index != 0) { + array.swap(0, pivot_index); } + + const A array_without_pivot = array.make_array(1, array.len() - 1); + const void *pivot = array.get(0); + + size_t num_lt; + if constexpr (A::has_fixed_size()) { + // Branchless Lomuto avoid branch misprediction penalties, but + // it also swaps more often which is only faster if the swap is a fast + // constant operation. + num_lt = partition_lomuto_branchless(array_without_pivot, pivot, is_less); + } else { + num_lt = partition_hoare_branchy(array_without_pivot, pivot, is_less); + } + + // Place the pivot between the two partitions. + array.swap(0, num_lt); + + return num_lt; } -LIBC_INLINE void quick_sort(Array array) { +template +LIBC_INLINE void quick_sort_impl(A &array, const void *ancestor_pivot, + size_t limit, const F &is_less) { while (true) { - const size_t array_size = array.size(); - if (array_size <= 1) + const size_t array_len = array.len(); + if (array_len <= 1) return; - size_t split_index = partition(array); - if (array_size == 2) - // The partition operation sorts the two element array. + + // If too many bad pivot choices were made, simply fall back to + // heapsort in order to guarantee `O(N x log(N))` worst-case. + if (limit == 0) { + heap_sort(array, is_less); return; + } - // Make Arrays describing the two sublists that still need sorting. - Array left = array.make_array(0, split_index); - Array right = array.make_array(split_index, array.size() - split_index); - - // Recurse to sort the smaller of the two, and then loop round within this - // function to sort the larger. 
This way, recursive call depth is bounded - // by log2 of the total array size, because every recursive call is sorting - // a list at most half the length of the one in its caller. - if (left.size() < right.size()) { - quick_sort(left); - array.reset_bounds(right.get(0), right.size()); - } else { - quick_sort(right); - array.reset_bounds(left.get(0), left.size()); + limit -= 1; + + const size_t pivot_index = choose_pivot(array, is_less); + + // If the chosen pivot is equal to the predecessor, then it's the smallest + // element in the slice. Partition the slice into elements equal to and + // elements greater than the pivot. This case is usually hit when the slice + // contains many duplicate elements. + if (ancestor_pivot) { + if (!is_less(ancestor_pivot, array.get(pivot_index))) { + const size_t num_lt = + partition(array, pivot_index, + [is_less](const void *a, const void *b) -> bool { + return !is_less(b, a); + }); + + // Continue sorting elements greater than the pivot. We know that + // `num_lt` contains the pivot. + array.reset_bounds(num_lt + 1, array.len() - (num_lt + 1)); + ancestor_pivot = nullptr; + continue; + } + } + + size_t split_index = partition(array, pivot_index, is_less); + + if (array_len == 2) + // The partition operation sorts the two element array. + return; + + // Split the array into `left`, `pivot`, and `right`. + A left = array.make_array(0, split_index); + const void *pivot = array.get(split_index); + const size_t right_start = split_index + 1; + A right = array.make_array(right_start, array.len() - right_start); + + // Recurse into the left side. We have a fixed recursion limit, + // testing shows no real benefit for recursing into the shorter + // side. + quick_sort_impl(left, ancestor_pivot, limit, is_less); + + // Continue with the right side.
+ array = right; + ancestor_pivot = pivot; } } +constexpr size_t ilog2(size_t n) { return cpp::bit_width(n) - 1; } + +template +LIBC_INLINE void quick_sort(A &array, const F &is_less) { + const void *ancestor_pivot = nullptr; + // Limit the number of imbalanced partitions to `2 * floor(log2(len))`. + // The binary OR by one is used to eliminate the zero-check in the logarithm. + const size_t limit = 2 * ilog2((array.len() | 1)); + quick_sort_impl(array, ancestor_pivot, limit, is_less); +} + } // namespace internal } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/test/src/stdlib/CMakeLists.txt b/libc/test/src/stdlib/CMakeLists.txt index 4ca2043ab4c9b..8cc0428632ba3 100644 --- a/libc/test/src/stdlib/CMakeLists.txt +++ b/libc/test/src/stdlib/CMakeLists.txt @@ -300,18 +300,6 @@ add_libc_test( libc.src.stdlib.bsearch ) -add_libc_test( - quick_sort_test - SUITE - libc-stdlib-tests - SRCS - quick_sort_test.cpp - HDRS - SortingTest.h - DEPENDS - libc.src.stdlib.qsort_util -) - add_libc_test( heap_sort_test SUITE @@ -321,15 +309,15 @@ add_libc_test( HDRS SortingTest.h DEPENDS - libc.src.stdlib.qsort_util + libc.src.stdlib.qsort ) add_libc_test( - qsort_test + quick_sort_test SUITE libc-stdlib-tests SRCS - qsort_test.cpp + quick_sort_test.cpp HDRS SortingTest.h DEPENDS diff --git a/libc/test/src/stdlib/SortingTest.h b/libc/test/src/stdlib/SortingTest.h index d34584e5addf0..034c0e4f1fd01 100644 --- a/libc/test/src/stdlib/SortingTest.h +++ b/libc/test/src/stdlib/SortingTest.h @@ -7,19 +7,19 @@ //===----------------------------------------------------------------------===// #include "src/__support/macros/config.h" -#include "src/stdlib/qsort_data.h" +#include "src/stdlib/qsort.h" #include "test/UnitTest/Test.h" class SortingTest : public LIBC_NAMESPACE::testing::Test { - using Array = LIBC_NAMESPACE::internal::Array; - using Comparator = LIBC_NAMESPACE::internal::Comparator; - using SortingRoutine = LIBC_NAMESPACE::internal::SortingRoutine; + using SortingRoutine = void 
(*)(void *array, size_t array_len, + size_t elem_size, + int (*compare)(const void *, const void *)); -public: static int int_compare(const void *l, const void *r) { int li = *reinterpret_cast(l); int ri = *reinterpret_cast(r); + if (li == ri) return 0; else if (li > ri) @@ -28,16 +28,19 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { return -1; } + static void int_sort(SortingRoutine sort_func, int *array, size_t array_len) { + sort_func(reinterpret_cast(array), array_len, sizeof(int), + int_compare); + } + +public: void test_sorted_array(SortingRoutine sort_func) { int array[25] = {10, 23, 33, 35, 55, 70, 71, 100, 110, 123, 133, 135, 155, 170, 171, 1100, 1110, 1123, 1133, 1135, 1155, 1170, 1171, 11100, 12310}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - sort_func(arr); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_LE(array[0], 10); ASSERT_LE(array[1], 23); @@ -69,14 +72,11 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { void test_reversed_sorted_array(SortingRoutine sort_func) { int array[] = {25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); + int_sort(sort_func, array, ARRAY_LEN); - sort_func(arr); - - for (int i = 0; i < int(ARRAY_SIZE - 1); ++i) + for (int i = 0; i < int(ARRAY_LEN - 1); ++i) ASSERT_EQ(array[i], i + 1); } @@ -84,14 +84,11 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { int array[] = {100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - 
auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - sort_func(arr); + int_sort(sort_func, array, ARRAY_LEN); - for (size_t i = 0; i < ARRAY_SIZE; ++i) + for (size_t i = 0; i < ARRAY_LEN; ++i) ASSERT_EQ(array[i], 100); } @@ -99,12 +96,9 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { int array[25] = {10, 23, 8, 35, 55, 45, 40, 100, 110, 123, 90, 80, 70, 60, 171, 11, 1, -1, -5, -10, 1155, 1170, 1171, 12, -100}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - sort_func(arr); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_EQ(array[0], -100); ASSERT_EQ(array[1], -10); @@ -135,12 +129,9 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { void test_unsorted_array_2(SortingRoutine sort_func) { int array[7] = {10, 40, 45, 55, 35, 23, 60}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); - - sort_func(arr); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_EQ(array[0], 10); ASSERT_EQ(array[1], 23); @@ -153,12 +144,9 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { void test_unsorted_array_duplicated_1(SortingRoutine sort_func) { int array[6] = {10, 10, 20, 20, 5, 5}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - sort_func(arr); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_EQ(array[0], 5); ASSERT_EQ(array[1], 5); @@ -170,12 +158,9 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { void 
test_unsorted_array_duplicated_2(SortingRoutine sort_func) { int array[10] = {20, 10, 10, 10, 10, 20, 21, 21, 21, 21}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - sort_func(arr); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_EQ(array[0], 10); ASSERT_EQ(array[1], 10); @@ -191,12 +176,9 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { void test_unsorted_array_duplicated_3(SortingRoutine sort_func) { int array[10] = {20, 30, 30, 30, 30, 20, 21, 21, 21, 21}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); - - sort_func(arr); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_EQ(array[0], 20); ASSERT_EQ(array[1], 20); @@ -213,12 +195,9 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { void test_unsorted_three_element_1(SortingRoutine sort_func) { int array[3] = {14999024, 0, 3}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - sort_func(arr); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_EQ(array[0], 0); ASSERT_EQ(array[1], 3); @@ -228,12 +207,9 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { void test_unsorted_three_element_2(SortingRoutine sort_func) { int array[3] = {3, 14999024, 0}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - sort_func(arr); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_EQ(array[0], 0); ASSERT_EQ(array[1], 3); @@ -243,12 
+219,9 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { void test_unsorted_three_element_3(SortingRoutine sort_func) { int array[3] = {3, 0, 14999024}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); - - sort_func(arr); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_EQ(array[0], 0); ASSERT_EQ(array[1], 3); @@ -258,12 +231,9 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { void test_same_three_element(SortingRoutine sort_func) { int array[3] = {12345, 12345, 12345}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - sort_func(arr); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_EQ(array[0], 12345); ASSERT_EQ(array[1], 12345); @@ -273,12 +243,9 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { void test_unsorted_two_element_1(SortingRoutine sort_func) { int array[] = {14999024, 0}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - sort_func(arr); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_EQ(array[0], 0); ASSERT_EQ(array[1], 14999024); @@ -287,12 +254,9 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { void test_unsorted_two_element_2(SortingRoutine sort_func) { int array[] = {0, 14999024}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); - - sort_func(arr); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_EQ(array[0], 0); ASSERT_EQ(array[1], 
14999024); @@ -301,12 +265,9 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { void test_same_two_element(SortingRoutine sort_func) { int array[] = {12345, 12345}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - sort_func(arr); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_EQ(array[0], 12345); ASSERT_EQ(array[1], 12345); @@ -315,15 +276,76 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { void test_single_element(SortingRoutine sort_func) { int array[] = {12345}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - sort_func(arr); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_EQ(array[0], 12345); } + + void test_different_elem_size(SortingRoutine sort_func) { + // Random order of values [0,50) to avoid only testing pre-sorted handling. + // Long enough to reach interesting code. + constexpr uint8_t ARRAY_INITIAL_VALS[] = { + 42, 13, 8, 4, 17, 28, 20, 32, 22, 29, 7, 2, 46, 37, 26, 49, 24, + 38, 10, 18, 40, 36, 47, 15, 11, 48, 44, 33, 1, 5, 16, 35, 39, 41, + 14, 23, 3, 9, 6, 27, 21, 25, 31, 45, 12, 43, 34, 30, 19, 0}; + + constexpr size_t ARRAY_LEN = sizeof(ARRAY_INITIAL_VALS); + constexpr size_t MAX_ELEM_SIZE = 150; + constexpr size_t BUF_SIZE = ARRAY_LEN * MAX_ELEM_SIZE; + + static_assert(ARRAY_LEN < 256); // so we can encode the values. + + // Minimum alignment to test implementation for bugs related to assuming + // incorrect association between alignment and element size. 
+ alignas(1) uint8_t buf[BUF_SIZE]; + + const auto fill_buf = [&buf](size_t elem_size) { + for (size_t i = 0; i < BUF_SIZE; ++i) { + buf[i] = 0; + } + + for (size_t elem_i = 0, buf_i = 0; elem_i < ARRAY_LEN; ++elem_i) { + const uint8_t elem_val = ARRAY_INITIAL_VALS[elem_i]; + for (size_t elem_byte_i = 0; elem_byte_i < elem_size; ++elem_byte_i) { + buf[buf_i] = elem_val; + buf_i += 1; + } + } + }; + + for (size_t elem_size = 0; elem_size <= MAX_ELEM_SIZE; ++elem_size) { + // Fill all bytes with data to ensure mistakes in elem swap are noticed. + fill_buf(elem_size); + + sort_func(reinterpret_cast(buf), ARRAY_LEN, elem_size, + [](const void *a, const void *b) -> int { + const uint8_t a_val = *reinterpret_cast(a); + const uint8_t b_val = *reinterpret_cast(b); + + if (a_val < b_val) { + return -1; + } else if (a_val > b_val) { + return 1; + } else { + return 0; + } + }); + + for (size_t elem_i = 0, buf_i = 0; elem_i < ARRAY_LEN; ++elem_i) { + const uint8_t expected_elem_val = static_cast(elem_i); + + for (size_t elem_byte_i = 0; elem_byte_i < elem_size; ++elem_byte_i) { + const uint8_t buf_val = buf[buf_i]; + // Check that every byte in the element has the expected value. 
+ ASSERT_EQ(buf_val, expected_elem_val) + << "elem_size: " << elem_size << " buf_i: " << buf_i << '\n'; + buf_i += 1; + } + } + } + } }; #define LIST_SORTING_TESTS(Name, Func) \ @@ -374,4 +396,7 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { TEST_F(LlvmLibc##Name##Test, SingleElementArray) { \ test_single_element(Func); \ } \ + TEST_F(LlvmLibc##Name##Test, DifferentElemSizeArray) { \ + test_different_elem_size(Func); \ + } \ static_assert(true) diff --git a/libc/test/src/stdlib/heap_sort_test.cpp b/libc/test/src/stdlib/heap_sort_test.cpp index d70e3dc2272be..18d4244506ec2 100644 --- a/libc/test/src/stdlib/heap_sort_test.cpp +++ b/libc/test/src/stdlib/heap_sort_test.cpp @@ -7,10 +7,20 @@ //===----------------------------------------------------------------------===// #include "SortingTest.h" -#include "src/stdlib/heap_sort.h" +#include "src/stdlib/qsort_util.h" -void sort(const LIBC_NAMESPACE::internal::Array &array) { - LIBC_NAMESPACE::internal::heap_sort(array); +void heap_sort(void *array, size_t array_size, size_t elem_size, + int (*compare)(const void *, const void *)) { + + constexpr bool USE_QUICKSORT = false; + + const auto is_less = [compare](const void *a, + const void *b) noexcept -> bool { + return compare(a, b) < 0; + }; + + LIBC_NAMESPACE::internal::unstable_sort_impl( + array, array_size, elem_size, is_less); } -LIST_SORTING_TESTS(HeapSort, sort); +LIST_SORTING_TESTS(HeapSort, heap_sort); diff --git a/libc/test/src/stdlib/qsort_r_test.cpp b/libc/test/src/stdlib/qsort_r_test.cpp index 6893fdc7b74c8..f18923618ed5e 100644 --- a/libc/test/src/stdlib/qsort_r_test.cpp +++ b/libc/test/src/stdlib/qsort_r_test.cpp @@ -62,9 +62,9 @@ TEST(LlvmLibcQsortRTest, SortedArray) { ASSERT_LE(array[23], 11100); ASSERT_LE(array[24], 12310); - // This is a sorted list, but there still have to have been at least N + // This is a sorted list, but there still have to have been at least N - 1 // comparisons made. 
- ASSERT_GE(count, ARRAY_SIZE); + ASSERT_GE(count, ARRAY_SIZE - 1); } TEST(LlvmLibcQsortRTest, ReverseSortedArray) { diff --git a/libc/test/src/stdlib/qsort_test.cpp b/libc/test/src/stdlib/qsort_test.cpp deleted file mode 100644 index 1e921a86fd1fd..0000000000000 --- a/libc/test/src/stdlib/qsort_test.cpp +++ /dev/null @@ -1,17 +0,0 @@ -//===-- Unittests for qsort -----------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "SortingTest.h" -#include "src/stdlib/qsort.h" - -void sort(const LIBC_NAMESPACE::internal::Array &array) { - LIBC_NAMESPACE::qsort(reinterpret_cast(array.get(0)), array.size(), - sizeof(int), SortingTest::int_compare); -} - -LIST_SORTING_TESTS(Qsort, sort); diff --git a/libc/test/src/stdlib/quick_sort_test.cpp b/libc/test/src/stdlib/quick_sort_test.cpp index d6bf77ebfd40d..2832c855370bc 100644 --- a/libc/test/src/stdlib/quick_sort_test.cpp +++ b/libc/test/src/stdlib/quick_sort_test.cpp @@ -1,4 +1,4 @@ -//===-- Unittests for quick sort ------------------------------------------===// +//===-- Unittests for qsort -----------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -7,10 +7,19 @@ //===----------------------------------------------------------------------===// #include "SortingTest.h" -#include "src/stdlib/quick_sort.h" +#include "src/stdlib/qsort_util.h" -void sort(const LIBC_NAMESPACE::internal::Array &array) { - LIBC_NAMESPACE::internal::quick_sort(array); +void quick_sort(void *array, size_t array_size, size_t elem_size, + int (*compare)(const void *, const void *)) { + constexpr bool USE_QUICKSORT = true; + + const auto is_less = [compare](const void *a, + const void *b) noexcept -> bool { + return compare(a, b) < 0; + }; + + LIBC_NAMESPACE::internal::unstable_sort_impl( + array, array_size, elem_size, is_less); } -LIST_SORTING_TESTS(QuickSort, sort); +LIST_SORTING_TESTS(Qsort, quick_sort); diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel index e4b4b075705e8..c0f1546912662 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel @@ -120,31 +120,23 @@ libc_support_library( ], ) -libc_test( - name = "qsort_test", - srcs = ["qsort_test.cpp"], - libc_function_deps = ["//libc:qsort"], - deps = [ - ":qsort_test_helper", - "//libc:types_size_t", - ], -) - libc_test( name = "quick_sort_test", srcs = ["quick_sort_test.cpp"], + libc_function_deps = ["//libc:qsort"], deps = [ ":qsort_test_helper", - "//libc:qsort_util", + "//libc:types_size_t", ], ) libc_test( name = "heap_sort_test", srcs = ["heap_sort_test.cpp"], + libc_function_deps = ["//libc:qsort"], deps = [ ":qsort_test_helper", - "//libc:qsort_util", + "//libc:types_size_t", ], ) From 2bbdce9a42f58af4ca917eaba1bf1019ba658fd5 Mon Sep 17 00:00:00 2001 From: Evgenii Kudriashov Date: Sun, 5 Jan 2025 01:10:25 +0100 Subject: [PATCH 446/567] [GlobalISel] Support physical register inputs in nested patterns (#121239) When importing nested patterns, we create InsnMatcher for each pattern and 
miss them if consider only the top level InsnMatcher. Iterate PhysRegOperands instead. Change the type of PhysRegOperands from DenseMap to SmallMapVector to have stable generation. Also drop PhysRegInputs member from InsnMatcher as there are no users of it. --- .../GlobalISelEmitter/gisel-physreg-input.td | 84 ++++++++++++++++++- .../GlobalISel/GlobalISelMatchTable.cpp | 1 - .../Common/GlobalISel/GlobalISelMatchTable.h | 18 ++-- llvm/utils/TableGen/GlobalISelEmitter.cpp | 8 +- 4 files changed, 94 insertions(+), 17 deletions(-) diff --git a/llvm/test/TableGen/GlobalISelEmitter/gisel-physreg-input.td b/llvm/test/TableGen/GlobalISelEmitter/gisel-physreg-input.td index a05f364eb3f05..1f1b557ace608 100644 --- a/llvm/test/TableGen/GlobalISelEmitter/gisel-physreg-input.td +++ b/llvm/test/TableGen/GlobalISelEmitter/gisel-physreg-input.td @@ -22,6 +22,86 @@ class I Pat> let Pattern = Pat; } +// Try a nested physical register + +// GISEL: GIM_Try, +// GISEL-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/2, +// GISEL-NEXT: GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_STORE), +// GISEL-NEXT: GIM_CheckAtomicOrdering, /*MI*/0, /*Order*/(uint8_t)AtomicOrdering::NotAtomic, +// GISEL-NEXT: // MIs[0] src0 +// GISEL-NEXT: GIM_RootCheckType, /*Op*/0, /*Type*/GILLT_s32, +// GISEL-NEXT: GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID), +// GISEL-NEXT: // MIs[0] Operand 1 +// GISEL-NEXT: GIM_CheckPointerToAny, /*MI*/0, /*Op*/1, /*SizeInBits*/32, +// GISEL-NEXT: GIM_RecordInsn, /*DefineMI*/1, /*MI*/0, /*OpIdx*/1, // MIs[1] +// GISEL-NEXT: GIM_CheckNumOperands, /*MI*/1, /*Expected*/3, +// GISEL-NEXT: GIM_CheckOpcode, /*MI*/1, GIMT_Encode2(TargetOpcode::G_MUL), +// GISEL-NEXT: // MIs[1] Operand 0 +// GISEL-NEXT: GIM_CheckType, /*MI*/1, /*Op*/0, /*Type*/GILLT_s32, +// GISEL-NEXT: // MIs[1] src1 +// GISEL-NEXT: GIM_CheckType, /*MI*/1, /*Op*/1, /*Type*/GILLT_s32, +// GISEL-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/1, 
/*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID), +// GISEL-NEXT: // MIs[1] Operand 2 +// GISEL-NEXT: GIM_CheckType, /*MI*/1, /*Op*/2, /*Type*/GILLT_s32, +// GISEL-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/2, /*RC*/GIMT_Encode2(MyTarget::Special32RegClassID), +// GISEL-NEXT: GIM_CheckIsSafeToFold, /*NumInsns*/1, +// GISEL-NEXT: // (st GPR32:{ *:[i32] }:$src0, (mul:{ *:[i32] } GPR32:{ *:[i32] }:$src1, SPECIAL:{ *:[i32] })) => (MULM_PHYS GPR32:{ *:[i32] }:$src0, GPR32:{ *:[i32] }:$src1) +// GISEL-NEXT: GIR_BuildMI, /*InsnID*/1, /*Opcode*/GIMT_Encode2(TargetOpcode::COPY), +// GISEL-NEXT: GIR_AddRegister, /*InsnID*/1, GIMT_Encode2(MyTarget::SPECIAL), /*AddRegisterRegFlags*/GIMT_Encode2(RegState::Define), +// GISEL-NEXT: GIR_Copy, /*NewInsnID*/1, /*OldInsnID*/1, /*OpIdx*/2, // SPECIAL +// GISEL-NEXT: GIR_BuildRootMI, /*Opcode*/GIMT_Encode2(MyTarget::MULM_PHYS), +// GISEL-NEXT: GIR_RootToRootCopy, /*OpIdx*/0, // src0 +// GISEL-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/1, /*OpIdx*/1, // src1 +// GISEL-NEXT: GIR_MergeMemOperands, /*InsnID*/0, /*NumInsns*/2, /*MergeInsnID's*/0, 1, +// GISEL-NEXT: GIR_RootConstrainSelectedInstOperands, +// GISEL-NEXT: // GIR_Coverage, 0, +// GISEL-NEXT: GIR_EraseRootFromParent_Done, +def MULM_PHYS : I<(outs), (ins GPR32:$src0, GPR32:$src1), + [(st GPR32:$src0, (mul GPR32:$src1, SPECIAL))]> { + let Uses = [SPECIAL]; +} + +// Try nested physical registers and check on duplicated copies + +// GISEL: GIM_Try, +// GISEL-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/2, +// GISEL-NEXT: GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_STORE), +// GISEL-NEXT: GIM_CheckAtomicOrdering, /*MI*/0, /*Order*/(uint8_t)AtomicOrdering::NotAtomic, +// GISEL-NEXT: // MIs[0] src0 +// GISEL-NEXT: GIM_RootCheckType, /*Op*/0, /*Type*/GILLT_s32, +// GISEL-NEXT: GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID), +// GISEL-NEXT: // MIs[0] Operand 1 +// GISEL-NEXT: GIM_CheckPointerToAny, /*MI*/0, /*Op*/1, 
/*SizeInBits*/32, +// GISEL-NEXT: GIM_RecordInsn, /*DefineMI*/1, /*MI*/0, /*OpIdx*/1, // MIs[1] +// GISEL-NEXT: GIM_CheckNumOperands, /*MI*/1, /*Expected*/3, +// GISEL-NEXT: GIM_CheckOpcode, /*MI*/1, GIMT_Encode2(TargetOpcode::G_MUL), +// GISEL-NEXT: // MIs[1] Operand 0 +// GISEL-NEXT: GIM_CheckType, /*MI*/1, /*Op*/0, /*Type*/GILLT_s32, +// GISEL-NEXT: // MIs[1] Operand 1 +// GISEL-NEXT: GIM_CheckType, /*MI*/1, /*Op*/1, /*Type*/GILLT_s32, +// GISEL-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/1, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID), +// GISEL-NEXT: // MIs[1] Operand 2 +// GISEL-NEXT: GIM_CheckType, /*MI*/1, /*Op*/2, /*Type*/GILLT_s32, +// GISEL-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/2, /*RC*/GIMT_Encode2(MyTarget::Special32RegClassID), +// GISEL-NEXT: GIM_CheckIsSafeToFold, /*NumInsns*/1, +// GISEL-NEXT: // (st GPR32:{ *:[i32] }:$src0, (mul:{ *:[i32] } R0:{ *:[i32] }, SPECIAL:{ *:[i32] })) => (MULMR0_PHYS GPR32:{ *:[i32] }:$src0) +// GISEL-NEXT: GIR_BuildMI, /*InsnID*/2, /*Opcode*/GIMT_Encode2(TargetOpcode::COPY), +// GISEL-NEXT: GIR_AddRegister, /*InsnID*/2, GIMT_Encode2(MyTarget::SPECIAL), /*AddRegisterRegFlags*/GIMT_Encode2(RegState::Define), +// GISEL-NEXT: GIR_Copy, /*NewInsnID*/2, /*OldInsnID*/1, /*OpIdx*/2, // SPECIAL +// GISEL-NEXT: GIR_BuildMI, /*InsnID*/1, /*Opcode*/GIMT_Encode2(TargetOpcode::COPY), +// GISEL-NEXT: GIR_AddRegister, /*InsnID*/1, GIMT_Encode2(MyTarget::R0), /*AddRegisterRegFlags*/GIMT_Encode2(RegState::Define), +// GISEL-NEXT: GIR_Copy, /*NewInsnID*/1, /*OldInsnID*/1, /*OpIdx*/1, // R0 +// GISEL-NEXT: GIR_BuildRootMI, /*Opcode*/GIMT_Encode2(MyTarget::MULMR0_PHYS), +// GISEL-NEXT: GIR_RootToRootCopy, /*OpIdx*/0, // src0 +// GISEL-NEXT: GIR_MergeMemOperands, /*InsnID*/0, /*NumInsns*/2, /*MergeInsnID's*/0, 1, +// GISEL-NEXT: GIR_RootConstrainSelectedInstOperands, +// GISEL-NEXT: // GIR_Coverage, 1, +// GISEL-NEXT: GIR_EraseRootFromParent_Done, +def MULMR0_PHYS : I<(outs), (ins GPR32:$src0), + [(st GPR32:$src0, (mul R0, 
SPECIAL))]> { + let Uses = [R0, SPECIAL]; +} + // Try a normal physical register use. // GISEL: GIM_Try, @@ -44,7 +124,7 @@ class I Pat> // GISEL-NEXT: GIR_RootToRootCopy, /*OpIdx*/0, // DstI[dst] // GISEL-NEXT: GIR_RootToRootCopy, /*OpIdx*/1, // src0 // GISEL-NEXT: GIR_RootConstrainSelectedInstOperands, -// GISEL-NEXT: // GIR_Coverage, 0, +// GISEL-NEXT: // GIR_Coverage, 2, // GISEL-NEXT: GIR_EraseRootFromParent_Done, def ADD_PHYS : I<(outs GPR32:$dst), (ins GPR32:$src0), [(set GPR32:$dst, (add GPR32:$src0, SPECIAL))]> { @@ -73,7 +153,7 @@ def ADD_PHYS : I<(outs GPR32:$dst), (ins GPR32:$src0), // GISEL-NEXT: GIR_RootToRootCopy, /*OpIdx*/0, // DstI[dst] // GISEL-NEXT: GIR_RootToRootCopy, /*OpIdx*/1, // SPECIAL // GISEL-NEXT: GIR_RootConstrainSelectedInstOperands, -// GISEL-NEXT: // GIR_Coverage, 1, +// GISEL-NEXT: // GIR_Coverage, 3, // GISEL-NEXT: GIR_EraseRootFromParent_Done, def MUL_PHYS : I<(outs GPR32:$dst), (ins GPR32:$SPECIAL), [(set GPR32:$dst, (mul GPR32:$SPECIAL, SPECIAL))]> { diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp index 619e7a4790c88..a81f2b53f2846 100644 --- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp +++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp @@ -1723,7 +1723,6 @@ OperandMatcher &InstructionMatcher::addPhysRegInput(const Record *Reg, OperandMatcher *OM = new OperandMatcher(*this, OpIdx, "", TempOpIdx); Operands.emplace_back(OM); Rule.definePhysRegOperand(Reg, *OM); - PhysRegInputs.emplace_back(Reg, OpIdx); return *OM; } diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h index 48ce71be677c0..8e6de80d6083c 100644 --- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h +++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h @@ -19,6 +19,7 @@ #include "Common/CodeGenDAGPatterns.h" #include 
"llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" @@ -492,9 +493,11 @@ class RuleMatcher : public Matcher { /// the renderers. StringMap DefinedOperands; + using PhysRegOperandsTy = SmallMapVector; + /// A map of anonymous physical register operands defined by the matchers that /// may be referenced by the renderers. - DenseMap PhysRegOperands; + PhysRegOperandsTy PhysRegOperands; /// ID for the next instruction variable defined with /// implicitlyDefineInsnVar() @@ -695,6 +698,10 @@ class RuleMatcher : public Matcher { unsigned allocateOutputInsnID() { return NextOutputInsnID++; } unsigned allocateTempRegID() { return NextTempRegID++; } + iterator_range physoperands() const { + return make_range(PhysRegOperands.begin(), PhysRegOperands.end()); + } + iterator_range insnmatchers() { return make_range(Matchers.begin(), Matchers.end()); } @@ -1756,11 +1763,6 @@ class InstructionMatcher final : public PredicateListMatcher { unsigned InsnVarID; bool AllowNumOpsCheck; - /// PhysRegInputs - List list has an entry for each explicitly specified - /// physreg input to the pattern. The first elt is the Register node, the - /// second is the recorded slot number the input pattern match saved it in. 
- SmallVector, 2> PhysRegInputs; - bool canAddNumOperandsCheck() const { // Add if it's allowed, and: // - We don't have a variadic operand @@ -1802,10 +1804,6 @@ class InstructionMatcher final : public PredicateListMatcher { OperandMatcher &addPhysRegInput(const Record *Reg, unsigned OpIdx, unsigned TempOpIdx); - ArrayRef> getPhysRegInputs() const { - return PhysRegInputs; - } - StringRef getSymbolicName() const { return SymbolicName; } unsigned getNumOperandMatchers() const { return Operands.size(); } diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp index 9f6d3a506dceb..3b334ea4ce152 100644 --- a/llvm/utils/TableGen/GlobalISelEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp @@ -1429,15 +1429,15 @@ Expected GlobalISelEmitter::createAndImportInstructionRenderer( action_iterator InsertPt = InsertPtOrError.get(); BuildMIAction &DstMIBuilder = *static_cast(InsertPt->get()); - for (auto PhysInput : InsnMatcher.getPhysRegInputs()) { + for (auto PhysOp : M.physoperands()) { InsertPt = M.insertAction( InsertPt, M.allocateOutputInsnID(), &Target.getInstruction(RK.getDef("COPY"))); BuildMIAction &CopyToPhysRegMIBuilder = *static_cast(InsertPt->get()); - CopyToPhysRegMIBuilder.addRenderer( - Target, PhysInput.first, true); - CopyToPhysRegMIBuilder.addRenderer(PhysInput.first); + CopyToPhysRegMIBuilder.addRenderer(Target, + PhysOp.first, true); + CopyToPhysRegMIBuilder.addRenderer(PhysOp.first); } if (auto Error = importExplicitDefRenderers(InsertPt, M, DstMIBuilder, Dst, From 66f16e682f84551552099a45e608fa260b14e3ab Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Sat, 4 Jan 2025 16:14:25 -0800 Subject: [PATCH 447/567] [clang-format][NFC] Add missing config tests for List of Strings (#121451) Also, simplify the existing test for NamespaceMacros. Like the options tested by the added tests, it's also a list of arbitrary strings and initialized to an empty list. 
(The other existing tests for list of strings either are initialized to a list of one or more strings or require specific strings.) --- clang/unittests/Format/ConfigParseTest.cpp | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/clang/unittests/Format/ConfigParseTest.cpp b/clang/unittests/Format/ConfigParseTest.cpp index 9c38dbbc51f0a..1f0beafaad7f7 100644 --- a/clang/unittests/Format/ConfigParseTest.cpp +++ b/clang/unittests/Format/ConfigParseTest.cpp @@ -144,6 +144,9 @@ TEST(ConfigParseTest, GetsCorrectBasedOnStyle) { EXPECT_EQ(0, parseConfiguration(TEXT, &Style).value()); \ EXPECT_EQ(VALUE, Style.FIELD) << "Unexpected value after parsing!" +#define CHECK_PARSE_LIST(FIELD) \ + CHECK_PARSE(#FIELD ": [foo]", FIELD, std::vector{"foo"}) + #define CHECK_PARSE_NESTED_VALUE(TEXT, STRUCT, FIELD, VALUE) \ EXPECT_NE(VALUE, Style.STRUCT.FIELD) << "Initial value already the same!"; \ EXPECT_EQ(0, parseConfiguration(#STRUCT ":\n " TEXT, &Style).value()); \ @@ -906,11 +909,15 @@ TEST(ConfigParseTest, ParsesConfiguration) { CHECK_PARSE("StatementMacros: [QUNUSED, QT_REQUIRE_VERSION]", StatementMacros, std::vector({"QUNUSED", "QT_REQUIRE_VERSION"})); - Style.NamespaceMacros.clear(); - CHECK_PARSE("NamespaceMacros: [TESTSUITE]", NamespaceMacros, - std::vector{"TESTSUITE"}); - CHECK_PARSE("NamespaceMacros: [TESTSUITE, SUITE]", NamespaceMacros, - std::vector({"TESTSUITE", "SUITE"})); + CHECK_PARSE_LIST(JavaImportGroups); + CHECK_PARSE_LIST(Macros); + CHECK_PARSE_LIST(NamespaceMacros); + CHECK_PARSE_LIST(ObjCPropertyAttributeOrder); + CHECK_PARSE_LIST(TableGenBreakingDAGArgOperators); + CHECK_PARSE_LIST(TemplateNames); + CHECK_PARSE_LIST(TypeNames); + CHECK_PARSE_LIST(TypenameMacros); + CHECK_PARSE_LIST(VariableTemplates); Style.WhitespaceSensitiveMacros.clear(); CHECK_PARSE("WhitespaceSensitiveMacros: [STRINGIZE]", From 04610b901f41c4abec169b9a38f1b0a2fde976c1 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Sat, 4 Jan 2025 16:19:46 -0800 Subject: 
[PATCH 448/567] [clang-format][NFC] Replace SmallVectorImpl with ArrayRef (#121621) --- clang/lib/Format/AffectedRangeManager.cpp | 10 ++-- clang/lib/Format/AffectedRangeManager.h | 4 +- clang/lib/Format/Format.cpp | 8 +-- clang/lib/Format/FormatTokenLexer.cpp | 5 +- clang/lib/Format/UnwrappedLineFormatter.cpp | 64 ++++++++++----------- clang/lib/Format/UnwrappedLineParser.cpp | 9 +-- clang/lib/Format/UnwrappedLineParser.h | 2 +- 7 files changed, 48 insertions(+), 54 deletions(-) diff --git a/clang/lib/Format/AffectedRangeManager.cpp b/clang/lib/Format/AffectedRangeManager.cpp index bf124d73e89e7..67108f3540191 100644 --- a/clang/lib/Format/AffectedRangeManager.cpp +++ b/clang/lib/Format/AffectedRangeManager.cpp @@ -21,8 +21,8 @@ namespace format { bool AffectedRangeManager::computeAffectedLines( SmallVectorImpl &Lines) { - SmallVectorImpl::iterator I = Lines.begin(); - SmallVectorImpl::iterator E = Lines.end(); + ArrayRef::iterator I = Lines.begin(); + ArrayRef::iterator E = Lines.end(); bool SomeLineAffected = false; const AnnotatedLine *PreviousLine = nullptr; while (I != E) { @@ -34,7 +34,7 @@ bool AffectedRangeManager::computeAffectedLines( // if any token within the directive is affected. 
if (Line->InPPDirective) { FormatToken *Last = Line->Last; - SmallVectorImpl::iterator PPEnd = I + 1; + const auto *PPEnd = I + 1; while (PPEnd != E && !(*PPEnd)->First->HasUnescapedNewline) { Last = (*PPEnd)->Last; ++PPEnd; @@ -89,8 +89,8 @@ bool AffectedRangeManager::affectsLeadingEmptyLines(const FormatToken &Tok) { } void AffectedRangeManager::markAllAsAffected( - SmallVectorImpl::iterator I, - SmallVectorImpl::iterator E) { + ArrayRef::iterator I, + ArrayRef::iterator E) { while (I != E) { (*I)->Affected = true; markAllAsAffected((*I)->Children.begin(), (*I)->Children.end()); diff --git a/clang/lib/Format/AffectedRangeManager.h b/clang/lib/Format/AffectedRangeManager.h index add16bdd7a7c3..eef056fdf0633 100644 --- a/clang/lib/Format/AffectedRangeManager.h +++ b/clang/lib/Format/AffectedRangeManager.h @@ -47,8 +47,8 @@ class AffectedRangeManager { bool affectsLeadingEmptyLines(const FormatToken &Tok); // Marks all lines between I and E as well as all their children as affected. - void markAllAsAffected(SmallVectorImpl::iterator I, - SmallVectorImpl::iterator E); + void markAllAsAffected(ArrayRef::iterator I, + ArrayRef::iterator E); // Determines whether 'Line' is affected by the SourceRanges given as input. // Returns \c true if line or one if its children is affected. diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index e51d7ac2e5b6c..fc60c5ec5eebc 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -3085,8 +3085,8 @@ static bool affectsRange(ArrayRef Ranges, unsigned Start, // its current line. // If `Cursor` is not on any #include, `Index` will be UINT_MAX. 
static std::pair -FindCursorIndex(const SmallVectorImpl &Includes, - const SmallVectorImpl &Indices, unsigned Cursor) { +FindCursorIndex(const ArrayRef &Includes, + const ArrayRef &Indices, unsigned Cursor) { unsigned CursorIndex = UINT_MAX; unsigned OffsetToEOL = 0; for (int i = 0, e = Includes.size(); i != e; ++i) { @@ -3135,7 +3135,7 @@ std::string replaceCRLF(const std::string &Code) { // provided and put on a deleted #include, it will be moved to the remaining // #include in the duplicate #includes. static void sortCppIncludes(const FormatStyle &Style, - const SmallVectorImpl &Includes, + const ArrayRef &Includes, ArrayRef Ranges, StringRef FileName, StringRef Code, tooling::Replacements &Replaces, unsigned *Cursor) { @@ -3378,7 +3378,7 @@ static unsigned findJavaImportGroup(const FormatStyle &Style, // import group, a newline is inserted, and within each import group, a // lexicographic sort based on ASCII value is performed. static void sortJavaImports(const FormatStyle &Style, - const SmallVectorImpl &Imports, + const ArrayRef &Imports, ArrayRef Ranges, StringRef FileName, StringRef Code, tooling::Replacements &Replaces) { unsigned ImportsBeginOffset = Imports.front().Offset; diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp index 0f8d4940d4369..a1d7eeadec441 100644 --- a/clang/lib/Format/FormatTokenLexer.cpp +++ b/clang/lib/Format/FormatTokenLexer.cpp @@ -564,8 +564,7 @@ bool FormatTokenLexer::tryMergeTokens(ArrayRef Kinds, if (Tokens.size() < Kinds.size()) return false; - SmallVectorImpl::const_iterator First = - Tokens.end() - Kinds.size(); + const auto *First = Tokens.end() - Kinds.size(); for (unsigned i = 0; i < Kinds.size(); ++i) if (First[i]->isNot(Kinds[i])) return false; @@ -577,7 +576,7 @@ bool FormatTokenLexer::tryMergeTokens(size_t Count, TokenType NewType) { if (Tokens.size() < Count) return false; - SmallVectorImpl::const_iterator First = Tokens.end() - Count; + const auto *First = Tokens.end() - 
Count; unsigned AddLength = 0; for (size_t i = 1; i < Count; ++i) { // If there is whitespace separating the token and the previous one, diff --git a/clang/lib/Format/UnwrappedLineFormatter.cpp b/clang/lib/Format/UnwrappedLineFormatter.cpp index bc6766a47f5c7..2fe4ebd4ff8eb 100644 --- a/clang/lib/Format/UnwrappedLineFormatter.cpp +++ b/clang/lib/Format/UnwrappedLineFormatter.cpp @@ -183,9 +183,9 @@ class LevelIndentTracker { unsigned Indent = 0; }; -const FormatToken *getMatchingNamespaceToken( - const AnnotatedLine *Line, - const SmallVectorImpl &AnnotatedLines) { +const FormatToken * +getMatchingNamespaceToken(const AnnotatedLine *Line, + const ArrayRef &AnnotatedLines) { if (!Line->startsWith(tok::r_brace)) return nullptr; size_t StartLineIndex = Line->MatchingOpeningBlockLineIndex; @@ -200,9 +200,9 @@ StringRef getNamespaceTokenText(const AnnotatedLine *Line) { return NamespaceToken ? NamespaceToken->TokenText : StringRef(); } -StringRef getMatchingNamespaceTokenText( - const AnnotatedLine *Line, - const SmallVectorImpl &AnnotatedLines) { +StringRef +getMatchingNamespaceTokenText(const AnnotatedLine *Line, + const ArrayRef &AnnotatedLines) { const FormatToken *NamespaceToken = getMatchingNamespaceToken(Line, AnnotatedLines); return NamespaceToken ? NamespaceToken->TokenText : StringRef(); @@ -241,8 +241,8 @@ class LineJoiner { /// Calculates how many lines can be merged into 1 starting at \p I. unsigned tryFitMultipleLinesInOne(LevelIndentTracker &IndentTracker, - SmallVectorImpl::const_iterator I, - SmallVectorImpl::const_iterator E) { + ArrayRef::const_iterator I, + ArrayRef::const_iterator E) { const unsigned Indent = IndentTracker.getIndent(); // Can't join the last line with anything. 
@@ -614,8 +614,8 @@ class LineJoiner { } unsigned - tryMergeSimplePPDirective(SmallVectorImpl::const_iterator I, - SmallVectorImpl::const_iterator E, + tryMergeSimplePPDirective(ArrayRef::const_iterator I, + ArrayRef::const_iterator E, unsigned Limit) { if (Limit == 0) return 0; @@ -626,8 +626,8 @@ class LineJoiner { return 1; } - unsigned tryMergeNamespace(SmallVectorImpl::const_iterator I, - SmallVectorImpl::const_iterator E, + unsigned tryMergeNamespace(ArrayRef::const_iterator I, + ArrayRef::const_iterator E, unsigned Limit) { if (Limit == 0) return 0; @@ -692,9 +692,10 @@ class LineJoiner { return 2; } - unsigned tryMergeSimpleControlStatement( - SmallVectorImpl::const_iterator I, - SmallVectorImpl::const_iterator E, unsigned Limit) { + unsigned + tryMergeSimpleControlStatement(ArrayRef::const_iterator I, + ArrayRef::const_iterator E, + unsigned Limit) { if (Limit == 0) return 0; if (Style.BraceWrapping.AfterControlStatement == @@ -734,10 +735,9 @@ class LineJoiner { return 1; } - unsigned - tryMergeShortCaseLabels(SmallVectorImpl::const_iterator I, - SmallVectorImpl::const_iterator E, - unsigned Limit) { + unsigned tryMergeShortCaseLabels(ArrayRef::const_iterator I, + ArrayRef::const_iterator E, + unsigned Limit) { if (Limit == 0 || I + 1 == E || I[1]->First->isOneOf(tok::kw_case, tok::kw_default)) { return 0; @@ -768,7 +768,7 @@ class LineJoiner { if (Line->First->is(tok::comment)) { if (Level != Line->Level) return 0; - SmallVectorImpl::const_iterator J = I + 2 + NumStmts; + const auto *J = I + 2 + NumStmts; for (; J != E; ++J) { Line = *J; if (Line->InPPDirective != InPPDirective) @@ -789,10 +789,9 @@ class LineJoiner { return NumStmts; } - unsigned - tryMergeSimpleBlock(SmallVectorImpl::const_iterator I, - SmallVectorImpl::const_iterator E, - unsigned Limit) { + unsigned tryMergeSimpleBlock(ArrayRef::const_iterator I, + ArrayRef::const_iterator E, + unsigned Limit) { // Don't merge with a preprocessor directive. 
if (I[1]->Type == LT_PreprocessorDirective) return 0; @@ -974,10 +973,9 @@ class LineJoiner { /// Returns the modified column limit for \p I if it is inside a macro and /// needs a trailing '\'. - unsigned - limitConsideringMacros(SmallVectorImpl::const_iterator I, - SmallVectorImpl::const_iterator E, - unsigned Limit) { + unsigned limitConsideringMacros(ArrayRef::const_iterator I, + ArrayRef::const_iterator E, + unsigned Limit) { if (I[0]->InPPDirective && I + 1 != E && !I[1]->First->HasUnescapedNewline && I[1]->First->isNot(tok::eof)) { return Limit < 2 ? 0 : Limit - 2; @@ -985,15 +983,15 @@ class LineJoiner { return Limit; } - bool nextTwoLinesFitInto(SmallVectorImpl::const_iterator I, + bool nextTwoLinesFitInto(ArrayRef::const_iterator I, unsigned Limit) { if (I[1]->First->MustBreakBefore || I[2]->First->MustBreakBefore) return false; return 1 + I[1]->Last->TotalLength + 1 + I[2]->Last->TotalLength <= Limit; } - bool nextNLinesFitInto(SmallVectorImpl::const_iterator I, - SmallVectorImpl::const_iterator E, + bool nextNLinesFitInto(ArrayRef::const_iterator I, + ArrayRef::const_iterator E, unsigned Limit) { unsigned JoinedLength = 0; for (const auto *J = I + 1; J != E; ++J) { @@ -1034,9 +1032,9 @@ class LineJoiner { const FormatStyle &Style; const AdditionalKeywords &Keywords; - const SmallVectorImpl::const_iterator End; + const ArrayRef::const_iterator End; - SmallVectorImpl::const_iterator Next; + ArrayRef::const_iterator Next; const SmallVectorImpl &AnnotatedLines; }; diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 654148a161bd7..5375eef90c579 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -51,9 +51,7 @@ void printLine(llvm::raw_ostream &OS, const UnwrappedLine &Line, << "T=" << (unsigned)I->Tok->getType() << ", OC=" << I->Tok->OriginalColumn << ", \"" << I->Tok->TokenText << "\"] "; - for (SmallVectorImpl::const_iterator - CI = I->Children.begin(), - 
CE = I->Children.end(); + for (const auto *CI = I->Children.begin(), *CE = I->Children.end(); CI != CE; ++CI) { OS << "\n"; printLine(OS, *CI, (Prefix + " ").str()); @@ -4788,8 +4786,7 @@ void UnwrappedLineParser::nextToken(int LevelDifference) { } void UnwrappedLineParser::distributeComments( - const SmallVectorImpl &Comments, - const FormatToken *NextTok) { + const ArrayRef &Comments, const FormatToken *NextTok) { // Whether or not a line comment token continues a line is controlled by // the method continuesLineCommentSection, with the following caveat: // @@ -5011,7 +5008,7 @@ void UnwrappedLineParser::readToken(int LevelDifference) { namespace { template void pushTokens(Iterator Begin, Iterator End, - llvm::SmallVectorImpl &Into) { + SmallVectorImpl &Into) { for (auto I = Begin; I != End; ++I) { Into.push_back(I->Tok); for (const auto &Child : I->Children) diff --git a/clang/lib/Format/UnwrappedLineParser.h b/clang/lib/Format/UnwrappedLineParser.h index b7daf8d9f4401..8160d5e84186e 100644 --- a/clang/lib/Format/UnwrappedLineParser.h +++ b/clang/lib/Format/UnwrappedLineParser.h @@ -228,7 +228,7 @@ class UnwrappedLineParser { // NextTok specifies the next token. A null pointer NextTok is supported, and // signifies either the absence of a next token, or that the next token // shouldn't be taken into account for the analysis. - void distributeComments(const SmallVectorImpl &Comments, + void distributeComments(const ArrayRef &Comments, const FormatToken *NextTok); // Adds the comment preceding the next token to unwrapped lines. From 44b83e81b5a48d543bf718907f00a21179ec03a4 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Sat, 4 Jan 2025 16:22:54 -0800 Subject: [PATCH 449/567] [clang-format] Add TT_AfterPPDirective for better annotation (#121622) For now, we only need to annotate the token after #error or #warning. Fixes #117706. 
--- clang/lib/Format/FormatToken.h | 1 + clang/lib/Format/TokenAnnotator.cpp | 4 ++++ clang/lib/Format/UnwrappedLineParser.cpp | 9 +++++++-- clang/unittests/Format/TokenAnnotatorTest.cpp | 7 +++++++ 4 files changed, 19 insertions(+), 2 deletions(-) diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h index 8917049cefb86..0fd3a49c71f9d 100644 --- a/clang/lib/Format/FormatToken.h +++ b/clang/lib/Format/FormatToken.h @@ -25,6 +25,7 @@ namespace clang { namespace format { #define LIST_TOKEN_TYPES \ + TYPE(AfterPPDirective) \ TYPE(ArrayInitializerLSquare) \ TYPE(ArraySubscriptLSquare) \ TYPE(AttributeColon) \ diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index b0f570966a63f..fad375733ef84 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -4941,6 +4941,10 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line, Right.is(TT_ModulePartitionColon)) { return true; } + + if (Right.is(TT_AfterPPDirective)) + return true; + // No space between import foo:bar but keep a space between import :bar; if (Left.is(tok::identifier) && Right.is(TT_ModulePartitionColon)) return false; diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 5375eef90c579..46fd566ae221e 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -1030,6 +1030,12 @@ void UnwrappedLineParser::parsePPDirective() { case tok::pp_pragma: parsePPPragma(); break; + case tok::pp_error: + case tok::pp_warning: + nextToken(); + if (!eof() && Style.isCpp()) + FormatTok->setFinalizedType(TT_AfterPPDirective); + [[fallthrough]]; default: parsePPUnknown(); break; @@ -1209,9 +1215,8 @@ void UnwrappedLineParser::parsePPPragma() { } void UnwrappedLineParser::parsePPUnknown() { - do { + while (!eof()) nextToken(); - } while (!eof()); if (Style.IndentPPDirectives != FormatStyle::PPDIS_None) Line->Level += 
PPBranchLevel + 1; addUnwrappedLine(); diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index d61b9adf4f58c..875feff3d5420 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -3635,6 +3635,13 @@ TEST_F(TokenAnnotatorTest, SwitchInMacroArgument) { EXPECT_TOKEN(Tokens[9], tok::l_brace, TT_FunctionLBrace); } +TEST_F(TokenAnnotatorTest, AfterPPDirective) { + auto Tokens = annotate("#error -- My error message"); + + ASSERT_EQ(Tokens.size(), 7u) << Tokens; + EXPECT_TOKEN(Tokens[2], tok::minusminus, TT_AfterPPDirective); +} + } // namespace } // namespace format } // namespace clang From c1ea05eaf0fbe4b539952689dbf9f0df716c72e7 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Sat, 4 Jan 2025 16:24:41 -0800 Subject: [PATCH 450/567] [clang-format] Don't break short macro call followed by l_paren (#121626) Fixes #105658. --- clang/lib/Format/UnwrappedLineParser.cpp | 4 +++- clang/unittests/Format/FormatTest.cpp | 5 +++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 46fd566ae221e..39aa37af480c9 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -2044,7 +2044,9 @@ void UnwrappedLineParser::parseStructuralElement( ? 
FormatTok->NewlinesBefore > 0 : CommentsBeforeNextToken.front()->NewlinesBefore > 0; - if (FollowedByNewline && (Text.size() >= 5 || FunctionLike) && + if (FollowedByNewline && + (Text.size() >= 5 || + (FunctionLike && FormatTok->isNot(tok::l_paren))) && tokenCanStartNewLine(*FormatTok) && Text == Text.upper()) { if (PreviousToken->isNot(TT_UntouchableMacroFunc)) PreviousToken->setFinalizedType(TT_FunctionLikeOrFreestandingMacro); diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 44b9dba249890..4d48bcacddead 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -5887,6 +5887,11 @@ TEST_F(FormatTest, MacrosWithoutTrailingSemicolon) { verifyFormat("SOME_WEIRD_LOG_MACRO << SomeThing;", "SOME_WEIRD_LOG_MACRO\n" "<< SomeThing;"); + verifyFormat("GGGG(ffff(xxxxxxxxxxxxxxxxxxxx)->yyyyyyyyyyyyyyyyyyyy)(foo);", + "GGGG(ffff(xxxxxxxxxxxxxxxxxxxx)->yyyyyyyyyyyyyyyyyyyy)\n" + "(foo);", + getLLVMStyleWithColumns(60)); + verifyFormat("VISIT_GL_CALL(GenBuffers, void, (GLsizei n, GLuint* buffers), " "(n, buffers))", getChromiumStyle(FormatStyle::LK_Cpp)); From 8df7dabd5d60240cf4f2029a40b91c3a8f8e209d Mon Sep 17 00:00:00 2001 From: Yury Plyakhin Date: Wed, 8 Jan 2025 13:34:29 -0800 Subject: [PATCH 451/567] [SYCL][Joint Matrix][E2E] Fix several tests (#16548) with this fix joint_matrix_colA_rowB_colC.cpp is passing on CPU --- ...nt_matrix_bfloat16_colmajorA_colmajorB.cpp | 11 +------ .../SG32/joint_matrix_colA_rowB_colC.cpp | 5 +-- sycl/test-e2e/Matrix/common.hpp | 6 ++-- .../joint_matrix_bf16_fill_k_cache_impl.hpp | 2 +- ...nt_matrix_bfloat16_colmajorA_colmajorB.cpp | 3 -- ...trix_bfloat16_colmajorA_colmajorB_impl.hpp | 6 ++-- .../Matrix/joint_matrix_colA_rowB_colC.cpp | 5 +-- .../joint_matrix_colA_rowB_colC_impl.hpp | 31 ++++++++++++------- 8 files changed, 29 insertions(+), 40 deletions(-) diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp 
b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp index 57a41d55f8fee..504da33d936f1 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp @@ -16,16 +16,7 @@ // XFAIL: gpu // XFAIL-TRACKER: GSD-5768 -#include "../common.hpp" -#include -#include -#include - -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; -using bfloat16 = sycl::ext::oneapi::bfloat16; +#include "../common.hpp" #define SG_SZ 32 -constexpr size_t TN = 16; - #include "../joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_colA_rowB_colC.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_colA_rowB_colC.cpp index 4186ad1acc943..98ded99791115 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_colA_rowB_colC.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_colA_rowB_colC.cpp @@ -11,12 +11,9 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out -// XFAIL: run-mode +// XFAIL: gpu && run-mode // XFAIL-TRACKER: GSD-5768 #include "../common.hpp" - #define SG_SZ 32 -constexpr size_t TN = 16; - #include "../joint_matrix_colA_rowB_colC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/common.hpp b/sycl/test-e2e/Matrix/common.hpp index f568f3251024b..58937722642df 100644 --- a/sycl/test-e2e/Matrix/common.hpp +++ b/sycl/test-e2e/Matrix/common.hpp @@ -174,7 +174,7 @@ bool matrix_compare(unsigned int rows, unsigned int cols, T1 *src, T2 *ref) { std::is_same_v))) { float diff = std::fabs(src[i * cols + j] - (T1)ref[i * cols + j]); if (diff > FLOAT_EPSILON || std::isnan(src[i * cols + j])) { - std::cout << "Incorrect result in matrix. " + std::cerr << "Incorrect result in matrix. 
" << "i: " << i << ", j: " << j << ", Ref: " << (T1)ref[i * cols + j] << ", Val: " << src[i * cols + j] << ", Diff: " << diff @@ -183,14 +183,14 @@ bool matrix_compare(unsigned int rows, unsigned int cols, T1 *src, T2 *ref) { } } else if constexpr (exact || std::is_integral_v) { if (src[i * cols + j] != ref[i * cols + j]) { - std::cout << "Incorrect result in matrix." + std::cerr << "Incorrect result in matrix." << "i: " << i << ", j: " << j << ", Ref: " << ref[i * cols + j] << ", Val: " << src[i * cols + j] << "\n"; return false; } } else { - std::cout << "Unsupported type in matrix_compare\n"; + std::cerr << "Unsupported type in matrix_compare\n"; return false; } } diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp index 1afb397d9815a..097d7e42bfd96 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp @@ -119,7 +119,7 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i // along the workgroup prefetch for B matrix. For A matrix, sgId is // enough. 
size_t pm1B = sgId / 16; // prefetch m1 (sgId/16) - size_t pn1B = sgId & 0x15; // prefetch n1 (sgId%16) + size_t pn1B = sgId & 0xF; // prefetch n1 (sgId%16) #else // VNNI size_t pm1B = sgId / 8; // prefetch m1 (sgId/8) size_t pn1B = sgId & 0x7; // prefetch n1 (sgId%8) diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp index 82bedf7043e9d..2519b0fdb4c79 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp @@ -17,7 +17,4 @@ // XFAIL-TRACKER: GSD-5768 #include "common.hpp" - -constexpr size_t TN = 16; - #include "joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp index d8f5e45474a77..bab88721fb1a9 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp @@ -6,8 +6,9 @@ // //===----------------------------------------------------------------------===// -#define TM 8 -#define TK 16 +constexpr size_t TM = 8; +constexpr size_t TN = 16; +constexpr size_t TK = 16; template void matrix_multiply(big_matrix &C, big_matrix &A, @@ -43,7 +44,6 @@ void matrix_multiply(big_matrix &C, big_matrix &A, sub_group sg = spmd_item.get_sub_group(); joint_matrix sub_a; - // For B, we assume B has been already VNNIed. 
joint_matrix sub_b; joint_matrix sub_c; diff --git a/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC.cpp b/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC.cpp index c62175a8af439..0921ae04df2e5 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC.cpp @@ -10,11 +10,8 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out -// XFAIL: run-mode +// XFAIL: gpu && run-mode // XFAIL-TRACKER: GSD-5768 #include "common.hpp" - -constexpr size_t TN = 16; - #include "joint_matrix_colA_rowB_colC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC_impl.hpp index 65b091477cae5..0ea98fdc357ab 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC_impl.hpp @@ -11,6 +11,7 @@ #include constexpr size_t TM = 8; +constexpr size_t TN = 16; constexpr size_t TK = 16; template sub_c; joint_matrix_fill(sg, sub_c, 1); for (int k = 0; k < K; k += TK) { - joint_matrix_load(sg, sub_a, pA + (sg_startx * TM) * K + k, K); + joint_matrix_load(sg, sub_a, pA + k * M + sg_startx * TM, M); joint_matrix_load(sg, sub_b, pB + k * N + sg_starty / sg_size * TN, N); joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); } - joint_matrix_store( - sg, sub_c, pC + (sg_startx * TM) * N + sg_starty / sg_size * TN, - N, layout::col_major); + joint_matrix_store(sg, sub_c, + pC + (sg_startx * TM) + + (sg_starty / sg_size * TN) * M, + M, layout::col_major); }); // parallel for }).wait(); } @@ -76,23 +78,28 @@ int main() { static constexpr size_t MATRIX_M = 1024; static constexpr size_t MATRIX_N = 1024; static constexpr size_t MATRIX_K = 1024; + queue q; - bfloat16 *A = malloc_shared(MATRIX_M * MATRIX_K, q); + bfloat16 *A = malloc_shared(MATRIX_K * MATRIX_M, q); bfloat16 *B = malloc_shared(MATRIX_K * MATRIX_N, q); - float *C = malloc_shared(MATRIX_M * MATRIX_N, q); - float *D = malloc_shared(MATRIX_M * 
MATRIX_N, q); + float *C = malloc_shared(MATRIX_N * MATRIX_M, q); + float *D = malloc_shared(MATRIX_N * MATRIX_M, q); - matrix_rand(MATRIX_M, MATRIX_K, A, (bfloat16)5); + matrix_rand(MATRIX_K, MATRIX_M, A, (bfloat16)5); matrix_rand(MATRIX_K, MATRIX_N, B, (bfloat16)5); - matrix_fill(MATRIX_M, MATRIX_N, C, (float)1.0); - matrix_fill(MATRIX_M, MATRIX_N, D, (float)1.0); + matrix_fill(MATRIX_N, MATRIX_M, D, (float)1.0); matrix_multiply(C, A, B, q); matrix_multiply_ref(A, B, D, MATRIX_M, MATRIX_N, MATRIX_K, - true /*transposed c*/); + /*transposed c*/ true, /*colmajor a*/ true); + + bool res = matrix_compare(MATRIX_N, MATRIX_M, C, D); - bool res = matrix_compare(MATRIX_M, MATRIX_N, C, D); + sycl::free(A, q); + sycl::free(B, q); + sycl::free(C, q); + sycl::free(D, q); std::cout << (res ? "passed" : "failed") << std::endl; return !res; From 03cb2b25026f060149eb94c85b228e5b3a780588 Mon Sep 17 00:00:00 2001 From: Chris Perkins Date: Wed, 8 Jan 2025 16:32:04 -0800 Subject: [PATCH 452/567] [SYCL] kernel_compiler remove GCC < 8 workarounds. (#16550) We can use experimental filesystem when compiling SYCL with GCC 7, which means we can remove the #ifdef hacks from the kernel_compiler. --- .../kernel_compiler/kernel_compiler_sycl.cpp | 138 +++++++----------- .../KernelCompiler/kernel_compiler_sycl.cpp | 15 +- 2 files changed, 55 insertions(+), 98 deletions(-) diff --git a/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.cpp b/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.cpp index 6362bf355cfc5..4590a17ecf67b 100644 --- a/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.cpp +++ b/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.cpp @@ -9,59 +9,26 @@ #include "kernel_compiler_sycl.hpp" #include // make_error_code -#if __GNUC__ && __GNUC__ < 8 - -// std::filesystem is not availalbe for GCC < 8 -// and much of the cross-platform file handling code depends upon it. 
-// Given that this extension is experimental and that the file -// handling aspects are most likely temporary, it makes sense to -// simply not support GCC<8. - -namespace sycl { -inline namespace _V1 { -namespace ext::oneapi::experimental { -namespace detail { - -bool SYCL_Compilation_Available() { return false; } - -spirv_vec_t -SYCL_to_SPIRV(const std::string &SYCLSource, include_pairs_t IncludePairs, - const std::vector &UserArgs, std::string *LogPtr, - const std::vector &RegisteredKernelNames) { - (void)SYCLSource; - (void)IncludePairs; - (void)UserArgs; - (void)LogPtr; - (void)RegisteredKernelNames; - throw sycl::exception(sycl::errc::build, - "kernel_compiler does not support GCC<8"); -} - -std::string userArgsAsString(const std::vector &UserArguments) { - return std::accumulate(UserArguments.begin(), UserArguments.end(), - std::string(""), - [](const std::string &A, const std::string &B) { - return A.empty() ? B : A + " " + B; - }); -} - -} // namespace detail -} // namespace ext::oneapi::experimental -} // namespace _V1 -} // namespace sycl - -#else - #include #include -#include #include #include #include #include #include +// For GCC versions less than 8, use experimental/filesystem. +#if defined(__has_include) && __has_include() +#include +namespace fs = std::filesystem; +#elif defined(__has_include) && __has_include() +#include +namespace fs = std::experimental::filesystem; +#else +#error "kernel_compiler sycl requires C++ filesystem support" +#endif + namespace sycl { inline namespace _V1 { namespace ext::oneapi::experimental { @@ -80,14 +47,12 @@ std::string generateSemiUniqueId() { // Combine time and random number into a string. 
std::stringstream Ss; - Ss << Milliseconds.count() << "_" << std::setfill('0') << std::setw(5) - << RandomNumber; + Ss << Milliseconds.count() << "_" << RandomNumber; return Ss.str(); } -std::filesystem::path prepareWS(const std::string &Id) { - namespace fs = std::filesystem; +fs::path prepareWS(const std::string &Id) { const fs::path TmpDirectoryPath = fs::temp_directory_path(); fs::path NewDirectoryPath = TmpDirectoryPath / Id; @@ -104,10 +69,10 @@ std::filesystem::path prepareWS(const std::string &Id) { return NewDirectoryPath; } -void deleteWS(const std::filesystem::path &ParentDir) { +void deleteWS(const fs::path &ParentDir) { try { - std::filesystem::remove_all(ParentDir); - } catch (const std::filesystem::filesystem_error &E) { + fs::remove_all(ParentDir); + } catch (const fs::filesystem_error &E) { // We could simply suppress this, since deleting the directory afterwards // is not critical. But if there are problems, seems good to know. throw sycl::exception(sycl::errc::build, E.what()); @@ -122,8 +87,7 @@ std::string userArgsAsString(const std::vector &UserArguments) { }); } -void outputPreamble(std::ofstream &Os, const std::filesystem::path &FilePath, - const std::string &Id, +void outputPreamble(std::ofstream &Os, const std::string &Id, const std::vector &UserArgs) { Os << "/*\n"; @@ -133,15 +97,15 @@ void outputPreamble(std::ofstream &Os, const std::filesystem::path &FilePath, Os << ".cpp \n */" << std::endl; } -std::filesystem::path -outputCpp(const std::filesystem::path &ParentDir, const std::string &Id, - std::string RawCodeString, const std::vector &UserArgs, - const std::vector &RegisteredKernelNames) { - std::filesystem::path FilePath = ParentDir / (Id + ".cpp"); +fs::path outputCpp(const fs::path &ParentDir, const std::string &Id, + std::string RawCodeString, + const std::vector &UserArgs, + const std::vector &RegisteredKernelNames) { + fs::path FilePath = ParentDir / (Id + ".cpp"); std::ofstream Outfile(FilePath, std::ios::out | 
std::ios::trunc); if (Outfile.is_open()) { - outputPreamble(Outfile, FilePath, Id, UserArgs); + outputPreamble(Outfile, Id, UserArgs); Outfile << RawCodeString << std::endl; // Temporarily needed until -c works with -fsycl-dump-spirv. @@ -161,12 +125,11 @@ outputCpp(const std::filesystem::path &ParentDir, const std::string &Id, return FilePath; } -void outputIncludeFiles(const std::filesystem::path &Dirpath, - include_pairs_t IncludePairs) { +void outputIncludeFiles(const fs::path &Dirpath, include_pairs_t IncludePairs) { using pairStrings = std::pair; for (pairStrings p : IncludePairs) { - std::filesystem::path FilePath = Dirpath / p.first; - std::filesystem::create_directories(FilePath.parent_path()); + fs::path FilePath = Dirpath / p.first; + fs::create_directories(FilePath.parent_path()); std::ofstream outfile(FilePath, std::ios::out | std::ios::trunc); if (outfile.is_open()) { outfile << p.second << std::endl; @@ -191,11 +154,10 @@ std::string getCompilerName() { // We are assuming that the compiler is in /bin and the shared lib in // the adjacent /lib. -std::filesystem::path getCompilerPath() { +fs::path getCompilerPath() { std::string Compiler = getCompilerName(); const std::string LibSYCLDir = sycl::detail::OSUtil::getCurrentDSODir(); - std::filesystem::path CompilerPath = - std::filesystem::path(LibSYCLDir) / ".." / "bin" / Compiler; + fs::path CompilerPath = fs::path(LibSYCLDir) / ".." 
/ "bin" / Compiler; return CompilerPath; } @@ -225,16 +187,15 @@ int invokeCommand(const std::string &command, std::string &output) { return 0; } -std::string invokeCompiler(const std::filesystem::path &FPath, - const std::filesystem::path &DPath, +std::string invokeCompiler(const fs::path &FPath, const fs::path &DPath, const std::string &Id, const std::vector &UserArgs, std::string *LogPtr) { - std::filesystem::path FilePath(FPath); - std::filesystem::path ParentDir(DPath); - std::filesystem::path TargetPath = ParentDir / (Id + ".bin"); - std::filesystem::path LogPath = ParentDir / "compilation_log.txt"; + fs::path FilePath(FPath); + fs::path ParentDir(DPath); + fs::path TargetPath = ParentDir / (Id + ".bin"); + fs::path LogPath = ParentDir / "compilation_log.txt"; std::string Compiler = getCompilerPath().make_preferred().string(); std::string Command = @@ -262,13 +223,13 @@ std::string invokeCompiler(const std::filesystem::path &FPath, return CompileLog; } -std::filesystem::path findSpv(const std::filesystem::path &ParentDir, - const std::string &Id, std::string &CompileLog) { +fs::path findSpv(const fs::path &ParentDir, const std::string &Id, + std::string &CompileLog) { std::regex PatternRegex(Id + R"(.*\.spv)"); // Iterate through all files in the directory matching the pattern. - for (const auto &Entry : std::filesystem::directory_iterator(ParentDir)) { - if (Entry.is_regular_file() && + for (const auto &Entry : fs::directory_iterator(ParentDir)) { + if (fs::is_regular_file(Entry.path()) && std::regex_match(Entry.path().filename().string(), PatternRegex)) { return Entry.path(); // Return the path if it matches the SPV pattern. 
} @@ -278,7 +239,7 @@ std::filesystem::path findSpv(const std::filesystem::path &ParentDir, throw sycl::exception(sycl::errc::build, "Compile failure: " + CompileLog); } -spirv_vec_t loadSpvFromFile(const std::filesystem::path &FileName) { +spirv_vec_t loadSpvFromFile(const fs::path &FileName) { std::ifstream SpvStream(FileName, std::ios::binary); SpvStream.seekg(0, std::ios::end); size_t Size = SpvStream.tellg(); @@ -294,23 +255,23 @@ SYCL_to_SPIRV(const std::string &SYCLSource, include_pairs_t IncludePairs, const std::vector &UserArgs, std::string *LogPtr, const std::vector &RegisteredKernelNames) { // clang-format off - const std::string id = generateSemiUniqueId(); - const std::filesystem::path ParentDir = prepareWS(id); - std::filesystem::path FilePath = outputCpp(ParentDir, id, SYCLSource, UserArgs, RegisteredKernelNames); - outputIncludeFiles(ParentDir, IncludePairs); - std::string CompileLog = invokeCompiler(FilePath, ParentDir, id, UserArgs, LogPtr); - std::filesystem::path SpvPath = findSpv(ParentDir, id, CompileLog); - spirv_vec_t Spv = loadSpvFromFile(SpvPath); - deleteWS(ParentDir); - return Spv; + const std::string id = generateSemiUniqueId(); + const fs::path ParentDir = prepareWS(id); + fs::path FilePath = outputCpp(ParentDir, id, SYCLSource, UserArgs, RegisteredKernelNames); + outputIncludeFiles(ParentDir, IncludePairs); + std::string CompileLog = invokeCompiler(FilePath, ParentDir, id, UserArgs, LogPtr); + fs::path SpvPath = findSpv(ParentDir, id, CompileLog); + spirv_vec_t Spv = loadSpvFromFile(SpvPath); + deleteWS(ParentDir); + return Spv; // clang-format on } bool SYCL_Compilation_Available() { // Is compiler on $PATH ? We try to invoke it. 
std::string id = generateSemiUniqueId(); - const std::filesystem::path tmp = std::filesystem::temp_directory_path(); - std::filesystem::path DumpPath = tmp / (id + "_version.txt"); + const fs::path tmp = fs::temp_directory_path(); + fs::path DumpPath = tmp / (id + "_version.txt"); std::string Compiler = getCompilerPath().make_preferred().string(); std::string TestCommand = Compiler + " --version > " + DumpPath.make_preferred().string(); @@ -323,7 +284,6 @@ bool SYCL_Compilation_Available() { } // namespace ext::oneapi::experimental } // namespace _V1 } // namespace sycl -#endif #if SYCL_EXT_JIT_ENABLE #include "../jit_compiler.hpp" diff --git a/sycl/test-e2e/KernelCompiler/kernel_compiler_sycl.cpp b/sycl/test-e2e/KernelCompiler/kernel_compiler_sycl.cpp index 27ab3401d4426..d8b020e971d4a 100644 --- a/sycl/test-e2e/KernelCompiler/kernel_compiler_sycl.cpp +++ b/sycl/test-e2e/KernelCompiler/kernel_compiler_sycl.cpp @@ -12,14 +12,11 @@ // -- Test the kernel_compiler with SYCL source. // RUN: %{build} -o %t.out -// If clang++ is not on the PATH, or if sycl was compiled with GCC < 8, then -// the kernel_compiler is not available for SYCL language. // Note: this 'invoking clang++' version for SYCL language support is temporary, // and will be replaced by the SYCL_JIT version soon. -// DEFINE: %{available} = %t.out available -// RUN: %if available %{ %{run} %t.out %} -// RUN: %if available %{ %{l0_leak_check} %{run} %t.out %} +// RUN: %{run} %t.out +// RUN: %{l0_leak_check} %{run} %t.out // -- Test again, with caching. 
// 'reading-from-cache' is just a string we pass to differentiate between the @@ -27,13 +24,13 @@ // DEFINE: %{cache_vars} = %{l0_leak_check} env SYCL_CACHE_PERSISTENT=1 SYCL_CACHE_TRACE=5 SYCL_CACHE_DIR=%t/cache_dir // RUN: rm -rf %t/cache_dir -// RUN: %if available %{ %{cache_vars} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK-WRITTEN-TO-CACHE %} -// RUN: %if available %{ %{cache_vars} %t.out reading-from-cache 2>&1 | FileCheck %s --check-prefixes=CHECK-READ-FROM-CACHE %} +// RUN: %{cache_vars} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK-WRITTEN-TO-CACHE +// RUN: %{cache_vars} %t.out reading-from-cache 2>&1 | FileCheck %s --check-prefixes=CHECK-READ-FROM-CACHE // -- Add leak check. // RUN: rm -rf %t/cache_dir -// RUN: %if available %{ %{l0_leak_check} %{cache_vars} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK-WRITTEN-TO-CACHE %} -// RUN: %if available %{ %{l0_leak_check} %{cache_vars} %t.out reading-from-cache 2>&1 | FileCheck %s --check-prefixes=CHECK-READ-FROM-CACHE %} +// RUN: %{l0_leak_check} %{cache_vars} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK-WRITTEN-TO-CACHE +// RUN: %{l0_leak_check} %{cache_vars} %t.out reading-from-cache 2>&1 | FileCheck %s --check-prefixes=CHECK-READ-FROM-CACHE // CHECK-WRITTEN-TO-CACHE: [Persistent Cache]: enabled // CHECK-WRITTEN-TO-CACHE-NOT: [kernel_compiler Persistent Cache]: using cached binary From a5f83c2851dcce48f238a770c03872668fb2511d Mon Sep 17 00:00:00 2001 From: Nikita Kornev Date: Thu, 9 Jan 2025 11:03:15 +0100 Subject: [PATCH 453/567] [CI] Fix sycl-rel-nightly.yml (#16560) The workflow was introduced with errors. Fix them and also turn off the pre-commit for changes in this workflow. 
--- .github/workflows/sycl-linux-build.yml | 4 ---- .github/workflows/sycl-linux-precommit.yml | 1 + .github/workflows/sycl-rel-nightly.yml | 8 +++++--- .github/workflows/sycl-windows-build.yml | 7 +++++++ .github/workflows/sycl-windows-precommit.yml | 1 + 5 files changed, 14 insertions(+), 7 deletions(-) diff --git a/.github/workflows/sycl-linux-build.yml b/.github/workflows/sycl-linux-build.yml index 0702ce93e0aab..6221afd481938 100644 --- a/.github/workflows/sycl-linux-build.yml +++ b/.github/workflows/sycl-linux-build.yml @@ -50,9 +50,6 @@ on: description: 'Artifacts retention period' type: string default: 3 - ref: - type: string - required: false outputs: build_conclusion: @@ -146,7 +143,6 @@ jobs: with: sparse-checkout: | devops/actions - ref: ${{ inputs.ref || github.sha }} # Cleanup will be run after all actions are completed. - name: Register cleanup after job is finished uses: ./devops/actions/cleanup diff --git a/.github/workflows/sycl-linux-precommit.yml b/.github/workflows/sycl-linux-precommit.yml index 978865ed5cd87..9d1825067a661 100644 --- a/.github/workflows/sycl-linux-precommit.yml +++ b/.github/workflows/sycl-linux-precommit.yml @@ -21,6 +21,7 @@ on: - '.github/workflows/sycl-windows-*.yml' - '.github/workflows/sycl-macos-*.yml' - '.github/workflows/sycl-nightly.yml' + - '.github/workflows/sycl-rel-nightly.yml' - 'devops/containers/**' - 'devops/actions/build_container/**' diff --git a/.github/workflows/sycl-rel-nightly.yml b/.github/workflows/sycl-rel-nightly.yml index ebd61e87756c3..96c691451965b 100644 --- a/.github/workflows/sycl-rel-nightly.yml +++ b/.github/workflows/sycl-rel-nightly.yml @@ -36,10 +36,10 @@ jobs: secrets: inherit with: build_cache_root: "/__w/" - build_artifact_suffix: v6 + build_artifact_suffix: default build_configure_extra_args: '--hip --cuda' merge_ref: '' - ref: sycl-rel-6_0_0 + build_ref: sycl-rel-6_0_0 # We upload the build for people to download/use, override its name and # prefer widespread gzip compression. 
@@ -118,10 +118,12 @@ jobs: if: ${{ github.repository == 'intel/llvm' && needs.check_for_new_commits.outputs.is_new_commit != 'false' }} uses: ./.github/workflows/sycl-windows-build.yml with: + merge_ref: '' + build_ref: sycl-rel-6_0_0 + # We upload both Linux/Windows build via Github's "Releases" # functionality, make sure Linux/Windows names follow the same pattern. artifact_archive_name: sycl_windows.tar.gz - build_ref: sycl-rel-6_0_0 e2e-win: needs: build-win diff --git a/.github/workflows/sycl-windows-build.yml b/.github/workflows/sycl-windows-build.yml index 23fc3291e2fc8..f7e13df7a97c7 100644 --- a/.github/workflows/sycl-windows-build.yml +++ b/.github/workflows/sycl-windows-build.yml @@ -18,6 +18,12 @@ on: description: 'Filter matches for the changed files in the PR' default: '[llvm, clang, sycl, llvm_spirv, xptifw, libclc, libdevice]' required: false + merge_ref: + description: | + Commit-ish to merge post-checkout if non-empty. Must be reachable from + the default_branch input paramter. 
+ type: string + default: 'FETCH_HEAD' artifact_archive_name: type: string default: llvm_sycl.tar.gz @@ -100,6 +106,7 @@ jobs: with: path: src ref: ${{ inputs.build_ref || github.sha }} + merge_ref: ${{ inputs.merge_ref }} cache_path: "D:\\\\github\\\\_work\\\\repo_cache\\\\" - name: Configure shell: cmd diff --git a/.github/workflows/sycl-windows-precommit.yml b/.github/workflows/sycl-windows-precommit.yml index ccd10950b3da2..8bc1fbd5b8e1a 100644 --- a/.github/workflows/sycl-windows-precommit.yml +++ b/.github/workflows/sycl-windows-precommit.yml @@ -20,6 +20,7 @@ on: - '.github/workflows/sycl-precommit-aws.yml' - '.github/workflows/sycl-macos-*.yml' - '.github/workflows/sycl-nightly.yml' + - '.github/workflows/sycl-rel-nightly.yml' - 'devops/containers/**' - 'devops/actions/build_container/**' From 6e0d90e73ed1b8d4d13c18638eae10e0b41979e5 Mon Sep 17 00:00:00 2001 From: "Jiang, Zhiwei" Date: Thu, 9 Jan 2025 18:05:25 +0800 Subject: [PATCH 454/567] [SYCLCompat] Fix vectorized_binary impl to make SYCLomatic migrated code run pass (#16553) --------- Signed-off-by: Jiang, Zhiwei --- sycl/doc/syclcompat/README.md | 19 + sycl/include/syclcompat/math.hpp | 39 +- .../math/math_emu_simd_from_syclomatic.cpp | 2966 +++++++++++++++++ 3 files changed, 2992 insertions(+), 32 deletions(-) create mode 100644 sycl/test-e2e/syclcompat/math/math_emu_simd_from_syclomatic.cpp diff --git a/sycl/doc/syclcompat/README.md b/sycl/doc/syclcompat/README.md index 68dbca50e5fdb..a518ea04e3e74 100644 --- a/sycl/doc/syclcompat/README.md +++ b/sycl/doc/syclcompat/README.md @@ -2090,6 +2090,25 @@ struct sub_sat { } // namespace syclcompat ``` +`vectorized_binary` also supports comparison operators from the standard library (`std::equal_to`, `std::not_equal_to`, etc) +and the semantics can be modified by changing the comparison operator template instantiation. For example: + +```cpp +unsigned int Input1; +unsigned int Input2; +// initialize inputs... 
+ +// Performs comparison on sycl::ushort2, following sycl::vec semantics +// Returns unsigned int containing, per vector element, 0xFFFF if true, and 0x0000 if false +syclcompat::vectorized_binary( + Input1, Input2, std::equal_to<>()); + +// Performs element-wise comparison on unsigned short +// Returns unsigned int containing, per vector element, 1 if true, and 0 if false +syclcompat::vectorized_binary( + Input1, Input2, std::equal_to()); +``` + The math header provides a set of functions to extend 32-bit operations to 33 bit, and handle sign extension internally. There is support for `add`, `sub`, `absdiff`, `min` and `max` operations. Each operation provides overloads diff --git a/sycl/include/syclcompat/math.hpp b/sycl/include/syclcompat/math.hpp index d58b7b6c7dc80..148817770468c 100644 --- a/sycl/include/syclcompat/math.hpp +++ b/sycl/include/syclcompat/math.hpp @@ -119,34 +119,13 @@ class vectorized_binary { } }; -// Vectorized_binary for logical operations template class vectorized_binary< VecT, BinaryOperation, - std::enable_if_t()( - std::declval(), - std::declval()))>>> { + std::void_t>> { public: inline VecT operator()(VecT a, VecT b, const BinaryOperation binary_op) { - unsigned result = 0; - constexpr size_t elem_size = 8 * sizeof(typename VecT::element_type); - static_assert(elem_size < 32, - "Vector element size must be less than 4 bytes"); - constexpr unsigned bool_mask = (1U << elem_size) - 1; - - for (size_t i = 0; i < a.size(); ++i) { - bool comp_result = binary_op(a[i], b[i]); - result |= (comp_result ? 
bool_mask : 0U) << (i * elem_size); - } - - VecT v4; - for (size_t i = 0; i < v4.size(); ++i) { - v4[i] = static_cast( - (result >> (i * elem_size)) & bool_mask); - } - - return v4; + return binary_op(a, b).template as(); } }; @@ -694,8 +673,9 @@ inline unsigned vectorized_unary(unsigned a, const UnaryOperation unary_op) { template inline unsigned vectorized_sum_abs_diff(unsigned a, unsigned b) { sycl::vec v0{a}, v1{b}; - auto v2 = v0.as(); - auto v3 = v1.as(); + // Need convert element type to wider signed type to avoid overflow. + auto v2 = v0.as().template convert(); + auto v3 = v1.as().template convert(); auto v4 = sycl::abs_diff(v2, v3); unsigned sum = 0; for (size_t i = 0; i < v4.size(); ++i) { @@ -1095,13 +1075,8 @@ inline unsigned vectorized_binary(unsigned a, unsigned b, auto v3 = v1.as(); auto v4 = detail::vectorized_binary()(v2, v3, binary_op); - if constexpr (!std::is_same_v< - bool, decltype(std::declval()( - std::declval(), - std::declval()))>) { - if (need_relu) - v4 = relu(v4); - } + if (need_relu) + v4 = relu(v4); v0 = v4.template as>(); return v0; } diff --git a/sycl/test-e2e/syclcompat/math/math_emu_simd_from_syclomatic.cpp b/sycl/test-e2e/syclcompat/math/math_emu_simd_from_syclomatic.cpp new file mode 100644 index 0000000000000..b9b274aa2442b --- /dev/null +++ b/sycl/test-e2e/syclcompat/math/math_emu_simd_from_syclomatic.cpp @@ -0,0 +1,2966 @@ +//===---- math_emu_simd_from_syclomatic.cpp ---------- *- C++ -* ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// This file is modified from the code migrated by SYCLomatic. 
+ +// REQUIRES: aspect-fp16 + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#include + +#include +#include +#include + +using namespace std; + +typedef pair Uint_pair; + +void checkResult(const string &FuncName, const vector &Inputs, + const unsigned int &Expect, const unsigned int &DeviceResult) { + cout << FuncName << "(" << Inputs[0]; + for (size_t i = 1; i < Inputs.size(); ++i) { + cout << ", " << Inputs[i]; + } + cout << ") = " << DeviceResult << " (expect " << Expect << ")"; + assert(DeviceResult == Expect); +} + +void vabs2(unsigned int *const DeviceResult, unsigned int Input1) { + *DeviceResult = + syclcompat::vectorized_unary(Input1, syclcompat::abs()); +} + +void testVabs2Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_ct1 = TestCase.first; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vabs2(DeviceResult, TestCase_first_ct1); + }); + }); + q_ct1.wait(); + checkResult("__vabs2", {TestCase.first}, TestCase.second, *DeviceResult); + } +} + +void vabs4(unsigned int *const DeviceResult, unsigned int Input1) { + *DeviceResult = + syclcompat::vectorized_unary(Input1, syclcompat::abs()); +} + +void testVabs4Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_ct1 = TestCase.first; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vabs4(DeviceResult, TestCase_first_ct1); + }); + }); + 
q_ct1.wait(); + checkResult("__vabs4", {TestCase.first}, TestCase.second, *DeviceResult); + } +} + +void vabsdiffs2(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, syclcompat::abs_diff()); +} + +void testVabsdiffs2Cases( + const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vabsdiffs2(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vabsdiffs2", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vabsdiffs4(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, syclcompat::abs_diff()); +} + +void testVabsdiffs4Cases( + const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vabsdiffs4(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vabsdiffs4", {TestCase.first.first, 
TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vabsdiffu2(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, syclcompat::abs_diff()); +} + +void testVabsdiffu2Cases( + const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vabsdiffu2(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vabsdiffu2", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vabsdiffu4(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, syclcompat::abs_diff()); +} + +void testVabsdiffu4Cases( + const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vabsdiffu4(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vabsdiffu4", {TestCase.first.first, TestCase.first.second}, + 
TestCase.second, *DeviceResult); + } +} + +void vabsss2(unsigned int *const DeviceResult, unsigned int Input1) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, 0, syclcompat::abs_diff()); +} + +void testVabsss2Cases( + const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_ct1 = TestCase.first; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vabsss2(DeviceResult, TestCase_first_ct1); + }); + }); + q_ct1.wait(); + checkResult("__vabsss2", {TestCase.first}, TestCase.second, *DeviceResult); + } +} + +void vabsss4(unsigned int *const DeviceResult, unsigned int Input1) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, 0, syclcompat::abs_diff()); +} + +void testVabsss4Cases( + const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_ct1 = TestCase.first; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vabsss4(DeviceResult, TestCase_first_ct1); + }); + }); + q_ct1.wait(); + checkResult("__vabsss4", {TestCase.first}, TestCase.second, *DeviceResult); + } +} + +void vadd2(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary(Input1, Input2, + std::plus<>()); +} + +void testVadd2Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int 
*)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vadd2(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vadd2", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vadd4(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary(Input1, Input2, + std::plus<>()); +} + +void testVadd4Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vadd4(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vadd4", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vaddss2(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, syclcompat::add_sat()); +} + +void testVaddss2Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : 
TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vaddss2(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vaddss2", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vaddss4(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, syclcompat::add_sat()); +} + +void testVaddss4Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vaddss4(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vaddss4", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vaddus2(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, syclcompat::add_sat()); +} + +void testVaddus2Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto 
TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vaddus2(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vaddus2", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vaddus4(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, syclcompat::add_sat()); +} + +void testVaddus4Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vaddus4(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vaddus4", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vavgs2(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, syclcompat::rhadd()); +} + +void testVavgs2Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto 
TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vavgs2(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vavgs2", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vavgs4(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, syclcompat::rhadd()); +} + +void testVavgs4Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vavgs4(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vavgs4", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vavgu2(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, syclcompat::rhadd()); +} + +void testVavgu2Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + 
sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vavgu2(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vavgu2", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vavgu4(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, syclcompat::rhadd()); +} + +void testVavgu4Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vavgu4(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vavgu4", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vcmpeq2(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, std::equal_to<>()); +} + +void testVcmpeq2Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + 
[=](sycl::nd_item<3> item_ct1) { + vcmpeq2(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vcmpeq2", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vcmpeq4(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, std::equal_to<>()); +} + +void testVcmpeq4Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vcmpeq4(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vcmpeq4", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vcmpges2(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, std::greater_equal<>()); +} + +void testVcmpges2Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vcmpges2(DeviceResult, 
TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vcmpges2", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vcmpges4(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, std::greater_equal<>()); +} + +void testVcmpges4Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vcmpges4(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vcmpges4", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vcmpgeu2(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, std::greater_equal<>()); +} + +void testVcmpgeu2Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vcmpgeu2(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); 
+ }); + q_ct1.wait(); + checkResult("__vcmpgeu2", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vcmpgeu4(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, std::greater_equal<>()); +} + +void testVcmpgeu4Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vcmpgeu4(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vcmpgeu4", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vcmpgts2(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary(Input1, Input2, + std::greater<>()); +} + +void testVcmpgts2Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vcmpgts2(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vcmpgts2", 
{TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vcmpgts4(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary(Input1, Input2, + std::greater<>()); +} + +void testVcmpgts4Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vcmpgts4(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vcmpgts4", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vcmpgtu2(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, std::greater<>()); +} + +void testVcmpgtu2Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vcmpgtu2(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vcmpgtu2", {TestCase.first.first, TestCase.first.second}, + TestCase.second, 
*DeviceResult); + } +} + +void vcmpgtu4(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary(Input1, Input2, + std::greater<>()); +} + +void testVcmpgtu4Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vcmpgtu4(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vcmpgtu4", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vcmples2(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, std::less_equal<>()); +} + +void testVcmples2Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vcmples2(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vcmples2", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vcmples4(unsigned int *const 
DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, std::less_equal<>()); +} + +void testVcmples4Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vcmples4(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vcmples4", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vcmpleu2(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, std::less_equal<>()); +} + +void testVcmpleu2Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vcmpleu2(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vcmpleu2", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vcmpleu4(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + 
*DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, std::less_equal<>()); +} + +void testVcmpleu4Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vcmpleu4(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vcmpleu4", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vcmplts2(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary(Input1, Input2, + std::less<>()); +} + +void testVcmplts2Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vcmplts2(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vcmplts2", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vcmplts4(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = + syclcompat::vectorized_binary(Input1, Input2, 
std::less<>()); +} + +void testVcmplts4Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vcmplts4(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vcmplts4", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vcmpltu2(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary(Input1, Input2, + std::less<>()); +} + +void testVcmpltu2Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vcmpltu2(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vcmpltu2", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vcmpltu4(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary(Input1, Input2, + std::less<>()); +} + +void testVcmpltu4Cases(const vector> &TestCases) { + 
sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vcmpltu4(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vcmpltu4", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vcmpne2(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, std::not_equal_to<>()); +} + +void testVcmpne2Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vcmpne2(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vcmpne2", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vcmpne4(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, std::not_equal_to<>()); +} + +void testVcmpne4Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int 
*DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vcmpne4(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vcmpne4", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vhaddu2(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, syclcompat::hadd()); +} + +void testVhaddu2Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vhaddu2(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vhaddu2", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vhaddu4(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, syclcompat::hadd()); +} + +void testVhaddu4Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int 
*)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vhaddu4(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vhaddu4", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vmaxs2(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, syclcompat::maximum()); +} + +void testVmaxs2Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vmaxs2(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vmaxs2", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vmaxs4(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, syclcompat::maximum()); +} + +void testVmaxs4Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto 
&TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vmaxs4(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vmaxs4", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vmaxu2(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, syclcompat::maximum()); +} + +void testVmaxu2Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vmaxu2(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vmaxu2", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vmaxu4(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, syclcompat::maximum()); +} + +void testVmaxu4Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto 
TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vmaxu4(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vmaxu4", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vmins2(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, syclcompat::minimum()); +} + +void testVmins2Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vmins2(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vmins2", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vmins4(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, syclcompat::minimum()); +} + +void testVmins4Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto 
TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vmins4(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vmins4", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vminu2(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, syclcompat::minimum()); +} + +void testVminu2Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vminu2(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vminu2", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vminu4(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, syclcompat::minimum()); +} + +void testVminu4Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + 
sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vminu4(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vminu4", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vneg2(unsigned int *const DeviceResult, unsigned int Input1) { + *DeviceResult = + syclcompat::vectorized_unary(Input1, std::negate<>()); +} + +void testVneg2Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_ct1 = TestCase.first; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vneg2(DeviceResult, TestCase_first_ct1); + }); + }); + q_ct1.wait(); + checkResult("__vneg2", {TestCase.first}, TestCase.second, *DeviceResult); + } +} + +void vneg4(unsigned int *const DeviceResult, unsigned int Input1) { + *DeviceResult = + syclcompat::vectorized_unary(Input1, std::negate<>()); +} + +void testVneg4Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_ct1 = TestCase.first; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vneg4(DeviceResult, TestCase_first_ct1); + }); + }); + q_ct1.wait(); + checkResult("__vneg4", {TestCase.first}, TestCase.second, *DeviceResult); + } +} + +void vnegss2(unsigned int *const DeviceResult, unsigned int Input1) { + *DeviceResult = 
syclcompat::vectorized_binary( + 0, Input1, syclcompat::sub_sat()); +} + +void testVnegss2Cases( + const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_ct1 = TestCase.first; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vnegss2(DeviceResult, TestCase_first_ct1); + }); + }); + q_ct1.wait(); + checkResult("__vnegss2", {TestCase.first}, TestCase.second, *DeviceResult); + } +} + +void vnegss4(unsigned int *const DeviceResult, unsigned int Input1) { + *DeviceResult = syclcompat::vectorized_binary( + 0, Input1, syclcompat::sub_sat()); +} + +void testVnegss4Cases( + const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_ct1 = TestCase.first; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vnegss4(DeviceResult, TestCase_first_ct1); + }); + }); + q_ct1.wait(); + checkResult("__vnegss4", {TestCase.first}, TestCase.second, *DeviceResult); + } +} + +void vsads2(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = + syclcompat::vectorized_sum_abs_diff(Input1, Input2); +} + +void testVsads2Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto 
TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vsads2(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vsads2", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vsads4(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = + syclcompat::vectorized_sum_abs_diff(Input1, Input2); +} + +void testVsads4Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vsads4(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vsads4", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vsadu2(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = + syclcompat::vectorized_sum_abs_diff(Input1, Input2); +} + +void testVsadu2Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + 
cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vsadu2(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vsadu2", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vsadu4(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = + syclcompat::vectorized_sum_abs_diff(Input1, Input2); +} + +void testVsadu4Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vsadu4(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vsadu4", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vseteq2(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, std::equal_to()); +} + +void testVseteq2Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + 
[=](sycl::nd_item<3> item_ct1) { + vseteq2(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vseteq2", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vseteq4(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, std::equal_to()); +} + +void testVseteq4Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vseteq4(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vseteq4", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vsetges2(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, std::greater_equal()); +} + +void testVsetges2Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vsetges2(DeviceResult, 
TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vsetges2", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vsetges4(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, std::greater_equal()); +} + +void testVsetges4Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vsetges4(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vsetges4", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vsetgeu2(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, std::greater_equal()); +} + +void testVsetgeu2Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vsetgeu2(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + 
}); + q_ct1.wait(); + checkResult("__vsetgeu2", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vsetgeu4(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, std::greater_equal()); +} + +void testVsetgeu4Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vsetgeu4(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vsetgeu4", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vsetgts2(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, std::greater()); +} + +void testVsetgts2Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vsetgts2(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vsetgts2", {TestCase.first.first, 
TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vsetgts4(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, std::greater()); +} + +void testVsetgts4Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vsetgts4(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vsetgts4", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vsetgtu2(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, std::greater()); +} + +void testVsetgtu2Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vsetgtu2(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vsetgtu2", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void 
vsetgtu4(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, std::greater()); +} + +void testVsetgtu4Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vsetgtu4(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vsetgtu4", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vsetles2(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, std::less_equal()); +} + +void testVsetles2Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vsetles2(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vsetles2", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vsetles4(unsigned int *const DeviceResult, unsigned int Input1, + unsigned 
int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, std::less_equal()); +} + +void testVsetles4Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vsetles4(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vsetles4", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vsetleu2(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, std::less_equal()); +} + +void testVsetleu2Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vsetleu2(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vsetleu2", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vsetleu4(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + 
Input1, Input2, std::less_equal()); +} + +void testVsetleu4Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vsetleu4(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vsetleu4", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vsetlts2(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, std::less()); +} + +void testVsetlts2Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vsetlts2(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vsetlts2", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vsetlts4(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary(Input1, Input2, + std::less()); +} + +void testVsetlts4Cases(const vector> 
&TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vsetlts4(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vsetlts4", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vsetltu2(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, std::less()); +} + +void testVsetltu2Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vsetltu2(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vsetltu2", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vsetltu4(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, std::less()); +} + +void testVsetltu4Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int 
*DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vsetltu4(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vsetltu4", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vsetne2(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, std::not_equal_to()); +} + +void testVsetne2Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vsetne2(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vsetne2", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vsetne4(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, std::not_equal_to()); +} + +void testVsetne4Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int 
*)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vsetne4(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vsetne4", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vsub2(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary(Input1, Input2, + std::minus<>()); +} + +void testVsub2Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vsub2(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vsub2", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vsub4(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary(Input1, Input2, + std::minus<>()); +} + +void testVsub4Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + 
q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vsub4(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vsub4", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vsubss2(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, syclcompat::sub_sat()); +} + +void testVsubss2Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vsubss2(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vsubss2", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vsubss4(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, syclcompat::sub_sat()); +} + +void testVsubss4Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = 
TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vsubss4(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vsubss4", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vsubus2(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, syclcompat::sub_sat()); +} + +void testVsubus2Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vsubus2(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vsubus2", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +void vsubus4(unsigned int *const DeviceResult, unsigned int Input1, + unsigned int Input2) { + *DeviceResult = syclcompat::vectorized_binary( + Input1, Input2, syclcompat::sub_sat()); +} + +void testVsubus4Cases(const vector> &TestCases) { + sycl::queue q_ct1 = syclcompat::get_default_queue(); + unsigned int *DeviceResult; + DeviceResult = + (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1); + for (const auto &TestCase : TestCases) { + q_ct1.submit([&](sycl::handler &cgh) { + auto TestCase_first_first_ct1 = TestCase.first.first; + auto TestCase_first_second_ct2 = 
TestCase.first.second; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + vsubus4(DeviceResult, TestCase_first_first_ct1, + TestCase_first_second_ct2); + }); + }); + q_ct1.wait(); + checkResult("__vsubus4", {TestCase.first.first, TestCase.first.second}, + TestCase.second, *DeviceResult); + } +} + +int main() { + testVabs2Cases({ + {214321, 214321}, + {3, 3}, + {2147483647, 2147418113}, // 7FFF,FFFF-->7FFF,0001 + {0, 0}, + {4294967295, 65537}, // FFFF,FFFF-->0001,0001 + }); + testVabs4Cases({ + {214321, 214321}, + {3, 3}, + {2147483647, 2130772225}, // 7F,FF,FF,FF-->7F,01,01,01 + {0, 0}, + {4294967295, 16843009}, // FF,FF,FF,FF-->01,01,01,01 + }); + testVabsdiffs2Cases({ + {{4, 3}, 1}, + {{214321, 2147483647}, 2147239218}, + {{4294967295, 2147483647}, 2147483648}, + {{4294967295, 4294967295}, 0}, + {{3, 4}, 1}, + }); + testVabsdiffs4Cases({ + {{4, 3}, 1}, + {{214321, 2147483647}, 2130986546}, + {{4294967295, 2147483647}, 2147483648}, + {{4294967295, 4294967295}, 0}, + {{3, 4}, 1}, + }); + testVabsdiffu2Cases({ + {{4, 3}, 1}, + {{214321, 2147483647}, 2147269326}, + {{4294967295, 2147483647}, 2147483648}, + {{4294967295, 4294967295}, 0}, + {{3, 4}, 1}, + }); + testVabsdiffu4Cases({ + {{4, 3}, 1}, + {{214321, 2147483647}, 2147269326}, + {{4294967295, 2147483647}, 2147483648}, + {{4294967295, 4294967295}, 0}, + {{3, 4}, 1}, + }); + testVabsss2Cases({ + {214321, 214321}, + {3, 3}, + {2147483647, 2147418113}, + {0, 0}, + {4294967295, 65537}, + }); + testVabsss4Cases({ + {214321, 214321}, + {3, 3}, + {2147483647, 2130772225}, + {0, 0}, + {4294967295, 16843009}, + }); + testVadd2Cases({ + {{4, 3}, 7}, + {{214321, 2147483647}, 2147632432}, + {{4294967295, 2147483647}, 2147418110}, + {{4294967295, 4294967295}, 4294901758}, + {{3, 4}, 7}, + }); + testVadd4Cases({ + {{4, 3}, 7}, + {{214321, 2147483647}, 2130854960}, + {{4294967295, 2147483647}, 2130640638}, + {{4294967295, 4294967295}, 4278124286}, + 
{{3, 4}, 7}, + }); + testVaddss2Cases({ + {{4, 3}, 7}, + {{214321, 2147483647}, 2147435824}, // 3,4531+7FFF,FFFF-->7FFF,4530 + {{4294967295, 2147483647}, 2147418110}, + {{4294967295, 4294967295}, 4294901758}, + {{3, 4}, 7}, + }); + testVaddss4Cases({ + {{4, 3}, 7}, + {{214321, 2147483647}, 2130854960}, + {{4294967295, 2147483647}, 2130640638}, + {{4294967295, 4294967295}, 4278124286}, + {{3, 4}, 7}, + }); + testVaddus2Cases({ + {{4, 3}, 7}, + {{214321, 2147483647}, 2147680255}, + {{4294967295, 2147483647}, 4294967295}, + {{4294967295, 4294967295}, 4294967295}, + {{3, 4}, 7}, + }); + testVaddus4Cases({ + {{4, 3}, 7}, + {{214321, 2147483647}, 2147483647}, + {{4294967295, 2147483647}, 4294967295}, + {{4294967295, 4294967295}, 4294967295}, + {{3, 4}, 7}, + }); + testVavgs2Cases({ + {{4, 3}, 4}, + {{214321, 2147483647}, 1073816216}, + {{4294967295, 2147483647}, 1073741823}, + {{4294967295, 4294967295}, 4294967295}, + {{3, 4}, 4}, + }); + testVavgs4Cases({ + {{4, 3}, 4}, + {{214321, 2147483647}, 1073816088}, + {{4294967295, 2147483647}, 1073741823}, + {{4294967295, 4294967295}, 4294967295}, + {{3, 4}, 4}, + }); + testVavgu2Cases({ + {{4, 3}, 4}, + {{214321, 2147483647}, 1073848984}, + {{4294967295, 2147483647}, 3221225471}, + {{4294967295, 4294967295}, 4294967295}, + {{3, 4}, 4}, + }); + testVavgu4Cases({ + {{4, 3}, 4}, + {{214321, 2147483647}, 1082237592}, + {{4294967295, 2147483647}, 3221225471}, + {{4294967295, 4294967295}, 4294967295}, + {{3, 4}, 4}, + }); + testVcmpeq2Cases({ + {{4, 3}, 4294901760}, + {{214321, 2147483647}, 0}, + {{4294967295, 2147483647}, 65535}, + {{4294967295, 4294967295}, 4294967295}, + {{3, 4}, 4294901760}, + }); + testVcmpeq4Cases({ + {{4, 3}, 4294967040}, + {{214321, 2147483647}, 0}, + {{4294967295, 2147483647}, 16777215}, + {{4294967295, 4294967295}, 4294967295}, + {{3, 4}, 4294967040}, + }); + testVcmpges2Cases({ + {{4, 3}, 4294967295}, + {{214321, 2147483647}, 65535}, + {{4294967295, 2147483647}, 65535}, + {{4294967295, 4294967295}, 
4294967295}, + {{3, 4}, 4294901760}, + }); + testVcmpges4Cases({ + {{4, 3}, 4294967295}, + {{214321, 2147483647}, 16777215}, + {{4294967295, 2147483647}, 16777215}, + {{4294967295, 4294967295}, 4294967295}, + {{3, 4}, 4294967040}, + }); + testVcmpgeu2Cases({ + {{4, 3}, 4294967295}, + {{214321, 2147483647}, 0}, + {{4294967295, 2147483647}, 4294967295}, + {{4294967295, 4294967295}, 4294967295}, + {{3, 4}, 4294901760}, + }); + testVcmpgeu4Cases({ + {{4, 3}, 4294967295}, + {{214321, 2147483647}, 0}, + {{4294967295, 2147483647}, 4294967295}, + {{4294967295, 4294967295}, 4294967295}, + {{3, 4}, 4294967040}, + }); + testVcmpgts2Cases({ + {{4, 3}, 65535}, + {{214321, 2147483647}, 65535}, + {{4294967295, 2147483647}, 0}, + {{4294967295, 4294967295}, 0}, + {{3, 4}, 0}, + }); + testVcmpgts4Cases({ + {{4, 3}, 255}, + {{214321, 2147483647}, 16777215}, + {{4294967295, 2147483647}, 0}, + {{4294967295, 4294967295}, 0}, + {{3, 4}, 0}, + }); + testVcmpgtu2Cases({ + {{4, 3}, 65535}, + {{214321, 2147483647}, 0}, + {{4294967295, 2147483647}, 4294901760}, + {{4294967295, 4294967295}, 0}, + {{3, 4}, 0}, + }); + testVcmpgtu4Cases({ + {{4, 3}, 255}, + {{214321, 2147483647}, 0}, + {{4294967295, 2147483647}, 4278190080}, + {{4294967295, 4294967295}, 0}, + {{3, 4}, 0}, + }); + testVcmples2Cases({ + {{4, 3}, 4294901760}, + {{214321, 2147483647}, 4294901760}, + {{4294967295, 2147483647}, 4294967295}, + {{4294967295, 4294967295}, 4294967295}, + {{3, 4}, 4294967295}, + }); + testVcmples4Cases({ + {{4, 3}, 4294967040}, + {{214321, 2147483647}, 4278190080}, + {{4294967295, 2147483647}, 4294967295}, + {{4294967295, 4294967295}, 4294967295}, + {{3, 4}, 4294967295}, + }); + testVcmpleu2Cases({ + {{4, 3}, 4294901760}, + {{214321, 2147483647}, 4294967295}, + {{4294967295, 2147483647}, 65535}, + {{4294967295, 4294967295}, 4294967295}, + {{3, 4}, 4294967295}, + }); + testVcmpleu4Cases({ + {{4, 3}, 4294967040}, + {{214321, 2147483647}, 4294967295}, + {{4294967295, 2147483647}, 16777215}, + {{4294967295, 
4294967295}, 4294967295}, + {{3, 4}, 4294967295}, + }); + testVcmplts2Cases({ + {{4, 3}, 0}, + {{214321, 2147483647}, 4294901760}, + {{4294967295, 2147483647}, 4294901760}, + {{4294967295, 4294967295}, 0}, + {{3, 4}, 65535}, + }); + testVcmplts4Cases({ + {{4, 3}, 0}, + {{214321, 2147483647}, 4278190080}, + {{4294967295, 2147483647}, 4278190080}, + {{4294967295, 4294967295}, 0}, + {{3, 4}, 255}, + }); + testVcmpltu2Cases({ + {{4, 3}, 0}, + {{214321, 2147483647}, 4294967295}, + {{4294967295, 2147483647}, 0}, + {{4294967295, 4294967295}, 0}, + {{3, 4}, 65535}, + }); + testVcmpltu4Cases({ + {{4, 3}, 0}, + {{214321, 2147483647}, 4294967295}, + {{4294967295, 2147483647}, 0}, + {{4294967295, 4294967295}, 0}, + {{3, 4}, 255}, + }); + testVcmpne2Cases({ + {{4, 3}, 65535}, + {{214321, 2147483647}, 4294967295}, + {{4294967295, 2147483647}, 4294901760}, + {{4294967295, 4294967295}, 0}, + {{3, 4}, 65535}, + }); + testVcmpne4Cases({ + {{4, 3}, 255}, + {{214321, 2147483647}, 4294967295}, + {{4294967295, 2147483647}, 4278190080}, + {{4294967295, 4294967295}, 0}, + {{3, 4}, 255}, + }); + testVhaddu2Cases({ + {{4, 3}, 3}, + {{214321, 2147483647}, 1073848984}, + {{4294967295, 2147483647}, 3221225471}, + {{4294967295, 4294967295}, 4294967295}, + {{3, 4}, 3}, + }); + testVhaddu4Cases({ + {{4, 3}, 3}, + {{214321, 2147483647}, 1065460376}, + {{4294967295, 2147483647}, 3221225471}, + {{4294967295, 4294967295}, 4294967295}, + {{3, 4}, 3}, + }); + testVmaxs2Cases({ + {{4, 3}, 4}, + {{214321, 2147483647}, 2147435825}, + {{4294967295, 2147483647}, 2147483647}, + {{4294967295, 4294967295}, 4294967295}, + {{3, 4}, 4}, + }); + testVmaxs4Cases({ + {{4, 3}, 4}, + {{214321, 2147483647}, 2130920753}, + {{4294967295, 2147483647}, 2147483647}, + {{4294967295, 4294967295}, 4294967295}, + {{3, 4}, 4}, + }); + testVmaxu2Cases({ + {{4, 3}, 4}, + {{214321, 2147483647}, 2147483647}, + {{4294967295, 2147483647}, 4294967295}, + {{4294967295, 4294967295}, 4294967295}, + {{3, 4}, 4}, + }); + testVmaxu4Cases({ + 
{{4, 3}, 4}, + {{214321, 2147483647}, 2147483647}, + {{4294967295, 2147483647}, 4294967295}, + {{4294967295, 4294967295}, 4294967295}, + {{3, 4}, 4}, + }); + testVmins2Cases({ + {{4, 3}, 3}, + {{214321, 2147483647}, 262143}, + {{4294967295, 2147483647}, 4294967295}, + {{4294967295, 4294967295}, 4294967295}, + {{3, 4}, 3}, + }); + testVmins4Cases({ + {{4, 3}, 3}, + {{214321, 2147483647}, 16777215}, + {{4294967295, 2147483647}, 4294967295}, + {{4294967295, 4294967295}, 4294967295}, + {{3, 4}, 3}, + }); + testVminu2Cases({ + {{4, 3}, 3}, + {{214321, 2147483647}, 214321}, + {{4294967295, 2147483647}, 2147483647}, + {{4294967295, 4294967295}, 4294967295}, + {{3, 4}, 3}, + }); + testVminu4Cases({ + {{4, 3}, 3}, + {{214321, 2147483647}, 214321}, + {{4294967295, 2147483647}, 2147483647}, + {{4294967295, 4294967295}, 4294967295}, + {{3, 4}, 3}, + }); + testVneg2Cases({ + {214321, 4294818511}, + {3, 65533}, + {2147483647, 2147549185}, + {0, 0}, + {4294967295, 65537}, + }); + testVneg4Cases({ + {214321, 16628687}, + {3, 253}, + {2147483647, 2164326657}, + {0, 0}, + {4294967295, 16843009}, + }); + testVnegss2Cases({ + {214321, 4294818511}, + {3, 65533}, + {2147483647, 2147549185}, + {0, 0}, + {4294967295, 65537}, + }); + testVnegss4Cases({ + {214321, 16628687}, + {3, 253}, + {2147483647, 2164326657}, + {0, 0}, + {4294967295, 16843009}, + }); + testVsads2Cases({ + {{4, 3}, 1}, + {{214321, 2147483647}, 50478}, + {{4294967295, 2147483647}, 32768}, + {{4294967295, 4294967295}, 0}, + {{3, 4}, 1}, + }); + testVsads4Cases({ + {{4, 3}, 1}, + {{214321, 2147483647}, 251}, + {{4294967295, 2147483647}, 128}, + {{4294967295, 4294967295}, 0}, + {{3, 4}, 1}, + }); + testVsadu2Cases({ + {{4, 3}, 1}, + {{214321, 2147483647}, 80586}, + {{4294967295, 2147483647}, 32768}, + {{4294967295, 4294967295}, 0}, + {{3, 4}, 1}, + }); + testVsadu4Cases({ + {{4, 3}, 1}, + {{214321, 2147483647}, 771}, + {{4294967295, 2147483647}, 128}, + {{4294967295, 4294967295}, 0}, + {{3, 4}, 1}, + }); + 
testVseteq2Cases({ + {{4, 3}, 65536}, + {{214321, 2147483647}, 0}, + {{4294967295, 2147483647}, 1}, + {{4294967295, 4294967295}, 65537}, + {{3, 4}, 65536}, + }); + testVseteq4Cases({ + {{4, 3}, 16843008}, + {{214321, 2147483647}, 0}, + {{4294967295, 2147483647}, 65793}, + {{4294967295, 4294967295}, 16843009}, + {{3, 4}, 16843008}, + }); + testVsetges2Cases({ + {{4, 3}, 65537}, + {{214321, 2147483647}, 1}, + {{4294967295, 2147483647}, 1}, + {{4294967295, 4294967295}, 65537}, + {{3, 4}, 65536}, + }); + testVsetges4Cases({ + {{4, 3}, 16843009}, + {{214321, 2147483647}, 65793}, + {{4294967295, 2147483647}, 65793}, + {{4294967295, 4294967295}, 16843009}, + {{3, 4}, 16843008}, + }); + testVsetgeu2Cases({ + {{4, 3}, 65537}, + {{214321, 2147483647}, 0}, + {{4294967295, 2147483647}, 65537}, + {{4294967295, 4294967295}, 65537}, + {{3, 4}, 65536}, + }); + testVsetgeu4Cases({ + {{4, 3}, 16843009}, + {{214321, 2147483647}, 0}, + {{4294967295, 2147483647}, 16843009}, + {{4294967295, 4294967295}, 16843009}, + {{3, 4}, 16843008}, + }); + testVsetgts2Cases({ + {{4, 3}, 1}, + {{214321, 2147483647}, 1}, + {{4294967295, 2147483647}, 0}, + {{4294967295, 4294967295}, 0}, + {{3, 4}, 0}, + }); + testVsetgts4Cases({ + {{4, 3}, 1}, + {{214321, 2147483647}, 65793}, + {{4294967295, 2147483647}, 0}, + {{4294967295, 4294967295}, 0}, + {{3, 4}, 0}, + }); + testVsetgtu2Cases({ + {{4, 3}, 1}, + {{214321, 2147483647}, 0}, + {{4294967295, 2147483647}, 65536}, + {{4294967295, 4294967295}, 0}, + {{3, 4}, 0}, + }); + testVsetgtu4Cases({ + {{4, 3}, 1}, + {{214321, 2147483647}, 0}, + {{4294967295, 2147483647}, 16777216}, + {{4294967295, 4294967295}, 0}, + {{3, 4}, 0}, + }); + testVsetles2Cases({ + {{4, 3}, 65536}, + {{214321, 2147483647}, 65536}, + {{4294967295, 2147483647}, 65537}, + {{4294967295, 4294967295}, 65537}, + {{3, 4}, 65537}, + }); + testVsetles4Cases({ + {{4, 3}, 16843008}, + {{214321, 2147483647}, 16777216}, + {{4294967295, 2147483647}, 16843009}, + {{4294967295, 4294967295}, 16843009}, + 
{{3, 4}, 16843009}, + }); + testVsetleu2Cases({ + {{4, 3}, 65536}, + {{214321, 2147483647}, 65537}, + {{4294967295, 2147483647}, 1}, + {{4294967295, 4294967295}, 65537}, + {{3, 4}, 65537}, + }); + testVsetleu4Cases({ + {{4, 3}, 16843008}, + {{214321, 2147483647}, 16843009}, + {{4294967295, 2147483647}, 65793}, + {{4294967295, 4294967295}, 16843009}, + {{3, 4}, 16843009}, + }); + testVsetlts2Cases({ + {{4, 3}, 0}, + {{214321, 2147483647}, 65536}, + {{4294967295, 2147483647}, 65536}, + {{4294967295, 4294967295}, 0}, + {{3, 4}, 1}, + }); + testVsetlts4Cases({ + {{4, 3}, 0}, + {{214321, 2147483647}, 16777216}, + {{4294967295, 2147483647}, 16777216}, + {{4294967295, 4294967295}, 0}, + {{3, 4}, 1}, + }); + testVsetltu2Cases({ + {{4, 3}, 0}, + {{214321, 2147483647}, 65537}, + {{4294967295, 2147483647}, 0}, + {{4294967295, 4294967295}, 0}, + {{3, 4}, 1}, + }); + testVsetltu4Cases({ + {{4, 3}, 0}, + {{214321, 2147483647}, 16843009}, + {{4294967295, 2147483647}, 0}, + {{4294967295, 4294967295}, 0}, + {{3, 4}, 1}, + }); + testVsetne2Cases({ + {{4, 3}, 1}, + {{214321, 2147483647}, 65537}, + {{4294967295, 2147483647}, 65536}, + {{4294967295, 4294967295}, 0}, + {{3, 4}, 1}, + }); + testVsetne4Cases({ + {{4, 3}, 1}, + {{214321, 2147483647}, 16843009}, + {{4294967295, 2147483647}, 16777216}, + {{4294967295, 4294967295}, 0}, + {{3, 4}, 1}, + }); + testVsub2Cases({ + {{4, 3}, 1}, + {{214321, 2147483647}, 2147763506}, + {{4294967295, 2147483647}, 2147483648}, + {{4294967295, 4294967295}, 0}, + {{3, 4}, 65535}, + }); + testVsub4Cases({ + {{4, 3}, 1}, + {{214321, 2147483647}, 2164540978}, + {{4294967295, 2147483647}, 2147483648}, + {{4294967295, 4294967295}, 0}, + {{3, 4}, 255}, + }); + testVsubss2Cases({ + {{4, 3}, 1}, + {{214321, 2147483647}, 2147763506}, + {{4294967295, 2147483647}, 2147483648}, + {{4294967295, 4294967295}, 0}, + {{3, 4}, 65535}, + }); + testVsubss4Cases({ + {{4, 3}, 1}, + {{214321, 2147483647}, 2164540978}, + {{4294967295, 2147483647}, 2147483648}, + {{4294967295, 
4294967295}, 0}, + {{3, 4}, 255}, + }); + testVsubus2Cases({ + {{4, 3}, 1}, + {{214321, 2147483647}, 0}, + {{4294967295, 2147483647}, 2147483648}, + {{4294967295, 4294967295}, 0}, + {{3, 4}, 0}, + }); + testVsubus4Cases({ + {{4, 3}, 1}, + {{214321, 2147483647}, 0}, + {{4294967295, 2147483647}, 2147483648}, + {{4294967295, 4294967295}, 0}, + {{3, 4}, 0}, + }); + return 0; +} From ac9e5d98d7ae33f6e6ce54609c65ce31ad75df88 Mon Sep 17 00:00:00 2001 From: Kseniya Tikhomirova Date: Thu, 9 Jan 2025 11:10:18 +0100 Subject: [PATCH 455/567] [SYCL] Unload libraries in jit_compiler and kernel_compiler_opencl destructors (#16517) Fixes memory leaks in jit_compiler and kernel_compiler_opencl classes. Libraries loaded to provide compilation utils have to be released. --------- Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/jit_compiler.cpp | 29 +++++++++++++------ sycl/source/detail/jit_compiler.hpp | 5 +++- .../kernel_compiler_opencl.cpp | 26 ++++++++++------- 3 files changed, 39 insertions(+), 21 deletions(-) diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp index daec9af9ff6dc..79e3a767bb562 100644 --- a/sycl/source/detail/jit_compiler.cpp +++ b/sycl/source/detail/jit_compiler.cpp @@ -23,13 +23,21 @@ namespace sycl { inline namespace _V1 { namespace detail { +std::function jit_compiler::CustomDeleterForLibHandle = + [](void *StoredPtr) { + if (!StoredPtr) + return; + std::ignore = sycl::detail::ur::unloadOsLibrary(StoredPtr); + }; + static inline void printPerformanceWarning(const std::string &Message) { if (detail::SYCLConfig::get() > 0) { std::cerr << "WARNING: " << Message << "\n"; } } -jit_compiler::jit_compiler() { +jit_compiler::jit_compiler() + : LibraryHandle(nullptr, CustomDeleterForLibHandle) { auto checkJITLibrary = [this]() -> bool { #ifdef _WIN32 static const std::string dir = sycl::detail::OSUtil::getCurrentDSODir(); @@ -37,15 +45,16 @@ jit_compiler::jit_compiler() { #else static const std::string JITLibraryName = 
"libsycl-jit.so"; #endif - - void *LibraryPtr = sycl::detail::ur::loadOsLibrary(JITLibraryName); + std::unique_ptr LibraryPtr( + sycl::detail::ur::loadOsLibrary(JITLibraryName), + CustomDeleterForLibHandle); if (LibraryPtr == nullptr) { printPerformanceWarning("Could not find JIT library " + JITLibraryName); return false; } this->AddToConfigHandle = reinterpret_cast( - sycl::detail::ur::getOsLibraryFuncAddress(LibraryPtr, + sycl::detail::ur::getOsLibraryFuncAddress(LibraryPtr.get(), "addToJITConfiguration")); if (!this->AddToConfigHandle) { printPerformanceWarning( @@ -54,7 +63,7 @@ jit_compiler::jit_compiler() { } this->ResetConfigHandle = reinterpret_cast( - sycl::detail::ur::getOsLibraryFuncAddress(LibraryPtr, + sycl::detail::ur::getOsLibraryFuncAddress(LibraryPtr.get(), "resetJITConfiguration")); if (!this->ResetConfigHandle) { printPerformanceWarning( @@ -63,7 +72,8 @@ jit_compiler::jit_compiler() { } this->FuseKernelsHandle = reinterpret_cast( - sycl::detail::ur::getOsLibraryFuncAddress(LibraryPtr, "fuseKernels")); + sycl::detail::ur::getOsLibraryFuncAddress(LibraryPtr.get(), + "fuseKernels")); if (!this->FuseKernelsHandle) { printPerformanceWarning( "Cannot resolve JIT library function entry point"); @@ -73,7 +83,7 @@ jit_compiler::jit_compiler() { this->MaterializeSpecConstHandle = reinterpret_cast( sycl::detail::ur::getOsLibraryFuncAddress( - LibraryPtr, "materializeSpecConstants")); + LibraryPtr.get(), "materializeSpecConstants")); if (!this->MaterializeSpecConstHandle) { printPerformanceWarning( "Cannot resolve JIT library function entry point"); @@ -81,13 +91,14 @@ jit_compiler::jit_compiler() { } this->CompileSYCLHandle = reinterpret_cast( - sycl::detail::ur::getOsLibraryFuncAddress(LibraryPtr, "compileSYCL")); + sycl::detail::ur::getOsLibraryFuncAddress(LibraryPtr.get(), + "compileSYCL")); if (!this->CompileSYCLHandle) { printPerformanceWarning( "Cannot resolve JIT library function entry point"); return false; } - + LibraryHandle = 
std::move(LibraryPtr); return true; }; Available = checkJITLibrary(); diff --git a/sycl/source/detail/jit_compiler.hpp b/sycl/source/detail/jit_compiler.hpp index b673e4d37b8fa..cc6e4e50bec5e 100644 --- a/sycl/source/detail/jit_compiler.hpp +++ b/sycl/source/detail/jit_compiler.hpp @@ -16,6 +16,7 @@ #include #endif // SYCL_EXT_JIT_ENABLE +#include #include namespace jit_compiler { @@ -80,7 +81,7 @@ class jit_compiler { const ::jit_compiler::SYCLKernelAttribute &Attr) const; // Indicate availability of the JIT compiler - bool Available; + bool Available = false; // Manages the lifetime of the UR structs for device binaries. std::vector JITDeviceBinaries; @@ -98,6 +99,8 @@ class jit_compiler { CompileSYCLFuncT CompileSYCLHandle = nullptr; ResetConfigFuncT ResetConfigHandle = nullptr; AddToConfigFuncT AddToConfigHandle = nullptr; + static std::function CustomDeleterForLibHandle; + std::unique_ptr LibraryHandle; #endif // SYCL_EXT_JIT_ENABLE }; diff --git a/sycl/source/detail/kernel_compiler/kernel_compiler_opencl.cpp b/sycl/source/detail/kernel_compiler/kernel_compiler_opencl.cpp index 10a65d05dec1f..5452bf40795dd 100644 --- a/sycl/source/detail/kernel_compiler/kernel_compiler_opencl.cpp +++ b/sycl/source/detail/kernel_compiler/kernel_compiler_opencl.cpp @@ -14,8 +14,9 @@ #include "../online_compiler/ocloc_api.h" #include "../split_string.hpp" -#include // strlen -#include // for std::accumulate +#include // strlen +#include // for std::function +#include // for std::accumulate #include #include @@ -56,16 +57,21 @@ void checkOclocLibrary(void *OclocLibrary) { } } -static void *OclocLibrary = nullptr; +static std::unique_ptr> + OclocLibrary(nullptr, [](void *StoredPtr) { + if (!StoredPtr) + return; + std::ignore = sycl::detail::ur::unloadOsLibrary(StoredPtr); + }); // load the ocloc shared library, check it. 
-void *loadOclocLibrary() { +void loadOclocLibrary() { #ifdef __SYCL_RT_OS_WINDOWS static const std::string OclocLibraryName = "ocloc64.dll"; #else static const std::string OclocLibraryName = "libocloc.so"; #endif - void *tempPtr = OclocLibrary; + void *tempPtr = OclocLibrary.get(); if (tempPtr == nullptr) { tempPtr = sycl::detail::ur::loadOsLibrary(OclocLibraryName); @@ -75,10 +81,8 @@ void *loadOclocLibrary() { checkOclocLibrary(tempPtr); - OclocLibrary = tempPtr; + OclocLibrary.reset(tempPtr); } - - return OclocLibrary; } bool OpenCLC_Compilation_Available() { @@ -103,13 +107,13 @@ void SetupLibrary(voidPtr &oclocInvokeHandle, voidPtr &oclocFreeOutputHandle, if (OclocLibrary == nullptr) loadOclocLibrary(); - oclocInvokeHandle = - sycl::detail::ur::getOsLibraryFuncAddress(OclocLibrary, "oclocInvoke"); + oclocInvokeHandle = sycl::detail::ur::getOsLibraryFuncAddress( + OclocLibrary.get(), "oclocInvoke"); if (!oclocInvokeHandle) throw sycl::exception(the_errc, "Cannot load oclocInvoke() function"); oclocFreeOutputHandle = sycl::detail::ur::getOsLibraryFuncAddress( - OclocLibrary, "oclocFreeOutput"); + OclocLibrary.get(), "oclocFreeOutput"); if (!oclocFreeOutputHandle) throw sycl::exception(the_errc, "Cannot load oclocFreeOutput() function"); } From ac1b3b61a9fa1ba331e1857d575132be4eae993a Mon Sep 17 00:00:00 2001 From: Harald van Dijk Date: Thu, 9 Jan 2025 11:01:50 +0000 Subject: [PATCH 456/567] [NativeCPU] Handle local args. 
(#16500) https://github.com/oneapi-src/unified-runtime/pull/2512 --- sycl/cmake/modules/UnifiedRuntimeTag.cmake | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sycl/cmake/modules/UnifiedRuntimeTag.cmake b/sycl/cmake/modules/UnifiedRuntimeTag.cmake index c4a2931df90bd..1b0c86e1f207a 100644 --- a/sycl/cmake/modules/UnifiedRuntimeTag.cmake +++ b/sycl/cmake/modules/UnifiedRuntimeTag.cmake @@ -1,7 +1,7 @@ -# commit b2ac58f27c63b8ff714e8b0c39b79aaab05a3faf -# Merge: 3472b5bd ead3d07d +# commit 9d2a711684f2ba4065fc68e2276fb326d1f6ec01 +# Merge: b2ac58f2 6972fbbd # Author: Kenneth Benzie (Benie) -# Date: Wed Jan 8 10:49:59 2025 +0000 -# Merge pull request #2462 from Bensuo/cmd-buf_update_errors -# Improve specification of command-buffer update errors -set(UNIFIED_RUNTIME_TAG b2ac58f27c63b8ff714e8b0c39b79aaab05a3faf) +# Date: Wed Jan 8 16:12:33 2025 +0000 +# Merge pull request #2512 from hvdijk/handle-local-args +# [NativeCPU] Handle local args. +set(UNIFIED_RUNTIME_TAG 9d2a711684f2ba4065fc68e2276fb326d1f6ec01) From 531b76ee015a099fb721dc3d13f2dcad2c158f4d Mon Sep 17 00:00:00 2001 From: "Kenneth Benzie (Benie)" Date: Thu, 9 Jan 2025 13:32:49 +0000 Subject: [PATCH 457/567] [DeviceSanitizer] Ensure __USE_SPIR_BUILTIN__ has a value (#16570) Ensure that `__USE_SPIR_BUILTIN__` is defined to the value `0` when `defined(__SPIR__) || defined(__SPIRV__)` is false to fix build errors like this: libdevice/include/sanitizer_defs.hpp:51:25: error: expected value in expression 51 | #if __USE_SPIR_BUILTIN__ | ^ --- libdevice/include/sanitizer_defs.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libdevice/include/sanitizer_defs.hpp b/libdevice/include/sanitizer_defs.hpp index 831ebaaf77714..dd5d0d94604be 100644 --- a/libdevice/include/sanitizer_defs.hpp +++ b/libdevice/include/sanitizer_defs.hpp @@ -44,7 +44,7 @@ enum ADDRESS_SPACE : uint32_t { #else // __SYCL_DEVICE_ONLY__ -#define __USE_SPIR_BUILTIN__ +#define __USE_SPIR_BUILTIN__ 0 
#endif // __SYCL_DEVICE_ONLY__ From 93de8f1c6127096660ae2ceb2b76271278e25f60 Mon Sep 17 00:00:00 2001 From: Isaac Ault Date: Thu, 9 Jan 2025 13:46:08 +0000 Subject: [PATCH 458/567] [UR] Improve Kernel CTS (#16555) PR for CI purposes. UR PR: https://github.com/oneapi-src/unified-runtime/pull/2444 --------- Co-authored-by: Kenneth Benzie (Benie) --- sycl/cmake/modules/UnifiedRuntimeTag.cmake | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sycl/cmake/modules/UnifiedRuntimeTag.cmake b/sycl/cmake/modules/UnifiedRuntimeTag.cmake index 1b0c86e1f207a..8b0d8248d009e 100644 --- a/sycl/cmake/modules/UnifiedRuntimeTag.cmake +++ b/sycl/cmake/modules/UnifiedRuntimeTag.cmake @@ -1,7 +1,7 @@ -# commit 9d2a711684f2ba4065fc68e2276fb326d1f6ec01 -# Merge: b2ac58f2 6972fbbd -# Author: Kenneth Benzie (Benie) -# Date: Wed Jan 8 16:12:33 2025 +0000 -# Merge pull request #2512 from hvdijk/handle-local-args -# [NativeCPU] Handle local args. -set(UNIFIED_RUNTIME_TAG 9d2a711684f2ba4065fc68e2276fb326d1f6ec01) +# commit 19182934ba5b15ebf26ae9ec7c06df1b9269725a +# Merge: 3e62cc91 9c2b0400 +# Author: Isaac Ault +# Date: Wed Jan 8 22:24:12 2025 +0000 +# Merge pull request #2444 from isaacault/kernel-cts +# Reduce gap between Kernel CTS and Specification. 
+set(UNIFIED_RUNTIME_TAG 19182934ba5b15ebf26ae9ec7c06df1b9269725a) From 4d3ffae3b78b2ea0e7bef6d37b7173c72daf6af6 Mon Sep 17 00:00:00 2001 From: Maosu Zhao Date: Thu, 9 Jan 2025 23:12:42 +0800 Subject: [PATCH 459/567] [DevMSAN] Only return clean shadow for device usm and ptr with global as (#16567) --- libdevice/sanitizer/msan_rtl.cpp | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/libdevice/sanitizer/msan_rtl.cpp b/libdevice/sanitizer/msan_rtl.cpp index bb0287ca476f9..8f3babd709038 100644 --- a/libdevice/sanitizer/msan_rtl.cpp +++ b/libdevice/sanitizer/msan_rtl.cpp @@ -139,11 +139,12 @@ inline uptr __msan_get_shadow_cpu(uptr addr) { inline uptr __msan_get_shadow_pvc(uptr addr, uint32_t as) { if (as == ADDRESS_SPACE_GENERIC) { ConvertGenericPointer(addr, as); - if (as != ADDRESS_SPACE_GLOBAL) - return (uptr)((__SYCL_GLOBAL__ MsanLaunchInfo *)__MsanLaunchInfo.get()) - ->CleanShadow; } + if (as != ADDRESS_SPACE_GLOBAL || !(addr & 0xFF00000000000000)) + return (uptr)((__SYCL_GLOBAL__ MsanLaunchInfo *)__MsanLaunchInfo.get()) + ->CleanShadow; + // Device USM only auto shadow_begin = ((__SYCL_GLOBAL__ MsanLaunchInfo *)__MsanLaunchInfo.get()) ->GlobalShadowOffset; @@ -196,11 +197,6 @@ DEVICE_EXTERN_C_NOINLINE uptr __msan_get_shadow(uptr addr, uint32_t as) { if (!__MsanLaunchInfo.get()) return shadow_ptr; - if (UNLIKELY(!__MsanLaunchInfo)) { - __spirv_ocl_printf(__msan_print_warning_nolaunchinfo); - return shadow_ptr; - } - auto launch_info = (__SYCL_GLOBAL__ MsanLaunchInfo *)__MsanLaunchInfo.get(); MSAN_DEBUG(__spirv_ocl_printf(__msan_print_launchinfo, (void *)launch_info, launch_info->GlobalShadowOffset)); From cd30a5936c7470499278a311fb99b6f549e0a1fe Mon Sep 17 00:00:00 2001 From: Steffen Larsen Date: Thu, 9 Jan 2025 19:59:37 +0100 Subject: [PATCH 460/567] [SYCL][NFC] Remove dead arg and check (#16534) The JITCompilationIsRequired in a selection of build functions default to false and are never set by any callers. 
Given this, the argument can be removed. As a side-effect of this, the CheckJITCompilationForImage function will return immediately if the JITCompilationIsRequired was false, and since it always was the function is obsolete and can be removed. Signed-off-by: Larsen, Steffen --- .../program_manager/program_manager.cpp | 41 ++++--------------- .../program_manager/program_manager.hpp | 11 ++--- 2 files changed, 10 insertions(+), 42 deletions(-) diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index ada7dc10a2ce1..7e2265484d759 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -781,8 +781,7 @@ CheckAndDecompressImage([[maybe_unused]] RTDeviceBinaryImage *Img) { // its ref count incremented. ur_program_handle_t ProgramManager::getBuiltURProgram( const ContextImplPtr &ContextImpl, const DeviceImplPtr &DeviceImpl, - const std::string &KernelName, const NDRDescT &NDRDesc, - bool JITCompilationIsRequired) { + const std::string &KernelName, const NDRDescT &NDRDesc) { // Check if we can optimize program builds for sub-devices by using a program // built for the root device DeviceImplPtr RootDevImpl = DeviceImpl; @@ -803,8 +802,7 @@ ur_program_handle_t ProgramManager::getBuiltURProgram( auto Context = createSyclObjFromImpl(ContextImpl); auto Device = createSyclObjFromImpl( MustBuildOnSubdevice == true ? 
DeviceImpl : RootDevImpl); - const RTDeviceBinaryImage &Img = - getDeviceImage(KernelName, Context, Device, JITCompilationIsRequired); + const RTDeviceBinaryImage &Img = getDeviceImage(KernelName, Context, Device); // Check that device supports all aspects used by the kernel if (auto exception = checkDevSupportDeviceRequirements(Device, Img, NDRDesc)) @@ -1403,23 +1401,6 @@ ProgramManager::ProgramManager() } } -void CheckJITCompilationForImage(const RTDeviceBinaryImage *const &Image, - bool JITCompilationIsRequired) { - if (!JITCompilationIsRequired) - return; - // If the image is already compiled with AOT, throw an exception. - const sycl_device_binary_struct &RawImg = Image->getRawData(); - if ((strcmp(RawImg.DeviceTargetSpec, - __SYCL_DEVICE_BINARY_TARGET_SPIRV64_X86_64) == 0) || - (strcmp(RawImg.DeviceTargetSpec, - __SYCL_DEVICE_BINARY_TARGET_SPIRV64_GEN) == 0) || - (strcmp(RawImg.DeviceTargetSpec, - __SYCL_DEVICE_BINARY_TARGET_SPIRV64_FPGA) == 0)) { - throw sycl::exception(sycl::errc::feature_not_supported, - "Recompiling AOT image is not supported"); - } -} - const char *getArchName(const device &Device) { namespace syclex = sycl::ext::oneapi::experimental; auto Arch = getSyclObjImpl(Device)->getDeviceArch(); @@ -1481,13 +1462,11 @@ RTDeviceBinaryImage *getBinImageFromMultiMap( RTDeviceBinaryImage & ProgramManager::getDeviceImage(const std::string &KernelName, - const context &Context, const device &Device, - bool JITCompilationIsRequired) { + const context &Context, const device &Device) { if constexpr (DbgProgMgr > 0) { std::cerr << ">>> ProgramManager::getDeviceImage(\"" << KernelName << "\", " << getSyclObjImpl(Context).get() << ", " - << getSyclObjImpl(Device).get() << ", " - << JITCompilationIsRequired << ")\n"; + << getSyclObjImpl(Device).get() << ")\n"; std::cerr << "available device images:\n"; debugPrintBinaryImages(); @@ -1497,7 +1476,7 @@ ProgramManager::getDeviceImage(const std::string &KernelName, assert(m_SpvFileImage); return getDeviceImage( 
std::unordered_set({m_SpvFileImage.get()}), - Context, Device, JITCompilationIsRequired); + Context, Device); } RTDeviceBinaryImage *Img = nullptr; @@ -1517,8 +1496,6 @@ ProgramManager::getDeviceImage(const std::string &KernelName, CheckAndDecompressImage(Img); if (Img) { - CheckJITCompilationForImage(Img, JITCompilationIsRequired); - if constexpr (DbgProgMgr > 0) { std::cerr << "selected device image: " << &Img->getRawData() << "\n"; Img->print(); @@ -1532,15 +1509,13 @@ ProgramManager::getDeviceImage(const std::string &KernelName, RTDeviceBinaryImage &ProgramManager::getDeviceImage( const std::unordered_set &ImageSet, - const context &Context, const device &Device, - bool JITCompilationIsRequired) { + const context &Context, const device &Device) { assert(ImageSet.size() > 0); if constexpr (DbgProgMgr > 0) { std::cerr << ">>> ProgramManager::getDeviceImage(Custom SPV file " << getSyclObjImpl(Context).get() << ", " - << getSyclObjImpl(Device).get() << ", " - << JITCompilationIsRequired << ")\n"; + << getSyclObjImpl(Device).get() << ")\n"; std::cerr << "available device images:\n"; debugPrintBinaryImages(); @@ -1569,8 +1544,6 @@ RTDeviceBinaryImage &ProgramManager::getDeviceImage( ImageIterator = ImageSet.begin(); std::advance(ImageIterator, ImgInd); - CheckJITCompilationForImage(*ImageIterator, JITCompilationIsRequired); - if constexpr (DbgProgMgr > 0) { std::cerr << "selected device image: " << &(*ImageIterator)->getRawData() << "\n"; diff --git a/sycl/source/detail/program_manager/program_manager.hpp b/sycl/source/detail/program_manager/program_manager.hpp index 61a3240c1ddd4..78d3574c01427 100644 --- a/sycl/source/detail/program_manager/program_manager.hpp +++ b/sycl/source/detail/program_manager/program_manager.hpp @@ -135,13 +135,11 @@ class ProgramManager { RTDeviceBinaryImage &getDeviceImage(const std::string &KernelName, const context &Context, - const device &Device, - bool JITCompilationIsRequired = false); + const device &Device); RTDeviceBinaryImage 
&getDeviceImage( const std::unordered_set &ImagesToVerify, - const context &Context, const device &Device, - bool JITCompilationIsRequired = false); + const context &Context, const device &Device); ur_program_handle_t createURProgram(const RTDeviceBinaryImage &Img, const context &Context, @@ -177,13 +175,10 @@ class ProgramManager { /// \param Context the context to build the program with /// \param Device the device for which the program is built /// \param KernelName the kernel's name - /// \param JITCompilationIsRequired If JITCompilationIsRequired is true - /// add a check that kernel is compiled, otherwise don't add the check. ur_program_handle_t getBuiltURProgram(const ContextImplPtr &ContextImpl, const DeviceImplPtr &DeviceImpl, const std::string &KernelName, - const NDRDescT &NDRDesc = {}, - bool JITCompilationIsRequired = false); + const NDRDescT &NDRDesc = {}); /// Builds a program from a given set of images or retrieves that program from /// cache. From c6b1edfb7512c893b50f49943abde916a71d6993 Mon Sep 17 00:00:00 2001 From: Ross Brunton Date: Thu, 9 Jan 2025 20:57:12 +0000 Subject: [PATCH 461/567] [UR] Use reference counting on factories (#15296) Pre-commit MR for https://github.com/oneapi-src/unified-runtime/pull/2048 --- sycl/cmake/modules/UnifiedRuntimeTag.cmake | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/sycl/cmake/modules/UnifiedRuntimeTag.cmake b/sycl/cmake/modules/UnifiedRuntimeTag.cmake index 8b0d8248d009e..de4a5aa14251b 100644 --- a/sycl/cmake/modules/UnifiedRuntimeTag.cmake +++ b/sycl/cmake/modules/UnifiedRuntimeTag.cmake @@ -1,7 +1,8 @@ -# commit 19182934ba5b15ebf26ae9ec7c06df1b9269725a -# Merge: 3e62cc91 9c2b0400 -# Author: Isaac Ault -# Date: Wed Jan 8 22:24:12 2025 +0000 -# Merge pull request #2444 from isaacault/kernel-cts -# Reduce gap between Kernel CTS and Specification. 
-set(UNIFIED_RUNTIME_TAG 19182934ba5b15ebf26ae9ec7c06df1b9269725a) +# commit 7eae5c80a9e969bc12fda57c9cc0a0970f0cd17f +# Merge: 9c652ffb b78cfa71 +# Author: Ross Brunton +# Date: Thu Jan 9 17:28:00 2025 +0000 +# Merge pull request #2048 from RossBrunton/ross/refc +# +# Use reference counting on factories +set(UNIFIED_RUNTIME_TAG 7eae5c80a9e969bc12fda57c9cc0a0970f0cd17f) From 26afacd6b4fe8ced99a73a681e1ebde875516766 Mon Sep 17 00:00:00 2001 From: Ben Ashbaugh Date: Tue, 7 Jan 2025 04:36:48 -0800 Subject: [PATCH 462/567] add support for SPV_EXT_optnone (#2951) Specifically: Updates SPIR-V headers to the latest tag, to pull in support for SPV_EXT_optnone (and more). Removes all internal enums for SPV_INTEL_optnone and uses the support in the headers instead. Registers the SPV_EXT_optnone extension. Uses the SPV_EXT_optnone extension if it is enabled, otherwise uses the SPV_INTEL_optnone extension if it is enabled, otherwise ignores the OptimizeNone attribute (the ignoring part is not new). Updates the OptNone test: Ensures that the right extension support is declared, depending on the enabled extensions. Ensures that the OptNone capability is declared when either extension is enabled. Note, the spelling for the capability is unconditionally the EXT version. Ensures that the Function Control is present when either extension is enabled. 
Original commit: https://github.com/KhronosGroup/SPIRV-LLVM-Translator/commit/dd33e5952bfc1ef --- llvm-spirv/include/LLVMSPIRVExtensions.inc | 1 + llvm-spirv/lib/SPIRV/SPIRVInternal.h | 2 +- llvm-spirv/lib/SPIRV/SPIRVWriter.cpp | 12 +++++-- .../lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h | 2 +- .../lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h | 3 +- .../lib/SPIRV/libSPIRV/spirv_internal.hpp | 8 ----- llvm-spirv/spirv-headers-tag.conf | 2 +- llvm-spirv/test/optnone.ll | 36 ++++++++++++------- 8 files changed, 38 insertions(+), 28 deletions(-) diff --git a/llvm-spirv/include/LLVMSPIRVExtensions.inc b/llvm-spirv/include/LLVMSPIRVExtensions.inc index 9b31bdbefe0bf..c4a2ee0ef096d 100644 --- a/llvm-spirv/include/LLVMSPIRVExtensions.inc +++ b/llvm-spirv/include/LLVMSPIRVExtensions.inc @@ -46,6 +46,7 @@ EXT(SPV_INTEL_fpga_cluster_attributes) EXT(SPV_INTEL_loop_fuse) EXT(SPV_INTEL_long_constant_composite) // TODO: rename to // SPV_INTEL_long_composites later +EXT(SPV_EXT_optnone) EXT(SPV_INTEL_optnone) EXT(SPV_INTEL_fpga_dsp_control) EXT(SPV_INTEL_memory_access_aliasing) diff --git a/llvm-spirv/lib/SPIRV/SPIRVInternal.h b/llvm-spirv/lib/SPIRV/SPIRVInternal.h index 9a7bf4f758f84..e41083f9253ea 100644 --- a/llvm-spirv/lib/SPIRV/SPIRVInternal.h +++ b/llvm-spirv/lib/SPIRV/SPIRVInternal.h @@ -251,7 +251,7 @@ inline void SPIRVMap::init() { add(Attribute::AlwaysInline, FunctionControlInlineMask); add(Attribute::NoInline, FunctionControlDontInlineMask); - add(Attribute::OptimizeNone, internal::FunctionControlOptNoneINTELMask); + add(Attribute::OptimizeNone, FunctionControlOptNoneEXTMask); } typedef SPIRVMap SPIRSPIRVFuncCtlMaskMap; diff --git a/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp b/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp index 495eb24cf292d..5cc0fe1731276 100644 --- a/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp +++ b/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp @@ -5532,10 +5532,15 @@ SPIRVWord LLVMToSPIRVBase::transFunctionControlMask(Function *F) { [&](Attribute::AttrKind Attr, SPIRVFunctionControlMaskKind 
Mask) { if (F->hasFnAttribute(Attr)) { if (Attr == Attribute::OptimizeNone) { - if (!BM->isAllowedToUseExtension(ExtensionID::SPV_INTEL_optnone)) + if (BM->isAllowedToUseExtension(ExtensionID::SPV_EXT_optnone)) { + BM->addExtension(ExtensionID::SPV_EXT_optnone); + BM->addCapability(CapabilityOptNoneEXT); + } else if (BM->isAllowedToUseExtension( + ExtensionID::SPV_INTEL_optnone)) { + BM->addExtension(ExtensionID::SPV_INTEL_optnone); + BM->addCapability(CapabilityOptNoneINTEL); + } else return; - BM->addExtension(ExtensionID::SPV_INTEL_optnone); - BM->addCapability(internal::CapabilityOptNoneINTEL); } FCM |= Mask; } @@ -7024,6 +7029,7 @@ bool runSpirvBackend(Module *M, std::string &Result, std::string &ErrMsg, SPIRV::ExtensionID::SPV_INTEL_cache_controls, SPIRV::ExtensionID::SPV_INTEL_global_variable_fpga_decorations, SPIRV::ExtensionID::SPV_INTEL_global_variable_host_access, + SPIRV::ExtensionID::SPV_EXT_optnone, SPIRV::ExtensionID::SPV_INTEL_optnone, SPIRV::ExtensionID::SPV_INTEL_usm_storage_classes, SPIRV::ExtensionID::SPV_INTEL_subgroups, diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h index 34b73691c4d1d..de46aa7313e92 100644 --- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h +++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h @@ -294,7 +294,7 @@ inline bool isValidFunctionControlMask(SPIRVWord Mask) { ValidMask |= FunctionControlDontInlineMask; ValidMask |= FunctionControlPureMask; ValidMask |= FunctionControlConstMask; - ValidMask |= internal::FunctionControlOptNoneINTELMask; + ValidMask |= FunctionControlOptNoneEXTMask; return (Mask & ~ValidMask) == 0; } diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h index ff7045eb540a2..7646a182cf62b 100644 --- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h +++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h @@ -636,7 +636,7 @@ template <> inline void SPIRVMap::init() { 
add(CapabilityAtomicFloat32AddEXT, "AtomicFloat32AddEXT"); add(CapabilityAtomicFloat64AddEXT, "AtomicFloat64AddEXT"); add(CapabilityLongCompositesINTEL, "LongCompositesINTEL"); - add(CapabilityOptNoneINTEL, "OptNoneINTEL"); + add(CapabilityOptNoneEXT, "OptNoneEXT"); add(CapabilityAtomicFloat16AddEXT, "AtomicFloat16AddEXT"); add(CapabilityDebugInfoModuleINTEL, "DebugInfoModuleINTEL"); add(CapabilitySplitBarrierINTEL, "SplitBarrierINTEL"); @@ -656,7 +656,6 @@ template <> inline void SPIRVMap::init() { add(CapabilityRegisterLimitsINTEL, "RegisterLimitsINTEL"); // From spirv_internal.hpp add(internal::CapabilityFastCompositeINTEL, "FastCompositeINTEL"); - add(internal::CapabilityOptNoneINTEL, "OptNoneINTEL"); add(internal::CapabilityTokenTypeINTEL, "TokenTypeINTEL"); add(internal::CapabilityFPArithmeticFenceINTEL, "FPArithmeticFenceINTEL"); add(internal::CapabilityBfloat16ConversionINTEL, "Bfloat16ConversionINTEL"); diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/spirv_internal.hpp b/llvm-spirv/lib/SPIRV/libSPIRV/spirv_internal.hpp index d7367c3fd5fb9..d796e7e0d20f4 100644 --- a/llvm-spirv/lib/SPIRV/libSPIRV/spirv_internal.hpp +++ b/llvm-spirv/lib/SPIRV/libSPIRV/spirv_internal.hpp @@ -107,7 +107,6 @@ enum InternalDecoration { enum InternalCapability { ICapFastCompositeINTEL = 6093, - ICapOptNoneINTEL = 6094, ICapTokenTypeINTEL = 6112, ICapBfloat16ConversionINTEL = 6115, ICapabilityJointMatrixINTEL = 6118, @@ -132,8 +131,6 @@ enum InternalCapability { ICapabilityBindlessImagesINTEL = 6528 }; -enum InternalFunctionControlMask { IFunctionControlOptNoneINTELMask = 0x10000 }; - enum InternalExecutionMode { IExecModeFastCompositeKernelINTEL = 6088, IExecModeNamedSubgroupSizeINTEL = 6446, @@ -291,8 +288,6 @@ constexpr Decoration DecorationArgumentAttributeINTEL = constexpr Capability CapabilityFastCompositeINTEL = static_cast(ICapFastCompositeINTEL); -constexpr Capability CapabilityOptNoneINTEL = - static_cast(ICapOptNoneINTEL); constexpr Capability CapabilityTokenTypeINTEL = 
static_cast(ICapTokenTypeINTEL); constexpr Capability CapabilityFPArithmeticFenceINTEL = @@ -302,9 +297,6 @@ constexpr Capability CapabilityBfloat16ConversionINTEL = constexpr Capability CapabilityGlobalVariableDecorationsINTEL = static_cast(ICapGlobalVariableDecorationsINTEL); -constexpr FunctionControlMask FunctionControlOptNoneINTELMask = - static_cast(IFunctionControlOptNoneINTELMask); - constexpr ExecutionMode ExecutionModeFastCompositeKernelINTEL = static_cast(IExecModeFastCompositeKernelINTEL); diff --git a/llvm-spirv/spirv-headers-tag.conf b/llvm-spirv/spirv-headers-tag.conf index dbca53dc304fe..066641d4b2d42 100644 --- a/llvm-spirv/spirv-headers-tag.conf +++ b/llvm-spirv/spirv-headers-tag.conf @@ -1 +1 @@ -efb6b4099ddb8fa60f62956dee592c4b94ec6a49 +3f17b2af6784bfa2c5aa5dbb8e0e74a607dd8b3b diff --git a/llvm-spirv/test/optnone.ll b/llvm-spirv/test/optnone.ll index 2cea007fd5f7c..702498a6128aa 100644 --- a/llvm-spirv/test/optnone.ll +++ b/llvm-spirv/test/optnone.ll @@ -1,32 +1,44 @@ ; RUN: llvm-as %s -o %t.bc -; RUN: llvm-spirv %t.bc --spirv-ext=+SPV_INTEL_optnone -spirv-text -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llvm-spirv %t.bc --spirv-ext=+SPV_EXT_optnone -spirv-text -o - | FileCheck %s --check-prefix=CHECK-SPIRV-EXT +; RUN: llvm-spirv %t.bc --spirv-ext=+SPV_INTEL_optnone -spirv-text -o - | FileCheck %s --check-prefix=CHECK-SPIRV-INTEL +; RUN: llvm-spirv %t.bc --spirv-ext=+SPV_EXT_optnone,+SPV_INTEL_optnone -spirv-text -o - | FileCheck %s --check-prefix=CHECK-SPIRV-BOTH -; RUN: llvm-spirv --spirv-ext=+SPV_INTEL_optnone %t.bc -o %t.spv + +; RUN: llvm-spirv --spirv-ext=+SPV_EXT_optnone %t.bc -o %t.spv ; RUN: llvm-spirv -r %t.spv -o %t.rev.bc ; RUN: llvm-dis < %t.rev.bc | FileCheck %s --check-prefix=CHECK-LLVM ; Check that optnone is correctly ignored when extension is not enabled -; RUN: llvm-spirv %t.bc -spirv-text -o - | FileCheck %s --check-prefix=CHECK-SPIRV-NO-EXT +; RUN: llvm-spirv %t.bc -spirv-text -o - | FileCheck %s 
--check-prefix=CHECK-SPIRV-NONE ; RUN: llvm-spirv %t.bc -o %t.spv ; RUN: llvm-spirv -r %t.spv -o %t.rev.bc -; RUN: llvm-dis < %t.rev.bc | FileCheck %s --check-prefix=CHECK-LLVM-NO-EXT +; RUN: llvm-dis < %t.rev.bc | FileCheck %s --check-prefix=CHECK-LLVM-NONE + +; Note: The capability is unconditionally printed with the EXT suffix. +; CHECK-SPIRV-EXT: Capability OptNoneEXT +; CHECK-SPIRV-INTEL: Capability OptNoneEXT +; CHECK-SPIRV-BOTH: Capability OptNoneEXT -; CHECK-SPIRV: Capability OptNoneINTEL -; CHECK-SPIRV: Extension "SPV_INTEL_optnone" +; CHECK-SPIRV-EXT: Extension "SPV_EXT_optnone" +; CHECK-SPIRV-INTEL: Extension "SPV_INTEL_optnone" +; Note: When both extensions are enabled, prefer the EXT extension. +; CHECK-SPIRV-BOTH: Extension "SPV_EXT_optnone" ; Per SPIR-V spec: ; FunctionControlDontInlineMask = 0x2 (2) -; Per SPIR-V spec extension SPV_INTEL_optnone: -; FunctionControlOptNoneINTELMask = 0x10000 (65536) -; CHECK-SPIRV: Function {{[0-9]+}} {{[0-9]+}} 65538 -; CHECK-SPIRV-NO-EXT: Function {{[0-9]+}} {{[0-9]+}} 2 +; Per SPIR-V spec extension spec: +; FunctionControlOptNoneMask = 0x10000 (65536) +; CHECK-SPIRV-EXT: Function {{[0-9]+}} {{[0-9]+}} 65538 +; CHECK-SPIRV-INTEL: Function {{[0-9]+}} {{[0-9]+}} 65538 +; CHECK-SPIRV-BOTH: Function {{[0-9]+}} {{[0-9]+}} 65538 +; CHECK-SPIRV-NONE: Function {{[0-9]+}} {{[0-9]+}} 2 ; CHECK-LLVM: define spir_func void @_Z3foov() #[[ATTR:[0-9]+]] ; CHECK-LLVM: #[[ATTR]] = { {{.*}}noinline{{.*}}optnone{{.*}} } -; CHECK-LLVM-NO-EXT: define spir_func void @_Z3foov() #[[ATTR:[0-9]+]] -; CHECK-LLVM-NO-EXT-NOT: #[[ATTR]] = { {{.*}}noinline{{.*}}optnone{{.*}} } +; CHECK-LLVM-NONE: define spir_func void @_Z3foov() #[[ATTR:[0-9]+]] +; CHECK-LLVM-NONE-NOT: #[[ATTR]] = { {{.*}}noinline{{.*}}optnone{{.*}} } target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" target triple = "spir-unknown-unknown" From 4b2a35bccece3bc146dd35376855535a9848ec6b Mon Sep 17 00:00:00 2001 From: 
Ben Ashbaugh Date: Tue, 7 Jan 2025 04:37:45 -0800 Subject: [PATCH 463/567] handle OpBitcast between pointers and non-pointers (#2948) Adds support for SPIR-V OpBitcast instructions where the source is a pointer and the destination is not a pointer, and where the source is not a pointer and the destination is a pointer. This needs to be handled as a special case because the LLVM bitcast instruction does not support this. Handles bitcasts between pointers and scalar integers, which is supported by all SPIR-V versions, and pointers and vectors of integers, which is supported by SPIR-V 1.5 (though only for vectors of 32-bit integers). Original commit: https://github.com/KhronosGroup/SPIRV-LLVM-Translator/commit/aafca810e5cd645 --- llvm-spirv/lib/SPIRV/SPIRVReader.cpp | 35 +++++++++++++++++++++ llvm-spirv/test/OpBitcast_ptr_scalar.spvasm | 31 ++++++++++++++++++ llvm-spirv/test/OpBitcast_ptr_vector.spvasm | 33 +++++++++++++++++++ 3 files changed, 99 insertions(+) create mode 100644 llvm-spirv/test/OpBitcast_ptr_scalar.spvasm create mode 100644 llvm-spirv/test/OpBitcast_ptr_vector.spvasm diff --git a/llvm-spirv/lib/SPIRV/SPIRVReader.cpp b/llvm-spirv/lib/SPIRV/SPIRVReader.cpp index 994de566e51c2..ccdfd8f375b0e 100644 --- a/llvm-spirv/lib/SPIRV/SPIRVReader.cpp +++ b/llvm-spirv/lib/SPIRV/SPIRVReader.cpp @@ -1087,6 +1087,41 @@ Value *SPIRVToLLVM::transConvertInst(SPIRVValue *BV, Function *F, case OpFConvert: CO = IsExt ? Instruction::FPExt : Instruction::FPTrunc; break; + case OpBitcast: + // OpBitcast need to be handled as a special-case when the source is a + // pointer and the destination is not a pointer, and where the source is not + // a pointer and the destination is a pointer. This is supported by the + // SPIR-V bitcast, but not by the LLVM bitcast. 
+ CO = Instruction::BitCast; + if (Src->getType()->isPointerTy() && !Dst->isPointerTy()) { + if (auto *DstVecTy = dyn_cast(Dst)) { + unsigned TotalBitWidth = + DstVecTy->getElementType()->getIntegerBitWidth() * + DstVecTy->getNumElements(); + auto *IntTy = Type::getIntNTy(BB->getContext(), TotalBitWidth); + if (BB) { + Src = CastInst::CreatePointerCast(Src, IntTy, "", BB); + } else { + Src = ConstantExpr::getPointerCast(dyn_cast(Src), IntTy); + } + } else { + CO = Instruction::PtrToInt; + } + } else if (!Src->getType()->isPointerTy() && Dst->isPointerTy()) { + if (auto *SrcVecTy = dyn_cast(Src->getType())) { + unsigned TotalBitWidth = + SrcVecTy->getElementType()->getIntegerBitWidth() * + SrcVecTy->getNumElements(); + auto *IntTy = Type::getIntNTy(BB->getContext(), TotalBitWidth); + if (BB) { + Src = CastInst::Create(Instruction::BitCast, Src, IntTy, "", BB); + } else { + Src = ConstantExpr::getBitCast(dyn_cast(Src), IntTy); + } + } + CO = Instruction::IntToPtr; + } + break; default: CO = static_cast(OpCodeMap::rmap(BC->getOpCode())); } diff --git a/llvm-spirv/test/OpBitcast_ptr_scalar.spvasm b/llvm-spirv/test/OpBitcast_ptr_scalar.spvasm new file mode 100644 index 0000000000000..2e57d5d694e95 --- /dev/null +++ b/llvm-spirv/test/OpBitcast_ptr_scalar.spvasm @@ -0,0 +1,31 @@ +; Check support of OpBitcast with pointer operands +; Converts to scalar integers, which is supported by all SPIR-V versions + +; REQUIRES: spirv-as +; RUN: spirv-as --target-env spv1.0 -o %t.spv %s +; RUN: spirv-val %t.spv +; RUN: llvm-spirv -r %t.spv -o %t.rev.bc +; RUN: llvm-dis %t.rev.bc +; RUN: FileCheck < %t.rev.ll %s --check-prefix=CHECK-LLVM + OpCapability Addresses + OpCapability Kernel + OpCapability Int64 + OpMemoryModel Physical64 OpenCL + OpEntryPoint Kernel %kernel "test" + %uint = OpTypeInt 32 0 + %ulong = OpTypeInt 64 0 + %void = OpTypeVoid + %pptr_int = OpTypePointer Function %uint + %kernel_sig = OpTypeFunction %void + %kernel = OpFunction %void None %kernel_sig + %entry = 
OpLabel + %srcptr = OpVariable %pptr_int Function + %dstint = OpBitcast %ulong %srcptr + %dstptr = OpBitcast %pptr_int %dstint + OpReturn + OpFunctionEnd + + +; CHECK-LLVM: [[SRCPTR:%[a-z0-9.]+]] = alloca i32, align 4 +; CHECK-LLVM: [[DSTINT:%[a-z0-9.]+]] = ptrtoint ptr [[SRCPTR]] to i64 +; CHECK-LLVM: [[DSTPTR:%[a-z0-9.]+]] = inttoptr i64 [[DSTINT]] to ptr diff --git a/llvm-spirv/test/OpBitcast_ptr_vector.spvasm b/llvm-spirv/test/OpBitcast_ptr_vector.spvasm new file mode 100644 index 0000000000000..69b206c845636 --- /dev/null +++ b/llvm-spirv/test/OpBitcast_ptr_vector.spvasm @@ -0,0 +1,33 @@ +; Check support of OpBitcast with pointer operands +; Converts to vectors of integers, which is supported by SPIR-V 1.5 + +; REQUIRES: spirv-as +; RUN: spirv-as --target-env spv1.5 -o %t.spv %s +; RUN: spirv-val %t.spv +; RUN: llvm-spirv -r %t.spv -o %t.rev.bc +; RUN: llvm-dis %t.rev.bc +; RUN: FileCheck < %t.rev.ll %s --check-prefix=CHECK-LLVM + OpCapability Addresses + OpCapability Kernel + OpCapability Int64 + OpMemoryModel Physical64 OpenCL + OpEntryPoint Kernel %kernel "test" + %uint = OpTypeInt 32 0 + %uint2 = OpTypeVector %uint 2 + %void = OpTypeVoid + %pptr_int = OpTypePointer Function %uint + %kernel_sig = OpTypeFunction %void + %kernel = OpFunction %void None %kernel_sig + %entry = OpLabel + %srcptr = OpVariable %pptr_int Function + %dstint2 = OpBitcast %uint2 %srcptr + %dstptr = OpBitcast %pptr_int %dstint2 + OpReturn + OpFunctionEnd + + +; CHECK-LLVM: [[SRCPTR:%[a-z0-9.]+]] = alloca i32, align 4 +; CHECK-LLVM: [[TMPLONG0:%[a-z0-9.]+]] = ptrtoint ptr [[SRCPTR]] to i64 +; CHECK-LLVM: [[DSTINT2:%[a-z0-9.]+]] = bitcast i64 [[TMPLONG0]] to <2 x i32> +; CHECK-LLVM: [[TMPLONG1:%[a-z0-9.]+]] = bitcast <2 x i32> [[DSTINT2]] to i64 +; CHECK-LLVM: [[DSTPTR:%[a-z0-9.]+]] = inttoptr i64 [[TMPLONG1]] to ptr From 81e5ceec12f2a6784ef944fa0961efdb38f4928c Mon Sep 17 00:00:00 2001 From: Sven van Haastregt Date: Tue, 7 Jan 2025 13:38:08 +0100 Subject: [PATCH 464/567] Add 
-fdeclare-opencl-builtins to more .cl tests (#2950) Use clang's tablegen-driven mechanism for builtins, which is faster than parsing the opencl-c.h header. Original commit: https://github.com/KhronosGroup/SPIRV-LLVM-Translator/commit/b33ca8d1868d24a --- llvm-spirv/test/transcoding/OpImageSampleExplicitLod_arg.cl | 2 +- llvm-spirv/test/transcoding/OpImageWrite.cl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm-spirv/test/transcoding/OpImageSampleExplicitLod_arg.cl b/llvm-spirv/test/transcoding/OpImageSampleExplicitLod_arg.cl index 0acb2d4af011d..13d0d2d66fa6a 100644 --- a/llvm-spirv/test/transcoding/OpImageSampleExplicitLod_arg.cl +++ b/llvm-spirv/test/transcoding/OpImageSampleExplicitLod_arg.cl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -O1 -triple spir-unknown-unknown -cl-std=CL2.0 %s -finclude-default-header -emit-llvm-bc -o %t.bc +// RUN: %clang_cc1 -O1 -triple spir-unknown-unknown -cl-std=CL2.0 %s -fdeclare-opencl-builtins -finclude-default-header -emit-llvm-bc -o %t.bc // RUN: llvm-spirv %t.bc -spirv-text -o %t.txt // RUN: FileCheck < %t.txt %s --check-prefix=CHECK-SPIRV // RUN: llvm-spirv %t.bc -o %t.spv diff --git a/llvm-spirv/test/transcoding/OpImageWrite.cl b/llvm-spirv/test/transcoding/OpImageWrite.cl index 48e7bc097ac02..4b6c334d8129a 100644 --- a/llvm-spirv/test/transcoding/OpImageWrite.cl +++ b/llvm-spirv/test/transcoding/OpImageWrite.cl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -O1 -triple spir-unknown-unknown -cl-std=CL2.0 %s -finclude-default-header -emit-llvm-bc -o %t.bc +// RUN: %clang_cc1 -O1 -triple spir-unknown-unknown -cl-std=CL2.0 %s -fdeclare-opencl-builtins -finclude-default-header -emit-llvm-bc -o %t.bc // RUN: llvm-spirv %t.bc -spirv-text -o %t.txt // RUN: FileCheck < %t.txt %s --check-prefix=CHECK-SPIRV // RUN: llvm-spirv %t.bc -o %t.spv From f6fe3ec5f43254b39faa33ae115678a1fbc91cbc Mon Sep 17 00:00:00 2001 From: Wenju He Date: Wed, 8 Jan 2025 18:28:20 +0000 Subject: [PATCH 465/567] Translate nonsemantic attribute and 
metadata of GlobalVariable (#2944) Motivations is similar as f729c49. This PR addresses SYCL device global which may have attributes "sycl-device-image-scope", "sycl-host-access" and "sycl-unique-id". Failure to preserve "sycl-unique-id" after llvm-spirv translation triggers assert at https://github.com/intel/llvm/blob/2824f61dd36790448a224cd596985bd01cbcd0f3/llvm/lib/SYCLLowerIR/DeviceGlobals.cpp#L85 Also preserve GlobalVariable metadata as an improvement, though there is no test to show this is really needed. Original commit: https://github.com/KhronosGroup/SPIRV-LLVM-Translator/commit/f2d913cb1a22cb3 --- llvm-spirv/lib/SPIRV/SPIRVReader.cpp | 55 +++++++++++------- llvm-spirv/lib/SPIRV/SPIRVWriter.cpp | 41 ++++++++------ llvm-spirv/lib/SPIRV/SPIRVWriter.h | 2 +- .../lib/SPIRV/libSPIRV/NonSemantic.AuxData.h | 3 +- llvm-spirv/lib/SPIRV/libSPIRV/SPIRVExtInst.h | 4 ++ .../preserve-gv-attributes.ll | 56 +++++++++++++++++++ .../preserve-gv-metadata.ll | 41 ++++++++++++++ 7 files changed, 165 insertions(+), 37 deletions(-) create mode 100644 llvm-spirv/test/extensions/KHR/SPV_KHR_non_semantic_info/preserve-gv-attributes.ll create mode 100644 llvm-spirv/test/extensions/KHR/SPV_KHR_non_semantic_info/preserve-gv-metadata.ll diff --git a/llvm-spirv/lib/SPIRV/SPIRVReader.cpp b/llvm-spirv/lib/SPIRV/SPIRVReader.cpp index ccdfd8f375b0e..62764b0cb1c20 100644 --- a/llvm-spirv/lib/SPIRV/SPIRVReader.cpp +++ b/llvm-spirv/lib/SPIRV/SPIRVReader.cpp @@ -5209,36 +5209,53 @@ void SPIRVToLLVM::transAuxDataInst(SPIRVExtInst *BC) { return; auto Args = BC->getArguments(); // Args 0 and 1 are common between attributes and metadata. 
- // 0 is the function, 1 is the name of the attribute/metadata as a string - auto *SpvFcn = BC->getModule()->getValue(Args[0]); - auto *F = static_cast(getTranslatedValue(SpvFcn)); - assert(F && "Function should already have been translated!"); + // 0 is the global object, 1 is the name of the attribute/metadata as a string + auto *Arg0 = BC->getModule()->getValue(Args[0]); + auto *GO = cast(getTranslatedValue(Arg0)); + auto *F = dyn_cast(GO); + auto *GV = dyn_cast(GO); + assert((F || GV) && "Value should already have been translated!"); auto AttrOrMDName = BC->getModule()->get(Args[1])->getStr(); switch (BC->getExtOp()) { - case NonSemanticAuxData::FunctionAttribute: { + case NonSemanticAuxData::FunctionAttribute: + case NonSemanticAuxData::GlobalVariableAttribute: { assert(Args.size() < 4 && "Unexpected FunctionAttribute Args"); // If this attr was specially handled and added elsewhere, skip it. Attribute::AttrKind AsKind = Attribute::getAttrKindFromName(AttrOrMDName); - if (AsKind != Attribute::None && F->hasFnAttribute(AsKind)) - return; - if (AsKind == Attribute::None && F->hasFnAttribute(AttrOrMDName)) - return; + if (AsKind != Attribute::None) + if ((F && F->hasFnAttribute(AsKind)) || (GV && GV->hasAttribute(AsKind))) + return; + if (AsKind == Attribute::None) + if ((F && F->hasFnAttribute(AttrOrMDName)) || + (GV && GV->hasAttribute(AttrOrMDName))) + return; // For attributes, arg 2 is the attribute value as a string, which may not // exist. 
if (Args.size() == 3) { auto AttrValue = BC->getModule()->get(Args[2])->getStr(); - F->addFnAttr(AttrOrMDName, AttrValue); - } else { - if (AsKind != Attribute::None) - F->addFnAttr(AsKind); + if (F) + F->addFnAttr(AttrOrMDName, AttrValue); else - F->addFnAttr(AttrOrMDName); + GV->addAttribute(AttrOrMDName, AttrValue); + } else { + if (AsKind != Attribute::None) { + if (F) + F->addFnAttr(AsKind); + else + GV->addAttribute(AsKind); + } else { + if (F) + F->addFnAttr(AttrOrMDName); + else + GV->addAttribute(AttrOrMDName); + } } break; } - case NonSemanticAuxData::FunctionMetadata: { + case NonSemanticAuxData::FunctionMetadata: + case NonSemanticAuxData::GlobalVariableMetadata: { // If this metadata was specially handled and added elsewhere, skip it. - if (F->hasMetadata(AttrOrMDName)) + if (GO->hasMetadata(AttrOrMDName)) return; SmallVector MetadataArgs; // Process the metadata values. @@ -5248,14 +5265,14 @@ void SPIRVToLLVM::transAuxDataInst(SPIRVExtInst *BC) { if (Arg->getOpCode() == OpString) { auto *ArgAsStr = static_cast(Arg); MetadataArgs.push_back( - MDString::get(F->getContext(), ArgAsStr->getStr())); + MDString::get(GO->getContext(), ArgAsStr->getStr())); } else { auto *ArgAsVal = static_cast(Arg); - auto *TranslatedMD = transValue(ArgAsVal, F, nullptr); + auto *TranslatedMD = transValue(ArgAsVal, nullptr, nullptr); MetadataArgs.push_back(ValueAsMetadata::get(TranslatedMD)); } } - F->setMetadata(AttrOrMDName, MDNode::get(*Context, MetadataArgs)); + GO->setMetadata(AttrOrMDName, MDNode::get(*Context, MetadataArgs)); break; } default: diff --git a/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp b/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp index 5cc0fe1731276..62b7db4644286 100644 --- a/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp +++ b/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp @@ -1230,23 +1230,27 @@ void LLVMToSPIRVBase::transFunctionMetadataAsUserSemanticDecoration( } } -void LLVMToSPIRVBase::transAuxDataInst(SPIRVFunction *BF, Function *F) { - auto *BM = BF->getModule(); +void 
LLVMToSPIRVBase::transAuxDataInst(SPIRVValue *BV, Value *V) { + auto *GO = cast(V); + auto *F = dyn_cast(GO); + auto *GV = dyn_cast(GO); + assert((F || GV) && "Invalid value type"); + auto *BM = BV->getModule(); if (!BM->preserveAuxData()) return; if (!BM->isAllowedToUseVersion(VersionNumber::SPIRV_1_6)) BM->addExtension(SPIRV::ExtensionID::SPV_KHR_non_semantic_info); else BM->setMinSPIRVVersion(VersionNumber::SPIRV_1_6); - const auto &FnAttrs = F->getAttributes().getFnAttrs(); - for (const auto &Attr : FnAttrs) { + const auto &Attrs = F ? F->getAttributes().getFnAttrs() : GV->getAttributes(); + for (const auto &Attr : Attrs) { std::vector Ops; - Ops.push_back(BF->getId()); + Ops.push_back(BV->getId()); if (Attr.isStringAttribute()) { // Format for String attributes is: - // NonSemanticAuxDataFunctionAttribute Fcn AttrName AttrValue + // NonSemanticAuxData*Attribute ValueName AttrName AttrValue // or, if no value: - // NonSemanticAuxDataFunctionAttribute Fcn AttrName + // NonSemanticAuxData*Attribute ValueName AttrName // // AttrName and AttrValue are always Strings StringRef AttrKind = Attr.getKindAsString(); @@ -1259,19 +1263,20 @@ void LLVMToSPIRVBase::transAuxDataInst(SPIRVFunction *BF, Function *F) { } } else { // Format for other types is: - // NonSemanticAuxDataFunctionAttribute Fcn AttrStr + // NonSemanticAuxData*Attribute ValueName AttrStr // AttrStr is always a String. std::string AttrStr = Attr.getAsString(); auto *AttrSpvString = BM->getString(AttrStr); Ops.push_back(AttrSpvString->getId()); } - BM->addAuxData(NonSemanticAuxData::FunctionAttribute, - transType(Type::getVoidTy(F->getContext())), Ops); + BM->addAuxData(F ? 
NonSemanticAuxData::FunctionAttribute + : NonSemanticAuxData::GlobalVariableAttribute, + transType(Type::getVoidTy(V->getContext())), Ops); } SmallVector> AllMD; SmallVector MDNames; - F->getContext().getMDKindNames(MDNames); - F->getAllMetadata(AllMD); + V->getContext().getMDKindNames(MDNames); + GO->getAllMetadata(AllMD); for (const auto &MD : AllMD) { std::string MDName = MDNames[MD.first].str(); @@ -1284,11 +1289,11 @@ void LLVMToSPIRVBase::transAuxDataInst(SPIRVFunction *BF, Function *F) { continue; // Format for metadata is: - // NonSemanticAuxDataFunctionMetadata Fcn MDName MDVals... + // NonSemanticAuxData*Metadata ValueName MDName MDVals... // MDName is always a String, MDVals have different types as explained // below. Also note this instruction has a variable number of operands std::vector Ops; - Ops.push_back(BF->getId()); + Ops.push_back(BV->getId()); Ops.push_back(BM->getString(MDName)->getId()); for (unsigned int OpIdx = 0; OpIdx < MD.second->getNumOperands(); OpIdx++) { const auto &CurOp = MD.second->getOperand(OpIdx); @@ -1304,8 +1309,9 @@ void LLVMToSPIRVBase::transAuxDataInst(SPIRVFunction *BF, Function *F) { assert(false && "Unsupported metadata type"); } } - BM->addAuxData(NonSemanticAuxData::FunctionMetadata, - transType(Type::getVoidTy(F->getContext())), Ops); + BM->addAuxData(F ? 
NonSemanticAuxData::FunctionMetadata + : NonSemanticAuxData::GlobalVariableMetadata, + transType(Type::getVoidTy(V->getContext())), Ops); } } @@ -2023,6 +2029,7 @@ LLVMToSPIRVBase::transValueWithoutDecoration(Value *V, SPIRVBasicBlock *BB, if (ST && ST->hasName() && isSPIRVConstantName(ST->getName())) { auto *BV = transConstant(Init); assert(BV); + transAuxDataInst(BV, V); return mapValue(V, BV); } if (isa_and_nonnull(Init)) { @@ -2122,6 +2129,8 @@ LLVMToSPIRVBase::transValueWithoutDecoration(Value *V, SPIRVBasicBlock *BB, GV->getAttribute(kVCMetadata::VCSingleElementVector), BVar); } + transAuxDataInst(BVar, V); + mapValue(V, BVar); spv::BuiltIn Builtin = spv::BuiltInPosition; if (!GV->hasName() || !getSPIRVBuiltin(GV->getName().str(), Builtin)) diff --git a/llvm-spirv/lib/SPIRV/SPIRVWriter.h b/llvm-spirv/lib/SPIRV/SPIRVWriter.h index 1b2ea25bc19ac..1183fa6158ad8 100644 --- a/llvm-spirv/lib/SPIRV/SPIRVWriter.h +++ b/llvm-spirv/lib/SPIRV/SPIRVWriter.h @@ -138,7 +138,7 @@ class LLVMToSPIRVBase : protected BuiltinCallHelper { void transFunctionMetadataAsExecutionMode(SPIRVFunction *BF, Function *F); void transFunctionMetadataAsUserSemanticDecoration(SPIRVFunction *BF, Function *F); - void transAuxDataInst(SPIRVFunction *BF, Function *F); + void transAuxDataInst(SPIRVValue *BV, Value *V); bool transGlobalVariables(); diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/NonSemantic.AuxData.h b/llvm-spirv/lib/SPIRV/libSPIRV/NonSemantic.AuxData.h index 240734afd643e..aa01871cab6d1 100644 --- a/llvm-spirv/lib/SPIRV/libSPIRV/NonSemantic.AuxData.h +++ b/llvm-spirv/lib/SPIRV/libSPIRV/NonSemantic.AuxData.h @@ -28,6 +28,7 @@ namespace NonSemanticAuxData { enum Instruction { FunctionMetadata = 0, FunctionAttribute = 1, - PreserveCount = 2 + GlobalVariableMetadata = 2, + GlobalVariableAttribute = 3 }; } // namespace NonSemanticAuxData diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVExtInst.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVExtInst.h index 693d8dab6bcd1..bbce66fc3ccd3 100644 --- 
a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVExtInst.h +++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVExtInst.h @@ -278,6 +278,10 @@ inline void SPIRVMap::init() { "NonSemanticAuxDataFunctionMetadata"); add(NonSemanticAuxData::FunctionAttribute, "NonSemanticAuxDataFunctionAttribute"); + add(NonSemanticAuxData::GlobalVariableMetadata, + "NonSemanticAuxDataGlobalVariableMetadata"); + add(NonSemanticAuxData::GlobalVariableAttribute, + "NonSemanticAuxDataGlobalVariableAttribute"); } SPIRV_DEF_NAMEMAP(NonSemanticAuxDataOpKind, NonSemanticAuxDataOpMap) diff --git a/llvm-spirv/test/extensions/KHR/SPV_KHR_non_semantic_info/preserve-gv-attributes.ll b/llvm-spirv/test/extensions/KHR/SPV_KHR_non_semantic_info/preserve-gv-attributes.ll new file mode 100644 index 0000000000000..c3451b6d073f7 --- /dev/null +++ b/llvm-spirv/test/extensions/KHR/SPV_KHR_non_semantic_info/preserve-gv-attributes.ll @@ -0,0 +1,56 @@ +; RUN: llvm-as < %s -o %t.bc +; RUN: not llvm-spirv %t.bc -spirv-text --spirv-preserve-auxdata --spirv-max-version=1.5 --spirv-ext=-SPV_KHR_non_semantic_info,+SPV_INTEL_global_variable_decorations -o - 2>&1 | FileCheck %s --check-prefix=CHECK-SPIRV-EXT-DISABLED +; RUN: llvm-spirv %t.bc -o %t.spv --spirv-preserve-auxdata --spirv-max-version=1.5 --spirv-ext=+SPV_INTEL_global_variable_decorations +; RUN: llvm-spirv %t.spv -to-text -o - | FileCheck %s --check-prefixes=CHECK-SPIRV,CHECK-SPIRV-EXT +; RUN: llvm-spirv -r --spirv-preserve-auxdata %t.spv -o %t.rev.bc +; RUN: llvm-dis %t.rev.bc -o - | FileCheck %s --check-prefix=CHECK-LLVM +; RUN: llvm-spirv -r %t.spv -o %t.rev.without.bc +; RUN: llvm-dis %t.rev.without.bc -o - | FileCheck %s --implicit-check-not="{{foo|bar|baz}}" + +; RUN: llvm-spirv %t.bc -spirv-text --spirv-preserve-auxdata --spirv-ext=+SPV_KHR_non_semantic_info,+SPV_INTEL_global_variable_decorations -o - | FileCheck %s --check-prefixes=CHECK-SPIRV,CHECK-SPIRV-NOEXT +; RUN: llvm-spirv %t.bc -o %t.spv --spirv-preserve-auxdata --spirv-ext=+SPV_INTEL_global_variable_decorations 
+; RUN: llvm-spirv %t.spv -to-text -o - | FileCheck %s --check-prefixes=CHECK-SPIRV,CHECK-SPIRV-NOEXT +; RUN: llvm-spirv -r --spirv-preserve-auxdata %t.spv -o %t.rev.bc +; RUN: llvm-dis %t.rev.bc -o - | FileCheck %s --check-prefix=CHECK-LLVM +; RUN: llvm-spirv -r %t.spv -o %t.rev.without.bc +; RUN: llvm-dis %t.rev.without.bc -o - | FileCheck %s --implicit-check-not="{{foo|bar|baz}}" + +; Check SPIR-V versions in a format magic number + version +; CHECK-SPIRV-EXT: 119734787 65536 +; CHECK-SPIRV-EXT: Extension "SPV_KHR_non_semantic_info" +; CHECK-SPIRV-NOEXT: 119734787 67072 + +; CHECK-SPIRV: ExtInstImport [[#Import:]] "NonSemantic.AuxData" + +; CHECK-SPIRV: String [[#Attr0LHS:]] "sycl-device-global-size" +; CHECK-SPIRV: String [[#Attr0RHS:]] "32" +; CHECK-SPIRV: String [[#Attr1:]] "sycl-device-image-scope" +; CHECK-SPIRV: String [[#Attr2LHS:]] "sycl-host-access" +; CHECK-SPIRV: String [[#Attr2RHS:]] "0" +; CHECK-SPIRV: String [[#Attr3LHS:]] "sycl-unique-id" +; CHECK-SPIRV: String [[#Attr3RHS:]] "_Z20__AsanKernelMetadata" + +; CHECK-SPIRV: Name [[#GVName:]] "__AsanKernelMetadata" + +; CHECK-SPIRV: TypeVoid [[#VoidT:]] + +; CHECK-SPIRV: ExtInst [[#VoidT]] [[#Attr0Inst:]] [[#Import]] NonSemanticAuxDataGlobalVariableAttribute [[#GVName]] [[#Attr0LHS]] [[#Attr0RHS]] {{$}} +; CHECK-SPIRV: ExtInst [[#VoidT]] [[#Attr1Inst:]] [[#Import]] NonSemanticAuxDataGlobalVariableAttribute [[#GVName]] [[#Attr1]] {{$}} +; CHECK-SPIRV: ExtInst [[#VoidT]] [[#Attr1Inst:]] [[#Import]] NonSemanticAuxDataGlobalVariableAttribute [[#GVName]] [[#Attr2LHS]] [[#Attr2RHS]] {{$}} +; CHECK-SPIRV: ExtInst [[#VoidT]] [[#Attr1Inst:]] [[#Import]] NonSemanticAuxDataGlobalVariableAttribute [[#GVName]] [[#Attr3LHS]] [[#Attr3RHS]] {{$}} + +target triple = "spir64-unknown-unknown" + +; CHECK-LLVM: @__AsanKernelMetadata = addrspace(1) global [1 x %structtype] [%structtype { i64 0, i64 92 }] #[[#GVIRAttr:]] +%structtype = type { i64, i64 } + +@__AsanKernelMetadata = addrspace(1) global [1 x %structtype] 
[%structtype { i64 ptrtoint (ptr addrspace(2) null to i64), i64 92 }], !spirv.Decorations !0 #0 + +; CHECK-LLVM: attributes #[[#GVIRAttr]] = { "sycl-device-global-size"="32" "sycl-device-image-scope" "sycl-host-access"="0" "sycl-unique-id"="_Z20__AsanKernelMetadata" } +attributes #0 = { "sycl-device-global-size"="32" "sycl-device-image-scope" "sycl-host-access"="0" "sycl-unique-id"="_Z20__AsanKernelMetadata" } + +!0 = !{!1} +!1 = !{i32 6147, i32 0, !"_Z20__AsanKernelMetadata"} + +; CHECK-SPIRV-EXT-DISABLED: RequiresExtension: Feature requires the following SPIR-V extension: +; CHECK-SPIRV-EXT-DISABLED-NEXT: SPV_KHR_non_semantic_info diff --git a/llvm-spirv/test/extensions/KHR/SPV_KHR_non_semantic_info/preserve-gv-metadata.ll b/llvm-spirv/test/extensions/KHR/SPV_KHR_non_semantic_info/preserve-gv-metadata.ll new file mode 100644 index 0000000000000..01d30e8a03008 --- /dev/null +++ b/llvm-spirv/test/extensions/KHR/SPV_KHR_non_semantic_info/preserve-gv-metadata.ll @@ -0,0 +1,41 @@ +; RUN: llvm-as < %s -o %t.bc +; RUN: llvm-spirv %t.bc -o %t.spv --spirv-preserve-auxdata --spirv-max-version=1.5 +; RUN: llvm-spirv %t.spv -to-text -o - | FileCheck %s --check-prefixes=CHECK-SPIRV,CHECK-SPIRV-EXT +; RUN: llvm-spirv -r --spirv-preserve-auxdata %t.spv -o %t.rev.bc +; RUN: llvm-dis %t.rev.bc -o - | FileCheck %s --check-prefix=CHECK-LLVM +; RUN: llvm-spirv -r %t.spv -o %t.rev.without.bc +; RUN: llvm-dis %t.rev.without.bc -o - | FileCheck %s --implicit-check-not="{{foo|bar|baz}}" + +; RUN: llvm-spirv %t.bc -o %t.spv --spirv-preserve-auxdata +; RUN: llvm-spirv %t.spv -to-text -o - | FileCheck %s --check-prefixes=CHECK-SPIRV,CHECK-SPIRV-NOEXT +; RUN: llvm-spirv -r --spirv-preserve-auxdata %t.spv -o %t.rev.bc +; RUN: llvm-dis %t.rev.bc -o - | FileCheck %s --check-prefix=CHECK-LLVM +; RUN: llvm-spirv -r %t.spv -o %t.rev.without.bc +; RUN: llvm-dis %t.rev.without.bc -o - | FileCheck %s --implicit-check-not="{{foo|bar|baz}}" + +; Check SPIR-V versions in a format magic number + version 
+; CHECK-SPIRV-EXT: 119734787 65536 +; CHECK-SPIRV-EXT: Extension "SPV_KHR_non_semantic_info" +; CHECK-SPIRV-NOEXT: 119734787 67072 + +; CHECK-SPIRV: ExtInstImport [[#Import:]] "NonSemantic.AuxData" + +; CHECK-SPIRV: String [[#MDName:]] "absolute_symbol" + +; CHECK-SPIRV: Name [[#GVName:]] "a" + +; CHECK-SPIRV: TypeInt [[#Int32T:]] 64 0 +; CHECK-SPIRV: Constant [[#Int32T]] [[#MDValue0:]] 0 +; CHECK-SPIRV: Constant [[#Int32T]] [[#MDValue1:]] 16 + +; CHECK-SPIRV: TypeVoid [[#VoidT:]] + +; CHECK-SPIRV: ExtInst [[#VoidT]] [[#ValInst:]] [[#Import]] NonSemanticAuxDataGlobalVariableMetadata [[#GVName]] [[#MDName]] [[#MDValue0]] [[#MDValue1]] {{$}} + +target triple = "spir64-unknown-unknown" + +; CHECK-LLVM: @a = external addrspace(1) global i8, !absolute_symbol ![[#LLVMVal:]] +@a = external addrspace(1) global i8, !absolute_symbol !0 + +; CHECK-LLVM: ![[#LLVMVal]] = !{i64 0, i64 16} +!0 = !{i64 0, i64 16} From 17c9fb5a06fe3c055449ca7ffe808c6a7a37c395 Mon Sep 17 00:00:00 2001 From: Jinsong Ji Date: Sun, 5 Jan 2025 21:50:46 -0500 Subject: [PATCH 466/567] [compiler-rt] Add -fPIE to compiler-rt profile test lit.cfg We disable PIE by default in https://github.com/intel/llvm/commit/a9ac1671cb428c4d1170faaae8d623ac8243584a, so the tests that use -shared are failing due to it. Add -fPIE to sync with the llorg default. 
--- compiler-rt/test/profile/lit.cfg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/test/profile/lit.cfg.py b/compiler-rt/test/profile/lit.cfg.py index fc2baf7c40b8f..c8d89537d9a2b 100644 --- a/compiler-rt/test/profile/lit.cfg.py +++ b/compiler-rt/test/profile/lit.cfg.py @@ -194,7 +194,7 @@ def exclude_unsupported_files_for_aix(dirname): config.unsupported = True config.substitutions.append( - ("%shared_lib_flag", "-dynamiclib" if (config.host_os == "Darwin") else "-shared") + ("%shared_lib_flag", "-dynamiclib" if (config.host_os == "Darwin") else "-shared -fPIE") # INTEL ) if config.host_os in ["AIX"]: From aae7f1350358f61748c0dcafc35e51c4d363346f Mon Sep 17 00:00:00 2001 From: Michael Toguchi Date: Fri, 10 Jan 2025 01:08:15 -0800 Subject: [PATCH 467/567] [SYCL][E2E] Cleanup compilation redundancies for FPGA archive tests (#16583) These tests are testing various combinations of compilation modes using FPGA -fsycl-link. The mixing and matching were a bit over-kill. Update these tests to cover the various modes, but reduce the combinations. 
--- .../AOT/fpga-aoc-archive-split-per-kernel.cpp | 75 +++---------------- sycl/test-e2e/AOT/fpga-aoc-archive2.cpp | 23 ++---- 2 files changed, 15 insertions(+), 83 deletions(-) diff --git a/sycl/test-e2e/AOT/fpga-aoc-archive-split-per-kernel.cpp b/sycl/test-e2e/AOT/fpga-aoc-archive-split-per-kernel.cpp index 0e0c53d217c05..a28372d2d9a65 100644 --- a/sycl/test-e2e/AOT/fpga-aoc-archive-split-per-kernel.cpp +++ b/sycl/test-e2e/AOT/fpga-aoc-archive-split-per-kernel.cpp @@ -6,88 +6,33 @@ // Remove any archives // RUN: rm -f %t_*.a -///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Build main object. +// RUN: %clangxx -fintelfpga -fsycl-device-code-split=per_kernel -fsycl -c %S/Inputs/fpga_main.cpp -o %t_main.o + // Build any early archive binaries. 
// RUN: %clangxx -fintelfpga -fsycl-device-code-split=per_kernel -fsycl -fsycl-link=early %S/Inputs/fpga_sub.cpp -o %t_early_sub.a // RUN: %clangxx -fintelfpga -fsycl-device-code-split=per_kernel -fsycl -fsycl-link=early %S/Inputs/fpga_add.cpp -o %t_early_add.a // RUN: %clangxx -fintelfpga -fsycl-device-code-split=per_kernel -fsycl -fsycl-link=early %S/Inputs/fpga_sub_x.cpp -o %t_early_sub_x.a // RUN: %clangxx -fintelfpga -fsycl-device-code-split=per_kernel -fsycl -fsycl-link=early %S/Inputs/fpga_add_x.cpp -o %t_early_add_x.a -///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// Use a variety of archive orders -///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// RUN: %clangxx -fintelfpga -fsycl-device-code-split=per_kernel -fsycl %S/Inputs/fpga_main.cpp %t_early_add.a %t_early_sub.a %t_early_add_x.a %t_early_sub_x.a -o %t_early.out -// RUN: %{run} %t_early.out -// RUN: %clangxx -fintelfpga -fsycl-device-code-split=per_kernel -fsycl %S/Inputs/fpga_main.cpp %t_early_sub_x.a %t_early_add.a %t_early_sub.a %t_early_add_x.a -o %t_early.out -// RUN: %{run} %t_early.out -// RUN: %clangxx -fintelfpga -fsycl-device-code-split=per_kernel -fsycl %S/Inputs/fpga_main.cpp %t_early_add_x.a %t_early_sub_x.a %t_early_add.a %t_early_sub.a -o %t_early.out -// RUN: %{run} %t_early.out -// RUN: %clangxx -fintelfpga -fsycl-device-code-split=per_kernel -fsycl %S/Inputs/fpga_main.cpp %t_early_sub.a %t_early_add_x.a %t_early_sub_x.a %t_early_add.a -o %t_early.out + +// Test baseline of all early archives and main. 
+// RUN: %clangxx -fintelfpga -fsycl-device-code-split=per_kernel -fsycl %t_main.o %t_early_add.a %t_early_sub.a %t_early_add_x.a %t_early_sub_x.a -o %t_early.out // RUN: %{run} %t_early.out -///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Build any image archive binaries. // RUN: %clangxx -fintelfpga -fsycl-device-code-split=per_kernel -fsycl -fsycl-link=image %S/Inputs/fpga_sub.cpp -o %t_image_sub.a // RUN: %clangxx -fintelfpga -fsycl-device-code-split=per_kernel -fsycl -fsycl-link=image %S/Inputs/fpga_add.cpp -o %t_image_add.a // RUN: %clangxx -fintelfpga -fsycl-device-code-split=per_kernel -fsycl -fsycl-link=image %S/Inputs/fpga_sub_x.cpp -o %t_image_sub_x.a // RUN: %clangxx -fintelfpga -fsycl-device-code-split=per_kernel -fsycl -fsycl-link=image %S/Inputs/fpga_add_x.cpp -o %t_image_add_x.a -///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// Use a variety of archive orders -///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// RUN: %clangxx -fintelfpga -fsycl-device-code-split=per_kernel -fsycl %S/Inputs/fpga_main.cpp %t_image_add.a %t_image_sub.a %t_image_add_x.a %t_image_sub_x.a -o %t_image.out + +// Test baseline of all image archives and main. 
+// RUN: %clangxx -fintelfpga -fsycl-device-code-split=per_kernel -fsycl %t_main.o %t_image_add.a %t_image_sub.a %t_image_add_x.a %t_image_sub_x.a -o %t_image.out // RUN: %{run} %t_image.out -// RUN: %clangxx -fintelfpga -fsycl-device-code-split=per_kernel -fsycl %S/Inputs/fpga_main.cpp %t_image_sub_x.a %t_image_add.a %t_image_sub.a %t_image_add_x.a -o %t_early.out -// RUN: %{run} %t_early.out -// RUN: %clangxx -fintelfpga -fsycl-device-code-split=per_kernel -fsycl %S/Inputs/fpga_main.cpp %t_image_add_x.a %t_image_sub_x.a %t_image_add.a %t_image_sub.a -o %t_early.out -// RUN: %{run} %t_early.out -// RUN: %clangxx -fintelfpga -fsycl-device-code-split=per_kernel -fsycl %S/Inputs/fpga_main.cpp %t_image_sub.a %t_image_add_x.a %t_image_sub_x.a %t_image_add.a -o %t_early.out -// RUN: %{run} %t_early.out -///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Build any image archive binaries from early archives. 
-// RUN: %clangxx -fintelfpga -fsycl-device-code-split=per_kernel -fsycl -fsycl-link=image %t_early_sub.a -o %t_early_image_sub.a -// RUN: %clangxx -fintelfpga -fsycl-device-code-split=per_kernel -fsycl -fsycl-link=image %t_early_add.a -o %t_early_image_add.a // RUN: %clangxx -fintelfpga -fsycl-device-code-split=per_kernel -fsycl -fsycl-link=image %t_early_sub_x.a -o %t_early_image_sub_x.a // RUN: %clangxx -fintelfpga -fsycl-device-code-split=per_kernel -fsycl -fsycl-link=image %t_early_add_x.a -o %t_early_image_add_x.a -///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// Use a variety of archive orders -///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// RUN: %clangxx -fintelfpga -fsycl-device-code-split=per_kernel -fsycl %S/Inputs/fpga_main.cpp %t_early_image_add.a %t_early_image_sub.a %t_early_image_add_x.a %t_early_image_sub_x.a -o %t_early_image.out -// RUN: %{run} %t_early_image.out -// RUN: %clangxx -fintelfpga -fsycl-device-code-split=per_kernel -fsycl %S/Inputs/fpga_main.cpp %t_early_image_sub_x.a %t_early_image_add.a %t_early_image_sub.a %t_early_image_add_x.a -o %t_early_image.out -// RUN: %{run} %t_early_image.out -// RUN: %clangxx -fintelfpga -fsycl-device-code-split=per_kernel -fsycl %S/Inputs/fpga_main.cpp %t_early_image_add_x.a %t_early_image_sub_x.a %t_early_image_add.a %t_early_image_sub.a -o %t_early_image.out -// RUN: %{run} %t_early_image.out -// RUN: %clangxx -fintelfpga -fsycl-device-code-split=per_kernel -fsycl %S/Inputs/fpga_main.cpp %t_early_image_sub.a %t_early_image_add_x.a %t_early_image_sub_x.a %t_early_image_add.a -o %t_early_image.out -// RUN: %{run} %t_early_image.out -///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Mix early and image archive usage -// RUN: %clangxx -fintelfpga -fsycl-device-code-split=per_kernel -fsycl %S/Inputs/fpga_main.cpp %t_early_add.a %t_image_sub.a %t_early_add_x.a %t_image_sub_x.a -o %t_mix.out -// RUN: %{run} %t_mix.out -// RUN: %clangxx -fintelfpga -fsycl-device-code-split=per_kernel -fsycl %S/Inputs/fpga_main.cpp %t_image_add.a %t_early_sub.a %t_image_add_x.a %t_early_sub_x.a -o %t_mix.out -// RUN: %{run} %t_mix.out - -///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// Provide some kernels without going through an archive -// RUN: %clangxx -fintelfpga -fsycl-device-code-split=per_kernel -fsycl %S/Inputs/fpga_main.cpp %S/Inputs/fpga_add.cpp %t_image_sub.a %t_early_add_x.a %t_image_sub_x.a -o %t_mix.out -// RUN: %{run} %t_mix.out -// RUN: %clangxx -fintelfpga -fsycl-device-code-split=per_kernel -fsycl %S/Inputs/fpga_main.cpp %S/Inputs/fpga_add.cpp %t_early_sub.a %t_image_add_x.a %t_early_sub_x.a -o %t_mix.out -// RUN: %{run} %t_mix.out -// RUN: %clangxx -fintelfpga -fsycl-device-code-split=per_kernel -fsycl %S/Inputs/fpga_main.cpp %S/Inputs/fpga_add.cpp %S/Inputs/fpga_sub.cpp %t_early_add_x.a %t_image_sub_x.a -o %t_mix.out -// RUN: %{run} %t_mix.out -// RUN: %clangxx -fintelfpga -fsycl-device-code-split=per_kernel -fsycl %S/Inputs/fpga_main.cpp %S/Inputs/fpga_add.cpp %S/Inputs/fpga_sub.cpp 
%t_image_add_x.a %t_early_sub_x.a -o %t_mix.out -// RUN: %{run} %t_mix.out -// RUN: %clangxx -fintelfpga -fsycl-device-code-split=per_kernel -fsycl %S/Inputs/fpga_main.cpp %S/Inputs/fpga_add.cpp %t_image_sub.a %S/Inputs/fpga_add_x.cpp %t_image_sub_x.a -o %t_mix.out -// RUN: %{run} %t_mix.out -// RUN: %clangxx -fintelfpga -fsycl-device-code-split=per_kernel -fsycl %S/Inputs/fpga_main.cpp %S/Inputs/fpga_add.cpp %t_image_sub.a %S/Inputs/fpga_add_x.cpp %t_early_sub_x.a -o %t_mix.out +// RUN: %clangxx -fintelfpga -fsycl-device-code-split=per_kernel -fsycl %t_main.o %t_early_add.a %t_image_sub.a %t_early_image_add_x.a %t_early_image_sub_x.a -o %t_mix.out // RUN: %{run} %t_mix.out diff --git a/sycl/test-e2e/AOT/fpga-aoc-archive2.cpp b/sycl/test-e2e/AOT/fpga-aoc-archive2.cpp index 7b7dde8812a93..27ba741f52c69 100644 --- a/sycl/test-e2e/AOT/fpga-aoc-archive2.cpp +++ b/sycl/test-e2e/AOT/fpga-aoc-archive2.cpp @@ -6,9 +6,9 @@ // Remove any archives // RUN: rm -f %t_*.a -//////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// +// Build main +// RUN: %clangxx -fintelfpga -fsycl %S/Inputs/fpga_main.cpp -c -o %t_main.o + // Build any early archive binaries. 
// RUN: %clangxx -fintelfpga -fsycl -fsycl-link=early %S/Inputs/fpga_sub.cpp -o %t_early_sub.a // RUN: %clangxx -fintelfpga -fsycl -fsycl-link=early %S/Inputs/fpga_sub_x.cpp -o %t_early_sub_x.a @@ -19,19 +19,6 @@ // RUN: %clangxx -fintelfpga -fsycl -fsycl-link=image %S/Inputs/fpga_sub_x.cpp -o %t_image_sub_x.a // RUN: %clangxx -fintelfpga -fsycl -fsycl-link=image %S/Inputs/fpga_add_x.cpp -o %t_image_add_x.a -//////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// -// Provide some kernels without going through an archive -// RUN: %clangxx -fintelfpga -fsycl %S/Inputs/fpga_main.cpp %S/Inputs/fpga_add.cpp %t_image_sub.a %t_early_add_x.a %t_image_sub_x.a -o %t_mix.out -// RUN: %{run} %t_mix.out -// RUN: %clangxx -fintelfpga -fsycl %S/Inputs/fpga_main.cpp %S/Inputs/fpga_add.cpp %t_early_sub.a %t_image_add_x.a %t_early_sub_x.a -o %t_mix.out -// RUN: %{run} %t_mix.out -// RUN: %clangxx -fintelfpga -fsycl %S/Inputs/fpga_main.cpp %S/Inputs/fpga_add.cpp %S/Inputs/fpga_sub.cpp %t_early_add_x.a %t_image_sub_x.a -o %t_mix.out -// RUN: %{run} %t_mix.out -// RUN: %clangxx -fintelfpga -fsycl %S/Inputs/fpga_main.cpp %S/Inputs/fpga_add.cpp %S/Inputs/fpga_sub.cpp %t_image_add_x.a %t_early_sub_x.a -o %t_mix.out -// RUN: %{run} %t_mix.out -// RUN: %clangxx -fintelfpga -fsycl %S/Inputs/fpga_main.cpp %S/Inputs/fpga_add.cpp %t_image_sub.a %S/Inputs/fpga_add_x.cpp %t_image_sub_x.a -o %t_mix.out -// RUN: %{run} %t_mix.out -// RUN: %clangxx -fintelfpga -fsycl %S/Inputs/fpga_main.cpp %S/Inputs/fpga_add.cpp %t_image_sub.a %S/Inputs/fpga_add_x.cpp %t_early_sub_x.a -o %t_mix.out +// Build using various combinations of archives and source. 
+// RUN: %clangxx -fintelfpga -fsycl %t_main.o %S/Inputs/fpga_add.cpp %t_image_sub.a %t_early_add_x.a %t_image_sub_x.a -o %t_mix.out // RUN: %{run} %t_mix.out From 783550602826af12b5c6f8e76cdaab41c997dbf7 Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Fri, 10 Jan 2025 07:50:57 -0800 Subject: [PATCH 468/567] [SYCL][E2E] Pre-compile `fpga_main.cpp` into `.o` file (#16579) This particular TU has no device code so can be pre-compiled and passed as an object file to final link invocations without affecting the tests' purpose. I'm not touching `fpga-aoc-archive-split-per-kernel.cpp`/ `fpga-aoc-archive2.cpp` as these two are the slowest and would need other changes. --- sycl/test-e2e/AOT/fpga-aoc-archive-early.cpp | 10 ++++++---- sycl/test-e2e/AOT/fpga-aoc-archive-early2.cpp | 10 ++++++---- sycl/test-e2e/AOT/fpga-aoc-archive-image.cpp | 10 ++++++---- sycl/test-e2e/AOT/fpga-aoc-archive.cpp | 6 ++++-- 4 files changed, 22 insertions(+), 14 deletions(-) diff --git a/sycl/test-e2e/AOT/fpga-aoc-archive-early.cpp b/sycl/test-e2e/AOT/fpga-aoc-archive-early.cpp index ef0e34ec42f3b..fcda842266752 100644 --- a/sycl/test-e2e/AOT/fpga-aoc-archive-early.cpp +++ b/sycl/test-e2e/AOT/fpga-aoc-archive-early.cpp @@ -8,6 +8,8 @@ //////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// +// RUN: %clangxx -fintelfpga -fsycl %S/Inputs/fpga_main.cpp -c -o %t_main.o + // Build any early archive binaries. 
// RUN: %clangxx -fintelfpga -fsycl -fsycl-link=early %S/Inputs/fpga_sub.cpp -o %t_early_sub.a // RUN: %clangxx -fintelfpga -fsycl -fsycl-link=early %S/Inputs/fpga_add.cpp -o %t_early_add.a @@ -16,11 +18,11 @@ //////////////////////////////////////////////////////////////////////////////// // Use a variety of archive orders //////////////////////////////////////////////////////////////////////////////// -// RUN: %clangxx -fintelfpga -fsycl %S/Inputs/fpga_main.cpp %t_early_add.a %t_early_sub.a %t_early_add_x.a %t_early_sub_x.a -o %t_early.out +// RUN: %clangxx -fintelfpga -fsycl %t_main.o %t_early_add.a %t_early_sub.a %t_early_add_x.a %t_early_sub_x.a -o %t_early.out // RUN: %{run} %t_early.out -// RUN: %clangxx -fintelfpga -fsycl %S/Inputs/fpga_main.cpp %t_early_sub_x.a %t_early_add.a %t_early_sub.a %t_early_add_x.a -o %t_early.out +// RUN: %clangxx -fintelfpga -fsycl %t_main.o %t_early_sub_x.a %t_early_add.a %t_early_sub.a %t_early_add_x.a -o %t_early.out // RUN: %{run} %t_early.out -// RUN: %clangxx -fintelfpga -fsycl %S/Inputs/fpga_main.cpp %t_early_add_x.a %t_early_sub_x.a %t_early_add.a %t_early_sub.a -o %t_early.out +// RUN: %clangxx -fintelfpga -fsycl %t_main.o %t_early_add_x.a %t_early_sub_x.a %t_early_add.a %t_early_sub.a -o %t_early.out // RUN: %{run} %t_early.out -// RUN: %clangxx -fintelfpga -fsycl %S/Inputs/fpga_main.cpp %t_early_sub.a %t_early_add_x.a %t_early_sub_x.a %t_early_add.a -o %t_early.out +// RUN: %clangxx -fintelfpga -fsycl %t_main.o %t_early_sub.a %t_early_add_x.a %t_early_sub_x.a %t_early_add.a -o %t_early.out // RUN: %{run} %t_early.out diff --git a/sycl/test-e2e/AOT/fpga-aoc-archive-early2.cpp b/sycl/test-e2e/AOT/fpga-aoc-archive-early2.cpp index 08cc0e8199cfd..85c18358590ad 100644 --- a/sycl/test-e2e/AOT/fpga-aoc-archive-early2.cpp +++ b/sycl/test-e2e/AOT/fpga-aoc-archive-early2.cpp @@ -8,6 +8,8 @@ //////////////////////////////////////////////////////////////////////////////// 
//////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// +// RUN: %clangxx -fintelfpga -fsycl %S/Inputs/fpga_main.cpp -c -o %t_main.o + // Build any early archive binaries. // RUN: %clangxx -fintelfpga -fsycl -fsycl-link=early %S/Inputs/fpga_sub.cpp -o %t_early_sub.a // RUN: %clangxx -fintelfpga -fsycl -fsycl-link=early %S/Inputs/fpga_add.cpp -o %t_early_add.a @@ -25,11 +27,11 @@ //////////////////////////////////////////////////////////////////////////////// // Use a variety of archive orders //////////////////////////////////////////////////////////////////////////////// -// RUN: %clangxx -fintelfpga -fsycl %S/Inputs/fpga_main.cpp %t_early_image_add.a %t_early_image_sub.a %t_early_image_add_x.a %t_early_image_sub_x.a -o %t_early_image.out +// RUN: %clangxx -fintelfpga -fsycl %t_main.o %t_early_image_add.a %t_early_image_sub.a %t_early_image_add_x.a %t_early_image_sub_x.a -o %t_early_image.out // RUN: %{run} %t_early_image.out -// RUN: %clangxx -fintelfpga -fsycl %S/Inputs/fpga_main.cpp %t_early_image_sub_x.a %t_early_image_add.a %t_early_image_sub.a %t_early_image_add_x.a -o %t_early_image.out +// RUN: %clangxx -fintelfpga -fsycl %t_main.o %t_early_image_sub_x.a %t_early_image_add.a %t_early_image_sub.a %t_early_image_add_x.a -o %t_early_image.out // RUN: %{run} %t_early_image.out -// RUN: %clangxx -fintelfpga -fsycl %S/Inputs/fpga_main.cpp %t_early_image_add_x.a %t_early_image_sub_x.a %t_early_image_add.a %t_early_image_sub.a -o %t_early_image.out +// RUN: %clangxx -fintelfpga -fsycl %t_main.o %t_early_image_add_x.a %t_early_image_sub_x.a %t_early_image_add.a %t_early_image_sub.a -o %t_early_image.out // RUN: %{run} %t_early_image.out -// RUN: %clangxx -fintelfpga -fsycl %S/Inputs/fpga_main.cpp %t_early_image_sub.a %t_early_image_add_x.a %t_early_image_sub_x.a %t_early_image_add.a -o %t_early_image.out +// RUN: %clangxx -fintelfpga -fsycl %t_main.o 
%t_early_image_sub.a %t_early_image_add_x.a %t_early_image_sub_x.a %t_early_image_add.a -o %t_early_image.out // RUN: %{run} %t_early_image.out diff --git a/sycl/test-e2e/AOT/fpga-aoc-archive-image.cpp b/sycl/test-e2e/AOT/fpga-aoc-archive-image.cpp index 6ca28d4e20ad9..f659ad6b588b4 100644 --- a/sycl/test-e2e/AOT/fpga-aoc-archive-image.cpp +++ b/sycl/test-e2e/AOT/fpga-aoc-archive-image.cpp @@ -8,6 +8,8 @@ //////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// +// RUN: %clangxx -fintelfpga -fsycl %S/Inputs/fpga_main.cpp -c -o %t_main.o + // Build any image archive binaries. // RUN: %clangxx -fintelfpga -fsycl -fsycl-link=image %S/Inputs/fpga_sub.cpp -o %t_image_sub.a // RUN: %clangxx -fintelfpga -fsycl -fsycl-link=image %S/Inputs/fpga_add.cpp -o %t_image_add.a @@ -16,11 +18,11 @@ //////////////////////////////////////////////////////////////////////////////// // Use a variety of archive orders //////////////////////////////////////////////////////////////////////////////// -// RUN: %clangxx -fintelfpga -fsycl %S/Inputs/fpga_main.cpp %t_image_add.a %t_image_sub.a %t_image_add_x.a %t_image_sub_x.a -o %t_image.out +// RUN: %clangxx -fintelfpga -fsycl %t_main.o %t_image_add.a %t_image_sub.a %t_image_add_x.a %t_image_sub_x.a -o %t_image.out // RUN: %{run} %t_image.out -// RUN: %clangxx -fintelfpga -fsycl %S/Inputs/fpga_main.cpp %t_image_sub_x.a %t_image_add.a %t_image_sub.a %t_image_add_x.a -o %t_image.out +// RUN: %clangxx -fintelfpga -fsycl %t_main.o %t_image_sub_x.a %t_image_add.a %t_image_sub.a %t_image_add_x.a -o %t_image.out // RUN: %{run} %t_image.out -// RUN: %clangxx -fintelfpga -fsycl %S/Inputs/fpga_main.cpp %t_image_add_x.a %t_image_sub_x.a %t_image_add.a %t_image_sub.a -o %t_image.out +// RUN: %clangxx -fintelfpga -fsycl %t_main.o %t_image_add_x.a %t_image_sub_x.a 
%t_image_add.a %t_image_sub.a -o %t_image.out // RUN: %{run} %t_image.out -// RUN: %clangxx -fintelfpga -fsycl %S/Inputs/fpga_main.cpp %t_image_sub.a %t_image_add_x.a %t_image_sub_x.a %t_image_add.a -o %t_image.out +// RUN: %clangxx -fintelfpga -fsycl %t_main.o %t_image_sub.a %t_image_add_x.a %t_image_sub_x.a %t_image_add.a -o %t_image.out // RUN: %{run} %t_image.out diff --git a/sycl/test-e2e/AOT/fpga-aoc-archive.cpp b/sycl/test-e2e/AOT/fpga-aoc-archive.cpp index 43fa561592783..878f1804200eb 100644 --- a/sycl/test-e2e/AOT/fpga-aoc-archive.cpp +++ b/sycl/test-e2e/AOT/fpga-aoc-archive.cpp @@ -9,6 +9,8 @@ //////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// +// RUN: %clangxx -fintelfpga -fsycl %S/Inputs/fpga_main.cpp -c -o %t_main.o + // Build any early archive binaries. // RUN: %clangxx -fintelfpga -fsycl -fsycl-link=early %S/Inputs/fpga_sub.cpp -o %t_early_sub.a // RUN: %clangxx -fintelfpga -fsycl -fsycl-link=early %S/Inputs/fpga_add.cpp -o %t_early_add.a @@ -25,7 +27,7 @@ //////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// // Mix early and image archive usage -// RUN: %clangxx -fintelfpga -fsycl %S/Inputs/fpga_main.cpp %t_early_add.a %t_image_sub.a %t_early_add_x.a %t_image_sub_x.a -o %t_mix.out +// RUN: %clangxx -fintelfpga -fsycl %t_main.o %t_early_add.a %t_image_sub.a %t_early_add_x.a %t_image_sub_x.a -o %t_mix.out // RUN: %{run} %t_mix.out -// RUN: %clangxx -fintelfpga -fsycl %S/Inputs/fpga_main.cpp %t_image_add.a %t_early_sub.a %t_image_add_x.a %t_early_sub_x.a -o %t_mix.out +// RUN: %clangxx -fintelfpga -fsycl %t_main.o %t_image_add.a %t_early_sub.a %t_image_add_x.a %t_early_sub_x.a -o %t_mix.out // RUN: %{run} %t_mix.out From 
c309c69317f6acfdb48cd2257bef76e105ab16d2 Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Fri, 10 Jan 2025 07:52:18 -0800 Subject: [PATCH 469/567] [SYCL][E2E] Don't cover full combinations in `vec_binary_scalar_order*` (#16574) First, full coverage belongs to CTS, not to unit tests. Second, the implementation is heavily templated and there isn't much potential for differences across the many integer types or vector sizes. This PR eliminates test cases that are unlikely to exercise code not covered by the remaining ones. The purpose of this is to reduce the time these tests take as they're very close to being a bottleneck on machines with lots of cores where total wall clock time is defined by the longest tests in the suite. --- .../Basic/vector/vec_binary_scalar_order.hpp | 6 ++---- .../Basic/vector/vec_binary_scalar_order_arith.cpp | 14 +------------- .../vector/vec_binary_scalar_order_bitwise.cpp | 5 ----- .../vector/vec_binary_scalar_order_logical.cpp | 5 ----- .../vector/vec_binary_scalar_order_relational.cpp | 4 ---- 5 files changed, 3 insertions(+), 31 deletions(-) diff --git a/sycl/test-e2e/Basic/vector/vec_binary_scalar_order.hpp b/sycl/test-e2e/Basic/vector/vec_binary_scalar_order.hpp index 3d825665a7690..879a4f6da9463 100644 --- a/sycl/test-e2e/Basic/vector/vec_binary_scalar_order.hpp +++ b/sycl/test-e2e/Basic/vector/vec_binary_scalar_order.hpp @@ -93,7 +93,5 @@ bool CheckResult(sycl::vec V, T2 Ref) { #define CHECK_SIZES(Q, C, T, IS_RELOP, OP) \ CHECK(Q, C, T, 1, IS_RELOP, OP) \ - CHECK(Q, C, T, 2, IS_RELOP, OP) \ - CHECK(Q, C, T, 4, IS_RELOP, OP) \ - CHECK(Q, C, T, 8, IS_RELOP, OP) \ - CHECK(Q, C, T, 16, IS_RELOP, OP) + CHECK(Q, C, T, 3, IS_RELOP, OP) \ + CHECK(Q, C, T, 8, IS_RELOP, OP) diff --git a/sycl/test-e2e/Basic/vector/vec_binary_scalar_order_arith.cpp b/sycl/test-e2e/Basic/vector/vec_binary_scalar_order_arith.cpp index 409838c4077bb..d5b74859b701c 100644 --- a/sycl/test-e2e/Basic/vector/vec_binary_scalar_order_arith.cpp +++ 
b/sycl/test-e2e/Basic/vector/vec_binary_scalar_order_arith.cpp @@ -26,25 +26,13 @@ int main() { CHECK_SIZES_AND_COMMON_OPS(Q, Failures, double); } - // Check all operators without requirements. + // Check operators without requirements. CHECK_SIZES_AND_COMMON_OPS(Q, Failures, float); CHECK_SIZES_AND_COMMON_OPS(Q, Failures, int8_t); - CHECK_SIZES_AND_COMMON_OPS(Q, Failures, int16_t); - CHECK_SIZES_AND_COMMON_OPS(Q, Failures, int32_t); - CHECK_SIZES_AND_COMMON_OPS(Q, Failures, int64_t); - CHECK_SIZES_AND_COMMON_OPS(Q, Failures, uint8_t); CHECK_SIZES_AND_COMMON_OPS(Q, Failures, uint16_t); - CHECK_SIZES_AND_COMMON_OPS(Q, Failures, uint32_t); - CHECK_SIZES_AND_COMMON_OPS(Q, Failures, uint64_t); // Check integer only operators. - CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, int8_t); CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, int16_t); - CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, int32_t); - CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, int64_t); CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, uint8_t); - CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, uint16_t); - CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, uint32_t); - CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, uint64_t); return Failures; } diff --git a/sycl/test-e2e/Basic/vector/vec_binary_scalar_order_bitwise.cpp b/sycl/test-e2e/Basic/vector/vec_binary_scalar_order_bitwise.cpp index d2b0fe15bb501..6ca491fb6afba 100644 --- a/sycl/test-e2e/Basic/vector/vec_binary_scalar_order_bitwise.cpp +++ b/sycl/test-e2e/Basic/vector/vec_binary_scalar_order_bitwise.cpp @@ -19,12 +19,7 @@ int main() { // Check operators. 
CHECK_SIZES_AND_OPS(Q, Failures, int8_t); - CHECK_SIZES_AND_OPS(Q, Failures, int16_t); CHECK_SIZES_AND_OPS(Q, Failures, int32_t); - CHECK_SIZES_AND_OPS(Q, Failures, int64_t); - CHECK_SIZES_AND_OPS(Q, Failures, uint8_t); CHECK_SIZES_AND_OPS(Q, Failures, uint16_t); - CHECK_SIZES_AND_OPS(Q, Failures, uint32_t); - CHECK_SIZES_AND_OPS(Q, Failures, uint64_t); return Failures; } diff --git a/sycl/test-e2e/Basic/vector/vec_binary_scalar_order_logical.cpp b/sycl/test-e2e/Basic/vector/vec_binary_scalar_order_logical.cpp index 7c5da5e898f85..e17ad6ecb201e 100644 --- a/sycl/test-e2e/Basic/vector/vec_binary_scalar_order_logical.cpp +++ b/sycl/test-e2e/Basic/vector/vec_binary_scalar_order_logical.cpp @@ -24,12 +24,7 @@ int main() { // Check all operators without requirements. CHECK_SIZES_AND_OPS(Q, Failures, float); CHECK_SIZES_AND_OPS(Q, Failures, int8_t); - CHECK_SIZES_AND_OPS(Q, Failures, int16_t); CHECK_SIZES_AND_OPS(Q, Failures, int32_t); - CHECK_SIZES_AND_OPS(Q, Failures, int64_t); - CHECK_SIZES_AND_OPS(Q, Failures, uint8_t); CHECK_SIZES_AND_OPS(Q, Failures, uint16_t); - CHECK_SIZES_AND_OPS(Q, Failures, uint32_t); - CHECK_SIZES_AND_OPS(Q, Failures, uint64_t); return Failures; } diff --git a/sycl/test-e2e/Basic/vector/vec_binary_scalar_order_relational.cpp b/sycl/test-e2e/Basic/vector/vec_binary_scalar_order_relational.cpp index 514d9b4d9c8aa..655347cb1fc0c 100644 --- a/sycl/test-e2e/Basic/vector/vec_binary_scalar_order_relational.cpp +++ b/sycl/test-e2e/Basic/vector/vec_binary_scalar_order_relational.cpp @@ -29,12 +29,8 @@ int main() { // Check all operators without requirements. 
CHECK_SIZES_AND_OPS(Q, Failures, float); CHECK_SIZES_AND_OPS(Q, Failures, int8_t); - CHECK_SIZES_AND_OPS(Q, Failures, int16_t); CHECK_SIZES_AND_OPS(Q, Failures, int32_t); - CHECK_SIZES_AND_OPS(Q, Failures, int64_t); CHECK_SIZES_AND_OPS(Q, Failures, uint8_t); CHECK_SIZES_AND_OPS(Q, Failures, uint16_t); - CHECK_SIZES_AND_OPS(Q, Failures, uint32_t); - CHECK_SIZES_AND_OPS(Q, Failures, uint64_t); return Failures; } From 81c2858567f72ad07260ecbf4e85e59fc44c7059 Mon Sep 17 00:00:00 2001 From: Nikita Kornev Date: Fri, 10 Jan 2025 17:17:40 +0100 Subject: [PATCH 470/567] [CI] Update nightly sycl-cts (#16587) Disabled some redundant steps for run-only mode. Test-run: https://github.com/intel/llvm/actions/runs/12708106400 --- .github/workflows/sycl-linux-run-tests.yml | 4 ++-- .github/workflows/sycl-nightly.yml | 5 +---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/.github/workflows/sycl-linux-run-tests.yml b/.github/workflows/sycl-linux-run-tests.yml index e9429e6bcf718..3e4066fdc1904 100644 --- a/.github/workflows/sycl-linux-run-tests.yml +++ b/.github/workflows/sycl-linux-run-tests.yml @@ -222,7 +222,7 @@ jobs: merge_ref: ${{ inputs.merge_ref }} cache_path: "/__w/repo_cache/" - name: Checkout SYCL CTS tests - if: inputs.tests_selector == 'cts' + if: inputs.tests_selector == 'cts' && inputs.cts_testing_mode != 'run-only' uses: ./devops/actions/cached_checkout with: path: khronos_sycl_cts @@ -231,7 +231,7 @@ jobs: default_branch: 'main' cache_path: "/__w/repo_cache/" - name: SYCL CTS GIT submodules init - if: inputs.tests_selector == 'cts' + if: inputs.tests_selector == 'cts' && inputs.cts_testing_mode != 'run-only' run: | git -C khronos_sycl_cts submodule update --init - name: Install drivers diff --git a/.github/workflows/sycl-nightly.yml b/.github/workflows/sycl-nightly.yml index 5c6b6ad89f57d..0c3ff68b27efd 100644 --- a/.github/workflows/sycl-nightly.yml +++ b/.github/workflows/sycl-nightly.yml @@ -186,14 +186,12 @@ jobs: image: 
ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest image_options: -u 1001 --device=/dev/dri --privileged --cap-add SYS_ADMIN target_devices: opencl:cpu - tests_selector: cts - name: SYCL-CTS on L0 gen12 runner: '["Linux", "gen12"]' image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest image_options: -u 1001 --device=/dev/dri --privileged --cap-add SYS_ADMIN target_devices: level_zero:gpu - tests_selector: cts uses: ./.github/workflows/sycl-linux-run-tests.yml with: name: ${{ matrix.name }} @@ -202,9 +200,8 @@ jobs: image: ${{ matrix.image }} image_options: ${{ matrix.image_options }} target_devices: ${{ matrix.target_devices }} - tests_selector: ${{ matrix.tests_selector }} + tests_selector: cts ref: ${{ github.sha }} - merge_ref: '' sycl_toolchain_artifact: sycl_linux_default sycl_toolchain_archive: ${{ needs.ubuntu2204_build.outputs.artifact_archive_name }} sycl_toolchain_decompress_command: ${{ needs.ubuntu2204_build.outputs.artifact_decompress_command }} From 95506e4b130d8502c58964a51a4a283a80f90b76 Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Fri, 10 Jan 2025 08:17:55 -0800 Subject: [PATCH 471/567] [NFC][SYCL][E2E] Split `Reduction/reduction_internal.cpp` (#16578) The test is long enough to be a bottleneck for the total wall clock time when running E2E tests on a machine with enough cores (like our PVC/SPR runners in CI). 
--- ...on_internal.cpp => reduction_internal.hpp} | 31 +++---------------- .../reduction_internal_nd_range_1dim.cpp | 20 ++++++++++++ .../reduction_internal_range_1dim.cpp | 13 ++++++++ .../reduction_internal_range_2dim.cpp | 13 ++++++++ .../reduction_internal_range_3dim.cpp | 13 ++++++++ 5 files changed, 64 insertions(+), 26 deletions(-) rename sycl/test-e2e/Reduction/{reduction_internal.cpp => reduction_internal.hpp} (82%) create mode 100644 sycl/test-e2e/Reduction/reduction_internal_nd_range_1dim.cpp create mode 100644 sycl/test-e2e/Reduction/reduction_internal_range_1dim.cpp create mode 100644 sycl/test-e2e/Reduction/reduction_internal_range_2dim.cpp create mode 100644 sycl/test-e2e/Reduction/reduction_internal_range_3dim.cpp diff --git a/sycl/test-e2e/Reduction/reduction_internal.cpp b/sycl/test-e2e/Reduction/reduction_internal.hpp similarity index 82% rename from sycl/test-e2e/Reduction/reduction_internal.cpp rename to sycl/test-e2e/Reduction/reduction_internal.hpp index e45037a2ef569..ac2590654b468 100644 --- a/sycl/test-e2e/Reduction/reduction_internal.cpp +++ b/sycl/test-e2e/Reduction/reduction_internal.hpp @@ -1,6 +1,3 @@ -// RUN: %{build} -o %t.out -// RUN: %{run} %t.out - #include #include @@ -120,27 +117,9 @@ void testAllStrategies(RedStorage &Storage, RangeTy Range) { }); } -int main() { - queue q; - RedStorage Storage(q); - - auto TestRange = [&](auto Range) { - testAllStrategies(Storage, Range); - testAllStrategies(Storage, Range); - testAllStrategies(Storage, Range); - testAllStrategies(Storage, Range); - }; - - TestRange(range<1>{42}); - TestRange(range<2>{8, 8}); - TestRange(range<3>{7, 7, 5}); - TestRange(nd_range<1>{range<1>{7}, range<1>{7}}); - TestRange(nd_range<1>{range<1>{3 * 3}, range<1>{3}}); - - // TODO: Strategies historically adopted from sycl::range implementation only - // support 1-Dim case. 
- // - // TestRange(nd_range<2>{range<2>{7, 3}, range<2> {7, 3}}); - // TestRange(nd_range<2>{range<2>{14, 9}, range<2> {7, 3}}); - return 0; +template void testRange(RedStorage &Storage, RangeTy Range) { + testAllStrategies(Storage, Range); + testAllStrategies(Storage, Range); + testAllStrategies(Storage, Range); + testAllStrategies(Storage, Range); } diff --git a/sycl/test-e2e/Reduction/reduction_internal_nd_range_1dim.cpp b/sycl/test-e2e/Reduction/reduction_internal_nd_range_1dim.cpp new file mode 100644 index 0000000000000..bebbfd13848ae --- /dev/null +++ b/sycl/test-e2e/Reduction/reduction_internal_nd_range_1dim.cpp @@ -0,0 +1,20 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#include "reduction_internal.hpp" + +int main() { + queue q; + RedStorage Storage(q); + + testRange(Storage, nd_range<1>{range<1>{7}, range<1>{7}}); + testRange(Storage, nd_range<1>{range<1>{3 * 3}, range<1>{3}}); + + // TODO: Strategies historically adopted from sycl::range implementation only + // support 1-Dim case. 
+ // + // testRange(Storage, nd_range<2>{range<2>{7, 3}, range<2> {7, 3}}); + // testRange(Storage, nd_range<2>{range<2>{14, 9}, range<2> {7, 3}}); + + return 0; +} diff --git a/sycl/test-e2e/Reduction/reduction_internal_range_1dim.cpp b/sycl/test-e2e/Reduction/reduction_internal_range_1dim.cpp new file mode 100644 index 0000000000000..832798f656a50 --- /dev/null +++ b/sycl/test-e2e/Reduction/reduction_internal_range_1dim.cpp @@ -0,0 +1,13 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#include "reduction_internal.hpp" + +int main() { + queue q; + RedStorage Storage(q); + + testRange(Storage, range<1>{42}); + + return 0; +} diff --git a/sycl/test-e2e/Reduction/reduction_internal_range_2dim.cpp b/sycl/test-e2e/Reduction/reduction_internal_range_2dim.cpp new file mode 100644 index 0000000000000..935594aacbe09 --- /dev/null +++ b/sycl/test-e2e/Reduction/reduction_internal_range_2dim.cpp @@ -0,0 +1,13 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#include "reduction_internal.hpp" + +int main() { + queue q; + RedStorage Storage(q); + + testRange(Storage, range<2>{8, 8}); + + return 0; +} diff --git a/sycl/test-e2e/Reduction/reduction_internal_range_3dim.cpp b/sycl/test-e2e/Reduction/reduction_internal_range_3dim.cpp new file mode 100644 index 0000000000000..0bd5ba9bbfaa1 --- /dev/null +++ b/sycl/test-e2e/Reduction/reduction_internal_range_3dim.cpp @@ -0,0 +1,13 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#include "reduction_internal.hpp" + +int main() { + queue q; + RedStorage Storage(q); + + testRange(Storage, range<3>{7, 7, 5}); + + return 0; +} From d8327d294f88d336ae8d634f81ce6183a4773dd7 Mon Sep 17 00:00:00 2001 From: Rafal Bielski Date: Fri, 10 Jan 2025 16:57:32 +0000 Subject: [PATCH 472/567] [SYCL][E2E] Flag USM/fill_any_size UNSUPPORTED for all opencl devices (#16588) Test added in #16544 was originally flagged as UNSUPPORTED for OpenCL CPU device only, but has now been observed to fail also for other OpenCL devices. 
Flag the test as UNSUPPORTED for all OpenCL devices. --- sycl/test-e2e/USM/fill_any_size.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test-e2e/USM/fill_any_size.cpp b/sycl/test-e2e/USM/fill_any_size.cpp index 101802324c225..037ac4b884cae 100644 --- a/sycl/test-e2e/USM/fill_any_size.cpp +++ b/sycl/test-e2e/USM/fill_any_size.cpp @@ -1,7 +1,7 @@ // RUN: %{build} -o %t1.out // RUN: %{run} %t1.out // clang-format off -// UNSUPPORTED: (opencl && cpu) +// UNSUPPORTED: opencl // UNSUPPORTED-TRACKER: https://github.com/oneapi-src/unified-runtime/issues/2440 // clang-format on From 1677043f1e2a1bfd24f3c7b5d3349f21e7637c1a Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Sat, 11 Jan 2025 03:24:51 +0900 Subject: [PATCH 473/567] [SYCL][ESIMD][E2E] Fix DG2 check in lit.local.cfg (#16592) This only worked when `gpu-intel-dg2` was passed explicitly at the command line. Now we automatically add it based on the device architecture, but it's per-device, so check for it in any device like we do for PVC. Signed-off-by: Sarnie, Nick --- sycl/test-e2e/ESIMD/lit.local.cfg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/test-e2e/ESIMD/lit.local.cfg b/sycl/test-e2e/ESIMD/lit.local.cfg index 41191cc92a6c5..26c51002c514c 100644 --- a/sycl/test-e2e/ESIMD/lit.local.cfg +++ b/sycl/test-e2e/ESIMD/lit.local.cfg @@ -8,10 +8,10 @@ config.required_features += ['gpu'] # so there's no difference in coverage. # We should investigate why OCL fails separately. 
-# Check if any device has arch-intel_gpu_pvc has_arch_gpu_intel_pvc = any('arch-intel_gpu_pvc' in T for T in config.sycl_dev_features.values()) +has_gpu_intel_dg2 = any('gpu-intel-dg2' in T for T in config.sycl_dev_features.values()) -if 'gpu-intel-dg2' in config.available_features or has_arch_gpu_intel_pvc: +if has_gpu_intel_dg2 or has_arch_gpu_intel_pvc: config.required_features += ['level_zero'] # TODO: Remove this once the warnings are resolved From d05e33ea0e993de3e9016b91a4b6305a7d89dc35 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Fri, 10 Jan 2025 14:23:27 -0500 Subject: [PATCH 474/567] [SYCL] Map group load/store to built-ins for 16 shorts (#16581) Those built-ins are mentioned here https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroups_short.html and visible here as well: https://github.com/intel/intel-graphics-compiler/blob/master/IGC/BiFModule/Implementation/group.cl#L1727 https://github.com/intel/intel-graphics-compiler/blob/master/IGC/BiFModule/Implementation/group.cl#L1927 --- .../oneapi/experimental/group_load_store.hpp | 4 +- sycl/test/check_device_code/group_load.cpp | 46 ++++++++++++++ sycl/test/check_device_code/group_store.cpp | 62 +++++++++++++++++++ 3 files changed, 110 insertions(+), 2 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/group_load_store.hpp b/sycl/include/sycl/ext/oneapi/experimental/group_load_store.hpp index 42bd78de26f48..694bf2a5eb302 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/group_load_store.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/group_load_store.hpp @@ -125,7 +125,7 @@ int get_mem_idx(GroupTy g, int vec_or_array_idx) { // | block type | # of blocks | // +------------+-------------+ // | uchar | 1,2,4,8,16 | -// | ushort | 1,2,4,8 | +// | ushort | 1,2,4,8,16 | // | uint | 1,2,4,8 | // | ulong | 1,2,4,8 | // +------------+-------------+ @@ -146,7 +146,7 @@ struct BlockInfo { static constexpr bool has_builtin = detail::is_power_of_two(block_size) && 
detail::is_power_of_two(num_blocks) && block_size <= 8 && - (num_blocks <= 8 || (num_blocks == 16 && block_size == 1)); + (num_blocks <= 8 || (num_blocks == 16 && block_size <= 2)); }; template struct BlockTypeInfo; diff --git a/sycl/test/check_device_code/group_load.cpp b/sycl/test/check_device_code/group_load.cpp index 9bb12dbf19af9..1da28fb4107b3 100644 --- a/sycl/test/check_device_code/group_load.cpp +++ b/sycl/test/check_device_code/group_load.cpp @@ -573,6 +573,52 @@ SYCL_EXTERNAL void test_four_shorts(sycl::sub_group &sg, group_load(sg, p, out, opt_striped{}); } +// CHECK-LABEL: @_ZN7striped19test_sixteen_shortsERN4sycl3_V19sub_groupEPU3AS1sNS1_4spanIsLm16EEE( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA24]] +// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null +// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) +// CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 +// CHECK-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 3 +// CHECK-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 +// CHECK-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] +// CHECK: if.then.i: +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META93:![0-9]+]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META96:![0-9]+]] +// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK: for.cond.i.i: +// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-NEXT: [[CMP_I14_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 16 +// CHECK-NEXT: br i1 
[[CMP_I14_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1SSLM16ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEESS__EXIT_I:%.*]] +// CHECK: for.body.i.i: +// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP4]], [[I_0_I_I]] +// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP3]], [[MUL_I_I_I]] +// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I]] +// CHECK-NEXT: [[TMP5:%.*]] = load i16, ptr addrspace(1) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA29]] +// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-NEXT: store i16 [[TMP5]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 2, !tbaa [[TBAA29]] +// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP99:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ssLm16ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEESS_.exit.i: +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// 
CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1SSLM16ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_SN_NS0_4SPANISO_XT2_EEESQ__EXIT:%.*]] +// CHECK: if.end.i: +// CHECK-NEXT: [[CALL4_I:%.*]] = tail call spir_func noundef <16 x i16> @_Z30__spirv_SubgroupBlockReadINTELIDv16_tET_PU3AS1Kt(ptr addrspace(1) noundef nonnull [[P]]) #[[ATTR4]] +// CHECK-NEXT: store <16 x i16> [[CALL4_I]], ptr addrspace(4) [[TMP1]], align 2 +// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1SSLM16ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_SN_NS0_4SPANISO_XT2_EEESQ__EXIT]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ssLm16ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESP_SN_NS0_4spanISO_XT2_EEESQ_.exit: +// CHECK-NEXT: ret void +// +SYCL_EXTERNAL void test_sixteen_shorts(sycl::sub_group &sg, + plain_global_ptr p, + span out) { + group_load(sg, p, out, opt_striped{}); +} + // CHECK-LABEL: @_ZN7striped21test_non_power_of_twoERN4sycl3_V19sub_groupEPU3AS1iNS1_4spanIiLm3EEE( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = 
load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] diff --git a/sycl/test/check_device_code/group_store.cpp b/sycl/test/check_device_code/group_store.cpp index b0cd454287ef1..3727c889feb59 100644 --- a/sycl/test/check_device_code/group_store.cpp +++ b/sycl/test/check_device_code/group_store.cpp @@ -735,6 +735,68 @@ SYCL_EXTERNAL void test_four_shorts(sycl::sub_group &sg, span v, group_store(sg, v, p, opt_striped{}); } +// CHECK-LABEL: @_ZN7striped19test_sixteen_shortsERN4sycl3_V19sub_groupENS1_4spanIsLm16EEEPU3AS1s( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VALUES_I:%.*]] = alloca [16 x i16], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA22]] +// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null +// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) +// CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 +// CHECK-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 15 +// CHECK-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 +// CHECK-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] +// CHECK: if.then.i: +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META107:![0-9]+]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META110:![0-9]+]] +// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK: for.cond.i.i: +// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-NEXT: [[CMP_I19_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 16 +// CHECK-NEXT: br i1 [[CMP_I19_I]], label [[FOR_BODY_I_I]], label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM16EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT_I:%.*]] +// CHECK: for.body.i.i: +// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-NEXT: [[TMP5:%.*]] = load i16, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 2, !tbaa [[TBAA27]] +// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP4]], [[I_0_I_I]] +// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP3]], [[MUL_I_I_I]] +// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I]] +// CHECK-NEXT: store i16 [[TMP5]], ptr addrspace(1) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA27]] +// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP113:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEsLm16EPU3AS1sNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_SS_.exit.i: +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-NEXT: br label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM16EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_NS0_4SPANISN_XT1_EEESO_SQ__EXIT:%.*]] +// CHECK: if.end.i: +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr nonnull [[VALUES_I]]) #[[ATTR7]] +// CHECK-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK: for.cond.i: +// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[IF_END_I]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 16 +// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[FOR_COND_CLEANUP_I:%.*]] +// CHECK: for.cond.cleanup.i: +// CHECK-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr [[VALUES_I]], align 2, !tbaa [[TBAA31]] +// CHECK-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIDv16_tEvPU3AS1tT_(ptr addrspace(1) noundef nonnull [[P]], <16 x i16> noundef [[TMP6]]) #[[ATTR5]] +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[VALUES_I]]) #[[ATTR7]] +// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM16EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_NS0_4SPANISN_XT1_EEESO_SQ__EXIT]] +// CHECK: for.body.i: +// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I20_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] +// CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr 
addrspace(4) [[ARRAYIDX_I20_I]], align 2, !tbaa [[TBAA27]] +// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [16 x i16], ptr [[VALUES_I]], i64 0, i64 [[CONV_I]] +// CHECK-NEXT: store i16 [[TMP7]], ptr [[ARRAYIDX_I]], align 2, !tbaa [[TBAA27]] +// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP114:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEsLm16EPU3AS1sNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESP_NS0_4spanISN_XT1_EEESO_SQ_.exit: +// CHECK-NEXT: ret void +// +SYCL_EXTERNAL void test_sixteen_shorts(sycl::sub_group &sg, span v, + plain_global_ptr p) { + group_store(sg, v, p, opt_striped{}); +} + // CHECK-LABEL: @_ZN7striped21test_non_power_of_twoERN4sycl3_V19sub_groupENS1_4spanIiLm3EEEPU3AS1i( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] From 1b569b7c695ca8e4c1462eabd664cc76342e6749 Mon Sep 17 00:00:00 2001 From: dklochkov-intel Date: Fri, 10 Jan 2025 20:50:13 +0100 Subject: [PATCH 475/567] [SYCL] Change SYCL version definition to 202012 (#15890) SYCL language version macro is changed from 202001 to 202012L --- clang/lib/Basic/Version.cpp | 2 +- clang/test/CodeGenSYCL/integration_header_ppmacros.cpp | 2 +- clang/test/Preprocessor/sycl-macro.cpp | 4 ++-- sycl/include/sycl/detail/defines_elementary.hpp | 2 +- sycl/include/sycl/handler.hpp | 5 +++-- 5 files changed, 8 insertions(+), 7 deletions(-) diff --git a/clang/lib/Basic/Version.cpp b/clang/lib/Basic/Version.cpp index cc2f537ba024b..84be9bb1d992d 100644 --- a/clang/lib/Basic/Version.cpp +++ b/clang/lib/Basic/Version.cpp @@ -128,7 +128,7 @@ std::string 
getClangFullCPPVersion() { llvm::SmallVector, 2> getSYCLVersionMacros(const LangOptions &LangOpts) { if (LangOpts.getSYCLVersion() == LangOptions::SYCL_2020) - return {{"SYCL_LANGUAGE_VERSION", "202001"}}; + return {{"SYCL_LANGUAGE_VERSION", "202012L"}}; llvm_unreachable("SYCL standard should be set"); } } // end namespace clang diff --git a/clang/test/CodeGenSYCL/integration_header_ppmacros.cpp b/clang/test/CodeGenSYCL/integration_header_ppmacros.cpp index 29b78a26b5a6e..19f56899b1e52 100644 --- a/clang/test/CodeGenSYCL/integration_header_ppmacros.cpp +++ b/clang/test/CodeGenSYCL/integration_header_ppmacros.cpp @@ -17,7 +17,7 @@ int main() { sycl::kernel_single_task([]() {}); } // CHECK: #ifndef SYCL_LANGUAGE_VERSION -// CHECK-NEXT: #define SYCL_LANGUAGE_VERSION 202001 +// CHECK-NEXT: #define SYCL_LANGUAGE_VERSION 202012 // CHECK-NEXT: #endif //SYCL_LANGUAGE_VERSION // CHECK-RANGE: #ifndef __SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ diff --git a/clang/test/Preprocessor/sycl-macro.cpp b/clang/test/Preprocessor/sycl-macro.cpp index d8e28fb49c375..8e4a0be800b18 100644 --- a/clang/test/Preprocessor/sycl-macro.cpp +++ b/clang/test/Preprocessor/sycl-macro.cpp @@ -16,11 +16,11 @@ // CHECK-NOT:#define SYCL_EXTERNAL // CHECK-NOT:#define __SYCL_ID_QUERIES_FIT_IN_INT__ 1 -// CHECK-SYCL-STD:#define SYCL_LANGUAGE_VERSION 202001 +// CHECK-SYCL-STD:#define SYCL_LANGUAGE_VERSION 202012L // CHECK-SYCL-STD:#define __SYCL_ID_QUERIES_FIT_IN_INT__ 1 -// CHECK-SYCL-STD-2020:#define SYCL_LANGUAGE_VERSION 202001 +// CHECK-SYCL-STD-2020:#define SYCL_LANGUAGE_VERSION 202012 // CHECK-SYCL-STD-2020:#define __SYCL_ID_QUERIES_FIT_IN_INT__ 1 // CHECK-SYCL-STD-DEVICE:#define __SYCL_DEVICE_ONLY__ 1 diff --git a/sycl/include/sycl/detail/defines_elementary.hpp b/sycl/include/sycl/detail/defines_elementary.hpp index bce997e4e6693..17107c9216b38 100644 --- a/sycl/include/sycl/detail/defines_elementary.hpp +++ b/sycl/include/sycl/detail/defines_elementary.hpp @@ -57,7 +57,7 @@ #endif // 
__SYCL_DEPRECATED #ifndef __SYCL2020_DEPRECATED -#if SYCL_LANGUAGE_VERSION >= 202001 && \ +#if SYCL_LANGUAGE_VERSION == 202012L && \ !defined(SYCL2020_DISABLE_DEPRECATION_WARNINGS) #define __SYCL2020_DEPRECATED(message) __SYCL_DEPRECATED(message) #else diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 323673d871f38..758daa3a81a9b 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -1147,7 +1147,8 @@ class __SYCL_EXPORT handler { // Range rounding is supported only for newer SYCL standards. #if !defined(__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__) && \ !defined(DPCPP_HOST_DEVICE_OPENMP) && \ - !defined(DPCPP_HOST_DEVICE_PERF_NATIVE) && SYCL_LANGUAGE_VERSION >= 202001 + !defined(DPCPP_HOST_DEVICE_PERF_NATIVE) && \ + SYCL_LANGUAGE_VERSION >= 202012L auto [RoundedRange, HasRoundedRange] = getRoundedRange(UserRange); if (HasRoundedRange) { using NameWT = typename detail::get_kernel_wrapper_name_t::name; @@ -1177,7 +1178,7 @@ class __SYCL_EXPORT handler { } else #endif // !__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ && // !DPCPP_HOST_DEVICE_OPENMP && !DPCPP_HOST_DEVICE_PERF_NATIVE && - // SYCL_LANGUAGE_VERSION >= 202001 + // SYCL_LANGUAGE_VERSION >= 202012L { (void)UserRange; (void)Props; From 923afb90386dcd0917a5b2f2613bc7d7ef2085a7 Mon Sep 17 00:00:00 2001 From: Udit Kumar Agarwal Date: Fri, 10 Jan 2025 12:32:40 -0800 Subject: [PATCH 476/567] [SYCL][E2E] Fix `zstd` support detection in LIT with icx on Windows (#16582) Fixes CMPLRLLVM-64568 `-shared` is invalid with `icx` on Windows. This PR makes use of `/LD` instead when compiler accepts MSVC-style args. 
--- sycl/test-e2e/lit.cfg.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/sycl/test-e2e/lit.cfg.py b/sycl/test-e2e/lit.cfg.py index d723d3452b7d6..a374cfaee402f 100644 --- a/sycl/test-e2e/lit.cfg.py +++ b/sycl/test-e2e/lit.cfg.py @@ -328,8 +328,22 @@ def open_check_file(file_name): # Check if clang is built with ZSTD and compression support. fPIC_opt = "-fPIC" if platform.system() != "Windows" else "" +# -shared is invalid for icx on Windows, use /LD instead. +dll_opt = "/LD" if cl_options else "-shared" + ps = subprocess.Popen( - [config.dpcpp_compiler, "-fsycl", "--offload-compress", "-shared", fPIC_opt, "-x", "c++", "-", "-o", "-"], + [ + config.dpcpp_compiler, + "-fsycl", + "--offload-compress", + dll_opt, + fPIC_opt, + "-x", + "c++", + "-", + "-o", + "-", + ], stdin=subprocess.PIPE, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, From 32a0c85c04dfc62b5f17bc0be453518dd387f4c7 Mon Sep 17 00:00:00 2001 From: Udit Kumar Agarwal Date: Fri, 10 Jan 2025 12:33:17 -0800 Subject: [PATCH 477/567] [SYCL][E2E] Fix tests disabled during PVC enablement in CI (#16577) These tests were disabled/XFAILed in https://github.com/intel/llvm/pull/14720. [sycl/test-e2e/DeprecatedFeatures/set_arg_interop.cpp](https://github.com/intel/llvm/pull/16577/files#diff-9a41bc14675723afa4f98932225243790a5fc5b0fbfc54e732ca1ef84cef7df3) is XPASSing flakily (see https://github.com/intel/llvm/issues/16576) so I've marked it unsupported. 
--- sycl/test-e2e/Basic/accessor/host_task_accessor_deduction.cpp | 4 ---- sycl/test-e2e/DeprecatedFeatures/set_arg_interop.cpp | 4 ++-- sycl/test-e2e/ESIMD/named_barriers/loop.cpp | 4 ---- sycl/test-e2e/ESIMD/named_barriers/loop_extended.cpp | 4 ---- sycl/test-e2e/GroupAlgorithm/reduce_sycl2020.cpp | 4 ---- 5 files changed, 2 insertions(+), 18 deletions(-) diff --git a/sycl/test-e2e/Basic/accessor/host_task_accessor_deduction.cpp b/sycl/test-e2e/Basic/accessor/host_task_accessor_deduction.cpp index e6da8d355a0f6..597e4ee57248d 100644 --- a/sycl/test-e2e/Basic/accessor/host_task_accessor_deduction.cpp +++ b/sycl/test-e2e/Basic/accessor/host_task_accessor_deduction.cpp @@ -1,6 +1,2 @@ // RUN: %{build} -Daccessor_new_api_test %S/Inputs/host_task_accessor.cpp -o %t.out // RUN: %{run} %t.out - -// Disabled on PVC without igc-dev due to timeout. -// UNSUPPORTED: arch-intel_gpu_pvc && !igc-dev -// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/14826 diff --git a/sycl/test-e2e/DeprecatedFeatures/set_arg_interop.cpp b/sycl/test-e2e/DeprecatedFeatures/set_arg_interop.cpp index 18388bce6c289..325a329a916dd 100644 --- a/sycl/test-e2e/DeprecatedFeatures/set_arg_interop.cpp +++ b/sycl/test-e2e/DeprecatedFeatures/set_arg_interop.cpp @@ -1,7 +1,7 @@ // REQUIRES: opencl, opencl_icd -// XFAIL: arch-intel_gpu_pvc -// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14826 +// UNSUPPORTED: arch-intel_gpu_pvc +// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/16576 // RUN: %{build} -D__SYCL_INTERNAL_API -o %t.out %opencl_lib -O3 // RUN: %{run} %t.out diff --git a/sycl/test-e2e/ESIMD/named_barriers/loop.cpp b/sycl/test-e2e/ESIMD/named_barriers/loop.cpp index 9b526125153c7..e56ddfe5ad0ae 100644 --- a/sycl/test-e2e/ESIMD/named_barriers/loop.cpp +++ b/sycl/test-e2e/ESIMD/named_barriers/loop.cpp @@ -10,10 +10,6 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out -// Disabled on PVC without igc-dev due to flaky failures. 
-// UNSUPPORTED: arch-intel_gpu_pvc && !igc-dev -// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/14826 - // Test checks support of named barrier in a loop in ESIMD kernel. // SLM and surface size is 32 bytes, 16 bytes per iteration. // Each iteration has 1 barrier and 1 producer. Producer stores data to SLM, diff --git a/sycl/test-e2e/ESIMD/named_barriers/loop_extended.cpp b/sycl/test-e2e/ESIMD/named_barriers/loop_extended.cpp index 35053824ed2af..b42c58181ca4c 100644 --- a/sycl/test-e2e/ESIMD/named_barriers/loop_extended.cpp +++ b/sycl/test-e2e/ESIMD/named_barriers/loop_extended.cpp @@ -10,10 +10,6 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out -// Disabled on PVC due to flaky failures. -// UNSUPPORTED: arch-intel_gpu_pvc -// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/14826 - // Test checks support of named barrier in a loop in ESIMD kernel. // First iteration has 1 barrier and 1 producer, second - 2 barriers and 2 // producers. Producer stores data to SLM, then all threads read SLM and store diff --git a/sycl/test-e2e/GroupAlgorithm/reduce_sycl2020.cpp b/sycl/test-e2e/GroupAlgorithm/reduce_sycl2020.cpp index 67b29d213032b..321dc76923fb3 100644 --- a/sycl/test-e2e/GroupAlgorithm/reduce_sycl2020.cpp +++ b/sycl/test-e2e/GroupAlgorithm/reduce_sycl2020.cpp @@ -1,10 +1,6 @@ // RUN: %{build} -fsycl-device-code-split=per_kernel -I . -o %t.out // RUN: %{run} %t.out -// Disabled on PVC without igc-dev due to timeout. -// UNSUPPORTED: arch-intel_gpu_pvc && !igc-dev -// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/14826 - #include "support.h" #include From 99d8f6840f66b5cce78436a89e9ee4cea546d465 Mon Sep 17 00:00:00 2001 From: Udit Kumar Agarwal Date: Fri, 10 Jan 2025 13:15:48 -0800 Subject: [PATCH 478/567] [E2E][NFCI] Remove `import platform` from USM LIT cfg file (#16594) I think `import platform` is not required. 
Found in https://github.com/intel/llvm/pull/16591#discussion_r1910675290 --- sycl/test-e2e/USM/lit.local.cfg | 2 -- 1 file changed, 2 deletions(-) diff --git a/sycl/test-e2e/USM/lit.local.cfg b/sycl/test-e2e/USM/lit.local.cfg index be995495bd588..7cb1bcf1e3012 100644 --- a/sycl/test-e2e/USM/lit.local.cfg +++ b/sycl/test-e2e/USM/lit.local.cfg @@ -1,4 +1,2 @@ -import platform - # https://github.com/intel/llvm/issues/15648 config.unsupported_features += ['hip'] From 2cfe8e6b15b03322704b6473bd44089f1a3905c3 Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Sat, 11 Jan 2025 07:16:04 +0900 Subject: [PATCH 479/567] [SYCL][ESIMD][E2E] Split some atomic update tests (#16580) These are really slow so split them into two parts, we get like a 40 second speedup. Signed-off-by: Sarnie, Nick --- .../Inputs/atomic_update.hpp | 43 ++++++++++++------- .../Inputs/atomic_update_slm.hpp | 40 +++++++++++------ .../atomic_update_acc_dg2_pvc.cpp | 2 +- .../atomic_update_acc_dg2_pvc_2.cpp | 26 +++++++++++ .../atomic_update_acc_dg2_pvc_64_2.cpp | 17 ++++++++ .../atomic_update_acc_dg2_pvc_cmpxchg_2.cpp | 17 ++++++++ .../atomic_update_acc_dg2_pvc_stateless_2.cpp | 14 ++++++ .../atomic_update_slm_acc_pvc.cpp | 2 +- .../atomic_update_slm_acc_pvc_2.cpp | 26 +++++++++++ .../atomic_update_slm_acc_pvc_cmpxchg_2.cpp | 16 +++++++ .../atomic_update_slm_pvc.cpp | 2 +- .../atomic_update_slm_pvc_2.cpp | 26 +++++++++++ .../atomic_update_slm_pvc_cmpxchg_2.cpp | 16 +++++++ .../atomic_update_usm_dg2_pvc.cpp | 2 +- .../atomic_update_usm_dg2_pvc_2.cpp | 30 +++++++++++++ .../atomic_update_usm_dg2_pvc_64_2.cpp | 16 +++++++ .../atomic_update_usm_dg2_pvc_cmpxchg_2.cpp | 17 ++++++++ 17 files changed, 278 insertions(+), 34 deletions(-) create mode 100644 sycl/test-e2e/ESIMD/unified_memory_api/atomic_update_acc_dg2_pvc_2.cpp create mode 100644 sycl/test-e2e/ESIMD/unified_memory_api/atomic_update_acc_dg2_pvc_64_2.cpp create mode 100644 sycl/test-e2e/ESIMD/unified_memory_api/atomic_update_acc_dg2_pvc_cmpxchg_2.cpp 
create mode 100644 sycl/test-e2e/ESIMD/unified_memory_api/atomic_update_acc_dg2_pvc_stateless_2.cpp create mode 100644 sycl/test-e2e/ESIMD/unified_memory_api/atomic_update_slm_acc_pvc_2.cpp create mode 100644 sycl/test-e2e/ESIMD/unified_memory_api/atomic_update_slm_acc_pvc_cmpxchg_2.cpp create mode 100644 sycl/test-e2e/ESIMD/unified_memory_api/atomic_update_slm_pvc_2.cpp create mode 100644 sycl/test-e2e/ESIMD/unified_memory_api/atomic_update_slm_pvc_cmpxchg_2.cpp create mode 100644 sycl/test-e2e/ESIMD/unified_memory_api/atomic_update_usm_dg2_pvc_2.cpp create mode 100644 sycl/test-e2e/ESIMD/unified_memory_api/atomic_update_usm_dg2_pvc_64_2.cpp create mode 100644 sycl/test-e2e/ESIMD/unified_memory_api/atomic_update_usm_dg2_pvc_cmpxchg_2.cpp diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update.hpp index cf2bccc166ac6..e31f5e923dcf6 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update.hpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update.hpp @@ -118,7 +118,7 @@ bool verify(T *arr, const Config &cfg, size_t size) { template class ImplF, bool UseMask, bool UseProperties> -bool test_usm(queue q, const Config &cfg) { +bool test_usm(queue &q, const Config &cfg) { constexpr auto op = ImplF::atomic_op; using CurAtomicOpT = decltype(op); constexpr int n_args = ImplF::n_args; @@ -245,7 +245,7 @@ bool test_usm(queue q, const Config &cfg) { template class ImplF, bool UseMask, bool UseProperties> -bool test_acc(queue q, const Config &cfg) { +bool test_acc(queue &q, const Config &cfg) { constexpr auto op = ImplF::atomic_op; using CurAtomicOpT = decltype(op); constexpr int n_args = ImplF::n_args; @@ -615,7 +615,7 @@ struct ImplFcmpwr : ImplCmpxchgBase {}; template class ImplF, bool UseMask, bool UseLSCFeatures> -auto run_test(queue q, const Config &cfg) { +auto run_test(queue &q, const Config &cfg) { if constexpr (UseAcc) return test_acc(q, cfg); else @@ -624,7 
+624,7 @@ auto run_test(queue q, const Config &cfg) { template class Op, bool UseMask, bool UseLSCFeatures, bool UseAcc, int SignMask = (Signed | Unsigned)> -bool test_int_types(queue q, const Config &cfg) { +bool test_int_types(queue &q, const Config &cfg) { bool passed = true; if constexpr (SignMask & Signed) { // Supported by LSC atomic: @@ -662,7 +662,7 @@ bool test_int_types(queue q, const Config &cfg) { template class Op, bool UseMask, bool UseLSCFeatures, bool UseAcc> -bool test_fp_types(queue q, const Config &cfg) { +bool test_fp_types(queue &q, const Config &cfg) { bool passed = true; // TODO: Enable FADD/FSUB on DG2/PVC when the error in GPU driver is resolved. if constexpr (UseLSCFeatures && @@ -685,7 +685,7 @@ bool test_fp_types(queue q, const Config &cfg) { template