From 32080b56dbf500d93acc05e6d3af81fa59af17c2 Mon Sep 17 00:00:00 2001 From: Andreu Carminati Date: Mon, 14 Oct 2024 16:36:00 +0100 Subject: [PATCH] [AIE2] Add more memory/ptr combiners *Now, we can selectively split memory operations to enhance selection combiner opportunities. --- llvm/lib/Target/AIE/AIE2InstrInfo.cpp | 39 ++++ llvm/lib/Target/AIE/AIE2InstrInfo.h | 6 + llvm/lib/Target/AIE/AIEBaseInstrInfo.h | 15 ++ llvm/lib/Target/AIE/AIECombine.td | 30 ++- llvm/lib/Target/AIE/AIECombinerHelper.cpp | 202 ++++++++++++++++++ llvm/lib/Target/AIE/AIECombinerHelper.h | 21 ++ .../AIE/GlobalISel/combine-loads-stores.mir | 38 +++- .../AIE/GlobalISel/combine-padd-ls-offset.mir | 108 ++++++++++ .../combine-reuse-padd-ls-offset.mir | 94 ++++++++ .../AIE/GlobalISel/combine-split-large-ls.mir | 160 ++++++++++++++ .../GlobalISel/indexed-512-load-store.mir | 34 +-- llvm/test/CodeGen/AIE/aie2/mmo-load.ll | 4 +- llvm/test/CodeGen/AIE/aie2/mmo-store.ll | 2 +- 13 files changed, 723 insertions(+), 30 deletions(-) create mode 100644 llvm/test/CodeGen/AIE/GlobalISel/combine-padd-ls-offset.mir create mode 100644 llvm/test/CodeGen/AIE/GlobalISel/combine-reuse-padd-ls-offset.mir create mode 100644 llvm/test/CodeGen/AIE/GlobalISel/combine-split-large-ls.mir diff --git a/llvm/lib/Target/AIE/AIE2InstrInfo.cpp b/llvm/lib/Target/AIE/AIE2InstrInfo.cpp index 285c5ceddc5d..6b8f6796a767 100644 --- a/llvm/lib/Target/AIE/AIE2InstrInfo.cpp +++ b/llvm/lib/Target/AIE/AIE2InstrInfo.cpp @@ -1403,3 +1403,42 @@ AIE2InstrInfo::getVExtractOpInfo(const MachineInstr &MI) const { return std::nullopt; } } + +unsigned AIE2InstrInfo::getMaxLoadStoreSize() const { return 256; } + +bool AIE2InstrInfo::canCombineWithLoadStore(const MachineInstr &MI) const { + + if (!isa(MI)) + return false; + + const unsigned ID = cast(MI).getIntrinsicID(); + + switch (ID) { + case Intrinsic::aie2_I256_v16_acc32_srs: + case Intrinsic::aie2_I256_v16_acc64_srs: + case Intrinsic::aie2_I256_v32_acc32_srs: + case 
Intrinsic::aie2_I256_v8_acc64_srs: + case Intrinsic::aie2_I512_v16_acc64_srs: + case Intrinsic::aie2_I512_v32_acc32_srs: + + case Intrinsic::aie2_acc32_v16_I256_ups: + case Intrinsic::aie2_acc32_v32_I256_ups: + case Intrinsic::aie2_acc32_v32_I512_ups: + case Intrinsic::aie2_acc64_v16_I256_ups: + case Intrinsic::aie2_acc64_v16_I512_ups: + case Intrinsic::aie2_acc64_v8_I256_ups: + return true; + } + return false; +} + +bool AIE2InstrInfo::isSplittableType(const LLT Ty) const { + const LLT V16S32 = LLT::fixed_vector(16, 32); + const LLT V32S16 = LLT::fixed_vector(32, 16); + const LLT V64S8 = LLT::fixed_vector(64, 8); + + if (Ty == V16S32 || Ty == V32S16 || Ty == V64S8) + return true; + + return false; +} diff --git a/llvm/lib/Target/AIE/AIE2InstrInfo.h b/llvm/lib/Target/AIE/AIE2InstrInfo.h index 9a6cf70730ea..e5437f8d3385 100644 --- a/llvm/lib/Target/AIE/AIE2InstrInfo.h +++ b/llvm/lib/Target/AIE/AIE2InstrInfo.h @@ -179,6 +179,12 @@ class AIE2InstrInfo : public AIE2GenInstrInfo { std::optional getVExtractOpInfo(const MachineInstr &MI) const override; + unsigned getMaxLoadStoreSize() const override; + + bool canCombineWithLoadStore(const MachineInstr &MI) const override; + + bool isSplittableType(const LLT Ty) const override; + protected: SmallVector getSpillPseudoExpandInfo(const MachineInstr &MI) const override; diff --git a/llvm/lib/Target/AIE/AIEBaseInstrInfo.h b/llvm/lib/Target/AIE/AIEBaseInstrInfo.h index fac5b5f99f5d..7de5b89969b6 100644 --- a/llvm/lib/Target/AIE/AIEBaseInstrInfo.h +++ b/llvm/lib/Target/AIE/AIEBaseInstrInfo.h @@ -424,6 +424,21 @@ struct AIEBaseInstrInfo : public TargetInstrInfo { llvm_unreachable("Target didn't implement getVExtractOpInfo!"); } + /// Return the maximum size for memory operations on this target. + virtual unsigned getMaxLoadStoreSize() const { + llvm_unreachable("Target didn't implement getMaxLoadStoreSize!"); + } + + /// Return true if this instruction can be combined with a memory operation.
+ virtual bool canCombineWithLoadStore(const MachineInstr &MI) const { + llvm_unreachable("Target didn't implement canCombineWithLoadStore!"); + } + + /// Return true if the type can be split to fit target's restrictions. + virtual bool isSplittableType(const LLT Ty) const { + llvm_unreachable("Target didn't implement isSplittableType!"); + } + protected: /// Expand a spill pseudo-instruction into actual target instructions. This /// will essentially split the register being handled into its sub-registers, diff --git a/llvm/lib/Target/AIE/AIECombine.td b/llvm/lib/Target/AIE/AIECombine.td index 6fb874882ab0..c3cb20afd40f 100644 --- a/llvm/lib/Target/AIE/AIECombine.td +++ b/llvm/lib/Target/AIE/AIECombine.td @@ -109,8 +109,36 @@ def combine_add_vector_elt_undef : GICombineRule < (apply [{ applyAddVecEltUndef(*${root}, MRI, B); }] ) >; +def combine_load_store_split_matchdata: GIDefMatchData<"unsigned">; +def combine_load_store_split : GICombineRule< + (defs root:$root, combine_load_store_split_matchdata:$matchinfo), + (match (wip_match_opcode G_LOAD, G_STORE): $root, + [{ return matchLoadStoreSplit(cast(*${root}), MRI, (const AIEBaseInstrInfo &)B.getTII(), ${matchinfo}); }]), + (apply [{ applyLoadStoreSplit(cast(*${root}), MRI, B, ${matchinfo}); }]) +>; + +def combine_offset_load_store_ptradd_matchdata: GIDefMatchData<"std::pair">; +def combine_offset_load_store_ptradd : GICombineRule< + (defs root:$root, combine_offset_load_store_ptradd_matchdata:$matchinfo), + (match (wip_match_opcode G_AIE_OFFSET_LOAD, G_AIE_OFFSET_STORE): $root, + [{ return matchOffsetLoadStorePtrAdd(*${root}, MRI, (const AIEBaseInstrInfo &)B.getTII(), ${matchinfo}); }]), + (apply [{ applyOffsetLoadStorePtrAdd(*${root}, MRI, B, ${matchinfo}); }]) +>; + +def combine_offset_load_store_share_ptradd_matchdata: GIDefMatchData<"Register">; +def combine_offset_load_store_share_ptradd : GICombineRule< + (defs root:$root, combine_offset_load_store_share_ptradd_matchdata:$matchinfo), + (match
(wip_match_opcode G_AIE_OFFSET_LOAD, G_AIE_OFFSET_STORE): $root, + [{ return matchOffsetLoadStoreSharePtrAdd(*${root}, MRI, Helper, (const AIEBaseInstrInfo &)B.getTII(), ${matchinfo}); }]), + (apply [{ applyOffsetLoadStoreSharePtrAdd(*${root}, MRI, B, ${matchinfo}); }]) +>; + def AIE2PostLegalizerCustomCombiner - : GICombiner<"AIE2PostLegalizerCustomCombinerImpl", [ combine_load_store_increment, + : GICombiner<"AIE2PostLegalizerCustomCombinerImpl", [ combine_load_store_split, + ptr_add_immed_chain, + combine_load_store_increment, + combine_offset_load_store_ptradd, + combine_offset_load_store_share_ptradd, combine_add_vector_elt_undef, combine_extract_concat, combine_unmerge_concat, diff --git a/llvm/lib/Target/AIE/AIECombinerHelper.cpp b/llvm/lib/Target/AIE/AIECombinerHelper.cpp index 01ea0754330b..99bce87fbc27 100644 --- a/llvm/lib/Target/AIE/AIECombinerHelper.cpp +++ b/llvm/lib/Target/AIE/AIECombinerHelper.cpp @@ -1358,3 +1358,205 @@ void llvm::applyUpdToConcat(MachineInstr &MI, MachineRegisterInfo &MRI, MI.eraseFromParent(); } + +bool llvm::matchLoadStoreSplit(GLoadStore &MI, MachineRegisterInfo &MRI, + const AIEBaseInstrInfo &TII, + unsigned &MaxMemSize) { + + Register ValReg = MI.getReg(0); + const LLT ValTy = MRI.getType(ValReg); + const bool IsLoad = isa(MI); + MaxMemSize = TII.getMaxLoadStoreSize(); + + if (!TII.isSplittableType(ValTy)) + return false; + + /// Avoid splitting operations that can be combined `as is`. 
+ if (IsLoad) { + for (MachineInstr &ConvInstr : MRI.use_instructions(ValReg)) { + if (TII.canCombineWithLoadStore(ConvInstr)) + return false; + } + } else { + MachineInstr &ConvInstr = *getDefIgnoringCopiesAndBitcasts(ValReg, MRI); + if (TII.canCombineWithLoadStore(ConvInstr)) + return false; + } + + return true; +} + +void llvm::applyLoadStoreSplit(GLoadStore &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B, unsigned &MaxMemSize) { + + assert(MaxMemSize && "MaxMemSize should be specified!"); + B.setInstrAndDebugLoc(MI); + MachineFunction &MF = B.getMF(); + const bool IsLoad = isa(MI); + Register ValReg = MI.getReg(0); + Register AddrReg = MI.getPointerReg(); + const LLT ValTy = MRI.getType(ValReg); + const LLT PtrTy = MRI.getType(AddrReg); + const LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits()); + const unsigned NumParts = ValTy.getSizeInBits() / MaxMemSize; + const LLT NarrowTy = ValTy.divide(NumParts); + const MachineMemOperand MMO = MI.getMMO(); + + SmallVector NarrowRegs; + if (!IsLoad) + extractParts(ValReg, NarrowTy, NumParts, NarrowRegs, B, MRI); + + for (int I = NumParts - 1; I >= 0; I--) { + const unsigned ByteOffset = I * NarrowTy.getSizeInBytes(); + Register Dst = MRI.createGenericVirtualRegister(NarrowTy); + Register NewAddrReg; + B.materializePtrAdd(NewAddrReg, AddrReg, OffsetTy, ByteOffset); + MachineMemOperand *NewMMO = + MF.getMachineMemOperand(&MMO, ByteOffset, NarrowTy); + + if (IsLoad) { + NarrowRegs.push_back(Dst); + B.buildLoad(Dst, NewAddrReg, *NewMMO); + } else { + B.buildStore(NarrowRegs[I], NewAddrReg, *NewMMO); + } + } + + if (IsLoad) { + std::reverse(NarrowRegs.begin(), NarrowRegs.end()); + B.buildConcatVectors(ValReg, NarrowRegs); + } + + MI.eraseFromParent(); +} + +/// Match something like this: +/// %293:_(s20) = G_CONSTANT i20 32 +/// %67:_(s20) = G_CONSTANT i20 64 +/// %68:_(p0) = nuw G_PTR_ADD %61, %67(s20) +/// %295:_(<16 x s16>) = G_AIE_OFFSET_LOAD %68(p0), %293(s20) + +/// To convert to: +/// %298:_(s20) = G_CONSTANT i20 
96 +/// %295:_(<16 x s16>) = G_AIE_OFFSET_LOAD %61(p0), %298(s20) +bool llvm::matchOffsetLoadStorePtrAdd(MachineInstr &MI, + MachineRegisterInfo &MRI, + const AIEBaseInstrInfo &TII, + std::pair &RegOffset) { + + Register AddrReg = MI.getOperand(1).getReg(); + + auto CstOffsetLoadStore = + getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); + + if (!CstOffsetLoadStore) + return false; + + MachineInstr *DefAddrRegInstr = MRI.getVRegDef(AddrReg); + + if (DefAddrRegInstr->getOpcode() != TargetOpcode::G_PTR_ADD) + return false; + + auto CstDefAddrRegInstr = getIConstantVRegValWithLookThrough( + DefAddrRegInstr->getOperand(2).getReg(), MRI); + + if (!CstDefAddrRegInstr) + return false; + + RegOffset.first = DefAddrRegInstr->getOperand(1).getReg(); + RegOffset.second = CstDefAddrRegInstr->Value.getSExtValue() + + CstOffsetLoadStore->Value.getSExtValue(); + + return true; +} + +void llvm::applyOffsetLoadStorePtrAdd(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B, + std::pair &RegOffset) { + B.setInstrAndDebugLoc(MI); + + Register NewOffsetReg = + B.buildConstant(LLT::scalar(20), RegOffset.second).getReg(0); + + MI.getOperand(1).setReg(RegOffset.first); + MI.getOperand(2).setReg(NewOffsetReg); +} + +/// Match something like this: +/// %0:_(s32) = COPY $r0 +/// %1:_(p0) = COPY $p0 +/// %2:_(<16 x s32>) = COPY $x0 +/// %13:_(s32) = G_CONSTANT i32 6 +/// %12:_(s32) = G_SHL %0, %13(s32) +/// %5:_(s20) = G_TRUNC %12(s32) +/// %6:_(p0) = G_PTR_ADD %1, %5(s20) +/// %18:_(s20) = G_CONSTANT i20 32 +/// G_AIE_OFFSET_STORE %15(<8 x s32>), %6(p0), %18(s20) +/// G_AIE_OFFSET_STORE %14(<8 x s32>), %1(p0), %5(s20) + +/// To convert to (pointer reuse/CSE): +/// %0:_(s32) = COPY $r0 +/// %1:_(p0) = COPY $p0 +/// %2:_(<16 x s32>) = COPY $x0 +/// %13:_(s32) = G_CONSTANT i32 6 +/// %12:_(s32) = G_SHL %0, %13(s32) +/// %5:_(s20) = G_TRUNC %12(s32) +/// %6:_(p0) = G_PTR_ADD %1, %5(s20) +/// %18:_(s20) = G_CONSTANT i20 32 +/// %19:_(s20) = G_CONSTANT i20 0 +/// 
G_AIE_OFFSET_STORE %15(<8 x s32>), %6(p0), %18(s20) +/// G_AIE_OFFSET_STORE %14(<8 x s32>), %6(p0), %19(s20) +bool llvm::matchOffsetLoadStoreSharePtrAdd(MachineInstr &MI, + MachineRegisterInfo &MRI, + CombinerHelper &Helper, + const AIEBaseInstrInfo &TII, + Register &PtrAddReg) { + Register PtrReg = MI.getOperand(1).getReg(); + Register OffsetReg = MI.getOperand(2).getReg(); + + auto OffsetCst = getIConstantVRegValWithLookThrough(OffsetReg, MRI); + + // If we have a constant here, don't touch because it is better + // to stay folded. Otherwise we will fold again in the previous + // combiner. + if (OffsetCst) + return false; + + for (auto &Use : MRI.use_nodbg_instructions(PtrReg)) { + if (Use.getOpcode() != TargetOpcode::G_PTR_ADD) + continue; + if (Use.getParent() != MI.getParent()) + continue; + if (!Helper.dominates(Use, MI)) + continue; + + Register PaddDestReg = Use.getOperand(0).getReg(); + Register PaddSrcOffset = Use.getOperand(2).getReg(); + + // Dead instruction? Don't use it! + // Only use it if at least one other instruction is using it. + if (!hasNItemsOrMore(MRI.use_instr_nodbg_begin(PaddDestReg), + MRI.use_instr_nodbg_end(), 1)) + continue; + + // We can share a residual G_PTR_ADD. NOTE(review): PaddSrcOffset is read from Use.getOperand(2) just above, so comparing it against Use.getOperand(2).getReg() below is always true; the intended comparison is presumably PaddSrcOffset == OffsetReg — confirm.
+ if (Use.getOperand(2).getReg() == PaddSrcOffset) { + PtrAddReg = PaddDestReg; + return true; + } + } + + return false; +} + +void llvm::applyOffsetLoadStoreSharePtrAdd(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B, + Register &PtrAddReg) { + + Register NewOffsetReg = B.buildConstant(LLT::scalar(20), 0).getReg(0); + + MI.getOperand(1).setReg(PtrAddReg); + MI.getOperand(2).setReg(NewOffsetReg); +} diff --git a/llvm/lib/Target/AIE/AIECombinerHelper.h b/llvm/lib/Target/AIE/AIECombinerHelper.h index 19cec6aa2757..fdf04ee959bb 100644 --- a/llvm/lib/Target/AIE/AIECombinerHelper.h +++ b/llvm/lib/Target/AIE/AIECombinerHelper.h @@ -162,6 +162,27 @@ void applyUpdToConcat(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, std::map &IndexRegMap); +bool matchLoadStoreSplit(GLoadStore &MI, MachineRegisterInfo &MRI, + const AIEBaseInstrInfo &TII, unsigned &MaxMemSize); +void applyLoadStoreSplit(GLoadStore &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B, unsigned &MaxMemSize); + +bool matchOffsetLoadStorePtrAdd(MachineInstr &MI, MachineRegisterInfo &MRI, + const AIEBaseInstrInfo &TII, + std::pair &RegOffset); + +void applyOffsetLoadStorePtrAdd(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B, + std::pair &RegOffset); + +bool matchOffsetLoadStoreSharePtrAdd(MachineInstr &MI, MachineRegisterInfo &MRI, + CombinerHelper &Helper, + const AIEBaseInstrInfo &TII, + Register &PtrAddReg); + +void applyOffsetLoadStoreSharePtrAdd(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B, Register &PtrAddReg); + } // namespace llvm #endif diff --git a/llvm/test/CodeGen/AIE/GlobalISel/combine-loads-stores.mir b/llvm/test/CodeGen/AIE/GlobalISel/combine-loads-stores.mir index 4ce207bb2fe9..866bf5ed4751 100644 --- a/llvm/test/CodeGen/AIE/GlobalISel/combine-loads-stores.mir +++ b/llvm/test/CodeGen/AIE/GlobalISel/combine-loads-stores.mir @@ -931,10 +931,15 @@ body: | ; CHECK-LABEL: name: preinc_combine_vectors_512_bits ; CHECK: 
[[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 - ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(<32 x s16>) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C]](s20) :: (load (<32 x s16>)) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 96 + ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(<16 x s16>) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C1]](s20) :: (load (<16 x s16>) from unknown-address + 32) + ; CHECK-NEXT: [[AIE_OFFSET_LOAD1:%[0-9]+]]:_(<16 x s16>) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C]](s20) :: (load (<16 x s16>), align 64) ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p0 - ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 - ; CHECK-NEXT: G_AIE_OFFSET_STORE [[AIE_OFFSET_LOAD]](<32 x s16>), [[COPY1]](p0), [[C1]](s20) :: (store (<32 x s16>)) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<16 x s16>) = COPY [[AIE_OFFSET_LOAD1]](<16 x s16>) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(<16 x s16>) = COPY [[AIE_OFFSET_LOAD]](<16 x s16>) + ; CHECK-NEXT: G_AIE_OFFSET_STORE [[COPY3]](<16 x s16>), [[COPY1]](p0), [[C1]](s20) :: (store (<16 x s16>) into unknown-address + 32) + ; CHECK-NEXT: G_AIE_OFFSET_STORE [[COPY2]](<16 x s16>), [[COPY1]](p0), [[C2]](s20) :: (store (<16 x s16>), align 64) %0:_(p0) = COPY $p0 %1:_(s20) = G_CONSTANT i20 64 %3:_(p0) = G_PTR_ADD %0, %1 @@ -1452,10 +1457,15 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p1 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 - ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(<32 x s16>), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C]](s20) :: (load (<32 x s16>)) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 32 + ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(<16 x s16>) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C2]](s20) :: (load (<16 x s16>) from unknown-address + 32) + ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(<16 x s16>), 
[[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C]](s20) :: (load (<16 x s16>), align 64) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<16 x s16>) = COPY [[AIE_POSTINC_LOAD]](<16 x s16>) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(<16 x s16>) = COPY [[AIE_OFFSET_LOAD]](<16 x s16>) + ; CHECK-NEXT: G_AIE_OFFSET_STORE [[COPY3]](<16 x s16>), [[COPY1]](p0), [[C2]](s20) :: (store (<16 x s16>) into unknown-address + 32) ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_LOAD1]](p0) ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY1]], [[C]](s20) - ; CHECK-NEXT: [[AIE_POSTINC_STORE:%[0-9]+]]:_(p0) = G_AIE_POSTINC_STORE [[AIE_POSTINC_LOAD]](<32 x s16>), [[COPY1]], [[C1]](s20) :: (store (<32 x s16>)) + ; CHECK-NEXT: [[AIE_POSTINC_STORE:%[0-9]+]]:_(p0) = G_AIE_POSTINC_STORE [[COPY2]](<16 x s16>), [[COPY1]], [[C1]](s20) :: (store (<16 x s16>), align 64) ; CHECK-NEXT: $p1 = COPY [[PTR_ADD]](p0) ; CHECK-NEXT: $p2 = COPY [[AIE_POSTINC_STORE]](p0) %0:_(p0) = COPY $p0 @@ -1480,8 +1490,13 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p1 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 - ; CHECK-NEXT: [[AIE_POSTINC_2D_LOAD:%[0-9]+]]:_(<32 x s16>), [[AIE_POSTINC_2D_LOAD1:%[0-9]+]]:_(p0), [[AIE_POSTINC_2D_LOAD2:%[0-9]+]]:_(s20) = G_AIE_POSTINC_2D_LOAD [[COPY]], [[C]], [[C]], [[C]], [[C]] :: (load (<32 x s16>)) - ; CHECK-NEXT: [[AIE_POSTINC_2D_STORE:%[0-9]+]]:_(p0), [[AIE_POSTINC_2D_STORE1:%[0-9]+]]:_(s20) = G_AIE_POSTINC_2D_STORE [[AIE_POSTINC_2D_LOAD]](<32 x s16>), [[COPY1]], [[C]], [[C]], [[C]], [[C]] :: (store (<32 x s16>)) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 32 + ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(<16 x s16>) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C1]](s20) :: (load (<16 x s16>) from unknown-address + 32) + ; CHECK-NEXT: [[AIE_POSTINC_2D_LOAD:%[0-9]+]]:_(<16 x s16>), [[AIE_POSTINC_2D_LOAD1:%[0-9]+]]:_(p0), [[AIE_POSTINC_2D_LOAD2:%[0-9]+]]:_(s20) = G_AIE_POSTINC_2D_LOAD [[COPY]], [[C]], [[C]], 
[[C]], [[C]] :: (load (<16 x s16>), align 64) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<16 x s16>) = COPY [[AIE_POSTINC_2D_LOAD]](<16 x s16>) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(<16 x s16>) = COPY [[AIE_OFFSET_LOAD]](<16 x s16>) + ; CHECK-NEXT: G_AIE_OFFSET_STORE [[COPY3]](<16 x s16>), [[COPY1]](p0), [[C1]](s20) :: (store (<16 x s16>) into unknown-address + 32) + ; CHECK-NEXT: [[AIE_POSTINC_2D_STORE:%[0-9]+]]:_(p0), [[AIE_POSTINC_2D_STORE1:%[0-9]+]]:_(s20) = G_AIE_POSTINC_2D_STORE [[COPY2]](<16 x s16>), [[COPY1]], [[C]], [[C]], [[C]], [[C]] :: (store (<16 x s16>), align 64) ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_2D_LOAD1]](p0) ; CHECK-NEXT: $p1 = COPY [[AIE_POSTINC_2D_STORE]](p0) %0:_(p0) = COPY $p0 @@ -1503,8 +1518,13 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p1 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 - ; CHECK-NEXT: [[AIE_POSTINC_3D_LOAD:%[0-9]+]]:_(<32 x s16>), [[AIE_POSTINC_3D_LOAD1:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_LOAD2:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_LOAD3:%[0-9]+]]:_ = G_AIE_POSTINC_3D_LOAD [[COPY]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (load (<32 x s16>)) - ; CHECK-NEXT: [[AIE_POSTINC_3D_STORE:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_STORE1:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_STORE2:%[0-9]+]]:_ = G_AIE_POSTINC_3D_STORE [[AIE_POSTINC_3D_LOAD]](<32 x s16>), [[COPY1]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (store (<32 x s16>)) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 32 + ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(<16 x s16>) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C1]](s20) :: (load (<16 x s16>) from unknown-address + 32) + ; CHECK-NEXT: [[AIE_POSTINC_3D_LOAD:%[0-9]+]]:_(<16 x s16>), [[AIE_POSTINC_3D_LOAD1:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_LOAD2:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_LOAD3:%[0-9]+]]:_ = G_AIE_POSTINC_3D_LOAD [[COPY]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (load (<16 x s16>), align 64) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<16 x s16>) = 
COPY [[AIE_POSTINC_3D_LOAD]](<16 x s16>) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(<16 x s16>) = COPY [[AIE_OFFSET_LOAD]](<16 x s16>) + ; CHECK-NEXT: G_AIE_OFFSET_STORE [[COPY3]](<16 x s16>), [[COPY1]](p0), [[C1]](s20) :: (store (<16 x s16>) into unknown-address + 32) + ; CHECK-NEXT: [[AIE_POSTINC_3D_STORE:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_STORE1:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_STORE2:%[0-9]+]]:_ = G_AIE_POSTINC_3D_STORE [[COPY2]](<16 x s16>), [[COPY1]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (store (<16 x s16>), align 64) ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_3D_LOAD1]](p0) ; CHECK-NEXT: $p1 = COPY [[AIE_POSTINC_3D_STORE]](p0) %0:_(p0) = COPY $p0 diff --git a/llvm/test/CodeGen/AIE/GlobalISel/combine-padd-ls-offset.mir b/llvm/test/CodeGen/AIE/GlobalISel/combine-padd-ls-offset.mir new file mode 100644 index 000000000000..9832dbc5e9b9 --- /dev/null +++ b/llvm/test/CodeGen/AIE/GlobalISel/combine-padd-ls-offset.mir @@ -0,0 +1,108 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +# RUN: llc -mtriple aie2 -run-pass=aie2-postlegalizer-custom-combiner %s -verify-machineinstrs -o - | FileCheck %s + +# Test for combine_offset_load_store_ptradd + +# Case 1: Can be combined. 
+ +--- +name: combine_offset_load_ptradd_const +body: | + bb.0: + liveins: $p0 + ; CHECK-LABEL: name: combine_offset_load_ptradd_const + ; CHECK: liveins: $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 96 + ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(<16 x s16>) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C]](s20) + ; CHECK-NEXT: $wl0 = COPY [[AIE_OFFSET_LOAD]](<16 x s16>) + %1:_(p0) = COPY $p0 + %293:_(s20) = G_CONSTANT i20 32 + %67:_(s20) = G_CONSTANT i20 64 + %68:_(p0) = nuw G_PTR_ADD %1, %67(s20) + %295:_(<16 x s16>) = G_AIE_OFFSET_LOAD %68(p0), %293(s20) + $wl0 = COPY %295:_(<16 x s16>) +... + +# Case 2: Can't be combined. + +--- +name: combine_offset_load_ptradd_reg +body: | + bb.0: + liveins: $p0, $r0 + ; CHECK-LABEL: name: combine_offset_load_ptradd_reg + ; CHECK: liveins: $p0, $r0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $r0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[COPY]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 32 + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = nuw G_PTR_ADD [[COPY1]], [[TRUNC]](s20) + ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(<16 x s16>) = G_AIE_OFFSET_LOAD [[PTR_ADD]](p0), [[C]](s20) + ; CHECK-NEXT: $wl0 = COPY [[AIE_OFFSET_LOAD]](<16 x s16>) + %0:_(s32) = COPY $r0 + %1:_(p0) = COPY $p0 + %7:_(s20) = G_TRUNC %0(s32) + %293:_(s20) = G_CONSTANT i20 32 + %67:_(s20) = G_CONSTANT i20 64 + %68:_(p0) = nuw G_PTR_ADD %1, %7(s20) + %295:_(<16 x s16>) = G_AIE_OFFSET_LOAD %68(p0), %293(s20) + $wl0 = COPY %295:_(<16 x s16>) +... + +# Case 3: Can be combined (shared G_PTR_ADD). 
+ +--- +name: combine_offset_load_ptradd_reg_shared +body: | + bb.0: + liveins: $p0 + ; CHECK-LABEL: name: combine_offset_load_ptradd_reg_shared + ; CHECK: liveins: $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 32 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 + ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(<16 x s16>) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C1]](s20) + ; CHECK-NEXT: [[AIE_OFFSET_LOAD1:%[0-9]+]]:_(<16 x s16>) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C]](s20) + ; CHECK-NEXT: $wl0 = COPY [[AIE_OFFSET_LOAD]](<16 x s16>) + ; CHECK-NEXT: $wl1 = COPY [[AIE_OFFSET_LOAD1]](<16 x s16>) + %1:_(p0) = COPY $p0 + %293:_(s20) = G_CONSTANT i20 32 + %294:_(s20) = G_CONSTANT i20 32 + %68:_(p0) = nuw G_PTR_ADD %1, %294(s20) + %295:_(<16 x s16>) = G_AIE_OFFSET_LOAD %68(p0), %293(s20) + %296:_(<16 x s16>) = G_AIE_OFFSET_LOAD %1(p0), %294(s20) + $wl0 = COPY %295:_(<16 x s16>) + $wl1 = COPY %296:_(<16 x s16>) +... + +# Case 4: Can be combined. + +--- +name: combine_offset_store_ptradd_const +body: | + bb.0: + liveins: $p0, $wl0 + ; CHECK-LABEL: name: combine_offset_store_ptradd_const + ; CHECK: liveins: $p0, $wl0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s16>) = COPY $wl0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 96 + ; CHECK-NEXT: G_AIE_OFFSET_STORE [[COPY]](<16 x s16>), [[COPY1]](p0), [[C]](s20) + %0:_(<16 x s16>) = COPY $wl0 + %1:_(p0) = COPY $p0 + %293:_(s20) = G_CONSTANT i20 32 + %67:_(s20) = G_CONSTANT i20 64 + %68:_(p0) = nuw G_PTR_ADD %1, %67(s20) + G_AIE_OFFSET_STORE %0, %68, %293 +... 
diff --git a/llvm/test/CodeGen/AIE/GlobalISel/combine-reuse-padd-ls-offset.mir b/llvm/test/CodeGen/AIE/GlobalISel/combine-reuse-padd-ls-offset.mir new file mode 100644 index 000000000000..e027a077bfcd --- /dev/null +++ b/llvm/test/CodeGen/AIE/GlobalISel/combine-reuse-padd-ls-offset.mir @@ -0,0 +1,94 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +# RUN: llc -mtriple aie2 -run-pass=aie2-postlegalizer-custom-combiner %s -verify-machineinstrs -o - | FileCheck %s + +# Test for combine_offset_load_store_share_ptradd + +# Case 1: Can be combined. + +--- +name: combine_offset_load_ptradd_reg_share +body: | + bb.0: + liveins: $p0, $r0 + ; CHECK-LABEL: name: combine_offset_load_ptradd_reg_share + ; CHECK: liveins: $p0, $r0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $r0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[COPY]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 32 + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = nuw G_PTR_ADD [[COPY1]], [[TRUNC]](s20) + ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(<16 x s16>) = G_AIE_OFFSET_LOAD [[PTR_ADD]](p0), [[C]](s20) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 0 + ; CHECK-NEXT: [[AIE_OFFSET_LOAD1:%[0-9]+]]:_(<16 x s16>) = G_AIE_OFFSET_LOAD [[PTR_ADD]](p0), [[C1]](s20) + ; CHECK-NEXT: $wl0 = COPY [[AIE_OFFSET_LOAD]](<16 x s16>) + ; CHECK-NEXT: $wl1 = COPY [[AIE_OFFSET_LOAD1]](<16 x s16>) + %0:_(s32) = COPY $r0 + %1:_(p0) = COPY $p0 + %7:_(s20) = G_TRUNC %0(s32) + %293:_(s20) = G_CONSTANT i20 32 + %68:_(p0) = nuw G_PTR_ADD %1, %7(s20) + %295:_(<16 x s16>) = G_AIE_OFFSET_LOAD %68(p0), %293(s20) + %296:_(<16 x s16>) = 
G_AIE_OFFSET_LOAD %1(p0), %7(s20) + $wl0 = COPY %295:_(<16 x s16>) + $wl1 = COPY %296:_(<16 x s16>) +... + +# Case 2: Can't be combined because G_PTR_ADD is dead and we don't want to # bring it back to life. NOTE(review): this test copies %295, which is never defined in the body — confirm intended. + +--- +name: combine_offset_load_ptradd_reg_ptradd_dead +body: | + bb.0: + liveins: $p0, $r0 + ; CHECK-LABEL: name: combine_offset_load_ptradd_reg_ptradd_dead + ; CHECK: liveins: $p0, $r0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $r0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[COPY]](s32) + ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(<16 x s16>) = G_AIE_OFFSET_LOAD [[COPY1]](p0), [[TRUNC]](s20) + ; CHECK-NEXT: $wl0 = COPY %5:_(<16 x s16>) + %0:_(s32) = COPY $r0 + %1:_(p0) = COPY $p0 + %7:_(s20) = G_TRUNC %0(s32) + %68:_(p0) = nuw G_PTR_ADD %1, %7(s20) + %296:_(<16 x s16>) = G_AIE_OFFSET_LOAD %1(p0), %7(s20) + $wl0 = COPY %295:_(<16 x s16>) +... + +# Case 3: Can be combined (store). + +--- +name: combine_offset_store_ptradd_reg_share +body: | + bb.0: + liveins: $p0, $r0, $wl0 + ; CHECK-LABEL: name: combine_offset_store_ptradd_reg_share + ; CHECK: liveins: $p0, $r0, $wl0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $r0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<16 x s16>) = COPY $wl0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[COPY]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 32 + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = nuw G_PTR_ADD [[COPY1]], [[TRUNC]](s20) + ; CHECK-NEXT: G_AIE_OFFSET_STORE [[COPY2]](<16 x s16>), [[PTR_ADD]](p0), [[C]](s20) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 0 + ; CHECK-NEXT: G_AIE_OFFSET_STORE [[COPY2]](<16 x s16>), [[PTR_ADD]](p0), [[C1]](s20) + %0:_(s32) = COPY $r0 + %1:_(p0) = COPY $p0 + %2:_(<16 x s16>) = COPY $wl0 + %7:_(s20) = G_TRUNC %0(s32) + %293:_(s20) = G_CONSTANT i20 32 + %68:_(p0) = nuw G_PTR_ADD %1, %7(s20) +
G_AIE_OFFSET_STORE %2, %68, %293 + G_AIE_OFFSET_STORE %2, %1, %7 +... diff --git a/llvm/test/CodeGen/AIE/GlobalISel/combine-split-large-ls.mir b/llvm/test/CodeGen/AIE/GlobalISel/combine-split-large-ls.mir new file mode 100644 index 000000000000..1add9805febd --- /dev/null +++ b/llvm/test/CodeGen/AIE/GlobalISel/combine-split-large-ls.mir @@ -0,0 +1,160 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +# RUN: llc -mtriple aie2 -run-pass=aie2-postlegalizer-custom-combiner %s -verify-machineinstrs -o - | FileCheck %s + +# Test for combine_load_store_split + +# Case 1: Can split. + +--- +name: load_16xs32 +body: | + bb.0: + liveins: $p0 + ; CHECK-LABEL: name: load_16xs32 + ; CHECK: liveins: $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 32 + ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(<8 x s32>) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C]](s20) :: (load (<8 x s32>) from unknown-address + 32) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[COPY]](p0) :: (load (<8 x s32>), align 64) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[LOAD]](<8 x s32>), [[AIE_OFFSET_LOAD]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + %1:_(p0) = COPY $p0 + %295:_(<16 x s32>) = G_LOAD %1(p0) :: (load (<16 x s32>)) + $x0 = COPY %295:_(<16 x s32>) +... + +# Case 2: Can split. 
+ +--- +name: load_32xs16 +body: | + bb.0: + liveins: $p0 + ; CHECK-LABEL: name: load_32xs16 + ; CHECK: liveins: $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 32 + ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(<16 x s16>) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C]](s20) :: (load (<16 x s16>) from unknown-address + 32) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<16 x s16>) = G_LOAD [[COPY]](p0) :: (load (<16 x s16>), align 64) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<32 x s16>) = G_CONCAT_VECTORS [[LOAD]](<16 x s16>), [[AIE_OFFSET_LOAD]](<16 x s16>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<32 x s16>) + %1:_(p0) = COPY $p0 + %295:_(<32 x s16>) = G_LOAD %1(p0) :: (load (<32 x s16>)) + $x0 = COPY %295:_(<32 x s16>) +... + +# Case 3: Can split. + +--- +name: load_64xs8 +body: | + bb.0: + liveins: $p0 + ; CHECK-LABEL: name: load_64xs8 + ; CHECK: liveins: $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 32 + ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(<32 x s8>) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C]](s20) :: (load (<32 x s8>) from unknown-address + 32) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<32 x s8>) = G_LOAD [[COPY]](p0) :: (load (<32 x s8>), align 64) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<64 x s8>) = G_CONCAT_VECTORS [[LOAD]](<32 x s8>), [[AIE_OFFSET_LOAD]](<32 x s8>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<64 x s8>) + %1:_(p0) = COPY $p0 + %295:_(<64 x s8>) = G_LOAD %1(p0) :: (load (<64 x s8>)) + $x0 = COPY %295:_(<64 x s8>) +... + +# Case 4: Can split. 
+ +--- +name: store_16xs32 +body: | + bb.0: + liveins: $p0, $x0 + ; CHECK-LABEL: name: store_16xs32 + ; CHECK: liveins: $p0, $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p0 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s32>), [[UV1:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY]](<16 x s32>) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 32 + ; CHECK-NEXT: G_AIE_OFFSET_STORE [[UV1]](<8 x s32>), [[COPY1]](p0), [[C]](s20) :: (store (<8 x s32>) into unknown-address + 32) + ; CHECK-NEXT: G_STORE [[UV]](<8 x s32>), [[COPY1]](p0) :: (store (<8 x s32>), align 64) + %0:_(<16 x s32>) = COPY $x0 + %1:_(p0) = COPY $p0 + G_STORE %0, %1(p0) :: (store (<16 x s32>)) +... + +# Case 5: Can't split (skip accumulators). + +--- +name: load_acc16xs64 +body: | + bb.0: + liveins: $p0 + ; CHECK-LABEL: name: load_acc16xs64 + ; CHECK: liveins: $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<16 x s64>) = G_LOAD [[COPY]](p0) :: (load (<16 x s64>)) + ; CHECK-NEXT: $cm0 = COPY [[LOAD]](<16 x s64>) + %1:_(p0) = COPY $p0 + %295:_(<16 x s64>) = G_LOAD %1(p0) :: (load (<16 x s64>)) + $cm0 = COPY %295:_(<16 x s64>) +... + +# Case 6: Can't split (will miss selection combine). 
+ +--- +name: load_16xs32_ups +body: | + bb.0: + liveins: $p0, $r1 + ; CHECK-LABEL: name: load_16xs32_ups + ; CHECK: liveins: $p0, $r1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $r1 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[COPY]](p0) :: (load (<16 x s32>)) + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(<16 x s64>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2.acc64.v16.I512.ups), [[LOAD]](<16 x s32>), [[COPY1]](s32), [[C]](s32) + ; CHECK-NEXT: $cm0 = COPY [[INT]](<16 x s64>) + %1:_(p0) = COPY $p0 + %101:_(s32) = COPY $r1 + %102:_(s32) = G_CONSTANT i32 0 + %25:_(<16 x s32>) = G_LOAD %1 :: (load (<16 x s32>)) + %103:_(<16 x s64>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2.acc64.v16.I512.ups), %25:_(<16 x s32>), %101:_(s32), %102:_(s32) + $cm0 = COPY %103:_(<16 x s64>) +... + +# Case 7: Can't split (will miss selection combine). + +--- +name: store_16xs32_srs +body: | + bb.0: + liveins: $p0, $cm0 + ; CHECK-LABEL: name: store_16xs32_srs + ; CHECK: liveins: $p0, $cm0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s64>) = COPY $cm0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $r0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY $p0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(<32 x s16>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2.I512.v32.acc32.srs), [[COPY]](<16 x s64>), [[COPY1]](s32), [[C]](s32) + ; CHECK-NEXT: G_STORE [[INT]](<32 x s16>), [[COPY2]](p0) :: (store (<32 x s16>)) + %0:_(<16 x s64>) = COPY $cm0 + %1:_(s32) = COPY $r0 + %3:_(p0) = COPY $p0 + %7:_(s32) = G_CONSTANT i32 0 + %6:_(<32 x s16>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2.I512.v32.acc32.srs), %0:_(<16 x s64>), %1:_(s32), %7:_(s32) + G_STORE %6:_(<32 x s16>), %3:_(p0) :: (store (<32 x s16>)) +... 
diff --git a/llvm/test/CodeGen/AIE/aie2/GlobalISel/indexed-512-load-store.mir b/llvm/test/CodeGen/AIE/aie2/GlobalISel/indexed-512-load-store.mir index 9e279d3d38b5..974f2901e8eb 100644 --- a/llvm/test/CodeGen/AIE/aie2/GlobalISel/indexed-512-load-store.mir +++ b/llvm/test/CodeGen/AIE/aie2/GlobalISel/indexed-512-load-store.mir @@ -30,13 +30,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:edj = COPY [[MOV_RLC_imm10_pseudo]] ; CHECK-NEXT: [[COPY2:%[0-9]+]]:em = COPY [[COPY1]] ; CHECK-NEXT: [[PADD_mod_pseudo:%[0-9]+]]:ep = PADD_mod_pseudo [[COPY]], [[COPY2]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:em = COPY [[COPY1]] - ; CHECK-NEXT: [[PADD_mod_pseudo1:%[0-9]+]]:ep = PADD_mod_pseudo [[COPY]], [[COPY3]] - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm [[PADD_mod_pseudo1]], 32 :: (load (<16 x s16>) from unknown-address + 32) - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm1:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm [[PADD_mod_pseudo1]], 0 :: (load (<16 x s16>), align 64) - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLDA_dmw_lda_w_ag_idx_imm1]], %subreg.sub_256_lo, [[VLDA_dmw_lda_w_ag_idx_imm]], %subreg.sub_256_hi - ; CHECK-NEXT: [[VLD_idx_pseudo:%[0-9]+]]:vec256 = VLD_idx_pseudo [[PADD_mod_pseudo]], [[COPY1]] :: (load (<16 x s16>)) - ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[PADD_mod_pseudo]], implicit [[REG_SEQUENCE]], implicit [[VLD_idx_pseudo]] + ; CHECK-NEXT: [[MOV_PD_imm10_pseudo:%[0-9]+]]:edj = MOV_PD_imm10_pseudo 56 + ; CHECK-NEXT: [[VLD_idx_pseudo:%[0-9]+]]:ewh = VLD_idx_pseudo [[COPY]], [[MOV_PD_imm10_pseudo]] :: (load (<16 x s16>) from unknown-address + 32) + ; CHECK-NEXT: [[VLD_idx_pseudo1:%[0-9]+]]:ewl = VLD_idx_pseudo [[COPY]], [[COPY1]] :: (load (<16 x s16>), align 64) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLD_idx_pseudo1]], %subreg.sub_256_lo, [[VLD_idx_pseudo]], %subreg.sub_256_hi + ; CHECK-NEXT: [[MOV_PD_imm10_pseudo1:%[0-9]+]]:edj = MOV_PD_imm10_pseudo 48 + ; CHECK-NEXT: 
[[VLD_idx_pseudo2:%[0-9]+]]:vec256 = VLD_idx_pseudo [[COPY]], [[MOV_PD_imm10_pseudo1]] :: (load (<16 x s16>)) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[PADD_mod_pseudo]], implicit [[REG_SEQUENCE]], implicit [[VLD_idx_pseudo2]] %0:_(p0) = COPY $p0 %1:_(s32) = G_CONSTANT i32 24 %2:_(s20) = G_TRUNC %1 @@ -61,11 +61,11 @@ body: | ; CHECK-NEXT: [[MOV_RLC_imm10_pseudo:%[0-9]+]]:er = MOV_RLC_imm10_pseudo 64 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:em = COPY [[MOV_RLC_imm10_pseudo]] ; CHECK-NEXT: [[PADD_mod_pseudo:%[0-9]+]]:ep = PADD_mod_pseudo [[COPY]], [[COPY1]] - ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[COPY]], 96 :: (load (<16 x s16>) from unknown-address + 32) - ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo1:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[COPY]], 64 :: (load (<16 x s16>), align 64) + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo:%[0-9]+]]:ewh = VLD_idx_imm_3x32_pseudo [[COPY]], 96 :: (load (<16 x s16>) from unknown-address + 32) + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo1:%[0-9]+]]:ewl = VLD_idx_imm_3x32_pseudo [[COPY]], 64 :: (load (<16 x s16>), align 64) ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLD_idx_imm_3x32_pseudo1]], %subreg.sub_256_lo, [[VLD_idx_imm_3x32_pseudo]], %subreg.sub_256_hi - ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo2:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[PADD_mod_pseudo]], 64 :: (load (<16 x s16>)) - ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[PADD_mod_pseudo]], implicit [[REG_SEQUENCE]], implicit [[VLD_idx_imm_3x32_pseudo2]] + ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], 128 :: (load (<16 x s16>)) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[PADD_mod_pseudo]], implicit [[REG_SEQUENCE]], implicit [[VLDA_dmw_lda_w_ag_idx_imm]] %0:_(p0) = COPY $p0 %1:_(s32) = G_CONSTANT i32 64 %2:_(s20) = G_TRUNC %1 @@ -95,11 +95,11 @@ body: | ; CHECK-NEXT: [[PADD_mod_pseudo:%[0-9]+]]:ep = PADD_mod_pseudo [[COPY]], [[COPY4]] ; 
CHECK-NEXT: [[COPY5:%[0-9]+]]:vec256 = COPY [[COPY1]].sub_256_lo ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vec256 = COPY [[COPY1]].sub_256_hi - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:em = COPY [[COPY3]] - ; CHECK-NEXT: [[PADD_mod_pseudo1:%[0-9]+]]:ep = PADD_mod_pseudo [[COPY]], [[COPY7]] - ; CHECK-NEXT: VST_dmw_sts_w_ag_idx_imm [[COPY6]], [[PADD_mod_pseudo1]], 32 :: (store (<16 x s16>) into unknown-address + 32) - ; CHECK-NEXT: VST_dmw_sts_w_ag_idx_imm [[COPY5]], [[PADD_mod_pseudo1]], 0 :: (store (<16 x s16>), align 64) - ; CHECK-NEXT: VST_dmw_sts_w_ag_idx [[COPY2]], [[PADD_mod_pseudo]], [[COPY3]] :: (store (<16 x s16>)) + ; CHECK-NEXT: [[MOV_PD_imm10_pseudo:%[0-9]+]]:edj = MOV_PD_imm10_pseudo 56 + ; CHECK-NEXT: VST_dmw_sts_w_ag_idx [[COPY6]], [[COPY]], [[MOV_PD_imm10_pseudo]] :: (store (<16 x s16>) into unknown-address + 32) + ; CHECK-NEXT: VST_dmw_sts_w_ag_idx [[COPY5]], [[COPY]], [[COPY3]] :: (store (<16 x s16>), align 64) + ; CHECK-NEXT: [[MOV_PD_imm10_pseudo1:%[0-9]+]]:edj = MOV_PD_imm10_pseudo 48 + ; CHECK-NEXT: VST_dmw_sts_w_ag_idx [[COPY2]], [[COPY]], [[MOV_PD_imm10_pseudo1]] :: (store (<16 x s16>)) ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[PADD_mod_pseudo]] %0:_(p0) = COPY $p0 %1:_(<32 x s16>) = COPY $x0 @@ -133,7 +133,7 @@ body: | ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vec256 = COPY [[COPY1]].sub_256_hi ; CHECK-NEXT: VST_dmw_sts_w_ag_idx_imm [[COPY5]], [[COPY]], 96 :: (store (<16 x s16>) into unknown-address + 32) ; CHECK-NEXT: VST_dmw_sts_w_ag_idx_imm [[COPY4]], [[COPY]], 64 :: (store (<16 x s16>), align 64) - ; CHECK-NEXT: VST_dmw_sts_w_ag_idx_imm [[COPY2]], [[PADD_mod_pseudo]], 64 :: (store (<16 x s16>)) + ; CHECK-NEXT: VST_dmw_sts_w_ag_idx_imm [[COPY2]], [[COPY]], 128 :: (store (<16 x s16>)) ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[PADD_mod_pseudo]] %0:_(p0) = COPY $p0 %1:_(<32 x s16>) = COPY $x0 diff --git a/llvm/test/CodeGen/AIE/aie2/mmo-load.ll b/llvm/test/CodeGen/AIE/aie2/mmo-load.ll index cc1eb5889d7f..731f9c113744 100644 --- 
a/llvm/test/CodeGen/AIE/aie2/mmo-load.ll +++ b/llvm/test/CodeGen/AIE/aie2/mmo-load.ll @@ -80,9 +80,9 @@ define void @load_v16i32(i32 %idx, ptr %array) { ; CHECK-SAME: (load (<8 x s32>) from %ir.arrayidx.1 + 32) ; CHECK: VLDA_dmw_lda_w_ag_idx_imm ; CHECK-SAME: (load (<8 x s32>) from %ir.arrayidx.1) - ; CHECK: VLDA_dmw_lda_w_ag_idx_imm + ; CHECK: VLD_idx_imm_3x32_pseudo ; CHECK-SAME: (load (<8 x s32>) from %ir.arrayidx.0 + 32) - ; CHECK: VLDA_dmw_lda_w_ag_idx_imm + ; CHECK: VLD_idx_imm_3x32_pseudo ; CHECK-SAME: (load (<8 x s32>) from %ir.arrayidx.0) entry: %arrayidx.1 = getelementptr inbounds [16 x <16 x i32>], ptr %array, i32 0, i32 2 diff --git a/llvm/test/CodeGen/AIE/aie2/mmo-store.ll b/llvm/test/CodeGen/AIE/aie2/mmo-store.ll index ce2369795304..f9882283783c 100644 --- a/llvm/test/CodeGen/AIE/aie2/mmo-store.ll +++ b/llvm/test/CodeGen/AIE/aie2/mmo-store.ll @@ -71,7 +71,7 @@ define void @store_v16i32(i32 %idx, ptr %array, <16 x i32> %val) { ; CHECK-LABEL: name: store_v16i32 ; CHECK: VST_dmw_sts_w_ag_idx_imm ; CHECK-SAME: (store (<8 x s32>) into %ir.arrayidx.0 + 32) - ; CHECK: VST_dmw_sts_w_ag_idx_imm + ; CHECK: VST_dmw_sts_w_ag_idx ; CHECK-SAME: (store (<8 x s32>) into %ir.arrayidx.0) ; CHECK: VST_dmw_sts_w_ag_idx_imm ; CHECK-SAME: (store (<8 x s32>) into %ir.arrayidx.1 + 32)