diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index 51cf72a211f3..7f4cb6e57588 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -226,8 +226,8 @@ namespace {
     void HoistPostRA(MachineInstr *MI, unsigned Def, MachineLoop *CurLoop,
                      MachineBasicBlock *CurPreheader);
 
-    void ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
-                   BitVector &PhysRegClobbers, SmallSet<int, 32> &StoredFIs,
+    void ProcessMI(MachineInstr *MI, BitVector &RUDefs, BitVector &RUClobbers,
+                   SmallSet<int, 32> &StoredFIs,
                    SmallVectorImpl<CandidateInfo> &Candidates,
                    MachineLoop *CurLoop);
 
@@ -356,7 +356,6 @@ bool MachineLICMBase::runOnMachineFunction(MachineFunction &MF) {
   MRI = &MF.getRegInfo();
   SchedModel.init(&ST);
 
-  PreRegAlloc = MRI->isSSA();
   HasProfileData = MF.getFunction().hasProfileData();
 
   if (PreRegAlloc)
@@ -427,10 +426,63 @@ static bool InstructionStoresToFI(const MachineInstr *MI, int FI) {
   return false;
 }
 
+static void applyBitsNotInRegMaskToRegUnitsMask(const TargetRegisterInfo &TRI,
+                                                BitVector &RUs,
+                                                const uint32_t *Mask) {
+  // FIXME: This intentionally works in reverse due to some issues with the
+  // Register Units infrastructure.
+  //
+  // This is used to apply callee-saved-register masks to the clobbered regunits
+  // mask.
+  //
+  // The right way to approach this is to start with a BitVector full of ones,
+  // then reset all the bits of the regunits of each register that is set in the
+  // mask (registers preserved), then OR the resulting bits with the Clobbers
+  // mask. This correctly prioritizes the saved registers, so if a RU is shared
+  // between a register that is preserved, and one that is NOT preserved, that
+  // RU will not be set in the output vector (the clobbers).
+  //
+  // What we have to do for now is the opposite: we have to assume that the
+  // regunits of all registers that are NOT preserved are clobbered, even if
+  // those regunits are preserved by another register. So if a RU is shared
+  // as described previously, that RU will be set.
+  //
+  // This is to work around an issue which appears in AArch64, but isn't
+  // exclusive to that target: AArch64's Qn registers (128 bits) have Dn
+  // registers (lower 64 bits). A few Dn registers are preserved by some calling
+  // conventions, but Qn and Dn share exactly the same reg units.
+  //
+  // If we do this the right way, Qn will be marked as NOT clobbered even though
+  // its upper 64 bits are NOT preserved. The conservative approach handles this
+  // correctly at the cost of some missed optimizations on other targets.
+  //
+  // This is caused by how RegUnits are handled within TableGen. Ideally, Qn
+  // should have an extra RegUnit to model the "unknown" bits not covered by the
+  // subregs.
+  BitVector RUsFromRegsNotInMask(TRI.getNumRegUnits());
+  const unsigned NumRegs = TRI.getNumRegs();
+  const unsigned MaskWords = (NumRegs + 31) / 32;
+  for (unsigned K = 0; K < MaskWords; ++K) {
+    const uint32_t Word = Mask[K];
+    for (unsigned Bit = 0; Bit < 32; ++Bit) {
+      const unsigned PhysReg = (K * 32) + Bit;
+      if (PhysReg == NumRegs)
+        break;
+
+      if (PhysReg && !((Word >> Bit) & 1)) {
+        for (MCRegUnitIterator RUI(PhysReg, &TRI); RUI.isValid(); ++RUI)
+          RUsFromRegsNotInMask.set(*RUI);
+      }
+    }
+  }
+
+  RUs |= RUsFromRegsNotInMask;
+}
+
 /// Examine the instruction as a potential LICM candidate. Also
 /// gather register def and frame object update information.
-void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
-                                BitVector &PhysRegClobbers,
+void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &RUDefs,
+                                BitVector &RUClobbers,
                                 SmallSet<int, 32> &StoredFIs,
                                 SmallVectorImpl<CandidateInfo> &Candidates,
                                 MachineLoop *CurLoop) {
@@ -452,7 +504,7 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
     // We can't hoist an instruction defining a physreg that is clobbered in
     // the loop.
     if (MO.isRegMask()) {
-      PhysRegClobbers.setBitsNotInMask(MO.getRegMask());
+      applyBitsNotInRegMaskToRegUnitsMask(*TRI, RUClobbers, MO.getRegMask());
       continue;
     }
 
@@ -464,16 +516,24 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
     assert(Reg.isPhysical() && "Not expecting virtual register!");
 
     if (!MO.isDef()) {
-      if (Reg && (PhysRegDefs.test(Reg) || PhysRegClobbers.test(Reg)))
-        // If it's using a non-loop-invariant register, then it's obviously not
-        // safe to hoist.
-        HasNonInvariantUse = true;
+      if (!HasNonInvariantUse) {
+        for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) {
+          // If it's using a non-loop-invariant register, then it's obviously
+          // not safe to hoist.
+          // Note this isn't a final check, as we haven't gathered all the loop
+          // register definitions yet.
+          if (RUDefs.test(*RUI) || RUClobbers.test(*RUI)) {
+            HasNonInvariantUse = true;
+            break;
+          }
+        }
+      }
       continue;
     }
 
     if (MO.isImplicit()) {
-      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
-        PhysRegClobbers.set(*AI);
+      for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI)
+        RUClobbers.set(*RUI);
       if (!MO.isDead())
         // Non-dead implicit def? This cannot be hoisted.
         RuledOut = true;
@@ -492,19 +552,18 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
     // If we have already seen another instruction that defines the same
     // register, then this is not safe. Two defs is indicated by setting a
     // PhysRegClobbers bit.
-    for (MCRegAliasIterator AS(Reg, TRI, true); AS.isValid(); ++AS) {
-      if (PhysRegDefs.test(*AS))
-        PhysRegClobbers.set(*AS);
+    for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) {
+      if (RUDefs.test(*RUI)) {
+        RUClobbers.set(*RUI);
+        RuledOut = true;
+      } else if (RUClobbers.test(*RUI)) {
+        // MI's defined register is also defined by another instruction in
+        // the loop, so it cannot be a LICM candidate.
+        RuledOut = true;
+      }
+
+      RUDefs.set(*RUI);
     }
-    // Need a second loop because MCRegAliasIterator can visit the same
-    // register twice.
-    for (MCRegAliasIterator AS(Reg, TRI, true); AS.isValid(); ++AS)
-      PhysRegDefs.set(*AS);
-
-    if (PhysRegClobbers.test(Reg))
-      // MI defined register is seen defined by another instruction in
-      // the loop, it cannot be a LICM candidate.
-      RuledOut = true;
   }
 
   // Only consider reloads for now and remats which do not have register
@@ -525,9 +584,9 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop,
   if (!Preheader)
     return;
 
-  unsigned NumRegs = TRI->getNumRegs();
-  BitVector PhysRegDefs(NumRegs); // Regs defined once in the loop.
-  BitVector PhysRegClobbers(NumRegs); // Regs defined more than once.
+  unsigned NumRegUnits = TRI->getNumRegUnits();
+  BitVector RUDefs(NumRegUnits);     // RUs defined once in the loop.
+  BitVector RUClobbers(NumRegUnits); // RUs defined more than once.
   SmallVector<CandidateInfo, 32> Candidates;
   SmallSet<int, 32> StoredFIs;
@@ -540,26 +599,31 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop,
     const MachineLoop *ML = MLI->getLoopFor(BB);
     if (ML && ML->getHeader()->isEHPad())
       continue;
 
-    // Conservatively treat live-in's as an external def.
-    // FIXME: That means a reload that're reused in successor block(s) will not
-    // be LICM'ed.
-    for (const auto &LI : BB->liveins()) {
-      for (MCRegAliasIterator AI(LI.PhysReg, TRI, true); AI.isValid(); ++AI)
-        PhysRegDefs.set(*AI);
-    }
-
     // Funclet entry blocks will clobber all registers
     if (const uint32_t *Mask = BB->getBeginClobberMask(TRI))
-      PhysRegClobbers.setBitsNotInMask(Mask);
+      applyBitsNotInRegMaskToRegUnitsMask(*TRI, RUClobbers, Mask);
 
     SpeculationState = SpeculateUnknown;
     for (MachineInstr &MI : *BB)
-      ProcessMI(&MI, PhysRegDefs, PhysRegClobbers, StoredFIs, Candidates,
-                CurLoop);
+      ProcessMI(&MI, RUDefs, RUClobbers, StoredFIs, Candidates, CurLoop);
+  }
+
+  // Mark registers as clobbered if they are livein and also defined in the loop.
+  for (const auto &LoopLI : CurLoop->getHeader()->liveins()) {
+    MCPhysReg LoopLiveInReg = LoopLI.PhysReg;
+    LaneBitmask LiveInMask = LoopLI.LaneMask;
+    for (MCRegUnitMaskIterator RUI(LoopLiveInReg, TRI); RUI.isValid(); ++RUI) {
+      auto LiveInUnit = (*RUI).first;
+      LaneBitmask UnitMask = (*RUI).second;
+      // Check if the livein lanes overlap with the lanes touched by LiveInUnit.
+      if ((UnitMask & LiveInMask).any() && RUDefs.test(LiveInUnit)) {
+        RUClobbers.set(LiveInUnit);
+      }
+    }
   }
 
   // Gather the registers read / clobbered by the terminator.
-  BitVector TermRegs(NumRegs);
+  BitVector TermRUs(NumRegUnits);
   MachineBasicBlock::iterator TI = Preheader->getFirstTerminator();
   if (TI != Preheader->end()) {
     for (const MachineOperand &MO : TI->operands()) {
@@ -568,8 +632,8 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop,
       Register Reg = MO.getReg();
       if (!Reg)
         continue;
-      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
-        TermRegs.set(*AI);
+      for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI)
+        TermRUs.set(*RUI);
     }
   }
 
@@ -587,24 +651,36 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop,
       continue;
 
     unsigned Def = Candidate.Def;
-    if (!PhysRegClobbers.test(Def) && !TermRegs.test(Def)) {
-      bool Safe = true;
-      MachineInstr *MI = Candidate.MI;
-      for (const MachineOperand &MO : MI->all_uses()) {
-        if (!MO.getReg())
-          continue;
-        Register Reg = MO.getReg();
-        if (PhysRegDefs.test(Reg) ||
-            PhysRegClobbers.test(Reg)) {
+    bool Safe = true;
+    for (MCRegUnitIterator RUI(Def, TRI); RUI.isValid(); ++RUI) {
+      if (RUClobbers.test(*RUI) || TermRUs.test(*RUI)) {
+        Safe = false;
+        break;
+      }
+    }
+
+    if (!Safe)
+      continue;
+
+    MachineInstr *MI = Candidate.MI;
+    for (const MachineOperand &MO : MI->all_uses()) {
+      if (!MO.getReg())
+        continue;
+      for (MCRegUnitIterator RUI(MO.getReg(), TRI); RUI.isValid(); ++RUI) {
+        if (RUDefs.test(*RUI) || RUClobbers.test(*RUI)) {
           // If it's using a non-loop-invariant register, then it's obviously
           // not safe to hoist.
           Safe = false;
           break;
         }
       }
-      if (Safe)
-        HoistPostRA(MI, Candidate.Def, CurLoop, CurPreheader);
+
+      if (!Safe)
+        break;
     }
+
+    if (Safe)
+      HoistPostRA(MI, Candidate.Def, CurLoop, CurPreheader);
   }
 }
diff --git a/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp b/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp
index 073129a02e69..a22d3de87ddc 100644
--- a/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp
+++ b/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp
@@ -361,10 +361,12 @@ class EnforceCopyEdges : public ScheduleDAGMutation {
 class PropagateIncomingLatencies : public ScheduleDAGMutation {
   bool OnlyCopyLike;
+  bool OnlyLocalSources;
 
 public:
-  PropagateIncomingLatencies(bool OnlyCopyLike = true)
-      : OnlyCopyLike(OnlyCopyLike) {}
+  PropagateIncomingLatencies(bool OnlyCopyLike = true,
+                             bool OnlyLocalSources = true)
+      : OnlyCopyLike(OnlyCopyLike), OnlyLocalSources(OnlyLocalSources) {}
   void apply(ScheduleDAGInstrs *DAG) override {
     auto IsData = [](const SDep &D) { return D.getKind() == SDep::Data; };
     for (SUnit &SU : DAG->SUnits) {
@@ -381,25 +383,55 @@ class PropagateIncomingLatencies : public ScheduleDAGMutation {
           }))
         continue;
 
-      // Find the common latency for all predecessors that can be
-      // "moved" to successors.
-      SDep *MinLatencyDep = nullptr;
-      for (SDep &PredEdge : make_filter_range(SU.Preds, IsData)) {
-        if (!MinLatencyDep ||
-            PredEdge.getLatency() < MinLatencyDep->getLatency())
-          MinLatencyDep = &PredEdge;
+      // Avoid pushing a REG_SEQUENCE close to its sources if it is likely to
+      // generate a hoistable COPY after regalloc. Keeping that COPY close to
+      // its consumers instead will facilitate MachineLICM.
+      // Indeed, that typically means that only the lanes corresponding to
+      // internal sources will be loop-carried. The external lane will come
+      // directly from the pre-header, and the corresponding COPY can then be
+      // hoisted by MachineLICM.
+      const MachineBasicBlock &MBB = *MI.getParent();
+      const MachineRegisterInfo &MRI = DAG->MRI;
+      auto MayProduceHoistableCopy = [&MBB, &MRI](const MachineInstr &MI) {
+        if (!MI.isRegSequence() || !MRI.isSSA())
+          return false;
+        const auto NumExternal =
+            count_if(MI.uses(), [&MBB, &MRI](const MachineOperand &MO) {
+              return MO.isReg() && MO.getReg().isVirtual() &&
+                     MRI.getVRegDef(MO.getReg())->getParent() != &MBB;
+            });
+        const auto NumInternal = MI.getNumOperands() - 1 - (2 * NumExternal);
+        return NumExternal == 1 && NumInternal >= 1;
+      };
+
+      // Whether to propagate latency from predecessors to successors (true),
+      // or from successors to predecessors (false).
+      const bool MoveLatToSuccessors =
+          !OnlyLocalSources || !MayProduceHoistableCopy(MI);
+
+      // Find the common latency for all predecessors (or successors) that
+      // can be "moved" to successors (or predecessors).
+      const SDep *MinLatencyDep = nullptr;
+      ArrayRef<SDep> SuccsOrPreds = MoveLatToSuccessors ? SU.Preds : SU.Succs;
+      for (const SDep &Edge : make_filter_range(SuccsOrPreds, IsData)) {
+        if (!MinLatencyDep || Edge.getLatency() < MinLatencyDep->getLatency())
+          MinLatencyDep = &Edge;
       }
       if (!MinLatencyDep)
         continue;
 
-      int PropagatableSrcLatency = MinLatencyDep->getLatency();
+      int AmountToShiftToSuccessors = MoveLatToSuccessors
+                                          ? int(MinLatencyDep->getLatency())
+                                          : -int(MinLatencyDep->getLatency());
       for (SDep &PredEdge : make_filter_range(SU.Preds, IsData)) {
         updatePredLatency(PredEdge, SU,
-                          PredEdge.getLatency() - PropagatableSrcLatency);
+                          int(PredEdge.getLatency()) -
+                              AmountToShiftToSuccessors);
       }
       for (SDep &SuccEdge : make_filter_range(SU.Succs, IsData)) {
         updateSuccLatency(SuccEdge, SU,
-                          SuccEdge.getLatency() + PropagatableSrcLatency);
+                          int(SuccEdge.getLatency()) +
+                              AmountToShiftToSuccessors);
       }
     }
   }
diff --git a/llvm/test/CodeGen/AArch64/mlicm-csr-mask.mir b/llvm/test/CodeGen/AArch64/mlicm-csr-mask.mir
new file mode 100644
index 000000000000..f6a0abfdc410
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/mlicm-csr-mask.mir
@@ -0,0 +1,49 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=aarch64-unknown-linux-gnu -run-pass=greedy,machinelicm -verify-machineinstrs -debug -o - %s | FileCheck %s
+
+# FIXME: Running RA is needed, otherwise it runs pre-RA LICM.
+---
+name: test
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: test
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $x0, $w1, $x2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   B %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT:   liveins: $x0, $w1, $x2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $q11 = MOVIv4i32 2, 8
+  ; CHECK-NEXT:   BL &memset, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit $w1, implicit $x2, implicit-def $sp, implicit-def $x0
+  ; CHECK-NEXT:   renamable $q10 = MVNIv4i32 4, 0
+  ; CHECK-NEXT:   $xzr = SUBSXri $x0, 1, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 11, %bb.1, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   liveins: $q10, $q11
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $q0 = COPY $q10
+  ; CHECK-NEXT:   $q1 = COPY $q11
+  bb.0:
+    liveins: $x0, $w1, $x2
+    B %bb.1
+
+  bb.1:
+    liveins: $x0, $w1, $x2
+    renamable $q11 = MOVIv4i32 2, 8
+    BL &memset, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit $w1, implicit $x2, implicit-def $sp, implicit-def $x0
+    renamable $q10 = MVNIv4i32 4, 0
+    $xzr = SUBSXri $x0, 1, 0, implicit-def $nzcv
+    Bcc 11, %bb.1, implicit $nzcv
+    B %bb.2
+
+  bb.2:
+    liveins: $q10, $q11
+    $q0 = COPY $q10
+    $q1 = COPY $q11
+...
diff --git a/llvm/test/CodeGen/AArch64/mlicm-stack-write-check.mir b/llvm/test/CodeGen/AArch64/mlicm-stack-write-check.mir
index 0b1fdf9c33d6..1a61c6a88c4d 100644
--- a/llvm/test/CodeGen/AArch64/mlicm-stack-write-check.mir
+++ b/llvm/test/CodeGen/AArch64/mlicm-stack-write-check.mir
@@ -2,8 +2,6 @@
 ---
 name: test
 tracksRegLiveness: true
-registers:
-  - { id: 0, class: gpr64 }
 stack:
   - { id: 0, size: 8, type: spill-slot }
 body: |
@@ -28,14 +26,11 @@ body: |
   bb.2:
     liveins: $x0
-    %0 = COPY $x0
-    %0 = COPY $x0 ; Force isSSA = false.
 ...
+
 ---
 name: test2
 tracksRegLiveness: true
-registers:
-  - { id: 0, class: gpr64 }
 stack:
   - { id: 0, size: 8, type: spill-slot }
 body: |
@@ -60,6 +55,4 @@ body: |
   bb.2:
     liveins: $x0
-    %0 = COPY $x0
-    %0 = COPY $x0 ; Force isSSA = false.
 ...
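The conservative regmask handling in the MachineLICM change above is easiest to see on a toy model. The sketch below is plain standalone C++, not LLVM API; the two-register, two-regunit layout is an assumption chosen to mimic AArch64's D0/Q0 overlap. It contrasts the "ideal" scheme described in the FIXME with the conservative one that applyBitsNotInRegMaskToRegUnitsMask implements: when D0 is preserved but Q0 is not, the ideal scheme would leave their shared regunits unclobbered, while the conservative scheme marks them clobbered because Q0's upper 64 bits are not actually saved.

// Toy model of regunit clobber masks, assuming Q0 and D0 share regunits {0,1}
// (as on AArch64) and a calling convention that preserves D0 but not Q0.
#include <bitset>
#include <cstdio>
#include <vector>

int main() {
  const std::vector<std::vector<unsigned>> RegUnits = {
      {0, 1}, // register 0: Q0 (128 bits)
      {0, 1}, // register 1: D0 (its low 64 bits, same regunits)
  };
  const std::bitset<2> Preserved("10"); // bit 1 set: D0 preserved, Q0 not

  // "Ideal" scheme from the FIXME: preserved registers win. D0 resets the
  // shared units, so nothing is clobbered -- wrongly treating Q0's upper
  // 64 bits as preserved (Qn has no regunit of its own to model them).
  std::bitset<2> Ideal;
  Ideal.set();
  for (unsigned R = 0; R < RegUnits.size(); ++R)
    if (Preserved[R])
      for (unsigned U : RegUnits[R])
        Ideal.reset(U);

  // Conservative scheme actually implemented: non-preserved registers win.
  // Q0 sets the shared units, so they stay clobbered -- safe, at the cost of
  // missed hoists when a genuinely preserved register shares its units.
  std::bitset<2> Conservative;
  for (unsigned R = 0; R < RegUnits.size(); ++R)
    if (!Preserved[R])
      for (unsigned U : RegUnits[R])
        Conservative.set(U);

  std::printf("ideal: %s conservative: %s\n", Ideal.to_string().c_str(),
              Conservative.to_string().c_str());
}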
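Similarly, the operand arithmetic inside MayProduceHoistableCopy (AIEBaseSubtarget.cpp above) is compact enough to misread. A REG_SEQUENCE is one def followed by (value, subreg-index) pairs, so getNumOperands() - 1 - 2 * NumExternal counts the operands left over for internal pairs (two per pair). A minimal re-derivation under those assumptions, in plain C++ with toy types whose names are illustrative, not LLVM's:

// Toy restatement of the REG_SEQUENCE heuristic: hoistable iff exactly one
// source lane comes from outside the loop block and at least one from inside.
#include <cassert>

struct ToyRegSequence {
  unsigned NumOperands;       // 1 def + 2 * number of (value, subreg) pairs
  unsigned NumExternalValues; // value operands defined outside the loop block
};

bool mayProduceHoistableCopy(const ToyRegSequence &RS) {
  // Two operands per pair remain after removing the def and the external
  // pairs, so NumInternal >= 1 means "at least one internal pair".
  const int NumInternal =
      int(RS.NumOperands) - 1 - 2 * int(RS.NumExternalValues);
  return RS.NumExternalValues == 1 && NumInternal >= 1;
}

int main() {
  // %r = REG_SEQUENCE %loop_val, sub_lo, %preheader_val, sub_hi
  assert(mayProduceHoistableCopy({5, 1}));  // one external, one internal lane
  assert(!mayProduceHoistableCopy({5, 2})); // both lanes external
  assert(!mayProduceHoistableCopy({3, 1})); // single, external-only lane
}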
diff --git a/llvm/test/CodeGen/AArch64/peephole-and-tst.ll b/llvm/test/CodeGen/AArch64/peephole-and-tst.ll index 17ad2983abe9..c9a89bd7b6e4 100644 --- a/llvm/test/CodeGen/AArch64/peephole-and-tst.ll +++ b/llvm/test/CodeGen/AArch64/peephole-and-tst.ll @@ -9,6 +9,7 @@ define i32 @test_func_i32_two_uses(i32 %in, i32 %bit, i32 %mask) { ; CHECK-LABEL: test_func_i32_two_uses: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: adrp x8, :got:ptr_wrapper +; CHECK-NEXT: and w11, w2, w0 ; CHECK-NEXT: ldr x8, [x8, :got_lo12:ptr_wrapper] ; CHECK-NEXT: ldr x9, [x8] ; CHECK-NEXT: mov w8, wzr @@ -21,7 +22,6 @@ define i32 @test_func_i32_two_uses(i32 %in, i32 %bit, i32 %mask) { ; CHECK-NEXT: .LBB0_3: // %do.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ands w10, w1, w0 -; CHECK-NEXT: and w11, w2, w0 ; CHECK-NEXT: cinc w8, w8, ne ; CHECK-NEXT: cmp w10, w11 ; CHECK-NEXT: b.eq .LBB0_1 diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll index 9c522630b1cd..0fe1d291e633 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll @@ -224,78 +224,77 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; DCL-LABEL: conv2d.loop.nest: ; DCL: .p2align 4 ; DCL-NEXT: // %bb.0: // %newFuncRoot -; DCL-NEXT: paddb [sp], #192 +; DCL-NEXT: nopa ; nopb ; nopx ; mov s0, r0 +; DCL-NEXT: paddb [sp], #192; mov s1, r1 ; DCL-NEXT: st p6, [sp, #-188] // 4-byte Folded Spill ; DCL-NEXT: mov p6, sp -; DCL-NEXT: paddb [p6], #-292 +; DCL-NEXT: paddb [p6], #-292; mov s2, r6 ; DCL-NEXT: lda m0, [p6, #0]; mov p6, sp -; DCL-NEXT: paddb [p6], #-296 +; DCL-NEXT: paddb [p6], #-296; mov m5, p4 ; DCL-NEXT: lda dj0, [p6, #0]; mov p6, sp -; DCL-NEXT: paddb [p6], #-300 +; DCL-NEXT: mova dj3, #0; paddb [p6], #-300; mov s3, r6 ; DCL-NEXT: lda dn0, [p6, #0]; mov p6, sp -; DCL-NEXT: paddb [p6], #-204 +; DCL-NEXT: paddb [p6], #-204; mov dc0, dj3 ; DCL-NEXT: lda m0, [p6, #0]; mov p6, sp -; DCL-NEXT: paddb [p6], #-208; mov s0, r0 +; DCL-NEXT: paddb [p6], #-208; mov dc4, dj3 ; DCL-NEXT: lda dj0, [p6, #0]; mov p6, sp -; DCL-NEXT: paddb [p6], #-212; mov s1, r1 +; DCL-NEXT: paddb [p6], #-212; mov dc1, dj3 ; DCL-NEXT: lda dj4, [p6, #0]; mov p6, sp ; DCL-NEXT: paddb [p6], #-216; st m0, [sp, #-96] // 4-byte Folded Spill ; DCL-NEXT: lda dn0, [p6, #0]; mov p6, sp -; DCL-NEXT: mova dj3, #0; paddb [p6], #-220; st dj0, [sp, #-88] // 4-byte Folded Spill +; DCL-NEXT: paddb [p6], #-220; st dj0, [sp, #-88] // 4-byte Folded Spill ; DCL-NEXT: lda dn4, [p6, #0]; mov p6, sp -; DCL-NEXT: paddb [p6], #-228; mov dc0, dj3 +; DCL-NEXT: paddb [p6], #-228; mov dc5, dj3 ; DCL-NEXT: lda r11, [p6, #0]; mov p6, sp ; DCL-NEXT: paddb [p6], #-232; st dn0, [sp, #-92] // 4-byte Folded Spill ; DCL-NEXT: lda dj1, [p6, #0]; mov p6, sp -; DCL-NEXT: paddb [p6], #-236; mov dc4, dj3 +; DCL-NEXT: paddb [p6], #-236; mov dc2, dj3 ; DCL-NEXT: lda r12, [p6, #0]; mov p6, sp -; DCL-NEXT: paddb [p6], #-240; mov dc1, dj3 +; DCL-NEXT: paddb [p6], #-240; st p7, [sp, #-192] // 4-byte Folded Spill ; DCL-NEXT: lda dn1, [p6, #0]; mov p6, sp -; DCL-NEXT: paddb [p6], #-244; mov dc5, dj3 +; DCL-NEXT: vst wl0, [sp, #-64]; paddb [p6], #-244; mov dc6, dj3 // 32-byte Folded Spill ; DCL-NEXT: lda dn5, [p6, #0]; mov p6, sp -; DCL-NEXT: paddb [p6], #-248; st p7, [sp, #-192] // 4-byte Folded Spill -; DCL-NEXT: lda r13, [p6, #0]; mov p6, sp -; DCL-NEXT: vst wl0, [sp, #-64]; paddb [p6], #-252; mov p7, sp // 32-byte Folded Spill -; 
DCL-NEXT: lda dj2, [p6, #0]; mov p6, sp -; DCL-NEXT: vst wh0, [sp, #-32]; paddb [p6], #-256; mov dc7, dj3 // 32-byte Folded Spill +; DCL-NEXT: vst wh0, [sp, #-32]; paddb [p6], #-248; mov p7, sp // 32-byte Folded Spill +; DCL-NEXT: lda r13, [p6, #0]; paddb [p7], #-272; mov p6, sp +; DCL-NEXT: lda r25, [p7, #0]; paddb [p6], #-252; mov p7, sp +; DCL-NEXT: lda dj2, [p6, #0]; paddb [p7], #-200; mov p6, sp +; DCL-NEXT: lda m6, [p7, #0]; paddb [p6], #-256; mov dc3, dj3 ; DCL-NEXT: lda dj6, [p6, #0]; mov p6, sp -; DCL-NEXT: paddb [p6], #-260; st dc7, [sp, #-84] // 4-byte Folded Spill -; DCL-NEXT: lda dn2, [p6, #0]; paddb [p7], #-272; mov p6, sp -; DCL-NEXT: lda r25, [p7, #0]; paddb [p6], #-264; mov p7, sp -; DCL-NEXT: lda dn6, [p6, #0]; paddb [p7], #-200; mov p6, sp -; DCL-NEXT: lda m6, [p7, #0]; paddb [p6], #-268; mov dc2, dj3 +; DCL-NEXT: lda m7, [sp, #-96]; paddb [p6], #-260; mov r28, dj3 // 4-byte Folded Reload +; DCL-NEXT: lda dn2, [p6, #0]; mov p6, sp +; DCL-NEXT: lda dj7, [sp, #-88]; paddb [p6], #-264; mov dc7, dj3 // 4-byte Folded Reload +; DCL-NEXT: lda dn6, [p6, #0]; mov p6, sp +; DCL-NEXT: lda dn7, [sp, #-92]; paddb [p6], #-268; mov p7, sp // 4-byte Folded Reload ; DCL-NEXT: lda r14, [p6, #0]; mov p6, sp -; DCL-NEXT: lda m7, [sp, #-96]; paddb [p6], #-276; mov dc6, dj3 // 4-byte Folded Reload +; DCL-NEXT: paddb [p6], #-276; st dc7, [sp, #-84] // 4-byte Folded Spill ; DCL-NEXT: lda dn3, [p6, #0]; mov p6, sp -; DCL-NEXT: lda dj7, [sp, #-88]; paddb [p6], #-280; mov dc3, dj3 // 4-byte Folded Reload +; DCL-NEXT: paddb [p6], #-280; st m7, [sp, #-96] // 4-byte Folded Spill ; DCL-NEXT: lda r26, [p6, #0]; mov p6, sp -; DCL-NEXT: lda dn7, [sp, #-92]; paddb [p6], #-196; mov p7, sp // 4-byte Folded Reload -; DCL-NEXT: lda r15, [p6, #0]; paddb [p7], #-288; mov p6, sp -; DCL-NEXT: lda r27, [p7, #0]; paddb [p6], #-224; mov s2, r6 -; DCL-NEXT: lda r24, [p6, #0]; mov p6, sp -; DCL-NEXT: paddb [p6], #-284; st m7, [sp, #-96] // 4-byte Folded Spill -; DCL-NEXT: lda m4, [p6, #0]; mov r28, dj3 -; DCL-NEXT: st dj7, [sp, #-88] // 4-byte Folded Spill -; DCL-NEXT: st dn7, [sp, #-92]; movx r9, #31; mov r8, #11 // 4-byte Folded Spill +; DCL-NEXT: paddb [p6], #-196; st dj7, [sp, #-88] // 4-byte Folded Spill +; DCL-NEXT: lda r15, [p6, #0]; mov p6, sp +; DCL-NEXT: paddb [p6], #-224; st dn7, [sp, #-92] // 4-byte Folded Spill +; DCL-NEXT: lda r24, [p6, #0]; paddb [p7], #-288; mov p6, sp +; DCL-NEXT: lda r27, [p7, #0]; paddb [p6], #-284; movx r8, #11; mov dj5, r12 +; DCL-NEXT: lda m4, [p6, #0]; movx r9, #31; mov m3, r14 ; DCL-NEXT: // implicit-def: $x4 ; DCL-NEXT: // implicit-def: $x2 ; DCL-NEXT: .p2align 4 ; DCL-NEXT: .LBB0_1: // %outer.loop.header ; DCL-NEXT: // =>This Loop Header: Depth=1 ; DCL-NEXT: // Child Loop BB0_2 Depth 2 -; DCL-NEXT: vldb wl6, [p1], #32; nopxm +; DCL-NEXT: nopa ; vldb wl6, [p1], #32; nopxm ; DCL-NEXT: vldb wl3, [p0], m6; mov r0, p0 ; DCL-NEXT: vlda.ups.s32.s16 bmh0, s0, [p2, #32] -; DCL-NEXT: vldb wh6, [p1], #32 -; DCL-NEXT: vldb wh3, [p0], m6; mov m5, p4 ; DCL-NEXT: vlda.ups.s32.s16 bml0, s0, [p2], m5 -; DCL-NEXT: vldb wl8, [p1], #32 -; DCL-NEXT: vldb wl7, [p0], m6 +; DCL-NEXT: vldb wh6, [p1], #32 +; DCL-NEXT: vldb wh3, [p0], m6 ; DCL-NEXT: vlda.ups.s32.s16 bmh1, s0, [p2, #32]; mov m1, p5 ; DCL-NEXT: vlda.ups.s32.s16 bml1, s0, [p2], m1 -; DCL-NEXT: vldb.3d wh7, [p0], d0 +; DCL-NEXT: vldb wl8, [p1], #32 +; DCL-NEXT: vldb wl7, [p0], m6 ; DCL-NEXT: vlda.ups.s32.s16 bmh2, s0, [p2, #32] ; DCL-NEXT: vlda.ups.s32.s16 bml2, s0, [p2], m5 +; DCL-NEXT: vldb.3d wh7, [p0], d0 ; DCL-NEXT: vlda.ups.s32.s16 
bmh3, s0, [p2, #32]; mov m2, r15 ; DCL-NEXT: vlda.ups.s32.s16 bml3, s0, [p2], m2 ; DCL-NEXT: vlda.ups.s32.s16 bmh4, s0, [p2, #32] @@ -330,7 +329,7 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; DCL-NEXT: vldb wl10, [p1], #32; add r0, r10, #33; mov r10, p0; vmac cm3, cm3, x11, x6, r4 // Delay Slot 2 ; DCL-NEXT: vldb wh10, [p1], #32; and r10, r10, r9; vmov x8, x10; vmac cm7, cm7, x11, x8, r4 // Delay Slot 1 ; DCL-NEXT: // %bb.3: // in Loop: Header=BB0_1 Depth=1 -; DCL-NEXT: nopa ; nopb ; nopx ; vmov x11, x0 +; DCL-NEXT: nopx ; vmov x11, x0 ; DCL-NEXT: vshuffle x0, x4, x2, r3 ; DCL-NEXT: vshuffle x11, x0, x11, r8 ; DCL-NEXT: vlda wl0, [sp, #-64] // 32-byte Folded Reload @@ -352,7 +351,7 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; DCL-NEXT: vshuffle x3, x4, x2, r3; vmac cm0, cm1, x8, x1, r4 ; DCL-NEXT: st dj7, [sp, #-88] // 4-byte Folded Spill ; DCL-NEXT: vshuffle x5, x3, x0, r8; vmac cm1, cm2, x3, x1, r4 -; DCL-NEXT: lda m7, [sp, #-96]; vst.srs.s16.s32 bmh7, s2, [p3, #32]; mov s3, r6 // 4-byte Folded Reload +; DCL-NEXT: lda m7, [sp, #-96]; vst.srs.s16.s32 bmh7, s2, [p3, #32] // 4-byte Folded Reload ; DCL-NEXT: lda dc7, [sp, #-84]; vst.srs.s16.s32 bml7, s3, [p3], #64; vmac cm2, cm3, x5, x1, r4 // 4-byte Folded Reload ; DCL-NEXT: vst.srs.s16.s32 bmh0, s3, [p3, #32] ; DCL-NEXT: vst.srs.s16.s32 bml0, s3, [p3], m4; vmac cm3, cm8, x6, x10, r4 @@ -362,9 +361,9 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; DCL-NEXT: vst.srs.s16.s32 bml2, s3, [p3], m1; vmac cm5, cm5, x3, x10, r4 ; DCL-NEXT: vst.srs.s16.s32 bmh3, s3, [p3, #32] ; DCL-NEXT: vst.srs.s16.s32 bml3, s3, [p3], #64; vmac cm4, cm6, x5, x10, r4 -; DCL-NEXT: vst.srs.s16.s32 bmh8, s3, [p3, #32]; mov dj5, r12 -; DCL-NEXT: vst.srs.s16.s32 bml8, s3, [p3], m4; mov m2, r13 -; DCL-NEXT: vst.srs.s16.s32 bmh5, s3, [p3, #32]; mov m3, r14 +; DCL-NEXT: vst.srs.s16.s32 bmh8, s3, [p3, #32] +; DCL-NEXT: vst.srs.s16.s32 bml8, s3, [p3], m4 +; DCL-NEXT: vst.srs.s16.s32 bmh5, s3, [p3, #32]; mov m2, r13 ; DCL-NEXT: vst.srs.s16.s32 bml5, s3, [p3], #64; mov m1, r11 ; DCL-NEXT: padda.3d [p0], d1; vst.srs.s16.s32 bmh4, s3, [p3, #32]; mov m1, r24 ; DCL-NEXT: vst.2d.srs.s16.s32 bml4, s3, [p3], d7; add r7, r7, #-1; mov dj7, r25 @@ -387,59 +386,58 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ZOL-LABEL: conv2d.loop.nest: ; ZOL: .p2align 4 ; ZOL-NEXT: // %bb.0: // %newFuncRoot -; ZOL-NEXT: paddb [sp], #192 +; ZOL-NEXT: nopa ; nopb ; nopx ; mov s0, r0 +; ZOL-NEXT: paddb [sp], #192; mov s1, r1 ; ZOL-NEXT: st p6, [sp, #-188] // 4-byte Folded Spill ; ZOL-NEXT: mov p6, sp -; ZOL-NEXT: paddb [p6], #-292 +; ZOL-NEXT: paddb [p6], #-292; mov s2, r6 ; ZOL-NEXT: lda m0, [p6, #0]; mov p6, sp -; ZOL-NEXT: paddb [p6], #-296 +; ZOL-NEXT: paddb [p6], #-296; mov m5, p4 ; ZOL-NEXT: lda dj0, [p6, #0]; mov p6, sp -; ZOL-NEXT: paddb [p6], #-300 +; ZOL-NEXT: mova dj3, #0; paddb [p6], #-300; mov s3, r6 ; ZOL-NEXT: lda dn0, [p6, #0]; mov p6, sp -; ZOL-NEXT: paddb [p6], #-204 +; ZOL-NEXT: paddb [p6], #-204; mov dc0, dj3 ; ZOL-NEXT: lda m0, [p6, #0]; mov p6, sp -; ZOL-NEXT: paddb [p6], #-208; mov s0, r0 +; ZOL-NEXT: paddb [p6], #-208; mov dc4, dj3 ; ZOL-NEXT: lda dj0, [p6, #0]; mov p6, sp -; ZOL-NEXT: paddb [p6], #-212; mov s1, r1 +; ZOL-NEXT: paddb [p6], #-212; mov dc1, dj3 ; ZOL-NEXT: lda dj4, [p6, #0]; mov p6, sp ; ZOL-NEXT: paddb [p6], #-216; st m0, [sp, #-96] // 4-byte Folded Spill ; ZOL-NEXT: lda dn0, [p6, #0]; mov p6, sp -; ZOL-NEXT: 
mova dj3, #0; paddb [p6], #-220; st dj0, [sp, #-88] // 4-byte Folded Spill +; ZOL-NEXT: paddb [p6], #-220; st dj0, [sp, #-88] // 4-byte Folded Spill ; ZOL-NEXT: lda dn4, [p6, #0]; mov p6, sp -; ZOL-NEXT: paddb [p6], #-228; mov dc0, dj3 +; ZOL-NEXT: paddb [p6], #-228; mov dc5, dj3 ; ZOL-NEXT: lda r10, [p6, #0]; mov p6, sp ; ZOL-NEXT: paddb [p6], #-232; st dn0, [sp, #-92] // 4-byte Folded Spill ; ZOL-NEXT: lda dj1, [p6, #0]; mov p6, sp -; ZOL-NEXT: paddb [p6], #-236; mov dc4, dj3 +; ZOL-NEXT: paddb [p6], #-236; mov dc2, dj3 ; ZOL-NEXT: lda r11, [p6, #0]; mov p6, sp -; ZOL-NEXT: paddb [p6], #-240; mov dc1, dj3 +; ZOL-NEXT: paddb [p6], #-240; st p7, [sp, #-192] // 4-byte Folded Spill ; ZOL-NEXT: lda dn1, [p6, #0]; mov p6, sp -; ZOL-NEXT: paddb [p6], #-244; mov dc5, dj3 +; ZOL-NEXT: vst wl0, [sp, #-64]; paddb [p6], #-244; mov dc6, dj3 // 32-byte Folded Spill ; ZOL-NEXT: lda dn5, [p6, #0]; mov p6, sp -; ZOL-NEXT: paddb [p6], #-248; st p7, [sp, #-192] // 4-byte Folded Spill -; ZOL-NEXT: lda r12, [p6, #0]; mov p6, sp -; ZOL-NEXT: vst wl0, [sp, #-64]; paddb [p6], #-252; mov p7, sp // 32-byte Folded Spill -; ZOL-NEXT: lda dj2, [p6, #0]; mov p6, sp -; ZOL-NEXT: vst wh0, [sp, #-32]; paddb [p6], #-256; mov dc7, dj3 // 32-byte Folded Spill +; ZOL-NEXT: vst wh0, [sp, #-32]; paddb [p6], #-248; mov p7, sp // 32-byte Folded Spill +; ZOL-NEXT: lda r12, [p6, #0]; paddb [p7], #-272; mov p6, sp +; ZOL-NEXT: lda r24, [p7, #0]; paddb [p6], #-252; mov p7, sp +; ZOL-NEXT: lda dj2, [p6, #0]; paddb [p7], #-200; mov p6, sp +; ZOL-NEXT: lda m6, [p7, #0]; paddb [p6], #-256; mov dc3, dj3 ; ZOL-NEXT: lda dj6, [p6, #0]; mov p6, sp -; ZOL-NEXT: paddb [p6], #-260; st dc7, [sp, #-84] // 4-byte Folded Spill -; ZOL-NEXT: lda dn2, [p6, #0]; paddb [p7], #-272; mov p6, sp -; ZOL-NEXT: lda r24, [p7, #0]; paddb [p6], #-264; mov p7, sp -; ZOL-NEXT: lda dn6, [p6, #0]; paddb [p7], #-200; mov p6, sp -; ZOL-NEXT: lda m6, [p7, #0]; paddb [p6], #-268; mov dc2, dj3 +; ZOL-NEXT: lda m7, [sp, #-96]; paddb [p6], #-260; mov r27, dj3 // 4-byte Folded Reload +; ZOL-NEXT: lda dn2, [p6, #0]; mov p6, sp +; ZOL-NEXT: lda dj7, [sp, #-88]; paddb [p6], #-264; mov dc7, dj3 // 4-byte Folded Reload +; ZOL-NEXT: lda dn6, [p6, #0]; mov p6, sp +; ZOL-NEXT: lda dn7, [sp, #-92]; paddb [p6], #-268; mov p7, sp // 4-byte Folded Reload ; ZOL-NEXT: lda r13, [p6, #0]; mov p6, sp -; ZOL-NEXT: lda m7, [sp, #-96]; paddb [p6], #-276; mov dc6, dj3 // 4-byte Folded Reload +; ZOL-NEXT: paddb [p6], #-276; st dc7, [sp, #-84] // 4-byte Folded Spill ; ZOL-NEXT: lda dn3, [p6, #0]; mov p6, sp -; ZOL-NEXT: lda dj7, [sp, #-88]; paddb [p6], #-280; mov dc3, dj3 // 4-byte Folded Reload +; ZOL-NEXT: paddb [p6], #-280; st m7, [sp, #-96] // 4-byte Folded Spill ; ZOL-NEXT: lda r25, [p6, #0]; mov p6, sp -; ZOL-NEXT: lda dn7, [sp, #-92]; paddb [p6], #-196; mov p7, sp // 4-byte Folded Reload -; ZOL-NEXT: lda r14, [p6, #0]; paddb [p7], #-288; mov p6, sp -; ZOL-NEXT: lda r26, [p7, #0]; paddb [p6], #-224; mov s2, r6 -; ZOL-NEXT: lda r15, [p6, #0]; mov p6, sp -; ZOL-NEXT: paddb [p6], #-284; st m7, [sp, #-96] // 4-byte Folded Spill -; ZOL-NEXT: lda m4, [p6, #0]; mov r27, dj3 -; ZOL-NEXT: st dj7, [sp, #-88] // 4-byte Folded Spill -; ZOL-NEXT: st dn7, [sp, #-92]; movx r9, #31; mov r8, #11 // 4-byte Folded Spill +; ZOL-NEXT: paddb [p6], #-196; st dj7, [sp, #-88] // 4-byte Folded Spill +; ZOL-NEXT: lda r14, [p6, #0]; mov p6, sp +; ZOL-NEXT: paddb [p6], #-224; st dn7, [sp, #-92] // 4-byte Folded Spill +; ZOL-NEXT: lda r15, [p6, #0]; paddb [p7], #-288; mov p6, sp +; ZOL-NEXT: lda r26, [p7, #0]; paddb 
[p6], #-284; movx r8, #11; mov dj5, r11 +; ZOL-NEXT: lda m4, [p6, #0]; movx r9, #31; mov m3, r13 ; ZOL-NEXT: // implicit-def: $x4 ; ZOL-NEXT: // implicit-def: $x2 ; ZOL-NEXT: .p2align 4 @@ -448,21 +446,21 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ZOL-NEXT: // Child Loop BB0_2 Depth 2 ; ZOL-NEXT: vldb wl6, [p1], #32; nopa ; nops ; nopxm ; nopv ; ZOL-NEXT: vldb wl3, [p0], m6; mov r0, p0 -; ZOL-NEXT: vlda.ups.s32.s16 bmh0, s0, [p2, #32] -; ZOL-NEXT: vldb wh6, [p1], #32 -; ZOL-NEXT: vldb wh3, [p0], m6; mov m5, p4 +; ZOL-NEXT: vlda.ups.s32.s16 bmh0, s0, [p2, #32]; nopx ; ZOL-NEXT: vlda.ups.s32.s16 bml0, s0, [p2], m5 -; ZOL-NEXT: vldb wl8, [p1], #32 -; ZOL-NEXT: vldb wl7, [p0], m6 +; ZOL-NEXT: vldb wh6, [p1], #32 +; ZOL-NEXT: vldb wh3, [p0], m6 ; ZOL-NEXT: vlda.ups.s32.s16 bmh1, s0, [p2, #32]; mov m1, p5 ; ZOL-NEXT: vlda.ups.s32.s16 bml1, s0, [p2], m1 -; ZOL-NEXT: vldb wh8, [p1], #32 -; ZOL-NEXT: vldb.3d wh7, [p0], d0 +; ZOL-NEXT: vldb wl8, [p1], #32 +; ZOL-NEXT: vldb wl7, [p0], m6 ; ZOL-NEXT: vlda.ups.s32.s16 bmh2, s0, [p2, #32] ; ZOL-NEXT: vlda.ups.s32.s16 bml2, s0, [p2], m5 -; ZOL-NEXT: vldb wl1, [p1], #32 +; ZOL-NEXT: vldb wh8, [p1], #32 +; ZOL-NEXT: vldb.3d wh7, [p0], d0 ; ZOL-NEXT: vlda.ups.s32.s16 bmh3, s0, [p2, #32]; mov m2, r14 ; ZOL-NEXT: vlda.ups.s32.s16 bml3, s0, [p2], m2 +; ZOL-NEXT: vldb wl1, [p1], #32 ; ZOL-NEXT: vlda.ups.s32.s16 bmh4, s0, [p2, #32] ; ZOL-NEXT: vlda.ups.s32.s16 bml4, s0, [p2], m5 ; ZOL-NEXT: vlda.ups.s32.s16 bmh5, s0, [p2, #32] @@ -494,7 +492,7 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ZOL-NEXT: .L_LEnd0: ; ZOL-NEXT: vldb wh10, [p1], #32; nopa ; nops ; and r1, r1, r9; vmov x8, x10; vmac cm7, cm7, x11, x8, r4 ; ZOL-NEXT: // %bb.3: // in Loop: Header=BB0_1 Depth=1 -; ZOL-NEXT: nopa ; nopb ; nopx ; vmov x11, x0 +; ZOL-NEXT: nopx ; vmov x11, x0 ; ZOL-NEXT: vshuffle x0, x4, x2, r3 ; ZOL-NEXT: vshuffle x11, x0, x11, r8 ; ZOL-NEXT: vlda wl0, [sp, #-64] // 32-byte Folded Reload @@ -516,7 +514,7 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ZOL-NEXT: vshuffle x3, x4, x2, r3; vmac cm0, cm1, x8, x1, r4 ; ZOL-NEXT: st dj7, [sp, #-88] // 4-byte Folded Spill ; ZOL-NEXT: vshuffle x5, x3, x0, r8; vmac cm1, cm2, x3, x1, r4 -; ZOL-NEXT: lda m7, [sp, #-96]; vst.srs.s16.s32 bmh7, s2, [p3, #32]; mov s3, r6 // 4-byte Folded Reload +; ZOL-NEXT: lda m7, [sp, #-96]; vst.srs.s16.s32 bmh7, s2, [p3, #32] // 4-byte Folded Reload ; ZOL-NEXT: lda dc7, [sp, #-84]; vst.srs.s16.s32 bml7, s3, [p3], #64; vmac cm2, cm3, x5, x1, r4 // 4-byte Folded Reload ; ZOL-NEXT: vst.srs.s16.s32 bmh0, s3, [p3, #32] ; ZOL-NEXT: vst.srs.s16.s32 bml0, s3, [p3], m4; vmac cm3, cm8, x6, x10, r4 @@ -526,9 +524,9 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ZOL-NEXT: vst.srs.s16.s32 bml2, s3, [p3], m1; vmac cm5, cm5, x3, x10, r4 ; ZOL-NEXT: vst.srs.s16.s32 bmh3, s3, [p3, #32] ; ZOL-NEXT: vst.srs.s16.s32 bml3, s3, [p3], #64; vmac cm4, cm6, x5, x10, r4 -; ZOL-NEXT: vst.srs.s16.s32 bmh8, s3, [p3, #32]; mov dj5, r11 -; ZOL-NEXT: vst.srs.s16.s32 bml8, s3, [p3], m4; mov m2, r12 -; ZOL-NEXT: vst.srs.s16.s32 bmh5, s3, [p3, #32]; mov m3, r13 +; ZOL-NEXT: vst.srs.s16.s32 bmh8, s3, [p3, #32] +; ZOL-NEXT: vst.srs.s16.s32 bml8, s3, [p3], m4 +; ZOL-NEXT: vst.srs.s16.s32 bmh5, s3, [p3, #32]; mov m2, r12 ; ZOL-NEXT: vst.srs.s16.s32 bml5, s3, [p3], #64; mov m1, r10 ; ZOL-NEXT: padda.3d [p0], d1; vst.srs.s16.s32 bmh4, s3, [p3, #32]; mov m1, r15 ; ZOL-NEXT: 
vst.2d.srs.s16.s32 bml4, s3, [p3], d7; add r7, r7, #-1; mov dj7, r24 diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red.ll index e993495e55fa..7af1e07aa841 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red.ll @@ -37,7 +37,7 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ASM-LABEL: conv2d.loop.nest: ; ASM: .p2align 4 ; ASM-NEXT: // %bb.0: // %newFuncRoot -; ASM-NEXT: nopa ; paddb [sp], #32; nopx +; ASM-NEXT: paddb [sp], #32; nopx ; ASM-NEXT: st p6, [sp, #-28] // 4-byte Folded Spill ; ASM-NEXT: mov p6, sp ; ASM-NEXT: paddb [p6], #-132 @@ -56,13 +56,13 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ASM-NEXT: lda dn0, [p6, #0]; mov p6, sp ; ASM-NEXT: paddb [p6], #-60 ; ASM-NEXT: lda dn4, [p6, #0]; mov p6, sp -; ASM-NEXT: paddb [p6], #-68 +; ASM-NEXT: paddb [p6], #-68; mov s0, r0 ; ASM-NEXT: lda r10, [p6, #0]; mov p6, sp -; ASM-NEXT: paddb [p6], #-72; mov s0, r0 +; ASM-NEXT: paddb [p6], #-72; mov s1, r1 ; ASM-NEXT: lda dj1, [p6, #0]; mov p6, sp -; ASM-NEXT: paddb [p6], #-76; mov s1, r1 +; ASM-NEXT: paddb [p6], #-76; mov s2, r6 ; ASM-NEXT: lda r11, [p6, #0]; mov p6, sp -; ASM-NEXT: mova dj3, #0; paddb [p6], #-80; mov s2, r6 +; ASM-NEXT: mova dj3, #0; paddb [p6], #-80; mov s3, r6 ; ASM-NEXT: lda dn1, [p6, #0]; mov p6, sp ; ASM-NEXT: paddb [p6], #-84; mov dc0, dj3 ; ASM-NEXT: lda r12, [p6, #0]; mov p6, sp @@ -137,7 +137,7 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ASM-NEXT: add r1, r1, #33; vmac cm0, cm0, x5, x7, r4 // Delay Slot 1 ; ASM-NEXT: // %bb.3: // %outer.loop.latch ; ASM-NEXT: // in Loop: Header=BB0_1 Depth=1 -; ASM-NEXT: nopb ; nopa ; vst.srs.s16.s32 bmh1, s2, [p3, #32]; nopx ; mov s3, r6; nopv +; ASM-NEXT: nopb ; nopa ; vst.srs.s16.s32 bmh1, s2, [p3, #32]; nopxm ; nopv ; ASM-NEXT: vst.srs.s16.s32 bml1, s3, [p3], #64 ; ASM-NEXT: vst.srs.s16.s32 bmh2, s3, [p3, #32] ; ASM-NEXT: vst.srs.s16.s32 bml2, s3, [p3], m4 diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Mul2D.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Mul2D.ll index a88352a428e3..761dd1e918aa 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Mul2D.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Mul2D.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -O2 -mtriple=aie2 --enable-pipeliner=0 %s -o - | FileCheck %s ; ; This file is licensed under the Apache License v2.0 with LLVM Exceptions. ; See https://llvm.org/LICENSE.txt for license information. @@ -7,6 +6,7 @@ ; ; (c) Copyright 2023-2024 Advanced Micro Devices, Inc. or its affiliates +; RUN: llc -O2 -mtriple=aie2 %s -o - | FileCheck %s ; RUN: opt -mtriple=aie2 -passes=aa-eval -print-all-alias-modref-info -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AA ; A reduced example from MLLib's mul2d benchmark. 
@@ -54,14 +54,14 @@ define void @mul2d(ptr noalias %in_ptr0, ptr noalias %in_ptr1, ptr noalias %out_ ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: mova r0, #2; nopb ; extend.u16 r1, r4; nopm ; CHECK-NEXT: ltu r0, r1, r0 -; CHECK-NEXT: jnz r0, #.LBB0_4 +; CHECK-NEXT: jnz r0, #.LBB0_5 ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: nop // Delay Slot 3 ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.lr.ph -; CHECK-NEXT: nopa ; nopb ; nopx ; mov p3, sp; nops +; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; mov p3, sp; nopv ; CHECK-NEXT: paddb [p3], #-4 ; CHECK-NEXT: lda.u8 r0, [p3, #0]; mov p3, sp ; CHECK-NEXT: paddb [p3], #-8 @@ -74,36 +74,45 @@ define void @mul2d(ptr noalias %in_ptr0, ptr noalias %in_ptr1, ptr noalias %out_ ; CHECK-NEXT: lda dn4, [p3, #0]; mov p3, sp ; CHECK-NEXT: paddb [p3], #-24 ; CHECK-NEXT: lda m0, [p3, #0] -; CHECK-NEXT: extend.u8 r5, r5 -; CHECK-NEXT: mova dc0, #0; mov s0, r5 -; CHECK-NEXT: mova r3, #0; movx r2, #1; mov dc4, dc0 -; CHECK-NEXT: mova r4, #-1; ne r2, r0, r2; vbcst.8 x0, r3 -; CHECK-NEXT: mova r0, #808; lshl r1, r1, r4; mov crSRSSign, r2 -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: .LBB0_2: // %for.body -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldb wl2, [p1], #32; nopx -; CHECK-NEXT: vldb.3d wl6, [p0], d0 -; CHECK-NEXT: vldb wl4, [p1], #32 -; CHECK-NEXT: vldb.3d wl2, [p0], d0 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: vmov wh6, wl0 -; CHECK-NEXT: vmov wh2, wl0 -; CHECK-NEXT: add r1, r1, #-1; vmul cm0, x6, x2, r0 -; CHECK-NEXT: jnz r1, #.LBB0_2; vmul cm1, x2, x4, r0 -; CHECK-NEXT: nop // Delay Slot 5 +; CHECK-NEXT: mova r4, #-1 +; CHECK-NEXT: mova dc0, #0; vldb wl2, [p1], #32; lshl r1, r1, r4 +; CHECK-NEXT: vldb wl8, [p1], #32; add r1, r1, #-1; mov dc4, dc0 +; CHECK-NEXT: vldb.3d wl6, [p0], d0; jz r1, #.LBB0_4 +; CHECK-NEXT: vldb.3d wl4, [p0], d0 // Delay Slot 5 +; CHECK-NEXT: extend.u8 r5, r5 // Delay Slot 4 +; CHECK-NEXT: mova r3, #0; movx r2, #1; mov s0, r5 // Delay Slot 3 +; CHECK-NEXT: ne r2, r0, r2; vbcst.8 x0, r3 // Delay Slot 2 +; CHECK-NEXT: mova r0, #808; mov crSRSSign, r2 // Delay Slot 1 +; CHECK-NEXT: // %bb.2: +; CHECK-NEXT: nopa ; nopb ; nopx ; vmov wh6, wl0 +; CHECK-NEXT: vmov wh4, wl0 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB0_3: // %for.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldb wl2, [p1], #32; nopxm +; CHECK-NEXT: vldb.3d wl6, [p0], d0; add r1, r1, #-1; vmul cm0, x6, x2, r0 +; CHECK-NEXT: vldb.3d wl4, [p0], d0; jnz r1, #.LBB0_3; vmul cm1, x4, x8, r0 +; CHECK-NEXT: vldb wl8, [p1], #32 // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: nop // Delay Slot 3 ; CHECK-NEXT: vst.srs.d8.s32 cm0, s0, [p2], #32 // Delay Slot 2 ; CHECK-NEXT: vst.srs.d8.s32 cm1, s0, [p2], #32 // Delay Slot 1 -; CHECK-NEXT: // %bb.3: -; CHECK-NEXT: nopa ; nopxm +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vmov wh6, wl0; nopv +; CHECK-NEXT: nopa ; vmov wh4, wl0 +; CHECK-NEXT: vmul cm0, x6, x2, r0 +; CHECK-NEXT: vmul cm1, x4, x8, r0 +; CHECK-NEXT: nop +; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: mov crSRSSign, #0 +; CHECK-NEXT: vst.srs.d8.s32 cm0, s0, [p2], #32 +; CHECK-NEXT: vst.srs.d8.s32 cm1, s0, [p2], #32; mov crSRSSign, #0 ; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup +; CHECK-NEXT: .LBB0_5: // %for.cond.cleanup ; CHECK-NEXT: nopa ; ret lr ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 
diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll index 98417f324c92..bd0a860e4127 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll @@ -59,126 +59,13 @@ declare <32 x i16> @llvm.aie2.vbroadcast16.I512(i32) #0 ; Function Attrs: mustprogress noinline define dso_local void @TanhTemplated(ptr noalias %ifm, ptr noalias %ofm, ptr nonnull align 32 dereferenceable(64) %params) align 2 { -; WAW-STICKY-ON-LABEL: TanhTemplated: -; WAW-STICKY-ON: .p2align 4 -; WAW-STICKY-ON-NEXT: // %bb.0: // %for.body.lr.ph -; WAW-STICKY-ON-NEXT: nopx ; mov r9, r16 -; WAW-STICKY-ON-NEXT: movxm r3, #16512 -; WAW-STICKY-ON-NEXT: movxm r0, #16256 -; WAW-STICKY-ON-NEXT: movxm r1, #16384 -; WAW-STICKY-ON-NEXT: lda r5, [p2, #0]; movxm r2, #16128 -; WAW-STICKY-ON-NEXT: vbcst.16 x0, r1 -; WAW-STICKY-ON-NEXT: vbcst.16 x3, r0 -; WAW-STICKY-ON-NEXT: vldb wl3, [p0], #32; vbcst.16 x2, r2 -; WAW-STICKY-ON-NEXT: mova r0, #0; vconv.fp32.bf16 bmh0, wl2 -; WAW-STICKY-ON-NEXT: vbcst.16 x2, r0 -; WAW-STICKY-ON-NEXT: vconv.fp32.bf16 bmh1, wl3 -; WAW-STICKY-ON-NEXT: mova r1, #-5; vmov wh0, wl2 -; WAW-STICKY-ON-NEXT: lshl r1, r5, r1; vmov wh3, wl2 -; WAW-STICKY-ON-NEXT: mova r1, #60; vldb wl3, [p0], #32; add.nc r2, r1, #-2 -; WAW-STICKY-ON-NEXT: movxm r4, #-16256; vmul.f bmh2, x0, x3, r1 -; WAW-STICKY-ON-NEXT: movxm r6, #32767 -; WAW-STICKY-ON-NEXT: movxm r7, #15616 -; WAW-STICKY-ON-NEXT: movxm r8, #16000 -; WAW-STICKY-ON-NEXT: vbcst.16 x1, r3 -; WAW-STICKY-ON-NEXT: vbcst.16 x10, r4 -; WAW-STICKY-ON-NEXT: vconv.bf16.fp32 wl3, bmh2; vbcst.16 x8, r6; vmul.f bmh3, x0, x3, r1 -; WAW-STICKY-ON-NEXT: vbcst.16 x6, r7 -; WAW-STICKY-ON-NEXT: vmin_ge.bf16 x3, r16, x3, x1 -; WAW-STICKY-ON-NEXT: vmax_lt.bf16 x3, r16, x3, x10 -; WAW-STICKY-ON-NEXT: vmov wh3, wl2 -; WAW-STICKY-ON-NEXT: vmov wh6, wl2 -; WAW-STICKY-ON-NEXT: vconv.bf16.fp32 wl5, bmh3; vband x7, x8, x3 -; WAW-STICKY-ON-NEXT: vldb wl7, [p0], #32; vmov wh7, wl2 -; WAW-STICKY-ON-NEXT: vmin_ge.bf16 x5, r16, x5, x1 -; WAW-STICKY-ON-NEXT: vmax_lt.bf16 x5, r16, x5, x10 -; WAW-STICKY-ON-NEXT: vldb wl7, [p0], #32; vband x7, x8, x5 -; WAW-STICKY-ON-NEXT: vmov wh7, wl2; vmul.f bmh2, x6, x7, r1 -; WAW-STICKY-ON-NEXT: vbcst.16 x4, r8 -; WAW-STICKY-ON-NEXT: vmov wh4, wl2; vmul.f bmh4, x6, x7, r1 -; WAW-STICKY-ON-NEXT: vmov wh5, wl2; vmul.f bmh5, x0, x7, r1 -; WAW-STICKY-ON-NEXT: vmac.f bmh3, bmh0, x3, x4, r1 -; WAW-STICKY-ON-NEXT: movxm ls, #.LBB0_1; vmac.f bmh6, bmh0, x5, x4, r1 -; WAW-STICKY-ON-NEXT: vconv.bf16.fp32 wl7, bmh2; movxm le, #.L_LEnd0; vmul.f bmh7, x0, x7, r1 -; WAW-STICKY-ON-NEXT: add.nc lc, r2, #0 -; WAW-STICKY-ON-NEXT: nopb ; nopa ; vconv.bf16.fp32 wl3, bmh4; nopxm ; vmsc.f bmh3, bmh3, x7, x3, r1 -; WAW-STICKY-ON-NEXT: nopb ; nopa ; vconv.bf16.fp32 wl3, bmh5; nopxm ; nopv -; WAW-STICKY-ON-NEXT: nopb ; nopa ; nops ; nopxm ; vmsc.f bmh2, bmh6, x3, x5, r1 -; WAW-STICKY-ON-NEXT: nopb ; nopa ; nops ; nopx ; vmin_ge.bf16 x3, r16, x3, x1; nopv -; WAW-STICKY-ON-NEXT: nopb ; nopa ; vconv.bf16.fp32 wl5, bmh7; nopx ; vmax_lt.bf16 x3, r16, x3, x10; nopv -; WAW-STICKY-ON-NEXT: nopb ; nopa ; nops ; nopx ; vmov wh3, wl2; nopv -; WAW-STICKY-ON-NEXT: nopb ; mova r0, #28; vconv.bf16.fp32 wl7, bmh3; nopx ; vmin_ge.bf16 x5, r16, x5, x1; nopv -; WAW-STICKY-ON-NEXT: .p2align 4 -; WAW-STICKY-ON-NEXT: .LBB0_1: // %for.body -; WAW-STICKY-ON-NEXT: // =>This Inner Loop Header: Depth=1 -; WAW-STICKY-ON-NEXT: nopa ; nopb ; nopx ; vband x9, x8, x3; 
nops -; WAW-STICKY-ON-NEXT: vmov wh6, wl2 -; WAW-STICKY-ON-NEXT: vmax_lt.bf16 x5, r16, x5, x10 -; WAW-STICKY-ON-NEXT: vconv.bf16.fp32 wl7, bmh2; vmov wh7, wl2 -; WAW-STICKY-ON-NEXT: vldb wl7, [p0], #32; vmov wh9, wl2; vmul.f bmh6, x7, x0, r1 -; WAW-STICKY-ON-NEXT: vmov wh5, wl2; vmul.f bmh2, x7, x0, r1 -; WAW-STICKY-ON-NEXT: nop -; WAW-STICKY-ON-NEXT: vldb wl7, [p0], #32; vband x9, x8, x5; vmul.f bmh3, x6, x9, r1 -; WAW-STICKY-ON-NEXT: vmov wh9, wl2; vsub.f bmh6, bmh6, bmh1, r0 -; WAW-STICKY-ON-NEXT: vsub.f bml0, bmh2, bmh1, r0 -; WAW-STICKY-ON-NEXT: vmov wh4, wl2; vmul.f bmh4, x6, x9, r1 -; WAW-STICKY-ON-NEXT: vmul.f bmh7, x0, x7, r1 -; WAW-STICKY-ON-NEXT: vmac.f bmh5, bmh0, x3, x4, r1 -; WAW-STICKY-ON-NEXT: vmac.f bmh2, bmh0, x5, x4, r1 -; WAW-STICKY-ON-NEXT: vconv.bf16.fp32 wl7, bmh3; vmul.f bmh8, x0, x7, r1 -; WAW-STICKY-ON-NEXT: nop -; WAW-STICKY-ON-NEXT: vconv.bf16.fp32 wl3, bmh4; vmov wh3, wl2; vmsc.f bmh3, bmh5, x7, x3, r1 -; WAW-STICKY-ON-NEXT: vconv.bf16.fp32 wl3, bmh7 -; WAW-STICKY-ON-NEXT: vmsc.f bmh2, bmh2, x3, x5, r1 -; WAW-STICKY-ON-NEXT: vst.conv.bf16.fp32 bmh6, [p1], #32; vmin_ge.bf16 x3, r16, x3, x1 -; WAW-STICKY-ON-NEXT: vconv.bf16.fp32 wl5, bmh8; vmax_lt.bf16 x3, r16, x3, x10 -; WAW-STICKY-ON-NEXT: vst.conv.bf16.fp32 bml0, [p1], #32; vmov wh3, wl2 -; WAW-STICKY-ON-NEXT: .L_LEnd0: -; WAW-STICKY-ON-NEXT: nopb ; nopa ; vconv.bf16.fp32 wl7, bmh3; nopx ; vmin_ge.bf16 x5, r16, x5, x1; nopv -; WAW-STICKY-ON-NEXT: // %bb.2: -; WAW-STICKY-ON-NEXT: nopx -; WAW-STICKY-ON-NEXT: vconv.bf16.fp32 wl1, bmh2; vmov wh1, wl2 -; WAW-STICKY-ON-NEXT: vmov wh6, wl2; vmul.f bmh3, x7, x0, r1 -; WAW-STICKY-ON-NEXT: vmax_lt.bf16 x10, r16, x5, x10; vmul.f bmh2, x1, x0, r1 -; WAW-STICKY-ON-NEXT: vband x1, x8, x3 -; WAW-STICKY-ON-NEXT: vband x8, x8, x10 -; WAW-STICKY-ON-NEXT: vmov wh1, wl2; vsub.f bmh3, bmh3, bmh1, r0 -; WAW-STICKY-ON-NEXT: vmov wh8, wl2; vsub.f bmh2, bmh2, bmh1, r0 -; WAW-STICKY-ON-NEXT: vmul.f bmh2, x6, x1, r1 -; WAW-STICKY-ON-NEXT: vmul.f bmh3, x6, x8, r1 -; WAW-STICKY-ON-NEXT: vmov wh4, wl2 -; WAW-STICKY-ON-NEXT: vmov wh10, wl2 -; WAW-STICKY-ON-NEXT: vst.conv.bf16.fp32 bmh3, [p1], #32; vmac.f bmh4, bmh0, x3, x4, r1 -; WAW-STICKY-ON-NEXT: vst.conv.bf16.fp32 bmh2, [p1], #32; vmac.f bmh0, bmh0, x10, x4, r1 -; WAW-STICKY-ON-NEXT: vconv.bf16.fp32 wl4, bmh2 -; WAW-STICKY-ON-NEXT: vconv.bf16.fp32 wl4, bmh3 -; WAW-STICKY-ON-NEXT: vmsc.f bmh2, bmh4, x4, x3, r1 -; WAW-STICKY-ON-NEXT: vmsc.f bmh0, bmh0, x4, x10, r1 -; WAW-STICKY-ON-NEXT: nop -; WAW-STICKY-ON-NEXT: nop -; WAW-STICKY-ON-NEXT: nop -; WAW-STICKY-ON-NEXT: nop -; WAW-STICKY-ON-NEXT: vconv.bf16.fp32 wl4, bmh2 -; WAW-STICKY-ON-NEXT: vconv.bf16.fp32 wl4, bmh0 -; WAW-STICKY-ON-NEXT: vmul.f bmh2, x4, x0, r1 -; WAW-STICKY-ON-NEXT: vmul.f bmh0, x4, x0, r1 -; WAW-STICKY-ON-NEXT: nop -; WAW-STICKY-ON-NEXT: nop -; WAW-STICKY-ON-NEXT: vsub.f bmh2, bmh2, bmh1, r0 -; WAW-STICKY-ON-NEXT: vsub.f bmh0, bmh0, bmh1, r0 -; WAW-STICKY-ON-NEXT: nop -; WAW-STICKY-ON-NEXT: nop -; WAW-STICKY-ON-NEXT: ret lr -; WAW-STICKY-ON-NEXT: nop // Delay Slot 5 -; WAW-STICKY-ON-NEXT: vst.conv.bf16.fp32 bmh2, [p1], #32 // Delay Slot 4 -; WAW-STICKY-ON-NEXT: vst.conv.bf16.fp32 bmh0, [p1], #32 // Delay Slot 3 -; WAW-STICKY-ON-NEXT: nop // Delay Slot 2 -; WAW-STICKY-ON-NEXT: mov r16, r9 // Delay Slot 1 ; CHECK-LABEL: TanhTemplated: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %for.body.lr.ph -; CHECK-NEXT: nopa ; nopb ; nopx ; mov r8, r16; nops +; CHECK-NEXT: nopa ; mov r8, r16 ; CHECK-NEXT: movxm r3, #16512 +; CHECK-NEXT: movxm r4, #-16256 +; CHECK-NEXT: movxm r5, 
#32767 ; CHECK-NEXT: movxm r0, #16256 ; CHECK-NEXT: movxm r1, #16384 ; CHECK-NEXT: lda r0, [p2, #0]; movxm r2, #16128 @@ -187,71 +74,66 @@ define dso_local void @TanhTemplated(ptr noalias %ifm, ptr noalias %ofm, ptr non ; CHECK-NEXT: vbcst.16 x2, r2 ; CHECK-NEXT: mova r1, #0; vconv.fp32.bf16 bmh0, wl2 ; CHECK-NEXT: vbcst.16 x2, r1 -; CHECK-NEXT: vmov wh0, wl2 +; CHECK-NEXT: vldb wl3, [p0], #32; vmov wh0, wl2 ; CHECK-NEXT: mova r1, #-5; vmov wh3, wl2 -; CHECK-NEXT: mova r1, #60; vldb wl3, [p0], #32; lshl r2, r0, r1; vconv.fp32.bf16 bmh1, wl3 -; CHECK-NEXT: movxm r4, #-16256; vmul.f bmh2, x0, x3, r1 -; CHECK-NEXT: movxm r5, #32767 -; CHECK-NEXT: movxm r6, #15616 +; CHECK-NEXT: mova r1, #60; lshl r2, r0, r1; vconv.fp32.bf16 bmh1, wl3 +; CHECK-NEXT: movxm r6, #15616; vmul.f bmh2, x0, x3, r1 ; CHECK-NEXT: movxm r7, #16000 ; CHECK-NEXT: vbcst.16 x1, r3 ; CHECK-NEXT: vbcst.16 x10, r4 -; CHECK-NEXT: vconv.bf16.fp32 wl3, bmh2; vbcst.16 x8, r5; vmul.f bmh3, x0, x3, r1 +; CHECK-NEXT: vbcst.16 x8, r5; vmul.f bmh3, x0, x3, r1 ; CHECK-NEXT: vbcst.16 x6, r6 +; CHECK-NEXT: vconv.bf16.fp32 wl3, bmh2; vbcst.16 x4, r7 +; CHECK-NEXT: vmov wh6, wl2 ; CHECK-NEXT: vmin_ge.bf16 x3, r16, x3, x1 ; CHECK-NEXT: vmax_lt.bf16 x3, r16, x3, x10 -; CHECK-NEXT: vmov wh3, wl2 -; CHECK-NEXT: vmov wh6, wl2 ; CHECK-NEXT: vconv.bf16.fp32 wl5, bmh3; vband x7, x8, x3 -; CHECK-NEXT: vldb wl7, [p0], #32; vmov wh7, wl2 +; CHECK-NEXT: vmov wh7, wl2 ; CHECK-NEXT: vmin_ge.bf16 x5, r16, x5, x1 -; CHECK-NEXT: vmax_lt.bf16 x5, r16, x5, x10 -; CHECK-NEXT: vldb wl7, [p0], #32; vband x7, x8, x5 -; CHECK-NEXT: vmov wh7, wl2; vmul.f bmh2, x6, x7, r1 -; CHECK-NEXT: vbcst.16 x4, r7 -; CHECK-NEXT: vmov wh4, wl2; vmul.f bmh4, x6, x7, r1 -; CHECK-NEXT: vmov wh5, wl2; vmul.f bmh5, x0, x7, r1 -; CHECK-NEXT: vmac.f bmh3, bmh0, x3, x4, r1 +; CHECK-NEXT: vldb wl7, [p0], #32; vmax_lt.bf16 x5, r16, x5, x10 +; CHECK-NEXT: vband x7, x8, x5 +; CHECK-NEXT: vldb wl7, [p0], #32; vmov wh7, wl2; vmul.f bmh2, x6, x7, r1 +; CHECK-NEXT: vmov wh4, wl2 +; CHECK-NEXT: vmov wh3, wl2; vmul.f bmh4, x6, x7, r1 +; CHECK-NEXT: nop +; CHECK-NEXT: vmov wh5, wl2; vmac.f bmh3, bmh0, x3, x4, r1 +; CHECK-NEXT: vmul.f bmh5, x0, x7, r1 ; CHECK-NEXT: movxm ls, #.LBB0_1; vmac.f bmh6, bmh0, x5, x4, r1 ; CHECK-NEXT: vconv.bf16.fp32 wl7, bmh2; movxm le, #.L_LEnd0; vmul.f bmh7, x0, x7, r1 -; CHECK-NEXT: add.nc lc, r2, #-2 -; CHECK-NEXT: nopb ; nopa ; vconv.bf16.fp32 wl3, bmh4; nopxm ; vmsc.f bmh3, bmh3, x7, x3, r1 -; CHECK-NEXT: nopb ; nopa ; vconv.bf16.fp32 wl3, bmh5; nopxm ; nopv +; CHECK-NEXT: vconv.bf16.fp32 wl3, bmh4; add.nc lc, r2, #-2 +; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmsc.f bmh3, bmh3, x7, x3, r1 ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmsc.f bml4, bmh6, x3, x5, r1 -; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vmin_ge.bf16 x3, r16, x3, x1; nopv -; CHECK-NEXT: nopb ; nopa ; vconv.bf16.fp32 wl5, bmh7; nopx ; vmax_lt.bf16 x3, r16, x3, x10; nopv -; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vmov wh3, wl2; nopv +; CHECK-NEXT: nopb ; nopa ; vconv.bf16.fp32 wl3, bmh5; nopxm ; nopv +; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv +; CHECK-NEXT: nopb ; nopa ; vconv.bf16.fp32 wl5, bmh7; nopx ; vmin_ge.bf16 x3, r16, x3, x1; nopv +; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vmax_lt.bf16 x3, r16, x3, x10; nopv ; CHECK-NEXT: nopb ; mova r0, #28; vconv.bf16.fp32 wl7, bmh3; nopx ; vmin_ge.bf16 x5, r16, x5, x1; nopv ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vband x9, x8, x3 -; CHECK-NEXT: vmov wh6, wl2 +; 
CHECK-NEXT: nopa ; nopb ; nopx ; vband x9, x8, x3; nops ; CHECK-NEXT: vmax_lt.bf16 x5, r16, x5, x10 -; CHECK-NEXT: vldb wl7, [p0], #32; vmov wh4, wl2 -; CHECK-NEXT: vmov wh7, wl2 +; CHECK-NEXT: vconv.bf16.fp32 wl7, bml4; vldb wl7, [p0], #32; vmov wh3, wl2 ; CHECK-NEXT: vmov wh9, wl2; vmul.f bmh6, x7, x0, r1 -; CHECK-NEXT: vconv.bf16.fp32 wl7, bml4; vldb wl7, [p0], #32; vmov wh5, wl2; vmac.f bmh5, bmh0, x3, x4, r1 -; CHECK-NEXT: vmul.f bmh3, x6, x9, r1 -; CHECK-NEXT: vband x9, x8, x5; vmul.f bmh2, x7, x0, r1 -; CHECK-NEXT: vmov wh9, wl2; vsub.f bml1, bmh6, bmh1, r0 -; CHECK-NEXT: vmul.f bmh7, x0, x7, r1 +; CHECK-NEXT: vldb wl7, [p0], #32; vband x9, x8, x5; vmul.f bmh2, x7, x0, r1 +; CHECK-NEXT: vmov wh9, wl2; vmul.f bmh3, x6, x9, r1 +; CHECK-NEXT: vmac.f bmh5, bmh0, x3, x4, r1 ; CHECK-NEXT: vmul.f bmh4, x6, x9, r1 -; CHECK-NEXT: vsub.f bml0, bmh2, bmh1, r0 -; CHECK-NEXT: vconv.bf16.fp32 wl7, bmh3; vmul.f bmh8, x0, x7, r1 +; CHECK-NEXT: vmov wh5, wl2; vsub.f bml1, bmh6, bmh1, r0 +; CHECK-NEXT: vmul.f bmh7, x0, x7, r1 ; CHECK-NEXT: vmac.f bml2, bmh0, x5, x4, r1 -; CHECK-NEXT: vmsc.f bml3, bmh5, x7, x3, r1 -; CHECK-NEXT: vconv.bf16.fp32 wl11, bmh7 -; CHECK-NEXT: vconv.bf16.fp32 wl3, bmh4; vmov wh3, wl2 -; CHECK-NEXT: vst.conv.bf16.fp32 bml1, [p1], #32; vmin_ge.bf16 x3, r16, x11, x1 -; CHECK-NEXT: vconv.bf16.fp32 wl5, bmh8; vmax_lt.bf16 x3, r16, x3, x10; vmsc.f bml4, bml2, x3, x5, r1 -; CHECK-NEXT: vst.conv.bf16.fp32 bml0, [p1], #32; vmov wh3, wl2 +; CHECK-NEXT: vconv.bf16.fp32 wl7, bmh3; vmul.f bmh8, x0, x7, r1 +; CHECK-NEXT: vsub.f bml0, bmh2, bmh1, r0 +; CHECK-NEXT: vconv.bf16.fp32 wl3, bmh4; vmsc.f bml3, bmh5, x7, x3, r1 +; CHECK-NEXT: nop +; CHECK-NEXT: vconv.bf16.fp32 wl11, bmh7; vmsc.f bml4, bml2, x3, x5, r1 +; CHECK-NEXT: vst.conv.bf16.fp32 bml1, [p1], #32 +; CHECK-NEXT: vconv.bf16.fp32 wl5, bmh8; vmin_ge.bf16 x3, r16, x11, x1 +; CHECK-NEXT: vst.conv.bf16.fp32 bml0, [p1], #32; vmax_lt.bf16 x3, r16, x3, x10 ; CHECK-NEXT: .L_LEnd0: ; CHECK-NEXT: nopb ; nopa ; vconv.bf16.fp32 wl7, bml3; nopx ; vmin_ge.bf16 x5, r16, x5, x1; nopv ; CHECK-NEXT: // %bb.2: -; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv -; CHECK-NEXT: nop -; CHECK-NEXT: nop +; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vmov wh7, wl2; nopv ; CHECK-NEXT: vconv.bf16.fp32 wl1, bml4; vmov wh1, wl2 ; CHECK-NEXT: vmov wh6, wl2; vmul.f bmh3, x7, x0, r1 ; CHECK-NEXT: vmax_lt.bf16 x10, r16, x5, x10; vmul.f bmh2, x1, x0, r1 @@ -260,8 +142,8 @@ define dso_local void @TanhTemplated(ptr noalias %ifm, ptr noalias %ofm, ptr non ; CHECK-NEXT: vmov wh1, wl2; vsub.f bmh3, bmh3, bmh1, r0 ; CHECK-NEXT: vmov wh8, wl2; vsub.f bmh2, bmh2, bmh1, r0 ; CHECK-NEXT: vmul.f bmh2, x6, x1, r1 -; CHECK-NEXT: vmul.f bmh3, x6, x8, r1 -; CHECK-NEXT: vmov wh4, wl2 +; CHECK-NEXT: vmov wh4, wl2; vmul.f bmh3, x6, x8, r1 +; CHECK-NEXT: vmov wh3, wl2 ; CHECK-NEXT: vmov wh10, wl2 ; CHECK-NEXT: vst.conv.bf16.fp32 bmh3, [p1], #32; vmac.f bmh4, bmh0, x3, x4, r1 ; CHECK-NEXT: vst.conv.bf16.fp32 bmh2, [p1], #32; vmac.f bmh0, bmh0, x10, x4, r1 diff --git a/llvm/test/CodeGen/AIE/aie2/hoist_s_reg_copy.mir b/llvm/test/CodeGen/AIE/aie2/hoist_s_reg_copy.mir index 261d32206389..c5ee29da91e8 100644 --- a/llvm/test/CodeGen/AIE/aie2/hoist_s_reg_copy.mir +++ b/llvm/test/CodeGen/AIE/aie2/hoist_s_reg_copy.mir @@ -5,7 +5,7 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # # (c) Copyright 2023-2024 Advanced Micro Devices, Inc. 
or its affiliates -# RUN: llc --mtriple=aie2 --run-pass=machinelicm %s -o - | FileCheck %s +# RUN: llc --mtriple=aie2 --run-pass=early-machinelicm %s -o - | FileCheck %s # %24:mss = COPY %20 can be safely hoisted, it doesn't increase the register # pressure beyond any threshold. diff --git a/llvm/test/CodeGen/AIE/aie2/hoist_subreg_copy_postra.mir b/llvm/test/CodeGen/AIE/aie2/hoist_subreg_copy_postra.mir new file mode 100644 index 000000000000..9f921ed7b9bf --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/hoist_subreg_copy_postra.mir @@ -0,0 +1,247 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +# RUN: llc --mtriple=aie2 --run-pass=machinelicm -verify-machineinstrs %s -o - | FileCheck %s + +--- +name: constant_vmov +alignment: 16 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: constant_vmov + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $p0, $p1, $p2, $r0, $r1, $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x2 = VBCST_32 $r0 + ; CHECK-NEXT: $wh0 = VMOV_mv_w $wl2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $wl2, $p0, $p1, $p2, $r0, $r1, $s0, $wh0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $wl0, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm_imm $p0, 32 + ; CHECK-NEXT: $wl1, $p1 = VLDA_dmw_lda_w_ag_pstm_nrm_imm $p1, 32 + ; CHECK-NEXT: $cm0 = VMUL_vmac_cm_core_dense $x0, $x1, $r0 + ; CHECK-NEXT: $p2 = VST_SRS_S8_S32_ag_pstm_nrm_imm $p2, 32, $cm0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; CHECK-NEXT: PseudoJZ $r1, %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: PseudoRET implicit $lr + bb.1: + liveins: $p0, $p1, $p2, $r0, $r1, $s0 + $x2 = VBCST_32 $r0 + + bb.3: + liveins: $wl2, $p0, $p1, $p2, $r0, $r1, $s0 + $wl0, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm_imm $p0, 32 + $wh0 = VMOV_mv_w $wl2 + $wl1, $p1 = VLDA_dmw_lda_w_ag_pstm_nrm_imm $p1, 32 + $cm0 = VMUL_vmac_cm_core_dense $x0, $x1, $r0 + $p2 = VST_SRS_S8_S32_ag_pstm_nrm_imm $p2, 32, $cm0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + PseudoJZ $r1, %bb.3 + + bb.2: + PseudoRET implicit $lr +... 
+ + +# $wh0 = VMOV_mv_w $wl2 cannot be hoisted due to the clobber from csr_aie2 +--- +name: regmask_clobber +alignment: 16 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: regmask_clobber + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $p0, $p1, $p2, $r0, $r16, $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x2 = VBCST_32 $r0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $wl2, $p0, $p1, $p2, $r0, $r16, $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $wl0, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm_imm $p0, 32 + ; CHECK-NEXT: $wh0 = VMOV_mv_w $wl2 + ; CHECK-NEXT: $wl1, $p1 = VLDA_dmw_lda_w_ag_pstm_nrm_imm $p1, 32 + ; CHECK-NEXT: $cm0 = VMUL_vmac_cm_core_dense $x0, $x1, $r0 + ; CHECK-NEXT: $p2 = VST_SRS_S8_S32_ag_pstm_nrm_imm $p2, 32, $cm0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; CHECK-NEXT: PseudoJL 32, csr_aie2, implicit-def $lr + ; CHECK-NEXT: PseudoJZ $r16, %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: PseudoRET implicit $lr + bb.1: + liveins: $p0, $p1, $p2, $r0, $r16, $s0 + $x2 = VBCST_32 $r0 + + bb.3: + liveins: $wl2, $p0, $p1, $p2, $r0, $r16, $s0 + $wl0, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm_imm $p0, 32 + $wh0 = VMOV_mv_w $wl2 + $wl1, $p1 = VLDA_dmw_lda_w_ag_pstm_nrm_imm $p1, 32 + $cm0 = VMUL_vmac_cm_core_dense $x0, $x1, $r0 + $p2 = VST_SRS_S8_S32_ag_pstm_nrm_imm $p2, 32, $cm0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + PseudoJL 32, implicit-def $lr, csr_aie2 + PseudoJZ $r16, %bb.3 + + bb.2: + PseudoRET implicit $lr +... + +# Simulate a 2-stage SW pipeline on the previous example. +# The loads are in the first stage, while the VMOV to fill the hi part of x0 is +# in the second stage. The load of wl0 makes x0 livein with a lane mask. +# We should still be able to hoist the VMOV, because it doesn't collide with the +# lanes of x0 that are livein. 
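+#
+# A hedged sketch of the regunit check this case exercises (illustrative only,
+# not necessarily the exact code of the pass): wl0 and wh0 cover disjoint
+# register units of x0, so scanning the regunits of the hoisted def finds no
+# overlap with the units live into the loop. `LiveInRUs` is a hypothetical
+# BitVector of live-in register units:
+#
+#   for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) // Reg = $wh0
+#     if (LiveInRUs.test(*RUI))
+#       return false; // def overlaps a live-in unit, don't hoist
+#   return true;      // def only writes units that are dead in the loop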
+--- +name: constant_vmov_sublane +alignment: 16 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: constant_vmov_sublane + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $p0, $p1, $p2, $r0, $r1, $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x2 = VBCST_32 $r0 + ; CHECK-NEXT: $wl0, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm_imm $p0, 32 + ; CHECK-NEXT: $wl1, $p1 = VLDA_dmw_lda_w_ag_pstm_nrm_imm $p1, 32 + ; CHECK-NEXT: $wh0 = VMOV_mv_w $wl2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $x0:0x0000000000000002, $x1:0x0000000000000002, $x2:0x0000000000000002, $p0, $p1, $p2, $r0, $r1, $s0, $wh0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $cm0 = VMUL_vmac_cm_core_dense $x0, $x1, $r0 + ; CHECK-NEXT: $p2 = VST_SRS_S8_S32_ag_pstm_nrm_imm $p2, 32, $cm0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; CHECK-NEXT: $wl0, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm_imm $p0, 32 + ; CHECK-NEXT: $wl1, $p1 = VLDA_dmw_lda_w_ag_pstm_nrm_imm $p1, 32 + ; CHECK-NEXT: PseudoJZ $r1, %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: liveins: $x0:0x0000000000000002, $x1:0x0000000000000002, $x2:0x0000000000000002, $p0, $p1, $p2, $r0, $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $wh0 = VMOV_mv_w $wl2 + ; CHECK-NEXT: $cm0 = VMUL_vmac_cm_core_dense $x0, $x1, $r0 + ; CHECK-NEXT: $p2 = VST_SRS_S8_S32_ag_pstm_nrm_imm $p2, 32, $cm0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + ; CHECK-NEXT: PseudoRET implicit $lr + bb.1: + liveins: $p0, $p1, $p2, $r0, $r1, $s0 + $x2 = VBCST_32 $r0 + $wl0, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm_imm $p0, 32 + $wl1, $p1 = VLDA_dmw_lda_w_ag_pstm_nrm_imm $p1, 32 + + bb.3: + liveins: $x0:0x02, $x1:0x02, $x2:0x02, $p0, $p1, $p2, $r0, $r1, $s0 + $wh0 = VMOV_mv_w $wl2 + $cm0 = VMUL_vmac_cm_core_dense $x0, $x1, $r0 + $p2 = VST_SRS_S8_S32_ag_pstm_nrm_imm $p2, 32, $cm0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + $wl0, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm_imm $p0, 32 + $wl1, $p1 = VLDA_dmw_lda_w_ag_pstm_nrm_imm $p1, 32 + PseudoJZ $r1, %bb.3 + + bb.2: + liveins: $x0:0x02, $x1:0x02, $x2:0x02, $p0, $p1, $p2, $r0, $s0 + $wh0 = VMOV_mv_w $wl2 + $cm0 = VMUL_vmac_cm_core_dense $x0, $x1, $r0 + $p2 = VST_SRS_S8_S32_ag_pstm_nrm_imm $p2, 32, $cm0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd + PseudoRET implicit $lr +... 
+ +# wh3 is re-defined in the loop through the definition of x3 +# Make sure it's not hoisted +--- +name: non_constant_vmov +alignment: 16 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: non_constant_vmov + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $p0, $p1, $p2, $r0, $r1, $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x2 = VBCST_32 $r0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $wl2, $p0, $p1, $p2, $r0, $r1, $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $wl0, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm_imm $p0, 32 + ; CHECK-NEXT: $wl1, $p1 = VLDA_dmw_lda_w_ag_pstm_nrm_imm $p1, 32 + ; CHECK-NEXT: $x3 = VADD_32 $x0, $x1 + ; CHECK-NEXT: $wh3 = VMOV_mv_w $wl2 + ; CHECK-NEXT: $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm $wl3, $p2, 32 + ; CHECK-NEXT: $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm $wh3, $p2, 32 + ; CHECK-NEXT: PseudoJZ $r1, %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: PseudoRET implicit $lr + bb.1: + liveins: $p0, $p1, $p2, $r0, $r1, $s0 + $x2 = VBCST_32 $r0 + + bb.3: + liveins: $wl2, $p0, $p1, $p2, $r0, $r1, $s0 + $wl0, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm_imm $p0, 32 + $wl1, $p1 = VLDA_dmw_lda_w_ag_pstm_nrm_imm $p1, 32 + $x3 = VADD_32 $x0, $x1 + $wh3 = VMOV_mv_w $wl2 + $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm $wl3, $p2, 32 + $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm $wh3, $p2, 32 + PseudoJZ $r1, %bb.3 + + bb.2: + PseudoRET implicit $lr +... + +# Here, x6.lo (wl6) is livein for the loop through $x6:0x02, but it's also re-defined. +# Check $wl6 = VMOV_mv_w ... isn't hoisted. +--- +name: overwrite_live_lane +alignment: 16 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: overwrite_live_lane + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $p0, $p1, $p2, $r0, $r1, $s0, $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x2 = VBCST_32 $r0 + ; CHECK-NEXT: $wl6 = VMOV_mv_w $wl2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $x0, $x2:0x0000000000000002, $x6:0x0000000000000002, $p0, $r1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $wh6, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm_imm $p0, 32 + ; CHECK-NEXT: $x0 = VADD_32 $x0, $x6 + ; CHECK-NEXT: $wl6 = VMOV_mv_w $wl2 + ; CHECK-NEXT: PseudoJZ $r1, %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + bb.1: + liveins: $p0, $p1, $p2, $r0, $r1, $s0, $x0 + $x2 = VBCST_32 $r0 + $wl6 = VMOV_mv_w $wl2 + + bb.3: + liveins: $x0, $x2:0x02, $x6:0x02, $p0, $r1 + $wh6, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm_imm $p0, 32 + $x0 = VADD_32 $x0, $x6 + $wl6 = VMOV_mv_w $wl2 + PseudoJZ $r1, %bb.3 + + bb.2: + liveins: $x0 + PseudoRET implicit $lr, implicit $x0 +... diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/swp/swp-regsequence.mir b/llvm/test/CodeGen/AIE/aie2/schedule/swp/swp-regsequence.mir new file mode 100644 index 000000000000..85c07fbdaec3 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/swp/swp-regsequence.mir @@ -0,0 +1,216 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. 
or its affiliates + +# RUN: llc --mtriple=aie2 --run-pass=pipeliner %s -o - \ +# RUN: --aie-loop-min-tripcount=2 --aie-pipeliner-max-stagecount=2 --aie-postpipeliner-limit=1 | FileCheck %s + +--- | + + define void @vmul_256(ptr noalias %out, ptr noalias readonly %in1, ptr noalias readonly %in2) { + entry: + ret void + } + define void @vmul_512(ptr noalias %out, ptr noalias readonly %in1, ptr noalias readonly %in2) { + entry: + ret void + } +... + +# Here we verify in which stage REG_SEQUENCE is placed after SW pipelining. +# This is mostly driven by the PropagateIncomingLatencies DAGMutator. + +# When we only load 256 bits into a 512-bit vector, and the rest comes from an +# external source to fill the hi bits, we would like the REG_SEQUENCE to be +# placed close to its consumer to avoid making the whole 512-bit register +# loop-carried and to facilitate LICM for the constant part. +# When the whole 512-bit register is defined in the loop, we would like to keep +# the REG_SEQUENCE close to its sources to facilitate register coalescing and +# "tie" the sources together. + +# We expect the REG_SEQUENCE for the load of %ir.in1 to be in the second stage, close to +# its VMUL consumer. We don't want the VMUL in the steady state to read a PHI node, but +# rather the REG_SEQUENCE. +--- +name: vmul_256 +alignment: 16 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: vmul_256 + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: liveins: $p0, $p1, $p2, $r0, $r1, $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:er = COPY $r0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:er = COPY $r1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:mss = COPY $s0 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:ep = COPY $p0 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:ep = COPY $p1 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:ep = COPY $p2 + ; CHECK-NEXT: [[VBCST_32_:%[0-9]+]]:vec512 = VBCST_32 [[COPY]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm_imm:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm_imm1:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY3]], 32 :: (load (<32 x s8>) from %ir.in1) + ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm_imm2:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm_imm3:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY4]], 32 :: (load (<32 x s8>) from %ir.in2) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLDA_dmw_lda_w_ag_pstm_nrm_imm2]], %subreg.sub_256_lo + ; CHECK-NEXT: [[ADD_add_r_ri:%[0-9]+]]:er = nsw ADD_add_r_ri [[COPY1]], -1, implicit-def $srcarry + ; CHECK-NEXT: PseudoJ_jump_imm %bb.4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.5(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:er = PHI [[ADD_add_r_ri]], %bb.3, %29, %bb.4 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:ep = PHI [[VLDA_dmw_lda_w_ag_pstm_nrm_imm1]], %bb.3, %28, %bb.4 + ; CHECK-NEXT: [[PHI2:%[0-9]+]]:ep = PHI [[VLDA_dmw_lda_w_ag_pstm_nrm_imm3]], %bb.3, %32, %bb.4 + ; CHECK-NEXT: [[PHI3:%[0-9]+]]:ep = PHI [[COPY5]], %bb.3, %34, %bb.4 + ; CHECK-NEXT: [[PHI4:%[0-9]+]]:vec256 = PHI [[VLDA_dmw_lda_w_ag_pstm_nrm_imm]], %bb.3, %27, %bb.4 + ; CHECK-NEXT: [[PHI5:%[0-9]+]]:vec512 = PHI [[REG_SEQUENCE]], %bb.3, %33, %bb.4 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vec512 = REG_SEQUENCE [[VBCST_32_]].sub_256_lo, %subreg.sub_256_hi, [[PHI4]], %subreg.sub_256_lo + ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm_imm4:%[0-9]+]]:vec256,
[[VLDA_dmw_lda_w_ag_pstm_nrm_imm5:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[PHI1]], 32 :: (load unknown-size from %ir.in1, align 32) + ; CHECK-NEXT: [[ADD_add_r_ri1:%[0-9]+]]:er = nsw ADD_add_r_ri [[PHI]], -1, implicit-def $srcarry + ; CHECK-NEXT: [[VMUL_vmac_cm_core_dense:%[0-9]+]]:acc1024 = VMUL_vmac_cm_core_dense [[REG_SEQUENCE1]], [[PHI5]], [[COPY]] + ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm_imm6:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm_imm7:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[PHI2]], 32 :: (load unknown-size from %ir.in2, align 32) + ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLDA_dmw_lda_w_ag_pstm_nrm_imm6]], %subreg.sub_256_lo + ; CHECK-NEXT: [[VST_SRS_S8_S32_ag_pstm_nrm_imm:%[0-9]+]]:ep = VST_SRS_S8_S32_ag_pstm_nrm_imm [[PHI3]], 32, [[VMUL_vmac_cm_core_dense]], [[COPY2]], implicit-def $srsrs_of, implicit $crsat, implicit $crrnd :: (store (<32 x s8>) into %ir.out) + ; CHECK-NEXT: PseudoJNZ [[ADD_add_r_ri1]], %bb.4 + ; CHECK-NEXT: PseudoJ_jump_imm %bb.5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI6:%[0-9]+]]:ep = PHI [[VST_SRS_S8_S32_ag_pstm_nrm_imm]], %bb.4 + ; CHECK-NEXT: [[PHI7:%[0-9]+]]:vec256 = PHI [[VLDA_dmw_lda_w_ag_pstm_nrm_imm4]], %bb.4 + ; CHECK-NEXT: [[PHI8:%[0-9]+]]:vec512 = PHI [[REG_SEQUENCE2]], %bb.4 + ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vec512 = REG_SEQUENCE [[VBCST_32_]].sub_256_lo, %subreg.sub_256_hi, [[PHI7]], %subreg.sub_256_lo + ; CHECK-NEXT: [[VMUL_vmac_cm_core_dense1:%[0-9]+]]:acc1024 = VMUL_vmac_cm_core_dense [[REG_SEQUENCE3]], [[PHI8]], [[COPY]] + ; CHECK-NEXT: [[VST_SRS_S8_S32_ag_pstm_nrm_imm1:%[0-9]+]]:ep = VST_SRS_S8_S32_ag_pstm_nrm_imm [[PHI6]], 32, [[VMUL_vmac_cm_core_dense1]], [[COPY2]], implicit-def $srsrs_of, implicit $crsat, implicit $crrnd :: (store unknown-size into %ir.out, align 32) + ; CHECK-NEXT: PseudoJ_jump_imm %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: PseudoRET implicit $lr + bb.1: + liveins: $p0, $p1, $p2, $r0, $r1, $s0 + %0:er = COPY $r0 + %1:er = COPY $r1 + %5:mss = COPY $s0 + %10:ep = COPY $p0 + %11:ep = COPY $p1 + %12:ep = COPY $p2 + %20:vec512 = VBCST_32 %0 + + bb.3: + %30:er = PHI %1, %bb.1, %31, %bb.3 + %40:ep = PHI %10, %bb.1, %50, %bb.3 + %41:ep = PHI %11, %bb.1, %51, %bb.3 + %42:ep = PHI %12, %bb.1, %52, %bb.3 + %100:vec256, %50:ep = VLDA_dmw_lda_w_ag_pstm_nrm_imm %40, 32 :: (load (<32 x s8>) from %ir.in1) + %101:vec256, %51:ep = VLDA_dmw_lda_w_ag_pstm_nrm_imm %41, 32 :: (load (<32 x s8>) from %ir.in2) + %102:vec512 = REG_SEQUENCE %20.sub_256_lo, %subreg.sub_256_hi, %100, %subreg.sub_256_lo + %103:vec512 = REG_SEQUENCE %101, %subreg.sub_256_lo + %104:acc1024 = VMUL_vmac_cm_core_dense %102, %103, %0 + %52:ep = VST_SRS_S8_S32_ag_pstm_nrm_imm %42, 32, %104, %5, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd :: (store (<32 x s8>) into %ir.out) + %31:er = nsw ADD_add_r_ri %30, -1, implicit-def $srcarry + PseudoJNZ %31, %bb.3 + + bb.2: + PseudoRET implicit $lr +... + +# We expect the REG_SEQUENCE for both loads to be in the first stage, close to +# their sources.
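+#
+# Schematically (illustrative MIR only, not autogenerated CHECK output; the
+# value names are made up), the two placements discussed above are:
+#
+#   ; Only the low half is loaded in the loop: REG_SEQUENCE stays by its
+#   ; VMUL consumer, so just the 256-bit %load is loop-carried.
+#   %v:vec512 = REG_SEQUENCE %const.sub_256_lo, %subreg.sub_256_hi, %load, %subreg.sub_256_lo
+#
+#   ; Both halves are produced in the loop: REG_SEQUENCE stays by the two
+#   ; loads it merges, helping the coalescer tie them together.
+#   %v:vec512 = REG_SEQUENCE %load_hi, %subreg.sub_256_hi, %load_lo, %subreg.sub_256_lo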
+--- +name: vmul_512 +alignment: 16 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: vmul_512 + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: liveins: $p0, $p1, $p2, $r0, $r1, $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:er = COPY $r0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:er = COPY $r1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:mss = COPY $s0 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:ep = COPY $p0 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:ep = COPY $p1 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:ep = COPY $p2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm_imm:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm_imm1:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY3]], 32 :: (load (<32 x s8>) from %ir.in1) + ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm_imm2:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm_imm3:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[VLDA_dmw_lda_w_ag_pstm_nrm_imm1]], 32 :: (load (<32 x s8>) from %ir.in1) + ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm_imm4:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm_imm5:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY4]], 32 :: (load (<32 x s8>) from %ir.in2) + ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm_imm6:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm_imm7:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[VLDA_dmw_lda_w_ag_pstm_nrm_imm5]], 32 :: (load (<32 x s8>) from %ir.in2) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLDA_dmw_lda_w_ag_pstm_nrm_imm2]], %subreg.sub_256_hi, [[VLDA_dmw_lda_w_ag_pstm_nrm_imm]], %subreg.sub_256_lo + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLDA_dmw_lda_w_ag_pstm_nrm_imm6]], %subreg.sub_256_hi, [[VLDA_dmw_lda_w_ag_pstm_nrm_imm4]], %subreg.sub_256_lo + ; CHECK-NEXT: [[ADD_add_r_ri:%[0-9]+]]:er = nsw ADD_add_r_ri [[COPY1]], -1, implicit-def $srcarry + ; CHECK-NEXT: PseudoJ_jump_imm %bb.4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.5(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:er = PHI [[ADD_add_r_ri]], %bb.3, %36, %bb.4 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:ep = PHI [[VLDA_dmw_lda_w_ag_pstm_nrm_imm3]], %bb.3, %41, %bb.4 + ; CHECK-NEXT: [[PHI2:%[0-9]+]]:ep = PHI [[VLDA_dmw_lda_w_ag_pstm_nrm_imm7]], %bb.3, %44, %bb.4 + ; CHECK-NEXT: [[PHI3:%[0-9]+]]:ep = PHI [[COPY5]], %bb.3, %46, %bb.4 + ; CHECK-NEXT: [[PHI4:%[0-9]+]]:vec512 = PHI [[REG_SEQUENCE]], %bb.3, %42, %bb.4 + ; CHECK-NEXT: [[PHI5:%[0-9]+]]:vec512 = PHI [[REG_SEQUENCE1]], %bb.3, %45, %bb.4 + ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm_imm8:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm_imm9:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[PHI2]], 32 :: (load unknown-size from %ir.in2, align 32) + ; CHECK-NEXT: [[ADD_add_r_ri1:%[0-9]+]]:er = nsw ADD_add_r_ri [[PHI]], -1, implicit-def $srcarry + ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm_imm10:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm_imm11:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[PHI1]], 32 :: (load unknown-size from %ir.in1, align 32) + ; CHECK-NEXT: [[VMUL_vmac_cm_core_dense:%[0-9]+]]:acc1024 = VMUL_vmac_cm_core_dense [[PHI4]], [[PHI5]], [[COPY]] + ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm_imm12:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm_imm13:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[VLDA_dmw_lda_w_ag_pstm_nrm_imm11]], 32 :: (load unknown-size from %ir.in1, align 32) + ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vec512 = REG_SEQUENCE 
[[VLDA_dmw_lda_w_ag_pstm_nrm_imm12]], %subreg.sub_256_hi, [[VLDA_dmw_lda_w_ag_pstm_nrm_imm10]], %subreg.sub_256_lo + ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm_imm14:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm_imm15:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[VLDA_dmw_lda_w_ag_pstm_nrm_imm9]], 32 :: (load unknown-size from %ir.in2, align 32) + ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLDA_dmw_lda_w_ag_pstm_nrm_imm14]], %subreg.sub_256_hi, [[VLDA_dmw_lda_w_ag_pstm_nrm_imm8]], %subreg.sub_256_lo + ; CHECK-NEXT: [[VST_SRS_S8_S32_ag_pstm_nrm_imm:%[0-9]+]]:ep = VST_SRS_S8_S32_ag_pstm_nrm_imm [[PHI3]], 32, [[VMUL_vmac_cm_core_dense]], [[COPY2]], implicit-def $srsrs_of, implicit $crsat, implicit $crrnd :: (store (<32 x s8>) into %ir.out) + ; CHECK-NEXT: PseudoJNZ [[ADD_add_r_ri1]], %bb.4 + ; CHECK-NEXT: PseudoJ_jump_imm %bb.5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI6:%[0-9]+]]:ep = PHI [[VST_SRS_S8_S32_ag_pstm_nrm_imm]], %bb.4 + ; CHECK-NEXT: [[PHI7:%[0-9]+]]:vec512 = PHI [[REG_SEQUENCE2]], %bb.4 + ; CHECK-NEXT: [[PHI8:%[0-9]+]]:vec512 = PHI [[REG_SEQUENCE3]], %bb.4 + ; CHECK-NEXT: [[VMUL_vmac_cm_core_dense1:%[0-9]+]]:acc1024 = VMUL_vmac_cm_core_dense [[PHI7]], [[PHI8]], [[COPY]] + ; CHECK-NEXT: [[VST_SRS_S8_S32_ag_pstm_nrm_imm1:%[0-9]+]]:ep = VST_SRS_S8_S32_ag_pstm_nrm_imm [[PHI6]], 32, [[VMUL_vmac_cm_core_dense1]], [[COPY2]], implicit-def $srsrs_of, implicit $crsat, implicit $crrnd :: (store unknown-size into %ir.out, align 32) + ; CHECK-NEXT: PseudoJ_jump_imm %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: PseudoRET implicit $lr + bb.1: + liveins: $p0, $p1, $p2, $r0, $r1, $s0 + %0:er = COPY $r0 + %1:er = COPY $r1 + %5:mss = COPY $s0 + %10:ep = COPY $p0 + %11:ep = COPY $p1 + %12:ep = COPY $p2 + + bb.3: + %30:er = PHI %1, %bb.1, %31, %bb.3 + %40:ep = PHI %10, %bb.1, %60, %bb.3 + %41:ep = PHI %11, %bb.1, %61, %bb.3 + %42:ep = PHI %12, %bb.1, %52, %bb.3 + %100:vec256, %50:ep = VLDA_dmw_lda_w_ag_pstm_nrm_imm %40, 32 :: (load (<32 x s8>) from %ir.in1) + %101:vec256, %60:ep = VLDA_dmw_lda_w_ag_pstm_nrm_imm %50, 32 :: (load (<32 x s8>) from %ir.in1) + %102:vec256, %51:ep = VLDA_dmw_lda_w_ag_pstm_nrm_imm %41, 32 :: (load (<32 x s8>) from %ir.in2) + %103:vec256, %61:ep = VLDA_dmw_lda_w_ag_pstm_nrm_imm %51, 32 :: (load (<32 x s8>) from %ir.in2) + %104:vec512 = REG_SEQUENCE %101, %subreg.sub_256_hi, %100, %subreg.sub_256_lo + %105:vec512 = REG_SEQUENCE %103, %subreg.sub_256_hi, %102, %subreg.sub_256_lo + %106:acc1024 = VMUL_vmac_cm_core_dense %104, %105, %0 + %52:ep = VST_SRS_S8_S32_ag_pstm_nrm_imm %42, 32, %106, %5, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd :: (store (<32 x s8>) into %ir.out) + %31:er = nsw ADD_add_r_ri %30, -1, implicit-def $srcarry + PseudoJNZ %31, %bb.3 + + bb.2: + PseudoRET implicit $lr +... 
diff --git a/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll b/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll index e03c9ca34b82..5067f1a42e6e 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll @@ -10,9 +10,12 @@ define amdgpu_ps void @main(i32 %arg) { ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_mov_b32 s1, exec_lo ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_mov_b32 s1, exec_lo ; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: s_mov_b32 s5, s4 +; GFX10-NEXT: s_mov_b32 s6, s4 +; GFX10-NEXT: s_mov_b32 s7, s4 ; GFX10-NEXT: s_branch .LBB0_2 ; GFX10-NEXT: .LBB0_1: ; in Loop: Header=BB0_2 Depth=1 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -31,9 +34,6 @@ define amdgpu_ps void @main(i32 %arg) { ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s0 ; GFX10-NEXT: s_cbranch_execz .LBB0_1 ; GFX10-NEXT: ; %bb.4: ; in Loop: Header=BB0_2 Depth=1 -; GFX10-NEXT: s_mov_b32 s5, s4 -; GFX10-NEXT: s_mov_b32 s6, s4 -; GFX10-NEXT: s_mov_b32 s7, s4 ; GFX10-NEXT: buffer_atomic_and v0, off, s[4:7], 0 ; GFX10-NEXT: s_branch .LBB0_1 ; GFX10-NEXT: .LBB0_5: ; %bb8 diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll index 297b5180dfe9..fc3b8f8d4f33 100644 --- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll +++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll @@ -142,6 +142,9 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_readlane_b32 s40, v4, 4 ; CHECK-NEXT: image_sample_lz v5, v[1:2], s[44:51], s[20:23] dmask:0x1 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: s_mov_b32 s21, s20 +; CHECK-NEXT: s_mov_b32 s22, s20 +; CHECK-NEXT: s_mov_b32 s23, s20 ; CHECK-NEXT: v_readlane_b32 s41, v4, 5 ; CHECK-NEXT: v_readlane_b32 s42, v4, 6 ; CHECK-NEXT: v_readlane_b32 s43, v4, 7 @@ -152,15 +155,14 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_readlane_b32 s41, v4, 37 ; CHECK-NEXT: v_readlane_b32 s42, v4, 38 ; CHECK-NEXT: v_readlane_b32 s43, v4, 39 -; CHECK-NEXT: s_mov_b32 s21, s20 -; CHECK-NEXT: s_mov_b32 s22, s20 -; CHECK-NEXT: s_mov_b32 s23, s20 -; CHECK-NEXT: v_readlane_b32 s37, v4, 33 -; CHECK-NEXT: v_readlane_b32 s38, v4, 34 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_nop 3 ; CHECK-NEXT: image_sample_lz v6, v[1:2], s[60:67], s[40:43] dmask:0x1 -; CHECK-NEXT: v_readlane_b32 s39, v4, 35 +; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: image_sample_lz v1, v[1:2], s[12:19], s[20:23] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s37, v4, 33 +; CHECK-NEXT: v_readlane_b32 s38, v4, 34 +; CHECK-NEXT: v_readlane_b32 s39, v4, 35 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_sub_f32_e32 v1, v1, v6 ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll index 7799b9509ceb..da8aa5446983 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -886,12 +886,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GCN-NEXT: v_writelane_b32 v40, s62, 30 ; GCN-NEXT: v_writelane_b32 v40, s63, 31 ; GCN-NEXT: s_mov_b64 s[6:7], exec -; GCN-NEXT: s_movk_i32 s4, 0x7b ; GCN-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_readfirstlane_b32 s8, v0 ; GCN-NEXT: v_readfirstlane_b32 s9, v1 ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] ; GCN-NEXT: s_and_saveexec_b64 s[10:11], vcc +; GCN-NEXT: s_movk_i32 s4, 0x7b ; GCN-NEXT: 
s_swappc_b64 s[30:31], s[8:9] ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN-NEXT: s_xor_b64 exec, exec, s[10:11] @@ -980,12 +980,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GISEL-NEXT: v_writelane_b32 v40, s62, 30 ; GISEL-NEXT: v_writelane_b32 v40, s63, 31 ; GISEL-NEXT: s_mov_b64 s[6:7], exec -; GISEL-NEXT: s_movk_i32 s4, 0x7b ; GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_readfirstlane_b32 s8, v0 ; GISEL-NEXT: v_readfirstlane_b32 s9, v1 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] ; GISEL-NEXT: s_and_saveexec_b64 s[10:11], vcc +; GISEL-NEXT: s_movk_i32 s4, 0x7b ; GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: s_xor_b64 exec, exec, s[10:11] diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll index cddfb21a6fbd..08b8494f7604 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -161,10 +161,10 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: s_cbranch_vccz .LBB2_12 ; GFX11-NEXT: ; %bb.9: ; GFX11-NEXT: s_xor_b32 s0, s8, -1 -; GFX11-NEXT: .LBB2_10: ; %bb17 -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: .LBB2_10: ; %bb17 +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_cbranch_vccz .LBB2_10 ; GFX11-NEXT: ; %bb.11: ; %Flow6 ; GFX11-NEXT: s_mov_b32 s17, -1 diff --git a/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir b/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir index e818eaf95aee..8e999f136e03 100644 --- a/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir +++ b/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass machinelicm -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass early-machinelicm -o - %s | FileCheck -check-prefix=GCN %s # MachineLICM shall limit hoisting of V_CVT instructions out of the loop keeping # register pressure within the budget. VGPR budget at occupancy 10 is 24 vgprs. 
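# (For reference: this arithmetic assumes gfx900's 256 VGPRs per SIMD and an
# allocation granularity of 4, so ten waves allow floor(256 / 10) = 25 VGPRs,
# rounded down to a multiple of 4, i.e. 24 VGPRs per wave.)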
diff --git a/llvm/test/CodeGen/AMDGPU/licm-valu.mir b/llvm/test/CodeGen/AMDGPU/licm-valu.mir index 00a5a4f1b32e..4c1e53fb115c 100644 --- a/llvm/test/CodeGen/AMDGPU/licm-valu.mir +++ b/llvm/test/CodeGen/AMDGPU/licm-valu.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=machinelicm -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=early-machinelicm -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s --- name: hoist_move diff --git a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll index d62f045674ac..cd95e140ce5e 100644 --- a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll +++ b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll @@ -10,8 +10,18 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) { ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_mov_b32 s1, 0 +; GFX10-NEXT: s_mov_b32 s5, s4 +; GFX10-NEXT: s_mov_b32 s6, s4 +; GFX10-NEXT: s_mov_b32 s7, s4 +; GFX10-NEXT: s_mov_b32 s8, s4 +; GFX10-NEXT: s_mov_b32 s9, s4 +; GFX10-NEXT: s_mov_b32 s10, s4 +; GFX10-NEXT: s_mov_b32 s11, s4 +; GFX10-NEXT: s_mov_b32 s12, s4 +; GFX10-NEXT: s_mov_b32 s13, s4 +; GFX10-NEXT: s_mov_b32 s14, s4 +; GFX10-NEXT: s_mov_b32 s15, s4 ; GFX10-NEXT: ; implicit-def: $sgpr2 -; GFX10-NEXT: s_inst_prefetch 0x1 ; GFX10-NEXT: s_branch .LBB0_2 ; GFX10-NEXT: .p2align 6 ; GFX10-NEXT: .LBB0_1: ; %Flow @@ -29,19 +39,8 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) { ; GFX10-NEXT: s_cbranch_execz .LBB0_1 ; GFX10-NEXT: ; %bb.3: ; %branch2_merge ; GFX10-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GFX10-NEXT: s_mov_b32 s5, s4 -; GFX10-NEXT: s_mov_b32 s6, s4 -; GFX10-NEXT: s_mov_b32 s7, s4 -; GFX10-NEXT: s_mov_b32 s8, s4 -; GFX10-NEXT: s_mov_b32 s9, s4 -; GFX10-NEXT: s_mov_b32 s10, s4 -; GFX10-NEXT: s_mov_b32 s11, s4 -; GFX10-NEXT: s_mov_b32 s12, s4 -; GFX10-NEXT: s_mov_b32 s13, s4 -; GFX10-NEXT: s_mov_b32 s14, s4 -; GFX10-NEXT: s_mov_b32 s15, s4 -; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo ; GFX10-NEXT: image_sample_lz v1, [v2, v2, v1], s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_3D +; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_fma_f32 v1, v1, v0, 0 ; GFX10-NEXT: v_cmp_le_f32_e64 s0, 0, v1 @@ -49,7 +48,6 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) { ; GFX10-NEXT: s_or_b32 s2, s2, s0 ; GFX10-NEXT: s_branch .LBB0_1 ; GFX10-NEXT: .LBB0_4: ; %loop0_merge -; GFX10-NEXT: s_inst_prefetch 0x2 ; GFX10-NEXT: s_endpgm ; ; GFX12-LABEL: _amdgpu_cs_main: @@ -59,6 +57,17 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) { ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: s_mov_b32 s5, s4 +; GFX12-NEXT: s_mov_b32 s6, s4 +; GFX12-NEXT: s_mov_b32 s7, s4 +; GFX12-NEXT: s_mov_b32 s8, s4 +; GFX12-NEXT: s_mov_b32 s9, s4 +; GFX12-NEXT: s_mov_b32 s10, s4 +; GFX12-NEXT: s_mov_b32 s11, s4 +; GFX12-NEXT: s_mov_b32 s12, s4 +; GFX12-NEXT: s_mov_b32 s13, s4 +; GFX12-NEXT: s_mov_b32 s14, s4 +; GFX12-NEXT: s_mov_b32 s15, s4 ; GFX12-NEXT: ; implicit-def: $sgpr2 ; GFX12-NEXT: s_branch .LBB0_2 ; GFX12-NEXT: .LBB0_1: ; %Flow @@ -77,19 +86,8 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) { ; GFX12-NEXT: s_cbranch_execz .LBB0_1 ; GFX12-NEXT: ; %bb.3: ; %branch2_merge ; GFX12-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GFX12-NEXT: s_mov_b32 s5, 
s4 -; GFX12-NEXT: s_mov_b32 s6, s4 -; GFX12-NEXT: s_mov_b32 s7, s4 -; GFX12-NEXT: s_mov_b32 s8, s4 -; GFX12-NEXT: s_mov_b32 s9, s4 -; GFX12-NEXT: s_mov_b32 s10, s4 -; GFX12-NEXT: s_mov_b32 s11, s4 -; GFX12-NEXT: s_mov_b32 s12, s4 -; GFX12-NEXT: s_mov_b32 s13, s4 -; GFX12-NEXT: s_mov_b32 s14, s4 -; GFX12-NEXT: s_mov_b32 s15, s4 -; GFX12-NEXT: s_and_not1_b32 s2, s2, exec_lo ; GFX12-NEXT: image_sample_lz v1, [v2, v2, v1], s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_3D +; GFX12-NEXT: s_and_not1_b32 s2, s2, exec_lo ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: v_fma_f32 v1, v1, v0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll index 7c351d2b8443..24de217e0772 100644 --- a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll +++ b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll @@ -12,6 +12,7 @@ define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) { ; GCN-NEXT: s_mov_b32 s9, s5 ; GCN-NEXT: s_mov_b32 s8, s4 ; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_mov_b32 s11, s7 ; GCN-NEXT: s_branch .LBB0_2 ; GCN-NEXT: .LBB0_1: ; %loop.exit.guard ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 @@ -20,7 +21,6 @@ define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) { ; GCN-NEXT: .LBB0_2: ; %bb1 ; GCN-NEXT: ; =>This Loop Header: Depth=1 ; GCN-NEXT: ; Child Loop BB0_4 Depth 2 -; GCN-NEXT: s_mov_b32 s11, s7 ; GCN-NEXT: buffer_load_dword v1, off, s[8:11], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll index 0acee5bd5ac1..a3d1214100cd 100644 --- a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll +++ b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll @@ -56,9 +56,10 @@ define amdgpu_ps float @valley_partially_undef_copy() #0 { ; CHECK-NEXT: s_waitcnt expcnt(1) ; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1 +; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] ; CHECK-NEXT: .LBB1_1: ; %bb9 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] +; CHECK-NEXT: s_mov_b64 vcc, vcc ; CHECK-NEXT: s_cbranch_vccnz .LBB1_1 ; CHECK-NEXT: ; %bb.2: ; %bb11 ; CHECK-NEXT: s_mov_b32 s3, 0xf000 diff --git a/llvm/test/CodeGen/X86/unfoldMemoryOperand.mir b/llvm/test/CodeGen/X86/unfoldMemoryOperand.mir index 4c715b894fae..c4147e016349 100644 --- a/llvm/test/CodeGen/X86/unfoldMemoryOperand.mir +++ b/llvm/test/CodeGen/X86/unfoldMemoryOperand.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=x86_64-- -run-pass machinelicm -mcpu=skx -verify-machineinstrs -o - %s | FileCheck %s +# RUN: llc -mtriple=x86_64 --run-pass=early-machinelicm -mcpu=skx -verify-machineinstrs -o - %s | FileCheck %s --- | @x = dso_local global i32 0, align 4 @z = dso_local local_unnamed_addr global [1024 x i32] zeroinitializer, align 16 diff --git a/llvm/test/DebugInfo/MIR/X86/mlicm-hoist-pre-regalloc.mir b/llvm/test/DebugInfo/MIR/X86/mlicm-hoist-pre-regalloc.mir index 90a6abdf9bd0..8102cb22732d 100644 --- a/llvm/test/DebugInfo/MIR/X86/mlicm-hoist-pre-regalloc.mir +++ b/llvm/test/DebugInfo/MIR/X86/mlicm-hoist-pre-regalloc.mir @@ -1,5 +1,5 @@ --- | - ; RUN: llc -run-pass=machinelicm -o - %s | FileCheck %s + ; RUN: llc -run-pass=early-machinelicm -o - %s | FileCheck %s ; Line numbers should not be 
retained when loop invariant instructions are hoisted. Doing so causes poor stepping behavior. ; diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll index ddc14b6a25c2..39cd0430bb9e 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll @@ -151,9 +151,10 @@ define protected amdgpu_kernel void @InferPHI(i32 %a, ptr addrspace(1) %b, doubl ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 +; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] ; CHECK-NEXT: .LBB5_1: ; %bb0 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] +; CHECK-NEXT: s_mov_b64 vcc, vcc ; CHECK-NEXT: s_cbranch_vccnz .LBB5_1 ; CHECK-NEXT: ; %bb.2: ; %bb1 ; CHECK-NEXT: v_mov_b32_e32 v0, s6