From 98d0684e64d4302fdd861267485bbf70ffab00dc Mon Sep 17 00:00:00 2001 From: Hamza Khallouki Date: Fri, 24 Jan 2025 14:07:16 +0000 Subject: [PATCH] [AIE2P] ISel support for fifo loads --- .../AIE/aie2p/AIE2PInstructionSelector.cpp | 292 ++++++++ .../GlobalIsel/inst-select-fifo-loads.mir | 424 ++++++++++++ llvm/test/CodeGen/AIE/aie2p/fifo-loads.ll | 651 ++++++++++++++++++ 3 files changed, 1367 insertions(+) create mode 100644 llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-fifo-loads.mir create mode 100644 llvm/test/CodeGen/AIE/aie2p/fifo-loads.ll diff --git a/llvm/lib/Target/AIE/aie2p/AIE2PInstructionSelector.cpp b/llvm/lib/Target/AIE/aie2p/AIE2PInstructionSelector.cpp index 7de0283b51e6..ee526ed2a415 100644 --- a/llvm/lib/Target/AIE/aie2p/AIE2PInstructionSelector.cpp +++ b/llvm/lib/Target/AIE/aie2p/AIE2PInstructionSelector.cpp @@ -21,6 +21,7 @@ #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/IR/IntrinsicsAIE2P.h" #include "llvm/MC/MCContext.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TypeSize.h" #include @@ -87,6 +88,11 @@ class AIE2PInstructionSelector : public AIEBaseInstructionSelector { Register createDRegSequence(Register ModifierReg, Register IncrReg, Register SizeReg, Register CountReg, MachineRegisterInfo &MRI) override; + Register createPLFRRegSequence(Register PtrReg, Register FifoReg, + Register AvailReg, MachineRegisterInfo &MRI); + bool buildAndConstrainFifoLoadCopies(Register Bfp16Vec, Register Mantissa, + Register Exponent, + MachineRegisterInfo &MRI); Register createDSRegSequence(Register ModifierReg, Register Incr1Reg, Register Incr2Reg, Register Size1Reg, Register Count1Reg, Register Size2Reg, @@ -103,6 +109,7 @@ class AIE2PInstructionSelector : public AIEBaseInstructionSelector { bool select1024BitG_AIE_LOAD_STORE(MachineInstr &I, LoadStoreOpcodes &LSO, AddressingModeInfo &AMI, MachineRegisterInfo &MRI); + bool selectVLD_FIFO(MachineInstr &I, MachineRegisterInfo &MRI); bool 
selectSetI128(MachineInstr &I, MachineOperand &DstReg, MachineOperand &SrcReg, MachineRegisterInfo &MRI); bool selectExtractI128(MachineInstr &I, Register DstReg, Register SrcReg, @@ -348,6 +355,20 @@ bool AIE2PInstructionSelector::select(MachineInstr &I) { case Intrinsic::aie2p_v64accfloat_to_v64bfp16ebs16: case Intrinsic::aie2p_v64bfp16ebs8_to_v64bfp16ebs16: return selectVCONVbfp16(I, MRI); + case Intrinsic::aie2p_fifo_ld_fill: + case Intrinsic::aie2p_fifo_ld_pop_unaligned: + case Intrinsic::aie2p_fifo_ld_pop_1d_unaligned: + case Intrinsic::aie2p_fifo_ld_pop_544_1d_bfp16: + case Intrinsic::aie2p_fifo_ld_pop_576_1d_bfp16: + case Intrinsic::aie2p_fifo_ld_pop_544_bfp16: + case Intrinsic::aie2p_fifo_ld_pop_576_bfp16: + case Intrinsic::aie2p_fifo_ld_pop_2d_unaligned: + case Intrinsic::aie2p_fifo_ld_pop_3d_unaligned: + case Intrinsic::aie2p_fifo_ld_pop_544_2d_bfp16: + case Intrinsic::aie2p_fifo_ld_pop_576_2d_bfp16: + case Intrinsic::aie2p_fifo_ld_pop_544_3d_bfp16: + case Intrinsic::aie2p_fifo_ld_pop_576_3d_bfp16: + return selectVLD_FIFO(I, MRI); default: return selectImpl(I, *CoverageInfo); } @@ -1534,6 +1555,39 @@ Register AIE2PInstructionSelector::createDRegSequence( return MI.getReg(0); } +Register AIE2PInstructionSelector::createPLFRRegSequence( + Register PtrReg, Register FifoReg, Register AvailReg, + MachineRegisterInfo &MRI) { + + Register PLFRIn = MRI.createVirtualRegister(&AIE2P::ePSRFLdFRegClass); + MachineInstrBuilder MI = + MIB.buildInstr(TargetOpcode::REG_SEQUENCE, {PLFRIn}, {}) + .addReg(PtrReg) + .addImm(AIE2P::sub_ptr) + .addReg(FifoReg) + .addImm(AIE2P::sub_fifo) + .addReg(AvailReg) + .addImm(AIE2P::sub_avail); + + return MI.getReg(0); +} +bool AIE2PInstructionSelector::buildAndConstrainFifoLoadCopies( + Register Bfp16Vec, Register Mantissa, Register Exponent, + MachineRegisterInfo &MRI) { + + auto CopyMI1 = MIB.buildInstr(TargetOpcode::COPY, {Mantissa}, {}) + .addReg(Bfp16Vec, 0, AIE2P::sub_bfp16_x); + auto CopyMI2 = MIB.buildInstr(TargetOpcode::COPY, 
{Exponent}, {}) + .addReg(Bfp16Vec, 0, AIE2P::sub_bfp16_e); + + return constrainOperandRegClass(*MF, TRI, MRI, TII, RBI, *CopyMI2, + AIE2P::EXPVEC64RegClass, + CopyMI2->getOperand(0)) && + constrainOperandRegClass(*MF, TRI, MRI, TII, RBI, *CopyMI1, + AIE2P::VEC512RegClass, + CopyMI1->getOperand(0)); +} + Register AIE2PInstructionSelector::createDSRegSequence( Register ModifierReg, Register Incr1Reg, Register Incr2Reg, Register Size1Reg, Register Count1Reg, Register Size2Reg, @@ -2370,6 +2424,244 @@ bool AIE2PInstructionSelector::selectG_STORE(MachineInstr &I, return selectImpl(I, *CoverageInfo); } +/// Map the FIFO-load intrinsic of \p I to its AIE2P VLDB_FILL/VLDB_POP opcode. +static unsigned int getLoadFifoOpcode(MachineInstr &I) { + switch (cast<GIntrinsic>(I).getIntrinsicID()) { + case Intrinsic::aie2p_fifo_ld_fill: + return AIE2P::VLDB_FILL_512; + case Intrinsic::aie2p_fifo_ld_pop_unaligned: + return AIE2P::VLDB_POP_512_normal_pop; + case Intrinsic::aie2p_fifo_ld_pop_1d_unaligned: + return AIE2P::VLDB_POP_512_fifo_1d_pop; + case Intrinsic::aie2p_fifo_ld_pop_544_1d_bfp16: + return AIE2P::VLDB_POP_544_fifo_1d_pop; + case Intrinsic::aie2p_fifo_ld_pop_576_1d_bfp16: + return AIE2P::VLDB_POP_576_fifo_1d_pop; + case Intrinsic::aie2p_fifo_ld_pop_544_bfp16: + return AIE2P::VLDB_POP_544_normal_pop; + case Intrinsic::aie2p_fifo_ld_pop_576_bfp16: + return AIE2P::VLDB_POP_576_normal_pop; + case Intrinsic::aie2p_fifo_ld_pop_2d_unaligned: + return AIE2P::VLDB_POP_512_2D; + case Intrinsic::aie2p_fifo_ld_pop_3d_unaligned: + return AIE2P::VLDB_POP_512_3D; + case Intrinsic::aie2p_fifo_ld_pop_544_2d_bfp16: + return AIE2P::VLDB_POP_544_2D; + case Intrinsic::aie2p_fifo_ld_pop_576_2d_bfp16: + return AIE2P::VLDB_POP_576_2D; + case Intrinsic::aie2p_fifo_ld_pop_544_3d_bfp16: + return AIE2P::VLDB_POP_544_3D; + case Intrinsic::aie2p_fifo_ld_pop_576_3d_bfp16: + return AIE2P::VLDB_POP_576_3D; + } + llvm_unreachable("unreachable: Failed to get fifo load opcode"); +} + +/// Select an aie2p.fifo.ld.* intrinsic into a VLDB_FILL/VLDB_POP instruction. +/// Returns false if \p I is not handled or a register constraint fails. +bool AIE2PInstructionSelector::selectVLD_FIFO(MachineInstr &I, + 
MachineRegisterInfo &MRI) { + auto IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID(); + MachineInstrBuilder MI; + Register PtrIn; + Register FifoIn; + Register AvailIn; + switch (IntrinsicID) { + case Intrinsic::aie2p_fifo_ld_fill: { + + Register PtrOut = I.getOperand(0).getReg(); + Register FifoOut = I.getOperand(1).getReg(); + Register AvailOut = I.getOperand(2).getReg(); + PtrIn = I.getOperand(4).getReg(); + FifoIn = I.getOperand(5).getReg(); + AvailIn = I.getOperand(6).getReg(); + + MI = MIB.buildInstr(getLoadFifoOpcode(I), {PtrOut, FifoOut, AvailOut}, + {PtrIn, FifoIn, AvailIn}); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*MI, TII, TRI, RBI); + } + case Intrinsic::aie2p_fifo_ld_pop_unaligned: + case Intrinsic::aie2p_fifo_ld_pop_1d_unaligned: + case Intrinsic::aie2p_fifo_ld_pop_2d_unaligned: + case Intrinsic::aie2p_fifo_ld_pop_3d_unaligned: { + + Register VecOut = I.getOperand(0).getReg(); + Register PtrOut = I.getOperand(1).getReg(); + Register FifoOut = I.getOperand(2).getReg(); + Register AvailOut = I.getOperand(3).getReg(); + + if (IntrinsicID == Intrinsic::aie2p_fifo_ld_pop_unaligned) { + PtrIn = I.getOperand(5).getReg(); + FifoIn = I.getOperand(6).getReg(); + AvailIn = I.getOperand(7).getReg(); + + MI = MIB.buildInstr(getLoadFifoOpcode(I), + {VecOut, PtrOut, FifoOut, AvailOut}, + {PtrIn, FifoIn, AvailIn}); + } else if (IntrinsicID == Intrinsic::aie2p_fifo_ld_pop_1d_unaligned) { + PtrIn = I.getOperand(5).getReg(); + FifoIn = I.getOperand(6).getReg(); + AvailIn = I.getOperand(7).getReg(); + + Register OffsetReg = I.getOperand(8).getReg(); + MI = MIB.buildInstr(getLoadFifoOpcode(I), + {VecOut, PtrOut, FifoOut, AvailOut}, + {PtrIn, FifoIn, AvailIn, OffsetReg}); + } else if (IntrinsicID == Intrinsic::aie2p_fifo_ld_pop_2d_unaligned) { + Register CountOut1Reg = I.getOperand(4).getReg(); + PtrIn = I.getOperand(6).getReg(); + FifoIn = I.getOperand(7).getReg(); + AvailIn = I.getOperand(8).getReg(); + Register OffsetReg = 
I.getOperand(9).getReg(); + Register SizeReg = I.getOperand(10).getReg(); + Register CountIn1Reg = I.getOperand(11).getReg(); + Register IncrReg = I.getOperand(12).getReg(); + if (!RBI.constrainGenericRegister(CountOut1Reg, AIE2P::eDCRegClass, MRI)) + return false; + + Register DReg = + createDRegSequence(OffsetReg, IncrReg, SizeReg, CountIn1Reg, MRI); + MI = MIB.buildInstr(getLoadFifoOpcode(I), + {VecOut, PtrOut, FifoOut, AvailOut, CountOut1Reg}, + {PtrIn, FifoIn, AvailIn, DReg}); + } else if (IntrinsicID == Intrinsic::aie2p_fifo_ld_pop_3d_unaligned) { + Register CountOut1Reg = I.getOperand(4).getReg(); + Register CountOut2Reg = I.getOperand(5).getReg(); + PtrIn = I.getOperand(7).getReg(); + FifoIn = I.getOperand(8).getReg(); + AvailIn = I.getOperand(9).getReg(); + Register OffsetReg = I.getOperand(10).getReg(); + Register Size1Reg = I.getOperand(11).getReg(); + Register CountIn1Reg = I.getOperand(12).getReg(); + Register Incr1Reg = I.getOperand(13).getReg(); + Register Size2Reg = I.getOperand(14).getReg(); + Register CountIn2Reg = I.getOperand(15).getReg(); + Register Incr2Reg = I.getOperand(16).getReg(); + + if (!RBI.constrainGenericRegister(CountOut1Reg, + *TRI.getAddrCountRegClass(), MRI) || + !RBI.constrainGenericRegister(CountOut2Reg, + *TRI.getAddrCountRegClass(), MRI)) + return false; + Register DSReg = + createDSRegSequence(OffsetReg, Incr1Reg, Incr2Reg, Size1Reg, + CountIn1Reg, Size2Reg, CountIn2Reg, MRI); + MI = MIB.buildInstr( + getLoadFifoOpcode(I), + {VecOut, PtrOut, FifoOut, AvailOut, CountOut1Reg, CountOut2Reg}, + {PtrIn, FifoIn, AvailIn, DSReg}); + } else { + return false; + } + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*MI, TII, TRI, RBI); + } + case Intrinsic::aie2p_fifo_ld_pop_576_bfp16: + case Intrinsic::aie2p_fifo_ld_pop_544_bfp16: + case Intrinsic::aie2p_fifo_ld_pop_576_1d_bfp16: + case Intrinsic::aie2p_fifo_ld_pop_544_1d_bfp16: + case Intrinsic::aie2p_fifo_ld_pop_576_2d_bfp16: + case 
Intrinsic::aie2p_fifo_ld_pop_544_2d_bfp16: + case Intrinsic::aie2p_fifo_ld_pop_576_3d_bfp16: + case Intrinsic::aie2p_fifo_ld_pop_544_3d_bfp16: { + + Register PtrOut = I.getOperand(0).getReg(); + Register FifoOut = I.getOperand(1).getReg(); + Register AvailOut = I.getOperand(2).getReg(); + Register Vec576Out = MRI.createVirtualRegister(&AIE2P::mEXbRegClass); + Register MantVecOut; + Register ExpVecOut; + if (IntrinsicID == Intrinsic::aie2p_fifo_ld_pop_576_bfp16 || + IntrinsicID == Intrinsic::aie2p_fifo_ld_pop_544_bfp16) { + PtrIn = I.getOperand(6).getReg(); + FifoIn = I.getOperand(7).getReg(); + AvailIn = I.getOperand(8).getReg(); + MantVecOut = I.getOperand(3).getReg(); + ExpVecOut = I.getOperand(4).getReg(); + + MI = MIB.buildInstr(getLoadFifoOpcode(I), + {Vec576Out, PtrOut, FifoOut, AvailOut}, + {PtrIn, FifoIn, AvailIn}); + } else if (IntrinsicID == Intrinsic::aie2p_fifo_ld_pop_576_1d_bfp16 || + IntrinsicID == Intrinsic::aie2p_fifo_ld_pop_544_1d_bfp16) { + PtrIn = I.getOperand(6).getReg(); + FifoIn = I.getOperand(7).getReg(); + AvailIn = I.getOperand(8).getReg(); + MantVecOut = I.getOperand(3).getReg(); + ExpVecOut = I.getOperand(4).getReg(); + + Register OffsetReg = I.getOperand(9).getReg(); + MI = MIB.buildInstr(getLoadFifoOpcode(I), + {Vec576Out, PtrOut, FifoOut, AvailOut}, + {PtrIn, FifoIn, AvailIn, OffsetReg}); + } else if (IntrinsicID == Intrinsic::aie2p_fifo_ld_pop_576_2d_bfp16 || + IntrinsicID == Intrinsic::aie2p_fifo_ld_pop_544_2d_bfp16) { + Register CountOut1Reg = I.getOperand(3).getReg(); + PtrIn = I.getOperand(7).getReg(); + FifoIn = I.getOperand(8).getReg(); + AvailIn = I.getOperand(9).getReg(); + Register OffsetReg = I.getOperand(10).getReg(); + Register SizeReg = I.getOperand(11).getReg(); + Register CountIn1Reg = I.getOperand(12).getReg(); + Register IncrReg = I.getOperand(13).getReg(); + MantVecOut = I.getOperand(4).getReg(); + ExpVecOut = I.getOperand(5).getReg(); + + if (!RBI.constrainGenericRegister(CountOut1Reg, AIE2P::eDCRegClass, MRI)) + 
return false; + Register DReg = + createDRegSequence(OffsetReg, IncrReg, SizeReg, CountIn1Reg, MRI); + MI = MIB.buildInstr(getLoadFifoOpcode(I), + {Vec576Out, PtrOut, FifoOut, AvailOut, CountOut1Reg}, + {PtrIn, FifoIn, AvailIn, DReg}); + } else if (IntrinsicID == Intrinsic::aie2p_fifo_ld_pop_576_3d_bfp16 || + IntrinsicID == Intrinsic::aie2p_fifo_ld_pop_544_3d_bfp16) { + Register CountOut1Reg = I.getOperand(3).getReg(); + Register CountOut2Reg = I.getOperand(4).getReg(); + PtrIn = I.getOperand(8).getReg(); + FifoIn = I.getOperand(9).getReg(); + AvailIn = I.getOperand(10).getReg(); + Register OffsetReg = I.getOperand(11).getReg(); + Register Size1Reg = I.getOperand(12).getReg(); + Register CountIn1Reg = I.getOperand(13).getReg(); + Register Incr1Reg = I.getOperand(14).getReg(); + Register Size2Reg = I.getOperand(15).getReg(); + Register CountIn2Reg = I.getOperand(16).getReg(); + Register Incr2Reg = I.getOperand(17).getReg(); + MantVecOut = I.getOperand(5).getReg(); + ExpVecOut = I.getOperand(6).getReg(); + + if (!RBI.constrainGenericRegister(CountOut1Reg, + *TRI.getAddrCountRegClass(), MRI) || + !RBI.constrainGenericRegister(CountOut2Reg, + *TRI.getAddrCountRegClass(), MRI)) + return false; + Register DSReg = + createDSRegSequence(OffsetReg, Incr1Reg, Incr2Reg, Size1Reg, + CountIn1Reg, Size2Reg, CountIn2Reg, MRI); + MI = MIB.buildInstr( + getLoadFifoOpcode(I), + {Vec576Out, PtrOut, FifoOut, AvailOut, CountOut1Reg, CountOut2Reg}, + {PtrIn, FifoIn, AvailIn, DSReg}); + } else { + return false; + } + + bool CopiesConstrained = + buildAndConstrainFifoLoadCopies(Vec576Out, MantVecOut, ExpVecOut, MRI); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*MI, TII, TRI, RBI) && + CopiesConstrained; + } + } + + return false; +} + bool AIE2PInstructionSelector::selectG_AIE_LOAD_STORE( MachineInstr &I, MachineRegisterInfo &MRI) { diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-fifo-loads.mir 
b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-fifo-loads.mir new file mode 100644 index 000000000000..a2ee99cc3c19 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-fifo-loads.mir @@ -0,0 +1,424 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +# RUN: llc -mtriple aie2p -run-pass=instruction-select %s -verify-machineinstrs -o - | FileCheck %s + +--- +name: ld_fill +tracksRegLiveness: true +legalized: true +regBankSelected: true +body: | + bb.1.entry: + liveins: $p0, $p1 + ; CHECK-LABEL: name: ld_fill + ; CHECK: liveins: $p0, $p1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:eps = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:eldfiforeg = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:erf2 = IMPLICIT_DEF + ; CHECK-NEXT: [[VLDB_FILL_512_:%[0-9]+]]:eps, [[VLDB_FILL_512_1:%[0-9]+]]:eldfiforeg, [[VLDB_FILL_512_2:%[0-9]+]]:erf2 = VLDB_FILL_512 [[DEF]], [[DEF1]], [[DEF2]] + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VLDB_FILL_512_]], implicit [[VLDB_FILL_512_1]], implicit [[VLDB_FILL_512_2]] + %2:modregbank(s20) = G_IMPLICIT_DEF + %4:ptrregbank(p0) = G_IMPLICIT_DEF + %5:fiforegbank(<32 x s32>) = G_IMPLICIT_DEF + %6:gprregbank(s32) = G_IMPLICIT_DEF + %7:ptrregbank(p0), %8:fiforegbank(<32 x s32>), %9:gprregbank(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.fifo.ld.fill), %4:ptrregbank(p0), %5:fiforegbank(<32 x s32>), %6:gprregbank(s32) + PseudoRET implicit $lr, implicit %7, implicit %8, implicit %9 +... 
+ +--- +name: pop_unaligned +tracksRegLiveness: true +legalized: true +regBankSelected: true +body: | + bb.1.entry: + liveins: $p0, $p1 + ; CHECK-LABEL: name: pop_unaligned + ; CHECK: liveins: $p0, $p1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:eps = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:eldfiforeg = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:erf2 = IMPLICIT_DEF + ; CHECK-NEXT: [[VLDB_POP_512_normal_pop:%[0-9]+]]:vec512, [[VLDB_POP_512_normal_pop1:%[0-9]+]]:eps, [[VLDB_POP_512_normal_pop2:%[0-9]+]]:eldfiforeg, [[VLDB_POP_512_normal_pop3:%[0-9]+]]:erf2 = VLDB_POP_512_normal_pop [[DEF]], [[DEF1]], [[DEF2]], implicit-def $srfifo_uf + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VLDB_POP_512_normal_pop]], implicit [[VLDB_POP_512_normal_pop1]], implicit [[VLDB_POP_512_normal_pop2]], implicit [[VLDB_POP_512_normal_pop3]] + %3:modregbank(s20) = G_IMPLICIT_DEF + %5:ptrregbank(p0) = G_IMPLICIT_DEF + %6:fiforegbank(<32 x s32>) = G_IMPLICIT_DEF + %7:gprregbank(s32) = G_IMPLICIT_DEF + %8:vregbank(<64 x s8>), %9:ptrregbank(p0), %10:fiforegbank(<32 x s32>), %11:gprregbank(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.fifo.ld.pop.unaligned), %5:ptrregbank(p0), %6:fiforegbank(<32 x s32>), %7:gprregbank(s32) + PseudoRET implicit $lr, implicit %8, implicit %9, implicit %10, implicit %11 +... 
+ +--- +name: pop_544 +tracksRegLiveness: true +legalized: true +regBankSelected: true +body: | + bb.1.entry: + liveins: $p0, $p1 + ; CHECK-LABEL: name: pop_544 + ; CHECK: liveins: $p0, $p1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:eps = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:eldfiforeg = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:erf2 = IMPLICIT_DEF + ; CHECK-NEXT: [[VLDB_POP_544_normal_pop:%[0-9]+]]:mexb, [[VLDB_POP_544_normal_pop1:%[0-9]+]]:eps, [[VLDB_POP_544_normal_pop2:%[0-9]+]]:eldfiforeg, [[VLDB_POP_544_normal_pop3:%[0-9]+]]:erf2 = VLDB_POP_544_normal_pop [[DEF]], [[DEF1]], [[DEF2]], implicit-def $srfifo_uf + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vec512 = COPY [[VLDB_POP_544_normal_pop]].sub_bfp16_x + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:expvec64 = COPY [[VLDB_POP_544_normal_pop]].sub_bfp16_e + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VLDB_POP_544_normal_pop1]], implicit [[VLDB_POP_544_normal_pop2]], implicit [[VLDB_POP_544_normal_pop3]], implicit [[COPY]], implicit [[COPY1]] + %14:vregbank(<64 x s8>) = G_IMPLICIT_DEF + %15:gprregbank(<8 x s8>) = G_IMPLICIT_DEF + %6:ptrregbank(p0) = G_IMPLICIT_DEF + %7:fiforegbank(<32 x s32>) = G_IMPLICIT_DEF + %8:gprregbank(s32) = G_IMPLICIT_DEF + %9:ptrregbank(p0), %10:fiforegbank(<32 x s32>), %11:gprregbank(s32), %12:vregbank(<64 x s8>), %13:gprregbank(<8 x s8>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.fifo.ld.pop.544.bfp16), %6:ptrregbank(p0), %7:fiforegbank(<32 x s32>), %8:gprregbank(s32), %14:vregbank(<64 x s8>), %15:gprregbank(<8 x s8>) + PseudoRET implicit $lr, implicit %9, implicit %10, implicit %11, implicit %12, implicit %13 +... 
+ +--- +name: pop_576 +tracksRegLiveness: true +legalized: true +regBankSelected: true +body: | + bb.1.entry: + liveins: $p0, $p1 + ; CHECK-LABEL: name: pop_576 + ; CHECK: liveins: $p0, $p1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:eps = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:eldfiforeg = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:erf2 = IMPLICIT_DEF + ; CHECK-NEXT: [[VLDB_POP_576_normal_pop:%[0-9]+]]:mexb, [[VLDB_POP_576_normal_pop1:%[0-9]+]]:eps, [[VLDB_POP_576_normal_pop2:%[0-9]+]]:eldfiforeg, [[VLDB_POP_576_normal_pop3:%[0-9]+]]:erf2 = VLDB_POP_576_normal_pop [[DEF]], [[DEF1]], [[DEF2]], implicit-def $srfifo_uf + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vec512 = COPY [[VLDB_POP_576_normal_pop]].sub_bfp16_x + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:expvec64 = COPY [[VLDB_POP_576_normal_pop]].sub_bfp16_e + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VLDB_POP_576_normal_pop1]], implicit [[VLDB_POP_576_normal_pop2]], implicit [[VLDB_POP_576_normal_pop3]], implicit [[COPY]], implicit [[COPY1]] + %14:vregbank(<64 x s8>) = G_IMPLICIT_DEF + %15:gprregbank(<8 x s8>) = G_IMPLICIT_DEF + %6:ptrregbank(p0) = G_IMPLICIT_DEF + %7:fiforegbank(<32 x s32>) = G_IMPLICIT_DEF + %8:gprregbank(s32) = G_IMPLICIT_DEF + %9:ptrregbank(p0), %10:fiforegbank(<32 x s32>), %11:gprregbank(s32), %12:vregbank(<64 x s8>), %13:gprregbank(<8 x s8>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.fifo.ld.pop.576.bfp16), %6:ptrregbank(p0), %7:fiforegbank(<32 x s32>), %8:gprregbank(s32), %14:vregbank(<64 x s8>), %15:gprregbank(<8 x s8>) + PseudoRET implicit $lr, implicit %9, implicit %10, implicit %11, implicit %12, implicit %13 +... 
+ +--- +name: pop_unaligned_1d +tracksRegLiveness: true +legalized: true +regBankSelected: true +body: | + bb.1.entry: + liveins: $p0, $p1, $r0 + ; CHECK-LABEL: name: pop_unaligned_1d + ; CHECK: liveins: $p0, $p1, $r0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:eps = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:eldfiforeg = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:erf2 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:em = IMPLICIT_DEF + ; CHECK-NEXT: [[VLDB_POP_512_fifo_1d_pop:%[0-9]+]]:vec512, [[VLDB_POP_512_fifo_1d_pop1:%[0-9]+]]:eps, [[VLDB_POP_512_fifo_1d_pop2:%[0-9]+]]:eldfiforeg, [[VLDB_POP_512_fifo_1d_pop3:%[0-9]+]]:erf2 = VLDB_POP_512_fifo_1d_pop [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], implicit-def $srfifo_uf + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VLDB_POP_512_fifo_1d_pop]], implicit [[VLDB_POP_512_fifo_1d_pop1]], implicit [[VLDB_POP_512_fifo_1d_pop2]], implicit [[VLDB_POP_512_fifo_1d_pop3]] + %6:ptrregbank(p0) = G_IMPLICIT_DEF + %7:fiforegbank(<32 x s32>) = G_IMPLICIT_DEF + %8:gprregbank(s32) = G_IMPLICIT_DEF + %9:modregbank(s20) = G_IMPLICIT_DEF + %10:vregbank(<64 x s8>), %11:ptrregbank(p0), %12:fiforegbank(<32 x s32>), %13:gprregbank(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.fifo.ld.pop.1d.unaligned), %6:ptrregbank(p0), %7:fiforegbank(<32 x s32>), %8:gprregbank(s32), %9:modregbank(s20) + PseudoRET implicit $lr, implicit %10, implicit %11, implicit %12, implicit %13 +... 
+ +--- +name: pop_544_1d +tracksRegLiveness: true +legalized: true +regBankSelected: true +body: | + bb.1.entry: + liveins: $p0, $p1, $r0 + ; CHECK-LABEL: name: pop_544_1d + ; CHECK: liveins: $p0, $p1, $r0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:eps = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:eldfiforeg = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:erf2 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:em = IMPLICIT_DEF + ; CHECK-NEXT: [[VLDB_POP_544_fifo_1d_pop:%[0-9]+]]:mexb, [[VLDB_POP_544_fifo_1d_pop1:%[0-9]+]]:eps, [[VLDB_POP_544_fifo_1d_pop2:%[0-9]+]]:eldfiforeg, [[VLDB_POP_544_fifo_1d_pop3:%[0-9]+]]:erf2 = VLDB_POP_544_fifo_1d_pop [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], implicit-def $srfifo_uf + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vec512 = COPY [[VLDB_POP_544_fifo_1d_pop]].sub_bfp16_x + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:expvec64 = COPY [[VLDB_POP_544_fifo_1d_pop]].sub_bfp16_e + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VLDB_POP_544_fifo_1d_pop1]], implicit [[VLDB_POP_544_fifo_1d_pop2]], implicit [[VLDB_POP_544_fifo_1d_pop3]], implicit [[COPY]], implicit [[COPY1]] + %16:vregbank(<64 x s8>) = G_IMPLICIT_DEF + %17:gprregbank(<8 x s8>) = G_IMPLICIT_DEF + %7:ptrregbank(p0) = G_IMPLICIT_DEF + %8:fiforegbank(<32 x s32>) = G_IMPLICIT_DEF + %9:gprregbank(s32) = G_IMPLICIT_DEF + %10:modregbank(s20) = G_IMPLICIT_DEF + %11:ptrregbank(p0), %12:fiforegbank(<32 x s32>), %13:gprregbank(s32), %14:vregbank(<64 x s8>), %15:gprregbank(<8 x s8>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.fifo.ld.pop.544.1d.bfp16), %7:ptrregbank(p0), %8:fiforegbank(<32 x s32>), %9:gprregbank(s32), %10:modregbank(s20), %16:vregbank(<64 x s8>), %17:gprregbank(<8 x s8>) + PseudoRET implicit $lr, implicit %11, implicit %12, implicit %13, implicit %14, implicit %15 + +... 
+ +--- +name: pop_576_1d +tracksRegLiveness: true +legalized: true +regBankSelected: true +body: | + bb.1.entry: + liveins: $p0, $p1, $r0 + ; CHECK-LABEL: name: pop_576_1d + ; CHECK: liveins: $p0, $p1, $r0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:eps = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:eldfiforeg = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:erf2 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:em = IMPLICIT_DEF + ; CHECK-NEXT: [[VLDB_POP_576_fifo_1d_pop:%[0-9]+]]:mexb, [[VLDB_POP_576_fifo_1d_pop1:%[0-9]+]]:eps, [[VLDB_POP_576_fifo_1d_pop2:%[0-9]+]]:eldfiforeg, [[VLDB_POP_576_fifo_1d_pop3:%[0-9]+]]:erf2 = VLDB_POP_576_fifo_1d_pop [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], implicit-def $srfifo_uf + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vec512 = COPY [[VLDB_POP_576_fifo_1d_pop]].sub_bfp16_x + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:expvec64 = COPY [[VLDB_POP_576_fifo_1d_pop]].sub_bfp16_e + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VLDB_POP_576_fifo_1d_pop1]], implicit [[VLDB_POP_576_fifo_1d_pop2]], implicit [[VLDB_POP_576_fifo_1d_pop3]], implicit [[COPY]], implicit [[COPY1]] + %16:vregbank(<64 x s8>) = G_IMPLICIT_DEF + %17:gprregbank(<8 x s8>) = G_IMPLICIT_DEF + %7:ptrregbank(p0) = G_IMPLICIT_DEF + %8:fiforegbank(<32 x s32>) = G_IMPLICIT_DEF + %9:gprregbank(s32) = G_IMPLICIT_DEF + %10:modregbank(s20) = G_IMPLICIT_DEF + %11:ptrregbank(p0), %12:fiforegbank(<32 x s32>), %13:gprregbank(s32), %14:vregbank(<64 x s8>), %15:gprregbank(<8 x s8>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.fifo.ld.pop.576.1d.bfp16), %7:ptrregbank(p0), %8:fiforegbank(<32 x s32>), %9:gprregbank(s32), %10:modregbank(s20), %16:vregbank(<64 x s8>), %17:gprregbank(<8 x s8>) + PseudoRET implicit $lr, implicit %11, implicit %12, implicit %13, implicit %14, implicit %15 +... 
+ +--- +name: pop_unaligned_2d +tracksRegLiveness: true +legalized: true +regBankSelected: true +body: | + bb.1.entry: + liveins: $p0, $p1, $p2, $r0, $r1, $r2 + ; CHECK-LABEL: name: pop_unaligned_2d + ; CHECK: liveins: $p0, $p1, $p2, $r0, $r1, $r2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:eps = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:eldfiforeg = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:erf2 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:em = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF4:%[0-9]+]]:edn = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF5:%[0-9]+]]:edc = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF6:%[0-9]+]]:edj = IMPLICIT_DEF + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ed = REG_SEQUENCE [[DEF3]], %subreg.sub_mod, [[DEF4]], %subreg.sub_dim_size, [[DEF6]], %subreg.sub_dim_stride, [[DEF5]], %subreg.sub_dim_count + ; CHECK-NEXT: [[VLDB_POP_512_2D:%[0-9]+]]:vec512, [[VLDB_POP_512_2D1:%[0-9]+]]:eps, [[VLDB_POP_512_2D2:%[0-9]+]]:eldfiforeg, [[VLDB_POP_512_2D3:%[0-9]+]]:erf2, [[VLDB_POP_512_2D4:%[0-9]+]]:edc = VLDB_POP_512_2D [[DEF]], [[DEF1]], [[DEF2]], [[REG_SEQUENCE]], implicit-def $srfifo_uf + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VLDB_POP_512_2D]], implicit [[VLDB_POP_512_2D1]], implicit [[VLDB_POP_512_2D2]], implicit [[VLDB_POP_512_2D3]], implicit [[VLDB_POP_512_2D4]] + %9:ptrregbank(p0) = G_IMPLICIT_DEF + %10:fiforegbank(<32 x s32>) = G_IMPLICIT_DEF + %11:gprregbank(s32) = G_IMPLICIT_DEF + %12:modregbank(s20) = G_IMPLICIT_DEF + %13:modregbank(s20) = G_IMPLICIT_DEF + %15:modregbank(s20) = G_IMPLICIT_DEF + %16:modregbank(s20) = G_IMPLICIT_DEF + %17:vregbank(<64 x s8>), %18:ptrregbank(p0), %19:fiforegbank(<32 x s32>), %20:gprregbank(s32), %21:modregbank(s20) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.fifo.ld.pop.2d.unaligned), %9:ptrregbank(p0), %10:fiforegbank(<32 x s32>), %11:gprregbank(s32), %12:modregbank(s20), %13:modregbank(s20), %15:modregbank(s20), %16:modregbank(s20) + PseudoRET implicit $lr, implicit %17, implicit %18, implicit %19, 
implicit %20, implicit %21 + +... + +--- +name: pop_544_2d +tracksRegLiveness: true +legalized: true +regBankSelected: true +body: | + bb.1.entry: + liveins: $p0, $p1, $p2, $r0, $r1, $r2 + ; CHECK-LABEL: name: pop_544_2d + ; CHECK: liveins: $p0, $p1, $p2, $r0, $r1, $r2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:eps = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:eldfiforeg = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:erf2 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:em = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF4:%[0-9]+]]:edn = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF5:%[0-9]+]]:edc = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF6:%[0-9]+]]:edj = IMPLICIT_DEF + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ed = REG_SEQUENCE [[DEF3]], %subreg.sub_mod, [[DEF4]], %subreg.sub_dim_size, [[DEF6]], %subreg.sub_dim_stride, [[DEF5]], %subreg.sub_dim_count + ; CHECK-NEXT: [[VLDB_POP_544_2D:%[0-9]+]]:mexb, [[VLDB_POP_544_2D1:%[0-9]+]]:eps, [[VLDB_POP_544_2D2:%[0-9]+]]:eldfiforeg, [[VLDB_POP_544_2D3:%[0-9]+]]:erf2, [[VLDB_POP_544_2D4:%[0-9]+]]:edc = VLDB_POP_544_2D [[DEF]], [[DEF1]], [[DEF2]], [[REG_SEQUENCE]], implicit-def $srfifo_uf + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vec512 = COPY [[VLDB_POP_544_2D]].sub_bfp16_x + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:expvec64 = COPY [[VLDB_POP_544_2D]].sub_bfp16_e + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[COPY]], implicit [[COPY1]], implicit [[VLDB_POP_544_2D1]], implicit [[VLDB_POP_544_2D2]], implicit [[VLDB_POP_544_2D3]], implicit [[VLDB_POP_544_2D4]] + %24:vregbank(<64 x s8>) = G_IMPLICIT_DEF + %25:gprregbank(<8 x s8>) = G_IMPLICIT_DEF + %10:ptrregbank(p0) = G_IMPLICIT_DEF + %11:fiforegbank(<32 x s32>) = G_IMPLICIT_DEF + %12:gprregbank(s32) = G_IMPLICIT_DEF + %13:modregbank(s20) = G_IMPLICIT_DEF + %14:modregbank(s20) = G_IMPLICIT_DEF + %16:modregbank(s20) = G_IMPLICIT_DEF + %17:modregbank(s20) = G_IMPLICIT_DEF + %18:ptrregbank(p0), %19:fiforegbank(<32 x s32>), %20:gprregbank(s32), %21:modregbank(s20), %22:vregbank(<64 x s8>), %23:gprregbank(<8 x 
s8>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.fifo.ld.pop.544.2d.bfp16), %10:ptrregbank(p0), %11:fiforegbank(<32 x s32>), %12:gprregbank(s32), %13:modregbank(s20), %14:modregbank(s20), %16:modregbank(s20), %17:modregbank(s20), %24:vregbank(<64 x s8>), %25:gprregbank(<8 x s8>) + PseudoRET implicit $lr, implicit %22, implicit %23, implicit %18, implicit %19, implicit %20, implicit %21 + +... + +--- +name: pop_576_2d +tracksRegLiveness: true +legalized: true +regBankSelected: true +body: | + bb.1.entry: + liveins: $p0, $p1, $p2, $r0, $r1, $r2 + ; CHECK-LABEL: name: pop_576_2d + ; CHECK: liveins: $p0, $p1, $p2, $r0, $r1, $r2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:eps = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:eldfiforeg = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:erf2 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:em = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF4:%[0-9]+]]:edn = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF5:%[0-9]+]]:edc = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF6:%[0-9]+]]:edj = IMPLICIT_DEF + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ed = REG_SEQUENCE [[DEF3]], %subreg.sub_mod, [[DEF4]], %subreg.sub_dim_size, [[DEF6]], %subreg.sub_dim_stride, [[DEF5]], %subreg.sub_dim_count + ; CHECK-NEXT: [[VLDB_POP_576_2D:%[0-9]+]]:mexb, [[VLDB_POP_576_2D1:%[0-9]+]]:eps, [[VLDB_POP_576_2D2:%[0-9]+]]:eldfiforeg, [[VLDB_POP_576_2D3:%[0-9]+]]:erf2, [[VLDB_POP_576_2D4:%[0-9]+]]:edc = VLDB_POP_576_2D [[DEF]], [[DEF1]], [[DEF2]], [[REG_SEQUENCE]], implicit-def $srfifo_uf + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vec512 = COPY [[VLDB_POP_576_2D]].sub_bfp16_x + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:expvec64 = COPY [[VLDB_POP_576_2D]].sub_bfp16_e + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[COPY]], implicit [[COPY1]], implicit [[VLDB_POP_576_2D1]], implicit [[VLDB_POP_576_2D2]], implicit [[VLDB_POP_576_2D3]], implicit [[VLDB_POP_576_2D4]] + %24:vregbank(<64 x s8>) = G_IMPLICIT_DEF + %25:gprregbank(<8 x s8>) = G_IMPLICIT_DEF + %10:ptrregbank(p0) = G_IMPLICIT_DEF + 
%11:fiforegbank(<32 x s32>) = G_IMPLICIT_DEF + %12:gprregbank(s32) = G_IMPLICIT_DEF + %13:modregbank(s20) = G_IMPLICIT_DEF + %14:modregbank(s20) = G_IMPLICIT_DEF + %16:modregbank(s20) = G_IMPLICIT_DEF + %17:modregbank(s20) = G_IMPLICIT_DEF + %18:ptrregbank(p0), %19:fiforegbank(<32 x s32>), %20:gprregbank(s32), %21:modregbank(s20), %22:vregbank(<64 x s8>), %23:gprregbank(<8 x s8>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.fifo.ld.pop.576.2d.bfp16), %10:ptrregbank(p0), %11:fiforegbank(<32 x s32>), %12:gprregbank(s32), %13:modregbank(s20), %14:modregbank(s20), %16:modregbank(s20), %17:modregbank(s20), %24:vregbank(<64 x s8>), %25:gprregbank(<8 x s8>) + PseudoRET implicit $lr, implicit %22, implicit %23, implicit %18, implicit %19, implicit %20, implicit %21 + +... + +--- +name: pop_unaligned_3d +tracksRegLiveness: true +legalized: true +regBankSelected: true +body: | + bb.1.entry: + liveins: $p0, $p1, $p2, $p3, $r0, $r1, $r2, $r3, $r4 + ; CHECK-LABEL: name: pop_unaligned_3d + ; CHECK: liveins: $p0, $p1, $p2, $p3, $r0, $r1, $r2, $r3, $r4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:eps = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:eldfiforeg = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:erf2 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:em = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF4:%[0-9]+]]:edn = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF5:%[0-9]+]]:edc = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF6:%[0-9]+]]:edj = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF7:%[0-9]+]]:edn = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF8:%[0-9]+]]:edc = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF9:%[0-9]+]]:edj = IMPLICIT_DEF + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:eds = REG_SEQUENCE [[DEF3]], %subreg.sub_mod, [[DEF4]], %subreg.sub_dim_size, [[DEF6]], %subreg.sub_dim_stride, [[DEF5]], %subreg.sub_dim_count, [[DEF7]], %subreg.sub_hi_dim_then_sub_dim_size, [[DEF9]], %subreg.sub_hi_dim_then_sub_dim_stride, [[DEF8]], %subreg.sub_hi_dim_then_sub_dim_count + ; CHECK-NEXT: [[VLDB_POP_512_3D:%[0-9]+]]:vec512, 
[[VLDB_POP_512_3D1:%[0-9]+]]:eps, [[VLDB_POP_512_3D2:%[0-9]+]]:eldfiforeg, [[VLDB_POP_512_3D3:%[0-9]+]]:erf2, [[VLDB_POP_512_3D4:%[0-9]+]]:edcl, [[VLDB_POP_512_3D5:%[0-9]+]]:edch = VLDB_POP_512_3D [[DEF]], [[DEF1]], [[DEF2]], [[REG_SEQUENCE]], implicit-def $srfifo_uf + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VLDB_POP_512_3D]], implicit [[VLDB_POP_512_3D1]], implicit [[VLDB_POP_512_3D2]], implicit [[VLDB_POP_512_3D3]], implicit [[VLDB_POP_512_3D4]], implicit [[VLDB_POP_512_3D5]] + %12:ptrregbank(p0) = G_IMPLICIT_DEF + %13:fiforegbank(<32 x s32>) = G_IMPLICIT_DEF + %14:gprregbank(s32) = G_IMPLICIT_DEF + %15:modregbank(s20) = G_IMPLICIT_DEF + %16:modregbank(s20) = G_IMPLICIT_DEF + %18:modregbank(s20) = G_IMPLICIT_DEF + %19:modregbank(s20) = G_IMPLICIT_DEF + %20:modregbank(s20) = G_IMPLICIT_DEF + %22:modregbank(s20) = G_IMPLICIT_DEF + %23:modregbank(s20) = G_IMPLICIT_DEF + %24:vregbank(<64 x s8>), %25:ptrregbank(p0), %26:fiforegbank(<32 x s32>), %27:gprregbank(s32), %28:modregbank(s20), %29:modregbank(s20) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.fifo.ld.pop.3d.unaligned), %12:ptrregbank(p0), %13:fiforegbank(<32 x s32>), %14:gprregbank(s32), %15:modregbank(s20), %16:modregbank(s20), %18:modregbank(s20), %19:modregbank(s20), %20:modregbank(s20), %22:modregbank(s20), %23:modregbank(s20) + PseudoRET implicit $lr, implicit %24, implicit %25, implicit %26, implicit %27, implicit %28, implicit %29 +... 
+ +--- +name: pop_544_3d +tracksRegLiveness: true +legalized: true +regBankSelected: true +body: | + bb.1.entry: + liveins: $p0, $p1, $p2, $p3, $r0, $r1, $r2, $r3, $r4 + ; CHECK-LABEL: name: pop_544_3d + ; CHECK: liveins: $p0, $p1, $p2, $p3, $r0, $r1, $r2, $r3, $r4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:eps = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:eldfiforeg = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:erf2 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:em = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF4:%[0-9]+]]:edn = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF5:%[0-9]+]]:edc = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF6:%[0-9]+]]:edj = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF7:%[0-9]+]]:edn = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF8:%[0-9]+]]:edc = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF9:%[0-9]+]]:edj = IMPLICIT_DEF + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:eds = REG_SEQUENCE [[DEF3]], %subreg.sub_mod, [[DEF4]], %subreg.sub_dim_size, [[DEF6]], %subreg.sub_dim_stride, [[DEF5]], %subreg.sub_dim_count, [[DEF7]], %subreg.sub_hi_dim_then_sub_dim_size, [[DEF9]], %subreg.sub_hi_dim_then_sub_dim_stride, [[DEF8]], %subreg.sub_hi_dim_then_sub_dim_count + ; CHECK-NEXT: [[VLDB_POP_544_3D:%[0-9]+]]:mexb, [[VLDB_POP_544_3D1:%[0-9]+]]:eps, [[VLDB_POP_544_3D2:%[0-9]+]]:eldfiforeg, [[VLDB_POP_544_3D3:%[0-9]+]]:erf2, [[VLDB_POP_544_3D4:%[0-9]+]]:edcl, [[VLDB_POP_544_3D5:%[0-9]+]]:edch = VLDB_POP_544_3D [[DEF]], [[DEF1]], [[DEF2]], [[REG_SEQUENCE]], implicit-def $srfifo_uf + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vec512 = COPY [[VLDB_POP_544_3D]].sub_bfp16_x + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:expvec64 = COPY [[VLDB_POP_544_3D]].sub_bfp16_e + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[COPY]], implicit [[COPY1]], implicit [[VLDB_POP_544_3D1]], implicit [[VLDB_POP_544_3D2]], implicit [[VLDB_POP_544_3D3]], implicit [[VLDB_POP_544_3D4]], implicit [[VLDB_POP_544_3D5]] + %32:vregbank(<64 x s8>) = G_IMPLICIT_DEF + %33:gprregbank(<8 x s8>) = G_IMPLICIT_DEF + %13:ptrregbank(p0) = G_IMPLICIT_DEF + %14:fiforegbank(<32 x s32>) = 
G_IMPLICIT_DEF + %15:gprregbank(s32) = G_IMPLICIT_DEF + %16:modregbank(s20) = G_IMPLICIT_DEF + %17:modregbank(s20) = G_IMPLICIT_DEF + %19:modregbank(s20) = G_IMPLICIT_DEF + %20:modregbank(s20) = G_IMPLICIT_DEF + %21:modregbank(s20) = G_IMPLICIT_DEF + %23:modregbank(s20) = G_IMPLICIT_DEF + %24:modregbank(s20) = G_IMPLICIT_DEF + %25:ptrregbank(p0), %26:fiforegbank(<32 x s32>), %27:gprregbank(s32), %28:modregbank(s20), %29:modregbank(s20), %30:vregbank(<64 x s8>), %31:gprregbank(<8 x s8>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.fifo.ld.pop.544.3d.bfp16), %13:ptrregbank(p0), %14:fiforegbank(<32 x s32>), %15:gprregbank(s32), %16:modregbank(s20), %17:modregbank(s20), %19:modregbank(s20), %20:modregbank(s20), %21:modregbank(s20), %23:modregbank(s20), %24:modregbank(s20), %32:vregbank(<64 x s8>), %33:gprregbank(<8 x s8>) + PseudoRET implicit $lr, implicit %30, implicit %31, implicit %25, implicit %26, implicit %27, implicit %28, implicit %29 +... + +--- +name: pop_576_3d +tracksRegLiveness: true +legalized: true +regBankSelected: true +body: | + bb.1.entry: + liveins: $p0, $p1, $p2, $p3, $r0, $r1, $r2, $r3, $r4 + ; CHECK-LABEL: name: pop_576_3d + ; CHECK: liveins: $p0, $p1, $p2, $p3, $r0, $r1, $r2, $r3, $r4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:eps = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:eldfiforeg = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:erf2 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:em = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF4:%[0-9]+]]:edn = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF5:%[0-9]+]]:edc = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF6:%[0-9]+]]:edj = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF7:%[0-9]+]]:edn = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF8:%[0-9]+]]:edc = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF9:%[0-9]+]]:edj = IMPLICIT_DEF + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:eds = REG_SEQUENCE [[DEF3]], %subreg.sub_mod, [[DEF4]], %subreg.sub_dim_size, [[DEF6]], %subreg.sub_dim_stride, [[DEF5]], %subreg.sub_dim_count, [[DEF7]], %subreg.sub_hi_dim_then_sub_dim_size, 
[[DEF9]], %subreg.sub_hi_dim_then_sub_dim_stride, [[DEF8]], %subreg.sub_hi_dim_then_sub_dim_count + ; CHECK-NEXT: [[VLDB_POP_576_3D:%[0-9]+]]:mexb, [[VLDB_POP_576_3D1:%[0-9]+]]:eps, [[VLDB_POP_576_3D2:%[0-9]+]]:eldfiforeg, [[VLDB_POP_576_3D3:%[0-9]+]]:erf2, [[VLDB_POP_576_3D4:%[0-9]+]]:edcl, [[VLDB_POP_576_3D5:%[0-9]+]]:edch = VLDB_POP_576_3D [[DEF]], [[DEF1]], [[DEF2]], [[REG_SEQUENCE]], implicit-def $srfifo_uf + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vec512 = COPY [[VLDB_POP_576_3D]].sub_bfp16_x + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:expvec64 = COPY [[VLDB_POP_576_3D]].sub_bfp16_e + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[COPY]], implicit [[COPY1]], implicit [[VLDB_POP_576_3D1]], implicit [[VLDB_POP_576_3D2]], implicit [[VLDB_POP_576_3D3]], implicit [[VLDB_POP_576_3D4]], implicit [[VLDB_POP_576_3D5]] + %32:vregbank(<64 x s8>) = G_IMPLICIT_DEF + %33:gprregbank(<8 x s8>) = G_IMPLICIT_DEF + %13:ptrregbank(p0) = G_IMPLICIT_DEF + %14:fiforegbank(<32 x s32>) = G_IMPLICIT_DEF + %15:gprregbank(s32) = G_IMPLICIT_DEF + %16:modregbank(s20) = G_IMPLICIT_DEF + %17:modregbank(s20) = G_IMPLICIT_DEF + %19:modregbank(s20) = G_IMPLICIT_DEF + %20:modregbank(s20) = G_IMPLICIT_DEF + %21:modregbank(s20) = G_IMPLICIT_DEF + %23:modregbank(s20) = G_IMPLICIT_DEF + %24:modregbank(s20) = G_IMPLICIT_DEF + %25:ptrregbank(p0), %26:fiforegbank(<32 x s32>), %27:gprregbank(s32), %28:modregbank(s20), %29:modregbank(s20), %30:vregbank(<64 x s8>), %31:gprregbank(<8 x s8>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.fifo.ld.pop.576.3d.bfp16), %13:ptrregbank(p0), %14:fiforegbank(<32 x s32>), %15:gprregbank(s32), %16:modregbank(s20), %17:modregbank(s20), %19:modregbank(s20), %20:modregbank(s20), %21:modregbank(s20), %23:modregbank(s20), %24:modregbank(s20), %32:vregbank(<64 x s8>), %33:gprregbank(<8 x s8>) + PseudoRET implicit $lr, implicit %30, implicit %31, implicit %25, implicit %26, implicit %27, implicit %28, implicit %29 + +... 
diff --git a/llvm/test/CodeGen/AIE/aie2p/fifo-loads.ll b/llvm/test/CodeGen/AIE/aie2p/fifo-loads.ll new file mode 100644 index 000000000000..e2784b883ef5 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/fifo-loads.ll @@ -0,0 +1,651 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; +; This file is licensed under the Apache License v2.0 with LLVM Exceptions. +; See https://llvm.org/LICENSE.txt for license information. +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +; +; (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +; RUN: llc < %s -verify-machineinstrs -mtriple=aie2p | FileCheck %s + +%struct.v64bfp16ebs8 = type <{ <64 x i8>, <8 x i8> }> +%struct.v64bfp16ebs16 = type <{ <64 x i8>, <8 x i8> }> + + +define dso_local void @_Z17test_fifo_ld_fillRPDv64_DB8_R12fifo_state_t(ptr nocapture nonnull align 4 dereferenceable(4) %p, ptr nocapture nonnull align 64 dereferenceable(256) %s) local_unnamed_addr #2 { +; CHECK-LABEL: _Z17test_fifo_ld_fillRPDv64_DB8_R12fifo_state_t: +; CHECK: .p2align 4 +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: lda p0, [p0, #0]; mov dj0, #128 +; CHECK-NEXT: lda r24, [p1, dj0]; nopx +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: mov p3, p0 +; CHECK-NEXT: vlda lfh0, [p1, #64] +; CHECK-NEXT: vlda lfl0, [p1, #0] +; CHECK-NEXT: vldb.fill.512 [p0, lf0, r24] +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: ret lr +; CHECK-NEXT: st r24, [p1, dj0] // Delay Slot 5 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st p0, [p3, #0] // Delay Slot 2 +; CHECK-NEXT: nop // Delay Slot 1 +entry: + %pos1.i = getelementptr inbounds i8, ptr %s, i20 128 + %0 = load ptr, ptr %p, align 4, !tbaa !2 + %1 = load <32 x i32>, ptr %s, align 64, !tbaa !6 + %2 = load i32, ptr %pos1.i, align 64, !tbaa !7 + %3 = tail call { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.ld.fill(ptr %0, <32 x 
i32> %1, i32 %2) + %4 = extractvalue { ptr, <32 x i32>, i32 } %3, 0 + %5 = extractvalue { ptr, <32 x i32>, i32 } %3, 1 + %6 = extractvalue { ptr, <32 x i32>, i32 } %3, 2 + store <32 x i32> %5, ptr %s, align 128 + store i32 %6, ptr %pos1.i, align 64 + store ptr %4, ptr %p, align 4 + ret void +} + +define dso_local noundef <64 x i8> @_Z16test_fifo_ld_popRPDv64_DB8_R12fifo_state_t(ptr nocapture nonnull align 4 dereferenceable(4) %p, ptr nocapture nonnull align 64 dereferenceable(256) %s) local_unnamed_addr #2 { +; CHECK-LABEL: _Z16test_fifo_ld_popRPDv64_DB8_R12fifo_state_t: +; CHECK: .p2align 4 +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: lda p0, [p0, #0]; mov dj0, #128 +; CHECK-NEXT: lda r24, [p1, dj0]; nopx +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: mov p3, p0 +; CHECK-NEXT: vlda lfh0, [p1, #64] +; CHECK-NEXT: vlda lfl0, [p1, #0] +; CHECK-NEXT: vldb.pop.512 x0, [p0, lf0, r24] +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: ret lr +; CHECK-NEXT: st r24, [p1, dj0] // Delay Slot 5 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st p0, [p3, #0] // Delay Slot 2 +; CHECK-NEXT: nop // Delay Slot 1 +entry: + %pos1.i = getelementptr inbounds i8, ptr %s, i20 128 + %0 = load ptr, ptr %p, align 4, !tbaa !2 + %1 = load <32 x i32>, ptr %s, align 64, !tbaa !6 + %2 = load i32, ptr %pos1.i, align 64, !tbaa !7 + %3 = tail call { <64 x i8>, ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.ld.pop.unaligned(ptr %0, <32 x i32> %1, i32 %2) + %4 = extractvalue { <64 x i8>, ptr, <32 x i32>, i32 } %3, 1 + %5 = extractvalue { <64 x i8>, ptr, <32 x i32>, i32 } %3, 2 + %6 = extractvalue { <64 x i8>, ptr, <32 x i32>, i32 } %3, 3 + store <32 x i32> %5, ptr %s, align 128 + store i32 %6, ptr %pos1.i, align 64 + store ptr %4, ptr %p, align 4 + %7 = extractvalue { <64 x i8>, ptr, <32 x i32>, i32 } %3, 0 + ret <64 x i8> %7 +} + +define dso_local noundef <64 x i8> 
@_Z24test_fifo_ld_pop_1d_byteRPDv64_DB8_R12fifo_state_ti(ptr nocapture nonnull align 4 dereferenceable(4) %p, ptr nocapture nonnull align 64 dereferenceable(256) %s, i32 noundef %off) local_unnamed_addr #2 { +; CHECK-LABEL: _Z24test_fifo_ld_pop_1d_byteRPDv64_DB8_R12fifo_state_ti: +; CHECK: .p2align 4 +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: lda p0, [p0, #0]; mov dj0, #128 +; CHECK-NEXT: lda r24, [p1, dj0] +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: mov p3, p0 +; CHECK-NEXT: vlda lfh0, [p1, #64] +; CHECK-NEXT: vlda lfl0, [p1, #0]; mov m0, r0 +; CHECK-NEXT: vldb.pop.512 x0, [p0, lf0, r24, m0] +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: ret lr +; CHECK-NEXT: st r24, [p1, dj0] // Delay Slot 5 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st p0, [p3, #0] // Delay Slot 2 +; CHECK-NEXT: nop // Delay Slot 1 +entry: + %pos1.i = getelementptr inbounds i8, ptr %s, i20 128 + %0 = load ptr, ptr %p, align 4, !tbaa !2 + %1 = load <32 x i32>, ptr %s, align 64, !tbaa !6 + %2 = load i32, ptr %pos1.i, align 64, !tbaa !7 + %3 = trunc i32 %off to i20 + %4 = tail call { <64 x i8>, ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.ld.pop.1d.unaligned(ptr %0, <32 x i32> %1, i32 %2, i20 %3) + %5 = extractvalue { <64 x i8>, ptr, <32 x i32>, i32 } %4, 1 + %6 = extractvalue { <64 x i8>, ptr, <32 x i32>, i32 } %4, 2 + %7 = extractvalue { <64 x i8>, ptr, <32 x i32>, i32 } %4, 3 + store <32 x i32> %6, ptr %s, align 128 + store i32 %7, ptr %pos1.i, align 64 + store ptr %5, ptr %p, align 4 + %8 = extractvalue { <64 x i8>, ptr, <32 x i32>, i32 } %4, 0 + ret <64 x i8> %8 +} + +define dso_local noundef <64 x i8> @_Z24test_fifo_ld_pop_2d_byteRPDv64_DB8_R12fifo_state_tiiRii(ptr nocapture nonnull align 4 dereferenceable(4) %p, ptr nocapture nonnull align 64 dereferenceable(256) %s, i32 noundef %off, i32 noundef %size1, ptr nocapture nonnull align 4 
dereferenceable(4) %count1, i32 noundef %inc1) local_unnamed_addr #2 { +; CHECK-LABEL: _Z24test_fifo_ld_pop_2d_byteRPDv64_DB8_R12fifo_state_tiiRii: +; CHECK: .p2align 4 +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: lda dc0, [p2, #0]; nopxm +; CHECK-NEXT: lda p0, [p0, #0]; mov dj1, #128 +; CHECK-NEXT: lda r24, [p1, dj1] +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: mov m0, r0 +; CHECK-NEXT: mov p4, p0 +; CHECK-NEXT: vlda lfh0, [p1, #64]; mov dn0, r1 +; CHECK-NEXT: vlda lfl0, [p1, #0]; mov dj0, r2 +; CHECK-NEXT: vldb.pop.512.2d x0, [p0, lf0, r24, d0] +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: st dc0, [p2, #0]; ret lr +; CHECK-NEXT: st r24, [p1, dj1] // Delay Slot 5 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st p0, [p4, #0] // Delay Slot 2 +; CHECK-NEXT: nop // Delay Slot 1 +entry: + %pos1.i = getelementptr inbounds i8, ptr %s, i20 128 + %0 = load ptr, ptr %p, align 4, !tbaa !2 + %1 = load <32 x i32>, ptr %s, align 64, !tbaa !6 + %2 = load i32, ptr %pos1.i, align 64, !tbaa !7 + %3 = trunc i32 %off to i20 + %4 = trunc i32 %size1 to i20 + %5 = load i32, ptr %count1, align 4, !tbaa !7 + %6 = trunc i32 %5 to i20 + %7 = trunc i32 %inc1 to i20 + %8 = tail call { <64 x i8>, ptr, <32 x i32>, i32, i20 } @llvm.aie2p.fifo.ld.pop.2d.unaligned(ptr %0, <32 x i32> %1, i32 %2, i20 %3, i20 %4, i20 %6, i20 %7) + %9 = extractvalue { <64 x i8>, ptr, <32 x i32>, i32, i20 } %8, 1 + %10 = extractvalue { <64 x i8>, ptr, <32 x i32>, i32, i20 } %8, 2 + %11 = extractvalue { <64 x i8>, ptr, <32 x i32>, i32, i20 } %8, 3 + %12 = extractvalue { <64 x i8>, ptr, <32 x i32>, i32, i20 } %8, 4 + %13 = zext i20 %12 to i32 + store i32 %13, ptr %count1, align 4 + store <32 x i32> %10, ptr %s, align 128 + store i32 %11, ptr %pos1.i, align 64 + store ptr %9, ptr %p, align 4 + %14 = extractvalue { <64 x i8>, ptr, <32 x i32>, i32, i20 } %8, 0 + ret <64 x i8> %14 +} + +define 
dso_local noundef <64 x i8> @_Z24test_fifo_ld_pop_3d_byteRPDv64_DB8_R12fifo_state_tiiRiiiS5_i(ptr nocapture nonnull align 4 dereferenceable(4) %p, ptr nocapture nonnull align 64 dereferenceable(256) %s, i32 noundef %off, i32 noundef %size1, ptr nocapture nonnull align 4 dereferenceable(4) %count1, i32 noundef %inc1, i32 noundef %size2, ptr nocapture nonnull align 4 dereferenceable(4) %count2, i32 noundef %inc2) local_unnamed_addr #2 { +; CHECK-LABEL: _Z24test_fifo_ld_pop_3d_byteRPDv64_DB8_R12fifo_state_tiiRiiiS5_i: +; CHECK: .p2align 4 +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: lda dc0, [p2, #0]; nopb ; nops ; nopxm ; nopv +; CHECK-NEXT: lda dc4, [p3, #0] +; CHECK-NEXT: lda p0, [p0, #0]; mov dj1, #128 +; CHECK-NEXT: lda r24, [p1, dj1] +; CHECK-NEXT: mov m0, r0 +; CHECK-NEXT: mov dn0, r1 +; CHECK-NEXT: mov dj0, r2 +; CHECK-NEXT: mov p4, p0 +; CHECK-NEXT: vlda lfh0, [p1, #64]; mov dn4, r3 +; CHECK-NEXT: vlda lfl0, [p1, #0]; mov dj4, r4 +; CHECK-NEXT: vldb.pop.512.3d x0, [p0, lf0, r24, d0] +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: st dc0, [p2, #0] +; CHECK-NEXT: st dc4, [p3, #0]; ret lr +; CHECK-NEXT: st r24, [p1, dj1] // Delay Slot 5 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st p0, [p4, #0] // Delay Slot 2 +; CHECK-NEXT: nop // Delay Slot 1 +entry: + %pos1.i = getelementptr inbounds i8, ptr %s, i20 128 + %0 = load ptr, ptr %p, align 4, !tbaa !2 + %1 = load <32 x i32>, ptr %s, align 64, !tbaa !6 + %2 = load i32, ptr %pos1.i, align 64, !tbaa !7 + %3 = trunc i32 %off to i20 + %4 = trunc i32 %size1 to i20 + %5 = load i32, ptr %count1, align 4, !tbaa !7 + %6 = trunc i32 %5 to i20 + %7 = trunc i32 %inc1 to i20 + %8 = trunc i32 %size2 to i20 + %9 = load i32, ptr %count2, align 4, !tbaa !7 + %10 = trunc i32 %9 to i20 + %11 = trunc i32 %inc2 to i20 + %12 = tail call { <64 x i8>, ptr, <32 x i32>, i32, i20, i20 } @llvm.aie2p.fifo.ld.pop.3d.unaligned(ptr %0, <32 x i32> 
%1, i32 %2, i20 %3, i20 %4, i20 %6, i20 %7, i20 %8, i20 %10, i20 %11) + %13 = extractvalue { <64 x i8>, ptr, <32 x i32>, i32, i20, i20 } %12, 1 + %14 = extractvalue { <64 x i8>, ptr, <32 x i32>, i32, i20, i20 } %12, 2 + %15 = extractvalue { <64 x i8>, ptr, <32 x i32>, i32, i20, i20 } %12, 3 + %16 = extractvalue { <64 x i8>, ptr, <32 x i32>, i32, i20, i20 } %12, 4 + %17 = zext i20 %16 to i32 + %18 = extractvalue { <64 x i8>, ptr, <32 x i32>, i32, i20, i20 } %12, 5 + %19 = zext i20 %18 to i32 + store i32 %17, ptr %count1, align 4 + store i32 %19, ptr %count2, align 4 + store <32 x i32> %14, ptr %s, align 128 + store i32 %15, ptr %pos1.i, align 64 + store ptr %13, ptr %p, align 4 + %20 = extractvalue { <64 x i8>, ptr, <32 x i32>, i32, i20, i20 } %12, 0 + ret <64 x i8> %20 +} + + +define dso_local %struct.v64bfp16ebs8 @_Z16test_fifo_ld_popRP22v64bfp16ebs8_unalignedR12fifo_state_t(ptr nocapture nonnull align 4 dereferenceable(4) %p, ptr nocapture nonnull align 64 dereferenceable(256) %s) local_unnamed_addr #2 { +; CHECK-LABEL: _Z16test_fifo_ld_popRP22v64bfp16ebs8_unalignedR12fifo_state_t: +; CHECK: .p2align 4 +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: lda p0, [p0, #0]; mov dj0, #128 +; CHECK-NEXT: lda r24, [p1, dj0]; nopx +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: mov p3, p0 +; CHECK-NEXT: vlda lfh0, [p1, #64] +; CHECK-NEXT: vlda lfl0, [p1, #0] +; CHECK-NEXT: vldb.pop.576 ex0, [p0, lf0, r24] +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: ret lr +; CHECK-NEXT: st r24, [p1, dj0] // Delay Slot 5 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st p0, [p3, #0] // Delay Slot 2 +; CHECK-NEXT: nop // Delay Slot 1 +entry: + %pos1.i = getelementptr inbounds i8, ptr %s, i20 128 + %0 = load ptr, ptr %p, align 4, !tbaa !2 + %1 = load <32 x i32>, ptr %s, align 64, !tbaa !6 + %2 = load i32, ptr %pos1.i, align 64, !tbaa !7 + %3 = tail call 
{ ptr, <32 x i32>, i32, <64 x i8>, <8 x i8> } @llvm.aie2p.fifo.ld.pop.576.bfp16(ptr %0, <32 x i32> %1, i32 %2, <64 x i8> undef, <8 x i8> undef) + %4 = extractvalue { ptr, <32 x i32>, i32, <64 x i8>, <8 x i8> } %3, 0 + %5 = extractvalue { ptr, <32 x i32>, i32, <64 x i8>, <8 x i8> } %3, 1 + %6 = extractvalue { ptr, <32 x i32>, i32, <64 x i8>, <8 x i8> } %3, 2 + %7 = extractvalue { ptr, <32 x i32>, i32, <64 x i8>, <8 x i8> } %3, 3 + %8 = extractvalue { ptr, <32 x i32>, i32, <64 x i8>, <8 x i8> } %3, 4 + store <32 x i32> %5, ptr %s, align 128 + store i32 %6, ptr %pos1.i, align 64 + store ptr %4, ptr %p, align 4 + %.fca.0.insert.i = insertvalue %struct.v64bfp16ebs8 poison, <64 x i8> %7, 0 + %.fca.1.insert.i = insertvalue %struct.v64bfp16ebs8 %.fca.0.insert.i, <8 x i8> %8, 1 + ret %struct.v64bfp16ebs8 %.fca.1.insert.i +} + +define dso_local %struct.v64bfp16ebs8 @_Z24test_fifo_ld_pop_1d_byteRP22v64bfp16ebs8_unalignedR12fifo_state_ti(ptr nocapture nonnull align 4 dereferenceable(4) %p, ptr nocapture nonnull align 64 dereferenceable(256) %s, i32 noundef %off) local_unnamed_addr #2 { +; CHECK-LABEL: _Z24test_fifo_ld_pop_1d_byteRP22v64bfp16ebs8_unalignedR12fifo_state_ti: +; CHECK: .p2align 4 +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: lda p0, [p0, #0]; mov dj0, #128 +; CHECK-NEXT: lda r24, [p1, dj0] +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: mov p3, p0 +; CHECK-NEXT: vlda lfh0, [p1, #64] +; CHECK-NEXT: vlda lfl0, [p1, #0]; mov m0, r0 +; CHECK-NEXT: vldb.pop.576 ex0, [p0, lf0, r24, m0] +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: ret lr +; CHECK-NEXT: st r24, [p1, dj0] // Delay Slot 5 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st p0, [p3, #0] // Delay Slot 2 +; CHECK-NEXT: nop // Delay Slot 1 +entry: + %pos1.i = getelementptr inbounds i8, ptr %s, i20 128 + %0 = load ptr, ptr %p, align 4, !tbaa !2 + %1 = load <32 x i32>, ptr 
%s, align 64, !tbaa !6 + %2 = load i32, ptr %pos1.i, align 64, !tbaa !7 + %3 = trunc i32 %off to i20 + %4 = tail call { ptr, <32 x i32>, i32, <64 x i8>, <8 x i8> } @llvm.aie2p.fifo.ld.pop.576.1d.bfp16(ptr %0, <32 x i32> %1, i32 %2, i20 %3, <64 x i8> undef, <8 x i8> undef) + %5 = extractvalue { ptr, <32 x i32>, i32, <64 x i8>, <8 x i8> } %4, 0 + %6 = extractvalue { ptr, <32 x i32>, i32, <64 x i8>, <8 x i8> } %4, 1 + %7 = extractvalue { ptr, <32 x i32>, i32, <64 x i8>, <8 x i8> } %4, 2 + %8 = extractvalue { ptr, <32 x i32>, i32, <64 x i8>, <8 x i8> } %4, 3 + %9 = extractvalue { ptr, <32 x i32>, i32, <64 x i8>, <8 x i8> } %4, 4 + store <32 x i32> %6, ptr %s, align 128 + store i32 %7, ptr %pos1.i, align 64 + store ptr %5, ptr %p, align 4 + %.fca.0.insert.i = insertvalue %struct.v64bfp16ebs8 poison, <64 x i8> %8, 0 + %.fca.1.insert.i = insertvalue %struct.v64bfp16ebs8 %.fca.0.insert.i, <8 x i8> %9, 1 + ret %struct.v64bfp16ebs8 %.fca.1.insert.i +} + +define dso_local %struct.v64bfp16ebs8 @_Z24test_fifo_ld_pop_2d_byteRP22v64bfp16ebs8_unalignedR12fifo_state_tiiRii(ptr nocapture nonnull align 4 dereferenceable(4) %p, ptr nocapture nonnull align 64 dereferenceable(256) %s, i32 noundef %off, i32 noundef %size1, ptr nocapture nonnull align 4 dereferenceable(4) %count1, i32 noundef %inc1) local_unnamed_addr #2 { +; CHECK-LABEL: _Z24test_fifo_ld_pop_2d_byteRP22v64bfp16ebs8_unalignedR12fifo_state_tiiRii: +; CHECK: .p2align 4 +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: lda dc0, [p2, #0]; nopxm +; CHECK-NEXT: lda p0, [p0, #0]; mov dj1, #128 +; CHECK-NEXT: lda r24, [p1, dj1] +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: mov m0, r0 +; CHECK-NEXT: mov p4, p0 +; CHECK-NEXT: vlda lfh0, [p1, #64]; mov dn0, r1 +; CHECK-NEXT: vlda lfl0, [p1, #0]; mov dj0, r2 +; CHECK-NEXT: vldb.pop.576.2d ex0, [p0, lf0, r24, d0] +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: st dc0, [p2, #0]; ret lr +; CHECK-NEXT: st r24, [p1, dj1] // Delay Slot 5 +; 
CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st p0, [p4, #0] // Delay Slot 2 +; CHECK-NEXT: nop // Delay Slot 1 +entry: + %pos1.i = getelementptr inbounds i8, ptr %s, i20 128 + %0 = load ptr, ptr %p, align 4, !tbaa !2 + %1 = load <32 x i32>, ptr %s, align 64, !tbaa !6 + %2 = load i32, ptr %pos1.i, align 64, !tbaa !7 + %3 = trunc i32 %off to i20 + %4 = trunc i32 %size1 to i20 + %5 = load i32, ptr %count1, align 4, !tbaa !7 + %6 = trunc i32 %5 to i20 + %7 = trunc i32 %inc1 to i20 + %8 = tail call { ptr, <32 x i32>, i32, i20, <64 x i8>, <8 x i8> } @llvm.aie2p.fifo.ld.pop.576.2d.bfp16(ptr %0, <32 x i32> %1, i32 %2, i20 %3, i20 %4, i20 %6, i20 %7, <64 x i8> undef, <8 x i8> undef) + %9 = extractvalue { ptr, <32 x i32>, i32, i20, <64 x i8>, <8 x i8> } %8, 0 + %10 = extractvalue { ptr, <32 x i32>, i32, i20, <64 x i8>, <8 x i8> } %8, 1 + %11 = extractvalue { ptr, <32 x i32>, i32, i20, <64 x i8>, <8 x i8> } %8, 2 + %12 = extractvalue { ptr, <32 x i32>, i32, i20, <64 x i8>, <8 x i8> } %8, 4 + %13 = extractvalue { ptr, <32 x i32>, i32, i20, <64 x i8>, <8 x i8> } %8, 5 + %14 = extractvalue { ptr, <32 x i32>, i32, i20, <64 x i8>, <8 x i8> } %8, 3 + %15 = zext i20 %14 to i32 + store i32 %15, ptr %count1, align 4 + store <32 x i32> %10, ptr %s, align 128 + store i32 %11, ptr %pos1.i, align 64 + store ptr %9, ptr %p, align 4 + %.fca.0.insert.i = insertvalue %struct.v64bfp16ebs8 poison, <64 x i8> %12, 0 + %.fca.1.insert.i = insertvalue %struct.v64bfp16ebs8 %.fca.0.insert.i, <8 x i8> %13, 1 + ret %struct.v64bfp16ebs8 %.fca.1.insert.i +} + +define dso_local %struct.v64bfp16ebs8 @_Z24test_fifo_ld_pop_3d_byteRP22v64bfp16ebs8_unalignedR12fifo_state_tiiRiiiS4_i(ptr nocapture nonnull align 4 dereferenceable(4) %p, ptr nocapture nonnull align 64 dereferenceable(256) %s, i32 noundef %off, i32 noundef %size1, ptr nocapture nonnull align 4 dereferenceable(4) %count1, i32 noundef %inc1, i32 noundef %size2, ptr nocapture nonnull 
align 4 dereferenceable(4) %count2, i32 noundef %inc2) local_unnamed_addr #2 { +; CHECK-LABEL: _Z24test_fifo_ld_pop_3d_byteRP22v64bfp16ebs8_unalignedR12fifo_state_tiiRiiiS4_i: +; CHECK: .p2align 4 +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: lda dc0, [p2, #0]; nopb ; nops ; nopxm ; nopv +; CHECK-NEXT: lda dc4, [p3, #0] +; CHECK-NEXT: lda p0, [p0, #0]; mov dj1, #128 +; CHECK-NEXT: lda r24, [p1, dj1] +; CHECK-NEXT: mov m0, r0 +; CHECK-NEXT: mov dn0, r1 +; CHECK-NEXT: mov dj0, r2 +; CHECK-NEXT: mov p4, p0 +; CHECK-NEXT: vlda lfh0, [p1, #64]; mov dn4, r3 +; CHECK-NEXT: vlda lfl0, [p1, #0]; mov dj4, r4 +; CHECK-NEXT: vldb.pop.576.3d ex0, [p0, lf0, r24, d0] +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: st dc0, [p2, #0] +; CHECK-NEXT: st dc4, [p3, #0]; ret lr +; CHECK-NEXT: st r24, [p1, dj1] // Delay Slot 5 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st p0, [p4, #0] // Delay Slot 2 +; CHECK-NEXT: nop // Delay Slot 1 +entry: + %pos1.i = getelementptr inbounds i8, ptr %s, i20 128 + %0 = load ptr, ptr %p, align 4, !tbaa !2 + %1 = load <32 x i32>, ptr %s, align 64, !tbaa !6 + %2 = load i32, ptr %pos1.i, align 64, !tbaa !7 + %3 = trunc i32 %off to i20 + %4 = trunc i32 %size1 to i20 + %5 = load i32, ptr %count1, align 4, !tbaa !7 + %6 = trunc i32 %5 to i20 + %7 = trunc i32 %inc1 to i20 + %8 = trunc i32 %size2 to i20 + %9 = load i32, ptr %count2, align 4, !tbaa !7 + %10 = trunc i32 %9 to i20 + %11 = trunc i32 %inc2 to i20 + %12 = tail call { ptr, <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } @llvm.aie2p.fifo.ld.pop.576.3d.bfp16(ptr %0, <32 x i32> %1, i32 %2, i20 %3, i20 %4, i20 %6, i20 %7, i20 %8, i20 %10, i20 %11, <64 x i8> undef, <8 x i8> undef) + %13 = extractvalue { ptr, <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } %12, 0 + %14 = extractvalue { ptr, <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } %12, 1 + %15 = extractvalue { ptr, <32 x i32>, i32, i20, i20, <64 x 
i8>, <8 x i8> } %12, 2 + %16 = extractvalue { ptr, <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } %12, 5 + %17 = extractvalue { ptr, <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } %12, 6 + %18 = extractvalue { ptr, <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } %12, 3 + %19 = zext i20 %18 to i32 + %20 = extractvalue { ptr, <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } %12, 4 + %21 = zext i20 %20 to i32 + store i32 %19, ptr %count1, align 4 + store i32 %21, ptr %count2, align 4 + store <32 x i32> %14, ptr %s, align 128 + store i32 %15, ptr %pos1.i, align 64 + store ptr %13, ptr %p, align 4 + %.fca.0.insert.i = insertvalue %struct.v64bfp16ebs8 poison, <64 x i8> %16, 0 + %.fca.1.insert.i = insertvalue %struct.v64bfp16ebs8 %.fca.0.insert.i, <8 x i8> %17, 1 + ret %struct.v64bfp16ebs8 %.fca.1.insert.i +} + + +define dso_local %struct.v64bfp16ebs16 @_Z16test_fifo_ld_popRP23v64bfp16ebs16_unalignedR12fifo_state_t(ptr nocapture nonnull align 4 dereferenceable(4) %p, ptr nocapture nonnull align 64 dereferenceable(256) %s) local_unnamed_addr #2 { +; CHECK-LABEL: _Z16test_fifo_ld_popRP23v64bfp16ebs16_unalignedR12fifo_state_t: +; CHECK: .p2align 4 +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: lda p0, [p0, #0]; mov dj0, #128 +; CHECK-NEXT: lda r24, [p1, dj0]; nopx +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: mov p3, p0 +; CHECK-NEXT: vlda lfh0, [p1, #64] +; CHECK-NEXT: vlda lfl0, [p1, #0] +; CHECK-NEXT: vldb.pop.544 ex0, [p0, lf0, r24] +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: ret lr +; CHECK-NEXT: st r24, [p1, dj0] // Delay Slot 5 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st p0, [p3, #0] // Delay Slot 2 +; CHECK-NEXT: nop // Delay Slot 1 +entry: + %pos1.i = getelementptr inbounds i8, ptr %s, i20 128 + %0 = load ptr, ptr %p, align 4, !tbaa !2 + %1 = load <32 x i32>, ptr %s, align 64, !tbaa !6 + %2 = load i32, ptr 
%pos1.i, align 64, !tbaa !7
+  %3 = tail call { ptr, <32 x i32>, i32, <64 x i8>, <8 x i8> } @llvm.aie2p.fifo.ld.pop.544.bfp16(ptr %0, <32 x i32> %1, i32 %2, <64 x i8> undef, <8 x i8> undef)
+  %4 = extractvalue { ptr, <32 x i32>, i32, <64 x i8>, <8 x i8> } %3, 0
+  %5 = extractvalue { ptr, <32 x i32>, i32, <64 x i8>, <8 x i8> } %3, 1
+  %6 = extractvalue { ptr, <32 x i32>, i32, <64 x i8>, <8 x i8> } %3, 2
+  %7 = extractvalue { ptr, <32 x i32>, i32, <64 x i8>, <8 x i8> } %3, 3
+  %8 = extractvalue { ptr, <32 x i32>, i32, <64 x i8>, <8 x i8> } %3, 4
+  store <32 x i32> %5, ptr %s, align 128
+  store i32 %6, ptr %pos1.i, align 64
+  store ptr %4, ptr %p, align 4
+  %.fca.0.insert.i = insertvalue %struct.v64bfp16ebs16 poison, <64 x i8> %7, 0
+  %.fca.1.insert.i = insertvalue %struct.v64bfp16ebs16 %.fca.0.insert.i, <8 x i8> %8, 1
+  ret %struct.v64bfp16ebs16 %.fca.1.insert.i
+}
+
+define dso_local %struct.v64bfp16ebs16 @_Z24test_fifo_ld_pop_1d_byteRP23v64bfp16ebs16_unalignedR12fifo_state_ti(ptr nocapture nonnull align 4 dereferenceable(4) %p, ptr nocapture nonnull align 64 dereferenceable(256) %s, i32 noundef %off) local_unnamed_addr #2 {
+; CHECK-LABEL: _Z24test_fifo_ld_pop_1d_byteRP23v64bfp16ebs16_unalignedR12fifo_state_ti:
+; CHECK: .p2align 4
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: lda p0, [p0, #0]; mov dj0, #128
+; CHECK-NEXT: lda r24, [p1, dj0]
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: mov p3, p0
+; CHECK-NEXT: vlda lfh0, [p1, #64]
+; CHECK-NEXT: vlda lfl0, [p1, #0]; mov m0, r0
+; CHECK-NEXT: vldb.pop.544 ex0, [p0, lf0, r24, m0]
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: ret lr
+; CHECK-NEXT: st r24, [p1, dj0] // Delay Slot 5
+; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 4
+; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 3
+; CHECK-NEXT: st p0, [p3, #0] // Delay Slot 2
+; CHECK-NEXT: nop // Delay Slot 1
+entry: ; 1d case: load the pointer from %p, the <32 x i32> from %s and the i32 at %s+128, call the 544.1d.bfp16 pop intrinsic, store result fields 0-2 back and return fields 3-4 as the struct
+  %pos1.i = getelementptr inbounds i8, ptr %s, i20 128 ; address of the i32 at byte offset 128 from %s
+  %0 = load ptr, ptr %p, align 4, !tbaa !2
+  %1 = load <32 x i32>, ptr %s, align 64, !tbaa !6
+  %2 = load i32, ptr %pos1.i, align 64, !tbaa !7
+  %3 = trunc i32 %off to i20 ; %off narrowed to match the intrinsic's i20 operand
+  %4 = tail call { ptr, <32 x i32>, i32, <64 x i8>, <8 x i8> } @llvm.aie2p.fifo.ld.pop.544.1d.bfp16(ptr %0, <32 x i32> %1, i32 %2, i20 %3, <64 x i8> undef, <8 x i8> undef) ; the trailing <64 x i8> / <8 x i8> operands are passed as undef
+  %5 = extractvalue { ptr, <32 x i32>, i32, <64 x i8>, <8 x i8> } %4, 0 ; field 0: ptr, stored to %p below
+  %6 = extractvalue { ptr, <32 x i32>, i32, <64 x i8>, <8 x i8> } %4, 1 ; field 1: <32 x i32>, stored to %s below
+  %7 = extractvalue { ptr, <32 x i32>, i32, <64 x i8>, <8 x i8> } %4, 2 ; field 2: i32, stored to %pos1.i below
+  %8 = extractvalue { ptr, <32 x i32>, i32, <64 x i8>, <8 x i8> } %4, 3 ; field 3: <64 x i8> (presumably the bfp16 mantissas - TODO confirm)
+  %9 = extractvalue { ptr, <32 x i32>, i32, <64 x i8>, <8 x i8> } %4, 4 ; field 4: <8 x i8> (presumably the bfp16 exponents - TODO confirm)
+  store <32 x i32> %6, ptr %s, align 128 ; NOTE(review): store asserts align 128 but %s is declared align 64 and is loaded with align 64 above - confirm this is intended
+  store i32 %7, ptr %pos1.i, align 64
+  store ptr %5, ptr %p, align 4
+  %.fca.0.insert.i = insertvalue %struct.v64bfp16ebs16 poison, <64 x i8> %8, 0 ; build the returned struct from result fields 3 and 4
+  %.fca.1.insert.i = insertvalue %struct.v64bfp16ebs16 %.fca.0.insert.i, <8 x i8> %9, 1
+  ret %struct.v64bfp16ebs16 %.fca.1.insert.i
+}
+
+define dso_local %struct.v64bfp16ebs16 @_Z24test_fifo_ld_pop_2d_byteRP23v64bfp16ebs16_unalignedR12fifo_state_tiiRii(ptr nocapture nonnull align 4 dereferenceable(4) %p, ptr nocapture nonnull align 64 dereferenceable(256) %s, i32 noundef %off, i32 noundef %size1, ptr nocapture nonnull align 4 dereferenceable(4) %count1, i32 noundef %inc1) local_unnamed_addr #2 {
+; CHECK-LABEL: _Z24test_fifo_ld_pop_2d_byteRP23v64bfp16ebs16_unalignedR12fifo_state_tiiRii:
+; CHECK: .p2align 4
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: lda dc0, [p2, #0]; nopxm
+; CHECK-NEXT: lda p0, [p0, #0]; mov dj1, #128
+; CHECK-NEXT: lda r24, [p1, dj1]
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: mov m0, r0
+; CHECK-NEXT: mov p4, p0
+; CHECK-NEXT: vlda lfh0, [p1, #64]; mov dn0, r1
+; CHECK-NEXT: vlda lfl0, [p1, #0]; mov dj0, r2
+; CHECK-NEXT: vldb.pop.544.2d ex0, [p0, lf0, r24, d0]
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: st dc0, [p2, #0]; ret lr
+; CHECK-NEXT: st r24, [p1, dj1] // Delay Slot 5
+; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 4
+; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 3
+; CHECK-NEXT: st p0, [p4, #0] // Delay Slot 2
+; CHECK-NEXT: nop // Delay Slot 1
+entry: ; 2d case: same flow as the 1d test, plus three more i20 operands (%size1, the value loaded from %count1, %inc1) and one extra i20 result that is written back to %count1
+  %pos1.i = getelementptr inbounds i8, ptr %s, i20 128 ; address of the i32 at byte offset 128 from %s
+  %0 = load ptr, ptr %p, align 4, !tbaa !2
+  %1 = load <32 x i32>, ptr %s, align 64, !tbaa !6
+  %2 = load i32, ptr %pos1.i, align 64, !tbaa !7
+  %3 = trunc i32 %off to i20 ; i32 arguments are narrowed to match the intrinsic's i20 operands
+  %4 = trunc i32 %size1 to i20
+  %5 = load i32, ptr %count1, align 4, !tbaa !7 ; current value behind %count1, narrowed to i20 on the next line
+  %6 = trunc i32 %5 to i20
+  %7 = trunc i32 %inc1 to i20
+  %8 = tail call { ptr, <32 x i32>, i32, i20, <64 x i8>, <8 x i8> } @llvm.aie2p.fifo.ld.pop.544.2d.bfp16(ptr %0, <32 x i32> %1, i32 %2, i20 %3, i20 %4, i20 %6, i20 %7, <64 x i8> undef, <8 x i8> undef) ; i20 operand order: off, size1, count1, inc1; trailing vector operands are undef
+  %9 = extractvalue { ptr, <32 x i32>, i32, i20, <64 x i8>, <8 x i8> } %8, 0 ; field 0: ptr, stored to %p below
+  %10 = extractvalue { ptr, <32 x i32>, i32, i20, <64 x i8>, <8 x i8> } %8, 1 ; field 1: <32 x i32>, stored to %s below
+  %11 = extractvalue { ptr, <32 x i32>, i32, i20, <64 x i8>, <8 x i8> } %8, 2 ; field 2: i32, stored to %pos1.i below
+  %12 = extractvalue { ptr, <32 x i32>, i32, i20, <64 x i8>, <8 x i8> } %8, 4 ; field 4: <64 x i8> (presumably the bfp16 mantissas - TODO confirm)
+  %13 = extractvalue { ptr, <32 x i32>, i32, i20, <64 x i8>, <8 x i8> } %8, 5 ; field 5: <8 x i8> (presumably the bfp16 exponents - TODO confirm)
+  %14 = extractvalue { ptr, <32 x i32>, i32, i20, <64 x i8>, <8 x i8> } %8, 3 ; field 3: i20, zero-extended and stored to %count1 below
+  %15 = zext i20 %14 to i32
+  store i32 %15, ptr %count1, align 4
+  store <32 x i32> %10, ptr %s, align 128 ; NOTE(review): store asserts align 128 but %s is declared align 64 and is loaded with align 64 above - confirm this is intended
+  store i32 %11, ptr %pos1.i, align 64
+  store ptr %9, ptr %p, align 4
+  %.fca.0.insert.i = insertvalue %struct.v64bfp16ebs16 poison, <64 x i8> %12, 0 ; build the returned struct from result fields 4 and 5
+  %.fca.1.insert.i = insertvalue %struct.v64bfp16ebs16 %.fca.0.insert.i, <8 x i8> %13, 1
+  ret %struct.v64bfp16ebs16 %.fca.1.insert.i
+}
+
+define dso_local %struct.v64bfp16ebs16 @_Z24test_fifo_ld_pop_3d_byteRP23v64bfp16ebs16_unalignedR12fifo_state_tiiRiiiS4_i(ptr nocapture nonnull align 4 dereferenceable(4) %p, ptr nocapture nonnull align 64 dereferenceable(256) %s, i32 noundef %off, i32 noundef %size1, ptr nocapture nonnull align 4 dereferenceable(4) %count1, i32 noundef %inc1, i32 noundef %size2, ptr nocapture nonnull align 4 dereferenceable(4) %count2, i32 noundef %inc2) local_unnamed_addr #2 {
+; CHECK-LABEL: _Z24test_fifo_ld_pop_3d_byteRP23v64bfp16ebs16_unalignedR12fifo_state_tiiRiiiS4_i:
+; CHECK: .p2align 4
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: lda dc0, [p2, #0]; nopb ; nops ; nopxm ; nopv
+; CHECK-NEXT: lda dc4, [p3, #0]
+; CHECK-NEXT: lda p0, [p0, #0]; mov dj1, #128
+; CHECK-NEXT: lda r24, [p1, dj1]
+; CHECK-NEXT: mov m0, r0
+; CHECK-NEXT: mov dn0, r1
+; CHECK-NEXT: mov dj0, r2
+; CHECK-NEXT: mov p4, p0
+; CHECK-NEXT: vlda lfh0, [p1, #64]; mov dn4, r3
+; CHECK-NEXT: vlda lfl0, [p1, #0]; mov dj4, r4
+; CHECK-NEXT: vldb.pop.544.3d ex0, [p0, lf0, r24, d0]
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: st dc0, [p2, #0]
+; CHECK-NEXT: st dc4, [p3, #0]; ret lr
+; CHECK-NEXT: st r24, [p1, dj1] // Delay Slot 5
+; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 4
+; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 3
+; CHECK-NEXT: st p0, [p4, #0] // Delay Slot 2
+; CHECK-NEXT: nop // Delay Slot 1
+entry: ; 3d case: same flow as the 2d test, with a second (%size2, value from %count2, %inc2) operand triple and a second i20 result that is written back to %count2
+  %pos1.i = getelementptr inbounds i8, ptr %s, i20 128 ; address of the i32 at byte offset 128 from %s
+  %0 = load ptr, ptr %p, align 4, !tbaa !2
+  %1 = load <32 x i32>, ptr %s, align 64, !tbaa !6
+  %2 = load i32, ptr %pos1.i, align 64, !tbaa !7
+  %3 = trunc i32 %off to i20 ; i32 arguments are narrowed to match the intrinsic's i20 operands
+  %4 = trunc i32 %size1 to i20
+  %5 = load i32, ptr %count1, align 4, !tbaa !7 ; current value behind %count1, narrowed to i20 on the next line
+  %6 = trunc i32 %5 to i20
+  %7 = trunc i32 %inc1 to i20
+  %8 = trunc i32 %size2 to i20
+  %9 = load i32, ptr %count2, align 4, !tbaa !7 ; current value behind %count2, narrowed to i20 on the next line
+  %10 = trunc i32 %9 to i20
+  %11 = trunc i32 %inc2 to i20
+  %12 = tail call { ptr, <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } @llvm.aie2p.fifo.ld.pop.544.3d.bfp16(ptr %0, <32 x i32> %1, i32 %2, i20 %3, i20 %4, i20 %6, i20 %7, i20 %8, i20 %10, i20 %11, <64 x i8> undef, <8 x i8> undef) ; i20 operand order: off, size1, count1, inc1, size2, count2, inc2; trailing vector operands are undef
+  %13 = extractvalue { ptr, <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } %12, 0 ; field 0: ptr, stored to %p below
+  %14 = extractvalue { ptr, <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } %12, 1 ; field 1: <32 x i32>, stored to %s below
+  %15 = extractvalue { ptr, <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } %12, 2 ; field 2: i32, stored to %pos1.i below
+  %16 = extractvalue { ptr, <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } %12, 5 ; field 5: <64 x i8> (presumably the bfp16 mantissas - TODO confirm)
+  %17 = extractvalue { ptr, <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } %12, 6 ; field 6: <8 x i8> (presumably the bfp16 exponents - TODO confirm)
+  %18 = extractvalue { ptr, <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } %12, 3 ; field 3: i20, zero-extended and stored to %count1 below
+  %19 = zext i20 %18 to i32
+  %20 = extractvalue { ptr, <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } %12, 4 ; field 4: i20, zero-extended and stored to %count2 below
+  %21 = zext i20 %20 to i32
+  store i32 %19, ptr %count1, align 4
+  store i32 %21, ptr %count2, align 4
+  store <32 x i32> %14, ptr %s, align 128 ; NOTE(review): store asserts align 128 but %s is declared align 64 and is loaded with align 64 above - confirm this is intended
+  store i32 %15, ptr %pos1.i, align 64
+  store ptr %13, ptr %p, align 4
+  %.fca.0.insert.i = insertvalue %struct.v64bfp16ebs16 poison, <64 x i8> %16, 0 ; build the returned struct from result fields 5 and 6
+  %.fca.1.insert.i = insertvalue %struct.v64bfp16ebs16 %.fca.0.insert.i, <8 x i8> %17, 1
+  ret %struct.v64bfp16ebs16 %.fca.1.insert.i
+}
+
+
+
+declare { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.ld.fill(ptr, <32 x i32>, i32) #5
+declare { <64 x i8>, ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.ld.pop.unaligned(ptr, <32 x i32>, i32) #5
+declare { <64 x i8>, ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.ld.pop.1d.unaligned(ptr, <32 x i32>, i32, i20) #5
+declare { <64 x i8>, ptr, <32 x i32>, i32, i20 } @llvm.aie2p.fifo.ld.pop.2d.unaligned(ptr, <32 x i32>, i32, i20, i20, i20, i20) #5
+declare { <64 x i8>, ptr, <32 x i32>, i32, i20, i20 } @llvm.aie2p.fifo.ld.pop.3d.unaligned(ptr, <32 x i32>, i32, i20, i20, i20, i20, i20, i20, i20) #5
+declare { ptr, <32 x i32>, i32, <64 x i8>, <8 x i8> } @llvm.aie2p.fifo.ld.pop.576.bfp16(ptr, <32 x i32>, i32, <64 x i8>, <8 x i8>) #5
+declare { ptr, <32 x i32>, i32, <64 x i8>, <8 x i8> } @llvm.aie2p.fifo.ld.pop.576.1d.bfp16(ptr, <32 x i32>, i32, i20, <64 x i8>, <8 x i8>) #5
+declare { ptr, <32 x i32>, i32, i20, <64 x i8>, <8 x i8> } @llvm.aie2p.fifo.ld.pop.576.2d.bfp16(ptr, <32 x i32>, i32, i20, i20, i20, i20, <64 x i8>, <8 x i8>) #5
+declare { ptr, <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } @llvm.aie2p.fifo.ld.pop.576.3d.bfp16(ptr, <32 x i32>, i32, i20, i20, i20, i20, i20, i20, i20, <64 x i8>, <8 x i8>) #5
+declare { ptr, <32 x i32>, i32, <64 x i8>, <8 x i8> } @llvm.aie2p.fifo.ld.pop.544.bfp16(ptr, <32 x i32>, i32, <64 x i8>, <8 x i8>) #5
+declare { ptr, <32 x i32>, i32, <64 x i8>, <8 x i8> } @llvm.aie2p.fifo.ld.pop.544.1d.bfp16(ptr, <32 x i32>, i32, i20, <64 x i8>, <8 x i8>) #5
+declare { ptr, <32 x i32>, i32, i20, <64 x i8>, <8 x i8> } @llvm.aie2p.fifo.ld.pop.544.2d.bfp16(ptr, <32 x i32>, i32, i20, i20, i20, i20, <64 x i8>, <8 x i8>) #5
+declare { ptr, <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } @llvm.aie2p.fifo.ld.pop.544.3d.bfp16(ptr, <32 x i32>, i32, i20, i20, i20, i20, i20, i20, i20, <64 x i8>, <8 x i8>) #5
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 19.0.0git (git@gitenterprise.xilinx.com:XRLabs/llvm-aie.git 35e685a28ac5e78c6c8a5ab733508f3e0aaedf24)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"any pointer", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C++ TBAA"}
+!6 = !{!4, !4, i64 0}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"int", !4, i64 0}