diff --git a/llvm/lib/Target/AIE/aie2p/AIE2PInstructionSelector.cpp b/llvm/lib/Target/AIE/aie2p/AIE2PInstructionSelector.cpp index 6f9c10215b15..983bec98a5d3 100644 --- a/llvm/lib/Target/AIE/aie2p/AIE2PInstructionSelector.cpp +++ b/llvm/lib/Target/AIE/aie2p/AIE2PInstructionSelector.cpp @@ -70,6 +70,7 @@ class AIE2PInstructionSelector : public AIEBaseInstructionSelector { bool selectG_CONCAT_VECTORS(MachineInstr &I, MachineRegisterInfo &MRI); bool selectCascadeStreamInsn(MachineInstr &I, MachineRegisterInfo &MRI, bool isWrite); + bool selectVST_FIFO(MachineInstr &I, MachineRegisterInfo &MRI); static const char *getName() { return DEBUG_TYPE; } @@ -363,6 +364,18 @@ bool AIE2PInstructionSelector::select(MachineInstr &I) { case Intrinsic::aie2p_v64accfloat_to_v64bfp16ebs16: case Intrinsic::aie2p_v64bfp16ebs8_to_v64bfp16ebs16: return selectVCONVbfp16(I, MRI); + case Intrinsic::aie2p_fifo_st_push_576_bfp16: + case Intrinsic::aie2p_fifo_st_push_544_bfp16: + case Intrinsic::aie2p_fifo_st_push_512_bfp16: + case Intrinsic::aie2p_fifo_st_flush: + case Intrinsic::aie2p_fifo_st_flush_conv: + case Intrinsic::aie2p_fifo_st_flush_1d: + case Intrinsic::aie2p_fifo_st_flush_1d_conv: + case Intrinsic::aie2p_fifo_st_flush_2d: + case Intrinsic::aie2p_fifo_st_flush_2d_conv: + case Intrinsic::aie2p_fifo_st_flush_3d: + case Intrinsic::aie2p_fifo_st_flush_3d_conv: + return selectVST_FIFO(I, MRI); default: return selectImpl(I, *CoverageInfo); } @@ -3994,6 +4007,156 @@ bool AIE2PInstructionSelector ::selectVSHUFFLE_BFP(MachineInstr &I, return constrainSelectedInstRegOperands(*MI, TII, TRI, RBI); } +unsigned int getStoreFifoOpcode(MachineInstr &I) { + switch (cast(I).getIntrinsicID()) { + case Intrinsic::aie2p_fifo_st_flush: + return AIE2P::VST_FLUSH_512_normal_flush; + case Intrinsic::aie2p_fifo_st_flush_1d: + return AIE2P::VST_FLUSH_512_fifo_1d_flush; + case Intrinsic::aie2p_fifo_st_flush_2d: + return AIE2P::VST_FLUSH_512_2D; + case Intrinsic::aie2p_fifo_st_flush_3d: + return 
AIE2P::VST_FLUSH_512_3D; + case Intrinsic::aie2p_fifo_st_flush_conv: + return AIE2P::VST_FLUSH_512_CONV_normal_flush; + case Intrinsic::aie2p_fifo_st_flush_1d_conv: + return AIE2P::VST_FLUSH_512_CONV_fifo_1d_flush; + case Intrinsic::aie2p_fifo_st_flush_2d_conv: + return AIE2P::VST_FLUSH_512_CONV_2D; + case Intrinsic::aie2p_fifo_st_flush_3d_conv: + return AIE2P::VST_FLUSH_512_CONV_3D; + case Intrinsic::aie2p_fifo_st_push_576_bfp16: + return AIE2P::VST_PUSH_576; + case Intrinsic::aie2p_fifo_st_push_544_bfp16: + return AIE2P::VST_PUSH_544; + case Intrinsic::aie2p_fifo_st_push_512_bfp16: + return AIE2P::VST_PUSH_512; + } + llvm_unreachable("Unreachable: Cannot get fifo store opcode from intrinsic"); + return AIE2P::INSTRUCTION_LIST_END; +} + +bool AIE2PInstructionSelector::selectVST_FIFO(MachineInstr &I, + MachineRegisterInfo &MRI) { + auto IntrinsicID = cast(I).getIntrinsicID(); + MachineInstrBuilder MI; + Register PtrOut = I.getOperand(0).getReg(); + Register FifoOut = I.getOperand(1).getReg(); + Register AvailOut = I.getOperand(2).getReg(); + switch (IntrinsicID) { + case Intrinsic::aie2p_fifo_st_push_512_bfp16: { + Register PtrIn = I.getOperand(4).getReg(); + Register FifoIn = I.getOperand(6).getReg(); + Register AvailIn = I.getOperand(7).getReg(); + Register VecIn = I.getOperand(5).getReg(); + + MI = MIB.buildInstr(getStoreFifoOpcode(I), {FifoOut, PtrOut, AvailOut}, + {FifoIn, VecIn, PtrIn, AvailIn}); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*MI, TII, TRI, RBI); + } + case Intrinsic::aie2p_fifo_st_push_544_bfp16: + case Intrinsic::aie2p_fifo_st_push_576_bfp16: { + Register PtrIn = I.getOperand(4).getReg(); + Register FifoIn = I.getOperand(7).getReg(); + Register AvailIn = I.getOperand(8).getReg(); + Register MantIn = I.getOperand(5).getReg(); + Register ExpIn = I.getOperand(6).getReg(); + + Register SrcReg = MRI.createVirtualRegister(&AIE2P::mEXaRegClass); + MIB.buildInstr(TargetOpcode::REG_SEQUENCE, {SrcReg}, {}) + .addReg(MantIn) + 
.addImm(AIE2P::sub_bfp16_x) + .addReg(ExpIn) + .addImm(AIE2P::sub_bfp16_e); + MI = MIB.buildInstr(getStoreFifoOpcode(I), {FifoOut, PtrOut, AvailOut}, + {FifoIn, SrcReg, PtrIn, AvailIn}); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*MI, TII, TRI, RBI); + } + case Intrinsic::aie2p_fifo_st_flush_conv: + case Intrinsic::aie2p_fifo_st_flush: { + Register PtrIn = I.getOperand(4).getReg(); + Register FifoIn = I.getOperand(5).getReg(); + Register AvailIn = I.getOperand(6).getReg(); + MI = MIB.buildInstr(getStoreFifoOpcode(I), {FifoOut, PtrOut, AvailOut}, + {FifoIn, PtrIn, AvailIn}); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*MI, TII, TRI, RBI); + } + case Intrinsic::aie2p_fifo_st_flush_1d: + case Intrinsic::aie2p_fifo_st_flush_1d_conv: { + Register PtrIn = I.getOperand(4).getReg(); + Register FifoIn = I.getOperand(5).getReg(); + Register AvailIn = I.getOperand(6).getReg(); + Register OffsetReg = I.getOperand(7).getReg(); + MI = MIB.buildInstr(getStoreFifoOpcode(I), {FifoOut, PtrOut, AvailOut}, + {FifoIn, PtrIn, AvailIn, OffsetReg}); + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*MI, TII, TRI, RBI); + } + case Intrinsic::aie2p_fifo_st_flush_2d: + case Intrinsic::aie2p_fifo_st_flush_2d_conv: { + Register CountOut1Reg = I.getOperand(3).getReg(); + Register PtrIn = I.getOperand(5).getReg(); + Register FifoIn = I.getOperand(6).getReg(); + Register AvailIn = I.getOperand(7).getReg(); + Register OffsetReg = I.getOperand(8).getReg(); + Register SizeReg = I.getOperand(9).getReg(); + Register CountIn1Reg = I.getOperand(10).getReg(); + Register IncrReg = I.getOperand(11).getReg(); + if (!RBI.constrainGenericRegister(CountOut1Reg, AIE2P::eDCRegClass, MRI)) + return false; + Register DReg = + createDRegSequence(OffsetReg, IncrReg, SizeReg, CountIn1Reg, MRI); + + MI = MIB.buildInstr(getStoreFifoOpcode(I), + {FifoOut, PtrOut, AvailOut, CountOut1Reg}, + {FifoIn, PtrIn, AvailIn, DReg}); + + I.eraseFromParent(); + return 
constrainSelectedInstRegOperands(*MI, TII, TRI, RBI); + } + case Intrinsic::aie2p_fifo_st_flush_3d: + case Intrinsic::aie2p_fifo_st_flush_3d_conv: { + Register CountOut1Reg = I.getOperand(3).getReg(); + Register CountOut2Reg = I.getOperand(4).getReg(); + + Register PtrIn = I.getOperand(6).getReg(); + Register FifoIn = I.getOperand(7).getReg(); + Register AvailIn = I.getOperand(8).getReg(); + Register OffsetReg = I.getOperand(9).getReg(); + Register Size1Reg = I.getOperand(10).getReg(); + Register CountIn1Reg = I.getOperand(11).getReg(); + Register Incr1Reg = I.getOperand(12).getReg(); + Register Size2Reg = I.getOperand(13).getReg(); + Register CountIn2Reg = I.getOperand(14).getReg(); + Register Incr2Reg = I.getOperand(15).getReg(); + + if (!RBI.constrainGenericRegister(CountOut1Reg, *TRI.getAddrCountRegClass(), + MRI) || + !RBI.constrainGenericRegister(CountOut2Reg, *TRI.getAddrCountRegClass(), + MRI)) + return false; + Register DSReg = + createDSRegSequence(OffsetReg, Incr1Reg, Incr2Reg, Size1Reg, + CountIn1Reg, Size2Reg, CountIn2Reg, MRI); + + MI = MIB.buildInstr(getStoreFifoOpcode(I), + {FifoOut, PtrOut, AvailOut, CountOut1Reg, CountOut2Reg}, + {FifoIn, PtrIn, AvailIn, DSReg}); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*MI, TII, TRI, RBI); + } + return false; + } + return false; +} + namespace llvm { InstructionSelector * createAIE2PInstructionSelector(const AIE2PTargetMachine &TM, diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-fifo-stores.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-fifo-stores.mir new file mode 100644 index 000000000000..e8b4a722e8df --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-fifo-stores.mir @@ -0,0 +1,391 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates +# RUN: llc -mtriple aie2p -run-pass=instruction-select %s -verify-machineinstrs -o - | FileCheck %s + + +--- +name: test_fifo_st_push # llvm.aie2p.fifo.st.push.512.bfp16 is expected to select to VST_PUSH_512 +tracksRegLiveness: true +legalized: true +regBankSelected: true +body: | + bb.1.entry: + liveins: $p0, $p1, $x0 + ; CHECK-LABEL: name: test_fifo_st_push + ; CHECK: liveins: $p0, $p1, $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vec512 = COPY $x0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:mpfs = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:mstfifo = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:mr26_fifo_st = IMPLICIT_DEF + ; CHECK-NEXT: [[VST_PUSH_512_:%[0-9]+]]:mstfifo, [[VST_PUSH_512_1:%[0-9]+]]:mpfs, [[VST_PUSH_512_2:%[0-9]+]]:mr26_fifo_st = VST_PUSH_512 [[DEF1]], [[COPY]], [[DEF]], [[DEF2]], implicit-def $srfifo_of + ; CHECK-NEXT: PseudoRET implicit $lr + %0:ptrregbank(p0) = COPY $p0 + %1:vregbank(<64 x s8>) = COPY $x0 + %2:ptrregbank(p0) = COPY $p1 + %3:modregbank(s20) = G_CONSTANT i20 128 + %5:ptrregbank(p0) = IMPLICIT_DEF + %6:vregbank(<16 x s32>) = G_BITCAST %1:vregbank(<64 x s8>) + %7:fiforegbank(<32 x s32>) = IMPLICIT_DEF + %8:gprregbank(s32) = IMPLICIT_DEF + %9:ptrregbank(p0), %10:fiforegbank(<32 x s32>), %11:gprregbank(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.fifo.st.push.512.bfp16), %5:ptrregbank(p0), %6:vregbank(<16 x s32>), %7:fiforegbank(<32 x s32>), %8:gprregbank(s32) + PseudoRET implicit $lr +... 
+ +--- +name: test_fifo_st_flush # llvm.aie2p.fifo.st.flush is expected to select to VST_FLUSH_512_normal_flush +tracksRegLiveness: true +legalized: true +regBankSelected: true +body: | + bb.1.entry: + liveins: $p0, $p1 + ; CHECK-LABEL: name: test_fifo_st_flush + ; CHECK: liveins: $p0, $p1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:mpfs = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:mstfifo = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:mr26_fifo_st = IMPLICIT_DEF + ; CHECK-NEXT: [[VST_FLUSH_512_normal_flush:%[0-9]+]]:mstfifo, [[VST_FLUSH_512_normal_flush1:%[0-9]+]]:mpfs, [[VST_FLUSH_512_normal_flush2:%[0-9]+]]:mr26_fifo_st = VST_FLUSH_512_normal_flush [[DEF1]], [[DEF]], [[DEF2]], implicit-def $srfifo_of + ; CHECK-NEXT: PseudoRET implicit $lr + %0:ptrregbank(p0) = COPY $p0 + %1:ptrregbank(p0) = COPY $p1 + %2:modregbank(s20) = G_CONSTANT i20 128 + %4:ptrregbank(p0) = IMPLICIT_DEF + %5:fiforegbank(<32 x s32>) = IMPLICIT_DEF + %6:gprregbank(s32) = IMPLICIT_DEF + %7:ptrregbank(p0), %8:fiforegbank(<32 x s32>), %9:gprregbank(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.fifo.st.flush), %4:ptrregbank(p0), %5:fiforegbank(<32 x s32>), %6:gprregbank(s32) + PseudoRET implicit $lr +... 
+ +--- +name: test_fifo_st_flush_1d_byte # llvm.aie2p.fifo.st.flush.1d is expected to select to VST_FLUSH_512_fifo_1d_flush +tracksRegLiveness: true +legalized: true +regBankSelected: true +body: | + bb.1.entry: + liveins: $p0, $p1, $r0 + ; CHECK-LABEL: name: test_fifo_st_flush_1d_byte + ; CHECK: liveins: $p0, $p1, $r0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:mpfs = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:mstfifo = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:mr26_fifo_st = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:em = IMPLICIT_DEF + ; CHECK-NEXT: [[VST_FLUSH_512_fifo_1d_flush:%[0-9]+]]:mstfifo, [[VST_FLUSH_512_fifo_1d_flush1:%[0-9]+]]:mpfs, [[VST_FLUSH_512_fifo_1d_flush2:%[0-9]+]]:mr26_fifo_st = VST_FLUSH_512_fifo_1d_flush [[DEF1]], [[DEF]], [[DEF2]], [[DEF3]], implicit-def $srfifo_of + ; CHECK-NEXT: PseudoRET implicit $lr + %0:ptrregbank(p0) = COPY $p0 + %1:ptrregbank(p0) = COPY $p1 + %2:gprregbank(s32) = COPY $r0 + %3:modregbank(s20) = G_CONSTANT i20 128 + %5:ptrregbank(p0) = IMPLICIT_DEF + %6:fiforegbank(<32 x s32>) = IMPLICIT_DEF + %7:gprregbank(s32) = IMPLICIT_DEF + %8:modregbank(s20) = IMPLICIT_DEF + %9:ptrregbank(p0), %10:fiforegbank(<32 x s32>), %11:gprregbank(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.fifo.st.flush.1d), %5:ptrregbank(p0), %6:fiforegbank(<32 x s32>), %7:gprregbank(s32), %8:modregbank(s20) + PseudoRET implicit $lr +... 
+ +--- +name: test_fifo_st_flush_2d_byte # NOTE(review): despite the name this exercises llvm.aie2p.fifo.st.flush.2d.conv (VST_FLUSH_512_CONV_2D); looks swapped with test_fifo_st_flush_conv_2d_byte - confirm +tracksRegLiveness: true +legalized: true +regBankSelected: true +body: | + bb.1.entry: + liveins: $p0, $p1, $p2, $r0, $r1, $r2 + ; CHECK-LABEL: name: test_fifo_st_flush_2d_byte + ; CHECK: liveins: $p0, $p1, $p2, $r0, $r1, $r2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:mpfs = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:mstfifo = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:mr26_fifo_st = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:em = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF4:%[0-9]+]]:edn = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF5:%[0-9]+]]:edc = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF6:%[0-9]+]]:edj = IMPLICIT_DEF + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ed = REG_SEQUENCE [[DEF3]], %subreg.sub_mod, [[DEF4]], %subreg.sub_dim_size, [[DEF6]], %subreg.sub_dim_stride, [[DEF5]], %subreg.sub_dim_count + ; CHECK-NEXT: [[VST_FLUSH_512_CONV_2D:%[0-9]+]]:mstfifo, [[VST_FLUSH_512_CONV_2D1:%[0-9]+]]:mpfs, [[VST_FLUSH_512_CONV_2D2:%[0-9]+]]:mr26_fifo_st, [[VST_FLUSH_512_CONV_2D3:%[0-9]+]]:edc = VST_FLUSH_512_CONV_2D [[DEF1]], [[DEF]], [[DEF2]], [[REG_SEQUENCE]], implicit-def $srfifo_of + ; CHECK-NEXT: PseudoRET implicit $lr + %0:ptrregbank(p0) = COPY $p0 + %1:ptrregbank(p0) = COPY $p1 + %2:gprregbank(s32) = COPY $r0 + %3:gprregbank(s32) = COPY $r1 + %4:ptrregbank(p0) = COPY $p2 + %5:gprregbank(s32) = COPY $r2 + %6:modregbank(s20) = G_CONSTANT i20 128 + %8:ptrregbank(p0) = IMPLICIT_DEF + %9:fiforegbank(<32 x s32>) = IMPLICIT_DEF + %10:gprregbank(s32) = IMPLICIT_DEF + %11:modregbank(s20) = IMPLICIT_DEF + %12:modregbank(s20) = IMPLICIT_DEF + %13:gprregbank(s32) = IMPLICIT_DEF + %14:modregbank(s20) = IMPLICIT_DEF + %15:modregbank(s20) = IMPLICIT_DEF + %16:ptrregbank(p0), %17:fiforegbank(<32 x s32>), %18:gprregbank(s32), %19:modregbank(s20) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.fifo.st.flush.2d.conv), %8:ptrregbank(p0), %9:fiforegbank(<32 x s32>), %10:gprregbank(s32), %11:modregbank(s20), %12:modregbank(s20), 
%14:modregbank(s20), %15:modregbank(s20) + PseudoRET implicit $lr +... + +--- +name: test_fifo_st_flush_3d_byte # llvm.aie2p.fifo.st.flush.3d is expected to select to VST_FLUSH_512_3D +tracksRegLiveness: true +legalized: true +regBankSelected: true +body: | + bb.1.entry: + liveins: $p0, $p1, $p2, $p3, $r0, $r1, $r2, $r3, $r4 + ; CHECK-LABEL: name: test_fifo_st_flush_3d_byte + ; CHECK: liveins: $p0, $p1, $p2, $p3, $r0, $r1, $r2, $r3, $r4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:mpfs = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:mstfifo = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:mr26_fifo_st = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:em = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF4:%[0-9]+]]:edn = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF5:%[0-9]+]]:edc = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF6:%[0-9]+]]:edj = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF7:%[0-9]+]]:edn = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF8:%[0-9]+]]:edc = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF9:%[0-9]+]]:edj = IMPLICIT_DEF + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:eds = REG_SEQUENCE [[DEF3]], %subreg.sub_mod, [[DEF4]], %subreg.sub_dim_size, [[DEF6]], %subreg.sub_dim_stride, [[DEF5]], %subreg.sub_dim_count, [[DEF7]], %subreg.sub_hi_dim_then_sub_dim_size, [[DEF9]], %subreg.sub_hi_dim_then_sub_dim_stride, [[DEF8]], %subreg.sub_hi_dim_then_sub_dim_count + ; CHECK-NEXT: [[VST_FLUSH_512_3D:%[0-9]+]]:mstfifo, [[VST_FLUSH_512_3D1:%[0-9]+]]:mpfs, [[VST_FLUSH_512_3D2:%[0-9]+]]:mr26_fifo_st, [[VST_FLUSH_512_3D3:%[0-9]+]]:edcl, [[VST_FLUSH_512_3D4:%[0-9]+]]:edch = VST_FLUSH_512_3D [[DEF1]], [[DEF]], [[DEF2]], [[REG_SEQUENCE]], implicit-def $srfifo_of + ; CHECK-NEXT: PseudoRET implicit $lr + %0:ptrregbank(p0) = COPY $p0 + %1:ptrregbank(p0) = COPY $p1 + %2:gprregbank(s32) = COPY $r0 + %3:gprregbank(s32) = COPY $r1 + %4:ptrregbank(p0) = COPY $p2 + %5:gprregbank(s32) = COPY $r2 + %6:gprregbank(s32) = COPY $r3 + %7:ptrregbank(p0) = COPY $p3 + %8:gprregbank(s32) = COPY $r4 + %9:modregbank(s20) = G_CONSTANT i20 128 + %11:ptrregbank(p0) = IMPLICIT_DEF + %12:fiforegbank(<32 x s32>) = IMPLICIT_DEF + 
%13:gprregbank(s32) = IMPLICIT_DEF + %14:modregbank(s20) = IMPLICIT_DEF + %15:modregbank(s20) = IMPLICIT_DEF + %16:gprregbank(s32) = IMPLICIT_DEF + %17:modregbank(s20) = IMPLICIT_DEF + %18:modregbank(s20) = IMPLICIT_DEF + %19:modregbank(s20) = IMPLICIT_DEF + %20:gprregbank(s32) = IMPLICIT_DEF + %21:modregbank(s20) = IMPLICIT_DEF + %22:modregbank(s20) = IMPLICIT_DEF + %23:ptrregbank(p0), %24:fiforegbank(<32 x s32>), %25:gprregbank(s32), %26:modregbank(s20), %27:modregbank(s20) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.fifo.st.flush.3d), %11:ptrregbank(p0), %12:fiforegbank(<32 x s32>), %13:gprregbank(s32), %14:modregbank(s20), %15:modregbank(s20), %17:modregbank(s20), %18:modregbank(s20), %19:modregbank(s20), %21:modregbank(s20), %22:modregbank(s20) + PseudoRET implicit $lr +... + +--- +name: test_fifo_st_flush_conv # llvm.aie2p.fifo.st.flush.conv is expected to select to VST_FLUSH_512_CONV_normal_flush +tracksRegLiveness: true +legalized: true +regBankSelected: true +body: | + bb.1.entry: + liveins: $p0, $p1 + ; CHECK-LABEL: name: test_fifo_st_flush_conv + ; CHECK: liveins: $p0, $p1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:mpfs = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:mstfifo = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:mr26_fifo_st = IMPLICIT_DEF + ; CHECK-NEXT: [[VST_FLUSH_512_CONV_normal_flush:%[0-9]+]]:mstfifo, [[VST_FLUSH_512_CONV_normal_flush1:%[0-9]+]]:mpfs, [[VST_FLUSH_512_CONV_normal_flush2:%[0-9]+]]:mr26_fifo_st = VST_FLUSH_512_CONV_normal_flush [[DEF1]], [[DEF]], [[DEF2]], implicit-def $srfifo_of + ; CHECK-NEXT: PseudoRET implicit $lr + %0:ptrregbank(p0) = COPY $p0 + %1:ptrregbank(p0) = COPY $p1 + %2:modregbank(s20) = G_CONSTANT i20 128 + %4:ptrregbank(p0) = IMPLICIT_DEF + %5:fiforegbank(<32 x s32>) = IMPLICIT_DEF + %6:gprregbank(s32) = IMPLICIT_DEF + %7:ptrregbank(p0), %8:fiforegbank(<32 x s32>), %9:gprregbank(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.fifo.st.flush.conv), %4:ptrregbank(p0), %5:fiforegbank(<32 x s32>), %6:gprregbank(s32) + PseudoRET implicit $lr +... 
+ +--- +name: test_fifo_st_flush_conv_1d_byte # llvm.aie2p.fifo.st.flush.1d.conv is expected to select to VST_FLUSH_512_CONV_fifo_1d_flush +tracksRegLiveness: true +legalized: true +regBankSelected: true +body: | + bb.1.entry: + liveins: $p0, $p1, $r0 + ; CHECK-LABEL: name: test_fifo_st_flush_conv_1d_byte + ; CHECK: liveins: $p0, $p1, $r0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:mpfs = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:mstfifo = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:mr26_fifo_st = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:em = IMPLICIT_DEF + ; CHECK-NEXT: [[VST_FLUSH_512_CONV_fifo_1d_flush:%[0-9]+]]:mstfifo, [[VST_FLUSH_512_CONV_fifo_1d_flush1:%[0-9]+]]:mpfs, [[VST_FLUSH_512_CONV_fifo_1d_flush2:%[0-9]+]]:mr26_fifo_st = VST_FLUSH_512_CONV_fifo_1d_flush [[DEF1]], [[DEF]], [[DEF2]], [[DEF3]], implicit-def $srfifo_of + ; CHECK-NEXT: PseudoRET implicit $lr + %0:ptrregbank(p0) = COPY $p0 + %1:ptrregbank(p0) = COPY $p1 + %2:gprregbank(s32) = COPY $r0 + %3:modregbank(s20) = G_CONSTANT i20 128 + %5:ptrregbank(p0) = IMPLICIT_DEF + %6:fiforegbank(<32 x s32>) = IMPLICIT_DEF + %7:gprregbank(s32) = IMPLICIT_DEF + %8:modregbank(s20) = IMPLICIT_DEF + %9:ptrregbank(p0), %10:fiforegbank(<32 x s32>), %11:gprregbank(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.fifo.st.flush.1d.conv), %5:ptrregbank(p0), %6:fiforegbank(<32 x s32>), %7:gprregbank(s32), %8:modregbank(s20) + PseudoRET implicit $lr +... 
+ +--- +name: test_fifo_st_flush_conv_2d_byte # NOTE(review): despite the name this exercises the non-conv llvm.aie2p.fifo.st.flush.2d (VST_FLUSH_512_2D); looks swapped with test_fifo_st_flush_2d_byte - confirm +tracksRegLiveness: true +legalized: true +regBankSelected: true +body: | + bb.1.entry: + liveins: $p0, $p1, $p2, $r0, $r1, $r2 + ; CHECK-LABEL: name: test_fifo_st_flush_conv_2d_byte + ; CHECK: liveins: $p0, $p1, $p2, $r0, $r1, $r2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:mpfs = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:mstfifo = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:mr26_fifo_st = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:em = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF4:%[0-9]+]]:edn = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF5:%[0-9]+]]:edc = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF6:%[0-9]+]]:edj = IMPLICIT_DEF + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ed = REG_SEQUENCE [[DEF3]], %subreg.sub_mod, [[DEF4]], %subreg.sub_dim_size, [[DEF6]], %subreg.sub_dim_stride, [[DEF5]], %subreg.sub_dim_count + ; CHECK-NEXT: [[VST_FLUSH_512_2D:%[0-9]+]]:mstfifo, [[VST_FLUSH_512_2D1:%[0-9]+]]:mpfs, [[VST_FLUSH_512_2D2:%[0-9]+]]:mr26_fifo_st, [[VST_FLUSH_512_2D3:%[0-9]+]]:edc = VST_FLUSH_512_2D [[DEF1]], [[DEF]], [[DEF2]], [[REG_SEQUENCE]], implicit-def $srfifo_of + ; CHECK-NEXT: PseudoRET implicit $lr + %0:ptrregbank(p0) = COPY $p0 + %1:ptrregbank(p0) = COPY $p1 + %2:gprregbank(s32) = COPY $r0 + %3:gprregbank(s32) = COPY $r1 + %4:ptrregbank(p0) = COPY $p2 + %5:gprregbank(s32) = COPY $r2 + %6:modregbank(s20) = G_CONSTANT i20 128 + %8:ptrregbank(p0) = IMPLICIT_DEF + %9:fiforegbank(<32 x s32>) = IMPLICIT_DEF + %10:gprregbank(s32) = IMPLICIT_DEF + %11:modregbank(s20) = IMPLICIT_DEF + %12:modregbank(s20) = IMPLICIT_DEF + %13:gprregbank(s32) = IMPLICIT_DEF + %14:modregbank(s20) = IMPLICIT_DEF + %15:modregbank(s20) = IMPLICIT_DEF + %16:ptrregbank(p0), %17:fiforegbank(<32 x s32>), %18:gprregbank(s32), %19:modregbank(s20) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.fifo.st.flush.2d), %8:ptrregbank(p0), %9:fiforegbank(<32 x s32>), %10:gprregbank(s32), %11:modregbank(s20), %12:modregbank(s20), %14:modregbank(s20), 
%15:modregbank(s20) + PseudoRET implicit $lr +... + +--- +name: test_fifo_st_flush_conv_3d_byte # llvm.aie2p.fifo.st.flush.3d.conv is expected to select to VST_FLUSH_512_CONV_3D +tracksRegLiveness: true +legalized: true +regBankSelected: true +body: | + bb.1.entry: + liveins: $p0, $p1, $p2, $p3, $r0, $r1, $r2, $r3, $r4 + ; CHECK-LABEL: name: test_fifo_st_flush_conv_3d_byte + ; CHECK: liveins: $p0, $p1, $p2, $p3, $r0, $r1, $r2, $r3, $r4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:mpfs = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:mstfifo = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:mr26_fifo_st = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:em = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF4:%[0-9]+]]:edn = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF5:%[0-9]+]]:edc = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF6:%[0-9]+]]:edj = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF7:%[0-9]+]]:edn = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF8:%[0-9]+]]:edc = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF9:%[0-9]+]]:edj = IMPLICIT_DEF + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:eds = REG_SEQUENCE [[DEF3]], %subreg.sub_mod, [[DEF4]], %subreg.sub_dim_size, [[DEF6]], %subreg.sub_dim_stride, [[DEF5]], %subreg.sub_dim_count, [[DEF7]], %subreg.sub_hi_dim_then_sub_dim_size, [[DEF9]], %subreg.sub_hi_dim_then_sub_dim_stride, [[DEF8]], %subreg.sub_hi_dim_then_sub_dim_count + ; CHECK-NEXT: [[VST_FLUSH_512_CONV_3D:%[0-9]+]]:mstfifo, [[VST_FLUSH_512_CONV_3D1:%[0-9]+]]:mpfs, [[VST_FLUSH_512_CONV_3D2:%[0-9]+]]:mr26_fifo_st, [[VST_FLUSH_512_CONV_3D3:%[0-9]+]]:edcl, [[VST_FLUSH_512_CONV_3D4:%[0-9]+]]:edch = VST_FLUSH_512_CONV_3D [[DEF1]], [[DEF]], [[DEF2]], [[REG_SEQUENCE]], implicit-def $srfifo_of + ; CHECK-NEXT: PseudoRET implicit $lr + %0:ptrregbank(p0) = COPY $p0 + %1:ptrregbank(p0) = COPY $p1 + %2:gprregbank(s32) = COPY $r0 + %3:gprregbank(s32) = COPY $r1 + %4:ptrregbank(p0) = COPY $p2 + %5:gprregbank(s32) = COPY $r2 + %6:gprregbank(s32) = COPY $r3 + %7:ptrregbank(p0) = COPY $p3 + %8:gprregbank(s32) = COPY $r4 + %9:modregbank(s20) = G_CONSTANT i20 128 + %11:ptrregbank(p0) = IMPLICIT_DEF + %12:fiforegbank(<32 x 
s32>) = IMPLICIT_DEF + %13:gprregbank(s32) = IMPLICIT_DEF + %14:modregbank(s20) = IMPLICIT_DEF + %15:modregbank(s20) = IMPLICIT_DEF + %16:gprregbank(s32) = IMPLICIT_DEF + %17:modregbank(s20) = IMPLICIT_DEF + %18:modregbank(s20) = IMPLICIT_DEF + %19:modregbank(s20) = IMPLICIT_DEF + %20:gprregbank(s32) = IMPLICIT_DEF + %21:modregbank(s20) = IMPLICIT_DEF + %22:modregbank(s20) = IMPLICIT_DEF + %23:ptrregbank(p0), %24:fiforegbank(<32 x s32>), %25:gprregbank(s32), %26:modregbank(s20), %27:modregbank(s20) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.fifo.st.flush.3d.conv), %11:ptrregbank(p0), %12:fiforegbank(<32 x s32>), %13:gprregbank(s32), %14:modregbank(s20), %15:modregbank(s20), %17:modregbank(s20), %18:modregbank(s20), %19:modregbank(s20), %21:modregbank(s20), %22:modregbank(s20) + PseudoRET implicit $lr +... + +--- +name: test_fifo_st_push_576 # llvm.aie2p.fifo.st.push.576.bfp16 is expected to select to VST_PUSH_576 fed by a REG_SEQUENCE of both sources +tracksRegLiveness: true +legalized: true +regBankSelected: true +body: | + bb.1.entry: + liveins: $e0, $p0, $p1, $x0 + ; CHECK-LABEL: name: test_fifo_st_push_576 + ; CHECK: liveins: $e0, $p0, $p1, $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vec512 = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:el = COPY $e0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:mpfs = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:mstfifo = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:mr26_fifo_st = IMPLICIT_DEF + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:mbp2bp = REG_SEQUENCE [[COPY]], %subreg.sub_bfp16_x, [[COPY1]], %subreg.sub_bfp16_e + ; CHECK-NEXT: [[VST_PUSH_576_:%[0-9]+]]:mstfifo, [[VST_PUSH_576_1:%[0-9]+]]:mpfs, [[VST_PUSH_576_2:%[0-9]+]]:mr26_fifo_st = VST_PUSH_576 [[DEF1]], [[REG_SEQUENCE]], [[DEF]], [[DEF2]], implicit-def $srfifo_of + ; CHECK-NEXT: PseudoRET implicit $lr + %0:ptrregbank(p0) = COPY $p0 + %1:vregbank(<64 x s8>) = COPY $x0 + %2:gprregbank(<8 x s8>) = COPY $e0 + %3:ptrregbank(p0) = COPY $p1 + %4:modregbank(s20) = G_CONSTANT i20 128 + %6:ptrregbank(p0) = IMPLICIT_DEF + %7:fiforegbank(<32 x s32>) = IMPLICIT_DEF + 
%8:gprregbank(s32) = IMPLICIT_DEF + %9:ptrregbank(p0), %10:fiforegbank(<32 x s32>), %11:gprregbank(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.fifo.st.push.576.bfp16), %6:ptrregbank(p0), %1:vregbank(<64 x s8>), %2:gprregbank(<8 x s8>), %7:fiforegbank(<32 x s32>), %8:gprregbank(s32) + PseudoRET implicit $lr +... + +--- +name: test_fifo_st_push_544 # llvm.aie2p.fifo.st.push.544.bfp16 is expected to select to VST_PUSH_544 fed by a REG_SEQUENCE of both sources +tracksRegLiveness: true +legalized: true +regBankSelected: true +body: | + bb.1.entry: + liveins: $e0, $p0, $p1, $x0 + ; CHECK-LABEL: name: test_fifo_st_push_544 + ; CHECK: liveins: $e0, $p0, $p1, $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vec512 = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:el = COPY $e0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:mpfs = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:mstfifo = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:mr26_fifo_st = IMPLICIT_DEF + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:mbp2bp = REG_SEQUENCE [[COPY]], %subreg.sub_bfp16_x, [[COPY1]], %subreg.sub_bfp16_e + ; CHECK-NEXT: [[VST_PUSH_544_:%[0-9]+]]:mstfifo, [[VST_PUSH_544_1:%[0-9]+]]:mpfs, [[VST_PUSH_544_2:%[0-9]+]]:mr26_fifo_st = VST_PUSH_544 [[DEF1]], [[REG_SEQUENCE]], [[DEF]], [[DEF2]], implicit-def $srfifo_of + ; CHECK-NEXT: PseudoRET implicit $lr + %0:ptrregbank(p0) = COPY $p0 + %1:vregbank(<64 x s8>) = COPY $x0 + %2:gprregbank(<8 x s8>) = COPY $e0 + %3:ptrregbank(p0) = COPY $p1 + %4:modregbank(s20) = G_CONSTANT i20 128 + %6:ptrregbank(p0) = IMPLICIT_DEF + %7:fiforegbank(<32 x s32>) = IMPLICIT_DEF + %8:gprregbank(s32) = IMPLICIT_DEF + %9:ptrregbank(p0), %10:fiforegbank(<32 x s32>), %11:gprregbank(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.fifo.st.push.544.bfp16), %6:ptrregbank(p0), %1:vregbank(<64 x s8>), %2:gprregbank(<8 x s8>), %7:fiforegbank(<32 x s32>), %8:gprregbank(s32) + PseudoRET implicit $lr +... 
+ diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/regbankselect-fifo-insn.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/regbankselect-fifo-insn.mir index fefe24630e65..93b61c00990d 100644 --- a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/regbankselect-fifo-insn.mir +++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/regbankselect-fifo-insn.mir @@ -4,7 +4,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates # RUN: llc -mtriple aie2p -run-pass=regbankselect -regbankselect-greedy %s -verify-machineinstrs -o - | FileCheck --check-prefix=GREEDY %s # RUN: llc -mtriple aie2p -run-pass=regbankselect -regbankselect-fast %s -verify-machineinstrs -o - | FileCheck --check-prefix=FAST %s --- diff --git a/llvm/test/CodeGen/AIE/aie2p/ldst-fifo-stores.ll b/llvm/test/CodeGen/AIE/aie2p/ldst-fifo-stores.ll new file mode 100644 index 000000000000..a717eeb05134 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/ldst-fifo-stores.ll @@ -0,0 +1,641 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; +; This file is licensed under the Apache License v2.0 with LLVM Exceptions. +; See https://llvm.org/LICENSE.txt for license information. +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +; +; (c) Copyright 2025 Advanced Micro Devices, Inc. 
or its affiliates
+; RUN: llc < %s -verify-machineinstrs -mtriple=aie2p | FileCheck %s
+;
+; Instruction-selection coverage for the AIE2P store-FIFO intrinsics
+; (llvm.aie2p.fifo.st.push.* and llvm.aie2p.fifo.st.flush*).
+; Every test loads the destination pointer from %p and the <32 x i32> FIFO
+; state from %s, calls exactly one intrinsic, and stores the returned values
+; back. The i32 state word lives at byte offset 128 of %s (%pos1.i). The
+; "reset" variants pass a constant 0 for that word instead of loading it and
+; do not store the returned i32.
+
+%struct.v64bfp16ebs8 = type <{ <64 x i8>, <8 x i8> }>
+%struct.v64bfp16ebs16 = type <{ <64 x i8>, <8 x i8> }>
+
+; push.512 with the i32 state operand fixed to 0.
+; Function Attrs: mustprogress nounwind memory(write, argmem: readwrite, inaccessiblemem: none)
+define dso_local void @_Z18test_fifo_st_resetRPDv64_DB8_S0_R12fifo_state_t(ptr nocapture nonnull align 4 dereferenceable(4) %p, <64 x i8> noundef %v, ptr nocapture nonnull align 64 dereferenceable(256) %s) local_unnamed_addr #2 {
+; CHECK-LABEL: _Z18test_fifo_st_resetRPDv64_DB8_S0_R12fifo_state_t:
+; CHECK: .p2align 4
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: vlda sfl, [p1, #0]; nopx
+; CHECK-NEXT: lda p2, [p0, #0]
+; CHECK-NEXT: vlda sfh, [p1, #64]
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: mova r26, #0
+; CHECK-NEXT: vst.push.512 x0, [p2, sf, r26]; ret lr
+; CHECK-NEXT: nop // Delay Slot 5
+; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4
+; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3
+; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2
+; CHECK-NEXT: nop // Delay Slot 1
+entry:
+  %0 = load ptr, ptr %p, align 4, !tbaa !2
+  %1 = bitcast <64 x i8> %v to <16 x i32>
+  %2 = load <32 x i32>, ptr %s, align 64, !tbaa !6
+  %3 = tail call { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.push.512.bfp16(ptr %0, <16 x i32> %1, <32 x i32> %2, i32 0)
+  %4 = extractvalue { ptr, <32 x i32>, i32 } %3, 0
+  %5 = extractvalue { ptr, <32 x i32>, i32 } %3, 1
+  store <32 x i32> %5, ptr %s, align 128
+  store ptr %4, ptr %p, align 4
+  ret void
+}
+
+; push.512 with the i32 state operand loaded from and stored back to %s+128.
+; Function Attrs: mustprogress nounwind memory(write, argmem: readwrite, inaccessiblemem: none)
+define dso_local void @_Z17test_fifo_st_pushRPDv64_DB8_S0_R12fifo_state_t(ptr nocapture nonnull align 4 dereferenceable(4) %p, <64 x i8> noundef %v, ptr nocapture nonnull align 64 dereferenceable(256) %s) local_unnamed_addr #2 {
+; CHECK-LABEL: _Z17test_fifo_st_pushRPDv64_DB8_S0_R12fifo_state_t:
+; CHECK: .p2align 4
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: vlda sfl, [p1, #0]; nopb ; nopxm ; nops
+; CHECK-NEXT: lda p2, [p0, #0]; mov dj0, #128
+; CHECK-NEXT: lda r26, [p1, dj0]
+; CHECK-NEXT: vlda sfh, [p1, #64]
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: vst.push.512 x0, [p2, sf, r26]
+; CHECK-NEXT: ret lr
+; CHECK-NEXT: st r26, [p1, dj0] // Delay Slot 5
+; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4
+; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3
+; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2
+; CHECK-NEXT: nop // Delay Slot 1
+entry:
+  %pos1.i = getelementptr inbounds i8, ptr %s, i20 128
+  %0 = load ptr, ptr %p, align 4, !tbaa !2
+  %1 = bitcast <64 x i8> %v to <16 x i32>
+  %2 = load <32 x i32>, ptr %s, align 64, !tbaa !6
+  %3 = load i32, ptr %pos1.i, align 64, !tbaa !7
+  %4 = tail call { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.push.512.bfp16(ptr %0, <16 x i32> %1, <32 x i32> %2, i32 %3)
+  %5 = extractvalue { ptr, <32 x i32>, i32 } %4, 0
+  %6 = extractvalue { ptr, <32 x i32>, i32 } %4, 1
+  %7 = extractvalue { ptr, <32 x i32>, i32 } %4, 2
+  store <32 x i32> %6, ptr %s, align 128
+  store i32 %7, ptr %pos1.i, align 64
+  store ptr %5, ptr %p, align 4
+  ret void
+}
+
+; Plain flush (no addressing-mode operand).
+; Function Attrs: mustprogress nounwind memory(write, argmem: readwrite, inaccessiblemem: none)
+define dso_local void @_Z18test_fifo_st_flushRPDv64_DB8_R12fifo_state_t(ptr nocapture nonnull align 4 dereferenceable(4) %p, ptr nocapture nonnull align 64 dereferenceable(256) %s) local_unnamed_addr #2 {
+; CHECK-LABEL: _Z18test_fifo_st_flushRPDv64_DB8_R12fifo_state_t:
+; CHECK: .p2align 4
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: vlda sfl, [p1, #0]; nopb ; nopxm ; nops
+; CHECK-NEXT: lda p2, [p0, #0]; mov dj0, #128
+; CHECK-NEXT: lda r26, [p1, dj0]
+; CHECK-NEXT: vlda sfh, [p1, #64]
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: vst.flush.512 [p2, sf, r26]
+; CHECK-NEXT: ret lr
+; CHECK-NEXT: st r26, [p1, dj0] // Delay Slot 5
+; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4
+; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3
+; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2
+; CHECK-NEXT: nop // Delay Slot 1
+entry:
+  %pos1.i = getelementptr inbounds i8, ptr %s, i20 128
+  %0 = load ptr, ptr %p, align 4, !tbaa !2
+  %1 = load <32 x i32>, ptr %s, align 64, !tbaa !6
+  %2 = load i32, ptr %pos1.i, align 64, !tbaa !7
+  %3 = tail call { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.flush(ptr %0, <32 x i32> %1, i32 %2)
+  %4 = extractvalue { ptr, <32 x i32>, i32 } %3, 0
+  %5 = extractvalue { ptr, <32 x i32>, i32 } %3, 1
+  %6 = extractvalue { ptr, <32 x i32>, i32 } %3, 2
+  store <32 x i32> %5, ptr %s, align 128
+  store i32 %6, ptr %pos1.i, align 64
+  store ptr %4, ptr %p, align 4
+  ret void
+}
+
+; 1D flush: %off is truncated to i20 and passed as the extra operand.
+; Function Attrs: mustprogress nounwind memory(write, argmem: readwrite, inaccessiblemem: none)
+define dso_local void @_Z26test_fifo_st_flush_1d_byteRPDv64_DB8_R12fifo_state_ti(ptr nocapture nonnull align 4 dereferenceable(4) %p, ptr nocapture nonnull align 64 dereferenceable(256) %s, i32 noundef %off) local_unnamed_addr #2 {
+; CHECK-LABEL: _Z26test_fifo_st_flush_1d_byteRPDv64_DB8_R12fifo_state_ti:
+; CHECK: .p2align 4
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: vlda sfl, [p1, #0]; nopb ; nopxm
+; CHECK-NEXT: lda p2, [p0, #0]; mov dj0, #128
+; CHECK-NEXT: lda r26, [p1, dj0]
+; CHECK-NEXT: vlda sfh, [p1, #64]
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: mov m0, r0
+; CHECK-NEXT: vst.flush.512 [p2, sf, r26, m0]
+; CHECK-NEXT: ret lr
+; CHECK-NEXT: st r26, [p1, dj0] // Delay Slot 5
+; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4
+; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3
+; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2
+; CHECK-NEXT: nop // Delay Slot 1
+entry:
+  %pos1.i = getelementptr inbounds i8, ptr %s, i20 128
+  %0 = load ptr, ptr %p, align 4, !tbaa !2
+  %1 = load <32 x i32>, ptr %s, align 64, !tbaa !6
+  %2 = load i32, ptr %pos1.i, align 64, !tbaa !7
+  %3 = trunc i32 %off to i20
+  %4 = tail call { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.flush.1d(ptr %0, <32 x i32> %1, i32 %2, i20 %3)
+  %5 = extractvalue { ptr, <32 x i32>, i32 } %4, 0
+  %6 = extractvalue { ptr, <32 x i32>, i32 } %4, 1
+  %7 = extractvalue { ptr, <32 x i32>, i32 } %4, 2
+  store <32 x i32> %6, ptr %s, align 128
+  store i32 %7, ptr %pos1.i, align 64
+  store ptr %5, ptr %p, align 4
+  ret void
+}
+
+; 2D conv flush (llvm.aie2p.fifo.st.flush.2d.conv); the returned i20 count is
+; zero-extended and written back to %count1.
+; Function Attrs: mustprogress nounwind memory(write, argmem: readwrite, inaccessiblemem: none)
+define dso_local void @_Z31test_fifo_st_flush_conv_2d_byteRPDv64_DB8_R12fifo_state_tiiRii(ptr nocapture nonnull align 4 dereferenceable(4) %p, ptr nocapture nonnull align 64 dereferenceable(256) %s, i32 noundef %off, i32 noundef %size1, ptr nocapture nonnull align 4 dereferenceable(4) %count1, i32 noundef %inc1) local_unnamed_addr #2 {
+; CHECK-LABEL: _Z31test_fifo_st_flush_conv_2d_byteRPDv64_DB8_R12fifo_state_tiiRii:
+; CHECK: .p2align 4
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: lda dc0, [p2, #0]; nopb ; nopxm
+; CHECK-NEXT: lda p2, [p0, #0]; mov dj1, #128
+; CHECK-NEXT: lda r26, [p1, dj1]
+; CHECK-NEXT: nop
+; CHECK-NEXT: vlda sfl, [p1, #0]
+; CHECK-NEXT: vlda sfh, [p1, #64]; mov m0, r0
+; CHECK-NEXT: mov p3, p2
+; CHECK-NEXT: mov dn0, r1
+; CHECK-NEXT: mov dj0, r2
+; CHECK-NEXT: vst.flush.512.conv.2d [p2, sf, r26, d0]
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: st dc0, [p3, #0]; ret lr
+; CHECK-NEXT: st r26, [p1, dj1] // Delay Slot 5
+; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4
+; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3
+; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2
+; CHECK-NEXT: nop // Delay Slot 1
+entry:
+  %pos1.i = getelementptr inbounds i8, ptr %s, i20 128
+  %0 = load ptr, ptr %p, align 4, !tbaa !2
+  %1 = load <32 x i32>, ptr %s, align 64, !tbaa !6
+  %2 = load i32, ptr %pos1.i, align 64, !tbaa !7
+  %3 = trunc i32 %off to i20
+  %4 = trunc i32 %size1 to i20
+  %5 = load i32, ptr %count1, align 4, !tbaa !7
+  %6 = trunc i32 %5 to i20
+  %7 = trunc i32 %inc1 to i20
+  %8 = tail call { ptr, <32 x i32>, i32, i20 } @llvm.aie2p.fifo.st.flush.2d.conv(ptr %0, <32 x i32> %1, i32 %2, i20 %3, i20 %4, i20 %6, i20 %7)
+  %9 = extractvalue { ptr, <32 x i32>, i32, i20 } %8, 0
+  %10 = extractvalue { ptr, <32 x i32>, i32, i20 } %8, 1
+  %11 = extractvalue { ptr, <32 x i32>, i32, i20 } %8, 2
+  %12 = extractvalue { ptr, <32 x i32>, i32, i20 } %8, 3
+  %13 = zext i20 %12 to i32
+  store i32 %13, ptr %count1, align 4
+  store <32 x i32> %10, ptr %s, align 128
+  store i32 %11, ptr %pos1.i, align 64
+  store ptr %9, ptr %p, align 4
+  ret void
+}
+
+; 3D flush (llvm.aie2p.fifo.st.flush.3d); both returned i20 counts are
+; zero-extended and written back to %count1 / %count2.
+; Function Attrs: mustprogress nounwind memory(write, argmem: readwrite, inaccessiblemem: none)
+define dso_local void @_Z26test_fifo_st_flush_3d_byteRPDv64_DB8_R12fifo_state_tiiRiiiS5_i(ptr nocapture nonnull align 4 dereferenceable(4) %p, ptr nocapture nonnull align 64 dereferenceable(256) %s, i32 noundef %off, i32 noundef %size1, ptr nocapture nonnull align 4 dereferenceable(4) %count1, i32 noundef %inc1, i32 noundef %size2, ptr nocapture nonnull align 4 dereferenceable(4) %count2, i32 noundef %inc2) local_unnamed_addr #2 {
+; CHECK-LABEL: _Z26test_fifo_st_flush_3d_byteRPDv64_DB8_R12fifo_state_tiiRiiiS5_i:
+; CHECK: .p2align 4
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: vlda sfl, [p1, #0]; nopb ; nops ; nopxm ; nopv
+; CHECK-NEXT: lda dc0, [p2, #0]; nopx
+; CHECK-NEXT: lda dc4, [p3, #0]
+; CHECK-NEXT: lda p2, [p0, #0]; mov dj1, #128
+; CHECK-NEXT: lda r26, [p1, dj1]
+; CHECK-NEXT: vlda sfh, [p1, #64]; mov m0, r0
+; CHECK-NEXT: mov dn0, r1
+; CHECK-NEXT: mov dj0, r2
+; CHECK-NEXT: mov p4, p2
+; CHECK-NEXT: mov dn4, r3
+; CHECK-NEXT: mov dj4, r4
+; CHECK-NEXT: vst.flush.512.3d [p2, sf, r26, d0]
+; CHECK-NEXT: nop
+; CHECK-NEXT: st dc0, [p4, #0]
+; CHECK-NEXT: st dc4, [p3, #0]; ret lr
+; CHECK-NEXT: st r26, [p1, dj1] // Delay Slot 5
+; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4
+; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3
+; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2
+; CHECK-NEXT: nop // Delay Slot 1
+entry:
+  %pos1.i = getelementptr inbounds i8, ptr %s, i20 128
+  %0 = load ptr, ptr %p, align 4, !tbaa !2
+  %1 = load <32 x i32>, ptr %s, align 64, !tbaa !6
+  %2 = load i32, ptr %pos1.i, align 64, !tbaa !7
+  %3 = trunc i32 %off to i20
+  %4 = trunc i32 %size1 to i20
+  %5 = load i32, ptr %count1, align 4, !tbaa !7
+  %6 = trunc i32 %5 to i20
+  %7 = trunc i32 %inc1 to i20
+  %8 = trunc i32 %size2 to i20
+  %9 = load i32, ptr %count2, align 4, !tbaa !7
+  %10 = trunc i32 %9 to i20
+  %11 = trunc i32 %inc2 to i20
+  %12 = tail call { ptr, <32 x i32>, i32, i20, i20 } @llvm.aie2p.fifo.st.flush.3d(ptr %0, <32 x i32> %1, i32 %2, i20 %3, i20 %4, i20 %6, i20 %7, i20 %8, i20 %10, i20 %11)
+  %13 = extractvalue { ptr, <32 x i32>, i32, i20, i20 } %12, 0
+  %14 = extractvalue { ptr, <32 x i32>, i32, i20, i20 } %12, 1
+  %15 = extractvalue { ptr, <32 x i32>, i32, i20, i20 } %12, 2
+  %16 = extractvalue { ptr, <32 x i32>, i32, i20, i20 } %12, 3
+  %17 = zext i20 %16 to i32
+  %18 = extractvalue { ptr, <32 x i32>, i32, i20, i20 } %12, 4
+  %19 = zext i20 %18 to i32
+  store i32 %17, ptr %count1, align 4
+  store i32 %19, ptr %count2, align 4
+  store <32 x i32> %14, ptr %s, align 128
+  store i32 %15, ptr %pos1.i, align 64
+  store ptr %13, ptr %p, align 4
+  ret void
+}
+
+; Plain conv flush (llvm.aie2p.fifo.st.flush.conv).
+; Function Attrs: mustprogress nounwind memory(write, argmem: readwrite, inaccessiblemem: none)
+define dso_local void @_Z23test_fifo_st_flush_convRPDv64_DB8_R12fifo_state_t(ptr nocapture nonnull align 4 dereferenceable(4) %p, ptr nocapture nonnull align 64 dereferenceable(256) %s) local_unnamed_addr #2 {
+; CHECK-LABEL: _Z23test_fifo_st_flush_convRPDv64_DB8_R12fifo_state_t:
+; CHECK: .p2align 4
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: lda p2, [p0, #0]; nopb ; nopx ; mov dj0, #128; nops
+; CHECK-NEXT: lda r26, [p1, dj0]
+; CHECK-NEXT: nop
+; CHECK-NEXT: vlda sfl, [p1, #0]
+; CHECK-NEXT: vlda sfh, [p1, #64]
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: vst.flush.512.conv [p2, sf, r26]
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: ret lr
+; CHECK-NEXT: st r26, [p1, dj0] // Delay Slot 5
+; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4
+; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3
+; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2
+; CHECK-NEXT: nop // Delay Slot 1
+entry:
+  %pos1.i = getelementptr inbounds i8, ptr %s, i20 128
+  %0 = load ptr, ptr %p, align 4, !tbaa !2
+  %1 = load <32 x i32>, ptr %s, align 64, !tbaa !6
+  %2 = load i32, ptr %pos1.i, align 64, !tbaa !7
+  %3 = tail call { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.flush.conv(ptr %0, <32 x i32> %1, i32 %2)
+  %4 = extractvalue { ptr, <32 x i32>, i32 } %3, 0
+  %5 = extractvalue { ptr, <32 x i32>, i32 } %3, 1
+  %6 = extractvalue { ptr, <32 x i32>, i32 } %3, 2
+  store <32 x i32> %5, ptr %s, align 128
+  store i32 %6, ptr %pos1.i, align 64
+  store ptr %4, ptr %p, align 4
+  ret void
+}
+
+; 1D conv flush (llvm.aie2p.fifo.st.flush.1d.conv).
+; Function Attrs: mustprogress nounwind memory(write, argmem: readwrite, inaccessiblemem: none)
+define dso_local void @_Z31test_fifo_st_flush_conv_1d_byteRPDv64_DB8_R12fifo_state_ti(ptr nocapture nonnull align 4 dereferenceable(4) %p, ptr nocapture nonnull align 64 dereferenceable(256) %s, i32 noundef %off) local_unnamed_addr #2 {
+; CHECK-LABEL: _Z31test_fifo_st_flush_conv_1d_byteRPDv64_DB8_R12fifo_state_ti:
+; CHECK: .p2align 4
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: lda p2, [p0, #0]; nopb ; nopx ; mov dj0, #128
+; CHECK-NEXT: lda r26, [p1, dj0]
+; CHECK-NEXT: nop
+; CHECK-NEXT: vlda sfl, [p1, #0]
+; CHECK-NEXT: vlda sfh, [p1, #64]
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: mov m0, r0
+; CHECK-NEXT: vst.flush.512.conv [p2, sf, r26, m0]
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: ret lr
+; CHECK-NEXT: st r26, [p1, dj0] // Delay Slot 5
+; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4
+; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3
+; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2
+; CHECK-NEXT: nop // Delay Slot 1
+entry:
+  %pos1.i = getelementptr inbounds i8, ptr %s, i20 128
+  %0 = load ptr, ptr %p, align 4, !tbaa !2
+  %1 = load <32 x i32>, ptr %s, align 64, !tbaa !6
+  %2 = load i32, ptr %pos1.i, align 64, !tbaa !7
+  %3 = trunc i32 %off to i20
+  %4 = tail call { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.flush.1d.conv(ptr %0, <32 x i32> %1, i32 %2, i20 %3)
+  %5 = extractvalue { ptr, <32 x i32>, i32 } %4, 0
+  %6 = extractvalue { ptr, <32 x i32>, i32 } %4, 1
+  %7 = extractvalue { ptr, <32 x i32>, i32 } %4, 2
+  store <32 x i32> %6, ptr %s, align 128
+  store i32 %7, ptr %pos1.i, align 64
+  store ptr %5, ptr %p, align 4
+  ret void
+}
+
+; 2D (non-conv) flush (llvm.aie2p.fifo.st.flush.2d); the returned i20 count is
+; zero-extended and written back to %count1.
+; Function Attrs: mustprogress nounwind memory(write, argmem: readwrite, inaccessiblemem: none)
+define dso_local void @_Z26test_fifo_st_flush_2d_byteRPDv64_DB8_R12fifo_state_tiiRii(ptr nocapture nonnull align 4 dereferenceable(4) %p, ptr nocapture nonnull align 64 dereferenceable(256) %s, i32 noundef %off, i32 noundef %size1, ptr nocapture nonnull align 4 dereferenceable(4) %count1, i32 noundef %inc1) local_unnamed_addr #2 {
+; CHECK-LABEL: _Z26test_fifo_st_flush_2d_byteRPDv64_DB8_R12fifo_state_tiiRii:
+; CHECK: .p2align 4
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: vlda sfl, [p1, #0]; nopb ; nopxm ; nops
+; CHECK-NEXT: lda dc0, [p2, #0]
+; CHECK-NEXT: lda p2, [p0, #0]; mov dj1, #128
+; CHECK-NEXT: lda r26, [p1, dj1]
+; CHECK-NEXT: vlda sfh, [p1, #64]
+; CHECK-NEXT: nop
+; CHECK-NEXT: mov m0, r0
+; CHECK-NEXT: mov p3, p2
+; CHECK-NEXT: mov dn0, r1
+; CHECK-NEXT: mov dj0, r2
+; CHECK-NEXT: vst.flush.512.2d [p2, sf, r26, d0]
+; CHECK-NEXT: nop
+; CHECK-NEXT: st dc0, [p3, #0]; ret lr
+; CHECK-NEXT: st r26, [p1, dj1] // Delay Slot 5
+; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4
+; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3
+; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2
+; CHECK-NEXT: nop // Delay Slot 1
+entry:
+  %pos1.i = getelementptr inbounds i8, ptr %s, i20 128
+  %0 = load ptr, ptr %p, align 4, !tbaa !2
+  %1 = load <32 x i32>, ptr %s, align 64, !tbaa !6
+  %2 = load i32, ptr %pos1.i, align 64, !tbaa !7
+  %3 = trunc i32 %off to i20
+  %4 = trunc i32 %size1 to i20
+  %5 = load i32, ptr %count1, align 4, !tbaa !7
+  %6 = trunc i32 %5 to i20
+  %7 = trunc i32 %inc1 to i20
+  %8 = tail call { ptr, <32 x i32>, i32, i20 } @llvm.aie2p.fifo.st.flush.2d(ptr %0, <32 x i32> %1, i32 %2, i20 %3, i20 %4, i20 %6, i20 %7)
+  %9 = extractvalue { ptr, <32 x i32>, i32, i20 } %8, 0
+  %10 = extractvalue { ptr, <32 x i32>, i32, i20 } %8, 1
+  %11 = extractvalue { ptr, <32 x i32>, i32, i20 } %8, 2
+  %12 = extractvalue { ptr, <32 x i32>, i32, i20 } %8, 3
+  %13 = zext i20 %12 to i32
+  store i32 %13, ptr %count1, align 4
+  store <32 x i32> %10, ptr %s, align 128
+  store i32 %11, ptr %pos1.i, align 64
+  store ptr %9, ptr %p, align 4
+  ret void
+}
+
+; 3D conv flush (llvm.aie2p.fifo.st.flush.3d.conv); both returned i20 counts
+; are zero-extended and written back to %count1 / %count2.
+; Function Attrs: mustprogress nounwind memory(write, argmem: readwrite, inaccessiblemem: none)
+define dso_local void @_Z31test_fifo_st_flush_conv_3d_byteRPDv64_DB8_R12fifo_state_tiiRiiiS5_i(ptr nocapture nonnull align 4 dereferenceable(4) %p, ptr nocapture nonnull align 64 dereferenceable(256) %s, i32 noundef %off, i32 noundef %size1, ptr nocapture nonnull align 4 dereferenceable(4) %count1, i32 noundef %inc1, i32 noundef %size2, ptr nocapture nonnull align 4 dereferenceable(4) %count2, i32 noundef %inc2) local_unnamed_addr #2 {
+; CHECK-LABEL: _Z31test_fifo_st_flush_conv_3d_byteRPDv64_DB8_R12fifo_state_tiiRiiiS5_i:
+; CHECK: .p2align 4
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: lda dc0, [p2, #0]; nopb ; nops ; nopxm ; nopv
+; CHECK-NEXT: lda dc4, [p3, #0]
+; CHECK-NEXT: lda p2, [p0, #0]; mov dj1, #128
+; CHECK-NEXT: lda r26, [p1, dj1]
+; CHECK-NEXT: mov m0, r0
+; CHECK-NEXT: vlda sfl, [p1, #0]; mov dn0, r1
+; CHECK-NEXT: vlda sfh, [p1, #64]; mov dj0, r2
+; CHECK-NEXT: mov p4, p2
+; CHECK-NEXT: mov dn4, r3
+; CHECK-NEXT: mov dj4, r4
+; CHECK-NEXT: vst.flush.512.conv.3d [p2, sf, r26, d0]
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: st dc0, [p4, #0]
+; CHECK-NEXT: st dc4, [p3, #0]; ret lr
+; CHECK-NEXT: st r26, [p1, dj1] // Delay Slot 5
+; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4
+; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3
+; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2
+; CHECK-NEXT: nop // Delay Slot 1
+entry:
+  %pos1.i = getelementptr inbounds i8, ptr %s, i20 128
+  %0 = load ptr, ptr %p, align 4, !tbaa !2
+  %1 = load <32 x i32>, ptr %s, align 64, !tbaa !6
+  %2 = load i32, ptr %pos1.i, align 64, !tbaa !7
+  %3 = trunc i32 %off to i20
+  %4 = trunc i32 %size1 to i20
+  %5 = load i32, ptr %count1, align 4, !tbaa !7
+  %6 = trunc i32 %5 to i20
+  %7 = trunc i32 %inc1 to i20
+  %8 = trunc i32 %size2 to i20
+  %9 = load i32, ptr %count2, align 4, !tbaa !7
+  %10 = trunc i32 %9 to i20
+  %11 = trunc i32 %inc2 to i20
+  %12 = tail call { ptr, <32 x i32>, i32, i20, i20 } @llvm.aie2p.fifo.st.flush.3d.conv(ptr %0, <32 x i32> %1, i32 %2, i20 %3, i20 %4, i20 %6, i20 %7, i20 %8, i20 %10, i20 %11)
+  %13 = extractvalue { ptr, <32 x i32>, i32, i20, i20 } %12, 0
+  %14 = extractvalue { ptr, <32 x i32>, i32, i20, i20 } %12, 1
+  %15 = extractvalue { ptr, <32 x i32>, i32, i20, i20 } %12, 2
+  %16 = extractvalue { ptr, <32 x i32>, i32, i20, i20 } %12, 3
+  %17 = zext i20 %16 to i32
+  %18 = extractvalue { ptr, <32 x i32>, i32, i20, i20 } %12, 4
+  %19 = zext i20 %18 to i32
+  store i32 %17, ptr %count1, align 4
+  store i32 %19, ptr %count2, align 4
+  store <32 x i32> %14, ptr %s, align 128
+  store i32 %15, ptr %pos1.i, align 64
+  store ptr %13, ptr %p, align 4
+  ret void
+}
+
+
+; push.544 of a %struct.v64bfp16ebs16 value with the i32 state operand fixed
+; to 0.
+; Function Attrs: mustprogress nounwind memory(write, argmem: readwrite, inaccessiblemem: none)
+define dso_local void @test_fifo_st_reset_v64bfp16ebs16(ptr nocapture nonnull align 4 dereferenceable(4) %p, %struct.v64bfp16ebs16 %v.coerce, ptr nocapture nonnull align 64 dereferenceable(256) %s) local_unnamed_addr #2 {
+; CHECK-LABEL: test_fifo_st_reset_v64bfp16ebs16:
+; CHECK: .p2align 4
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: vlda sfl, [p1, #0]; nopx
+; CHECK-NEXT: lda p2, [p0, #0]
+; CHECK-NEXT: vlda sfh, [p1, #64]
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: mova r26, #0
+; CHECK-NEXT: vst.push.544 ex0, [p2, sf, r26]; ret lr
+; CHECK-NEXT: nop // Delay Slot 5
+; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4
+; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3
+; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2
+; CHECK-NEXT: nop // Delay Slot 1
+entry:
+  %v.coerce.fca.0.extract.i = extractvalue %struct.v64bfp16ebs16 %v.coerce, 0
+  %v.coerce.fca.1.extract.i = extractvalue %struct.v64bfp16ebs16 %v.coerce, 1
+  %0 = load ptr, ptr %p, align 4, !tbaa !2
+  %1 = load <32 x i32>, ptr %s, align 64, !tbaa !6
+  %2 = tail call { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.push.544.bfp16(ptr %0, <64 x i8> %v.coerce.fca.0.extract.i, <8 x i8> %v.coerce.fca.1.extract.i, <32 x i32> %1, i32 0)
+  %3 = extractvalue { ptr, <32 x i32>, i32 } %2, 0
+  %4 = extractvalue { ptr, <32 x i32>, i32 } %2, 1
+  store <32 x i32> %4, ptr %s, align 128
+  store ptr %3, ptr %p, align 4
+  ret void
+}
+
+; push.544 of a %struct.v64bfp16ebs16 value with the i32 state operand loaded
+; from and stored back to %s+128.
+; Function Attrs: mustprogress nounwind memory(write, argmem: readwrite, inaccessiblemem: none)
+define dso_local void @test_fifo_st_push_v64bfp16ebs16(ptr nocapture nonnull align 4 dereferenceable(4) %p, %struct.v64bfp16ebs16 %v.coerce, ptr nocapture nonnull align 64 dereferenceable(256) %s) local_unnamed_addr #2 {
+; CHECK-LABEL: test_fifo_st_push_v64bfp16ebs16:
+; CHECK: .p2align 4
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: vlda sfl, [p1, #0]; nopb ; nopxm ; nops
+; CHECK-NEXT: lda p2, [p0, #0]; mov dj0, #128
+; CHECK-NEXT: lda r26, [p1, dj0]
+; CHECK-NEXT: vlda sfh, [p1, #64]
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: vst.push.544 ex0, [p2, sf, r26]
+; CHECK-NEXT: ret lr
+; CHECK-NEXT: st r26, [p1, dj0] // Delay Slot 5
+; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4
+; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3
+; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2
+; CHECK-NEXT: nop // Delay Slot 1
+entry:
+  %v.coerce.fca.0.extract.i = extractvalue %struct.v64bfp16ebs16 %v.coerce, 0
+  %v.coerce.fca.1.extract.i = extractvalue %struct.v64bfp16ebs16 %v.coerce, 1
+  %pos1.i = getelementptr inbounds i8, ptr %s, i20 128
+  %0 = load ptr, ptr %p, align 4, !tbaa !2
+  %1 = load <32 x i32>, ptr %s, align 64, !tbaa !6
+  %2 = load i32, ptr %pos1.i, align 64, !tbaa !7
+  %3 = tail call { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.push.544.bfp16(ptr %0, <64 x i8> %v.coerce.fca.0.extract.i, <8 x i8> %v.coerce.fca.1.extract.i, <32 x i32> %1, i32 %2)
+  %4 = extractvalue { ptr, <32 x i32>, i32 } %3, 0
+  %5 = extractvalue { ptr, <32 x i32>, i32 } %3, 1
+  %6 = extractvalue { ptr, <32 x i32>, i32 } %3, 2
+  store <32 x i32> %5, ptr %s, align 128
+  store i32 %6, ptr %pos1.i, align 64
+  store ptr %4, ptr %p, align 4
+  ret void
+}
+
+; push.576 of a %struct.v64bfp16ebs8 value with the i32 state operand fixed
+; to 0.
+; Function Attrs: mustprogress nounwind memory(write, argmem: readwrite, inaccessiblemem: none)
+define dso_local void @test_fifo_st_reset_v64bfp16ebs8(ptr nocapture nonnull align 4 dereferenceable(4) %p, %struct.v64bfp16ebs8 %v.coerce, ptr nocapture nonnull align 64 dereferenceable(256) %s) local_unnamed_addr #2 {
+; CHECK-LABEL: test_fifo_st_reset_v64bfp16ebs8:
+; CHECK: .p2align 4
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: vlda sfl, [p1, #0]; nopx
+; CHECK-NEXT: lda p2, [p0, #0]
+; CHECK-NEXT: vlda sfh, [p1, #64]
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: mova r26, #0
+; CHECK-NEXT: vst.push.576 ex0, [p2, sf, r26]; ret lr
+; CHECK-NEXT: nop // Delay Slot 5
+; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4
+; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3
+; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2
+; CHECK-NEXT: nop // Delay Slot 1
+entry:
+  %v.coerce.fca.0.extract.i = extractvalue %struct.v64bfp16ebs8 %v.coerce, 0
+  %v.coerce.fca.1.extract.i = extractvalue %struct.v64bfp16ebs8 %v.coerce, 1
+  %0 = load ptr, ptr %p, align 4, !tbaa !2
+  %1 = load <32 x i32>, ptr %s, align 64, !tbaa !6
+  %2 = tail call { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.push.576.bfp16(ptr %0, <64 x i8> %v.coerce.fca.0.extract.i, <8 x i8> %v.coerce.fca.1.extract.i, <32 x i32> %1, i32 0)
+  %3 = extractvalue { ptr, <32 x i32>, i32 } %2, 0
+  %4 = extractvalue { ptr, <32 x i32>, i32 } %2, 1
+  store <32 x i32> %4, ptr %s, align 128
+  store ptr %3, ptr %p, align 4
+  ret void
+}
+
+; push.576 of a %struct.v64bfp16ebs8 value with the i32 state operand loaded
+; from and stored back to %s+128.
+; Function Attrs: mustprogress nounwind memory(write, argmem: readwrite, inaccessiblemem: none)
+define dso_local void @test_fifo_st_push_v64bfp16ebs8(ptr nocapture nonnull align 4 dereferenceable(4) %p, %struct.v64bfp16ebs8 %v.coerce, ptr nocapture nonnull align 64 dereferenceable(256) %s) local_unnamed_addr #2 {
+; CHECK-LABEL: test_fifo_st_push_v64bfp16ebs8:
+; CHECK: .p2align 4
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: vlda sfl, [p1, #0]; nopb ; nopxm ; nops
+; CHECK-NEXT: lda p2, [p0, #0]; mov dj0, #128
+; CHECK-NEXT: lda r26, [p1, dj0]
+; CHECK-NEXT: vlda sfh, [p1, #64]
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: vst.push.576 ex0, [p2, sf, r26]
+; CHECK-NEXT: ret lr
+; CHECK-NEXT: st r26, [p1, dj0] // Delay Slot 5
+; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4
+; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3
+; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2
+; CHECK-NEXT: nop // Delay Slot 1
+entry:
+  %v.coerce.fca.0.extract.i = extractvalue %struct.v64bfp16ebs8 %v.coerce, 0
+  %v.coerce.fca.1.extract.i = extractvalue %struct.v64bfp16ebs8 %v.coerce, 1
+  %pos1.i = getelementptr inbounds i8, ptr %s, i20 128
+  %0 = load ptr, ptr %p, align 4, !tbaa !2
+  %1 = load <32 x i32>, ptr %s, align 64, !tbaa !6
+  %2 = load i32, ptr %pos1.i, align 64, !tbaa !7
+  %3 = tail call { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.push.576.bfp16(ptr %0, <64 x i8> %v.coerce.fca.0.extract.i, <8 x i8> %v.coerce.fca.1.extract.i, <32 x i32> %1, i32 %2)
+  %4 = extractvalue { ptr, <32 x i32>, i32 } %3, 0
+  %5 = extractvalue { ptr, <32 x i32>, i32 } %3, 1
+  %6 = extractvalue { ptr, <32 x i32>, i32 } %3, 2
+  store <32 x i32> %5, ptr %s, align 128
+  store i32 %6, ptr %pos1.i, align 64
+  store ptr %4, ptr %p, align 4
+  ret void
+}
+
+; Function Attrs: nofree nounwind memory(inaccessiblemem: read)
+declare <32 x i8> @llvm.aie2p.pack.I512.I8.I16(<32 x i16>, i32) #3
+
+; Function Attrs: nofree nosync nounwind memory(none)
+declare <32 x i16> @llvm.aie2p.unpack.I512.I16.I8(<32 x i8>, i32) #4
+
+; Function Attrs: nofree nounwind memory(inaccessiblemem: read)
+declare <32 x i8> @llvm.aie2p.pack.I512.I4.I8(<64 x i8>, i32) #3
+
+; Function Attrs: nofree nosync nounwind memory(none)
+declare <64 x i8> @llvm.aie2p.unpack.I512.I8.I4(<32 x i8>, i32) #4
+
+; Function Attrs: nofree nounwind memory(inaccessiblemem: read)
+declare <64 x i8> @llvm.aie2p.pack.I1024.I8.I16(<64 x i16>, i32) #3
+
+; Function Attrs: nofree nosync nounwind memory(none)
+declare <64 x i16> @llvm.aie2p.unpack.I1024.I16.I8(<64 x i8>, i32) #4
+
+; Function Attrs: nofree nounwind memory(inaccessiblemem: read)
+declare <64 x i8> @llvm.aie2p.pack.I1024.I4.I8(<128 x i8>, i32) #3
+
+; Function Attrs: nofree nosync nounwind memory(none)
+declare <128 x i8> @llvm.aie2p.unpack.I1024.I8.I4(<64 x i8>, i32) #4
+
+; Function Attrs: nounwind memory(argmem: write)
+declare { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.push.512.bfp16(ptr, <16 x i32>, <32 x i32>, i32) #5
+
+; Function Attrs: nounwind memory(argmem: write)
+declare { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.flush(ptr, <32 x i32>, i32) #5
+
+; Function Attrs: nounwind memory(argmem: write)
+declare { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.flush.1d(ptr, <32 x i32>, i32, i20) #5
+
+; Function Attrs: nounwind memory(argmem: write)
+declare { ptr, <32 x i32>, i32, i20 } @llvm.aie2p.fifo.st.flush.2d.conv(ptr, <32 x i32>, i32, i20, i20, i20, i20) #5
+
+; Function Attrs: nounwind memory(argmem: write)
+declare { ptr, <32 x i32>, i32, i20, i20 } @llvm.aie2p.fifo.st.flush.3d(ptr, <32 x i32>, i32, i20, i20, i20, i20, i20, i20, i20) #5
+
+; Function Attrs: nounwind memory(argmem: write)
+declare { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.flush.conv(ptr, <32 x i32>, i32) #5
+
+; Function Attrs: nounwind memory(argmem: write)
+declare { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.flush.1d.conv(ptr, <32 x i32>, i32, i20) #5
+
+; Function Attrs: nounwind memory(argmem: write)
+declare { ptr, <32 x i32>, i32, i20 } @llvm.aie2p.fifo.st.flush.2d(ptr, <32 x i32>, i32, i20, i20, i20, i20) #5
+
+; Function Attrs: nounwind memory(argmem: write)
+declare { ptr, <32 x i32>, i32, i20, i20 } @llvm.aie2p.fifo.st.flush.3d.conv(ptr, <32 x i32>, i32, i20, i20, i20, i20, i20, i20, i20) #5
+
+; Function Attrs: nounwind memory(argmem: write)
+declare { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.push.576.bfp16(ptr, <64 x i8>, <8 x i8>, <32 x i32>, i32) #5
+
+; Function Attrs: nounwind memory(argmem: write)
+declare { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.push.544.bfp16(ptr, <64 x i8>, <8 x i8>, <32 x i32>, i32) #5
+
+
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 19.0.0git (git@gitenterprise.xilinx.com:XRLabs/llvm-aie.git 7712cc9eca3c28aba4de6f89a2124b1130c2ec6d)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"any pointer", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C++ TBAA"}
+!6 = !{!4, !4, i64 0}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"int", !4, i64 0}