From 6840966c904c85a64477376f6a46081424a4d460 Mon Sep 17 00:00:00 2001
From: Hamza Khallouki <hamza.khallouki@amd.com>
Date: Thu, 23 Jan 2025 13:55:48 +0000
Subject: [PATCH] [AIE2P] Select fifo store intrinsics

---
 .../AIE/aie2p/AIE2PInstructionSelector.cpp    | 163 +++++
 .../GlobalIsel/inst-select-fifo-stores.mir    | 391 +++++++++++
 .../GlobalIsel/regbankselect-fifo-insn.mir    |   2 +-
 .../CodeGen/AIE/aie2p/ldst-fifo-stores.ll     | 641 ++++++++++++++++++
 4 files changed, 1196 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-fifo-stores.mir
 create mode 100644 llvm/test/CodeGen/AIE/aie2p/ldst-fifo-stores.ll

diff --git a/llvm/lib/Target/AIE/aie2p/AIE2PInstructionSelector.cpp b/llvm/lib/Target/AIE/aie2p/AIE2PInstructionSelector.cpp
index 6f9c10215b15..983bec98a5d3 100644
--- a/llvm/lib/Target/AIE/aie2p/AIE2PInstructionSelector.cpp
+++ b/llvm/lib/Target/AIE/aie2p/AIE2PInstructionSelector.cpp
@@ -70,6 +70,7 @@ class AIE2PInstructionSelector : public AIEBaseInstructionSelector {
   bool selectG_CONCAT_VECTORS(MachineInstr &I, MachineRegisterInfo &MRI);
   bool selectCascadeStreamInsn(MachineInstr &I, MachineRegisterInfo &MRI,
                                bool isWrite);
+  bool selectVST_FIFO(MachineInstr &I, MachineRegisterInfo &MRI);
 
   static const char *getName() { return DEBUG_TYPE; }
 
@@ -363,6 +364,18 @@ bool AIE2PInstructionSelector::select(MachineInstr &I) {
     case Intrinsic::aie2p_v64accfloat_to_v64bfp16ebs16:
     case Intrinsic::aie2p_v64bfp16ebs8_to_v64bfp16ebs16:
       return selectVCONVbfp16(I, MRI);
+    case Intrinsic::aie2p_fifo_st_push_576_bfp16:
+    case Intrinsic::aie2p_fifo_st_push_544_bfp16:
+    case Intrinsic::aie2p_fifo_st_push_512_bfp16:
+    case Intrinsic::aie2p_fifo_st_flush:
+    case Intrinsic::aie2p_fifo_st_flush_conv:
+    case Intrinsic::aie2p_fifo_st_flush_1d:
+    case Intrinsic::aie2p_fifo_st_flush_1d_conv:
+    case Intrinsic::aie2p_fifo_st_flush_2d:
+    case Intrinsic::aie2p_fifo_st_flush_2d_conv:
+    case Intrinsic::aie2p_fifo_st_flush_3d:
+    case Intrinsic::aie2p_fifo_st_flush_3d_conv:
+      return selectVST_FIFO(I, MRI);
     default:
       return selectImpl(I, *CoverageInfo);
     }
@@ -3994,6 +4007,156 @@ bool AIE2PInstructionSelector ::selectVSHUFFLE_BFP(MachineInstr &I,
   return constrainSelectedInstRegOperands(*MI, TII, TRI, RBI);
 }
 
+/// Return the AIE2P opcode that implements fifo-store intrinsic \p I.
+static unsigned int getStoreFifoOpcode(const MachineInstr &I) {
+  switch (cast<GIntrinsic>(I).getIntrinsicID()) {
+  case Intrinsic::aie2p_fifo_st_flush:
+    return AIE2P::VST_FLUSH_512_normal_flush;
+  case Intrinsic::aie2p_fifo_st_flush_1d:
+    return AIE2P::VST_FLUSH_512_fifo_1d_flush;
+  case Intrinsic::aie2p_fifo_st_flush_2d:
+    return AIE2P::VST_FLUSH_512_2D;
+  case Intrinsic::aie2p_fifo_st_flush_3d:
+    return AIE2P::VST_FLUSH_512_3D;
+  case Intrinsic::aie2p_fifo_st_flush_conv:
+    return AIE2P::VST_FLUSH_512_CONV_normal_flush;
+  case Intrinsic::aie2p_fifo_st_flush_1d_conv:
+    return AIE2P::VST_FLUSH_512_CONV_fifo_1d_flush;
+  case Intrinsic::aie2p_fifo_st_flush_2d_conv:
+    return AIE2P::VST_FLUSH_512_CONV_2D;
+  case Intrinsic::aie2p_fifo_st_flush_3d_conv:
+    return AIE2P::VST_FLUSH_512_CONV_3D;
+  case Intrinsic::aie2p_fifo_st_push_576_bfp16:
+    return AIE2P::VST_PUSH_576;
+  case Intrinsic::aie2p_fifo_st_push_544_bfp16:
+    return AIE2P::VST_PUSH_544;
+  case Intrinsic::aie2p_fifo_st_push_512_bfp16:
+    return AIE2P::VST_PUSH_512;
+  }
+  llvm_unreachable("Unreachable: Cannot get fifo store opcode from intrinsic");
+}
+
+/// Select a fifo store intrinsic (push or flush) into the AIE2P instruction
+/// chosen by getStoreFifoOpcode(). Operands 0-2 of \p I define the pointer,
+/// fifo and avail results; the 2D and 3D flushes define one or two count
+/// results after them. Returns false when a count result cannot be
+/// constrained, when the new instruction's operands cannot be constrained, or
+/// when \p I is not one of the intrinsics handled here.
+bool AIE2PInstructionSelector::selectVST_FIFO(MachineInstr &I,
+                                              MachineRegisterInfo &MRI) {
+  Register PtrDef = I.getOperand(0).getReg();
+  Register FifoDef = I.getOperand(1).getReg();
+  Register AvailDef = I.getOperand(2).getReg();
+  MachineInstrBuilder NewMI;
+
+  // Each case only builds the replacement; the shared tail below finishes it.
+  switch (cast<GIntrinsic>(I).getIntrinsicID()) {
+  case Intrinsic::aie2p_fifo_st_push_512_bfp16: {
+    Register Ptr = I.getOperand(4).getReg();
+    Register Vec = I.getOperand(5).getReg();
+    Register Fifo = I.getOperand(6).getReg();
+    Register Avail = I.getOperand(7).getReg();
+    NewMI = MIB.buildInstr(getStoreFifoOpcode(I),
+                           {FifoDef, PtrDef, AvailDef},
+                           {Fifo, Vec, Ptr, Avail});
+    break;
+  }
+  case Intrinsic::aie2p_fifo_st_push_544_bfp16:
+  case Intrinsic::aie2p_fifo_st_push_576_bfp16: {
+    Register Ptr = I.getOperand(4).getReg();
+    Register Mant = I.getOperand(5).getReg();
+    Register Exp = I.getOperand(6).getReg();
+    Register Fifo = I.getOperand(7).getReg();
+    Register Avail = I.getOperand(8).getReg();
+    // The instruction takes mantissa and exponent as one register, so join
+    // them through the bfp16 sub-register indices first.
+    Register Packed = MRI.createVirtualRegister(&AIE2P::mEXaRegClass);
+    MIB.buildInstr(TargetOpcode::REG_SEQUENCE, {Packed}, {})
+        .addReg(Mant)
+        .addImm(AIE2P::sub_bfp16_x)
+        .addReg(Exp)
+        .addImm(AIE2P::sub_bfp16_e);
+    NewMI = MIB.buildInstr(getStoreFifoOpcode(I),
+                           {FifoDef, PtrDef, AvailDef},
+                           {Fifo, Packed, Ptr, Avail});
+    break;
+  }
+  case Intrinsic::aie2p_fifo_st_flush:
+  case Intrinsic::aie2p_fifo_st_flush_conv: {
+    Register Ptr = I.getOperand(4).getReg();
+    Register Fifo = I.getOperand(5).getReg();
+    Register Avail = I.getOperand(6).getReg();
+    NewMI = MIB.buildInstr(getStoreFifoOpcode(I),
+                           {FifoDef, PtrDef, AvailDef}, {Fifo, Ptr, Avail});
+    break;
+  }
+  case Intrinsic::aie2p_fifo_st_flush_1d:
+  case Intrinsic::aie2p_fifo_st_flush_1d_conv: {
+    Register Ptr = I.getOperand(4).getReg();
+    Register Fifo = I.getOperand(5).getReg();
+    Register Avail = I.getOperand(6).getReg();
+    Register Offset = I.getOperand(7).getReg();
+    NewMI = MIB.buildInstr(getStoreFifoOpcode(I),
+                           {FifoDef, PtrDef, AvailDef},
+                           {Fifo, Ptr, Avail, Offset});
+    break;
+  }
+  case Intrinsic::aie2p_fifo_st_flush_2d:
+  case Intrinsic::aie2p_fifo_st_flush_2d_conv: {
+    // One extra result (the count) shifts the inputs up by one operand.
+    Register CountDef = I.getOperand(3).getReg();
+    Register Ptr = I.getOperand(5).getReg();
+    Register Fifo = I.getOperand(6).getReg();
+    Register Avail = I.getOperand(7).getReg();
+    Register Offset = I.getOperand(8).getReg();
+    Register Size = I.getOperand(9).getReg();
+    Register Count = I.getOperand(10).getReg();
+    Register Incr = I.getOperand(11).getReg();
+    if (!RBI.constrainGenericRegister(CountDef, AIE2P::eDCRegClass, MRI))
+      return false;
+    Register DReg = createDRegSequence(Offset, Incr, Size, Count, MRI);
+    NewMI = MIB.buildInstr(getStoreFifoOpcode(I),
+                           {FifoDef, PtrDef, AvailDef, CountDef},
+                           {Fifo, Ptr, Avail, DReg});
+    break;
+  }
+  case Intrinsic::aie2p_fifo_st_flush_3d:
+  case Intrinsic::aie2p_fifo_st_flush_3d_conv: {
+    // Two extra results (the counts) shift the inputs up by two operands.
+    Register CountDef1 = I.getOperand(3).getReg();
+    Register CountDef2 = I.getOperand(4).getReg();
+    Register Ptr = I.getOperand(6).getReg();
+    Register Fifo = I.getOperand(7).getReg();
+    Register Avail = I.getOperand(8).getReg();
+    Register Offset = I.getOperand(9).getReg();
+    Register Size1 = I.getOperand(10).getReg();
+    Register Count1 = I.getOperand(11).getReg();
+    Register Incr1 = I.getOperand(12).getReg();
+    Register Size2 = I.getOperand(13).getReg();
+    Register Count2 = I.getOperand(14).getReg();
+    Register Incr2 = I.getOperand(15).getReg();
+    if (!RBI.constrainGenericRegister(CountDef1, *TRI.getAddrCountRegClass(),
+                                      MRI) ||
+        !RBI.constrainGenericRegister(CountDef2, *TRI.getAddrCountRegClass(),
+                                      MRI))
+      return false;
+    Register DSReg = createDSRegSequence(Offset, Incr1, Incr2, Size1, Count1,
+                                         Size2, Count2, MRI);
+    NewMI = MIB.buildInstr(getStoreFifoOpcode(I),
+                           {FifoDef, PtrDef, AvailDef, CountDef1, CountDef2},
+                           {Fifo, Ptr, Avail, DSReg});
+    break;
+  }
+  default:
+    return false;
+  }
+
+  // The intrinsic is dead once its replacement exists; NewMI stays valid
+  // because it is a separate instruction.
+  I.eraseFromParent();
+  return constrainSelectedInstRegOperands(*NewMI, TII, TRI, RBI);
+}
+
 namespace llvm {
 InstructionSelector *
 createAIE2PInstructionSelector(const AIE2PTargetMachine &TM,
diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-fifo-stores.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-fifo-stores.mir
new file mode 100644
index 000000000000..e8b4a722e8df
--- /dev/null
+++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-fifo-stores.mir
@@ -0,0 +1,391 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates
+# RUN: llc -mtriple aie2p -run-pass=instruction-select %s -verify-machineinstrs -o - | FileCheck %s
+
+
+---
+name:            test_fifo_st_push
+tracksRegLiveness: true
+legalized: true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0, $p1, $x0
+    ; CHECK-LABEL: name: test_fifo_st_push
+    ; CHECK: liveins: $p0, $p1, $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vec512 = COPY $x0
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:mpfs = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:mstfifo = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF2:%[0-9]+]]:mr26_fifo_st = IMPLICIT_DEF
+    ; CHECK-NEXT: [[VST_PUSH_512_:%[0-9]+]]:mstfifo, [[VST_PUSH_512_1:%[0-9]+]]:mpfs, [[VST_PUSH_512_2:%[0-9]+]]:mr26_fifo_st = VST_PUSH_512 [[DEF1]], [[COPY]], [[DEF]], [[DEF2]], implicit-def $srfifo_of
+    ; CHECK-NEXT: PseudoRET implicit $lr
+    %0:ptrregbank(p0) = COPY $p0
+    %1:vregbank(<64 x s8>) = COPY $x0
+    %2:ptrregbank(p0) = COPY $p1
+    %3:modregbank(s20) = G_CONSTANT i20 128
+    %5:ptrregbank(p0) = IMPLICIT_DEF
+    %6:vregbank(<16 x s32>) = G_BITCAST %1:vregbank(<64 x s8>)
+    %7:fiforegbank(<32 x s32>) = IMPLICIT_DEF
+    %8:gprregbank(s32) = IMPLICIT_DEF
+    %9:ptrregbank(p0), %10:fiforegbank(<32 x s32>), %11:gprregbank(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.fifo.st.push.512.bfp16), %5:ptrregbank(p0), %6:vregbank(<16 x s32>), %7:fiforegbank(<32 x s32>), %8:gprregbank(s32)
+    PseudoRET implicit $lr
+...
+
+---
+name:            test_fifo_st_flush
+tracksRegLiveness: true
+legalized: true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0, $p1
+    ; CHECK-LABEL: name: test_fifo_st_flush
+    ; CHECK: liveins: $p0, $p1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:mpfs = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:mstfifo = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF2:%[0-9]+]]:mr26_fifo_st = IMPLICIT_DEF
+    ; CHECK-NEXT: [[VST_FLUSH_512_normal_flush:%[0-9]+]]:mstfifo, [[VST_FLUSH_512_normal_flush1:%[0-9]+]]:mpfs, [[VST_FLUSH_512_normal_flush2:%[0-9]+]]:mr26_fifo_st = VST_FLUSH_512_normal_flush [[DEF1]], [[DEF]], [[DEF2]], implicit-def $srfifo_of
+    ; CHECK-NEXT: PseudoRET implicit $lr
+    %0:ptrregbank(p0) = COPY $p0
+    %1:ptrregbank(p0) = COPY $p1
+    %2:modregbank(s20) = G_CONSTANT i20 128
+    %4:ptrregbank(p0) = IMPLICIT_DEF
+    %5:fiforegbank(<32 x s32>) = IMPLICIT_DEF
+    %6:gprregbank(s32) = IMPLICIT_DEF
+    %7:ptrregbank(p0), %8:fiforegbank(<32 x s32>), %9:gprregbank(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.fifo.st.flush), %4:ptrregbank(p0), %5:fiforegbank(<32 x s32>), %6:gprregbank(s32)
+    PseudoRET implicit $lr
+...
+
+---
+name:            test_fifo_st_flush_1d_byte
+tracksRegLiveness: true
+legalized: true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0, $p1, $r0
+    ; CHECK-LABEL: name: test_fifo_st_flush_1d_byte
+    ; CHECK: liveins: $p0, $p1, $r0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:mpfs = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:mstfifo = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF2:%[0-9]+]]:mr26_fifo_st = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF3:%[0-9]+]]:em = IMPLICIT_DEF
+    ; CHECK-NEXT: [[VST_FLUSH_512_fifo_1d_flush:%[0-9]+]]:mstfifo, [[VST_FLUSH_512_fifo_1d_flush1:%[0-9]+]]:mpfs, [[VST_FLUSH_512_fifo_1d_flush2:%[0-9]+]]:mr26_fifo_st = VST_FLUSH_512_fifo_1d_flush [[DEF1]], [[DEF]], [[DEF2]], [[DEF3]], implicit-def $srfifo_of
+    ; CHECK-NEXT: PseudoRET implicit $lr
+    %0:ptrregbank(p0) = COPY $p0
+    %1:ptrregbank(p0) = COPY $p1
+    %2:gprregbank(s32) = COPY $r0
+    %3:modregbank(s20) = G_CONSTANT i20 128
+    %5:ptrregbank(p0) = IMPLICIT_DEF
+    %6:fiforegbank(<32 x s32>) = IMPLICIT_DEF
+    %7:gprregbank(s32) = IMPLICIT_DEF
+    %8:modregbank(s20) = IMPLICIT_DEF
+    %9:ptrregbank(p0), %10:fiforegbank(<32 x s32>), %11:gprregbank(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.fifo.st.flush.1d), %5:ptrregbank(p0), %6:fiforegbank(<32 x s32>), %7:gprregbank(s32), %8:modregbank(s20)
+    PseudoRET implicit $lr
+...
+
+---
+name:            test_fifo_st_flush_2d_byte
+tracksRegLiveness: true
+legalized: true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0, $p1, $p2, $r0, $r1, $r2
+    ; CHECK-LABEL: name: test_fifo_st_flush_2d_byte
+    ; CHECK: liveins: $p0, $p1, $p2, $r0, $r1, $r2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:mpfs = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:mstfifo = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF2:%[0-9]+]]:mr26_fifo_st = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF3:%[0-9]+]]:em = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF4:%[0-9]+]]:edn = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF5:%[0-9]+]]:edc = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF6:%[0-9]+]]:edj = IMPLICIT_DEF
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ed = REG_SEQUENCE [[DEF3]], %subreg.sub_mod, [[DEF4]], %subreg.sub_dim_size, [[DEF6]], %subreg.sub_dim_stride, [[DEF5]], %subreg.sub_dim_count
+    ; CHECK-NEXT: [[VST_FLUSH_512_2D:%[0-9]+]]:mstfifo, [[VST_FLUSH_512_2D1:%[0-9]+]]:mpfs, [[VST_FLUSH_512_2D2:%[0-9]+]]:mr26_fifo_st, [[VST_FLUSH_512_2D3:%[0-9]+]]:edc = VST_FLUSH_512_2D [[DEF1]], [[DEF]], [[DEF2]], [[REG_SEQUENCE]], implicit-def $srfifo_of
+    ; CHECK-NEXT: PseudoRET implicit $lr
+    %0:ptrregbank(p0) = COPY $p0
+    %1:ptrregbank(p0) = COPY $p1
+    %2:gprregbank(s32) = COPY $r0
+    %3:gprregbank(s32) = COPY $r1
+    %4:ptrregbank(p0) = COPY $p2
+    %5:gprregbank(s32) = COPY $r2
+    %6:modregbank(s20) = G_CONSTANT i20 128
+    %8:ptrregbank(p0) = IMPLICIT_DEF
+    %9:fiforegbank(<32 x s32>) = IMPLICIT_DEF
+    %10:gprregbank(s32) = IMPLICIT_DEF
+    %11:modregbank(s20) = IMPLICIT_DEF
+    %12:modregbank(s20) = IMPLICIT_DEF
+    %13:gprregbank(s32) = IMPLICIT_DEF
+    %14:modregbank(s20) = IMPLICIT_DEF
+    %15:modregbank(s20) = IMPLICIT_DEF
+    %16:ptrregbank(p0), %17:fiforegbank(<32 x s32>), %18:gprregbank(s32), %19:modregbank(s20) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.fifo.st.flush.2d), %8:ptrregbank(p0), %9:fiforegbank(<32 x s32>), %10:gprregbank(s32), %11:modregbank(s20), %12:modregbank(s20), %14:modregbank(s20), %15:modregbank(s20)
+    PseudoRET implicit $lr
+...
+
+---
+name:            test_fifo_st_flush_3d_byte
+tracksRegLiveness: true
+legalized: true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0, $p1, $p2, $p3, $r0, $r1, $r2, $r3, $r4
+    ; CHECK-LABEL: name: test_fifo_st_flush_3d_byte
+    ; CHECK: liveins: $p0, $p1, $p2, $p3, $r0, $r1, $r2, $r3, $r4
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:mpfs = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:mstfifo = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF2:%[0-9]+]]:mr26_fifo_st = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF3:%[0-9]+]]:em = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF4:%[0-9]+]]:edn = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF5:%[0-9]+]]:edc = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF6:%[0-9]+]]:edj = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF7:%[0-9]+]]:edn = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF8:%[0-9]+]]:edc = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF9:%[0-9]+]]:edj = IMPLICIT_DEF
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:eds = REG_SEQUENCE [[DEF3]], %subreg.sub_mod, [[DEF4]], %subreg.sub_dim_size, [[DEF6]], %subreg.sub_dim_stride, [[DEF5]], %subreg.sub_dim_count, [[DEF7]], %subreg.sub_hi_dim_then_sub_dim_size, [[DEF9]], %subreg.sub_hi_dim_then_sub_dim_stride, [[DEF8]], %subreg.sub_hi_dim_then_sub_dim_count
+    ; CHECK-NEXT: [[VST_FLUSH_512_3D:%[0-9]+]]:mstfifo, [[VST_FLUSH_512_3D1:%[0-9]+]]:mpfs, [[VST_FLUSH_512_3D2:%[0-9]+]]:mr26_fifo_st, [[VST_FLUSH_512_3D3:%[0-9]+]]:edcl, [[VST_FLUSH_512_3D4:%[0-9]+]]:edch = VST_FLUSH_512_3D [[DEF1]], [[DEF]], [[DEF2]], [[REG_SEQUENCE]], implicit-def $srfifo_of
+    ; CHECK-NEXT: PseudoRET implicit $lr
+    %0:ptrregbank(p0) = COPY $p0
+    %1:ptrregbank(p0) = COPY $p1
+    %2:gprregbank(s32) = COPY $r0
+    %3:gprregbank(s32) = COPY $r1
+    %4:ptrregbank(p0) = COPY $p2
+    %5:gprregbank(s32) = COPY $r2
+    %6:gprregbank(s32) = COPY $r3
+    %7:ptrregbank(p0) = COPY $p3
+    %8:gprregbank(s32) = COPY $r4
+    %9:modregbank(s20) = G_CONSTANT i20 128
+    %11:ptrregbank(p0) = IMPLICIT_DEF
+    %12:fiforegbank(<32 x s32>) = IMPLICIT_DEF
+    %13:gprregbank(s32) = IMPLICIT_DEF
+    %14:modregbank(s20) = IMPLICIT_DEF
+    %15:modregbank(s20) = IMPLICIT_DEF
+    %16:gprregbank(s32) = IMPLICIT_DEF
+    %17:modregbank(s20) = IMPLICIT_DEF
+    %18:modregbank(s20) = IMPLICIT_DEF
+    %19:modregbank(s20) = IMPLICIT_DEF
+    %20:gprregbank(s32) = IMPLICIT_DEF
+    %21:modregbank(s20) = IMPLICIT_DEF
+    %22:modregbank(s20) = IMPLICIT_DEF
+    %23:ptrregbank(p0), %24:fiforegbank(<32 x s32>), %25:gprregbank(s32), %26:modregbank(s20), %27:modregbank(s20) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.fifo.st.flush.3d), %11:ptrregbank(p0), %12:fiforegbank(<32 x s32>), %13:gprregbank(s32), %14:modregbank(s20), %15:modregbank(s20), %17:modregbank(s20), %18:modregbank(s20), %19:modregbank(s20), %21:modregbank(s20), %22:modregbank(s20)
+    PseudoRET implicit $lr
+...
+
+---
+name:            test_fifo_st_flush_conv
+tracksRegLiveness: true
+legalized: true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0, $p1
+    ; CHECK-LABEL: name: test_fifo_st_flush_conv
+    ; CHECK: liveins: $p0, $p1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:mpfs = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:mstfifo = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF2:%[0-9]+]]:mr26_fifo_st = IMPLICIT_DEF
+    ; CHECK-NEXT: [[VST_FLUSH_512_CONV_normal_flush:%[0-9]+]]:mstfifo, [[VST_FLUSH_512_CONV_normal_flush1:%[0-9]+]]:mpfs, [[VST_FLUSH_512_CONV_normal_flush2:%[0-9]+]]:mr26_fifo_st = VST_FLUSH_512_CONV_normal_flush [[DEF1]], [[DEF]], [[DEF2]], implicit-def $srfifo_of
+    ; CHECK-NEXT: PseudoRET implicit $lr
+    %0:ptrregbank(p0) = COPY $p0
+    %1:ptrregbank(p0) = COPY $p1
+    %2:modregbank(s20) = G_CONSTANT i20 128
+    %4:ptrregbank(p0) = IMPLICIT_DEF
+    %5:fiforegbank(<32 x s32>) = IMPLICIT_DEF
+    %6:gprregbank(s32) = IMPLICIT_DEF
+    %7:ptrregbank(p0), %8:fiforegbank(<32 x s32>), %9:gprregbank(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.fifo.st.flush.conv), %4:ptrregbank(p0), %5:fiforegbank(<32 x s32>), %6:gprregbank(s32)
+    PseudoRET implicit $lr
+...
+
+---
+name:            test_fifo_st_flush_conv_1d_byte
+tracksRegLiveness: true
+legalized: true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0, $p1, $r0
+    ; CHECK-LABEL: name: test_fifo_st_flush_conv_1d_byte
+    ; CHECK: liveins: $p0, $p1, $r0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:mpfs = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:mstfifo = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF2:%[0-9]+]]:mr26_fifo_st = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF3:%[0-9]+]]:em = IMPLICIT_DEF
+    ; CHECK-NEXT: [[VST_FLUSH_512_CONV_fifo_1d_flush:%[0-9]+]]:mstfifo, [[VST_FLUSH_512_CONV_fifo_1d_flush1:%[0-9]+]]:mpfs, [[VST_FLUSH_512_CONV_fifo_1d_flush2:%[0-9]+]]:mr26_fifo_st = VST_FLUSH_512_CONV_fifo_1d_flush [[DEF1]], [[DEF]], [[DEF2]], [[DEF3]], implicit-def $srfifo_of
+    ; CHECK-NEXT: PseudoRET implicit $lr
+    %0:ptrregbank(p0) = COPY $p0
+    %1:ptrregbank(p0) = COPY $p1
+    %2:gprregbank(s32) = COPY $r0
+    %3:modregbank(s20) = G_CONSTANT i20 128
+    %5:ptrregbank(p0) = IMPLICIT_DEF
+    %6:fiforegbank(<32 x s32>) = IMPLICIT_DEF
+    %7:gprregbank(s32) = IMPLICIT_DEF
+    %8:modregbank(s20) = IMPLICIT_DEF
+    %9:ptrregbank(p0), %10:fiforegbank(<32 x s32>), %11:gprregbank(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.fifo.st.flush.1d.conv), %5:ptrregbank(p0), %6:fiforegbank(<32 x s32>), %7:gprregbank(s32), %8:modregbank(s20)
+    PseudoRET implicit $lr
+...
+
+---
+name:            test_fifo_st_flush_conv_2d_byte
+tracksRegLiveness: true
+legalized: true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0, $p1, $p2, $r0, $r1, $r2
+    ; CHECK-LABEL: name: test_fifo_st_flush_conv_2d_byte
+    ; CHECK: liveins: $p0, $p1, $p2, $r0, $r1, $r2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:mpfs = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:mstfifo = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF2:%[0-9]+]]:mr26_fifo_st = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF3:%[0-9]+]]:em = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF4:%[0-9]+]]:edn = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF5:%[0-9]+]]:edc = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF6:%[0-9]+]]:edj = IMPLICIT_DEF
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ed = REG_SEQUENCE [[DEF3]], %subreg.sub_mod, [[DEF4]], %subreg.sub_dim_size, [[DEF6]], %subreg.sub_dim_stride, [[DEF5]], %subreg.sub_dim_count
+    ; CHECK-NEXT: [[VST_FLUSH_512_CONV_2D:%[0-9]+]]:mstfifo, [[VST_FLUSH_512_CONV_2D1:%[0-9]+]]:mpfs, [[VST_FLUSH_512_CONV_2D2:%[0-9]+]]:mr26_fifo_st, [[VST_FLUSH_512_CONV_2D3:%[0-9]+]]:edc = VST_FLUSH_512_CONV_2D [[DEF1]], [[DEF]], [[DEF2]], [[REG_SEQUENCE]], implicit-def $srfifo_of
+    ; CHECK-NEXT: PseudoRET implicit $lr
+    %0:ptrregbank(p0) = COPY $p0
+    %1:ptrregbank(p0) = COPY $p1
+    %2:gprregbank(s32) = COPY $r0
+    %3:gprregbank(s32) = COPY $r1
+    %4:ptrregbank(p0) = COPY $p2
+    %5:gprregbank(s32) = COPY $r2
+    %6:modregbank(s20) = G_CONSTANT i20 128
+    %8:ptrregbank(p0) = IMPLICIT_DEF
+    %9:fiforegbank(<32 x s32>) = IMPLICIT_DEF
+    %10:gprregbank(s32) = IMPLICIT_DEF
+    %11:modregbank(s20) = IMPLICIT_DEF
+    %12:modregbank(s20) = IMPLICIT_DEF
+    %13:gprregbank(s32) = IMPLICIT_DEF
+    %14:modregbank(s20) = IMPLICIT_DEF
+    %15:modregbank(s20) = IMPLICIT_DEF
+    %16:ptrregbank(p0), %17:fiforegbank(<32 x s32>), %18:gprregbank(s32), %19:modregbank(s20) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.fifo.st.flush.2d.conv), %8:ptrregbank(p0), %9:fiforegbank(<32 x s32>), %10:gprregbank(s32), %11:modregbank(s20), %12:modregbank(s20), %14:modregbank(s20), %15:modregbank(s20)
+    PseudoRET implicit $lr
+...
+
+---
+name:            test_fifo_st_flush_conv_3d_byte
+tracksRegLiveness: true
+legalized: true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0, $p1, $p2, $p3, $r0, $r1, $r2, $r3, $r4
+    ; CHECK-LABEL: name: test_fifo_st_flush_conv_3d_byte
+    ; CHECK: liveins: $p0, $p1, $p2, $p3, $r0, $r1, $r2, $r3, $r4
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:mpfs = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:mstfifo = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF2:%[0-9]+]]:mr26_fifo_st = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF3:%[0-9]+]]:em = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF4:%[0-9]+]]:edn = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF5:%[0-9]+]]:edc = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF6:%[0-9]+]]:edj = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF7:%[0-9]+]]:edn = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF8:%[0-9]+]]:edc = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF9:%[0-9]+]]:edj = IMPLICIT_DEF
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:eds = REG_SEQUENCE [[DEF3]], %subreg.sub_mod, [[DEF4]], %subreg.sub_dim_size, [[DEF6]], %subreg.sub_dim_stride, [[DEF5]], %subreg.sub_dim_count, [[DEF7]], %subreg.sub_hi_dim_then_sub_dim_size, [[DEF9]], %subreg.sub_hi_dim_then_sub_dim_stride, [[DEF8]], %subreg.sub_hi_dim_then_sub_dim_count
+    ; CHECK-NEXT: [[VST_FLUSH_512_CONV_3D:%[0-9]+]]:mstfifo, [[VST_FLUSH_512_CONV_3D1:%[0-9]+]]:mpfs, [[VST_FLUSH_512_CONV_3D2:%[0-9]+]]:mr26_fifo_st, [[VST_FLUSH_512_CONV_3D3:%[0-9]+]]:edcl, [[VST_FLUSH_512_CONV_3D4:%[0-9]+]]:edch = VST_FLUSH_512_CONV_3D [[DEF1]], [[DEF]], [[DEF2]], [[REG_SEQUENCE]], implicit-def $srfifo_of
+    ; CHECK-NEXT: PseudoRET implicit $lr
+    %0:ptrregbank(p0) = COPY $p0
+    %1:ptrregbank(p0) = COPY $p1
+    %2:gprregbank(s32) = COPY $r0
+    %3:gprregbank(s32) = COPY $r1
+    %4:ptrregbank(p0) = COPY $p2
+    %5:gprregbank(s32) = COPY $r2
+    %6:gprregbank(s32) = COPY $r3
+    %7:ptrregbank(p0) = COPY $p3
+    %8:gprregbank(s32) = COPY $r4
+    %9:modregbank(s20) = G_CONSTANT i20 128
+    %11:ptrregbank(p0) = IMPLICIT_DEF
+    %12:fiforegbank(<32 x s32>) = IMPLICIT_DEF
+    %13:gprregbank(s32) = IMPLICIT_DEF
+    %14:modregbank(s20) = IMPLICIT_DEF
+    %15:modregbank(s20) = IMPLICIT_DEF
+    %16:gprregbank(s32) = IMPLICIT_DEF
+    %17:modregbank(s20) = IMPLICIT_DEF
+    %18:modregbank(s20) = IMPLICIT_DEF
+    %19:modregbank(s20) = IMPLICIT_DEF
+    %20:gprregbank(s32) = IMPLICIT_DEF
+    %21:modregbank(s20) = IMPLICIT_DEF
+    %22:modregbank(s20) = IMPLICIT_DEF
+    %23:ptrregbank(p0), %24:fiforegbank(<32 x s32>), %25:gprregbank(s32), %26:modregbank(s20), %27:modregbank(s20) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.fifo.st.flush.3d.conv), %11:ptrregbank(p0), %12:fiforegbank(<32 x s32>), %13:gprregbank(s32), %14:modregbank(s20), %15:modregbank(s20), %17:modregbank(s20), %18:modregbank(s20), %19:modregbank(s20), %21:modregbank(s20), %22:modregbank(s20)
+    PseudoRET implicit $lr
+...
+
+---
+name:            test_fifo_st_push_576
+tracksRegLiveness: true
+legalized: true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $e0, $p0, $p1, $x0
+    ; CHECK-LABEL: name: test_fifo_st_push_576
+    ; CHECK: liveins: $e0, $p0, $p1, $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vec512 = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:el = COPY $e0
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:mpfs = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:mstfifo = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF2:%[0-9]+]]:mr26_fifo_st = IMPLICIT_DEF
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:mbp2bp = REG_SEQUENCE [[COPY]], %subreg.sub_bfp16_x, [[COPY1]], %subreg.sub_bfp16_e
+    ; CHECK-NEXT: [[VST_PUSH_576_:%[0-9]+]]:mstfifo, [[VST_PUSH_576_1:%[0-9]+]]:mpfs, [[VST_PUSH_576_2:%[0-9]+]]:mr26_fifo_st = VST_PUSH_576 [[DEF1]], [[REG_SEQUENCE]], [[DEF]], [[DEF2]], implicit-def $srfifo_of
+    ; CHECK-NEXT: PseudoRET implicit $lr
+    %0:ptrregbank(p0) = COPY $p0
+    %1:vregbank(<64 x s8>) = COPY $x0
+    %2:gprregbank(<8 x s8>) = COPY $e0
+    %3:ptrregbank(p0) = COPY $p1
+    %4:modregbank(s20) = G_CONSTANT i20 128
+    %6:ptrregbank(p0) = IMPLICIT_DEF
+    %7:fiforegbank(<32 x s32>) = IMPLICIT_DEF
+    %8:gprregbank(s32) = IMPLICIT_DEF
+    %9:ptrregbank(p0), %10:fiforegbank(<32 x s32>), %11:gprregbank(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.fifo.st.push.576.bfp16), %6:ptrregbank(p0), %1:vregbank(<64 x s8>), %2:gprregbank(<8 x s8>), %7:fiforegbank(<32 x s32>), %8:gprregbank(s32)
+    PseudoRET implicit $lr
+...
+
+---
+name:            test_fifo_st_push_544
+tracksRegLiveness: true
+legalized: true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $e0, $p0, $p1, $x0
+    ; CHECK-LABEL: name: test_fifo_st_push_544
+    ; CHECK: liveins: $e0, $p0, $p1, $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vec512 = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:el = COPY $e0
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:mpfs = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:mstfifo = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF2:%[0-9]+]]:mr26_fifo_st = IMPLICIT_DEF
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:mbp2bp = REG_SEQUENCE [[COPY]], %subreg.sub_bfp16_x, [[COPY1]], %subreg.sub_bfp16_e
+    ; CHECK-NEXT: [[VST_PUSH_544_:%[0-9]+]]:mstfifo, [[VST_PUSH_544_1:%[0-9]+]]:mpfs, [[VST_PUSH_544_2:%[0-9]+]]:mr26_fifo_st = VST_PUSH_544 [[DEF1]], [[REG_SEQUENCE]], [[DEF]], [[DEF2]], implicit-def $srfifo_of
+    ; CHECK-NEXT: PseudoRET implicit $lr
+    %0:ptrregbank(p0) = COPY $p0
+    %1:vregbank(<64 x s8>) = COPY $x0
+    %2:gprregbank(<8 x s8>) = COPY $e0
+    %3:ptrregbank(p0) = COPY $p1
+    %4:modregbank(s20) = G_CONSTANT i20 128
+    %6:ptrregbank(p0) = IMPLICIT_DEF
+    %7:fiforegbank(<32 x s32>) = IMPLICIT_DEF
+    %8:gprregbank(s32) = IMPLICIT_DEF
+    %9:ptrregbank(p0), %10:fiforegbank(<32 x s32>), %11:gprregbank(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.fifo.st.push.544.bfp16), %6:ptrregbank(p0), %1:vregbank(<64 x s8>), %2:gprregbank(<8 x s8>), %7:fiforegbank(<32 x s32>), %8:gprregbank(s32)
+    PseudoRET implicit $lr
+...
+
diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/regbankselect-fifo-insn.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/regbankselect-fifo-insn.mir
index fefe24630e65..93b61c00990d 100644
--- a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/regbankselect-fifo-insn.mir
+++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/regbankselect-fifo-insn.mir
@@ -4,7 +4,7 @@
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 #
-# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
+# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates
 # RUN: llc -mtriple aie2p -run-pass=regbankselect -regbankselect-greedy %s -verify-machineinstrs -o - | FileCheck --check-prefix=GREEDY %s
 # RUN: llc -mtriple aie2p -run-pass=regbankselect -regbankselect-fast %s -verify-machineinstrs -o - | FileCheck --check-prefix=FAST %s
 ---
diff --git a/llvm/test/CodeGen/AIE/aie2p/ldst-fifo-stores.ll b/llvm/test/CodeGen/AIE/aie2p/ldst-fifo-stores.ll
new file mode 100644
index 000000000000..a717eeb05134
--- /dev/null
+++ b/llvm/test/CodeGen/AIE/aie2p/ldst-fifo-stores.ll
@@ -0,0 +1,641 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+;
+; This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+; See https://llvm.org/LICENSE.txt for license information.
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+;
+; (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates
+; RUN: llc < %s -verify-machineinstrs -mtriple=aie2p | FileCheck %s
+
+%struct.v64bfp16ebs8 = type <{ <64 x i8>, <8 x i8> }>
+%struct.v64bfp16ebs16 = type <{ <64 x i8>, <8 x i8> }>
+
+; "Reset" form of push.512: the position operand is the constant 0, so r26 is expected to be set with 'mova r26, #0' and only the pointer and FIFO state are stored back. Function Attrs: mustprogress nounwind memory(write, argmem: readwrite, inaccessiblemem: none)
+define dso_local void @_Z18test_fifo_st_resetRPDv64_DB8_S0_R12fifo_state_t(ptr nocapture nonnull align 4 dereferenceable(4) %p, <64 x i8> noundef %v, ptr nocapture nonnull align 64 dereferenceable(256) %s) local_unnamed_addr #2 {
+; CHECK-LABEL: _Z18test_fifo_st_resetRPDv64_DB8_S0_R12fifo_state_t:
+; CHECK:         .p2align 4
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    vlda sfl, [p1, #0]; nopx
+; CHECK-NEXT:    lda p2, [p0, #0]
+; CHECK-NEXT:    vlda sfh, [p1, #64]
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    mova r26, #0
+; CHECK-NEXT:    vst.push.512 x0, [p2, sf, r26]; ret lr
+; CHECK-NEXT:    nop // Delay Slot 5
+; CHECK-NEXT:    vst sfl, [p1, #0] // Delay Slot 4
+; CHECK-NEXT:    vst sfh, [p1, #64] // Delay Slot 3
+; CHECK-NEXT:    st p2, [p0, #0] // Delay Slot 2
+; CHECK-NEXT:    nop // Delay Slot 1
+entry:
+  %0 = load ptr, ptr %p, align 4, !tbaa !2
+  %1 = bitcast <64 x i8> %v to <16 x i32>
+  %2 = load <32 x i32>, ptr %s, align 64, !tbaa !6
+  %3 = tail call { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.push.512.bfp16(ptr %0, <16 x i32> %1, <32 x i32> %2, i32 0)
+  %4 = extractvalue { ptr, <32 x i32>, i32 } %3, 0
+  %5 = extractvalue { ptr, <32 x i32>, i32 } %3, 1
+  store <32 x i32> %5, ptr %s, align 128
+  store ptr %4, ptr %p, align 4
+  ret void
+}
+
+; push.512 with the position loaded from byte offset 128 of %s; the updated pointer, FIFO state and position are all stored back. Expects vst.push.512. Function Attrs: mustprogress nounwind memory(write, argmem: readwrite, inaccessiblemem: none)
+define dso_local void @_Z17test_fifo_st_pushRPDv64_DB8_S0_R12fifo_state_t(ptr nocapture nonnull align 4 dereferenceable(4) %p, <64 x i8> noundef %v, ptr nocapture nonnull align 64 dereferenceable(256) %s) local_unnamed_addr #2 {
+; CHECK-LABEL: _Z17test_fifo_st_pushRPDv64_DB8_S0_R12fifo_state_t:
+; CHECK:         .p2align 4
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    vlda sfl, [p1, #0]; nopb ; nopxm ; nops
+; CHECK-NEXT:    lda p2, [p0, #0]; mov dj0, #128
+; CHECK-NEXT:    lda r26, [p1, dj0]
+; CHECK-NEXT:    vlda sfh, [p1, #64]
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vst.push.512 x0, [p2, sf, r26]
+; CHECK-NEXT:    ret lr
+; CHECK-NEXT:    st r26, [p1, dj0] // Delay Slot 5
+; CHECK-NEXT:    vst sfl, [p1, #0] // Delay Slot 4
+; CHECK-NEXT:    vst sfh, [p1, #64] // Delay Slot 3
+; CHECK-NEXT:    st p2, [p0, #0] // Delay Slot 2
+; CHECK-NEXT:    nop // Delay Slot 1
+entry:
+  %pos1.i = getelementptr inbounds i8, ptr %s, i20 128
+  %0 = load ptr, ptr %p, align 4, !tbaa !2
+  %1 = bitcast <64 x i8> %v to <16 x i32>
+  %2 = load <32 x i32>, ptr %s, align 64, !tbaa !6
+  %3 = load i32, ptr %pos1.i, align 64, !tbaa !7
+  %4 = tail call { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.push.512.bfp16(ptr %0, <16 x i32> %1, <32 x i32> %2, i32 %3)
+  %5 = extractvalue { ptr, <32 x i32>, i32 } %4, 0
+  %6 = extractvalue { ptr, <32 x i32>, i32 } %4, 1
+  %7 = extractvalue { ptr, <32 x i32>, i32 } %4, 2
+  store <32 x i32> %6, ptr %s, align 128
+  store i32 %7, ptr %pos1.i, align 64
+  store ptr %5, ptr %p, align 4
+  ret void
+}
+
+; Plain flush (llvm.aie2p.fifo.st.flush) with pointer, FIFO state and position loaded from and stored back to memory. Expects vst.flush.512 without a modifier operand. Function Attrs: mustprogress nounwind memory(write, argmem: readwrite, inaccessiblemem: none)
+define dso_local void @_Z18test_fifo_st_flushRPDv64_DB8_R12fifo_state_t(ptr nocapture nonnull align 4 dereferenceable(4) %p, ptr nocapture nonnull align 64 dereferenceable(256) %s) local_unnamed_addr #2 {
+; CHECK-LABEL: _Z18test_fifo_st_flushRPDv64_DB8_R12fifo_state_t:
+; CHECK:         .p2align 4
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    vlda sfl, [p1, #0]; nopb ; nopxm ; nops
+; CHECK-NEXT:    lda p2, [p0, #0]; mov dj0, #128
+; CHECK-NEXT:    lda r26, [p1, dj0]
+; CHECK-NEXT:    vlda sfh, [p1, #64]
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vst.flush.512 [p2, sf, r26]
+; CHECK-NEXT:    ret lr
+; CHECK-NEXT:    st r26, [p1, dj0] // Delay Slot 5
+; CHECK-NEXT:    vst sfl, [p1, #0] // Delay Slot 4
+; CHECK-NEXT:    vst sfh, [p1, #64] // Delay Slot 3
+; CHECK-NEXT:    st p2, [p0, #0] // Delay Slot 2
+; CHECK-NEXT:    nop // Delay Slot 1
+entry:
+  %pos1.i = getelementptr inbounds i8, ptr %s, i20 128
+  %0 = load ptr, ptr %p, align 4, !tbaa !2
+  %1 = load <32 x i32>, ptr %s, align 64, !tbaa !6
+  %2 = load i32, ptr %pos1.i, align 64, !tbaa !7
+  %3 = tail call { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.flush(ptr %0, <32 x i32> %1, i32 %2)
+  %4 = extractvalue { ptr, <32 x i32>, i32 } %3, 0
+  %5 = extractvalue { ptr, <32 x i32>, i32 } %3, 1
+  %6 = extractvalue { ptr, <32 x i32>, i32 } %3, 2
+  store <32 x i32> %5, ptr %s, align 128
+  store i32 %6, ptr %pos1.i, align 64
+  store ptr %4, ptr %p, align 4
+  ret void
+}
+
+; 1D flush (llvm.aie2p.fifo.st.flush.1d): %off is truncated to i20 and passed as the extra operand. Expects 'mov m0, r0' followed by vst.flush.512 with an m0 operand. Function Attrs: mustprogress nounwind memory(write, argmem: readwrite, inaccessiblemem: none)
+define dso_local void @_Z26test_fifo_st_flush_1d_byteRPDv64_DB8_R12fifo_state_ti(ptr nocapture nonnull align 4 dereferenceable(4) %p, ptr nocapture nonnull align 64 dereferenceable(256) %s, i32 noundef %off) local_unnamed_addr #2 {
+; CHECK-LABEL: _Z26test_fifo_st_flush_1d_byteRPDv64_DB8_R12fifo_state_ti:
+; CHECK:         .p2align 4
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    vlda sfl, [p1, #0]; nopb ; nopxm
+; CHECK-NEXT:    lda p2, [p0, #0]; mov dj0, #128
+; CHECK-NEXT:    lda r26, [p1, dj0]
+; CHECK-NEXT:    vlda sfh, [p1, #64]
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    mov m0, r0
+; CHECK-NEXT:    vst.flush.512 [p2, sf, r26, m0]
+; CHECK-NEXT:    ret lr
+; CHECK-NEXT:    st r26, [p1, dj0] // Delay Slot 5
+; CHECK-NEXT:    vst sfl, [p1, #0] // Delay Slot 4
+; CHECK-NEXT:    vst sfh, [p1, #64] // Delay Slot 3
+; CHECK-NEXT:    st p2, [p0, #0] // Delay Slot 2
+; CHECK-NEXT:    nop // Delay Slot 1
+entry:
+  %pos1.i = getelementptr inbounds i8, ptr %s, i20 128
+  %0 = load ptr, ptr %p, align 4, !tbaa !2
+  %1 = load <32 x i32>, ptr %s, align 64, !tbaa !6
+  %2 = load i32, ptr %pos1.i, align 64, !tbaa !7
+  %3 = trunc i32 %off to i20
+  %4 = tail call { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.flush.1d(ptr %0, <32 x i32> %1, i32 %2, i20 %3)
+  %5 = extractvalue { ptr, <32 x i32>, i32 } %4, 0
+  %6 = extractvalue { ptr, <32 x i32>, i32 } %4, 1
+  %7 = extractvalue { ptr, <32 x i32>, i32 } %4, 2
+  store <32 x i32> %6, ptr %s, align 128
+  store i32 %7, ptr %pos1.i, align 64
+  store ptr %5, ptr %p, align 4
+  ret void
+}
+
+; 2D flush with conversion (llvm.aie2p.fifo.st.flush.2d.conv): the returned i20 counter is zero-extended and stored back to %count1. Expects vst.flush.512.conv.2d. Function Attrs: mustprogress nounwind memory(write, argmem: readwrite, inaccessiblemem: none)
+define dso_local void @_Z31test_fifo_st_flush_conv_2d_byteRPDv64_DB8_R12fifo_state_tiiRii(ptr nocapture nonnull align 4 dereferenceable(4) %p, ptr nocapture nonnull align 64 dereferenceable(256) %s, i32 noundef %off, i32 noundef %size1, ptr nocapture nonnull align 4 dereferenceable(4) %count1, i32 noundef %inc1) local_unnamed_addr #2 {
+; CHECK-LABEL: _Z31test_fifo_st_flush_conv_2d_byteRPDv64_DB8_R12fifo_state_tiiRii:
+; CHECK:         .p2align 4
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    lda dc0, [p2, #0]; nopb ; nopxm
+; CHECK-NEXT:    lda p2, [p0, #0]; mov dj1, #128
+; CHECK-NEXT:    lda r26, [p1, dj1]
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vlda sfl, [p1, #0]
+; CHECK-NEXT:    vlda sfh, [p1, #64]; mov m0, r0
+; CHECK-NEXT:    mov p3, p2
+; CHECK-NEXT:    mov dn0, r1
+; CHECK-NEXT:    mov dj0, r2
+; CHECK-NEXT:    vst.flush.512.conv.2d [p2, sf, r26, d0]
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    st dc0, [p3, #0]; ret lr
+; CHECK-NEXT:    st r26, [p1, dj1] // Delay Slot 5
+; CHECK-NEXT:    vst sfl, [p1, #0] // Delay Slot 4
+; CHECK-NEXT:    vst sfh, [p1, #64] // Delay Slot 3
+; CHECK-NEXT:    st p2, [p0, #0] // Delay Slot 2
+; CHECK-NEXT:    nop // Delay Slot 1
+entry:
+  %pos1.i = getelementptr inbounds i8, ptr %s, i20 128
+  %0 = load ptr, ptr %p, align 4, !tbaa !2
+  %1 = load <32 x i32>, ptr %s, align 64, !tbaa !6
+  %2 = load i32, ptr %pos1.i, align 64, !tbaa !7
+  %3 = trunc i32 %off to i20
+  %4 = trunc i32 %size1 to i20
+  %5 = load i32, ptr %count1, align 4, !tbaa !7
+  %6 = trunc i32 %5 to i20
+  %7 = trunc i32 %inc1 to i20
+  %8 = tail call { ptr, <32 x i32>, i32, i20 } @llvm.aie2p.fifo.st.flush.2d.conv(ptr %0, <32 x i32> %1, i32 %2, i20 %3, i20 %4, i20 %6, i20 %7)
+  %9 = extractvalue { ptr, <32 x i32>, i32, i20 } %8, 0
+  %10 = extractvalue { ptr, <32 x i32>, i32, i20 } %8, 1
+  %11 = extractvalue { ptr, <32 x i32>, i32, i20 } %8, 2
+  %12 = extractvalue { ptr, <32 x i32>, i32, i20 } %8, 3
+  %13 = zext i20 %12 to i32
+  store i32 %13, ptr %count1, align 4
+  store <32 x i32> %10, ptr %s, align 128
+  store i32 %11, ptr %pos1.i, align 64
+  store ptr %9, ptr %p, align 4
+  ret void
+}
+
+; 3D flush (llvm.aie2p.fifo.st.flush.3d): both returned i20 counters are zero-extended and stored back to %count1 and %count2. Expects vst.flush.512.3d. Function Attrs: mustprogress nounwind memory(write, argmem: readwrite, inaccessiblemem: none)
+define dso_local void @_Z26test_fifo_st_flush_3d_byteRPDv64_DB8_R12fifo_state_tiiRiiiS5_i(ptr nocapture nonnull align 4 dereferenceable(4) %p, ptr nocapture nonnull align 64 dereferenceable(256) %s, i32 noundef %off, i32 noundef %size1, ptr nocapture nonnull align 4 dereferenceable(4) %count1, i32 noundef %inc1, i32 noundef %size2, ptr nocapture nonnull align 4 dereferenceable(4) %count2, i32 noundef %inc2) local_unnamed_addr #2 {
+; CHECK-LABEL: _Z26test_fifo_st_flush_3d_byteRPDv64_DB8_R12fifo_state_tiiRiiiS5_i:
+; CHECK:         .p2align 4
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    vlda sfl, [p1, #0]; nopb ; nops ; nopxm ; nopv
+; CHECK-NEXT:    lda dc0, [p2, #0]; nopx
+; CHECK-NEXT:    lda dc4, [p3, #0]
+; CHECK-NEXT:    lda p2, [p0, #0]; mov dj1, #128
+; CHECK-NEXT:    lda r26, [p1, dj1]
+; CHECK-NEXT:    vlda sfh, [p1, #64]; mov m0, r0
+; CHECK-NEXT:    mov dn0, r1
+; CHECK-NEXT:    mov dj0, r2
+; CHECK-NEXT:    mov p4, p2
+; CHECK-NEXT:    mov dn4, r3
+; CHECK-NEXT:    mov dj4, r4
+; CHECK-NEXT:    vst.flush.512.3d [p2, sf, r26, d0]
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    st dc0, [p4, #0]
+; CHECK-NEXT:    st dc4, [p3, #0]; ret lr
+; CHECK-NEXT:    st r26, [p1, dj1] // Delay Slot 5
+; CHECK-NEXT:    vst sfl, [p1, #0] // Delay Slot 4
+; CHECK-NEXT:    vst sfh, [p1, #64] // Delay Slot 3
+; CHECK-NEXT:    st p2, [p0, #0] // Delay Slot 2
+; CHECK-NEXT:    nop // Delay Slot 1
+entry:
+  %pos1.i = getelementptr inbounds i8, ptr %s, i20 128
+  %0 = load ptr, ptr %p, align 4, !tbaa !2
+  %1 = load <32 x i32>, ptr %s, align 64, !tbaa !6
+  %2 = load i32, ptr %pos1.i, align 64, !tbaa !7
+  %3 = trunc i32 %off to i20
+  %4 = trunc i32 %size1 to i20
+  %5 = load i32, ptr %count1, align 4, !tbaa !7
+  %6 = trunc i32 %5 to i20
+  %7 = trunc i32 %inc1 to i20
+  %8 = trunc i32 %size2 to i20
+  %9 = load i32, ptr %count2, align 4, !tbaa !7
+  %10 = trunc i32 %9 to i20
+  %11 = trunc i32 %inc2 to i20
+  %12 = tail call { ptr, <32 x i32>, i32, i20, i20 } @llvm.aie2p.fifo.st.flush.3d(ptr %0, <32 x i32> %1, i32 %2, i20 %3, i20 %4, i20 %6, i20 %7, i20 %8, i20 %10, i20 %11)
+  %13 = extractvalue { ptr, <32 x i32>, i32, i20, i20 } %12, 0
+  %14 = extractvalue { ptr, <32 x i32>, i32, i20, i20 } %12, 1
+  %15 = extractvalue { ptr, <32 x i32>, i32, i20, i20 } %12, 2
+  %16 = extractvalue { ptr, <32 x i32>, i32, i20, i20 } %12, 3
+  %17 = zext i20 %16 to i32
+  %18 = extractvalue { ptr, <32 x i32>, i32, i20, i20 } %12, 4
+  %19 = zext i20 %18 to i32
+  store i32 %17, ptr %count1, align 4
+  store i32 %19, ptr %count2, align 4
+  store <32 x i32> %14, ptr %s, align 128
+  store i32 %15, ptr %pos1.i, align 64
+  store ptr %13, ptr %p, align 4
+  ret void
+}
+
+; Flush with conversion (llvm.aie2p.fifo.st.flush.conv) and no modifier operand. Expects vst.flush.512.conv. Function Attrs: mustprogress nounwind memory(write, argmem: readwrite, inaccessiblemem: none)
+define dso_local void @_Z23test_fifo_st_flush_convRPDv64_DB8_R12fifo_state_t(ptr nocapture nonnull align 4 dereferenceable(4) %p, ptr nocapture nonnull align 64 dereferenceable(256) %s) local_unnamed_addr #2 {
+; CHECK-LABEL: _Z23test_fifo_st_flush_convRPDv64_DB8_R12fifo_state_t:
+; CHECK:         .p2align 4
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    lda p2, [p0, #0]; nopb ; nopx ; mov dj0, #128; nops
+; CHECK-NEXT:    lda r26, [p1, dj0]
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vlda sfl, [p1, #0]
+; CHECK-NEXT:    vlda sfh, [p1, #64]
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vst.flush.512.conv [p2, sf, r26]
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    ret lr
+; CHECK-NEXT:    st r26, [p1, dj0] // Delay Slot 5
+; CHECK-NEXT:    vst sfl, [p1, #0] // Delay Slot 4
+; CHECK-NEXT:    vst sfh, [p1, #64] // Delay Slot 3
+; CHECK-NEXT:    st p2, [p0, #0] // Delay Slot 2
+; CHECK-NEXT:    nop // Delay Slot 1
+entry:
+  %pos1.i = getelementptr inbounds i8, ptr %s, i20 128
+  %0 = load ptr, ptr %p, align 4, !tbaa !2
+  %1 = load <32 x i32>, ptr %s, align 64, !tbaa !6
+  %2 = load i32, ptr %pos1.i, align 64, !tbaa !7
+  %3 = tail call { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.flush.conv(ptr %0, <32 x i32> %1, i32 %2)
+  %4 = extractvalue { ptr, <32 x i32>, i32 } %3, 0
+  %5 = extractvalue { ptr, <32 x i32>, i32 } %3, 1
+  %6 = extractvalue { ptr, <32 x i32>, i32 } %3, 2
+  store <32 x i32> %5, ptr %s, align 128
+  store i32 %6, ptr %pos1.i, align 64
+  store ptr %4, ptr %p, align 4
+  ret void
+}
+
+; 1D flush with conversion (llvm.aie2p.fifo.st.flush.1d.conv): %off is truncated to i20 and passed as the extra operand. Expects vst.flush.512.conv with an m0 operand. Function Attrs: mustprogress nounwind memory(write, argmem: readwrite, inaccessiblemem: none)
+define dso_local void @_Z31test_fifo_st_flush_conv_1d_byteRPDv64_DB8_R12fifo_state_ti(ptr nocapture nonnull align 4 dereferenceable(4) %p, ptr nocapture nonnull align 64 dereferenceable(256) %s, i32 noundef %off) local_unnamed_addr #2 {
+; CHECK-LABEL: _Z31test_fifo_st_flush_conv_1d_byteRPDv64_DB8_R12fifo_state_ti:
+; CHECK:         .p2align 4
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    lda p2, [p0, #0]; nopb ; nopx ; mov dj0, #128
+; CHECK-NEXT:    lda r26, [p1, dj0]
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vlda sfl, [p1, #0]
+; CHECK-NEXT:    vlda sfh, [p1, #64]
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    mov m0, r0
+; CHECK-NEXT:    vst.flush.512.conv [p2, sf, r26, m0]
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    ret lr
+; CHECK-NEXT:    st r26, [p1, dj0] // Delay Slot 5
+; CHECK-NEXT:    vst sfl, [p1, #0] // Delay Slot 4
+; CHECK-NEXT:    vst sfh, [p1, #64] // Delay Slot 3
+; CHECK-NEXT:    st p2, [p0, #0] // Delay Slot 2
+; CHECK-NEXT:    nop // Delay Slot 1
+entry:
+  %pos1.i = getelementptr inbounds i8, ptr %s, i20 128
+  %0 = load ptr, ptr %p, align 4, !tbaa !2
+  %1 = load <32 x i32>, ptr %s, align 64, !tbaa !6
+  %2 = load i32, ptr %pos1.i, align 64, !tbaa !7
+  %3 = trunc i32 %off to i20
+  %4 = tail call { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.flush.1d.conv(ptr %0, <32 x i32> %1, i32 %2, i20 %3)
+  %5 = extractvalue { ptr, <32 x i32>, i32 } %4, 0
+  %6 = extractvalue { ptr, <32 x i32>, i32 } %4, 1
+  %7 = extractvalue { ptr, <32 x i32>, i32 } %4, 2
+  store <32 x i32> %6, ptr %s, align 128
+  store i32 %7, ptr %pos1.i, align 64
+  store ptr %5, ptr %p, align 4
+  ret void
+}
+
+; 2D flush without conversion (llvm.aie2p.fifo.st.flush.2d): the returned i20 counter is zero-extended and stored back to %count1. Expects vst.flush.512.2d. Function Attrs: mustprogress nounwind memory(write, argmem: readwrite, inaccessiblemem: none)
+define dso_local void @_Z26test_fifo_st_flush_2d_byteRPDv64_DB8_R12fifo_state_tiiRii(ptr nocapture nonnull align 4 dereferenceable(4) %p, ptr nocapture nonnull align 64 dereferenceable(256) %s, i32 noundef %off, i32 noundef %size1, ptr nocapture nonnull align 4 dereferenceable(4) %count1, i32 noundef %inc1) local_unnamed_addr #2 {
+; CHECK-LABEL: _Z26test_fifo_st_flush_2d_byteRPDv64_DB8_R12fifo_state_tiiRii:
+; CHECK:         .p2align 4
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    vlda sfl, [p1, #0]; nopb ; nopxm ; nops
+; CHECK-NEXT:    lda dc0, [p2, #0]
+; CHECK-NEXT:    lda p2, [p0, #0]; mov dj1, #128
+; CHECK-NEXT:    lda r26, [p1, dj1]
+; CHECK-NEXT:    vlda sfh, [p1, #64]
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    mov m0, r0
+; CHECK-NEXT:    mov p3, p2
+; CHECK-NEXT:    mov dn0, r1
+; CHECK-NEXT:    mov dj0, r2
+; CHECK-NEXT:    vst.flush.512.2d [p2, sf, r26, d0]
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    st dc0, [p3, #0]; ret lr
+; CHECK-NEXT:    st r26, [p1, dj1] // Delay Slot 5
+; CHECK-NEXT:    vst sfl, [p1, #0] // Delay Slot 4
+; CHECK-NEXT:    vst sfh, [p1, #64] // Delay Slot 3
+; CHECK-NEXT:    st p2, [p0, #0] // Delay Slot 2
+; CHECK-NEXT:    nop // Delay Slot 1
+entry:
+  %pos1.i = getelementptr inbounds i8, ptr %s, i20 128
+  %0 = load ptr, ptr %p, align 4, !tbaa !2
+  %1 = load <32 x i32>, ptr %s, align 64, !tbaa !6
+  %2 = load i32, ptr %pos1.i, align 64, !tbaa !7
+  %3 = trunc i32 %off to i20
+  %4 = trunc i32 %size1 to i20
+  %5 = load i32, ptr %count1, align 4, !tbaa !7
+  %6 = trunc i32 %5 to i20
+  %7 = trunc i32 %inc1 to i20
+  %8 = tail call { ptr, <32 x i32>, i32, i20 } @llvm.aie2p.fifo.st.flush.2d(ptr %0, <32 x i32> %1, i32 %2, i20 %3, i20 %4, i20 %6, i20 %7)
+  %9 = extractvalue { ptr, <32 x i32>, i32, i20 } %8, 0
+  %10 = extractvalue { ptr, <32 x i32>, i32, i20 } %8, 1
+  %11 = extractvalue { ptr, <32 x i32>, i32, i20 } %8, 2
+  %12 = extractvalue { ptr, <32 x i32>, i32, i20 } %8, 3
+  %13 = zext i20 %12 to i32
+  store i32 %13, ptr %count1, align 4
+  store <32 x i32> %10, ptr %s, align 128
+  store i32 %11, ptr %pos1.i, align 64
+  store ptr %9, ptr %p, align 4
+  ret void
+}
+
+; 3D flush with conversion (llvm.aie2p.fifo.st.flush.3d.conv): both returned i20 counters are zero-extended and stored back to %count1 and %count2. Expects vst.flush.512.conv.3d. Function Attrs: mustprogress nounwind memory(write, argmem: readwrite, inaccessiblemem: none)
+define dso_local void @_Z31test_fifo_st_flush_conv_3d_byteRPDv64_DB8_R12fifo_state_tiiRiiiS5_i(ptr nocapture nonnull align 4 dereferenceable(4) %p, ptr nocapture nonnull align 64 dereferenceable(256) %s, i32 noundef %off, i32 noundef %size1, ptr nocapture nonnull align 4 dereferenceable(4) %count1, i32 noundef %inc1, i32 noundef %size2, ptr nocapture nonnull align 4 dereferenceable(4) %count2, i32 noundef %inc2) local_unnamed_addr #2 {
+; CHECK-LABEL: _Z31test_fifo_st_flush_conv_3d_byteRPDv64_DB8_R12fifo_state_tiiRiiiS5_i:
+; CHECK:         .p2align 4
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    lda dc0, [p2, #0]; nopb ; nops ; nopxm ; nopv
+; CHECK-NEXT:    lda dc4, [p3, #0]
+; CHECK-NEXT:    lda p2, [p0, #0]; mov dj1, #128
+; CHECK-NEXT:    lda r26, [p1, dj1]
+; CHECK-NEXT:    mov m0, r0
+; CHECK-NEXT:    vlda sfl, [p1, #0]; mov dn0, r1
+; CHECK-NEXT:    vlda sfh, [p1, #64]; mov dj0, r2
+; CHECK-NEXT:    mov p4, p2
+; CHECK-NEXT:    mov dn4, r3
+; CHECK-NEXT:    mov dj4, r4
+; CHECK-NEXT:    vst.flush.512.conv.3d [p2, sf, r26, d0]
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    st dc0, [p4, #0]
+; CHECK-NEXT:    st dc4, [p3, #0]; ret lr
+; CHECK-NEXT:    st r26, [p1, dj1] // Delay Slot 5
+; CHECK-NEXT:    vst sfl, [p1, #0] // Delay Slot 4
+; CHECK-NEXT:    vst sfh, [p1, #64] // Delay Slot 3
+; CHECK-NEXT:    st p2, [p0, #0] // Delay Slot 2
+; CHECK-NEXT:    nop // Delay Slot 1
+entry:
+  %pos1.i = getelementptr inbounds i8, ptr %s, i20 128
+  %0 = load ptr, ptr %p, align 4, !tbaa !2
+  %1 = load <32 x i32>, ptr %s, align 64, !tbaa !6
+  %2 = load i32, ptr %pos1.i, align 64, !tbaa !7
+  %3 = trunc i32 %off to i20
+  %4 = trunc i32 %size1 to i20
+  %5 = load i32, ptr %count1, align 4, !tbaa !7
+  %6 = trunc i32 %5 to i20
+  %7 = trunc i32 %inc1 to i20
+  %8 = trunc i32 %size2 to i20
+  %9 = load i32, ptr %count2, align 4, !tbaa !7
+  %10 = trunc i32 %9 to i20
+  %11 = trunc i32 %inc2 to i20
+  %12 = tail call { ptr, <32 x i32>, i32, i20, i20 } @llvm.aie2p.fifo.st.flush.3d.conv(ptr %0, <32 x i32> %1, i32 %2, i20 %3, i20 %4, i20 %6, i20 %7, i20 %8, i20 %10, i20 %11)
+  %13 = extractvalue { ptr, <32 x i32>, i32, i20, i20 } %12, 0
+  %14 = extractvalue { ptr, <32 x i32>, i32, i20, i20 } %12, 1
+  %15 = extractvalue { ptr, <32 x i32>, i32, i20, i20 } %12, 2
+  %16 = extractvalue { ptr, <32 x i32>, i32, i20, i20 } %12, 3
+  %17 = zext i20 %16 to i32
+  %18 = extractvalue { ptr, <32 x i32>, i32, i20, i20 } %12, 4
+  %19 = zext i20 %18 to i32
+  store i32 %17, ptr %count1, align 4
+  store i32 %19, ptr %count2, align 4
+  store <32 x i32> %14, ptr %s, align 128
+  store i32 %15, ptr %pos1.i, align 64
+  store ptr %13, ptr %p, align 4
+  ret void
+}
+
+
+; "Reset" form of push.544: the two fields of %struct.v64bfp16ebs16 are passed as separate operands and the position operand is the constant 0. Expects 'mova r26, #0' and vst.push.544 on ex0. Function Attrs: mustprogress nounwind memory(write, argmem: readwrite, inaccessiblemem: none)
+define dso_local void @test_fifo_st_reset_v64bfp16ebs16(ptr nocapture nonnull align 4 dereferenceable(4) %p, %struct.v64bfp16ebs16 %v.coerce, ptr nocapture nonnull align 64 dereferenceable(256) %s) local_unnamed_addr #2 {
+; CHECK-LABEL: test_fifo_st_reset_v64bfp16ebs16:
+; CHECK:         .p2align 4
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    vlda sfl, [p1, #0]; nopx
+; CHECK-NEXT:    lda p2, [p0, #0]
+; CHECK-NEXT:    vlda sfh, [p1, #64]
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    mova r26, #0
+; CHECK-NEXT:    vst.push.544 ex0, [p2, sf, r26]; ret lr
+; CHECK-NEXT:    nop // Delay Slot 5
+; CHECK-NEXT:    vst sfl, [p1, #0] // Delay Slot 4
+; CHECK-NEXT:    vst sfh, [p1, #64] // Delay Slot 3
+; CHECK-NEXT:    st p2, [p0, #0] // Delay Slot 2
+; CHECK-NEXT:    nop // Delay Slot 1
+entry:
+  %v.coerce.fca.0.extract.i = extractvalue %struct.v64bfp16ebs16 %v.coerce, 0
+  %v.coerce.fca.1.extract.i = extractvalue %struct.v64bfp16ebs16 %v.coerce, 1
+  %0 = load ptr, ptr %p, align 4, !tbaa !2
+  %1 = load <32 x i32>, ptr %s, align 64, !tbaa !6
+  %2 = tail call { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.push.544.bfp16(ptr %0, <64 x i8> %v.coerce.fca.0.extract.i, <8 x i8> %v.coerce.fca.1.extract.i, <32 x i32> %1, i32 0)
+  %3 = extractvalue { ptr, <32 x i32>, i32 } %2, 0
+  %4 = extractvalue { ptr, <32 x i32>, i32 } %2, 1
+  store <32 x i32> %4, ptr %s, align 128
+  store ptr %3, ptr %p, align 4
+  ret void
+}
+
+; push.544 with the position loaded from byte offset 128 of %s; the updated pointer, FIFO state and position are all stored back. Expects vst.push.544 on ex0. Function Attrs: mustprogress nounwind memory(write, argmem: readwrite, inaccessiblemem: none)
+define dso_local void @test_fifo_st_push_v64bfp16ebs16(ptr nocapture nonnull align 4 dereferenceable(4) %p, %struct.v64bfp16ebs16 %v.coerce, ptr nocapture nonnull align 64 dereferenceable(256) %s) local_unnamed_addr #2 {
+; CHECK-LABEL: test_fifo_st_push_v64bfp16ebs16:
+; CHECK:         .p2align 4
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    vlda sfl, [p1, #0]; nopb ; nopxm ; nops
+; CHECK-NEXT:    lda p2, [p0, #0]; mov dj0, #128
+; CHECK-NEXT:    lda r26, [p1, dj0]
+; CHECK-NEXT:    vlda sfh, [p1, #64]
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vst.push.544 ex0, [p2, sf, r26]
+; CHECK-NEXT:    ret lr
+; CHECK-NEXT:    st r26, [p1, dj0] // Delay Slot 5
+; CHECK-NEXT:    vst sfl, [p1, #0] // Delay Slot 4
+; CHECK-NEXT:    vst sfh, [p1, #64] // Delay Slot 3
+; CHECK-NEXT:    st p2, [p0, #0] // Delay Slot 2
+; CHECK-NEXT:    nop // Delay Slot 1
+entry:
+  %v.coerce.fca.0.extract.i = extractvalue %struct.v64bfp16ebs16 %v.coerce, 0
+  %v.coerce.fca.1.extract.i = extractvalue %struct.v64bfp16ebs16 %v.coerce, 1
+  %pos1.i = getelementptr inbounds i8, ptr %s, i20 128
+  %0 = load ptr, ptr %p, align 4, !tbaa !2
+  %1 = load <32 x i32>, ptr %s, align 64, !tbaa !6
+  %2 = load i32, ptr %pos1.i, align 64, !tbaa !7
+  %3 = tail call { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.push.544.bfp16(ptr %0, <64 x i8> %v.coerce.fca.0.extract.i, <8 x i8> %v.coerce.fca.1.extract.i, <32 x i32> %1, i32 %2)
+  %4 = extractvalue { ptr, <32 x i32>, i32 } %3, 0
+  %5 = extractvalue { ptr, <32 x i32>, i32 } %3, 1
+  %6 = extractvalue { ptr, <32 x i32>, i32 } %3, 2
+  store <32 x i32> %5, ptr %s, align 128
+  store i32 %6, ptr %pos1.i, align 64
+  store ptr %4, ptr %p, align 4
+  ret void
+}
+
+; "Reset" form of push.576: the two fields of %struct.v64bfp16ebs8 are passed as separate operands and the position operand is the constant 0. Expects 'mova r26, #0' and vst.push.576 on ex0. Function Attrs: mustprogress nounwind memory(write, argmem: readwrite, inaccessiblemem: none)
+define dso_local void @test_fifo_st_reset_v64bfp16ebs8(ptr nocapture nonnull align 4 dereferenceable(4) %p, %struct.v64bfp16ebs8 %v.coerce, ptr nocapture nonnull align 64 dereferenceable(256) %s) local_unnamed_addr #2 {
+; CHECK-LABEL: test_fifo_st_reset_v64bfp16ebs8:
+; CHECK:         .p2align 4
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    vlda sfl, [p1, #0]; nopx
+; CHECK-NEXT:    lda p2, [p0, #0]
+; CHECK-NEXT:    vlda sfh, [p1, #64]
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    mova r26, #0
+; CHECK-NEXT:    vst.push.576 ex0, [p2, sf, r26]; ret lr
+; CHECK-NEXT:    nop // Delay Slot 5
+; CHECK-NEXT:    vst sfl, [p1, #0] // Delay Slot 4
+; CHECK-NEXT:    vst sfh, [p1, #64] // Delay Slot 3
+; CHECK-NEXT:    st p2, [p0, #0] // Delay Slot 2
+; CHECK-NEXT:    nop // Delay Slot 1
+entry:
+  %v.coerce.fca.0.extract.i = extractvalue %struct.v64bfp16ebs8 %v.coerce, 0
+  %v.coerce.fca.1.extract.i = extractvalue %struct.v64bfp16ebs8 %v.coerce, 1
+  %0 = load ptr, ptr %p, align 4, !tbaa !2
+  %1 = load <32 x i32>, ptr %s, align 64, !tbaa !6
+  %2 = tail call { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.push.576.bfp16(ptr %0, <64 x i8> %v.coerce.fca.0.extract.i, <8 x i8> %v.coerce.fca.1.extract.i, <32 x i32> %1, i32 0)
+  %3 = extractvalue { ptr, <32 x i32>, i32 } %2, 0
+  %4 = extractvalue { ptr, <32 x i32>, i32 } %2, 1
+  store <32 x i32> %4, ptr %s, align 128
+  store ptr %3, ptr %p, align 4
+  ret void
+}
+
+; push.576 with the position loaded from byte offset 128 of %s; the updated pointer, FIFO state and position are all stored back. Expects vst.push.576 on ex0. Function Attrs: mustprogress nounwind memory(write, argmem: readwrite, inaccessiblemem: none)
+define dso_local void @test_fifo_st_push_v64bfp16ebs8(ptr nocapture nonnull align 4 dereferenceable(4) %p, %struct.v64bfp16ebs8 %v.coerce, ptr nocapture nonnull align 64 dereferenceable(256) %s) local_unnamed_addr #2 {
+; CHECK-LABEL: test_fifo_st_push_v64bfp16ebs8:
+; CHECK:         .p2align 4
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    vlda sfl, [p1, #0]; nopb ; nopxm ; nops
+; CHECK-NEXT:    lda p2, [p0, #0]; mov dj0, #128
+; CHECK-NEXT:    lda r26, [p1, dj0]
+; CHECK-NEXT:    vlda sfh, [p1, #64]
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vst.push.576 ex0, [p2, sf, r26]
+; CHECK-NEXT:    ret lr
+; CHECK-NEXT:    st r26, [p1, dj0] // Delay Slot 5
+; CHECK-NEXT:    vst sfl, [p1, #0] // Delay Slot 4
+; CHECK-NEXT:    vst sfh, [p1, #64] // Delay Slot 3
+; CHECK-NEXT:    st p2, [p0, #0] // Delay Slot 2
+; CHECK-NEXT:    nop // Delay Slot 1
+entry:
+  %v.coerce.fca.0.extract.i = extractvalue %struct.v64bfp16ebs8 %v.coerce, 0
+  %v.coerce.fca.1.extract.i = extractvalue %struct.v64bfp16ebs8 %v.coerce, 1
+  %pos1.i = getelementptr inbounds i8, ptr %s, i20 128
+  %0 = load ptr, ptr %p, align 4, !tbaa !2
+  %1 = load <32 x i32>, ptr %s, align 64, !tbaa !6
+  %2 = load i32, ptr %pos1.i, align 64, !tbaa !7
+  %3 = tail call { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.push.576.bfp16(ptr %0, <64 x i8> %v.coerce.fca.0.extract.i, <8 x i8> %v.coerce.fca.1.extract.i, <32 x i32> %1, i32 %2)
+  %4 = extractvalue { ptr, <32 x i32>, i32 } %3, 0
+  %5 = extractvalue { ptr, <32 x i32>, i32 } %3, 1
+  %6 = extractvalue { ptr, <32 x i32>, i32 } %3, 2
+  store <32 x i32> %5, ptr %s, align 128
+  store i32 %6, ptr %pos1.i, align 64
+  store ptr %4, ptr %p, align 4
+  ret void
+}
+
+; NOTE(review): none of the eight pack/unpack intrinsics declared from here through llvm.aie2p.unpack.I1024.I8.I4 is called in this file; they look like leftovers and could be dropped when the test is regenerated. Function Attrs: nofree nounwind memory(inaccessiblemem: read)
+declare <32 x i8> @llvm.aie2p.pack.I512.I8.I16(<32 x i16>, i32) #3
+
+; Function Attrs: nofree nosync nounwind memory(none)
+declare <32 x i16> @llvm.aie2p.unpack.I512.I16.I8(<32 x i8>, i32) #4
+
+; Function Attrs: nofree nounwind memory(inaccessiblemem: read)
+declare <32 x i8> @llvm.aie2p.pack.I512.I4.I8(<64 x i8>, i32) #3
+
+; Function Attrs: nofree nosync nounwind memory(none)
+declare <64 x i8> @llvm.aie2p.unpack.I512.I8.I4(<32 x i8>, i32) #4
+
+; Function Attrs: nofree nounwind memory(inaccessiblemem: read)
+declare <64 x i8> @llvm.aie2p.pack.I1024.I8.I16(<64 x i16>, i32) #3
+
+; Function Attrs: nofree nosync nounwind memory(none)
+declare <64 x i16> @llvm.aie2p.unpack.I1024.I16.I8(<64 x i8>, i32) #4
+
+; Function Attrs: nofree nounwind memory(inaccessiblemem: read)
+declare <64 x i8> @llvm.aie2p.pack.I1024.I4.I8(<128 x i8>, i32) #3
+
+; Function Attrs: nofree nosync nounwind memory(none)
+declare <128 x i8> @llvm.aie2p.unpack.I1024.I8.I4(<64 x i8>, i32) #4
+
+; FIFO store intrinsics called by the test functions in this file (the eleven declarations from here to the end of the declare list). Function Attrs: nounwind memory(argmem: write)
+declare { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.push.512.bfp16(ptr, <16 x i32>, <32 x i32>, i32) #5
+
+; Function Attrs: nounwind memory(argmem: write)
+declare { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.flush(ptr, <32 x i32>, i32) #5
+
+; Function Attrs: nounwind memory(argmem: write)
+declare { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.flush.1d(ptr, <32 x i32>, i32, i20) #5
+
+; Function Attrs: nounwind memory(argmem: write)
+declare { ptr, <32 x i32>, i32, i20 } @llvm.aie2p.fifo.st.flush.2d.conv(ptr, <32 x i32>, i32, i20, i20, i20, i20) #5
+
+; Function Attrs: nounwind memory(argmem: write)
+declare { ptr, <32 x i32>, i32, i20, i20 } @llvm.aie2p.fifo.st.flush.3d(ptr, <32 x i32>, i32, i20, i20, i20, i20, i20, i20, i20) #5
+
+; Function Attrs: nounwind memory(argmem: write)
+declare { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.flush.conv(ptr, <32 x i32>, i32) #5
+
+; Function Attrs: nounwind memory(argmem: write)
+declare { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.flush.1d.conv(ptr, <32 x i32>, i32, i20) #5
+
+; Function Attrs: nounwind memory(argmem: write)
+declare { ptr, <32 x i32>, i32, i20 } @llvm.aie2p.fifo.st.flush.2d(ptr, <32 x i32>, i32, i20, i20, i20, i20) #5
+
+; Function Attrs: nounwind memory(argmem: write)
+declare { ptr, <32 x i32>, i32, i20, i20 } @llvm.aie2p.fifo.st.flush.3d.conv(ptr, <32 x i32>, i32, i20, i20, i20, i20, i20, i20, i20) #5
+
+; Function Attrs: nounwind memory(argmem: write)
+declare { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.push.576.bfp16(ptr, <64 x i8>, <8 x i8>, <32 x i32>, i32) #5
+
+; Function Attrs: nounwind memory(argmem: write)
+declare { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.st.push.544.bfp16(ptr, <64 x i8>, <8 x i8>, <32 x i32>, i32) #5
+
+
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 19.0.0git (git@gitenterprise.xilinx.com:XRLabs/llvm-aie.git 7712cc9eca3c28aba4de6f89a2124b1130c2ec6d)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"any pointer", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C++ TBAA"}
+!6 = !{!4, !4, i64 0}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"int", !4, i64 0}