Skip to content

Commit

Permalink
[AIEX] PropagateIncomingLatencies: place REG_SEQUENCE to facilitate LICM
Browse files Browse the repository at this point in the history
The mutation is now very careful about REG_SEQUENCE instructions that have an
external source. Such a source is likely to require a COPY during regalloc, and
we need to ensure that this COPY can later be hoisted by LICM.

See tests :)
  • Loading branch information
gbossu committed Oct 23, 2024
1 parent 8e81ab7 commit 0217a06
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 76 deletions.
26 changes: 24 additions & 2 deletions llvm/lib/Target/AIE/AIEBaseSubtarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -361,10 +361,12 @@ class EnforceCopyEdges : public ScheduleDAGMutation {

class PropagateIncomingLatencies : public ScheduleDAGMutation {
bool OnlyCopyLike;
bool OnlyLocalSources;

public:
PropagateIncomingLatencies(bool OnlyCopyLike = true)
: OnlyCopyLike(OnlyCopyLike) {}
PropagateIncomingLatencies(bool OnlyCopyLike = true,
bool OnlyLocalSources = true)
: OnlyCopyLike(OnlyCopyLike), OnlyLocalSources(OnlyLocalSources) {}
void apply(ScheduleDAGInstrs *DAG) override {
auto IsData = [](const SDep &D) { return D.getKind() == SDep::Data; };
for (SUnit &SU : DAG->SUnits) {
Expand All @@ -381,6 +383,26 @@ class PropagateIncomingLatencies : public ScheduleDAGMutation {
}))
continue;

// Do not change the latency if the REG_SEQUENCE has several sources, exactly
// one of which is defined outside this MBB. Such REG_SEQUENCE instructions
// will typically require a COPY from the external source to one of the lanes
// of the destination register. It is important to then keep the REG_SEQUENCE
// close to its consumers, because after MachinePipeliner, this typically
// means that only the lanes corresponding to internal sources will be
// loop-carried. The external lane will come directly from the pre-header,
// and the required COPY can then be hoisted by MachineLICM.
const MachineBasicBlock &MBB = *MI.getParent();
const MachineRegisterInfo &MRI = DAG->MRI;
auto HasExternalAndLocalSources = [&MBB, &MRI](const MachineInstr &MI) {
return MI.isRegSequence() && MRI.isSSA() && MI.getNumOperands() > 3 &&
count_if(MI.uses(), [&MBB, &MRI](const MachineOperand &MO) {
return MO.isReg() && MO.getReg().isVirtual() &&
MRI.getVRegDef(MO.getReg())->getParent() != &MBB;
}) == 1;
};
if (OnlyLocalSources && HasExternalAndLocalSources(MI))
continue;

// Find the common latency for all predecessors that can be
// "moved" to successors.
SDep *MinLatencyDep = nullptr;
Expand Down
40 changes: 21 additions & 19 deletions llvm/test/CodeGen/AIE/aie2/end-to-end/Mul2D.ll
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ define void @mul2d(ptr noalias %in_ptr0, ptr noalias %in_ptr1, ptr noalias %out_
; CHECK-NEXT: nop // Delay Slot 2
; CHECK-NEXT: nop // Delay Slot 1
; CHECK-NEXT: // %bb.1: // %for.body.lr.ph
; CHECK-NEXT: nopa ; nopx ; mov p3, sp
; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; mov p3, sp; nopv
; CHECK-NEXT: paddb [p3], #-4
; CHECK-NEXT: lda.u8 r0, [p3, #0]; mov p3, sp
; CHECK-NEXT: paddb [p3], #-8
Expand All @@ -77,33 +77,35 @@ define void @mul2d(ptr noalias %in_ptr0, ptr noalias %in_ptr1, ptr noalias %out_
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: mova dc0, #0; vldb wl2, [p1], #32; extend.u8 r5, r5; mov r4, #-1
; CHECK-NEXT: vldb wl8, [p1], #32; lshl r1, r1, r4; mov dc4, dc0
; CHECK-NEXT: vldb.3d wl4, [p0], d0; add r1, r1, #-1
; CHECK-NEXT: mova r4, #-1
; CHECK-NEXT: mova dc0, #0; vldb wl2, [p1], #32; lshl r1, r1, r4
; CHECK-NEXT: vldb wl8, [p1], #32; add r1, r1, #-1; mov dc4, dc0
; CHECK-NEXT: vldb.3d wl6, [p0], d0; jz r1, #.LBB0_4
; CHECK-NEXT: mova r3, #0; mov s0, r5 // Delay Slot 5
; CHECK-NEXT: vbcst.8 x0, r3 // Delay Slot 4
; CHECK-NEXT: mova r2, #1; vmov wh4, wl0 // Delay Slot 3
; CHECK-NEXT: ne r2, r0, r2; vmov wh6, wl0 // Delay Slot 2
; CHECK-NEXT: vldb.3d wl4, [p0], d0 // Delay Slot 5
; CHECK-NEXT: extend.u8 r5, r5 // Delay Slot 4
; CHECK-NEXT: mova r3, #0; movx r2, #1; mov s0, r5 // Delay Slot 3
; CHECK-NEXT: ne r2, r0, r2; vbcst.8 x0, r3 // Delay Slot 2
; CHECK-NEXT: mova r0, #808; mov crSRSSign, r2 // Delay Slot 1
; CHECK-NEXT: // %bb.2:
; CHECK-NEXT: nopa ; nopb ; nopx ; vmov wh6, wl0
; CHECK-NEXT: vmov wh4, wl0
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_2: // %for.body
; CHECK-NEXT: .LBB0_3: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldb wl2, [p1], #32; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: nopa ; vldb.3d wl4, [p0], d0; add r1, r1, #-1; vmul cm0, x4, x2, r0
; CHECK-NEXT: vldb.3d wl6, [p0], d0; jnz r1, #.LBB0_2; vmul cm1, x6, x8, r0
; CHECK-NEXT: vldb wl2, [p1], #32; nopxm
; CHECK-NEXT: vldb.3d wl6, [p0], d0; add r1, r1, #-1; vmul cm0, x6, x2, r0
; CHECK-NEXT: vldb.3d wl4, [p0], d0; jnz r1, #.LBB0_3; vmul cm1, x4, x8, r0
; CHECK-NEXT: vldb wl8, [p1], #32 // Delay Slot 5
; CHECK-NEXT: nop // Delay Slot 4
; CHECK-NEXT: nop // Delay Slot 3
; CHECK-NEXT: vst.srs.d8.s32 cm0, s0, [p2], #32; vmov wh4, wl0 // Delay Slot 2
; CHECK-NEXT: vst.srs.d8.s32 cm1, s0, [p2], #32; vmov wh6, wl0 // Delay Slot 1
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: vst.srs.d8.s32 cm0, s0, [p2], #32 // Delay Slot 2
; CHECK-NEXT: vst.srs.d8.s32 cm1, s0, [p2], #32 // Delay Slot 1
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_4:
; CHECK-NEXT: nopa ; nopx ; vmul cm0, x4, x2, r0
; CHECK-NEXT: vmul cm1, x6, x8, r0
; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vmov wh6, wl0; nopv
; CHECK-NEXT: nopa ; vmov wh4, wl0
; CHECK-NEXT: vmul cm0, x6, x2, r0
; CHECK-NEXT: vmul cm1, x4, x8, r0
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
Expand Down
81 changes: 40 additions & 41 deletions llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,10 @@ define dso_local void @TanhTemplated(ptr noalias %ifm, ptr noalias %ofm, ptr non
; CHECK-LABEL: TanhTemplated:
; CHECK: .p2align 4
; CHECK-NEXT: // %bb.0: // %for.body.lr.ph
; CHECK-NEXT: nopa ; nopx ; mov r8, r16
; CHECK-NEXT: nopa ; mov r8, r16
; CHECK-NEXT: movxm r3, #16512
; CHECK-NEXT: movxm r4, #-16256
; CHECK-NEXT: movxm r5, #32767
; CHECK-NEXT: movxm r0, #16256
; CHECK-NEXT: movxm r1, #16384
; CHECK-NEXT: lda r0, [p2, #0]; movxm r2, #16128
Expand All @@ -186,70 +189,66 @@ define dso_local void @TanhTemplated(ptr noalias %ifm, ptr noalias %ofm, ptr non
; CHECK-NEXT: vbcst.16 x2, r2
; CHECK-NEXT: mova r1, #0; vconv.fp32.bf16 bmh0, wl2
; CHECK-NEXT: vbcst.16 x2, r1
; CHECK-NEXT: vmov wh0, wl2
; CHECK-NEXT: vldb wl3, [p0], #32; vmov wh0, wl2
; CHECK-NEXT: mova r1, #-5; vmov wh3, wl2
; CHECK-NEXT: mova r1, #60; vldb wl3, [p0], #32; lshl r2, r0, r1; vconv.fp32.bf16 bmh1, wl3
; CHECK-NEXT: movxm r3, #16512; vmul.f bmh2, x0, x3, r1
; CHECK-NEXT: movxm r4, #-16256
; CHECK-NEXT: movxm r5, #32767
; CHECK-NEXT: movxm r6, #15616
; CHECK-NEXT: mova r1, #60; lshl r2, r0, r1; vconv.fp32.bf16 bmh1, wl3
; CHECK-NEXT: movxm r6, #15616; vmul.f bmh2, x0, x3, r1
; CHECK-NEXT: movxm r7, #16000
; CHECK-NEXT: vbcst.16 x1, r3
; CHECK-NEXT: vbcst.16 x10, r4
; CHECK-NEXT: vconv.bf16.fp32 wl3, bmh2; vbcst.16 x8, r5; vmul.f bmh3, x0, x3, r1
; CHECK-NEXT: vbcst.16 x8, r5; vmul.f bmh3, x0, x3, r1
; CHECK-NEXT: vbcst.16 x6, r6
; CHECK-NEXT: vconv.bf16.fp32 wl3, bmh2; vbcst.16 x4, r7
; CHECK-NEXT: vmov wh6, wl2
; CHECK-NEXT: vmin_ge.bf16 x3, r16, x3, x1
; CHECK-NEXT: vmax_lt.bf16 x3, r16, x3, x10
; CHECK-NEXT: vmov wh3, wl2
; CHECK-NEXT: vmov wh6, wl2
; CHECK-NEXT: vconv.bf16.fp32 wl5, bmh3; vband x7, x8, x3
; CHECK-NEXT: vmov wh7, wl2
; CHECK-NEXT: vldb wl7, [p0], #32; vmin_ge.bf16 x5, r16, x5, x1
; CHECK-NEXT: vmax_lt.bf16 x5, r16, x5, x10
; CHECK-NEXT: vmin_ge.bf16 x5, r16, x5, x1
; CHECK-NEXT: vldb wl7, [p0], #32; vmax_lt.bf16 x5, r16, x5, x10
; CHECK-NEXT: vband x7, x8, x5
; CHECK-NEXT: vldb wl7, [p0], #32; vmov wh7, wl2; vmul.f bmh2, x6, x7, r1
; CHECK-NEXT: movxm r7, #16000
; CHECK-NEXT: vbcst.16 x4, r7; vmul.f bmh4, x6, x7, r1
; CHECK-NEXT: vmov wh4, wl2
; CHECK-NEXT: vmov wh5, wl2; vmul.f bmh5, x0, x7, r1
; CHECK-NEXT: vmac.f bmh3, bmh0, x3, x4, r1
; CHECK-NEXT: vmov wh3, wl2; vmul.f bmh4, x6, x7, r1
; CHECK-NEXT: nop
; CHECK-NEXT: vmov wh5, wl2; vmac.f bmh3, bmh0, x3, x4, r1
; CHECK-NEXT: vmul.f bmh5, x0, x7, r1
; CHECK-NEXT: movxm ls, #.LBB0_1; vmac.f bmh6, bmh0, x5, x4, r1
; CHECK-NEXT: vconv.bf16.fp32 wl7, bmh2; movxm le, #.L_LEnd0; vmul.f bmh7, x0, x7, r1
; CHECK-NEXT: vconv.bf16.fp32 wl3, bmh4; add.nc lc, r2, #-2
; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmsc.f bmh3, bmh3, x7, x3, r1
; CHECK-NEXT: nopb ; nopa ; vconv.bf16.fp32 wl3, bmh5; nopxm ; vmsc.f bml4, bmh6, x3, x5, r1
; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmsc.f bml4, bmh6, x3, x5, r1
; CHECK-NEXT: nopb ; nopa ; vconv.bf16.fp32 wl3, bmh5; nopxm ; nopv
; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vmin_ge.bf16 x3, r16, x3, x1; nopv
; CHECK-NEXT: nopb ; nopa ; vconv.bf16.fp32 wl5, bmh7; nopx ; vmax_lt.bf16 x3, r16, x3, x10; nopv
; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vmov wh3, wl2; nopv
; CHECK-NEXT: nopb ; nopa ; vconv.bf16.fp32 wl5, bmh7; nopx ; vmin_ge.bf16 x3, r16, x3, x1; nopv
; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vmax_lt.bf16 x3, r16, x3, x10; nopv
; CHECK-NEXT: nopb ; mova r0, #28; vconv.bf16.fp32 wl7, bmh3; nopx ; vmin_ge.bf16 x5, r16, x5, x1; nopv
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_1: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: nopa ; nopb ; nopx ; vband x9, x8, x3
; CHECK-NEXT: vldb wl7, [p0], #32; vmax_lt.bf16 x5, r16, x5, x10
; CHECK-NEXT: vmov wh7, wl2
; CHECK-NEXT: nopa ; nopb ; nopx ; vband x9, x8, x3; nops
; CHECK-NEXT: vmax_lt.bf16 x5, r16, x5, x10
; CHECK-NEXT: vconv.bf16.fp32 wl7, bml4; vldb wl7, [p0], #32; vmov wh3, wl2
; CHECK-NEXT: vmov wh9, wl2; vmul.f bmh6, x7, x0, r1
; CHECK-NEXT: vconv.bf16.fp32 wl7, bml4; vldb wl7, [p0], #32; vmov wh5, wl2; vmac.f bmh5, bmh0, x3, x4, r1
; CHECK-NEXT: vmul.f bmh3, x6, x9, r1
; CHECK-NEXT: vband x9, x8, x5; vmul.f bmh2, x7, x0, r1
; CHECK-NEXT: vmov wh9, wl2; vsub.f bml1, bmh6, bmh1, r0
; CHECK-NEXT: vmul.f bmh7, x0, x7, r1
; CHECK-NEXT: vldb wl7, [p0], #32; vband x9, x8, x5; vmul.f bmh2, x7, x0, r1
; CHECK-NEXT: vmov wh9, wl2; vmul.f bmh3, x6, x9, r1
; CHECK-NEXT: vmac.f bmh5, bmh0, x3, x4, r1
; CHECK-NEXT: vmul.f bmh4, x6, x9, r1
; CHECK-NEXT: vsub.f bml0, bmh2, bmh1, r0
; CHECK-NEXT: vconv.bf16.fp32 wl7, bmh3; vmul.f bmh8, x0, x7, r1
; CHECK-NEXT: vmov wh5, wl2; vsub.f bml1, bmh6, bmh1, r0
; CHECK-NEXT: vmul.f bmh7, x0, x7, r1
; CHECK-NEXT: vmac.f bml2, bmh0, x5, x4, r1
; CHECK-NEXT: vmsc.f bml3, bmh5, x7, x3, r1
; CHECK-NEXT: vconv.bf16.fp32 wl11, bmh7
; CHECK-NEXT: vconv.bf16.fp32 wl3, bmh4; vmov wh3, wl2
; CHECK-NEXT: vst.conv.bf16.fp32 bml1, [p1], #32; vmin_ge.bf16 x3, r16, x11, x1
; CHECK-NEXT: vconv.bf16.fp32 wl5, bmh8; vmax_lt.bf16 x3, r16, x3, x10; vmsc.f bml4, bml2, x3, x5, r1
; CHECK-NEXT: vst.conv.bf16.fp32 bml0, [p1], #32; vmov wh3, wl2
; CHECK-NEXT: vconv.bf16.fp32 wl7, bmh3; vmul.f bmh8, x0, x7, r1
; CHECK-NEXT: vsub.f bml0, bmh2, bmh1, r0
; CHECK-NEXT: vconv.bf16.fp32 wl3, bmh4; vmsc.f bml3, bmh5, x7, x3, r1
; CHECK-NEXT: nop
; CHECK-NEXT: vconv.bf16.fp32 wl11, bmh7; vmsc.f bml4, bml2, x3, x5, r1
; CHECK-NEXT: vst.conv.bf16.fp32 bml1, [p1], #32
; CHECK-NEXT: vconv.bf16.fp32 wl5, bmh8; vmin_ge.bf16 x3, r16, x11, x1
; CHECK-NEXT: vst.conv.bf16.fp32 bml0, [p1], #32; vmax_lt.bf16 x3, r16, x3, x10
; CHECK-NEXT: .L_LEnd0:
; CHECK-NEXT: nopb ; nopa ; vconv.bf16.fp32 wl7, bml3; nopx ; vmin_ge.bf16 x5, r16, x5, x1; nopv
; CHECK-NEXT: // %bb.2:
; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vmov wh7, wl2; nopv
; CHECK-NEXT: vconv.bf16.fp32 wl1, bml4; vmov wh1, wl2
; CHECK-NEXT: vmov wh6, wl2; vmul.f bmh3, x7, x0, r1
; CHECK-NEXT: vmax_lt.bf16 x10, r16, x5, x10; vmul.f bmh2, x1, x0, r1
Expand All @@ -258,8 +257,8 @@ define dso_local void @TanhTemplated(ptr noalias %ifm, ptr noalias %ofm, ptr non
; CHECK-NEXT: vmov wh1, wl2; vsub.f bmh3, bmh3, bmh1, r0
; CHECK-NEXT: vmov wh8, wl2; vsub.f bmh2, bmh2, bmh1, r0
; CHECK-NEXT: vmul.f bmh2, x6, x1, r1
; CHECK-NEXT: vmul.f bmh3, x6, x8, r1
; CHECK-NEXT: vmov wh4, wl2
; CHECK-NEXT: vmov wh4, wl2; vmul.f bmh3, x6, x8, r1
; CHECK-NEXT: vmov wh3, wl2
; CHECK-NEXT: vmov wh10, wl2
; CHECK-NEXT: vst.conv.bf16.fp32 bmh3, [p1], #32; vmac.f bmh4, bmh0, x3, x4, r1
; CHECK-NEXT: vst.conv.bf16.fp32 bmh2, [p1], #32; vmac.f bmh0, bmh0, x10, x4, r1
Expand Down
27 changes: 13 additions & 14 deletions llvm/test/CodeGen/AIE/aie2/schedule/swp/swp-regsequence.mir
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@

# We expect the REG_SEQUENCE for the load of %ir.in1 to be in the second stage, close to
# its VMUL consumer.
# FIXME: Make it happen
---
name: vmul_256
alignment: 16
Expand All @@ -58,26 +57,25 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm_imm:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm_imm1:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY3]], 32 :: (load (<32 x s8>) from %ir.in1)
; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm_imm2:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm_imm3:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY4]], 32 :: (load (<32 x s8>) from %ir.in2)
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec512 = REG_SEQUENCE [[VBCST_32_]].sub_256_lo, %subreg.sub_256_hi, [[VLDA_dmw_lda_w_ag_pstm_nrm_imm]], %subreg.sub_256_lo
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLDA_dmw_lda_w_ag_pstm_nrm_imm2]], %subreg.sub_256_lo
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLDA_dmw_lda_w_ag_pstm_nrm_imm2]], %subreg.sub_256_lo
; CHECK-NEXT: [[ADD_add_r_ri:%[0-9]+]]:er = nsw ADD_add_r_ri [[COPY1]], -1, implicit-def $srcarry
; CHECK-NEXT: PseudoJ_jump_imm %bb.4
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.4:
; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.5(0x40000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[PHI:%[0-9]+]]:er = PHI [[ADD_add_r_ri]], %bb.3, %30, %bb.4
; CHECK-NEXT: [[PHI:%[0-9]+]]:er = PHI [[ADD_add_r_ri]], %bb.3, %29, %bb.4
; CHECK-NEXT: [[PHI1:%[0-9]+]]:ep = PHI [[VLDA_dmw_lda_w_ag_pstm_nrm_imm1]], %bb.3, %28, %bb.4
; CHECK-NEXT: [[PHI2:%[0-9]+]]:ep = PHI [[VLDA_dmw_lda_w_ag_pstm_nrm_imm3]], %bb.3, %33, %bb.4
; CHECK-NEXT: [[PHI3:%[0-9]+]]:ep = PHI [[COPY5]], %bb.3, %35, %bb.4
; CHECK-NEXT: [[PHI4:%[0-9]+]]:vec512 = PHI [[REG_SEQUENCE]], %bb.3, %29, %bb.4
; CHECK-NEXT: [[PHI5:%[0-9]+]]:vec512 = PHI [[REG_SEQUENCE1]], %bb.3, %34, %bb.4
; CHECK-NEXT: [[PHI2:%[0-9]+]]:ep = PHI [[VLDA_dmw_lda_w_ag_pstm_nrm_imm3]], %bb.3, %32, %bb.4
; CHECK-NEXT: [[PHI3:%[0-9]+]]:ep = PHI [[COPY5]], %bb.3, %34, %bb.4
; CHECK-NEXT: [[PHI4:%[0-9]+]]:vec256 = PHI [[VLDA_dmw_lda_w_ag_pstm_nrm_imm]], %bb.3, %27, %bb.4
; CHECK-NEXT: [[PHI5:%[0-9]+]]:vec512 = PHI [[REG_SEQUENCE]], %bb.3, %33, %bb.4
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vec512 = REG_SEQUENCE [[VBCST_32_]].sub_256_lo, %subreg.sub_256_hi, [[PHI4]], %subreg.sub_256_lo
; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm_imm4:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm_imm5:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[PHI1]], 32 :: (load unknown-size from %ir.in1, align 32)
; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vec512 = REG_SEQUENCE [[VBCST_32_]].sub_256_lo, %subreg.sub_256_hi, [[VLDA_dmw_lda_w_ag_pstm_nrm_imm4]], %subreg.sub_256_lo
; CHECK-NEXT: [[ADD_add_r_ri1:%[0-9]+]]:er = nsw ADD_add_r_ri [[PHI]], -1, implicit-def $srcarry
; CHECK-NEXT: [[VMUL_vmac_cm_core_dense:%[0-9]+]]:acc1024 = VMUL_vmac_cm_core_dense [[PHI4]], [[PHI5]], [[COPY]]
; CHECK-NEXT: [[VMUL_vmac_cm_core_dense:%[0-9]+]]:acc1024 = VMUL_vmac_cm_core_dense [[REG_SEQUENCE1]], [[PHI5]], [[COPY]]
; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm_imm6:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm_imm7:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[PHI2]], 32 :: (load unknown-size from %ir.in2, align 32)
; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLDA_dmw_lda_w_ag_pstm_nrm_imm6]], %subreg.sub_256_lo
; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLDA_dmw_lda_w_ag_pstm_nrm_imm6]], %subreg.sub_256_lo
; CHECK-NEXT: [[VST_SRS_S8_S32_ag_pstm_nrm_imm:%[0-9]+]]:ep = VST_SRS_S8_S32_ag_pstm_nrm_imm [[PHI3]], 32, [[VMUL_vmac_cm_core_dense]], [[COPY2]], implicit-def $srsrs_of, implicit $crsat, implicit $crrnd :: (store (<32 x s8>) into %ir.out)
; CHECK-NEXT: PseudoJNZ [[ADD_add_r_ri1]], %bb.4
; CHECK-NEXT: PseudoJ_jump_imm %bb.5
Expand All @@ -86,9 +84,10 @@ body: |
; CHECK-NEXT: successors: %bb.2(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[PHI6:%[0-9]+]]:ep = PHI [[VST_SRS_S8_S32_ag_pstm_nrm_imm]], %bb.4
; CHECK-NEXT: [[PHI7:%[0-9]+]]:vec512 = PHI [[REG_SEQUENCE2]], %bb.4
; CHECK-NEXT: [[PHI8:%[0-9]+]]:vec512 = PHI [[REG_SEQUENCE3]], %bb.4
; CHECK-NEXT: [[VMUL_vmac_cm_core_dense1:%[0-9]+]]:acc1024 = VMUL_vmac_cm_core_dense [[PHI7]], [[PHI8]], [[COPY]]
; CHECK-NEXT: [[PHI7:%[0-9]+]]:vec256 = PHI [[VLDA_dmw_lda_w_ag_pstm_nrm_imm4]], %bb.4
; CHECK-NEXT: [[PHI8:%[0-9]+]]:vec512 = PHI [[REG_SEQUENCE2]], %bb.4
; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vec512 = REG_SEQUENCE [[VBCST_32_]].sub_256_lo, %subreg.sub_256_hi, [[PHI7]], %subreg.sub_256_lo
; CHECK-NEXT: [[VMUL_vmac_cm_core_dense1:%[0-9]+]]:acc1024 = VMUL_vmac_cm_core_dense [[REG_SEQUENCE3]], [[PHI8]], [[COPY]]
; CHECK-NEXT: [[VST_SRS_S8_S32_ag_pstm_nrm_imm1:%[0-9]+]]:ep = VST_SRS_S8_S32_ag_pstm_nrm_imm [[PHI6]], 32, [[VMUL_vmac_cm_core_dense1]], [[COPY2]], implicit-def $srsrs_of, implicit $crsat, implicit $crrnd :: (store unknown-size into %ir.out, align 32)
; CHECK-NEXT: PseudoJ_jump_imm %bb.2
; CHECK-NEXT: {{ $}}
Expand Down

0 comments on commit 0217a06

Please sign in to comment.