Skip to content

Commit

Permalink
[AIE2] Combine 2D/3D post-increments for 512-bit load/stores
Browse files: browse the repository at this point in the history
  • Loading branch information
gbossu committed Oct 8, 2024
1 parent 99d8277 commit 6334e49
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 41 deletions.
4 changes: 2 additions & 2 deletions llvm/lib/Target/AIE/AIE2InstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ std::optional<unsigned> AIE2InstrInfo::getCombinedPostIncOpcode(
case TargetOpcode::G_INTRINSIC:
switch (cast<GIntrinsic>(PostIncI).getIntrinsicID()) {
case Intrinsic::aie2_add_2d:
-if (Size >= 512)
+if (Size >= 1024)
return {};
switch (BaseMemI.getOpcode()) {
case TargetOpcode::G_STORE:
Expand All @@ -256,7 +256,7 @@ std::optional<unsigned> AIE2InstrInfo::getCombinedPostIncOpcode(
}
break;
case Intrinsic::aie2_add_3d:
-if (Size >= 512)
+if (Size >= 1024)
return {};
switch (BaseMemI.getOpcode()) {
case TargetOpcode::G_STORE:
Expand Down
46 changes: 46 additions & 0 deletions llvm/test/CodeGen/AIE/GlobalISel/combine-loads-stores.mir
Original file line number Diff line number Diff line change
Expand Up @@ -1472,6 +1472,52 @@ body: |
$p2 = COPY %5
...

---
# Test: a 512-bit (<32 x s16>) G_LOAD and G_STORE, each followed by an
# llvm.aie2.add.2d intrinsic on the same base pointer. The CHECK lines expect
# each memory access and its pointer increment to be folded into a single
# G_AIE_POSTINC_2D_LOAD / G_AIE_POSTINC_2D_STORE, with the updated pointers
# copied back to $p0 and $p1.
name: vector_512_combine_postinc_2d
body: |
bb.0:
; CHECK-LABEL: name: vector_512_combine_postinc_2d
; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p1
; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
; CHECK-NEXT: [[AIE_POSTINC_2D_LOAD:%[0-9]+]]:_(<32 x s16>), [[AIE_POSTINC_2D_LOAD1:%[0-9]+]]:_(p0), [[AIE_POSTINC_2D_LOAD2:%[0-9]+]]:_(s20) = G_AIE_POSTINC_2D_LOAD [[COPY]], [[C]], [[C]], [[C]], [[C]] :: (load (<32 x s16>))
; CHECK-NEXT: [[AIE_POSTINC_2D_STORE:%[0-9]+]]:_(p0), [[AIE_POSTINC_2D_STORE1:%[0-9]+]]:_(s20) = G_AIE_POSTINC_2D_STORE [[AIE_POSTINC_2D_LOAD]](<32 x s16>), [[COPY1]], [[C]], [[C]], [[C]], [[C]] :: (store (<32 x s16>))
; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_2D_LOAD1]](p0)
; CHECK-NEXT: $p1 = COPY [[AIE_POSTINC_2D_STORE]](p0)
%0:_(p0) = COPY $p0
%6:_(p0) = COPY $p1
%1:_(s20) = G_CONSTANT i20 64
%4:_(<32 x s16>) = G_LOAD %0(p0) :: (load (<32 x s16>))
G_STORE %4, %6 :: (store (<32 x s16>))
%3:_(p0), %8:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2.add.2d), %0:_(p0), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20)
$p0 = COPY %3
%7:_(p0), %9:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2.add.2d), %6:_(p0), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20)
$p1 = COPY %7
...

---
# Test: a 512-bit (<32 x s16>) G_LOAD and G_STORE, each followed by an
# llvm.aie2.add.3d intrinsic on the same base pointer. The CHECK lines expect
# each memory access and its pointer increment to be folded into a single
# G_AIE_POSTINC_3D_LOAD / G_AIE_POSTINC_3D_STORE, with the updated pointers
# copied back to $p0 and $p1.
name: vector_512_combine_postinc_3d
body: |
bb.0:
; CHECK-LABEL: name: vector_512_combine_postinc_3d
; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p1
; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
; CHECK-NEXT: [[AIE_POSTINC_3D_LOAD:%[0-9]+]]:_(<32 x s16>), [[AIE_POSTINC_3D_LOAD1:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_LOAD2:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_LOAD3:%[0-9]+]]:_ = G_AIE_POSTINC_3D_LOAD [[COPY]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (load (<32 x s16>))
; CHECK-NEXT: [[AIE_POSTINC_3D_STORE:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_STORE1:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_STORE2:%[0-9]+]]:_ = G_AIE_POSTINC_3D_STORE [[AIE_POSTINC_3D_LOAD]](<32 x s16>), [[COPY1]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (store (<32 x s16>))
; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_3D_LOAD1]](p0)
; CHECK-NEXT: $p1 = COPY [[AIE_POSTINC_3D_STORE]](p0)
%0:_(p0) = COPY $p0
%6:_(p0) = COPY $p1
%1:_(s20) = G_CONSTANT i20 64
%4:_(<32 x s16>) = G_LOAD %0(p0) :: (load (<32 x s16>))
G_STORE %4, %6 :: (store (<32 x s16>))
%3:_(p0), %8:_(s20), %9:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2.add.3d), %0:_(p0), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20)
$p0 = COPY %3
%7:_(p0), %10:_(s20), %11:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2.add.3d), %6:_(p0), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20)
$p1 = COPY %7
...

---
name: no_vector_1024_combine_postinc_yet
body: |
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,7 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c
; DCL-NEXT: vldb wl10, [p1], #32; add r0, r10, #33; mov r10, p0; vmac cm3, cm3, x11, x6, r4 // Delay Slot 2
; DCL-NEXT: vldb wh10, [p1], #32; and r10, r10, r9; vmov x8, x10; vmac cm7, cm7, x11, x8, r4 // Delay Slot 1
; DCL-NEXT: // %bb.3: // in Loop: Header=BB0_1 Depth=1
-; DCL-NEXT: nopa ; nopx ; vmov x11, x0
+; DCL-NEXT: nopa ; nopb ; nopx ; vmov x11, x0
; DCL-NEXT: vshuffle x0, x4, x2, r3
; DCL-NEXT: vshuffle x11, x0, x11, r8
; DCL-NEXT: vlda wl0, [sp, #-64] // 32-byte Folded Reload
Expand Down Expand Up @@ -367,7 +367,7 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c
; DCL-NEXT: vst.srs.s16.s32 bmh5, s3, [p3, #32]; mov m3, r14
; DCL-NEXT: vst.srs.s16.s32 bml5, s3, [p3], #64; mov m1, r11
; DCL-NEXT: padda.3d [p0], d1; vst.srs.s16.s32 bmh4, s3, [p3, #32]; mov m1, r24
-; DCL-NEXT: padda.2d [p3], d7; vst.srs.s16.s32 bml4, s3, [p3, #0]; add r7, r7, #-1; mov dj7, r25
+; DCL-NEXT: vst.2d.srs.s16.s32 bml4, s3, [p3], d7; add r7, r7, #-1; mov dj7, r25
; DCL-NEXT: jnz r7, #.LBB0_1
; DCL-NEXT: mov dn7, r26 // Delay Slot 5
; DCL-NEXT: st dc7, [sp, #-84] // 4-byte Folded Spill Delay Slot 4
Expand Down Expand Up @@ -494,7 +494,7 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c
; ZOL-NEXT: .L_LEnd0:
; ZOL-NEXT: vldb wh10, [p1], #32; nopa ; nops ; and r1, r1, r9; vmov x8, x10; vmac cm7, cm7, x11, x8, r4
; ZOL-NEXT: // %bb.3: // in Loop: Header=BB0_1 Depth=1
-; ZOL-NEXT: nopa ; nopx ; vmov x11, x0
+; ZOL-NEXT: nopa ; nopb ; nopx ; vmov x11, x0
; ZOL-NEXT: vshuffle x0, x4, x2, r3
; ZOL-NEXT: vshuffle x11, x0, x11, r8
; ZOL-NEXT: vlda wl0, [sp, #-64] // 32-byte Folded Reload
Expand Down Expand Up @@ -531,7 +531,7 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c
; ZOL-NEXT: vst.srs.s16.s32 bmh5, s3, [p3, #32]; mov m3, r13
; ZOL-NEXT: vst.srs.s16.s32 bml5, s3, [p3], #64; mov m1, r10
; ZOL-NEXT: padda.3d [p0], d1; vst.srs.s16.s32 bmh4, s3, [p3, #32]; mov m1, r15
-; ZOL-NEXT: padda.2d [p3], d7; vst.srs.s16.s32 bml4, s3, [p3, #0]; add r7, r7, #-1; mov dj7, r24
+; ZOL-NEXT: vst.2d.srs.s16.s32 bml4, s3, [p3], d7; add r7, r7, #-1; mov dj7, r24
; ZOL-NEXT: jnz r7, #.LBB0_1
; ZOL-NEXT: mov dn7, r25 // Delay Slot 5
; ZOL-NEXT: st dc7, [sp, #-84] // 4-byte Folded Spill Delay Slot 4
Expand Down
70 changes: 35 additions & 35 deletions llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red.ll
Original file line number Diff line number Diff line change
Expand Up @@ -94,22 +94,22 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c
; ASM-NEXT: .LBB0_1: // %outer.loop.header
; ASM-NEXT: // =>This Loop Header: Depth=1
; ASM-NEXT: // Child Loop BB0_2 Depth 2
; ASM-NEXT: vlda.ups.s32.s16 bmh0, s0, [p2, #32]; mov m1, p4
; ASM-NEXT: vlda.ups.s32.s16 bml0, s0, [p2], m1
; ASM-NEXT: vlda.ups.s32.s16 bmh1, s0, [p2, #32]; mov m2, p5
; ASM-NEXT: vlda.ups.s32.s16 bml1, s0, [p2], m2
; ASM-NEXT: vlda.ups.s32.s16 bmh2, s0, [p2, #32]
; ASM-NEXT: vlda.ups.s32.s16 bml2, s0, [p2], m1
; ASM-NEXT: vlda.ups.s32.s16 bmh3, s0, [p2, #32]; mov m3, r15
; ASM-NEXT: vlda.ups.s32.s16 bml3, s0, [p2], m3
; ASM-NEXT: vlda.ups.s32.s16 bmh4, s0, [p2, #32]
; ASM-NEXT: vlda.ups.s32.s16 bml4, s0, [p2], m1
; ASM-NEXT: vlda.ups.s32.s16 bmh1, s0, [p2, #32]; mov m1, p4
; ASM-NEXT: vlda.ups.s32.s16 bml1, s0, [p2], m1
; ASM-NEXT: vlda.ups.s32.s16 bmh2, s0, [p2, #32]; mov m2, p5
; ASM-NEXT: vlda.ups.s32.s16 bml2, s0, [p2], m2
; ASM-NEXT: vlda.ups.s32.s16 bmh3, s0, [p2, #32]
; ASM-NEXT: vlda.ups.s32.s16 bml3, s0, [p2], m1
; ASM-NEXT: vlda.ups.s32.s16 bmh4, s0, [p2, #32]; mov m3, r15
; ASM-NEXT: vlda.ups.s32.s16 bml4, s0, [p2], m3
; ASM-NEXT: vlda.ups.s32.s16 bmh5, s0, [p2, #32]
; ASM-NEXT: vlda.ups.s32.s16 bml5, s0, [p2], m2
; ASM-NEXT: vlda.ups.s32.s16 bml5, s0, [p2], m1
; ASM-NEXT: vlda.ups.s32.s16 bmh6, s0, [p2, #32]
; ASM-NEXT: vlda.ups.s32.s16 bml6, s0, [p2], m1; mov r0, p0
; ASM-NEXT: vlda.ups.s32.s16 bmh7, s0, [p2, #32]; and r0, r0, r9
; ASM-NEXT: vlda.ups.s32.s16 bml7, s0, [p2, #0]; add r1, r0, #33; mov r0, r5
; ASM-NEXT: vlda.ups.s32.s16 bml6, s0, [p2], m2
; ASM-NEXT: vlda.ups.s32.s16 bmh7, s0, [p2, #32]
; ASM-NEXT: vlda.ups.s32.s16 bml7, s0, [p2], m1; mov r0, p0
; ASM-NEXT: vlda.ups.s32.s16 bmh0, s0, [p2, #32]; and r0, r0, r9
; ASM-NEXT: vlda.ups.s32.s16 bml0, s0, [p2, #0]; add r1, r0, #33; mov r0, r5
; ASM-NEXT: .p2align 4
; ASM-NEXT: .LBB0_2: // %inner.loop
; ASM-NEXT: // Parent Loop BB0_1 Depth=1
Expand All @@ -127,32 +127,32 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c
; ASM-NEXT: vshift.align x2, x2, s1, x8, r1
; ASM-NEXT: vshuffle x9, x4, x2, r2
; ASM-NEXT: vshuffle x3, x4, x2, r3
; ASM-NEXT: vmac cm0, cm0, x9, x10, r4
; ASM-NEXT: add r0, r0, #-1; vshuffle x1, x9, x0, r8; vmac cm2, cm2, x3, x10, r4
; ASM-NEXT: jnz r0, #.LBB0_2; vmac cm4, cm4, x9, x7, r4
; ASM-NEXT: vshuffle x5, x3, x0, r8; vmac cm6, cm6, x3, x7, r4 // Delay Slot 5
; ASM-NEXT: vmac cm1, cm1, x1, x10, r4 // Delay Slot 4
; ASM-NEXT: mov r1, p0; vmac cm3, cm3, x5, x10, r4 // Delay Slot 3
; ASM-NEXT: and r1, r1, r9; vmac cm5, cm5, x1, x7, r4 // Delay Slot 2
; ASM-NEXT: add r1, r1, #33; vmac cm7, cm7, x5, x7, r4 // Delay Slot 1
; ASM-NEXT: vmac cm1, cm1, x9, x10, r4
; ASM-NEXT: add r0, r0, #-1; vshuffle x1, x9, x0, r8; vmac cm3, cm3, x3, x10, r4
; ASM-NEXT: jnz r0, #.LBB0_2; vmac cm5, cm5, x9, x7, r4
; ASM-NEXT: vshuffle x5, x3, x0, r8; vmac cm7, cm7, x3, x7, r4 // Delay Slot 5
; ASM-NEXT: vmac cm2, cm2, x1, x10, r4 // Delay Slot 4
; ASM-NEXT: mov r1, p0; vmac cm4, cm4, x5, x10, r4 // Delay Slot 3
; ASM-NEXT: and r1, r1, r9; vmac cm6, cm6, x1, x7, r4 // Delay Slot 2
; ASM-NEXT: add r1, r1, #33; vmac cm0, cm0, x5, x7, r4 // Delay Slot 1
; ASM-NEXT: // %bb.3: // %outer.loop.latch
; ASM-NEXT: // in Loop: Header=BB0_1 Depth=1
; ASM-NEXT: nopa ; nopb ; nopx ; mov s3, r6; vst.srs.s16.s32 bmh0, s2, [p3, #32]
; ASM-NEXT: vst.srs.s16.s32 bml0, s3, [p3], #64
; ASM-NEXT: vst.srs.s16.s32 bmh1, s3, [p3, #32]
; ASM-NEXT: vst.srs.s16.s32 bml1, s3, [p3], m4
; ASM-NEXT: nopb ; nopa ; vst.srs.s16.s32 bmh1, s2, [p3, #32]; nopx ; mov s3, r6; nopv
; ASM-NEXT: vst.srs.s16.s32 bml1, s3, [p3], #64
; ASM-NEXT: vst.srs.s16.s32 bmh2, s3, [p3, #32]
; ASM-NEXT: vst.srs.s16.s32 bml2, s3, [p3], #64
; ASM-NEXT: vst.srs.s16.s32 bml2, s3, [p3], m4
; ASM-NEXT: vst.srs.s16.s32 bmh3, s3, [p3, #32]
; ASM-NEXT: vst.srs.s16.s32 bml3, s3, [p3], m7
; ASM-NEXT: vst.srs.s16.s32 bml3, s3, [p3], #64
; ASM-NEXT: vst.srs.s16.s32 bmh4, s3, [p3, #32]
; ASM-NEXT: vst.srs.s16.s32 bml4, s3, [p3], #64
; ASM-NEXT: vst.srs.s16.s32 bmh5, s3, [p3, #32]; mov dc5, r26
; ASM-NEXT: vst.srs.s16.s32 bml5, s3, [p3], m4; mov dn5, r27
; ASM-NEXT: vst.srs.s16.s32 bmh6, s3, [p3, #32]; mov dj5, r28
; ASM-NEXT: vst.srs.s16.s32 bml6, s3, [p3], #64; mov m1, r10
; ASM-NEXT: vst.srs.s16.s32 bmh7, s3, [p3, #32]; mov m2, r13
; ASM-NEXT: padda.2d [p3], d5; vst.srs.s16.s32 bml7, s3, [p3, #0]; mov dj5, r11
; ASM-NEXT: vst.srs.s16.s32 bml4, s3, [p3], m7
; ASM-NEXT: vst.srs.s16.s32 bmh5, s3, [p3, #32]
; ASM-NEXT: vst.srs.s16.s32 bml5, s3, [p3], #64
; ASM-NEXT: vst.srs.s16.s32 bmh6, s3, [p3, #32]; mov dc5, r26
; ASM-NEXT: vst.srs.s16.s32 bml6, s3, [p3], m4; mov dn5, r27
; ASM-NEXT: vst.srs.s16.s32 bmh7, s3, [p3, #32]; mov dj5, r28
; ASM-NEXT: vst.srs.s16.s32 bml7, s3, [p3], #64; mov m1, r10
; ASM-NEXT: vst.srs.s16.s32 bmh0, s3, [p3, #32]; mov m2, r13
; ASM-NEXT: vst.2d.srs.s16.s32 bml0, s3, [p3], d5; mov dj5, r11
; ASM-NEXT: add r7, r7, #-1; mov dn5, r12
; ASM-NEXT: jnz r7, #.LBB0_1
; ASM-NEXT: mov r26, dc5 // Delay Slot 5
Expand Down

0 comments on commit 6334e49

Please sign in to comment.