diff --git a/llvm/lib/Target/AIE/AIE2InstrInfo.cpp b/llvm/lib/Target/AIE/AIE2InstrInfo.cpp index d47f6d3fe5b1..858bc2c7685a 100644 --- a/llvm/lib/Target/AIE/AIE2InstrInfo.cpp +++ b/llvm/lib/Target/AIE/AIE2InstrInfo.cpp @@ -242,7 +242,7 @@ std::optional AIE2InstrInfo::getCombinedPostIncOpcode( case TargetOpcode::G_INTRINSIC: switch (cast(PostIncI).getIntrinsicID()) { case Intrinsic::aie2_add_2d: - if (Size >= 512) + if (Size >= 1024) return {}; switch (BaseMemI.getOpcode()) { case TargetOpcode::G_STORE: @@ -256,7 +256,7 @@ std::optional AIE2InstrInfo::getCombinedPostIncOpcode( } break; case Intrinsic::aie2_add_3d: - if (Size >= 512) + if (Size >= 1024) return {}; switch (BaseMemI.getOpcode()) { case TargetOpcode::G_STORE: diff --git a/llvm/test/CodeGen/AIE/GlobalISel/combine-loads-stores.mir b/llvm/test/CodeGen/AIE/GlobalISel/combine-loads-stores.mir index 1f2d16e96550..4ce207bb2fe9 100644 --- a/llvm/test/CodeGen/AIE/GlobalISel/combine-loads-stores.mir +++ b/llvm/test/CodeGen/AIE/GlobalISel/combine-loads-stores.mir @@ -1472,6 +1472,52 @@ body: | $p2 = COPY %5 ... +--- +name: vector_512_combine_postinc_2d +body: | + bb.0: + ; CHECK-LABEL: name: vector_512_combine_postinc_2d + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p1 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 + ; CHECK-NEXT: [[AIE_POSTINC_2D_LOAD:%[0-9]+]]:_(<32 x s16>), [[AIE_POSTINC_2D_LOAD1:%[0-9]+]]:_(p0), [[AIE_POSTINC_2D_LOAD2:%[0-9]+]]:_(s20) = G_AIE_POSTINC_2D_LOAD [[COPY]], [[C]], [[C]], [[C]], [[C]] :: (load (<32 x s16>)) + ; CHECK-NEXT: [[AIE_POSTINC_2D_STORE:%[0-9]+]]:_(p0), [[AIE_POSTINC_2D_STORE1:%[0-9]+]]:_(s20) = G_AIE_POSTINC_2D_STORE [[AIE_POSTINC_2D_LOAD]](<32 x s16>), [[COPY1]], [[C]], [[C]], [[C]], [[C]] :: (store (<32 x s16>)) + ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_2D_LOAD1]](p0) + ; CHECK-NEXT: $p1 = COPY [[AIE_POSTINC_2D_STORE]](p0) + %0:_(p0) = COPY $p0 + %6:_(p0) = COPY $p1 + %1:_(s20) = G_CONSTANT i20 64 + %4:_(<32 x s16>) = G_LOAD %0(p0) :: (load (<32 x s16>)) + G_STORE %4, %6 :: (store (<32 x s16>)) + %3:_(p0), %8:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2.add.2d), %0:_(p0), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20) + $p0 = COPY %3 + %7:_(p0), %9:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2.add.2d), %6:_(p0), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20) + $p1 = COPY %7 +... + +--- +name: vector_512_combine_postinc_3d +body: | + bb.0: + ; CHECK-LABEL: name: vector_512_combine_postinc_3d + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p1 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 + ; CHECK-NEXT: [[AIE_POSTINC_3D_LOAD:%[0-9]+]]:_(<32 x s16>), [[AIE_POSTINC_3D_LOAD1:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_LOAD2:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_LOAD3:%[0-9]+]]:_ = G_AIE_POSTINC_3D_LOAD [[COPY]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (load (<32 x s16>)) + ; CHECK-NEXT: [[AIE_POSTINC_3D_STORE:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_STORE1:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_STORE2:%[0-9]+]]:_ = G_AIE_POSTINC_3D_STORE [[AIE_POSTINC_3D_LOAD]](<32 x s16>), [[COPY1]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (store (<32 x s16>)) + ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_3D_LOAD1]](p0) + ; CHECK-NEXT: $p1 = COPY [[AIE_POSTINC_3D_STORE]](p0) + %0:_(p0) = COPY $p0 + %6:_(p0) = COPY $p1 + %1:_(s20) = G_CONSTANT i20 64 + %4:_(<32 x s16>) = G_LOAD %0(p0) :: (load (<32 x s16>)) + G_STORE %4, %6 :: (store (<32 x s16>)) + %3:_(p0), %8:_(s20), %9:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2.add.3d), %0:_(p0), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20) + $p0 = COPY %3 + %7:_(p0), %10:_(s20), %11:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2.add.3d), %6:_(p0), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20) + $p1 = COPY %7 +... + --- name: no_vector_1024_combine_postinc_yet body: | diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll index b3435587553a..9c522630b1cd 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll @@ -330,7 +330,7 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; DCL-NEXT: vldb wl10, [p1], #32; add r0, r10, #33; mov r10, p0; vmac cm3, cm3, x11, x6, r4 // Delay Slot 2 ; DCL-NEXT: vldb wh10, [p1], #32; and r10, r10, r9; vmov x8, x10; vmac cm7, cm7, x11, x8, r4 // Delay Slot 1 ; DCL-NEXT: // %bb.3: // in Loop: Header=BB0_1 Depth=1 -; DCL-NEXT: nopa ; nopx ; vmov x11, x0 +; DCL-NEXT: nopa ; nopb ; nopx ; vmov x11, x0 ; DCL-NEXT: vshuffle x0, x4, x2, r3 ; DCL-NEXT: vshuffle x11, x0, x11, r8 ; DCL-NEXT: vlda wl0, [sp, #-64] // 32-byte Folded Reload @@ -367,7 +367,7 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; DCL-NEXT: vst.srs.s16.s32 bmh5, s3, [p3, #32]; mov m3, r14 ; DCL-NEXT: vst.srs.s16.s32 bml5, s3, [p3], #64; mov m1, r11 ; DCL-NEXT: padda.3d [p0], d1; vst.srs.s16.s32 bmh4, s3, [p3, #32]; mov m1, r24 -; DCL-NEXT: padda.2d [p3], d7; vst.srs.s16.s32 bml4, s3, [p3, #0]; add r7, r7, #-1; mov dj7, r25 +; DCL-NEXT: vst.2d.srs.s16.s32 bml4, s3, [p3], d7; add r7, r7, #-1; mov dj7, r25 ; DCL-NEXT: jnz r7, #.LBB0_1 ; DCL-NEXT: mov dn7, r26 // Delay Slot 5 ; DCL-NEXT: st dc7, [sp, #-84] // 4-byte Folded Spill Delay Slot 4 @@ -494,7 +494,7 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ZOL-NEXT: .L_LEnd0: ; ZOL-NEXT: vldb wh10, [p1], #32; nopa ; nops ; and r1, r1, r9; vmov x8, x10; vmac cm7, cm7, x11, x8, r4 ; ZOL-NEXT: // %bb.3: // in Loop: Header=BB0_1 Depth=1 -; ZOL-NEXT: nopa ; nopx ; vmov x11, x0 +; ZOL-NEXT: nopa ; nopb ; nopx ; vmov x11, x0 ; ZOL-NEXT: vshuffle x0, x4, x2, r3 ; ZOL-NEXT: vshuffle x11, x0, x11, r8 ; ZOL-NEXT: vlda wl0, [sp, #-64] // 32-byte Folded Reload @@ -531,7 +531,7 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ZOL-NEXT: vst.srs.s16.s32 bmh5, s3, [p3, #32]; mov m3, r13 ; ZOL-NEXT: vst.srs.s16.s32 bml5, s3, [p3], #64; mov m1, r10 ; ZOL-NEXT: padda.3d [p0], d1; vst.srs.s16.s32 bmh4, s3, [p3, #32]; mov m1, r15 -; ZOL-NEXT: padda.2d [p3], d7; vst.srs.s16.s32 bml4, s3, [p3, #0]; add r7, r7, #-1; mov dj7, r24 +; ZOL-NEXT: vst.2d.srs.s16.s32 bml4, s3, [p3], d7; add r7, r7, #-1; mov dj7, r24 ; ZOL-NEXT: jnz r7, #.LBB0_1 ; ZOL-NEXT: mov dn7, r25 // Delay Slot 5 ; ZOL-NEXT: st dc7, [sp, #-84] // 4-byte Folded Spill Delay Slot 4 diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red.ll index f8841ccfa15f..e993495e55fa 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red.ll @@ -94,22 +94,22 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ASM-NEXT: .LBB0_1: // %outer.loop.header ; ASM-NEXT: // =>This Loop Header: Depth=1 ; ASM-NEXT: // Child Loop BB0_2 Depth 2 -; ASM-NEXT: vlda.ups.s32.s16 bmh0, s0, [p2, #32]; mov m1, p4 -; ASM-NEXT: vlda.ups.s32.s16 bml0, s0, [p2], m1 -; ASM-NEXT: vlda.ups.s32.s16 bmh1, s0, [p2, #32]; mov m2, p5 -; ASM-NEXT: vlda.ups.s32.s16 bml1, s0, [p2], m2 -; ASM-NEXT: vlda.ups.s32.s16 bmh2, s0, [p2, #32] -; ASM-NEXT: vlda.ups.s32.s16 bml2, s0, [p2], m1 -; ASM-NEXT: vlda.ups.s32.s16 bmh3, s0, [p2, #32]; mov m3, r15 -; ASM-NEXT: vlda.ups.s32.s16 bml3, s0, [p2], m3 -; ASM-NEXT: vlda.ups.s32.s16 bmh4, s0, [p2, #32] -; ASM-NEXT: vlda.ups.s32.s16 bml4, s0, [p2], m1 +; ASM-NEXT: vlda.ups.s32.s16 bmh1, s0, [p2, #32]; mov m1, p4 +; ASM-NEXT: vlda.ups.s32.s16 bml1, s0, [p2], m1 +; ASM-NEXT: vlda.ups.s32.s16 bmh2, s0, [p2, #32]; mov m2, p5 +; ASM-NEXT: vlda.ups.s32.s16 bml2, s0, [p2], m2 +; ASM-NEXT: vlda.ups.s32.s16 bmh3, s0, [p2, #32] +; ASM-NEXT: vlda.ups.s32.s16 bml3, s0, [p2], m1 +; ASM-NEXT: vlda.ups.s32.s16 bmh4, s0, [p2, #32]; mov m3, r15 +; ASM-NEXT: vlda.ups.s32.s16 bml4, s0, [p2], m3 ; ASM-NEXT: vlda.ups.s32.s16 bmh5, s0, [p2, #32] -; ASM-NEXT: vlda.ups.s32.s16 bml5, s0, [p2], m2 +; ASM-NEXT: vlda.ups.s32.s16 bml5, s0, [p2], m1 ; ASM-NEXT: vlda.ups.s32.s16 bmh6, s0, [p2, #32] -; ASM-NEXT: vlda.ups.s32.s16 bml6, s0, [p2], m1; mov r0, p0 -; ASM-NEXT: vlda.ups.s32.s16 bmh7, s0, [p2, #32]; and r0, r0, r9 -; ASM-NEXT: vlda.ups.s32.s16 bml7, s0, [p2, #0]; add r1, r0, #33; mov r0, r5 +; ASM-NEXT: vlda.ups.s32.s16 bml6, s0, [p2], m2 +; ASM-NEXT: vlda.ups.s32.s16 bmh7, s0, [p2, #32] +; ASM-NEXT: vlda.ups.s32.s16 bml7, s0, [p2], m1; mov r0, p0 +; ASM-NEXT: vlda.ups.s32.s16 bmh0, s0, [p2, #32]; and r0, r0, r9 +; ASM-NEXT: vlda.ups.s32.s16 bml0, s0, [p2, #0]; add r1, r0, #33; mov r0, r5 ; ASM-NEXT: .p2align 4 ; ASM-NEXT: .LBB0_2: // %inner.loop ; ASM-NEXT: // Parent Loop BB0_1 Depth=1 @@ -127,32 +127,32 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ASM-NEXT: vshift.align x2, x2, s1, x8, r1 ; ASM-NEXT: vshuffle x9, x4, x2, r2 ; ASM-NEXT: vshuffle x3, x4, x2, r3 -; ASM-NEXT: vmac cm0, cm0, x9, x10, r4 -; ASM-NEXT: add r0, r0, #-1; vshuffle x1, x9, x0, r8; vmac cm2, cm2, x3, x10, r4 -; ASM-NEXT: jnz r0, #.LBB0_2; vmac cm4, cm4, x9, x7, r4 -; ASM-NEXT: vshuffle x5, x3, x0, r8; vmac cm6, cm6, x3, x7, r4 // Delay Slot 5 -; ASM-NEXT: vmac cm1, cm1, x1, x10, r4 // Delay Slot 4 -; ASM-NEXT: mov r1, p0; vmac cm3, cm3, x5, x10, r4 // Delay Slot 3 -; ASM-NEXT: and r1, r1, r9; vmac cm5, cm5, x1, x7, r4 // Delay Slot 2 -; ASM-NEXT: add r1, r1, #33; vmac cm7, cm7, x5, x7, r4 // Delay Slot 1 +; ASM-NEXT: vmac cm1, cm1, x9, x10, r4 +; ASM-NEXT: add r0, r0, #-1; vshuffle x1, x9, x0, r8; vmac cm3, cm3, x3, x10, r4 +; ASM-NEXT: jnz r0, #.LBB0_2; vmac cm5, cm5, x9, x7, r4 +; ASM-NEXT: vshuffle x5, x3, x0, r8; vmac cm7, cm7, x3, x7, r4 // Delay Slot 5 +; ASM-NEXT: vmac cm2, cm2, x1, x10, r4 // Delay Slot 4 +; ASM-NEXT: mov r1, p0; vmac cm4, cm4, x5, x10, r4 // Delay Slot 3 +; ASM-NEXT: and r1, r1, r9; vmac cm6, cm6, x1, x7, r4 // Delay Slot 2 +; ASM-NEXT: add r1, r1, #33; vmac cm0, cm0, x5, x7, r4 // Delay Slot 1 ; ASM-NEXT: // %bb.3: // %outer.loop.latch ; ASM-NEXT: // in Loop: Header=BB0_1 Depth=1 -; ASM-NEXT: nopa ; nopb ; nopx ; mov s3, r6; vst.srs.s16.s32 bmh0, s2, [p3, #32] -; ASM-NEXT: vst.srs.s16.s32 bml0, s3, [p3], #64 -; ASM-NEXT: vst.srs.s16.s32 bmh1, s3, [p3, #32] -; ASM-NEXT: vst.srs.s16.s32 bml1, s3, [p3], m4 +; ASM-NEXT: nopb ; nopa ; vst.srs.s16.s32 bmh1, s2, [p3, #32]; nopx ; mov s3, r6; nopv +; ASM-NEXT: vst.srs.s16.s32 bml1, s3, [p3], #64 ; ASM-NEXT: vst.srs.s16.s32 bmh2, s3, [p3, #32] -; ASM-NEXT: vst.srs.s16.s32 bml2, s3, [p3], #64 +; ASM-NEXT: vst.srs.s16.s32 bml2, s3, [p3], m4 ; ASM-NEXT: vst.srs.s16.s32 bmh3, s3, [p3, #32] -; ASM-NEXT: vst.srs.s16.s32 bml3, s3, [p3], m7 +; ASM-NEXT: vst.srs.s16.s32 bml3, s3, [p3], #64 ; ASM-NEXT: vst.srs.s16.s32 bmh4, s3, [p3, #32] -; ASM-NEXT: vst.srs.s16.s32 bml4, s3, [p3], #64 -; ASM-NEXT: vst.srs.s16.s32 bmh5, s3, [p3, #32]; mov dc5, r26 -; ASM-NEXT: vst.srs.s16.s32 bml5, s3, [p3], m4; mov dn5, r27 -; ASM-NEXT: vst.srs.s16.s32 bmh6, s3, [p3, #32]; mov dj5, r28 -; ASM-NEXT: vst.srs.s16.s32 bml6, s3, [p3], #64; mov m1, r10 -; ASM-NEXT: vst.srs.s16.s32 bmh7, s3, [p3, #32]; mov m2, r13 -; ASM-NEXT: padda.2d [p3], d5; vst.srs.s16.s32 bml7, s3, [p3, #0]; mov dj5, r11 +; ASM-NEXT: vst.srs.s16.s32 bml4, s3, [p3], m7 +; ASM-NEXT: vst.srs.s16.s32 bmh5, s3, [p3, #32] +; ASM-NEXT: vst.srs.s16.s32 bml5, s3, [p3], #64 +; ASM-NEXT: vst.srs.s16.s32 bmh6, s3, [p3, #32]; mov dc5, r26 +; ASM-NEXT: vst.srs.s16.s32 bml6, s3, [p3], m4; mov dn5, r27 +; ASM-NEXT: vst.srs.s16.s32 bmh7, s3, [p3, #32]; mov dj5, r28 +; ASM-NEXT: vst.srs.s16.s32 bml7, s3, [p3], #64; mov m1, r10 +; ASM-NEXT: vst.srs.s16.s32 bmh0, s3, [p3, #32]; mov m2, r13 +; ASM-NEXT: vst.2d.srs.s16.s32 bml0, s3, [p3], d5; mov dj5, r11 ; ASM-NEXT: add r7, r7, #-1; mov dn5, r12 ; ASM-NEXT: jnz r7, #.LBB0_1 ; ASM-NEXT: mov r26, dc5 // Delay Slot 5